/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2023 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#define INCLUDE_STRING
#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-loop-niter.h"
#include "fractional-cost.h"
#include "rtlanal.h"
#include "tree-dfa.h"
#include "asan.h"
#include "aarch64-feature-deps.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}

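/* Illustrative sketch (an editorial addition, not part of the original
   sources): an Advanced SIMD immediate whose 16-bit elements all equal
   42 << 8 could be described with the integer constructor as

     simd_immediate_info info (HImode, 42, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);

   while an SVE INDEX-style constant such as { 0, 2, 4, ... } would use the
   (base, step) constructor with base 0 and step 2.  The real consumers are
   e.g. the aarch64_simd_valid_immediate machinery later in this file, which
   fills in one of these structures while validating a candidate constant.  */
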
namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

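/* Illustrative sketch (an editorial addition, not part of the original
   sources): under the AAPCS64, a C struct containing two svfloat32_t
   members is a Pure Scalable Type that is passed in two Z registers.
   Analyzing such a type with pure_scalable_type_info::analyze would be
   expected to return IS_PST and to populate "pieces" with two entries,
   each with num_zr == 1 and num_pr == 0, mode VNx4SFmode, and byte
   offsets of 0 and one SVE vector respectively.  */
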
/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

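/* Illustrative note (an editorial addition, not part of the original
   sources): the two tables above use the usual GCC "X macro" pattern.
   Each entry in the included .def file expands through the macro defined
   just before the #include, so an entry along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   becomes the initializer { "mov+movk", AARCH64_FUSE_MOV_MOVK }.  The
   resulting name tables are what -moverride style tuning strings are
   matched against when boolean fusion and tuning flags are parsed.  */
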
/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  {
    0, /* hi  */
    0, /* si  */
    0, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  1, /* pre_modify  */
  1, /* post_modify  */
  1, /* post_modify_ld3_st3  */
  1, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  1, /* pre_modify  */
  1, /* post_modify  */
  1, /* post_modify_ld3_st3  */
  1, /* post_modify_ld4_st4  */
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  2, /* imm_offset  */
};

static const struct cpu_addrcost_table a64fx_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* post_modify_ld3_st3  */
  0, /* post_modify_ld4_st4  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table neoversev1_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  3, /* post_modify_ld3_st3  */
  3, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table neoversen2_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* post_modify_ld3_st3  */
  2, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table neoversev2_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* post_modify_ld3_st3  */
  2, /* post_modify_ld4_st4  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP  */
  9, /* FP2GP  */
  1 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP  */
  2, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  5, /* GP2FP  */
  6, /* FP2GP  */
  3, /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  4, /* GP2FP  */
  5, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  2, /* GP2FP  */
  3, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost a64fx_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  7, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost neoversen2_regmove_cost =
{
  1, /* GP2GP  */
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
  3, /* GP2FP  */
  2, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost neoversev1_regmove_cost =
{
  1, /* GP2GP  */
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
  3, /* GP2FP  */
  2, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost neoversev2_regmove_cost =
{
  1, /* GP2GP  */
  /* Spilling to int<->fp instead of memory is recommended so set
     realistic costs compared to memmov_cost.  */
  3, /* GP2FP  */
  2, /* FP2GP  */
  2 /* FP2FP  */
};

/* Generic costs for Advanced SIMD vector operations.  */
static const advsimd_vec_cost generic_advsimd_vector_cost =
{
  1, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  2, /* reduc_i8_cost  */
  2, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  2, /* reduc_f16_cost  */
  2, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
  1, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

/* Generic costs for SVE vector operations.  */
static const sve_vec_cost generic_sve_vector_cost =
{
  {
    1, /* int_stmt_cost  */
    1, /* fp_stmt_cost  */
    0, /* ld2_st2_permute_cost  */
    0, /* ld3_st3_permute_cost  */
    0, /* ld4_st4_permute_cost  */
    2, /* permute_cost  */
    2, /* reduc_i8_cost  */
    2, /* reduc_i16_cost  */
    2, /* reduc_i32_cost  */
    2, /* reduc_i64_cost  */
    2, /* reduc_f16_cost  */
    2, /* reduc_f32_cost  */
    2, /* reduc_f64_cost  */
    2, /* store_elt_extra_cost  */
    2, /* vec_to_scalar_cost  */
    1, /* scalar_to_vec_cost  */
    1, /* align_load_cost  */
    1, /* unalign_load_cost  */
    1, /* unalign_store_cost  */
    1  /* store_cost  */
  },
  2, /* clast_cost  */
  2, /* fadda_f16_cost  */
  2, /* fadda_f32_cost  */
  2, /* fadda_f64_cost  */
  4, /* gather_load_x32_cost  */
  2, /* gather_load_x64_cost  */
  1 /* scatter_store_elt_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &generic_advsimd_vector_cost, /* advsimd  */
  &generic_sve_vector_cost, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  13, /* reduc_i8_cost  */
  13, /* reduc_i16_cost  */
  13, /* reduc_i32_cost  */
  13, /* reduc_i64_cost  */
  13, /* reduc_f16_cost  */
  13, /* reduc_f32_cost  */
  13, /* reduc_f64_cost  */
  13, /* store_elt_extra_cost  */
  13, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  6, /* align_load_cost  */
  6, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

static const sve_vec_cost a64fx_sve_vector_cost =
{
  {
    2, /* int_stmt_cost  */
    5, /* fp_stmt_cost  */
    0, /* ld2_st2_permute_cost  */
    0, /* ld3_st3_permute_cost  */
    0, /* ld4_st4_permute_cost  */
    3, /* permute_cost  */
    13, /* reduc_i8_cost  */
    13, /* reduc_i16_cost  */
    13, /* reduc_i32_cost  */
    13, /* reduc_i64_cost  */
    13, /* reduc_f16_cost  */
    13, /* reduc_f32_cost  */
    13, /* reduc_f64_cost  */
    13, /* store_elt_extra_cost  */
    13, /* vec_to_scalar_cost  */
    4, /* scalar_to_vec_cost  */
    6, /* align_load_cost  */
    6, /* unalign_load_cost  */
    1, /* unalign_store_cost  */
    1  /* store_cost  */
  },
  13, /* clast_cost  */
  13, /* fadda_f16_cost  */
  13, /* fadda_f32_cost  */
  13, /* fadda_f64_cost  */
  64, /* gather_load_x32_cost  */
  32, /* gather_load_x64_cost  */
  1 /* scatter_store_elt_cost  */
};

static const struct cpu_vector_cost a64fx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  5, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &a64fx_advsimd_vector_cost, /* advsimd  */
  &a64fx_sve_vector_cost, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
  1, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  1, /* reduc_i8_cost  */
  1, /* reduc_i16_cost  */
  1, /* reduc_i32_cost  */
  1, /* reduc_i64_cost  */
  1, /* reduc_f16_cost  */
  1, /* reduc_f32_cost  */
  1, /* reduc_f64_cost  */
  1, /* store_elt_extra_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* align_load_cost  */
  1, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &qdf24xx_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};


static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
  4, /* int_stmt_cost  */
  1, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  4, /* permute_cost  */
  2, /* reduc_i8_cost  */
  2, /* reduc_i16_cost  */
  2, /* reduc_i32_cost  */
  2, /* reduc_i64_cost  */
  2, /* reduc_f16_cost  */
  2, /* reduc_f32_cost  */
  2, /* reduc_f64_cost  */
  2, /* store_elt_extra_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* align_load_cost  */
  5, /* unalign_load_cost  */
  5, /* unalign_store_cost  */
  1  /* store_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* cond_taken_branch_cost  */
  3, /* cond_not_taken_branch_cost  */
  &thunderx_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  3, /* reduc_i8_cost  */
  3, /* reduc_i16_cost  */
  3, /* reduc_i32_cost  */
  3, /* reduc_i64_cost  */
  3, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  3, /* reduc_f64_cost  */
  3, /* store_elt_extra_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &tsv110_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  8, /* reduc_i8_cost  */
  8, /* reduc_i16_cost  */
  8, /* reduc_i32_cost  */
  8, /* reduc_i64_cost  */
  8, /* reduc_f16_cost  */
  8, /* reduc_f32_cost  */
  8, /* reduc_f64_cost  */
  8, /* store_elt_extra_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

/* Cortex-A57 costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &cortexa57_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
  3, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  3, /* permute_cost  */
  3, /* reduc_i8_cost  */
  3, /* reduc_i16_cost  */
  3, /* reduc_i32_cost  */
  3, /* reduc_i64_cost  */
  3, /* reduc_f16_cost  */
  3, /* reduc_f32_cost  */
  3, /* reduc_f64_cost  */
  3, /* store_elt_extra_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &exynosm1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
  2, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  4, /* reduc_i8_cost  */
  4, /* reduc_i16_cost  */
  4, /* reduc_i32_cost  */
  4, /* reduc_i64_cost  */
  4, /* reduc_f16_cost  */
  4, /* reduc_f32_cost  */
  4, /* reduc_f64_cost  */
  4, /* store_elt_extra_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* align_load_cost  */
  10, /* unalign_load_cost  */
  2, /* unalign_store_cost  */
  2  /* store_cost  */
};

/* X-Gene 1 costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &xgene1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
  4, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  10, /* permute_cost  */
  6, /* reduc_i8_cost  */
  6, /* reduc_i16_cost  */
  6, /* reduc_i32_cost  */
  6, /* reduc_i64_cost  */
  6, /* reduc_f16_cost  */
  6, /* reduc_f32_cost  */
  6, /* reduc_f64_cost  */
  6, /* store_elt_extra_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  1, /* unalign_store_cost  */
  1  /* store_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &thunderx2t99_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
  5, /* int_stmt_cost  */
  5, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  10, /* permute_cost  */
  5, /* reduc_i8_cost  */
  5, /* reduc_i16_cost  */
  5, /* reduc_i32_cost  */
  5, /* reduc_i64_cost  */
  5, /* reduc_f16_cost  */
  5, /* reduc_f32_cost  */
  5, /* reduc_f64_cost  */
  5, /* store_elt_extra_cost  */
  5, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* align_load_cost  */
  4, /* unalign_load_cost  */
  4, /* unalign_store_cost  */
  4  /* store_cost  */
};

static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  5, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &thunderx3t110_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

static const advsimd_vec_cost ampere1_advsimd_vector_cost =
{
  3, /* int_stmt_cost  */
  3, /* fp_stmt_cost  */
  0, /* ld2_st2_permute_cost  */
  0, /* ld3_st3_permute_cost  */
  0, /* ld4_st4_permute_cost  */
  2, /* permute_cost  */
  12, /* reduc_i8_cost  */
  9, /* reduc_i16_cost  */
  6, /* reduc_i32_cost  */
  5, /* reduc_i64_cost  */
  9, /* reduc_f16_cost  */
  6, /* reduc_f32_cost  */
  5, /* reduc_f64_cost  */
  8, /* store_elt_extra_cost  */
  6, /* vec_to_scalar_cost  */
  7, /* scalar_to_vec_cost  */
  5, /* align_load_cost  */
  5, /* unalign_load_cost  */
  2, /* unalign_store_cost  */
  2 /* store_cost  */
};

/* Ampere-1 costs for vector insn classes.  */
static const struct cpu_vector_cost ampere1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* cond_taken_branch_cost  */
  1, /* cond_not_taken_branch_cost  */
  &ampere1_advsimd_vector_cost, /* advsimd  */
  nullptr, /* sve  */
  nullptr /* issue_info  */
};

/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  -1,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  512,			/* l2_cache_size  */
  false,		/* prefetch_dynamic_strides  */
  2048,			/* minimum_stride  */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  16*1024,		/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  0,			/* num_slots  */
  64,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  512,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune a64fx_prefetch_tune =
{
  8,			/* num_slots  */
  64,			/* l1_cache_size  */
  256,			/* l1_cache_line_size  */
  32768,		/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune ampere1_prefetch_tune =
{
  0,			/* num_slots  */
  64,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  2048,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides  */
  -1,			/* minimum_stride  */
  -1			/* default_opt_level  */
};

43e9d192
IB
1326static const struct tune_params generic_tunings =
1327{
4e2cd668 1328 &cortexa57_extra_costs,
43e9d192
IB
1329 &generic_addrcost_table,
1330 &generic_regmove_cost,
8990e73a 1331 &generic_vector_cost,
b9066f5a 1332 &generic_branch_cost,
9acc9cbe 1333 &generic_approx_modes,
2d56d6ba 1334 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1335 { 4, /* load_int. */
1336 4, /* store_int. */
1337 4, /* load_fp. */
1338 4, /* store_fp. */
1339 4, /* load_pred. */
1340 4 /* store_pred. */
1341 }, /* memmov_cost. */
bd95e655 1342 2, /* issue_rate */
6ed8c923 1343 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
4e55aefa 1344 "16:12", /* function_align. */
c518c102
ML
1345 "4", /* jump_align. */
1346 "8", /* loop_align. */
cee66c68
WD
1347 2, /* int_reassoc_width. */
1348 4, /* fp_reassoc_width. */
0c1b0a23 1349 1, /* fma_reassoc_width. */
50093a33
WD
1350 1, /* vec_reassoc_width. */
1351 2, /* min_div_recip_mul_sf. */
dfba575f 1352 2, /* min_div_recip_mul_df. */
50487d79 1353 0, /* max_case_values. */
3b4c0f7e 1354 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
8f0c9d53
KT
1355 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1356 Neoverse V1. It does not have a noticeable effect on A64FX and should
1357 have at most a very minor effect on SVE2 cores. */
1358 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
9d2c6e2e 1359 &generic_prefetch_tune
43e9d192
IB
1360};
1361
1c72a3ca
JG
1362static const struct tune_params cortexa35_tunings =
1363{
1364 &cortexa53_extra_costs,
1365 &generic_addrcost_table,
1366 &cortexa53_regmove_cost,
1367 &generic_vector_cost,
aca97ef8 1368 &generic_branch_cost,
9acc9cbe 1369 &generic_approx_modes,
2d56d6ba 1370 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1371 { 4, /* load_int. */
1372 4, /* store_int. */
1373 4, /* load_fp. */
1374 4, /* store_fp. */
1375 4, /* load_pred. */
1376 4 /* store_pred. */
1377 }, /* memmov_cost. */
1c72a3ca 1378 1, /* issue_rate */
0bc24338 1379 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1c72a3ca 1380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
c518c102
ML
1381 "16", /* function_align. */
1382 "4", /* jump_align. */
1383 "8", /* loop_align. */
1c72a3ca
JG
1384 2, /* int_reassoc_width. */
1385 4, /* fp_reassoc_width. */
0c1b0a23 1386 1, /* fma_reassoc_width. */
1c72a3ca
JG
1387 1, /* vec_reassoc_width. */
1388 2, /* min_div_recip_mul_sf. */
1389 2, /* min_div_recip_mul_df. */
1390 0, /* max_case_values. */
1c72a3ca 1391 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1392 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1393 &generic_prefetch_tune
1c72a3ca
JG
1394};
1395
984239ad
KT
1396static const struct tune_params cortexa53_tunings =
1397{
1398 &cortexa53_extra_costs,
1399 &generic_addrcost_table,
e4a9c55a 1400 &cortexa53_regmove_cost,
984239ad 1401 &generic_vector_cost,
aca97ef8 1402 &generic_branch_cost,
9acc9cbe 1403 &generic_approx_modes,
2d56d6ba 1404 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1405 { 4, /* load_int. */
1406 4, /* store_int. */
1407 4, /* load_fp. */
1408 4, /* store_fp. */
1409 4, /* load_pred. */
1410 4 /* store_pred. */
1411 }, /* memmov_cost. */
bd95e655 1412 2, /* issue_rate */
00a8574a 1413 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
e9a3a175 1414 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
c518c102
ML
1415 "16", /* function_align. */
1416 "4", /* jump_align. */
1417 "8", /* loop_align. */
cee66c68
WD
1418 2, /* int_reassoc_width. */
1419 4, /* fp_reassoc_width. */
0c1b0a23 1420 1, /* fma_reassoc_width. */
50093a33
WD
1421 1, /* vec_reassoc_width. */
1422 2, /* min_div_recip_mul_sf. */
dfba575f 1423 2, /* min_div_recip_mul_df. */
50487d79 1424 0, /* max_case_values. */
2d6bc7fa 1425 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1426 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1427 &generic_prefetch_tune
984239ad
KT
1428};
1429
4fd92af6
KT
1430static const struct tune_params cortexa57_tunings =
1431{
1432 &cortexa57_extra_costs,
a39d4348 1433 &generic_addrcost_table,
e4a9c55a 1434 &cortexa57_regmove_cost,
60bff090 1435 &cortexa57_vector_cost,
aca97ef8 1436 &generic_branch_cost,
9acc9cbe 1437 &generic_approx_modes,
2d56d6ba 1438 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1439 { 4, /* load_int. */
1440 4, /* store_int. */
1441 4, /* load_fp. */
1442 4, /* store_fp. */
1443 4, /* load_pred. */
1444 4 /* store_pred. */
1445 }, /* memmov_cost. */
bd95e655 1446 3, /* issue_rate */
00a8574a 1447 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
e9a3a175 1448 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
c518c102
ML
1449 "16", /* function_align. */
1450 "4", /* jump_align. */
1451 "8", /* loop_align. */
cee66c68
WD
1452 2, /* int_reassoc_width. */
1453 4, /* fp_reassoc_width. */
0c1b0a23 1454 1, /* fma_reassoc_width. */
50093a33
WD
1455 1, /* vec_reassoc_width. */
1456 2, /* min_div_recip_mul_sf. */
dfba575f 1457 2, /* min_div_recip_mul_df. */
50487d79 1458 0, /* max_case_values. */
2d6bc7fa 1459 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1460 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1461 &generic_prefetch_tune
dfba575f
JG
1462};
1463
1464static const struct tune_params cortexa72_tunings =
1465{
1466 &cortexa57_extra_costs,
a39d4348 1467 &generic_addrcost_table,
dfba575f
JG
1468 &cortexa57_regmove_cost,
1469 &cortexa57_vector_cost,
aca97ef8 1470 &generic_branch_cost,
9acc9cbe 1471 &generic_approx_modes,
2d56d6ba 1472 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1473 { 4, /* load_int. */
1474 4, /* store_int. */
1475 4, /* load_fp. */
1476 4, /* store_fp. */
1477 4, /* load_pred. */
1478 4 /* store_pred. */
1479 }, /* memmov_cost. */
dfba575f 1480 3, /* issue_rate */
00a8574a 1481 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
dfba575f 1482 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
c518c102
ML
1483 "16", /* function_align. */
1484 "4", /* jump_align. */
1485 "8", /* loop_align. */
dfba575f
JG
1486 2, /* int_reassoc_width. */
1487 4, /* fp_reassoc_width. */
0c1b0a23 1488 1, /* fma_reassoc_width. */
dfba575f
JG
1489 1, /* vec_reassoc_width. */
1490 2, /* min_div_recip_mul_sf. */
1491 2, /* min_div_recip_mul_df. */
50487d79 1492 0, /* max_case_values. */
0bc24338 1493 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1494 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1495 &generic_prefetch_tune
4fd92af6
KT
1496};
1497
4fb570c4
KT
1498static const struct tune_params cortexa73_tunings =
1499{
1500 &cortexa57_extra_costs,
a39d4348 1501 &generic_addrcost_table,
4fb570c4
KT
1502 &cortexa57_regmove_cost,
1503 &cortexa57_vector_cost,
aca97ef8 1504 &generic_branch_cost,
4fb570c4 1505 &generic_approx_modes,
2d56d6ba 1506 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1507 { 4, /* load_int. */
1508 4, /* store_int. */
1509 4, /* load_fp. */
1510 4, /* store_fp. */
1511 4, /* load_pred. */
1512 4 /* store_pred. */
1513 }, /* memmov_cost. */
4fb570c4
KT
1514 2, /* issue_rate. */
1515 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1516 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
c518c102
ML
1517 "16", /* function_align. */
1518 "4", /* jump_align. */
1519 "8", /* loop_align. */
4fb570c4
KT
1520 2, /* int_reassoc_width. */
1521 4, /* fp_reassoc_width. */
0c1b0a23 1522 1, /* fma_reassoc_width. */
4fb570c4
KT
1523 1, /* vec_reassoc_width. */
1524 2, /* min_div_recip_mul_sf. */
1525 2, /* min_div_recip_mul_df. */
1526 0, /* max_case_values. */
4fb570c4 1527 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1528 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1529 &generic_prefetch_tune
4fb570c4
KT
1530};
1531
9d2c6e2e
MK
1532
1533
5ec1ae3b
EM
1534static const struct tune_params exynosm1_tunings =
1535{
1536 &exynosm1_extra_costs,
1537 &exynosm1_addrcost_table,
1538 &exynosm1_regmove_cost,
1539 &exynosm1_vector_cost,
1540 &generic_branch_cost,
9acc9cbe 1541 &exynosm1_approx_modes,
2d56d6ba 1542 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1543 { 4, /* load_int. */
1544 4, /* store_int. */
1545 4, /* load_fp. */
1546 4, /* store_fp. */
1547 4, /* load_pred. */
1548 4 /* store_pred. */
1549 }, /* memmov_cost. */
5ec1ae3b 1550 3, /* issue_rate */
25cc2199 1551 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
c518c102
ML
1552 "4", /* function_align. */
1553 "4", /* jump_align. */
1554 "4", /* loop_align. */
5ec1ae3b
EM
1555 2, /* int_reassoc_width. */
1556 4, /* fp_reassoc_width. */
0c1b0a23 1557 1, /* fma_reassoc_width. */
5ec1ae3b
EM
1558 1, /* vec_reassoc_width. */
1559 2, /* min_div_recip_mul_sf. */
1560 2, /* min_div_recip_mul_df. */
1561 48, /* max_case_values. */
220379df 1562 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1563 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1564 &exynosm1_prefetch_tune
5ec1ae3b
EM
1565};
1566
f1e247d0
AP
1567static const struct tune_params thunderxt88_tunings =
1568{
1569 &thunderx_extra_costs,
1570 &generic_addrcost_table,
1571 &thunderx_regmove_cost,
1572 &thunderx_vector_cost,
1573 &generic_branch_cost,
1574 &generic_approx_modes,
2d56d6ba 1575 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1576 { 6, /* load_int. */
1577 6, /* store_int. */
1578 6, /* load_fp. */
1579 6, /* store_fp. */
1580 6, /* load_pred. */
1581 6 /* store_pred. */
1582 }, /* memmov_cost. */
f1e247d0 1583 2, /* issue_rate */
a4f3fa71 1584 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
c518c102
ML
1585 "8", /* function_align. */
1586 "8", /* jump_align. */
1587 "8", /* loop_align. */
f1e247d0
AP
1588 2, /* int_reassoc_width. */
1589 4, /* fp_reassoc_width. */
0c1b0a23 1590 1, /* fma_reassoc_width. */
f1e247d0
AP
1591 1, /* vec_reassoc_width. */
1592 2, /* min_div_recip_mul_sf. */
1593 2, /* min_div_recip_mul_df. */
1594 0, /* max_case_values. */
1595 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1596 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1597 &thunderxt88_prefetch_tune
1598};
1599
d1bcc29f
AP
1600static const struct tune_params thunderx_tunings =
1601{
1602 &thunderx_extra_costs,
1603 &generic_addrcost_table,
1604 &thunderx_regmove_cost,
c3f20327 1605 &thunderx_vector_cost,
b9066f5a 1606 &generic_branch_cost,
9acc9cbe 1607 &generic_approx_modes,
2d56d6ba 1608 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1609 { 6, /* load_int. */
1610 6, /* store_int. */
1611 6, /* load_fp. */
1612 6, /* store_fp. */
1613 6, /* load_pred. */
1614 6 /* store_pred. */
1615 }, /* memmov_cost. */
bd95e655 1616 2, /* issue_rate */
a4f3fa71 1617 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
c518c102
ML
1618 "8", /* function_align. */
1619 "8", /* jump_align. */
1620 "8", /* loop_align. */
cee66c68
WD
1621 2, /* int_reassoc_width. */
1622 4, /* fp_reassoc_width. */
0c1b0a23 1623 1, /* fma_reassoc_width. */
50093a33
WD
1624 1, /* vec_reassoc_width. */
1625 2, /* min_div_recip_mul_sf. */
dfba575f 1626 2, /* min_div_recip_mul_df. */
50487d79 1627 0, /* max_case_values. */
2d6bc7fa 1628 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
b10f1009
AP
1629 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1630 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
f1e247d0 1631 &thunderx_prefetch_tune
d1bcc29f
AP
1632};
1633
910f72e7
SZ
1634static const struct tune_params tsv110_tunings =
1635{
1636 &tsv110_extra_costs,
1637 &tsv110_addrcost_table,
1638 &tsv110_regmove_cost,
1639 &tsv110_vector_cost,
1640 &generic_branch_cost,
1641 &generic_approx_modes,
2d56d6ba 1642 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1643 { 4, /* load_int. */
1644 4, /* store_int. */
1645 4, /* load_fp. */
1646 4, /* store_fp. */
1647 4, /* load_pred. */
1648 4 /* store_pred. */
1649 }, /* memmov_cost. */
910f72e7 1650 4, /* issue_rate */
a4f3fa71
WD
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1652 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
910f72e7
SZ
1653 "16", /* function_align. */
1654 "4", /* jump_align. */
1655 "8", /* loop_align. */
1656 2, /* int_reassoc_width. */
1657 4, /* fp_reassoc_width. */
0c1b0a23 1658 1, /* fma_reassoc_width. */
910f72e7
SZ
1659 1, /* vec_reassoc_width. */
1660 2, /* min_div_recip_mul_sf. */
1661 2, /* min_div_recip_mul_df. */
1662 0, /* max_case_values. */
1663 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1664 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1665 &tsv110_prefetch_tune
1666};
1667
381e27aa 1668static const struct tune_params xgene1_tunings =
e02669db
CM
1669{
1670 &xgene1_extra_costs,
1671 &xgene1_addrcost_table,
1672 &xgene1_regmove_cost,
1673 &xgene1_vector_cost,
1674 &generic_branch_cost,
1675 &xgene1_approx_modes,
2d56d6ba 1676 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1677 { 6, /* load_int. */
1678 6, /* store_int. */
1679 6, /* load_fp. */
1680 6, /* store_fp. */
1681 6, /* load_pred. */
1682 6 /* store_pred. */
1683 }, /* memmov_cost. */
e02669db
CM
1684 4, /* issue_rate */
1685 AARCH64_FUSE_NOTHING, /* fusible_ops */
1686 "16", /* function_align. */
1687 "16", /* jump_align. */
1688 "16", /* loop_align. */
1689 2, /* int_reassoc_width. */
1690 4, /* fp_reassoc_width. */
0c1b0a23 1691 1, /* fma_reassoc_width. */
e02669db
CM
1692 1, /* vec_reassoc_width. */
1693 2, /* min_div_recip_mul_sf. */
1694 2, /* min_div_recip_mul_df. */
1695 17, /* max_case_values. */
1696 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1697 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1698 &xgene1_prefetch_tune
1699};
1700
1701static const struct tune_params emag_tunings =
381e27aa
PT
1702{
1703 &xgene1_extra_costs,
1704 &xgene1_addrcost_table,
1705 &xgene1_regmove_cost,
1706 &xgene1_vector_cost,
b9066f5a 1707 &generic_branch_cost,
9acc9cbe 1708 &xgene1_approx_modes,
2d56d6ba 1709 SVE_NOT_IMPLEMENTED,
b074fa69
AV
1710 { 6, /* load_int. */
1711 6, /* store_int. */
1712 6, /* load_fp. */
1713 6, /* store_fp. */
1714 6, /* load_pred. */
1715 6 /* store_pred. */
1716 }, /* memmov_cost. */
bd95e655 1717 4, /* issue_rate */
e9a3a175 1718 AARCH64_FUSE_NOTHING, /* fusible_ops */
c518c102 1719 "16", /* function_align. */
cf28c77e 1720 "16", /* jump_align. */
c518c102 1721 "16", /* loop_align. */
381e27aa
PT
1722 2, /* int_reassoc_width. */
1723 4, /* fp_reassoc_width. */
0c1b0a23 1724 1, /* fma_reassoc_width. */
50093a33
WD
1725 1, /* vec_reassoc_width. */
1726 2, /* min_div_recip_mul_sf. */
dfba575f 1727 2, /* min_div_recip_mul_df. */
cf28c77e 1728 17, /* max_case_values. */
2d6bc7fa 1729 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
9f5361c8 1730 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
d5e9851e 1731 &xgene1_prefetch_tune
381e27aa
PT
1732};
1733
ee446d9f
JW
1734static const struct tune_params qdf24xx_tunings =
1735{
1736 &qdf24xx_extra_costs,
8d39ea2f 1737 &qdf24xx_addrcost_table,
ee446d9f 1738 &qdf24xx_regmove_cost,
e75bc10e 1739 &qdf24xx_vector_cost,
ee446d9f
JW
1740 &generic_branch_cost,
1741 &generic_approx_modes,
2d56d6ba 1742 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1743 { 4, /* load_int. */
1744 4, /* store_int. */
1745 4, /* load_fp. */
1746 4, /* store_fp. */
1747 4, /* load_pred. */
1748 4 /* store_pred. */
1749 }, /* memmov_cost. */
ee446d9f
JW
1750 4, /* issue_rate */
1751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 1752	 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
c518c102
ML
1753 "16", /* function_align. */
1754 "8", /* jump_align. */
1755 "16", /* loop_align. */
ee446d9f
JW
1756 2, /* int_reassoc_width. */
1757 4, /* fp_reassoc_width. */
0c1b0a23 1758 1, /* fma_reassoc_width. */
ee446d9f
JW
1759 1, /* vec_reassoc_width. */
1760 2, /* min_div_recip_mul_sf. */
1761 2, /* min_div_recip_mul_df. */
1762 0, /* max_case_values. */
4f2a94e6 1763 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
a98824ac 1764 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
9d2c6e2e 1765 &qdf24xx_prefetch_tune
ee446d9f
JW
1766};
1767
52ee8191
SP
1768/* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1769 for now. */
1770static const struct tune_params saphira_tunings =
1771{
1772 &generic_extra_costs,
1773 &generic_addrcost_table,
1774 &generic_regmove_cost,
1775 &generic_vector_cost,
1776 &generic_branch_cost,
1777 &generic_approx_modes,
2d56d6ba 1778 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1779 { 4, /* load_int. */
1780 4, /* store_int. */
1781 4, /* load_fp. */
1782 4, /* store_fp. */
1783 4, /* load_pred. */
1784 4 /* store_pred. */
1785 }, /* memmov_cost. */
52ee8191
SP
1786 4, /* issue_rate */
1787 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 1788	 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
c518c102
ML
1789 "16", /* function_align. */
1790 "8", /* jump_align. */
1791 "16", /* loop_align. */
52ee8191
SP
1792 2, /* int_reassoc_width. */
1793 4, /* fp_reassoc_width. */
0c1b0a23 1794 1, /* fma_reassoc_width. */
52ee8191
SP
1795 1, /* vec_reassoc_width. */
1796 2, /* min_div_recip_mul_sf. */
1797 2, /* min_div_recip_mul_df. */
1798 0, /* max_case_values. */
1799 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1800 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1801 &generic_prefetch_tune
1802};
1803
d1261ac6 1804static const struct tune_params thunderx2t99_tunings =
ad611a4c 1805{
d1261ac6
AP
1806 &thunderx2t99_extra_costs,
1807 &thunderx2t99_addrcost_table,
1808 &thunderx2t99_regmove_cost,
1809 &thunderx2t99_vector_cost,
aca97ef8 1810 &generic_branch_cost,
ad611a4c 1811 &generic_approx_modes,
2d56d6ba 1812 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1813 { 4, /* load_int. */
1814 4, /* store_int. */
1815 4, /* load_fp. */
1816 4, /* store_fp. */
1817 4, /* load_pred. */
1818 4 /* store_pred. */
1819 }, /* memmov_cost. */
ad611a4c 1820 4, /* issue_rate. */
a4f3fa71
WD
1821 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1822 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
c518c102
ML
1823 "16", /* function_align. */
1824 "8", /* jump_align. */
1825 "16", /* loop_align. */
ad611a4c
VP
1826 3, /* int_reassoc_width. */
1827 2, /* fp_reassoc_width. */
0c1b0a23 1828 1, /* fma_reassoc_width. */
ad611a4c
VP
1829 2, /* vec_reassoc_width. */
1830 2, /* min_div_recip_mul_sf. */
1831 2, /* min_div_recip_mul_df. */
1832 0, /* max_case_values. */
f1e247d0 1833 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
9d2c6e2e
MK
1834 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1835 &thunderx2t99_prefetch_tune
ad611a4c
VP
1836};
1837
fa477e45
AY
1838static const struct tune_params thunderx3t110_tunings =
1839{
1840 &thunderx3t110_extra_costs,
1841 &thunderx3t110_addrcost_table,
1842 &thunderx3t110_regmove_cost,
1843 &thunderx3t110_vector_cost,
1844 &generic_branch_cost,
1845 &generic_approx_modes,
1846 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1847 { 4, /* load_int. */
1848 4, /* store_int. */
1849 4, /* load_fp. */
1850 4, /* store_fp. */
1851 4, /* load_pred. */
1852 4 /* store_pred. */
1853 }, /* memmov_cost. */
fa477e45
AY
1854 6, /* issue_rate. */
1855 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1856 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1857 "16", /* function_align. */
1858 "8", /* jump_align. */
1859 "16", /* loop_align. */
1860 3, /* int_reassoc_width. */
1861 2, /* fp_reassoc_width. */
0c1b0a23 1862 1, /* fma_reassoc_width. */
fa477e45
AY
1863 2, /* vec_reassoc_width. */
1864 2, /* min_div_recip_mul_sf. */
1865 2, /* min_div_recip_mul_df. */
1866 0, /* max_case_values. */
1867 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1868 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1869 &thunderx3t110_prefetch_tune
1870};
1871
9ed6834d 1872static const struct tune_params neoversen1_tunings =
fc881de2 1873{
5c5a67e6 1874 &cortexa76_extra_costs,
fc881de2
KT
1875 &generic_addrcost_table,
1876 &generic_regmove_cost,
1877 &cortexa57_vector_cost,
1878 &generic_branch_cost,
1879 &generic_approx_modes,
1880 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1881 { 4, /* load_int. */
1882 2, /* store_int. */
1883 5, /* load_fp. */
1884 2, /* store_fp. */
1885 4, /* load_pred. */
1886 4 /* store_pred. */
1887 }, /* memmov_cost. */
fc881de2 1888 3, /* issue_rate */
6ed8c923 1889 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
fc881de2 1890 "32:16", /* function_align. */
3a434597 1891 "4", /* jump_align. */
fc881de2
KT
1892 "32:16", /* loop_align. */
1893 2, /* int_reassoc_width. */
1894 4, /* fp_reassoc_width. */
0c1b0a23 1895 1, /* fma_reassoc_width. */
fc881de2
KT
1896 2, /* vec_reassoc_width. */
1897 2, /* min_div_recip_mul_sf. */
1898 2, /* min_div_recip_mul_df. */
1899 0, /* max_case_values. */
1900 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
b326f495 1901 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
fc881de2
KT
1902 &generic_prefetch_tune
1903};
1904
67b0d47e
PT
1905static const struct tune_params ampere1_tunings =
1906{
1907 &ampere1_extra_costs,
1908 &generic_addrcost_table,
1909 &generic_regmove_cost,
1910 &ampere1_vector_cost,
1911 &generic_branch_cost,
1912 &generic_approx_modes,
1913 SVE_NOT_IMPLEMENTED, /* sve_width */
b074fa69
AV
1914 { 4, /* load_int. */
1915 4, /* store_int. */
1916 4, /* load_fp. */
1917 4, /* store_fp. */
1918 4, /* load_pred. */
1919 4 /* store_pred. */
1920 }, /* memmov_cost. */
67b0d47e
PT
1921 4, /* issue_rate */
1922 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1923 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1924 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1925 AARCH64_FUSE_CMP_BRANCH),
1926 /* fusible_ops */
1927 "32", /* function_align. */
1928 "4", /* jump_align. */
1929 "32:16", /* loop_align. */
1930 2, /* int_reassoc_width. */
1931 4, /* fp_reassoc_width. */
0c1b0a23 1932 1, /* fma_reassoc_width. */
67b0d47e
PT
1933 2, /* vec_reassoc_width. */
1934 2, /* min_div_recip_mul_sf. */
1935 2, /* min_div_recip_mul_df. */
1936 0, /* max_case_values. */
1937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1939 &ampere1_prefetch_tune
1940};
1941
590a06af
PT
1942static const struct tune_params ampere1a_tunings =
1943{
1944 &ampere1a_extra_costs,
1945 &generic_addrcost_table,
1946 &generic_regmove_cost,
1947 &ampere1_vector_cost,
1948 &generic_branch_cost,
1949 &generic_approx_modes,
1950 SVE_NOT_IMPLEMENTED, /* sve_width */
1951 { 4, /* load_int. */
1952 4, /* store_int. */
1953 4, /* load_fp. */
1954 4, /* store_fp. */
1955 4, /* load_pred. */
1956 4 /* store_pred. */
1957 }, /* memmov_cost. */
1958 4, /* issue_rate */
1959 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1960 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1961 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1962 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1963 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1964 /* fusible_ops */
1965 "32", /* function_align. */
1966 "4", /* jump_align. */
1967 "32:16", /* loop_align. */
1968 2, /* int_reassoc_width. */
1969 4, /* fp_reassoc_width. */
0c1b0a23 1970 1, /* fma_reassoc_width. */
590a06af
PT
1971 2, /* vec_reassoc_width. */
1972 2, /* min_div_recip_mul_sf. */
1973 2, /* min_div_recip_mul_df. */
1974 0, /* max_case_values. */
1975 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1976 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1977 &ampere1_prefetch_tune
1978};
1979
14bd21c2
RS
1980static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1981{
1982 2, /* int_stmt_cost */
1983 2, /* fp_stmt_cost */
1984 4, /* ld2_st2_permute_cost */
1985 4, /* ld3_st3_permute_cost */
1986 5, /* ld4_st4_permute_cost */
1987 3, /* permute_cost */
1988 4, /* reduc_i8_cost */
1989 4, /* reduc_i16_cost */
1990 2, /* reduc_i32_cost */
1991 2, /* reduc_i64_cost */
1992 6, /* reduc_f16_cost */
1993 3, /* reduc_f32_cost */
1994 2, /* reduc_f64_cost */
1995 2, /* store_elt_extra_cost */
1996 /* This value is just inherited from the Cortex-A57 table. */
1997 8, /* vec_to_scalar_cost */
1998 /* This depends very much on what the scalar value is and
1999 where it comes from. E.g. some constants take two dependent
2000 instructions or a load, while others might be moved from a GPR.
2001 4 seems to be a reasonable compromise in practice. */
2002 4, /* scalar_to_vec_cost */
2003 4, /* align_load_cost */
2004 4, /* unalign_load_cost */
2005 /* Although stores have a latency of 2 and compete for the
2006 vector pipes, in practice it's better not to model that. */
2007 1, /* unalign_store_cost */
2008 1 /* store_cost */
2009};
2010
2011static const sve_vec_cost neoversev1_sve_vector_cost =
2012{
2013 {
2014 2, /* int_stmt_cost */
2015 2, /* fp_stmt_cost */
2016 4, /* ld2_st2_permute_cost */
2017 7, /* ld3_st3_permute_cost */
2018 8, /* ld4_st4_permute_cost */
2019 3, /* permute_cost */
2020 /* Theoretically, a reduction involving 31 scalar ADDs could
2021 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2022 completes in 14 cycles, so give it a cost of 31 + 5. */
2023 36, /* reduc_i8_cost */
2024 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2025 22, /* reduc_i16_cost */
2026 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2027 14, /* reduc_i32_cost */
2028 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2029 11, /* reduc_i64_cost */
2030 /* Theoretically, a reduction involving 15 scalar FADDs could
2031 complete in ~9 cycles and would have a cost of 30. FADDV
2032 completes in 13 cycles, so give it a cost of 30 + 4. */
2033 34, /* reduc_f16_cost */
2034 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2035 19, /* reduc_f32_cost */
2036 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2037 11, /* reduc_f64_cost */
2038 2, /* store_elt_extra_cost */
2039 /* This value is just inherited from the Cortex-A57 table. */
2040 8, /* vec_to_scalar_cost */
2041 /* See the comment above the Advanced SIMD versions. */
2042 4, /* scalar_to_vec_cost */
2043 4, /* align_load_cost */
2044 4, /* unalign_load_cost */
2045 /* Although stores have a latency of 2 and compete for the
2046 vector pipes, in practice it's better not to model that. */
2047 1, /* unalign_store_cost */
2048 1 /* store_cost */
2049 },
2050 3, /* clast_cost */
2051 19, /* fadda_f16_cost */
2052 11, /* fadda_f32_cost */
2053 8, /* fadda_f64_cost */
78770e0e
RS
2054 32, /* gather_load_x32_cost */
2055 16, /* gather_load_x64_cost */
14bd21c2
RS
2056 3 /* scatter_store_elt_cost */
2057};
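/* To make the pattern behind the reduction comments above explicit: each SVE
   reduction is costed as the equivalent scalar sequence plus the cycle
   difference between the reduction instruction and that sequence.  For
   example, reduc_i8 is 31 + (14 - 9) = 36 and reduc_f32 is
   14 + (11 - 6) = 19.  */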
2058
1205a8ca
RS
2059static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2060{
2061 3, /* loads_stores_per_cycle */
2062 2, /* stores_per_cycle */
2063 4, /* general_ops_per_cycle */
2064 0, /* fp_simd_load_general_ops */
2065 1 /* fp_simd_store_general_ops */
2066};
2067
2068static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2069{
2070 {
2071 3, /* loads_stores_per_cycle */
2072 2, /* stores_per_cycle */
2073 4, /* general_ops_per_cycle */
2074 0, /* fp_simd_load_general_ops */
2075 1 /* fp_simd_store_general_ops */
2076 },
2077 2, /* ld2_st2_general_ops */
2078 2, /* ld3_st3_general_ops */
2079 3 /* ld4_st4_general_ops */
2080};
2081
2082static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2083{
2084 {
2085 {
2086 2, /* loads_per_cycle */
2087 2, /* stores_per_cycle */
2088 2, /* general_ops_per_cycle */
2089 0, /* fp_simd_load_general_ops */
2090 1 /* fp_simd_store_general_ops */
2091 },
2092 2, /* ld2_st2_general_ops */
2093 2, /* ld3_st3_general_ops */
2094 3 /* ld4_st4_general_ops */
2095 },
2096 1, /* pred_ops_per_cycle */
2097 2, /* while_pred_ops */
2098 2, /* int_cmp_pred_ops */
2099 1, /* fp_cmp_pred_ops */
2100 1, /* gather_scatter_pair_general_ops */
2101 1 /* gather_scatter_pair_pred_ops */
2102};
2103
2104static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2105{
2106 &neoversev1_scalar_issue_info,
2107 &neoversev1_advsimd_issue_info,
2108 &neoversev1_sve_issue_info
2109};
2110
14bd21c2
RS
2111/* Neoverse V1 costs for vector insn classes. */
2112static const struct cpu_vector_cost neoversev1_vector_cost =
2113{
2114 1, /* scalar_int_stmt_cost */
2115 2, /* scalar_fp_stmt_cost */
2116 4, /* scalar_load_cost */
2117 1, /* scalar_store_cost */
2118 1, /* cond_taken_branch_cost */
2119 1, /* cond_not_taken_branch_cost */
2120 &neoversev1_advsimd_vector_cost, /* advsimd */
1205a8ca
RS
2121 &neoversev1_sve_vector_cost, /* sve */
2122 &neoversev1_vec_issue_info /* issue_info */
14bd21c2
RS
2123};
2124
c8c77ed7
KT
2125static const struct tune_params neoversev1_tunings =
2126{
5c5a67e6 2127 &cortexa76_extra_costs,
6b8b0c8e 2128 &neoversev1_addrcost_table,
930eb8b6 2129 &neoversev1_regmove_cost,
14bd21c2 2130 &neoversev1_vector_cost,
c8c77ed7
KT
2131 &generic_branch_cost,
2132 &generic_approx_modes,
2133 SVE_256, /* sve_width */
b074fa69 2134 { 4, /* load_int. */
930eb8b6 2135 2, /* store_int. */
b074fa69
AV
2136 6, /* load_fp. */
2137 2, /* store_fp. */
2138 6, /* load_pred. */
2139 1 /* store_pred. */
2140 }, /* memmov_cost. */
c8c77ed7
KT
2141 3, /* issue_rate */
2142 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2143 "32:16", /* function_align. */
2144 "4", /* jump_align. */
2145 "32:16", /* loop_align. */
2146 2, /* int_reassoc_width. */
2147 4, /* fp_reassoc_width. */
0c1b0a23 2148 4, /* fma_reassoc_width. */
c8c77ed7
KT
2149 2, /* vec_reassoc_width. */
2150 2, /* min_div_recip_mul_sf. */
2151 2, /* min_div_recip_mul_df. */
2152 0, /* max_case_values. */
2153 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
14bd21c2 2154 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
3b924b0d 2155 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
c437d334
WD
2156 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2157 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
c8c77ed7
KT
2158 &generic_prefetch_tune
2159};
2160
048039c4
RS
2161static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2162{
2163 {
2164 2, /* int_stmt_cost */
2165 2, /* fp_stmt_cost */
2166 4, /* ld2_st2_permute_cost */
2167 5, /* ld3_st3_permute_cost */
2168 5, /* ld4_st4_permute_cost */
2169 3, /* permute_cost */
2170 /* Theoretically, a reduction involving 15 scalar ADDs could
2171 complete in ~5 cycles and would have a cost of 15. Assume that
2172 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2173 21, /* reduc_i8_cost */
2174 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2175 13, /* reduc_i16_cost */
2176 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2177 9, /* reduc_i32_cost */
2178 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2179 8, /* reduc_i64_cost */
2180 /* Theoretically, a reduction involving 7 scalar FADDs could
2181 complete in ~6 cycles and would have a cost of 14. Assume that
2182 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2183 16, /* reduc_f16_cost */
2184 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2185 8, /* reduc_f32_cost */
2186 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2187 4, /* reduc_f64_cost */
2188 2, /* store_elt_extra_cost */
2189 /* This value is just inherited from the Cortex-A57 table. */
2190 8, /* vec_to_scalar_cost */
2191 /* This depends very much on what the scalar value is and
2192 where it comes from. E.g. some constants take two dependent
2193 instructions or a load, while others might be moved from a GPR.
2194 4 seems to be a reasonable compromise in practice. */
2195 4, /* scalar_to_vec_cost */
2196 4, /* align_load_cost */
2197 4, /* unalign_load_cost */
2198 /* Although stores generally have a latency of 2 and compete for the
2199 vector pipes, in practice it's better not to model that. */
2200 1, /* unalign_store_cost */
2201 1 /* store_cost */
2202 },
2203 3, /* clast_cost */
2204 10, /* fadda_f16_cost */
2205 6, /* fadda_f32_cost */
2206 4, /* fadda_f64_cost */
2207 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2208 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2209 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2210 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2211 (cost 2) to that, to avoid the difference being lost in rounding.
2212
2213 There is no easy comparison between a strided Advanced SIMD x32 load
2214 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2215 operation more than a 64-bit gather. */
2216 14, /* gather_load_x32_cost */
2217 12, /* gather_load_x64_cost */
2218 3 /* scatter_store_elt_cost */
2219};
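/* Spelling out the arithmetic implied by the gather comments above: the
   Advanced SIMD x64 strided load is costed as 2 scalar loads (2 * 4) plus a
   vec_construct (2) plus one full vector operation (2), giving
   gather_load_x64_cost == 12, and the 32-bit gather is one vector operation
   more expensive: 12 + 2 == 14.  */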
2220
2221static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2222{
2223 {
2224 {
2225 3, /* loads_per_cycle */
2226 2, /* stores_per_cycle */
2227 4, /* general_ops_per_cycle */
2228 0, /* fp_simd_load_general_ops */
2229 1 /* fp_simd_store_general_ops */
2230 },
2231 2, /* ld2_st2_general_ops */
2232 2, /* ld3_st3_general_ops */
2233 3 /* ld4_st4_general_ops */
2234 },
2235 2, /* pred_ops_per_cycle */
2236 2, /* while_pred_ops */
2237 2, /* int_cmp_pred_ops */
2238 1, /* fp_cmp_pred_ops */
2239 1, /* gather_scatter_pair_general_ops */
2240 1 /* gather_scatter_pair_pred_ops */
2241};
2242
2243static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2244{
2245 &neoversev1_scalar_issue_info,
2246 &neoversev1_advsimd_issue_info,
2247 &neoverse512tvb_sve_issue_info
2248};
2249
2250static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2251{
2252 1, /* scalar_int_stmt_cost */
2253 2, /* scalar_fp_stmt_cost */
2254 4, /* scalar_load_cost */
2255 1, /* scalar_store_cost */
2256 1, /* cond_taken_branch_cost */
2257 1, /* cond_not_taken_branch_cost */
2258 &neoversev1_advsimd_vector_cost, /* advsimd */
2259 &neoverse512tvb_sve_vector_cost, /* sve */
2260 &neoverse512tvb_vec_issue_info /* issue_info */
2261};
2262
2263static const struct tune_params neoverse512tvb_tunings =
2264{
2265 &cortexa76_extra_costs,
2266 &neoversev1_addrcost_table,
930eb8b6 2267 &neoversev1_regmove_cost,
048039c4
RS
2268 &neoverse512tvb_vector_cost,
2269 &generic_branch_cost,
2270 &generic_approx_modes,
2271 SVE_128 | SVE_256, /* sve_width */
b074fa69 2272 { 4, /* load_int. */
930eb8b6 2273 2, /* store_int. */
b074fa69
AV
2274 6, /* load_fp. */
2275 2, /* store_fp. */
2276 6, /* load_pred. */
2277 1 /* store_pred. */
2278 }, /* memmov_cost. */
048039c4
RS
2279 3, /* issue_rate */
2280 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2281 "32:16", /* function_align. */
2282 "4", /* jump_align. */
2283 "32:16", /* loop_align. */
2284 2, /* int_reassoc_width. */
2285 4, /* fp_reassoc_width. */
0c1b0a23 2286 4, /* fma_reassoc_width. */
048039c4
RS
2287 2, /* vec_reassoc_width. */
2288 2, /* min_div_recip_mul_sf. */
2289 2, /* min_div_recip_mul_df. */
2290 0, /* max_case_values. */
2291 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2292 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2293 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2294 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2295 &generic_prefetch_tune
2296};
2297
a8509301
AV
2298static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2299{
2300 2, /* int_stmt_cost */
2301 2, /* fp_stmt_cost */
2302 2, /* ld2_st2_permute_cost */
2303 2, /* ld3_st3_permute_cost */
2304 3, /* ld4_st4_permute_cost */
2305 3, /* permute_cost */
2306 4, /* reduc_i8_cost */
2307 4, /* reduc_i16_cost */
2308 2, /* reduc_i32_cost */
2309 2, /* reduc_i64_cost */
2310 6, /* reduc_f16_cost */
2311 4, /* reduc_f32_cost */
2312 2, /* reduc_f64_cost */
2313 2, /* store_elt_extra_cost */
2314 /* This value is just inherited from the Cortex-A57 table. */
2315 8, /* vec_to_scalar_cost */
2316 /* This depends very much on what the scalar value is and
2317 where it comes from. E.g. some constants take two dependent
2318 instructions or a load, while others might be moved from a GPR.
2319 4 seems to be a reasonable compromise in practice. */
2320 4, /* scalar_to_vec_cost */
2321 4, /* align_load_cost */
2322 4, /* unalign_load_cost */
2323 /* Although stores have a latency of 2 and compete for the
2324 vector pipes, in practice it's better not to model that. */
2325 1, /* unalign_store_cost */
2326 1 /* store_cost */
2327};
2328
2329static const sve_vec_cost neoversen2_sve_vector_cost =
2330{
2331 {
2332 2, /* int_stmt_cost */
2333 2, /* fp_stmt_cost */
2334 3, /* ld2_st2_permute_cost */
2335 4, /* ld3_st3_permute_cost */
2336 4, /* ld4_st4_permute_cost */
2337 3, /* permute_cost */
2338 /* Theoretically, a reduction involving 15 scalar ADDs could
2339 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2340 completes in 11 cycles, so give it a cost of 15 + 6. */
2341 21, /* reduc_i8_cost */
2342 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2343 13, /* reduc_i16_cost */
2344 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2345 9, /* reduc_i32_cost */
 2346	    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2347 2, /* reduc_i64_cost */
2348 /* Theoretically, a reduction involving 7 scalar FADDs could
2349 complete in ~8 cycles and would have a cost of 14. FADDV
2350 completes in 6 cycles, so give it a cost of 14 - 2. */
2351 12, /* reduc_f16_cost */
2352 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2353 6, /* reduc_f32_cost */
2354 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2355 2, /* reduc_f64_cost */
2356 2, /* store_elt_extra_cost */
2357 /* This value is just inherited from the Cortex-A57 table. */
2358 8, /* vec_to_scalar_cost */
2359 /* See the comment above the Advanced SIMD versions. */
2360 4, /* scalar_to_vec_cost */
2361 4, /* align_load_cost */
2362 4, /* unalign_load_cost */
2363 /* Although stores have a latency of 2 and compete for the
2364 vector pipes, in practice it's better not to model that. */
2365 1, /* unalign_store_cost */
2366 1 /* store_cost */
2367 },
2368 3, /* clast_cost */
2369 10, /* fadda_f16_cost */
2370 6, /* fadda_f32_cost */
2371 4, /* fadda_f64_cost */
2372 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2373 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2374 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2375 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2376 (cost 2) to that, to avoid the difference being lost in rounding.
2377
2378 There is no easy comparison between a strided Advanced SIMD x32 load
2379 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2380 operation more than a 64-bit gather. */
2381 14, /* gather_load_x32_cost */
2382 12, /* gather_load_x64_cost */
2383 3 /* scatter_store_elt_cost */
2384};
2385
2386static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2387{
2388 3, /* loads_stores_per_cycle */
2389 2, /* stores_per_cycle */
2390 4, /* general_ops_per_cycle */
2391 0, /* fp_simd_load_general_ops */
2392 1 /* fp_simd_store_general_ops */
2393};
2394
2395static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2396{
2397 {
2398 3, /* loads_stores_per_cycle */
2399 2, /* stores_per_cycle */
2400 2, /* general_ops_per_cycle */
2401 0, /* fp_simd_load_general_ops */
2402 1 /* fp_simd_store_general_ops */
2403 },
2404 2, /* ld2_st2_general_ops */
2405 2, /* ld3_st3_general_ops */
2406 3 /* ld4_st4_general_ops */
2407};
2408
2409static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2410{
2411 {
2412 {
2413 3, /* loads_per_cycle */
2414 2, /* stores_per_cycle */
2415 2, /* general_ops_per_cycle */
2416 0, /* fp_simd_load_general_ops */
2417 1 /* fp_simd_store_general_ops */
2418 },
2419 2, /* ld2_st2_general_ops */
2420 3, /* ld3_st3_general_ops */
2421 3 /* ld4_st4_general_ops */
2422 },
2423 2, /* pred_ops_per_cycle */
2424 2, /* while_pred_ops */
2425 2, /* int_cmp_pred_ops */
2426 1, /* fp_cmp_pred_ops */
2427 1, /* gather_scatter_pair_general_ops */
2428 1 /* gather_scatter_pair_pred_ops */
2429};
2430
2431static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2432{
2433 &neoversen2_scalar_issue_info,
2434 &neoversen2_advsimd_issue_info,
2435 &neoversen2_sve_issue_info
2436};
2437
2438/* Neoverse N2 costs for vector insn classes. */
2439static const struct cpu_vector_cost neoversen2_vector_cost =
2440{
2441 1, /* scalar_int_stmt_cost */
2442 2, /* scalar_fp_stmt_cost */
2443 4, /* scalar_load_cost */
2444 1, /* scalar_store_cost */
2445 1, /* cond_taken_branch_cost */
2446 1, /* cond_not_taken_branch_cost */
2447 &neoversen2_advsimd_vector_cost, /* advsimd */
2448 &neoversen2_sve_vector_cost, /* sve */
2449 &neoversen2_vec_issue_info /* issue_info */
2450};
2451
25095d1e
KT
2452static const struct tune_params neoversen2_tunings =
2453{
5c5a67e6 2454 &cortexa76_extra_costs,
a8509301
AV
2455 &neoversen2_addrcost_table,
2456 &neoversen2_regmove_cost,
2457 &neoversen2_vector_cost,
25095d1e
KT
2458 &generic_branch_cost,
2459 &generic_approx_modes,
2460 SVE_128, /* sve_width */
b074fa69
AV
2461 { 4, /* load_int. */
2462 1, /* store_int. */
2463 6, /* load_fp. */
2464 2, /* store_fp. */
2465 6, /* load_pred. */
2466 1 /* store_pred. */
2467 }, /* memmov_cost. */
25095d1e
KT
2468 3, /* issue_rate */
2469 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2470 "32:16", /* function_align. */
2471 "4", /* jump_align. */
2472 "32:16", /* loop_align. */
2473 2, /* int_reassoc_width. */
2474 4, /* fp_reassoc_width. */
0c1b0a23 2475 1, /* fma_reassoc_width. */
25095d1e
KT
2476 2, /* vec_reassoc_width. */
2477 2, /* min_div_recip_mul_sf. */
2478 2, /* min_div_recip_mul_df. */
2479 0, /* max_case_values. */
2480 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
a8509301
AV
2481 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2482 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2483 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2484 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
25095d1e
KT
2485 &generic_prefetch_tune
2486};
2487
14d4b4fb 2488static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
27d8748d
AV
2489{
2490 2, /* int_stmt_cost */
2491 2, /* fp_stmt_cost */
2492 2, /* ld2_st2_permute_cost */
2493 2, /* ld3_st3_permute_cost */
2494 3, /* ld4_st4_permute_cost */
2495 3, /* permute_cost */
2496 4, /* reduc_i8_cost */
2497 4, /* reduc_i16_cost */
2498 2, /* reduc_i32_cost */
2499 2, /* reduc_i64_cost */
2500 6, /* reduc_f16_cost */
2501 3, /* reduc_f32_cost */
2502 2, /* reduc_f64_cost */
2503 2, /* store_elt_extra_cost */
2504 /* This value is just inherited from the Cortex-A57 table. */
2505 8, /* vec_to_scalar_cost */
2506 /* This depends very much on what the scalar value is and
2507 where it comes from. E.g. some constants take two dependent
2508 instructions or a load, while others might be moved from a GPR.
2509 4 seems to be a reasonable compromise in practice. */
2510 4, /* scalar_to_vec_cost */
2511 4, /* align_load_cost */
2512 4, /* unalign_load_cost */
2513 /* Although stores have a latency of 2 and compete for the
2514 vector pipes, in practice it's better not to model that. */
2515 1, /* unalign_store_cost */
2516 1 /* store_cost */
2517};
2518
14d4b4fb 2519static const sve_vec_cost neoversev2_sve_vector_cost =
27d8748d
AV
2520{
2521 {
2522 2, /* int_stmt_cost */
2523 2, /* fp_stmt_cost */
2524 3, /* ld2_st2_permute_cost */
2525 3, /* ld3_st3_permute_cost */
2526 4, /* ld4_st4_permute_cost */
2527 3, /* permute_cost */
2528 /* Theoretically, a reduction involving 15 scalar ADDs could
2529 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2530 completes in 11 cycles, so give it a cost of 15 + 8. */
2531 21, /* reduc_i8_cost */
2532 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2533 14, /* reduc_i16_cost */
2534 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2535 7, /* reduc_i32_cost */
 2536	    /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2537 2, /* reduc_i64_cost */
2538 /* Theoretically, a reduction involving 7 scalar FADDs could
2539 complete in ~6 cycles and would have a cost of 14. FADDV
2540 completes in 8 cycles, so give it a cost of 14 + 2. */
2541 16, /* reduc_f16_cost */
2542 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2543 8, /* reduc_f32_cost */
2544 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2545 4, /* reduc_f64_cost */
2546 2, /* store_elt_extra_cost */
2547 /* This value is just inherited from the Cortex-A57 table. */
2548 8, /* vec_to_scalar_cost */
2549 /* See the comment above the Advanced SIMD versions. */
2550 4, /* scalar_to_vec_cost */
2551 4, /* align_load_cost */
2552 4, /* unalign_load_cost */
2553 /* Although stores have a latency of 2 and compete for the
2554 vector pipes, in practice it's better not to model that. */
2555 1, /* unalign_store_cost */
2556 1 /* store_cost */
2557 },
2558 3, /* clast_cost */
2559 10, /* fadda_f16_cost */
2560 6, /* fadda_f32_cost */
2561 4, /* fadda_f64_cost */
2562 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2563 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2564 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2565 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2566 (cost 2) to that, to avoid the difference being lost in rounding.
2567
2568 There is no easy comparison between a strided Advanced SIMD x32 load
2569 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2570 operation more than a 64-bit gather. */
2571 14, /* gather_load_x32_cost */
2572 12, /* gather_load_x64_cost */
2573 3 /* scatter_store_elt_cost */
2574};
2575
14d4b4fb 2576static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
27d8748d
AV
2577{
2578 3, /* loads_stores_per_cycle */
2579 2, /* stores_per_cycle */
2580 6, /* general_ops_per_cycle */
2581 0, /* fp_simd_load_general_ops */
2582 1 /* fp_simd_store_general_ops */
2583};
2584
14d4b4fb 2585static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
27d8748d
AV
2586{
2587 {
2588 3, /* loads_stores_per_cycle */
2589 2, /* stores_per_cycle */
2590 4, /* general_ops_per_cycle */
2591 0, /* fp_simd_load_general_ops */
2592 1 /* fp_simd_store_general_ops */
2593 },
2594 2, /* ld2_st2_general_ops */
2595 2, /* ld3_st3_general_ops */
2596 3 /* ld4_st4_general_ops */
2597};
2598
14d4b4fb 2599static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
27d8748d
AV
2600{
2601 {
2602 {
2603 3, /* loads_per_cycle */
2604 2, /* stores_per_cycle */
2605 4, /* general_ops_per_cycle */
2606 0, /* fp_simd_load_general_ops */
2607 1 /* fp_simd_store_general_ops */
2608 },
2609 2, /* ld2_st2_general_ops */
2610 3, /* ld3_st3_general_ops */
2611 3 /* ld4_st4_general_ops */
2612 },
2613 2, /* pred_ops_per_cycle */
2614 2, /* while_pred_ops */
2615 2, /* int_cmp_pred_ops */
2616 1, /* fp_cmp_pred_ops */
2617 1, /* gather_scatter_pair_general_ops */
2618 1 /* gather_scatter_pair_pred_ops */
2619};
2620
14d4b4fb 2621static const aarch64_vec_issue_info neoversev2_vec_issue_info =
27d8748d 2622{
14d4b4fb
KT
2623 &neoversev2_scalar_issue_info,
2624 &neoversev2_advsimd_issue_info,
2625 &neoversev2_sve_issue_info
27d8748d
AV
2626};
2627
 2628/* Neoverse V2 (formerly code-named Demeter) costs for vector insn classes. */
14d4b4fb 2629static const struct cpu_vector_cost neoversev2_vector_cost =
27d8748d
AV
2630{
2631 1, /* scalar_int_stmt_cost */
2632 2, /* scalar_fp_stmt_cost */
2633 4, /* scalar_load_cost */
2634 1, /* scalar_store_cost */
2635 1, /* cond_taken_branch_cost */
2636 1, /* cond_not_taken_branch_cost */
14d4b4fb
KT
2637 &neoversev2_advsimd_vector_cost, /* advsimd */
2638 &neoversev2_sve_vector_cost, /* sve */
2639 &neoversev2_vec_issue_info /* issue_info */
27d8748d
AV
2640};
2641
14d4b4fb 2642static const struct tune_params neoversev2_tunings =
27d8748d
AV
2643{
2644 &cortexa76_extra_costs,
14d4b4fb
KT
2645 &neoversev2_addrcost_table,
2646 &neoversev2_regmove_cost,
2647 &neoversev2_vector_cost,
27d8748d
AV
2648 &generic_branch_cost,
2649 &generic_approx_modes,
2650 SVE_128, /* sve_width */
2651 { 4, /* load_int. */
2652 2, /* store_int. */
2653 6, /* load_fp. */
2654 1, /* store_fp. */
2655 6, /* load_pred. */
2656 2 /* store_pred. */
2657 }, /* memmov_cost. */
2658 5, /* issue_rate */
2659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2660 "32:16", /* function_align. */
2661 "4", /* jump_align. */
2662 "32:16", /* loop_align. */
2663 3, /* int_reassoc_width. */
2664 6, /* fp_reassoc_width. */
0c1b0a23 2665 4, /* fma_reassoc_width. */
27d8748d
AV
2666 3, /* vec_reassoc_width. */
2667 2, /* min_div_recip_mul_sf. */
2668 2, /* min_div_recip_mul_df. */
2669 0, /* max_case_values. */
2670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2671 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2672 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2673 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2674 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2675 &generic_prefetch_tune
2676};
2677
02f21aea
QJ
2678static const struct tune_params a64fx_tunings =
2679{
3f325179
QJ
2680 &a64fx_extra_costs,
2681 &a64fx_addrcost_table,
2682 &a64fx_regmove_cost,
2683 &a64fx_vector_cost,
02f21aea
QJ
2684 &generic_branch_cost,
2685 &generic_approx_modes,
2686 SVE_512, /* sve_width */
b074fa69
AV
2687 { 4, /* load_int. */
2688 4, /* store_int. */
2689 4, /* load_fp. */
2690 4, /* store_fp. */
2691 4, /* load_pred. */
2692 4 /* store_pred. */
2693 }, /* memmov_cost. */
02f21aea
QJ
2694 7, /* issue_rate */
2695 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2696 "32", /* function_align. */
2697 "16", /* jump_align. */
2698 "32", /* loop_align. */
2699 4, /* int_reassoc_width. */
2700 2, /* fp_reassoc_width. */
0c1b0a23 2701 1, /* fma_reassoc_width. */
02f21aea
QJ
2702 2, /* vec_reassoc_width. */
2703 2, /* min_div_recip_mul_sf. */
2704 2, /* min_div_recip_mul_df. */
2705 0, /* max_case_values. */
2706 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2707 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2708 &a64fx_prefetch_tune
2709};
2710
8dec06f2
JG
2711/* Support for fine-grained override of the tuning structures. */
2712struct aarch64_tuning_override_function
2713{
2714 const char* name;
2715 void (*parse_override)(const char*, struct tune_params*);
2716};
2717
2718static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2719static void aarch64_parse_tune_string (const char*, struct tune_params*);
886f092f 2720static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
8dec06f2
JG
2721
2722static const struct aarch64_tuning_override_function
2723aarch64_tuning_override_functions[] =
2724{
2725 { "fuse", aarch64_parse_fuse_string },
2726 { "tune", aarch64_parse_tune_string },
886f092f 2727 { "sve_width", aarch64_parse_sve_width_string },
8dec06f2
JG
2728 { NULL, NULL }
2729};
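/* As an illustration (the value shown is only an example), this table backs
   the -moverride option: a string such as

     -moverride=sve_width=256

   is split into comma-separated name=value pairs, each name is looked up
   here, and the value is handed to the matching parse_override callback,
   which patches the corresponding field of the active tune_params.  */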
2730
43e9d192
IB
2731/* A processor implementing AArch64. */
2732struct processor
2733{
60dee638
RS
2734 const char *name;
2735 aarch64_processor ident;
2736 aarch64_processor sched_core;
2737 aarch64_arch arch;
fed55a60 2738 aarch64_feature_flags flags;
60dee638 2739 const tune_params *tune;
43e9d192
IB
2740};
2741
393ae126 2742/* Architectures implementing AArch64. */
f95d3d5d 2743static CONSTEXPR const processor all_architectures[] =
393ae126 2744{
11a113d5
RS
2745#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2746 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2747 feature_deps::ARCH_IDENT ().enable, NULL},
393ae126 2748#include "aarch64-arches.def"
ae54c1b0 2749 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
393ae126
KT
2750};
2751
43e9d192
IB
2752/* Processor cores implementing AArch64. */
2753static const struct processor all_cores[] =
2754{
11a113d5
RS
2755#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2756 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2757 feature_deps::cpu_##IDENT, &COSTS##_tunings},
43e9d192 2758#include "aarch64-cores.def"
00c22ba6 2759 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
11a113d5 2760 feature_deps::V8A ().enable, &generic_tunings},
ae54c1b0 2761 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
43e9d192
IB
2762};
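/* As an illustration of the macro expansion above (hypothetical core, not
   present in aarch64-cores.def), an entry such as

     AARCH64_CORE ("foo", foo, cortexa57, V8A, ..., foo, ...)

   would expand to

     {"foo", foo, cortexa57, AARCH64_ARCH_V8A,
      feature_deps::cpu_foo, &foo_tunings},

   so the COSTS argument selects one of the tune_params structures defined
   earlier in this file.  */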
2763
b175b679
JG
2764/* The current tuning set. */
2765struct tune_params aarch64_tune_params = generic_tunings;
2766
c600df9a
RS
2767/* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2768
2769static tree
2770handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2771 int, bool *no_add_attrs)
2772{
2773 /* Since we set fn_type_req to true, the caller should have checked
2774 this for us. */
2775 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2776 switch ((arm_pcs) fntype_abi (*node).id ())
2777 {
2778 case ARM_PCS_AAPCS64:
2779 case ARM_PCS_SIMD:
2780 return NULL_TREE;
2781
2782 case ARM_PCS_SVE:
2783 error ("the %qE attribute cannot be applied to an SVE function type",
2784 name);
2785 *no_add_attrs = true;
2786 return NULL_TREE;
2787
2788 case ARM_PCS_TLSDESC:
2789 case ARM_PCS_UNKNOWN:
2790 break;
2791 }
2792 gcc_unreachable ();
2793}
2794
a0d0b980
SE
2795/* Table of machine attributes. */
2796static const struct attribute_spec aarch64_attribute_table[] =
2797{
2798 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2799 affects_type_identity, handler, exclude } */
c600df9a
RS
2800 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2801 handle_aarch64_vector_pcs_attribute, NULL },
38e62001
RS
2802 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2803 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2804 NULL },
31427b97 2805 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
683e93d1 2806 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
5002dae3 2807 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
a0d0b980
SE
2808 { NULL, 0, 0, false, false, false, false, NULL, NULL }
2809};
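/* As a usage sketch (user-level code, not part of this file), the table
   above is what accepts declarations along the lines of

     void f (float *) __attribute__ ((aarch64_vector_pcs));

   which selects the ARM_PCS_SIMD calling convention, and, together with
   <arm_sve.h> and -msve-vector-bits=256,

     typedef svint32_t fixed_int32_t
       __attribute__ ((arm_sve_vector_bits (256)));

   which defines a fixed-length SVE vector type.  */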
2810
43e9d192
IB
2811/* An ISA extension in the co-processor and main instruction set space. */
2812struct aarch64_option_extension
2813{
2814 const char *const name;
2815 const unsigned long flags_on;
2816 const unsigned long flags_off;
2817};
2818
43e9d192
IB
2819typedef enum aarch64_cond_code
2820{
2821 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2822 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2823 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2824}
2825aarch64_cc;
2826
2827#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
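/* For instance, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: the codes
   above are laid out in inverse pairs, so flipping the low bit selects the
   other member of each pair.  */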
2828
efac62a3
ST
2829struct aarch64_branch_protect_type
2830{
2831 /* The type's name that the user passes to the branch-protection option
2832 string. */
2833 const char* name;
2834 /* Function to handle the protection type and set global variables.
 2835	   First argument is the string token corresponding to this type and the
2836 second argument is the next token in the option string.
2837 Return values:
 2838	   * AARCH64_PARSE_OK: Handling was successful.
 2839	   * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
 2840	     should print an error.
 2841	   * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
2842 own error. */
2843 enum aarch64_parse_opt_result (*handler)(char*, char*);
2844 /* A list of types that can follow this type in the option string. */
2845 const aarch64_branch_protect_type* subtypes;
2846 unsigned int num_subtypes;
2847};
2848
2849static enum aarch64_parse_opt_result
2850aarch64_handle_no_branch_protection (char* str, char* rest)
2851{
2852 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
30afdf34 2853 aarch64_enable_bti = 0;
efac62a3
ST
2854 if (rest)
2855 {
2856 error ("unexpected %<%s%> after %<%s%>", rest, str);
2857 return AARCH64_PARSE_INVALID_FEATURE;
2858 }
2859 return AARCH64_PARSE_OK;
2860}
2861
2862static enum aarch64_parse_opt_result
2863aarch64_handle_standard_branch_protection (char* str, char* rest)
2864{
2865 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
8fc16d72 2866 aarch64_ra_sign_key = AARCH64_KEY_A;
30afdf34 2867 aarch64_enable_bti = 1;
efac62a3
ST
2868 if (rest)
2869 {
2870 error ("unexpected %<%s%> after %<%s%>", rest, str);
2871 return AARCH64_PARSE_INVALID_FEATURE;
2872 }
2873 return AARCH64_PARSE_OK;
2874}
2875
2876static enum aarch64_parse_opt_result
2877aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2878 char* rest ATTRIBUTE_UNUSED)
2879{
2880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
8fc16d72 2881 aarch64_ra_sign_key = AARCH64_KEY_A;
efac62a3
ST
2882 return AARCH64_PARSE_OK;
2883}
2884
2885static enum aarch64_parse_opt_result
2886aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2887 char* rest ATTRIBUTE_UNUSED)
2888{
2889 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2890 return AARCH64_PARSE_OK;
2891}
2892
8fc16d72
ST
2893static enum aarch64_parse_opt_result
2894aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2895 char* rest ATTRIBUTE_UNUSED)
2896{
2897 aarch64_ra_sign_key = AARCH64_KEY_B;
2898 return AARCH64_PARSE_OK;
2899}
2900
30afdf34
SD
2901static enum aarch64_parse_opt_result
2902aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2903 char* rest ATTRIBUTE_UNUSED)
2904{
2905 aarch64_enable_bti = 1;
2906 return AARCH64_PARSE_OK;
2907}
2908
efac62a3
ST
2909static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2910 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
8fc16d72 2911 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
efac62a3
ST
2912 { NULL, NULL, NULL, 0 }
2913};
2914
2915static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2916 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2917 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2918 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2919 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
30afdf34 2920 { "bti", aarch64_handle_bti_protection, NULL, 0 },
efac62a3
ST
2921 { NULL, NULL, NULL, 0 }
2922};
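/* To illustrate how the tables above are walked (each '+'-separated token is
   matched first against a type and then against its subtypes):

     -mbranch-protection=standard        pac-ret on non-leaf functions with
                                         the A key, plus BTI
     -mbranch-protection=pac-ret+leaf    return-address signing on all
                                         functions, including leaf functions
     -mbranch-protection=pac-ret+b-key   signing with the B key instead of
                                         the A key  */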
2923
43e9d192
IB
2924/* The condition codes of the processor, and the inverse function. */
2925static const char * const aarch64_condition_codes[] =
2926{
2927 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2928 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2929};
2930
57d6f4d0
RS
2931/* The preferred condition codes for SVE conditions. */
2932static const char *const aarch64_sve_condition_codes[] =
2933{
2934 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2935 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2936};
2937
0b1fe8cf
RS
2938/* Return the assembly token for svpattern value VALUE. */
2939
2940static const char *
2941svpattern_token (enum aarch64_svpattern pattern)
2942{
2943 switch (pattern)
2944 {
2945#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2946 AARCH64_FOR_SVPATTERN (CASE)
2947#undef CASE
2948 case AARCH64_NUM_SVPATTERNS:
2949 break;
2950 }
2951 gcc_unreachable ();
2952}
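/* For example, svpattern_token (AARCH64_SV_ALL) returns "all", as used in
   assembly such as "ptrue p0.b, all"; the mapping comes straight from the
   AARCH64_FOR_SVPATTERN list.  */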
2953
38e62001
RS
2954/* Return the location of a piece that is known to be passed or returned
2955 in registers. FIRST_ZR is the first unused vector argument register
2956 and FIRST_PR is the first unused predicate argument register. */
2957
2958rtx
2959pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2960 unsigned int first_pr) const
2961{
2962 gcc_assert (VECTOR_MODE_P (mode)
2963 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2964 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2965
2966 if (num_zr > 0 && num_pr == 0)
2967 return gen_rtx_REG (mode, first_zr);
2968
2969 if (num_zr == 0 && num_pr == 1)
2970 return gen_rtx_REG (mode, first_pr);
2971
2972 gcc_unreachable ();
2973}
2974
2975/* Return the total number of vector registers required by the PST. */
2976
2977unsigned int
2978pure_scalable_type_info::num_zr () const
2979{
2980 unsigned int res = 0;
2981 for (unsigned int i = 0; i < pieces.length (); ++i)
2982 res += pieces[i].num_zr;
2983 return res;
2984}
2985
2986/* Return the total number of predicate registers required by the PST. */
2987
2988unsigned int
2989pure_scalable_type_info::num_pr () const
2990{
2991 unsigned int res = 0;
2992 for (unsigned int i = 0; i < pieces.length (); ++i)
2993 res += pieces[i].num_pr;
2994 return res;
2995}
2996
2997/* Return the location of a PST that is known to be passed or returned
2998 in registers. FIRST_ZR is the first unused vector argument register
2999 and FIRST_PR is the first unused predicate argument register. */
3000
3001rtx
3002pure_scalable_type_info::get_rtx (machine_mode mode,
3003 unsigned int first_zr,
3004 unsigned int first_pr) const
3005{
3006 /* Try to return a single REG if possible. This leads to better
3007 code generation; it isn't required for correctness. */
3008 if (mode == pieces[0].mode)
3009 {
3010 gcc_assert (pieces.length () == 1);
3011 return pieces[0].get_rtx (first_zr, first_pr);
3012 }
3013
3014 /* Build up a PARALLEL that contains the individual pieces. */
3015 rtvec rtxes = rtvec_alloc (pieces.length ());
3016 for (unsigned int i = 0; i < pieces.length (); ++i)
3017 {
3018 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3019 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3020 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3021 first_zr += pieces[i].num_zr;
3022 first_pr += pieces[i].num_pr;
3023 }
3024 return gen_rtx_PARALLEL (mode, rtxes);
3025}
3026
3027/* Analyze whether TYPE is a Pure Scalable Type according to the rules
3028 in the AAPCS64. */
3029
3030pure_scalable_type_info::analysis_result
3031pure_scalable_type_info::analyze (const_tree type)
3032{
3033 /* Prevent accidental reuse. */
3034 gcc_assert (pieces.is_empty ());
3035
3036 /* No code will be generated for erroneous types, so we won't establish
3037 an ABI mapping. */
3038 if (type == error_mark_node)
3039 return NO_ABI_IDENTITY;
3040
3041 /* Zero-sized types disappear in the language->ABI mapping. */
3042 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3043 return NO_ABI_IDENTITY;
3044
3045 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
3046 piece p = {};
3047 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3048 {
3049 machine_mode mode = TYPE_MODE_RAW (type);
3050 gcc_assert (VECTOR_MODE_P (mode)
3051 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3052
3053 p.mode = p.orig_mode = mode;
3054 add_piece (p);
3055 return IS_PST;
3056 }
3057
3058 /* Check for user-defined PSTs. */
3059 if (TREE_CODE (type) == ARRAY_TYPE)
3060 return analyze_array (type);
3061 if (TREE_CODE (type) == RECORD_TYPE)
3062 return analyze_record (type);
3063
3064 return ISNT_PST;
3065}
3066
3067/* Analyze a type that is known not to be passed or returned in memory.
3068 Return true if it has an ABI identity and is a Pure Scalable Type. */
3069
3070bool
3071pure_scalable_type_info::analyze_registers (const_tree type)
3072{
3073 analysis_result result = analyze (type);
3074 gcc_assert (result != DOESNT_MATTER);
3075 return result == IS_PST;
3076}
3077
3078/* Subroutine of analyze for handling ARRAY_TYPEs. */
3079
3080pure_scalable_type_info::analysis_result
3081pure_scalable_type_info::analyze_array (const_tree type)
3082{
3083 /* Analyze the element type. */
3084 pure_scalable_type_info element_info;
3085 analysis_result result = element_info.analyze (TREE_TYPE (type));
3086 if (result != IS_PST)
3087 return result;
3088
3089 /* An array of unknown, flexible or variable length will be passed and
3090 returned by reference whatever we do. */
3091 tree nelts_minus_one = array_type_nelts (type);
3092 if (!tree_fits_uhwi_p (nelts_minus_one))
3093 return DOESNT_MATTER;
3094
3095 /* Likewise if the array is constant-sized but too big to be interesting.
3096 The double checks against MAX_PIECES are to protect against overflow. */
3097 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3098 if (count > MAX_PIECES)
3099 return DOESNT_MATTER;
3100 count += 1;
3101 if (count * element_info.pieces.length () > MAX_PIECES)
3102 return DOESNT_MATTER;
3103
3104 /* The above checks should have weeded out elements of unknown size. */
3105 poly_uint64 element_bytes;
3106 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3107 gcc_unreachable ();
3108
3109 /* Build up the list of individual vectors and predicates. */
3110 gcc_assert (!element_info.pieces.is_empty ());
3111 for (unsigned int i = 0; i < count; ++i)
3112 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3113 {
3114 piece p = element_info.pieces[j];
3115 p.offset += i * element_bytes;
3116 add_piece (p);
3117 }
3118 return IS_PST;
3119}
3120
3121/* Subroutine of analyze for handling RECORD_TYPEs. */
3122
3123pure_scalable_type_info::analysis_result
3124pure_scalable_type_info::analyze_record (const_tree type)
3125{
3126 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3127 {
3128 if (TREE_CODE (field) != FIELD_DECL)
3129 continue;
3130
3131 /* Zero-sized fields disappear in the language->ABI mapping. */
3132 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3133 continue;
3134
3135 /* All fields with an ABI identity must be PSTs for the record as
3136 a whole to be a PST. If any individual field is too big to be
3137 interesting then the record is too. */
3138 pure_scalable_type_info field_info;
3139 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3140 if (subresult == NO_ABI_IDENTITY)
3141 continue;
3142 if (subresult != IS_PST)
3143 return subresult;
3144
3145 /* Since all previous fields are PSTs, we ought to be able to track
3146 the field offset using poly_ints. */
3147 tree bitpos = bit_position (field);
3148 gcc_assert (poly_int_tree_p (bitpos));
3149
3150 /* For the same reason, it shouldn't be possible to create a PST field
3151 whose offset isn't byte-aligned. */
3152 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3153 BITS_PER_UNIT);
3154
3155 /* Punt if the record is too big to be interesting. */
3156 poly_uint64 bytepos;
3157 if (!wide_bytepos.to_uhwi (&bytepos)
3158 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3159 return DOESNT_MATTER;
3160
3161 /* Add the individual vectors and predicates in the field to the
3162 record's list. */
3163 gcc_assert (!field_info.pieces.is_empty ());
3164 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3165 {
3166 piece p = field_info.pieces[i];
3167 p.offset += bytepos;
3168 add_piece (p);
3169 }
3170 }
3171 /* Empty structures disappear in the language->ABI mapping. */
3172 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3173}
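/* A worked example (hypothetical user type, assuming <arm_sve.h> is in
   scope):

     struct pst { svfloat32_t v[2]; svbool_t p; };

   is analyzed as a Pure Scalable Type with three pieces: two vector pieces
   for the array elements (via analyze_array) and one predicate piece for
   the svbool_t field, so values of this type can be passed in Z and P
   registers.  */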
3174
3175/* Add P to the list of pieces in the type. */
3176
3177void
3178pure_scalable_type_info::add_piece (const piece &p)
3179{
3180 /* Try to fold the new piece into the previous one to form a
3181 single-mode PST. For example, if we see three consecutive vectors
3182 of the same mode, we can represent them using the corresponding
3183 3-tuple mode.
3184
3185 This is purely an optimization. */
3186 if (!pieces.is_empty ())
3187 {
3188 piece &prev = pieces.last ();
3189 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3190 unsigned int nelems1, nelems2;
3191 if (prev.orig_mode == p.orig_mode
3192 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3193 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3194 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3195 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3196 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3197 && targetm.array_mode (p.orig_mode,
3198 nelems1 + nelems2).exists (&prev.mode))
3199 {
3200 prev.num_zr += p.num_zr;
3201 prev.num_pr += p.num_pr;
3202 return;
3203 }
3204 }
3205 pieces.quick_push (p);
3206}
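/* For instance (assuming the usual SVE tuple modes), two consecutive VNx4SI
   pieces that are adjacent in memory are folded by the code above into a
   single piece with the 2-tuple mode VNx8SI, which can then be returned as
   a single REG by get_rtx.  */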
3207
3208/* Return true if at least one possible value of type TYPE includes at
3209 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3210
3211 This is a relatively expensive test for some types, so it should
3212 generally be made as late as possible. */
3213
3214static bool
3215aarch64_some_values_include_pst_objects_p (const_tree type)
3216{
3217 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3218 return false;
3219
3220 if (aarch64_sve::builtin_type_p (type))
3221 return true;
3222
3223 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3224 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3225
3226 if (RECORD_OR_UNION_TYPE_P (type))
3227 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3228 if (TREE_CODE (field) == FIELD_DECL
3229 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3230 return true;
3231
3232 return false;
3233}
3234
002ffd3c
RS
3235/* Return the descriptor of the SIMD ABI. */
3236
3237static const predefined_function_abi &
3238aarch64_simd_abi (void)
3239{
3240 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3241 if (!simd_abi.initialized_p ())
3242 {
3243 HARD_REG_SET full_reg_clobbers
3244 = default_function_abi.full_reg_clobbers ();
3245 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3246 if (FP_SIMD_SAVED_REGNUM_P (regno))
3247 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3248 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3249 }
3250 return simd_abi;
3251}
3252
c600df9a
RS
3253/* Return the descriptor of the SVE PCS. */
3254
3255static const predefined_function_abi &
3256aarch64_sve_abi (void)
3257{
3258 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3259 if (!sve_abi.initialized_p ())
3260 {
3261 HARD_REG_SET full_reg_clobbers
3262 = default_function_abi.full_reg_clobbers ();
3263 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3264 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
cb26919c 3265 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
c600df9a
RS
3266 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3267 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3268 }
3269 return sve_abi;
3270}
3271
74b27d8e
RS
3272/* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3273 wraps, otherwise return X itself. */
3274
3275static rtx
3276strip_salt (rtx x)
3277{
3278 rtx search = x;
3279 if (GET_CODE (search) == CONST)
3280 search = XEXP (search, 0);
3281 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3282 x = XVECEXP (search, 0, 0);
3283 return x;
3284}
3285
3286/* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3287 expression. */
3288
3289static rtx
3290strip_offset_and_salt (rtx addr, poly_int64 *offset)
3291{
3292 return strip_salt (strip_offset (addr, offset));
3293}
3294
973d2e01
TP
 3295/* Generate a branch sequence that lets a conditional branch reach a target
   that is out of range of a single conditional branch instruction, as can
   happen in functions over 1 MiB. */
3296const char *
3297aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3298 const char * branch_format)
3299{
3300 rtx_code_label * tmp_label = gen_label_rtx ();
3301 char label_buf[256];
3302 char buffer[128];
3303 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3304 CODE_LABEL_NUMBER (tmp_label));
3305 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3306 rtx dest_label = operands[pos_label];
3307 operands[pos_label] = tmp_label;
3308
3309 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3310 output_asm_insn (buffer, operands);
3311
3312 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3313 operands[pos_label] = dest_label;
3314 output_asm_insn (buffer, operands);
3315 return "";
3316}
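/* As a sketch of the output (label name and mnemonics illustrative; the
   caller supplies BRANCH_FORMAT, typically built from the inverted
   condition), a far "cbz x0, dest" ends up as something like:

	cbnz	x0, .Ltmp
	b	dest
   .Ltmp:

   so only the short-range conditional branch needs to reach the local
   label.  */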
3317
261fb553 3318void
fc29dfc9 3319aarch64_err_no_fpadvsimd (machine_mode mode)
261fb553 3320{
261fb553 3321 if (TARGET_GENERAL_REGS_ONLY)
fc29dfc9
SE
3322 if (FLOAT_MODE_P (mode))
3323 error ("%qs is incompatible with the use of floating-point types",
3324 "-mgeneral-regs-only");
3325 else
3326 error ("%qs is incompatible with the use of vector types",
3327 "-mgeneral-regs-only");
261fb553 3328 else
fc29dfc9
SE
3329 if (FLOAT_MODE_P (mode))
3330 error ("%qs feature modifier is incompatible with the use of"
3331 " floating-point types", "+nofp");
3332 else
3333 error ("%qs feature modifier is incompatible with the use of"
3334 " vector types", "+nofp");
261fb553
AL
3335}
3336
c0e0174b
RS
3337/* Report when we try to do something that requires SVE when SVE is disabled.
3338 This is an error of last resort and isn't very high-quality. It usually
3339 involves attempts to measure the vector length in some way. */
3340static void
3341aarch64_report_sve_required (void)
3342{
3343 static bool reported_p = false;
3344
3345 /* Avoid reporting a slew of messages for a single oversight. */
3346 if (reported_p)
3347 return;
3348
3349 error ("this operation requires the SVE ISA extension");
3350 inform (input_location, "you can enable SVE using the command-line"
3351 " option %<-march%>, or by using the %<target%>"
3352 " attribute or pragma");
3353 reported_p = true;
3354}
3355
183bfdaf
RS
3356/* Return true if REGNO is P0-P15 or one of the special FFR-related
3357 registers. */
3358inline bool
3359pr_or_ffr_regnum_p (unsigned int regno)
3360{
3361 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3362}
3363
c64f7d37 3364/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2eb2847e
WD
3365 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3366 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3367 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3368 and GENERAL_REGS is lower than the memory cost (in this case the best class
 3369 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3370 cost results in bad allocations with many redundant int<->FP moves which
3371 are expensive on various cores.
3372 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3373 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3374 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3375 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
31e2b5a3
WD
3376 The result of this is that it is no longer inefficient to have a higher
3377 memory move cost than the register move cost.
3378*/
c64f7d37
WD
3379
3380static reg_class_t
31e2b5a3
WD
3381aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3382 reg_class_t best_class)
c64f7d37 3383{
b8506a8a 3384 machine_mode mode;
c64f7d37 3385
67e5c59a
RS
3386 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3387 || !reg_class_subset_p (FP_REGS, allocno_class))
c64f7d37
WD
3388 return allocno_class;
3389
67e5c59a
RS
3390 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3391 || !reg_class_subset_p (FP_REGS, best_class))
31e2b5a3
WD
3392 return best_class;
3393
c64f7d37
WD
3394 mode = PSEUDO_REGNO_MODE (regno);
3395 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3396}
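/* As a sketch of the effect of the hook above: a DFmode or V4SFmode
   pseudo whose allocno class would otherwise be POINTER_AND_FP_REGS is
   narrowed to FP_REGS, while e.g. an SImode pseudo in the same position
   is narrowed to GENERAL_REGS, per the FLOAT_MODE_P/VECTOR_MODE_P test
   at the end of the function.  */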
3397
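/* Return the minimum number of divisions by the same divisor that makes
   it worthwhile to use a reciprocal multiply instead, taken from the
   per-CPU tuning parameters (presumably wired up as
   TARGET_MIN_DIVISIONS_FOR_RECIP_MUL elsewhere in this file).  */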
26e0ff94 3398static unsigned int
b8506a8a 3399aarch64_min_divisions_for_recip_mul (machine_mode mode)
26e0ff94 3400{
50093a33 3401 if (GET_MODE_UNIT_SIZE (mode) == 4)
b175b679
JG
3402 return aarch64_tune_params.min_div_recip_mul_sf;
3403 return aarch64_tune_params.min_div_recip_mul_df;
26e0ff94
WD
3404}
3405
b5b33e11 3406/* Return the reassociation width of treeop OPC with mode MODE. */
cee66c68 3407static int
b5b33e11 3408aarch64_reassociation_width (unsigned opc, machine_mode mode)
cee66c68
WD
3409{
3410 if (VECTOR_MODE_P (mode))
b175b679 3411 return aarch64_tune_params.vec_reassoc_width;
cee66c68 3412 if (INTEGRAL_MODE_P (mode))
b175b679 3413 return aarch64_tune_params.int_reassoc_width;
0c1b0a23
WD
3414 /* Reassociation reduces the number of FMAs which may result in worse
3415 performance. Use a per-CPU setting for FMA reassociation which allows
3416 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3417 CPUs with many FP pipes to enable reassociation.
3418 Since the reassociation pass doesn't understand FMA at all, assume
3419 that any FP addition might turn into FMA. */
3420 if (FLOAT_MODE_P (mode))
3421 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3422 : aarch64_tune_params.fp_reassoc_width;
cee66c68
WD
3423 return 1;
3424}
3425
43e9d192
IB
3426/* Provide a mapping from gcc register numbers to dwarf register numbers. */
3427unsigned
ca60bd93 3428aarch64_debugger_regno (unsigned regno)
43e9d192
IB
3429{
3430 if (GP_REGNUM_P (regno))
3431 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3432 else if (regno == SP_REGNUM)
3433 return AARCH64_DWARF_SP;
3434 else if (FP_REGNUM_P (regno))
3435 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
43cacb12
RS
3436 else if (PR_REGNUM_P (regno))
3437 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3438 else if (regno == VG_REGNUM)
3439 return AARCH64_DWARF_VG;
43e9d192
IB
3440
3441 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3442 equivalent DWARF register. */
3443 return DWARF_FRAME_REGISTERS;
3444}
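/* Example mappings from the function above: x7 -> AARCH64_DWARF_R0 + 7,
   sp -> AARCH64_DWARF_SP, v3 -> AARCH64_DWARF_V0 + 3 and
   p2 -> AARCH64_DWARF_P0 + 2 (under the AArch64 DWARF ABI these bases
   are expected to be 0, 31, 64 and 48 respectively).  Anything else,
   e.g. the condition flags, yields DWARF_FRAME_REGISTERS, i.e. "no
   DWARF equivalent".  */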
3445
eb499454
RS
3446/* Implement TARGET_DWARF_FRAME_REG_MODE. */
3447static machine_mode
3448aarch64_dwarf_frame_reg_mode (int regno)
3449{
3450 /* Predicate registers are call-clobbered in the EH ABI (which is
3451 ARM_PCS_AAPCS64), so they should not be described by CFI.
3452 Their size changes as VL changes, so any values computed by
3453 __builtin_init_dwarf_reg_size_table might not be valid for
3454 all frames. */
3455 if (PR_REGNUM_P (regno))
3456 return VOIDmode;
3457 return default_dwarf_frame_reg_mode (regno);
3458}
3459
d29f7dd5
RS
3460/* If X is a CONST_DOUBLE, return its bit representation as a constant
3461 integer, otherwise return X unmodified. */
3462static rtx
3463aarch64_bit_representation (rtx x)
3464{
3465 if (CONST_DOUBLE_P (x))
3466 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3467 return x;
3468}
3469
3b924b0d
RS
3470/* Return an estimate for the number of quadwords in an SVE vector. This is
3471 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3472static unsigned int
3473aarch64_estimated_sve_vq ()
3474{
3475 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3476}
3477
43cacb12
RS
3478/* Return true if MODE is an SVE predicate mode. */
3479static bool
3480aarch64_sve_pred_mode_p (machine_mode mode)
3481{
3482 return (TARGET_SVE
3483 && (mode == VNx16BImode
3484 || mode == VNx8BImode
3485 || mode == VNx4BImode
3486 || mode == VNx2BImode));
3487}
3488
3489/* Three mutually-exclusive flags describing a vector or predicate type. */
3490const unsigned int VEC_ADVSIMD = 1;
3491const unsigned int VEC_SVE_DATA = 2;
3492const unsigned int VEC_SVE_PRED = 4;
3493/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3494 a structure of 2, 3 or 4 vectors. */
3495const unsigned int VEC_STRUCT = 8;
550a3380
RS
3496/* Can be used in combination with VEC_SVE_DATA to indicate that the
3497 vector has fewer significant bytes than a full SVE vector. */
3498const unsigned int VEC_PARTIAL = 16;
43cacb12
RS
3499/* Useful combinations of the above. */
3500const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3501const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3502
3503/* Return a set of flags describing the vector properties of mode MODE.
3504 Ignore modes that are not supported by the current target. */
3505static unsigned int
3506aarch64_classify_vector_mode (machine_mode mode)
3507{
43cacb12
RS
3508 if (aarch64_sve_pred_mode_p (mode))
3509 return VEC_SVE_PRED;
3510
806f69cd
RS
3511 /* Make the decision based on the mode's enum value rather than its
3512 properties, so that we keep the correct classification regardless
3513 of -msve-vector-bits. */
3514 switch (mode)
43cacb12 3515 {
550a3380
RS
3516 /* Partial SVE QI vectors. */
3517 case E_VNx2QImode:
3518 case E_VNx4QImode:
3519 case E_VNx8QImode:
3520 /* Partial SVE HI vectors. */
3521 case E_VNx2HImode:
3522 case E_VNx4HImode:
3523 /* Partial SVE SI vector. */
3524 case E_VNx2SImode:
cc68f7c2
RS
3525 /* Partial SVE HF vectors. */
3526 case E_VNx2HFmode:
3527 case E_VNx4HFmode:
6c3ce63b
RS
3528 /* Partial SVE BF vectors. */
3529 case E_VNx2BFmode:
3530 case E_VNx4BFmode:
cc68f7c2
RS
3531 /* Partial SVE SF vector. */
3532 case E_VNx2SFmode:
550a3380
RS
3533 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3534
806f69cd
RS
3535 case E_VNx16QImode:
3536 case E_VNx8HImode:
3537 case E_VNx4SImode:
3538 case E_VNx2DImode:
02fcd8ac 3539 case E_VNx8BFmode:
806f69cd
RS
3540 case E_VNx8HFmode:
3541 case E_VNx4SFmode:
3542 case E_VNx2DFmode:
3543 return TARGET_SVE ? VEC_SVE_DATA : 0;
3544
3545 /* x2 SVE vectors. */
3546 case E_VNx32QImode:
3547 case E_VNx16HImode:
3548 case E_VNx8SImode:
3549 case E_VNx4DImode:
02fcd8ac 3550 case E_VNx16BFmode:
806f69cd
RS
3551 case E_VNx16HFmode:
3552 case E_VNx8SFmode:
3553 case E_VNx4DFmode:
3554 /* x3 SVE vectors. */
3555 case E_VNx48QImode:
3556 case E_VNx24HImode:
3557 case E_VNx12SImode:
3558 case E_VNx6DImode:
02fcd8ac 3559 case E_VNx24BFmode:
806f69cd
RS
3560 case E_VNx24HFmode:
3561 case E_VNx12SFmode:
3562 case E_VNx6DFmode:
3563 /* x4 SVE vectors. */
3564 case E_VNx64QImode:
3565 case E_VNx32HImode:
3566 case E_VNx16SImode:
3567 case E_VNx8DImode:
02fcd8ac 3568 case E_VNx32BFmode:
806f69cd
RS
3569 case E_VNx32HFmode:
3570 case E_VNx16SFmode:
3571 case E_VNx8DFmode:
3572 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3573
66f206b8
JW
3574 case E_OImode:
3575 case E_CImode:
3576 case E_XImode:
721c0fb3 3577 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
66f206b8
JW
3578
3579 /* Structures of 64-bit Advanced SIMD vectors. */
3580 case E_V2x8QImode:
3581 case E_V2x4HImode:
3582 case E_V2x2SImode:
3583 case E_V2x1DImode:
3584 case E_V2x4BFmode:
3585 case E_V2x4HFmode:
3586 case E_V2x2SFmode:
3587 case E_V2x1DFmode:
3588 case E_V3x8QImode:
3589 case E_V3x4HImode:
3590 case E_V3x2SImode:
3591 case E_V3x1DImode:
3592 case E_V3x4BFmode:
3593 case E_V3x4HFmode:
3594 case E_V3x2SFmode:
3595 case E_V3x1DFmode:
3596 case E_V4x8QImode:
3597 case E_V4x4HImode:
3598 case E_V4x2SImode:
3599 case E_V4x1DImode:
3600 case E_V4x4BFmode:
3601 case E_V4x4HFmode:
3602 case E_V4x2SFmode:
3603 case E_V4x1DFmode:
721c0fb3 3604 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
66f206b8
JW
3605
3606 /* Structures of 128-bit Advanced SIMD vectors. */
3607 case E_V2x16QImode:
3608 case E_V2x8HImode:
3609 case E_V2x4SImode:
3610 case E_V2x2DImode:
3611 case E_V2x8BFmode:
3612 case E_V2x8HFmode:
3613 case E_V2x4SFmode:
3614 case E_V2x2DFmode:
3615 case E_V3x16QImode:
3616 case E_V3x8HImode:
3617 case E_V3x4SImode:
3618 case E_V3x2DImode:
3619 case E_V3x8BFmode:
3620 case E_V3x8HFmode:
3621 case E_V3x4SFmode:
3622 case E_V3x2DFmode:
3623 case E_V4x16QImode:
3624 case E_V4x8HImode:
3625 case E_V4x4SImode:
3626 case E_V4x2DImode:
3627 case E_V4x8BFmode:
3628 case E_V4x8HFmode:
3629 case E_V4x4SFmode:
3630 case E_V4x2DFmode:
721c0fb3 3631 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
66f206b8 3632
806f69cd
RS
3633 /* 64-bit Advanced SIMD vectors. */
3634 case E_V8QImode:
3635 case E_V4HImode:
3636 case E_V2SImode:
5ba864c5 3637 case E_V1DImode:
806f69cd 3638 case E_V4HFmode:
abbe1ed2 3639 case E_V4BFmode:
806f69cd
RS
3640 case E_V2SFmode:
3641 case E_V1DFmode:
3642 /* 128-bit Advanced SIMD vectors. */
3643 case E_V16QImode:
3644 case E_V8HImode:
3645 case E_V4SImode:
3646 case E_V2DImode:
3647 case E_V8HFmode:
abbe1ed2 3648 case E_V8BFmode:
806f69cd
RS
3649 case E_V4SFmode:
3650 case E_V2DFmode:
721c0fb3 3651 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
806f69cd
RS
3652
3653 default:
3654 return 0;
43cacb12 3655 }
43cacb12
RS
3656}
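/* Some sample classifications implied by the function above, assuming
   the relevant target features are enabled:

     V16QImode    -> VEC_ADVSIMD
     V2x16QImode  -> VEC_ADVSIMD | VEC_STRUCT
     V2x8QImode   -> VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL
     VNx16QImode  -> VEC_SVE_DATA
     VNx2SImode   -> VEC_SVE_DATA | VEC_PARTIAL
     VNx32QImode  -> VEC_SVE_DATA | VEC_STRUCT
     VNx16BImode  -> VEC_SVE_PRED  */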
3657
66f206b8
JW
3658/* Return true if MODE is any of the Advanced SIMD structure modes. */
3659bool
3660aarch64_advsimd_struct_mode_p (machine_mode mode)
3661{
3662 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3663 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3664}
3665
3666/* Return true if MODE is an Advanced SIMD D-register structure mode. */
3667static bool
3668aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3669{
3670 return (aarch64_classify_vector_mode (mode)
3671 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3672}
3673
3674/* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3675static bool
3676aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3677{
3678 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3679}
3680
43cacb12
RS
3681/* Return true if MODE is any of the data vector modes, including
3682 structure modes. */
43e9d192 3683static bool
43cacb12 3684aarch64_vector_data_mode_p (machine_mode mode)
43e9d192 3685{
43cacb12 3686 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
43e9d192
IB
3687}
3688
5c38705d
RS
3689/* Return true if MODE is any form of SVE mode, including predicates,
3690 vectors and structures. */
3691bool
3692aarch64_sve_mode_p (machine_mode mode)
3693{
3694 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3695}
3696
43cacb12
RS
3697/* Return true if MODE is an SVE data vector mode; either a single vector
3698 or a structure of vectors. */
43e9d192 3699static bool
43cacb12 3700aarch64_sve_data_mode_p (machine_mode mode)
43e9d192 3701{
43cacb12 3702 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
43e9d192
IB
3703}
3704
550a3380
RS
3705/* Return the number of defined bytes in one constituent vector of
3706 SVE mode MODE, which has vector flags VEC_FLAGS. */
3707static poly_int64
3708aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3709{
3710 if (vec_flags & VEC_PARTIAL)
3711 /* A single partial vector. */
3712 return GET_MODE_SIZE (mode);
3713
3714 if (vec_flags & VEC_SVE_DATA)
3715 /* A single vector or a tuple. */
3716 return BYTES_PER_SVE_VECTOR;
3717
3718 /* A single predicate. */
3719 gcc_assert (vec_flags & VEC_SVE_PRED);
3720 return BYTES_PER_SVE_PRED;
3721}
3722
05783fe6
RS
3723/* If MODE holds an array of vectors, return the number of vectors
3724 in the array, otherwise return 1. */
3725
3726static unsigned int
3727aarch64_ldn_stn_vectors (machine_mode mode)
3728{
3729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3730 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3731 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3732 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3733 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3734 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3735 return exact_div (GET_MODE_SIZE (mode),
3736 BYTES_PER_SVE_VECTOR).to_constant ();
3737 return 1;
3738}
3739
66f206b8
JW
3740/* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3741 corresponding vector structure mode. */
3742static opt_machine_mode
3743aarch64_advsimd_vector_array_mode (machine_mode mode,
3744 unsigned HOST_WIDE_INT nelems)
3745{
3746 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3747 if (known_eq (GET_MODE_SIZE (mode), 8))
3748 flags |= VEC_PARTIAL;
3749
3750 machine_mode struct_mode;
3751 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3752 if (aarch64_classify_vector_mode (struct_mode) == flags
3753 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3754 && known_eq (GET_MODE_NUNITS (struct_mode),
3755 GET_MODE_NUNITS (mode) * nelems))
3756 return struct_mode;
3757 return opt_machine_mode ();
3758}
3759
3760/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3761
3762opt_machine_mode
3763aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3764{
3765 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3766 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3767 machine_mode mode;
3768 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3769 if (inner_mode == GET_MODE_INNER (mode)
3770 && known_eq (nunits, GET_MODE_NUNITS (mode))
3771 && aarch64_sve_data_mode_p (mode))
3772 return mode;
3773 return opt_machine_mode ();
3774}
3775
9f4cbab8
RS
3776/* Implement target hook TARGET_ARRAY_MODE. */
3777static opt_machine_mode
3778aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3779{
3780 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3781 && IN_RANGE (nelems, 2, 4))
66f206b8
JW
3782 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3783 GET_MODE_NUNITS (mode) * nelems);
3784 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3785 && IN_RANGE (nelems, 2, 4))
3786 return aarch64_advsimd_vector_array_mode (mode, nelems);
9f4cbab8
RS
3787
3788 return opt_machine_mode ();
3789}
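/* For instance, aarch64_array_mode (V4SImode, 3) yields V3x4SImode and
   aarch64_array_mode (VNx4SImode, 2) yields VNx8SImode; element counts
   outside the 2..4 range, or unsupported vector modes, give back an
   empty opt_machine_mode.  */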
3790
43e9d192
IB
3791/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3792static bool
ef4bddc2 3793aarch64_array_mode_supported_p (machine_mode mode,
43e9d192
IB
3794 unsigned HOST_WIDE_INT nelems)
3795{
3796 if (TARGET_SIMD
635e66fe
AL
3797 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3798 || AARCH64_VALID_SIMD_DREG_MODE (mode))
43e9d192
IB
3799 && (nelems >= 2 && nelems <= 4))
3800 return true;
3801
3802 return false;
3803}
3804
cc68f7c2
RS
3805/* MODE is some form of SVE vector mode. For data modes, return the number
3806 of vector register bits that each element of MODE occupies, such as 64
3807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3808 in a 64-bit container). For predicate modes, return the number of
3809 data bits controlled by each significant predicate bit. */
3810
3811static unsigned int
3812aarch64_sve_container_bits (machine_mode mode)
3813{
3814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3815 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3816 ? BITS_PER_SVE_VECTOR
3817 : GET_MODE_BITSIZE (mode));
3818 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3819}
3820
43cacb12
RS
3821/* Return the SVE predicate mode to use for elements that have
3822 ELEM_NBYTES bytes, if such a mode exists. */
3823
3824opt_machine_mode
3825aarch64_sve_pred_mode (unsigned int elem_nbytes)
3826{
3827 if (TARGET_SVE)
3828 {
3829 if (elem_nbytes == 1)
3830 return VNx16BImode;
3831 if (elem_nbytes == 2)
3832 return VNx8BImode;
3833 if (elem_nbytes == 4)
3834 return VNx4BImode;
3835 if (elem_nbytes == 8)
3836 return VNx2BImode;
3837 }
3838 return opt_machine_mode ();
3839}
3840
cc68f7c2
RS
3841/* Return the SVE predicate mode that should be used to control
3842 SVE mode MODE. */
3843
3844machine_mode
3845aarch64_sve_pred_mode (machine_mode mode)
3846{
3847 unsigned int bits = aarch64_sve_container_bits (mode);
3848 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3849}
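/* For example, a full VNx4SImode vector is controlled by VNx4BImode,
   whereas the partial VNx2SImode vector (32-bit elements held in 64-bit
   containers) is controlled by VNx2BImode, since each significant
   predicate bit has to cover a whole container.  */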
3850
43cacb12
RS
3851/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3852
3853static opt_machine_mode
10116ec1 3854aarch64_get_mask_mode (machine_mode mode)
43cacb12 3855{
10116ec1
RS
3856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3857 if (vec_flags & VEC_SVE_DATA)
cc68f7c2 3858 return aarch64_sve_pred_mode (mode);
43cacb12 3859
10116ec1 3860 return default_get_mask_mode (mode);
43cacb12
RS
3861}
3862
1044fa32
RS
3863/* Return the integer element mode associated with SVE mode MODE. */
3864
3865static scalar_int_mode
3866aarch64_sve_element_int_mode (machine_mode mode)
3867{
cc68f7c2
RS
3868 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3869 ? BITS_PER_SVE_VECTOR
3870 : GET_MODE_BITSIZE (mode));
3871 unsigned int elt_bits = vector_element_size (vector_bits,
1044fa32
RS
3872 GET_MODE_NUNITS (mode));
3873 return int_mode_for_size (elt_bits, 0).require ();
3874}
3875
cc68f7c2
RS
3876/* Return an integer element mode that contains exactly
3877 aarch64_sve_container_bits (MODE) bits. This is wider than
3878 aarch64_sve_element_int_mode if MODE is a partial vector,
3879 otherwise it's the same. */
3880
3881static scalar_int_mode
3882aarch64_sve_container_int_mode (machine_mode mode)
3883{
3884 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3885}
3886
d7a09c44 3887/* Return the integer vector mode associated with SVE mode MODE.
d083ee47 3888 Unlike related_int_vector_mode, this can handle the case in which
d7a09c44
RS
3889 MODE is a predicate (and thus has a different total size). */
3890
624d0f07 3891machine_mode
d7a09c44
RS
3892aarch64_sve_int_mode (machine_mode mode)
3893{
3894 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3895 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3896}
3897
74166aab
RS
3898/* Implement TARGET_VECTORIZE_RELATED_MODE. */
3899
3900static opt_machine_mode
3901aarch64_vectorize_related_mode (machine_mode vector_mode,
3902 scalar_mode element_mode,
3903 poly_uint64 nunits)
3904{
3905 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3906
cc68f7c2
RS
3907 /* If we're operating on SVE vectors, try to return an SVE mode. */
3908 poly_uint64 sve_nunits;
3909 if ((vec_flags & VEC_SVE_DATA)
3910 && multiple_p (BYTES_PER_SVE_VECTOR,
3911 GET_MODE_SIZE (element_mode), &sve_nunits))
3912 {
3913 machine_mode sve_mode;
3914 if (maybe_ne (nunits, 0U))
3915 {
3916 /* Try to find a full or partial SVE mode with exactly
3917 NUNITS units. */
3918 if (multiple_p (sve_nunits, nunits)
3919 && aarch64_sve_data_mode (element_mode,
3920 nunits).exists (&sve_mode))
3921 return sve_mode;
3922 }
3923 else
3924 {
3925 /* Take the preferred number of units from the number of bytes
3926 that fit in VECTOR_MODE. We always start by "autodetecting"
3927 a full vector mode with preferred_simd_mode, so vectors
3928 chosen here will also be full vector modes. Then
3929 autovectorize_vector_modes tries smaller starting modes
3930 and thus smaller preferred numbers of units. */
3931 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3932 if (aarch64_sve_data_mode (element_mode,
3933 sve_nunits).exists (&sve_mode))
3934 return sve_mode;
3935 }
3936 }
3937
74166aab 3938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
721c0fb3
RS
3939 if (TARGET_SIMD
3940 && (vec_flags & VEC_ADVSIMD)
74166aab
RS
3941 && known_eq (nunits, 0U)
3942 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3943 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3944 * GET_MODE_NUNITS (vector_mode), 128U))
3945 {
3946 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3947 if (VECTOR_MODE_P (res))
3948 return res;
3949 }
3950
3951 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3952}
3953
b41d1f6e
RS
3954/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3955 prefer to use the first arithmetic operand as the else value if
3956 the else value doesn't matter, since that exactly matches the SVE
3957 destructive merging form. For ternary operations we could either
3958 pick the first operand and use FMAD-like instructions or the last
3959 operand and use FMLA-like instructions; the latter seems more
3960 natural. */
6a86928d
RS
3961
3962static tree
b41d1f6e 3963aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
6a86928d 3964{
b41d1f6e 3965 return nops == 3 ? ops[2] : ops[0];
6a86928d
RS
3966}
3967
c43f4279 3968/* Implement TARGET_HARD_REGNO_NREGS. */
43e9d192 3969
c43f4279 3970static unsigned int
ef4bddc2 3971aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
43e9d192 3972{
6a70badb
RS
3973 /* ??? Logically we should only need to provide a value when
3974 HARD_REGNO_MODE_OK says that the combination is valid,
3975 but at the moment we need to handle all modes. Just ignore
3976 any runtime parts for registers that can't store them. */
3977 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
43e9d192
IB
3978 switch (aarch64_regno_regclass (regno))
3979 {
3980 case FP_REGS:
3981 case FP_LO_REGS:
163b1f6a 3982 case FP_LO8_REGS:
550a3380
RS
3983 {
3984 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3985 if (vec_flags & VEC_SVE_DATA)
3986 return exact_div (GET_MODE_SIZE (mode),
3987 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
66f206b8
JW
3988 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3989 return GET_MODE_SIZE (mode).to_constant () / 8;
550a3380
RS
3990 return CEIL (lowest_size, UNITS_PER_VREG);
3991 }
43cacb12
RS
3992 case PR_REGS:
3993 case PR_LO_REGS:
3994 case PR_HI_REGS:
183bfdaf
RS
3995 case FFR_REGS:
3996 case PR_AND_FFR_REGS:
43cacb12 3997 return 1;
43e9d192 3998 default:
6a70badb 3999 return CEIL (lowest_size, UNITS_PER_WORD);
43e9d192
IB
4000 }
4001 gcc_unreachable ();
4002}
4003
f939c3e6 4004/* Implement TARGET_HARD_REGNO_MODE_OK. */
43e9d192 4005
f939c3e6 4006static bool
ef4bddc2 4007aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
43e9d192 4008{
dd159a41
PW
4009 if (mode == V8DImode)
4010 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
4011 && multiple_p (regno - R0_REGNUM, 2);
4012
43e9d192
IB
4013 if (GET_MODE_CLASS (mode) == MODE_CC)
4014 return regno == CC_REGNUM;
4015
43cacb12
RS
4016 if (regno == VG_REGNUM)
4017 /* This must have the same size as _Unwind_Word. */
4018 return mode == DImode;
4019
4020 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4021 if (vec_flags & VEC_SVE_PRED)
183bfdaf 4022 return pr_or_ffr_regnum_p (regno);
43cacb12 4023
183bfdaf
RS
4024 if (pr_or_ffr_regnum_p (regno))
4025 return false;
43cacb12 4026
9259db42
YZ
4027 if (regno == SP_REGNUM)
4028 /* The purpose of comparing with ptr_mode is to support the
4029 global register variable associated with the stack pointer
4030 register via the syntax of asm ("wsp") in ILP32. */
4031 return mode == Pmode || mode == ptr_mode;
4032
4033 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
43e9d192
IB
4034 return mode == Pmode;
4035
563cc649
RH
4036 if (GP_REGNUM_P (regno))
4037 {
721c0fb3 4038 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
aa1a2795 4039 return false;
563cc649
RH
4040 if (known_le (GET_MODE_SIZE (mode), 8))
4041 return true;
aa1a2795 4042 if (known_le (GET_MODE_SIZE (mode), 16))
563cc649
RH
4043 return (regno & 1) == 0;
4044 }
4045 else if (FP_REGNUM_P (regno))
43e9d192 4046 {
43cacb12 4047 if (vec_flags & VEC_STRUCT)
4edd6298 4048 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
43e9d192 4049 else
43cacb12 4050 return !VECTOR_MODE_P (mode) || vec_flags != 0;
43e9d192
IB
4051 }
4052
f939c3e6 4053 return false;
43e9d192
IB
4054}
4055
c600df9a
RS
4056/* Return true if a function with type FNTYPE returns its value in
4057 SVE vector or predicate registers. */
4058
4059static bool
4060aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4061{
c600df9a 4062 tree return_type = TREE_TYPE (fntype);
38e62001
RS
4063
4064 pure_scalable_type_info pst_info;
4065 switch (pst_info.analyze (return_type))
4066 {
4067 case pure_scalable_type_info::IS_PST:
4068 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4069 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4070
4071 case pure_scalable_type_info::DOESNT_MATTER:
4072 gcc_assert (aarch64_return_in_memory_1 (return_type));
4073 return false;
4074
4075 case pure_scalable_type_info::NO_ABI_IDENTITY:
4076 case pure_scalable_type_info::ISNT_PST:
4077 return false;
4078 }
4079 gcc_unreachable ();
c600df9a
RS
4080}
4081
4082/* Return true if a function with type FNTYPE takes arguments in
4083 SVE vector or predicate registers. */
4084
4085static bool
4086aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4087{
4088 CUMULATIVE_ARGS args_so_far_v;
4089 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4090 NULL_TREE, 0, true);
4091 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4092
4093 for (tree chain = TYPE_ARG_TYPES (fntype);
4094 chain && chain != void_list_node;
4095 chain = TREE_CHAIN (chain))
4096 {
4097 tree arg_type = TREE_VALUE (chain);
4098 if (arg_type == error_mark_node)
4099 return false;
4100
4101 function_arg_info arg (arg_type, /*named=*/true);
4102 apply_pass_by_reference_rules (&args_so_far_v, arg);
38e62001
RS
4103 pure_scalable_type_info pst_info;
4104 if (pst_info.analyze_registers (arg.type))
4105 {
4106 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4107 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4108 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4109 return true;
4110 }
c600df9a
RS
4111
4112 targetm.calls.function_arg_advance (args_so_far, arg);
4113 }
4114 return false;
4115}
4116
002ffd3c
RS
4117/* Implement TARGET_FNTYPE_ABI. */
4118
4119static const predefined_function_abi &
4120aarch64_fntype_abi (const_tree fntype)
4121{
4122 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4123 return aarch64_simd_abi ();
c600df9a
RS
4124
4125 if (aarch64_returns_value_in_sve_regs_p (fntype)
4126 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4127 return aarch64_sve_abi ();
4128
002ffd3c
RS
4129 return default_function_abi;
4130}
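/* For instance, a function type that carries the "aarch64_vector_pcs"
   attribute gets the SIMD ABI above, and one that takes or returns a
   pure SVE type (svint32_t, say) gets the SVE PCS; all other function
   types keep default_function_abi.  */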
4131
482b2b43
RS
4132/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4133
4134static bool
4135aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4136{
4137 return (aarch64_sve::builtin_type_p (type1)
4138 == aarch64_sve::builtin_type_p (type2));
4139}
4140
c600df9a 4141/* Return true if we should emit CFI for register REGNO. */
a0d0b980
SE
4142
4143static bool
c600df9a 4144aarch64_emit_cfi_for_reg_p (unsigned int regno)
a0d0b980 4145{
c600df9a
RS
4146 return (GP_REGNUM_P (regno)
4147 || !default_function_abi.clobbers_full_reg_p (regno));
a0d0b980
SE
4148}
4149
c600df9a 4150/* Return the mode we should use to save and restore register REGNO. */
a0d0b980
SE
4151
4152static machine_mode
c600df9a 4153aarch64_reg_save_mode (unsigned int regno)
a0d0b980 4154{
c600df9a
RS
4155 if (GP_REGNUM_P (regno))
4156 return DImode;
4157
4158 if (FP_REGNUM_P (regno))
4159 switch (crtl->abi->id ())
4160 {
4161 case ARM_PCS_AAPCS64:
4162 /* Only the low 64 bits are saved by the base PCS. */
4163 return DFmode;
4164
4165 case ARM_PCS_SIMD:
4166 /* The vector PCS saves the low 128 bits (which is the full
4167 register on non-SVE targets). */
4168 return TFmode;
4169
4170 case ARM_PCS_SVE:
4171 /* Use vectors of DImode for registers that need frame
4172 information, so that the first 64 bytes of the save slot
4173 are always the equivalent of what storing D<n> would give. */
4174 if (aarch64_emit_cfi_for_reg_p (regno))
4175 return VNx2DImode;
4176
4177 /* Use vectors of bytes otherwise, so that the layout is
4178 endian-agnostic, and so that we can use LDR and STR for
4179 big-endian targets. */
4180 return VNx16QImode;
4181
4182 case ARM_PCS_TLSDESC:
4183 case ARM_PCS_UNKNOWN:
4184 break;
4185 }
4186
4187 if (PR_REGNUM_P (regno))
4188 /* Save the full predicate register. */
4189 return VNx16BImode;
4190
4191 gcc_unreachable ();
a0d0b980
SE
4192}
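/* As an illustration, for V8_REGNUM the function above returns DFmode
   (save d8 only) under the base AAPCS64, TFmode (save q8) under the
   vector PCS, and VNx2DImode or VNx16QImode under the SVE PCS depending
   on whether CFI is needed; predicate registers are always saved as
   VNx16BImode.  */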
4193
5a5a3bc5 4194/* Implement TARGET_INSN_CALLEE_ABI. */
b3650d40 4195
5a5a3bc5
RS
4196const predefined_function_abi &
4197aarch64_insn_callee_abi (const rtx_insn *insn)
b3650d40 4198{
08cc4d92
RS
4199 rtx pat = PATTERN (insn);
4200 gcc_assert (GET_CODE (pat) == PARALLEL);
4201 rtx unspec = XVECEXP (pat, 0, 1);
4202 gcc_assert (GET_CODE (unspec) == UNSPEC
4203 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4204 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
b3650d40
SE
4205}
4206
80ec73f4
RS
4207/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4208 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4209 clobbers the top 64 bits when restoring the bottom 64 bits. */
4210
4211static bool
6ee2cc70
RS
4212aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4213 unsigned int regno,
473574ee 4214 machine_mode mode)
80ec73f4 4215{
c600df9a 4216 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
51051f47 4217 {
51051f47
RS
4218 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4219 unsigned int nregs = hard_regno_nregs (regno, mode);
4220 if (nregs > 1)
4221 per_register_size = exact_div (per_register_size, nregs);
bb6ce448
RS
4222 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4223 return maybe_gt (per_register_size, 16);
4224 return maybe_gt (per_register_size, 8);
51051f47
RS
4225 }
4226 return false;
473574ee
SE
4227}
4228
43cacb12
RS
4229/* Implement REGMODE_NATURAL_SIZE. */
4230poly_uint64
4231aarch64_regmode_natural_size (machine_mode mode)
4232{
4233 /* The natural size for SVE data modes is one SVE data vector,
4234 and similarly for predicates. We can't independently modify
4235 anything smaller than that. */
4236 /* ??? For now, only do this for variable-width SVE registers.
e53b6e56 4237 Doing it for constant-sized registers breaks lower-subreg.cc. */
43cacb12
RS
4238 /* ??? And once that's fixed, we should probably have similar
4239 code for Advanced SIMD. */
4240 if (!aarch64_sve_vg.is_constant ())
4241 {
4242 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4243 if (vec_flags & VEC_SVE_PRED)
4244 return BYTES_PER_SVE_PRED;
4245 if (vec_flags & VEC_SVE_DATA)
4246 return BYTES_PER_SVE_VECTOR;
4247 }
4248 return UNITS_PER_WORD;
4249}
4250
73d9ac6a 4251/* Implement HARD_REGNO_CALLER_SAVE_MODE. */
ef4bddc2 4252machine_mode
43cacb12
RS
4253aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4254 machine_mode mode)
4255{
4256 /* The predicate mode determines which bits are significant and
4257 which are "don't care". Decreasing the number of lanes would
4258 lose data while increasing the number of lanes would make bits
4259 unnecessarily significant. */
4260 if (PR_REGNUM_P (regno))
4261 return mode;
6a70badb
RS
4262 if (known_ge (GET_MODE_SIZE (mode), 4))
4263 return mode;
73d9ac6a 4264 else
6a70badb 4265 return SImode;
73d9ac6a
IB
4266}
4267
231c52ae
ST
4268/* Return true if I's bits are consecutive ones from the MSB. */
4269bool
4270aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4271{
4272 return exact_log2 (-i) != HOST_WIDE_INT_M1;
4273}
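/* A few worked cases for the function above: I == 0xffffffffffffff00
   gives -I == 0x100 and exact_log2 == 8, so the result is true;
   I == -1 (all ones) is also true; I == 0xff00 and I == 0 are false,
   since their negations are not powers of two.  */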
4274
58e17cf8
RS
4275/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4276 that strcpy from constants will be faster. */
4277
4278static HOST_WIDE_INT
4279aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4280{
4281 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4282 return MAX (align, BITS_PER_WORD);
4283 return align;
4284}
4285
43e9d192
IB
4286/* Return true if calls to DECL should be treated as
 4287 long-calls (i.e. called via a register). */
4288static bool
4289aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4290{
4291 return false;
4292}
4293
4294/* Return true if calls to symbol-ref SYM should be treated as
 4295 long-calls (i.e. called via a register). */
4296bool
4297aarch64_is_long_call_p (rtx sym)
4298{
4299 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4300}
4301
b60d63cb
JW
4302/* Return true if calls to symbol-ref SYM should not go through
4303 plt stubs. */
4304
4305bool
4306aarch64_is_noplt_call_p (rtx sym)
4307{
4308 const_tree decl = SYMBOL_REF_DECL (sym);
4309
4310 if (flag_pic
4311 && decl
4312 && (!flag_plt
4313 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4314 && !targetm.binds_local_p (decl))
4315 return true;
4316
4317 return false;
4318}
4319
43e9d192
IB
4320/* Emit an insn that's a simple single-set. Both the operands must be
4321 known to be valid. */
827ab47a 4322inline static rtx_insn *
43e9d192
IB
4323emit_set_insn (rtx x, rtx y)
4324{
f7df4a84 4325 return emit_insn (gen_rtx_SET (x, y));
43e9d192
IB
4326}
4327
4328/* X and Y are two things to compare using CODE. Emit the compare insn and
4329 return the rtx for register 0 in the proper mode. */
4330rtx
4331aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4332{
4a2095eb
RH
4333 machine_mode cmp_mode = GET_MODE (x);
4334 machine_mode cc_mode;
4335 rtx cc_reg;
43e9d192 4336
4a2095eb
RH
4337 if (cmp_mode == TImode)
4338 {
4339 gcc_assert (code == NE);
4340
4341 cc_mode = CCmode;
4342 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4343
4344 rtx x_lo = operand_subword (x, 0, 0, TImode);
4345 rtx y_lo = operand_subword (y, 0, 0, TImode);
4346 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4347
4348 rtx x_hi = operand_subword (x, 1, 0, TImode);
4349 rtx y_hi = operand_subword (y, 1, 0, TImode);
865257c4
RS
4350 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4351 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4352 GEN_INT (AARCH64_EQ)));
4a2095eb
RH
4353 }
4354 else
4355 {
4356 cc_mode = SELECT_CC_MODE (code, x, y);
4357 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4358 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4359 }
43e9d192
IB
4360 return cc_reg;
4361}
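/* For the TImode NE case above, the emitted RTL amounts to comparing
   the low halves first and then applying a conditional compare (CCMP)
   to the high halves, so the resulting CC register reads as "equal"
   only when both halves are equal.  */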
4362
d400fda3
RH
4363/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4364
4365static rtx
4366aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4367 machine_mode y_mode)
4368{
4369 if (y_mode == E_QImode || y_mode == E_HImode)
4370 {
4371 if (CONST_INT_P (y))
df562b12
JJ
4372 {
4373 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4374 y_mode = SImode;
4375 }
d400fda3
RH
4376 else
4377 {
4378 rtx t, cc_reg;
4379 machine_mode cc_mode;
4380
4381 t = gen_rtx_ZERO_EXTEND (SImode, y);
4382 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4383 cc_mode = CC_SWPmode;
4384 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4385 emit_set_insn (cc_reg, t);
4386 return cc_reg;
4387 }
4388 }
4389
846f78d4
PK
4390 if (!aarch64_plus_operand (y, y_mode))
4391 y = force_reg (y_mode, y);
4392
d400fda3
RH
4393 return aarch64_gen_compare_reg (code, x, y);
4394}
4395
8e84b2b3
RS
4396/* Consider the operation:
4397
4398 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4399
4400 where:
4401
4402 - CODE is [SU]MAX or [SU]MIN
4403 - OPERANDS[2] and OPERANDS[3] are constant integers
4404 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4405 - all operands have mode MODE
4406
4407 Decide whether it is possible to implement the operation using:
4408
4409 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4410 or
4411 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4412
4413 followed by:
4414
4415 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4416
4417 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4418 If GENERATE_P is true, also update OPERANDS as follows:
4419
4420 OPERANDS[4] = -OPERANDS[3]
4421 OPERANDS[5] = the rtl condition representing <cond>
4422 OPERANDS[6] = <tmp>
4423 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4424bool
4425aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4426{
4427 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4428 rtx dst = operands[0];
4429 rtx maxmin_op = operands[2];
4430 rtx add_op = operands[3];
4431 machine_mode mode = GET_MODE (dst);
4432
4433 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4434 == (x >= y ? x : y) - z
4435 == (x > y ? x : y) - z
4436 == (x > y - 1 ? x : y) - z
4437
4438 min (x, y) - z == (x <= y - 1 ? x : y) - z
4439 == (x <= y ? x : y) - z
4440 == (x < y ? x : y) - z
4441 == (x < y + 1 ? x : y) - z
4442
4443 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4444 which x is compared with z. Set DIFF to y - z. Thus the supported
4445 combinations are as follows, with DIFF being the value after the ":":
4446
4447 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4448 == x >= y ? x - y : 0 [z == y]
4449 == x > y ? x - y : 0 [z == y]
4450 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4451
4452 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4453 == x <= y ? x - y : 0 [z == y]
4454 == x < y ? x - y : 0 [z == y]
4455 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4456 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4457 auto add_val = rtx_mode_t (add_op, mode);
4458 auto sub_val = wi::neg (add_val);
4459 auto diff = wi::sub (maxmin_val, sub_val);
4460 if (!(diff == 0
4461 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4462 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4463 return false;
4464
4465 if (!generate_p)
4466 return true;
4467
4468 rtx_code cmp;
4469 switch (code)
4470 {
4471 case SMAX:
4472 cmp = diff == 1 ? GT : GE;
4473 break;
4474 case UMAX:
4475 cmp = diff == 1 ? GTU : GEU;
4476 break;
4477 case SMIN:
4478 cmp = diff == -1 ? LT : LE;
4479 break;
4480 case UMIN:
4481 cmp = diff == -1 ? LTU : LEU;
4482 break;
4483 default:
4484 gcc_unreachable ();
4485 }
4486 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4487
4488 operands[4] = immed_wide_int_const (sub_val, mode);
4489 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4490 if (can_create_pseudo_p ())
4491 operands[6] = gen_reg_rtx (mode);
4492 else
4493 operands[6] = dst;
4494 operands[7] = immed_wide_int_const (diff, mode);
4495
4496 return true;
4497}
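/* Worked example, following the [z == y] row of the table above: for
   smax (x, 7) - 7 we have MAXMIN_VAL == 7, OPERANDS[3] == -7, so
   SUB_VAL == 7 and DIFF == 0.  The transformation is valid and, when
   GENERATE_P, sets up the equivalent of

	subs	<tmp>, x, #7
	csel	<dst>, <tmp>, xzr, ge

   i.e. x >= 7 ? x - 7 : 0, which is exactly max (x, 7) - 7.  */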
4498
4499
43e9d192
IB
4500/* Build the SYMBOL_REF for __tls_get_addr. */
4501
4502static GTY(()) rtx tls_get_addr_libfunc;
4503
4504rtx
4505aarch64_tls_get_addr (void)
4506{
4507 if (!tls_get_addr_libfunc)
4508 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4509 return tls_get_addr_libfunc;
4510}
4511
4512/* Return the TLS model to use for ADDR. */
4513
4514static enum tls_model
4515tls_symbolic_operand_type (rtx addr)
4516{
4517 enum tls_model tls_kind = TLS_MODEL_NONE;
74b27d8e
RS
4518 poly_int64 offset;
4519 addr = strip_offset_and_salt (addr, &offset);
3793ecc1 4520 if (SYMBOL_REF_P (addr))
43e9d192
IB
4521 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4522
4523 return tls_kind;
4524}
4525
4526/* We'll allow lo_sums in our legitimate addresses so that combine
 4527 can take care of combining addresses where necessary, but for
 4528 generation purposes we'll generate the address as:
 4529
4530 RTL Absolute
4531 tmp = hi (symbol_ref); adrp x1, foo
4532 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
4533 nop
4534
4535 PIC TLS
4536 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4537 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4538 bl __tls_get_addr
4539 nop
4540
4541 Load TLS symbol, depending on TLS mechanism and TLS access model.
4542
4543 Global Dynamic - Traditional TLS:
4544 adrp tmp, :tlsgd:imm
4545 add dest, tmp, #:tlsgd_lo12:imm
4546 bl __tls_get_addr
4547
4548 Global Dynamic - TLS Descriptors:
4549 adrp dest, :tlsdesc:imm
4550 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4551 add dest, dest, #:tlsdesc_lo12:imm
4552 blr tmp
4553 mrs tp, tpidr_el0
4554 add dest, dest, tp
4555
4556 Initial Exec:
4557 mrs tp, tpidr_el0
4558 adrp tmp, :gottprel:imm
4559 ldr dest, [tmp, #:gottprel_lo12:imm]
4560 add dest, dest, tp
4561
4562 Local Exec:
4563 mrs tp, tpidr_el0
0699caae
RL
4564 add t0, tp, #:tprel_hi12:imm, lsl #12
4565 add t0, t0, #:tprel_lo12_nc:imm
43e9d192
IB
4566*/
4567
4568static void
4569aarch64_load_symref_appropriately (rtx dest, rtx imm,
4570 enum aarch64_symbol_type type)
4571{
4572 switch (type)
4573 {
4574 case SYMBOL_SMALL_ABSOLUTE:
4575 {
28514dda 4576 /* In ILP32, the mode of dest can be either SImode or DImode. */
43e9d192 4577 rtx tmp_reg = dest;
ef4bddc2 4578 machine_mode mode = GET_MODE (dest);
28514dda
YZ
4579
4580 gcc_assert (mode == Pmode || mode == ptr_mode);
4581
43e9d192 4582 if (can_create_pseudo_p ())
28514dda 4583 tmp_reg = gen_reg_rtx (mode);
43e9d192 4584
95215562 4585 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
43e9d192
IB
4586 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4587 return;
4588 }
4589
a5350ddc 4590 case SYMBOL_TINY_ABSOLUTE:
f7df4a84 4591 emit_insn (gen_rtx_SET (dest, imm));
a5350ddc
CSS
4592 return;
4593
1b1e81f8
JW
4594 case SYMBOL_SMALL_GOT_28K:
4595 {
4596 machine_mode mode = GET_MODE (dest);
4597 rtx gp_rtx = pic_offset_table_rtx;
53021678
JW
4598 rtx insn;
4599 rtx mem;
1b1e81f8
JW
4600
4601 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4602 here before rtl expand. Tree IVOPT will generate rtl pattern to
4603 decide rtx costs, in which case pic_offset_table_rtx is not
 4604 initialized. In that case there is no need to generate the first adrp
026c3cfd 4605 instruction, as the final cost of a global variable access is
1b1e81f8
JW
4606 one instruction. */
4607 if (gp_rtx != NULL)
4608 {
 4609 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 4610 use the page base as the GOT base, the first page may be wasted; in
 4611 the worst case there is only 28K of space for the GOT).
4612
 4613 The instruction sequence generated for accessing a global variable
4614 is:
4615
a3957742 4616 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1b1e81f8
JW
4617
 4618 Only one instruction is needed. But we must initialize
 4619 pic_offset_table_rtx properly. We generate an initialization insn for
 4620 every global access, and let CSE remove all the redundant ones.
4621
 4622 The final instruction sequence will look like the following
 4623 when multiple global variables are accessed:
4624
a3957742 4625 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1b1e81f8 4626
a3957742
JW
4627 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4628 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4629 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4630 ... */
1b1e81f8
JW
4631
4632 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4633 crtl->uses_pic_offset_table = 1;
4634 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4635
4636 if (mode != GET_MODE (gp_rtx))
4ba8f0a3
AP
4637 gp_rtx = gen_lowpart (mode, gp_rtx);
4638
1b1e81f8
JW
4639 }
4640
4641 if (mode == ptr_mode)
4642 {
4643 if (mode == DImode)
53021678 4644 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1b1e81f8 4645 else
53021678
JW
4646 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4647
4648 mem = XVECEXP (SET_SRC (insn), 0, 0);
1b1e81f8
JW
4649 }
4650 else
4651 {
4652 gcc_assert (mode == Pmode);
53021678
JW
4653
4654 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4655 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1b1e81f8
JW
4656 }
4657
53021678
JW
 4658 /* The operand is expected to be a MEM. Whenever the related insn
 4659 pattern changes, the code above that computes MEM should be
 4660 updated. */
3793ecc1 4661 gcc_assert (MEM_P (mem));
53021678
JW
4662 MEM_READONLY_P (mem) = 1;
4663 MEM_NOTRAP_P (mem) = 1;
4664 emit_insn (insn);
1b1e81f8
JW
4665 return;
4666 }
4667
6642bdb4 4668 case SYMBOL_SMALL_GOT_4G:
a195c727
WD
4669 emit_insn (gen_rtx_SET (dest, imm));
4670 return;
43e9d192
IB
4671
4672 case SYMBOL_SMALL_TLSGD:
4673 {
5d8a22a5 4674 rtx_insn *insns;
87ca615a
AP
4675 /* The return type of __tls_get_addr is the C pointer type
4676 so use ptr_mode. */
4677 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4678 rtx tmp_reg = dest;
4679
4680 if (GET_MODE (dest) != ptr_mode)
4681 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
43e9d192
IB
4682
4683 start_sequence ();
87ca615a 4684 if (ptr_mode == SImode)
23b88fda
N
4685 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4686 else
4687 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
43e9d192
IB
4688 insns = get_insns ();
4689 end_sequence ();
4690
4691 RTL_CONST_CALL_P (insns) = 1;
87ca615a
AP
4692 emit_libcall_block (insns, tmp_reg, result, imm);
4693 /* Convert back to the mode of the dest adding a zero_extend
4694 from SImode (ptr_mode) to DImode (Pmode). */
4695 if (dest != tmp_reg)
4696 convert_move (dest, tmp_reg, true);
43e9d192
IB
4697 return;
4698 }
4699
4700 case SYMBOL_SMALL_TLSDESC:
4701 {
ef4bddc2 4702 machine_mode mode = GET_MODE (dest);
621ad2de 4703 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
43e9d192
IB
4704 rtx tp;
4705
621ad2de
AP
4706 gcc_assert (mode == Pmode || mode == ptr_mode);
4707
2876a13f
JW
4708 /* In ILP32, the got entry is always of SImode size. Unlike
4709 small GOT, the dest is fixed at reg 0. */
4710 if (TARGET_ILP32)
4711 emit_insn (gen_tlsdesc_small_si (imm));
621ad2de 4712 else
2876a13f 4713 emit_insn (gen_tlsdesc_small_di (imm));
43e9d192 4714 tp = aarch64_load_tp (NULL);
621ad2de
AP
4715
4716 if (mode != Pmode)
4717 tp = gen_lowpart (mode, tp);
4718
2876a13f 4719 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
241dbd9d
QZ
4720 if (REG_P (dest))
4721 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4722 return;
4723 }
4724
79496620 4725 case SYMBOL_SMALL_TLSIE:
43e9d192 4726 {
621ad2de
AP
4727 /* In ILP32, the mode of dest can be either SImode or DImode,
4728 while the got entry is always of SImode size. The mode of
4729 dest depends on how dest is used: if dest is assigned to a
4730 pointer (e.g. in the memory), it has SImode; it may have
 4731 DImode if dest is dereferenced to access the memory.
4732 This is why we have to handle three different tlsie_small
4733 patterns here (two patterns for ILP32). */
ef4bddc2 4734 machine_mode mode = GET_MODE (dest);
621ad2de 4735 rtx tmp_reg = gen_reg_rtx (mode);
43e9d192 4736 rtx tp = aarch64_load_tp (NULL);
621ad2de
AP
4737
4738 if (mode == ptr_mode)
4739 {
4740 if (mode == DImode)
4741 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4742 else
4743 {
4744 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4745 tp = gen_lowpart (mode, tp);
4746 }
4747 }
4748 else
4749 {
4750 gcc_assert (mode == Pmode);
4751 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4752 }
4753
f7df4a84 4754 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
241dbd9d
QZ
4755 if (REG_P (dest))
4756 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4757 return;
4758 }
4759
cbf5629e 4760 case SYMBOL_TLSLE12:
d18ba284 4761 case SYMBOL_TLSLE24:
cbf5629e
JW
4762 case SYMBOL_TLSLE32:
4763 case SYMBOL_TLSLE48:
43e9d192 4764 {
cbf5629e 4765 machine_mode mode = GET_MODE (dest);
43e9d192 4766 rtx tp = aarch64_load_tp (NULL);
e6f7f0e9 4767
cbf5629e
JW
4768 if (mode != Pmode)
4769 tp = gen_lowpart (mode, tp);
4770
4771 switch (type)
4772 {
4773 case SYMBOL_TLSLE12:
4774 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4775 (dest, tp, imm));
4776 break;
4777 case SYMBOL_TLSLE24:
4778 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4779 (dest, tp, imm));
4780 break;
4781 case SYMBOL_TLSLE32:
4782 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4783 (dest, imm));
4784 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4785 (dest, dest, tp));
4786 break;
4787 case SYMBOL_TLSLE48:
4788 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4789 (dest, imm));
4790 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4791 (dest, dest, tp));
4792 break;
4793 default:
4794 gcc_unreachable ();
4795 }
e6f7f0e9 4796
241dbd9d
QZ
4797 if (REG_P (dest))
4798 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
4799 return;
4800 }
4801
87dd8ab0 4802 case SYMBOL_TINY_GOT:
d91480de
D
4803 {
4804 rtx insn;
4805 machine_mode mode = GET_MODE (dest);
4806
4807 if (mode == ptr_mode)
4808 insn = gen_ldr_got_tiny (mode, dest, imm);
4809 else
4810 {
4811 gcc_assert (mode == Pmode);
4812 insn = gen_ldr_got_tiny_sidi (dest, imm);
4813 }
4814
4815 emit_insn (insn);
4816 return;
4817 }
87dd8ab0 4818
5ae7caad
JW
4819 case SYMBOL_TINY_TLSIE:
4820 {
4821 machine_mode mode = GET_MODE (dest);
4822 rtx tp = aarch64_load_tp (NULL);
4823
4824 if (mode == ptr_mode)
4825 {
4826 if (mode == DImode)
4827 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4828 else
4829 {
4830 tp = gen_lowpart (mode, tp);
4831 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4832 }
4833 }
4834 else
4835 {
4836 gcc_assert (mode == Pmode);
4837 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4838 }
4839
241dbd9d
QZ
4840 if (REG_P (dest))
4841 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
5ae7caad
JW
4842 return;
4843 }
4844
43e9d192
IB
4845 default:
4846 gcc_unreachable ();
4847 }
4848}
4849
4850/* Emit a move from SRC to DEST. Assume that the move expanders can
4851 handle all moves if !can_create_pseudo_p (). The distinction is
4852 important because, unlike emit_move_insn, the move expanders know
4853 how to force Pmode objects into the constant pool even when the
4854 constant pool address is not itself legitimate. */
4855static rtx
4856aarch64_emit_move (rtx dest, rtx src)
4857{
4858 return (can_create_pseudo_p ()
4859 ? emit_move_insn (dest, src)
4860 : emit_move_insn_1 (dest, src));
4861}
4862
f22d7973
RS
4863/* Apply UNOPTAB to OP and store the result in DEST. */
4864
4865static void
4866aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4867{
4868 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4869 if (dest != tmp)
4870 emit_move_insn (dest, tmp);
4871}
4872
4873/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4874
4875static void
4876aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4877{
4878 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4879 OPTAB_DIRECT);
4880 if (dest != tmp)
4881 emit_move_insn (dest, tmp);
4882}
4883
030d03b8
RE
4884/* Split a 128-bit move operation into two 64-bit move operations,
4885 taking care to handle partial overlap of register to register
4886 copies. Special cases are needed when moving between GP regs and
4887 FP regs. SRC can be a register, constant or memory; DST a register
4888 or memory. If either operand is memory it must not have any side
4889 effects. */
43e9d192
IB
4890void
4891aarch64_split_128bit_move (rtx dst, rtx src)
4892{
030d03b8
RE
4893 rtx dst_lo, dst_hi;
4894 rtx src_lo, src_hi;
43e9d192 4895
ef4bddc2 4896 machine_mode mode = GET_MODE (dst);
12dc6974 4897
0dc8e1e7 4898 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
030d03b8
RE
4899 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4900 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
43e9d192
IB
4901
4902 if (REG_P (dst) && REG_P (src))
4903 {
030d03b8
RE
4904 int src_regno = REGNO (src);
4905 int dst_regno = REGNO (dst);
43e9d192 4906
030d03b8 4907 /* Handle FP <-> GP regs. */
43e9d192
IB
4908 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4909 {
030d03b8
RE
4910 src_lo = gen_lowpart (word_mode, src);
4911 src_hi = gen_highpart (word_mode, src);
4912
0016d8d9
RS
4913 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4914 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
030d03b8 4915 return;
43e9d192
IB
4916 }
4917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4918 {
030d03b8
RE
4919 dst_lo = gen_lowpart (word_mode, dst);
4920 dst_hi = gen_highpart (word_mode, dst);
4921
0016d8d9
RS
4922 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4923 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
030d03b8 4924 return;
43e9d192 4925 }
43e9d192
IB
4926 }
4927
030d03b8
RE
4928 dst_lo = gen_lowpart (word_mode, dst);
4929 dst_hi = gen_highpart (word_mode, dst);
4930 src_lo = gen_lowpart (word_mode, src);
4931 src_hi = gen_highpart_mode (word_mode, mode, src);
4932
4933 /* At most one pairing may overlap. */
4934 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4935 {
4936 aarch64_emit_move (dst_hi, src_hi);
4937 aarch64_emit_move (dst_lo, src_lo);
4938 }
4939 else
4940 {
4941 aarch64_emit_move (dst_lo, src_lo);
4942 aarch64_emit_move (dst_hi, src_hi);
4943 }
43e9d192
IB
4944}
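/* For instance, a TImode copy from the GP register pair x0/x1 into q0
   is split into separate moves of the low and high 64 bits (via the
   aarch64_movlow_di/aarch64_movhigh_di patterns used above), while a
   GP<->GP copy becomes two word moves ordered so that neither half is
   clobbered before it is read.  */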
4945
d4f9e819
RS
4946/* Return true if we should split a move from 128-bit value SRC
4947 to 128-bit register DEST. */
4948
43e9d192
IB
4949bool
4950aarch64_split_128bit_move_p (rtx dst, rtx src)
4951{
d4f9e819
RS
4952 if (FP_REGNUM_P (REGNO (dst)))
4953 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4954 /* All moves to GPRs need to be split. */
4955 return true;
43e9d192
IB
4956}
4957
fd4842cd
SN
4958/* Split a complex SIMD move. */
4959
4960void
4961aarch64_split_simd_move (rtx dst, rtx src)
4962{
ef4bddc2
RS
4963 machine_mode src_mode = GET_MODE (src);
4964 machine_mode dst_mode = GET_MODE (dst);
fd4842cd
SN
4965
4966 gcc_assert (VECTOR_MODE_P (dst_mode));
4967
4968 if (REG_P (dst) && REG_P (src))
4969 {
4970 gcc_assert (VECTOR_MODE_P (src_mode));
0016d8d9 4971 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
fd4842cd
SN
4972 }
4973}
4974
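/* Return true if X (of mode XMODE) is equal to Y (of mode YMODE) when Y
   is zero-extended to XMODE; both are expected to be constants for
   which the extension can be simplified.  */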
ef22810a
RH
4975bool
4976aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4977 machine_mode ymode, rtx y)
4978{
4979 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4980 gcc_assert (r != NULL);
4981 return rtx_equal_p (x, r);
4982}
ef22810a 4983
678faefc
RS
4984/* Return TARGET if it is nonnull and a register of mode MODE.
4985 Otherwise, return a fresh register of mode MODE if we can,
4986 or TARGET reinterpreted as MODE if we can't. */
4987
4988static rtx
4989aarch64_target_reg (rtx target, machine_mode mode)
4990{
4991 if (target && REG_P (target) && GET_MODE (target) == mode)
4992 return target;
4993 if (!can_create_pseudo_p ())
4994 {
4995 gcc_assert (target);
4996 return gen_lowpart (mode, target);
4997 }
4998 return gen_reg_rtx (mode);
4999}
5000
5001/* Return a register that contains the constant in BUILDER, given that
5002 the constant is a legitimate move operand. Use TARGET as the register
5003 if it is nonnull and convenient. */
5004
5005static rtx
5006aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
5007{
5008 rtx src = builder.build ();
5009 target = aarch64_target_reg (target, GET_MODE (src));
5010 emit_insn (gen_rtx_SET (target, src));
5011 return target;
5012}
5013
43e9d192 5014static rtx
ef4bddc2 5015aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
43e9d192
IB
5016{
5017 if (can_create_pseudo_p ())
e18b4a81 5018 return force_reg (mode, value);
43e9d192
IB
5019 else
5020 {
f5470a77
RS
5021 gcc_assert (x);
5022 aarch64_emit_move (x, value);
43e9d192
IB
5023 return x;
5024 }
5025}
5026
0b1fe8cf
RS
5027/* Return true if predicate value X is a constant in which every element
5028 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
5029 value, i.e. as a predicate in which all bits are significant. */
5030
5031static bool
5032aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5033{
568b9c0e 5034 if (!CONST_VECTOR_P (x))
0b1fe8cf
RS
5035 return false;
5036
5037 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5038 GET_MODE_NUNITS (GET_MODE (x)));
5039 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5040 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5041 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5042
5043 unsigned int nelts = const_vector_encoded_nelts (x);
5044 for (unsigned int i = 0; i < nelts; ++i)
5045 {
5046 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5047 if (!CONST_INT_P (elt))
5048 return false;
5049
5050 builder.quick_push (elt);
5051 for (unsigned int j = 1; j < factor; ++j)
5052 builder.quick_push (const0_rtx);
5053 }
5054 builder.finalize ();
5055 return true;
5056}
5057
5058/* BUILDER contains a predicate constant of mode VNx16BI. Return the
5059 widest predicate element size it can have (that is, the largest size
5060 for which each element would still be 0 or 1). */
5061
5062unsigned int
5063aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5064{
5065 /* Start with the most optimistic assumption: that we only need
5066 one bit per pattern. This is what we will use if only the first
5067 bit in each pattern is ever set. */
5068 unsigned int mask = GET_MODE_SIZE (DImode);
5069 mask |= builder.npatterns ();
5070
5071 /* Look for set bits. */
5072 unsigned int nelts = builder.encoded_nelts ();
5073 for (unsigned int i = 1; i < nelts; ++i)
5074 if (INTVAL (builder.elt (i)) != 0)
5075 {
5076 if (i & 1)
5077 return 1;
5078 mask |= i;
5079 }
5080 return mask & -mask;
5081}
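/* For example, the VNx16BI encoding of an all-true .S predicate is the
   repeating pattern { 1, 0, 0, 0 }: npatterns () is 4 and no nonzero index
   holds a set bit, so the result is (8 | 4) & -(8 | 4) == 4, i.e. a widest
   element size of 4 bytes.  A set bit at any odd index forces a result
   of 1.  */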
5082
624d0f07
RS
5083/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5084 return that predicate mode, otherwise return opt_machine_mode (). */
5085
5086opt_machine_mode
5087aarch64_ptrue_all_mode (rtx x)
5088{
5089 gcc_assert (GET_MODE (x) == VNx16BImode);
568b9c0e 5090 if (!CONST_VECTOR_P (x)
624d0f07
RS
5091 || !CONST_VECTOR_DUPLICATE_P (x)
5092 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5093 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5094 return opt_machine_mode ();
5095
5096 unsigned int nelts = const_vector_encoded_nelts (x);
5097 for (unsigned int i = 1; i < nelts; ++i)
5098 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5099 return opt_machine_mode ();
5100
5101 return aarch64_sve_pred_mode (nelts);
5102}
5103
0b1fe8cf
RS
5104/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5105 that the constant would have with predicate element size ELT_SIZE
5106 (ignoring the upper bits in each element) and return:
5107
5108 * -1 if all bits are set
5109 * N if the predicate has N leading set bits followed by all clear bits
5110 * 0 if the predicate does not have any of these forms. */
5111
5112int
5113aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5114 unsigned int elt_size)
5115{
5116 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5117 followed by set bits. */
5118 if (builder.nelts_per_pattern () == 3)
5119 return 0;
5120
5121 /* Skip over leading set bits. */
5122 unsigned int nelts = builder.encoded_nelts ();
5123 unsigned int i = 0;
5124 for (; i < nelts; i += elt_size)
5125 if (INTVAL (builder.elt (i)) == 0)
5126 break;
5127 unsigned int vl = i / elt_size;
5128
5129 /* Check for the all-true case. */
5130 if (i == nelts)
5131 return -1;
5132
5133 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5134 repeating pattern of set bits followed by clear bits. */
5135 if (builder.nelts_per_pattern () != 2)
5136 return 0;
5137
5138 /* We have a "foreground" value and a duplicated "background" value.
5139 If the background might repeat and the last set bit belongs to it,
5140 we might have set bits followed by clear bits followed by set bits. */
5141 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5142 return 0;
5143
5144 /* Make sure that the rest are all clear. */
5145 for (; i < nelts; i += elt_size)
5146 if (INTVAL (builder.elt (i)) != 0)
5147 return 0;
5148
5149 return vl;
5150}
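/* For example, the VNx16BI image of a PTRUE with VL3 (the first three bits
   set, the rest clear) gives 3 for ELT_SIZE 1, an all-ones predicate gives
   -1, and a predicate whose set bits reappear after the first clear bit
   gives 0.  */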
5151
5152/* See if there is an svpattern that encodes an SVE predicate of mode
5153 PRED_MODE in which the first VL bits are set and the rest are clear.
5154 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5155 A VL of -1 indicates an all-true vector. */
5156
5157aarch64_svpattern
5158aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5159{
5160 if (vl < 0)
5161 return AARCH64_SV_ALL;
5162
5163 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5164 return AARCH64_NUM_SVPATTERNS;
5165
5166 if (vl >= 1 && vl <= 8)
5167 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5168
5169 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5170 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5171
5172 int max_vl;
5173 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5174 {
5175 if (vl == (max_vl / 3) * 3)
5176 return AARCH64_SV_MUL3;
5177 /* These would only trigger for non-power-of-2 lengths. */
5178 if (vl == (max_vl & -4))
5179 return AARCH64_SV_MUL4;
5180 if (vl == (1 << floor_log2 (max_vl)))
5181 return AARCH64_SV_POW2;
5182 if (vl == max_vl)
5183 return AARCH64_SV_ALL;
5184 }
5185 return AARCH64_NUM_SVPATTERNS;
5186}
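/* For example, VL == 3 maps to AARCH64_SV_VL3, VL == 64 maps to
   AARCH64_SV_VL64 and VL == -1 maps to AARCH64_SV_ALL.  A length such as 9
   has no dedicated VLn pattern and can only be matched via POW2, MUL3, MUL4
   or ALL when the number of elements in PRED_MODE is a compile-time
   constant; otherwise the function returns AARCH64_NUM_SVPATTERNS.  */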
5187
34467289
RS
5188/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5189 bits has the lowest bit set and the upper bits clear. This is the
5190 VNx16BImode equivalent of a PTRUE for controlling elements of
5191 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5192 all bits are significant, even the upper zeros. */
5193
5194rtx
5195aarch64_ptrue_all (unsigned int elt_size)
5196{
5197 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5198 builder.quick_push (const1_rtx);
5199 for (unsigned int i = 1; i < elt_size; ++i)
5200 builder.quick_push (const0_rtx);
5201 return builder.build ();
5202}
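/* For example, aarch64_ptrue_all (2) builds the repeating VNx16BI pattern
   { 1, 0 }, the all-bits-significant image of a PTRUE for .H elements,
   and aarch64_ptrue_all (8) builds { 1, 0, 0, 0, 0, 0, 0, 0 } for .D.  */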
5203
16de3637
RS
5204/* Return an all-true predicate register of mode MODE. */
5205
5206rtx
5207aarch64_ptrue_reg (machine_mode mode)
5208{
5209 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
5210 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5211 return gen_lowpart (mode, reg);
16de3637
RS
5212}
5213
e7053b0c
RS
5214/* Return an all-false predicate register of mode MODE. */
5215
5216rtx
5217aarch64_pfalse_reg (machine_mode mode)
5218{
5219 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
5220 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5221 return gen_lowpart (mode, reg);
5222}
5223
00fa90d9
RS
5224/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5225 for it. PRED2[0] is the predicate for the instruction whose result
5226 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5227 for it. Return true if we can prove that the two predicates are
5228 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5229 with PRED1[0] without changing behavior. */
5230
5231bool
5232aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5233{
5234 machine_mode mode = GET_MODE (pred1[0]);
5235 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5236 && mode == GET_MODE (pred2[0])
5237 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5238 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5239
5240 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5241 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5242 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5243 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5244 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5245}
5246
5247/* Emit a comparison CMP between OP0 and OP1, both of which have mode
5248 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5249 Use TARGET as the target register if nonnull and convenient. */
5250
5251static rtx
5252aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5253 machine_mode data_mode, rtx op1, rtx op2)
5254{
5255 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5256 expand_operand ops[5];
5257 create_output_operand (&ops[0], target, pred_mode);
5258 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5259 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5260 create_input_operand (&ops[3], op1, data_mode);
5261 create_input_operand (&ops[4], op2, data_mode);
5262 expand_insn (icode, 5, ops);
5263 return ops[0].value;
5264}
5265
678faefc
RS
5266/* Use a comparison to convert integer vector SRC into MODE, which is
5267 the corresponding SVE predicate mode. Use TARGET for the result
5268 if it's nonnull and convenient. */
5269
624d0f07 5270rtx
678faefc
RS
5271aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5272{
5273 machine_mode src_mode = GET_MODE (src);
00fa90d9
RS
5274 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5275 src, CONST0_RTX (src_mode));
e7053b0c
RS
5276}
5277
624d0f07
RS
5278/* Return the assembly token for svprfop value PRFOP. */
5279
5280static const char *
5281svprfop_token (enum aarch64_svprfop prfop)
5282{
5283 switch (prfop)
5284 {
5285#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5286 AARCH64_FOR_SVPRFOP (CASE)
5287#undef CASE
5288 case AARCH64_NUM_SVPRFOPS:
5289 break;
5290 }
5291 gcc_unreachable ();
5292}
5293
5294/* Return the assembly string for an SVE prefetch operation with
5295 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5296 and that SUFFIX is the format for the remaining operands. */
5297
5298char *
5299aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5300 const char *suffix)
5301{
5302 static char buffer[128];
5303 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5304 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5305 mnemonic, svprfop_token (prfop), suffix);
5306 gcc_assert (written < sizeof (buffer));
5307 return buffer;
5308}
5309
5310/* Check whether we can calculate the number of elements in PATTERN
5311 at compile time, given that there are NELTS_PER_VQ elements per
5312 128-bit block. Return the value if so, otherwise return -1. */
5313
5314HOST_WIDE_INT
5315aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5316{
5317 unsigned int vl, const_vg;
5318 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5319 vl = 1 + (pattern - AARCH64_SV_VL1);
5320 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5321 vl = 16 << (pattern - AARCH64_SV_VL16);
5322 else if (aarch64_sve_vg.is_constant (&const_vg))
5323 {
5324 /* There are two vector granules per quadword. */
5325 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5326 switch (pattern)
5327 {
5328 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5329 case AARCH64_SV_MUL4: return nelts & -4;
5330 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5331 case AARCH64_SV_ALL: return nelts;
5332 default: gcc_unreachable ();
5333 }
5334 }
5335 else
5336 return -1;
5337
5338 /* There are two vector granules per quadword. */
5339 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5340 if (known_le (vl, nelts_all))
5341 return vl;
5342
5343 /* Requesting more elements than are available results in a PFALSE. */
5344 if (known_gt (vl, nelts_all))
5345 return 0;
5346
5347 return -1;
5348}
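/* Worked examples: for pattern AARCH64_SV_VL2 with NELTS_PER_VQ == 4
   (a CNTW counting .S elements), every vector length provides at least
   four such elements, so the result is 2 even for length-agnostic code.
   AARCH64_SV_VL8 with the same NELTS_PER_VQ cannot be folded without a
   fixed vector length and yields -1, while AARCH64_SV_ALL with
   -msve-vector-bits=512 (VG == 8) yields (8 / 2) * 4 == 16.  */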
5349
43cacb12
RS
5350/* Return true if we can move VALUE into a register using a single
5351 CNT[BHWD] instruction. */
5352
5353static bool
5354aarch64_sve_cnt_immediate_p (poly_int64 value)
5355{
5356 HOST_WIDE_INT factor = value.coeffs[0];
5357 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5358 return (value.coeffs[1] == factor
5359 && IN_RANGE (factor, 2, 16 * 16)
5360 && (factor & 1) == 0
5361 && factor <= 16 * (factor & -factor));
5362}
5363
5364/* Likewise for rtx X. */
5365
5366bool
5367aarch64_sve_cnt_immediate_p (rtx x)
5368{
5369 poly_int64 value;
5370 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5371}
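/* For example, the poly_int64 values (2, 2), (16, 16) and (48, 48) are all
   valid (they correspond to CNTD, CNTB and CNTB with MUL #3 respectively),
   whereas (1, 1) is rejected for being odd, (272, 272) is out of range and
   (16, 8) has mismatched coefficients.  */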
5372
5373/* Return the asm string for an instruction with a CNT-like vector size
5374 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5375 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5376 first part of the operands template (the part that comes before the
139df05a
RS
5377 vector size itself). PATTERN is the pattern to use. FACTOR is the
5378 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5379 in each quadword. If it is zero, we can use any element size. */
43cacb12
RS
5380
5381static char *
5382aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
139df05a 5383 aarch64_svpattern pattern,
43cacb12
RS
5384 unsigned int factor,
5385 unsigned int nelts_per_vq)
5386{
139df05a 5387 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
43cacb12
RS
5388
5389 if (nelts_per_vq == 0)
5390 /* There is some overlap in the ranges of the four CNT instructions.
5391 Here we always use the smallest possible element size, so that the
5392 multiplier is 1 wherever possible. */
5393 nelts_per_vq = factor & -factor;
5394 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5395 gcc_assert (IN_RANGE (shift, 1, 4));
5396 char suffix = "dwhb"[shift - 1];
5397
5398 factor >>= shift;
5399 unsigned int written;
139df05a 5400 if (pattern == AARCH64_SV_ALL && factor == 1)
43cacb12
RS
5401 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5402 prefix, suffix, operands);
139df05a
RS
5403 else if (factor == 1)
5404 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5405 prefix, suffix, operands, svpattern_token (pattern));
43cacb12 5406 else
139df05a
RS
5407 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5408 prefix, suffix, operands, svpattern_token (pattern),
5409 factor);
43cacb12
RS
5410 gcc_assert (written < sizeof (buffer));
5411 return buffer;
5412}
5413
5414/* Return the asm string for an instruction with a CNT-like vector size
5415 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5416 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5417 first part of the operands template (the part that comes before the
5418 vector size itself). X is the value of the vector size operand,
139df05a
RS
5419 as a polynomial integer rtx; we need to convert this into an "all"
5420 pattern with a multiplier. */
43cacb12
RS
5421
5422char *
5423aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5424 rtx x)
5425{
5426 poly_int64 value = rtx_to_poly_int64 (x);
5427 gcc_assert (aarch64_sve_cnt_immediate_p (value));
139df05a 5428 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
43cacb12
RS
5429 value.coeffs[1], 0);
5430}
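/* For example, with PREFIX "cnt" and OPERANDS "%x0", the poly_int64 value
   (16, 16) prints as "cntb\t%x0", (2, 2) prints as "cntd\t%x0" and
   (32, 32) prints as "cntb\t%x0, all, mul #2".  */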
5431
624d0f07
RS
5432/* Return the asm string for an instruction with a CNT-like vector size
5433 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5434 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5435 first part of the operands template (the part that comes before the
5436 vector size itself). CNT_PAT[0..2] are the operands of the
5437 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5438
5439char *
5440aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5441 const char *operands, rtx *cnt_pat)
5442{
5443 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5444 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5445 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5446 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5447 factor, nelts_per_vq);
5448}
5449
0fdc30bc
RS
5450/* Return true if we can add X using a single SVE INC or DEC instruction. */
5451
5452bool
5453aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5454{
5455 poly_int64 value;
5456 return (poly_int_rtx_p (x, &value)
5457 && (aarch64_sve_cnt_immediate_p (value)
5458 || aarch64_sve_cnt_immediate_p (-value)));
5459}
5460
5461/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5462 operand 0. */
5463
5464char *
5465aarch64_output_sve_scalar_inc_dec (rtx offset)
5466{
5467 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5468 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5469 if (offset_value.coeffs[1] > 0)
139df05a 5470 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
5471 offset_value.coeffs[1], 0);
5472 else
139df05a 5473 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
5474 -offset_value.coeffs[1], 0);
5475}
5476
43cacb12
RS
5477/* Return true if we can add VALUE to a register using a single ADDVL
5478 or ADDPL instruction. */
5479
5480static bool
5481aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5482{
5483 HOST_WIDE_INT factor = value.coeffs[0];
5484 if (factor == 0 || value.coeffs[1] != factor)
5485 return false;
5486 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5487 and a value of 16 is one vector width. */
5488 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5489 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5490}
5491
5492/* Likewise for rtx X. */
5493
5494bool
5495aarch64_sve_addvl_addpl_immediate_p (rtx x)
5496{
5497 poly_int64 value;
5498 return (poly_int_rtx_p (x, &value)
5499 && aarch64_sve_addvl_addpl_immediate_p (value));
5500}
5501
0fdc30bc
RS
5502/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5503 to operand 1 and storing the result in operand 0. */
43cacb12
RS
5504
5505char *
0fdc30bc 5506aarch64_output_sve_addvl_addpl (rtx offset)
43cacb12
RS
5507{
5508 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5509 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5510 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5511
43cacb12
RS
5512 int factor = offset_value.coeffs[1];
5513 if ((factor & 15) == 0)
5514 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5515 else
5516 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5517 return buffer;
5518}
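/* For example, an OFFSET of (16, 16) prints as "addvl\t%x0, %x1, #1",
   (-32, -32) prints as "addvl\t%x0, %x1, #-2" and (2, 2) prints as
   "addpl\t%x0, %x1, #1".  */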
5519
5520/* Return true if X is a valid immediate for an SVE vector INC or DEC
5521 instruction. If it is, store the number of elements in each vector
5522 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5523 factor in *FACTOR_OUT (if nonnull). */
5524
5525bool
0fdc30bc
RS
5526aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5527 unsigned int *nelts_per_vq_out)
43cacb12
RS
5528{
5529 rtx elt;
5530 poly_int64 value;
5531
5532 if (!const_vec_duplicate_p (x, &elt)
5533 || !poly_int_rtx_p (elt, &value))
5534 return false;
5535
5536 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5537 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5538 /* There's no vector INCB. */
5539 return false;
5540
5541 HOST_WIDE_INT factor = value.coeffs[0];
5542 if (value.coeffs[1] != factor)
5543 return false;
5544
5545 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5546 if ((factor % nelts_per_vq) != 0
5547 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5548 return false;
5549
5550 if (factor_out)
5551 *factor_out = factor;
5552 if (nelts_per_vq_out)
5553 *nelts_per_vq_out = nelts_per_vq;
5554 return true;
5555}
5556
5557/* Return true if X is a valid immediate for an SVE vector INC or DEC
5558 instruction. */
5559
5560bool
0fdc30bc 5561aarch64_sve_vector_inc_dec_immediate_p (rtx x)
43cacb12 5562{
0fdc30bc 5563 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
43cacb12
RS
5564}
5565
5566/* Return the asm template for an SVE vector INC or DEC instruction.
5567 OPERANDS gives the operands before the vector count and X is the
5568 value of the vector count operand itself. */
5569
5570char *
0fdc30bc 5571aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
43cacb12
RS
5572{
5573 int factor;
5574 unsigned int nelts_per_vq;
0fdc30bc 5575 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
43cacb12
RS
5576 gcc_unreachable ();
5577 if (factor < 0)
139df05a
RS
5578 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5579 -factor, nelts_per_vq);
43cacb12 5580 else
139df05a
RS
5581 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5582 factor, nelts_per_vq);
43cacb12 5583}
43e9d192 5584
a0960365
WD
5585/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5586
5587static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5588 {
5589 0x0000000100000001ull,
5590 0x0001000100010001ull,
5591 0x0101010101010101ull,
5592 0x1111111111111111ull,
5593 0x5555555555555555ull,
5594 };
5595
5596
5597
5598/* Return true if 64-bit VAL is a valid bitmask immediate. */
5599static bool
5600aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5601{
5602 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5603 int bits;
5604
5605 /* Check for a single sequence of one bits and return quickly if so.
5606 The special cases of all ones and all zeroes return false. */
5607 tmp = val + (val & -val);
5608
5609 if (tmp == (tmp & -tmp))
5610 return (val + 1) > 1;
5611
5612 /* Invert if the immediate doesn't start with a zero bit - this means we
5613 only need to search for sequences of one bits. */
5614 if (val & 1)
5615 val = ~val;
5616
5617 /* Find the first set bit and set tmp to val with the first sequence of one
5618 bits removed. Return success if there is a single sequence of ones. */
5619 first_one = val & -val;
5620 tmp = val & (val + first_one);
5621
5622 if (tmp == 0)
5623 return true;
5624
5625 /* Find the next set bit and compute the difference in bit position. */
5626 next_one = tmp & -tmp;
5627 bits = clz_hwi (first_one) - clz_hwi (next_one);
5628 mask = val ^ tmp;
5629
5630 /* Check the bit position difference is a power of 2, and that the first
5631 sequence of one bits fits within 'bits' bits. */
5632 if ((mask >> bits) != 0 || bits != (bits & -bits))
5633 return false;
5634
5635 /* Check the sequence of one bits is repeated 64/bits times. */
5636 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5637}
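/* A few sample values: 0x0000000000fff000 (a single run of twelve ones),
   0x00ff00ff00ff00ff (eight ones repeating every 16 bits) and
   0x5555555555555555 (alternating ones) are all valid bitmask immediates,
   whereas 0, ~0ull and 0x0000000012345678 are not.  */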
5638
5639
5640/* Return true if VAL is a valid bitmask immediate for MODE. */
5641bool
ba1536da 5642aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
a0960365
WD
5643{
5644 if (mode == DImode)
ba1536da 5645 return aarch64_bitmask_imm (val);
a0960365
WD
5646
5647 if (mode == SImode)
5648 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5649
5650 /* Replicate small immediates to fit 64 bits. */
5651 int size = GET_MODE_UNIT_PRECISION (mode);
5652 val &= (HOST_WIDE_INT_1U << size) - 1;
5653 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5654
5655 return aarch64_bitmask_imm (val);
5656}
5657
5658
5659/* Return true if the immediate VAL can be a bitmask immediate
5660 by changing the given MASK bits in VAL to zeroes, ones or bits
5661 from the other half of VAL. Return the new immediate in VAL2. */
5662static inline bool
5663aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5664 unsigned HOST_WIDE_INT &val2,
5665 unsigned HOST_WIDE_INT mask)
5666{
5667 val2 = val & ~mask;
5668 if (val2 != val && aarch64_bitmask_imm (val2))
5669 return true;
5670 val2 = val | mask;
5671 if (val2 != val && aarch64_bitmask_imm (val2))
5672 return true;
5673 val = val & ~mask;
5674 val2 = val | (((val >> 32) | (val << 32)) & mask);
5675 if (val2 != val && aarch64_bitmask_imm (val2))
5676 return true;
5677 val2 = val | (((val >> 16) | (val << 48)) & mask);
5678 if (val2 != val && aarch64_bitmask_imm (val2))
5679 return true;
5680 return false;
5681}
5682
5683
ba1536da
WD
5684/* Return true if VAL is a valid MOVZ immediate. */
5685static inline bool
5686aarch64_is_movz (unsigned HOST_WIDE_INT val)
a0960365 5687{
ba1536da 5688 return (val >> (ctz_hwi (val) & 48)) < 65536;
a0960365
WD
5689}
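/* For example, 0x1234, 0xabcd0000 and 0x1234ull << 32 are MOVZ immediates
   (16 significant bits at a 16-bit-aligned position), while 0x12340 and
   0x100010000 are not.  */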
5690
5691
ba1536da 5692/* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
a0960365 5693bool
ba1536da 5694aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
a0960365 5695{
ba1536da
WD
5696 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5697 || aarch64_bitmask_imm (val);
5698}
a0960365 5699
ba1536da
WD
5700
5701/* Return true if VAL is an immediate that can be created by a single
5702 MOV instruction. */
5703bool
5704aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5705{
5706 gcc_assert (mode == SImode || mode == DImode);
5707
5708 if (val < 65536)
5709 return true;
5710
5711 unsigned HOST_WIDE_INT mask =
5712 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5713
5714 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5715 return true;
5716
5717 val = (val & mask) | ((val << 32) & ~mask);
5718 return aarch64_bitmask_imm (val);
a0960365
WD
5719}
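/* For example, in DImode: 0xfff (MOVZ), 0xabcd00000000 (MOVZ with a shift),
   0xffffffffffff1234 (MOVN) and 0x5555555555555555 (a bitmask immediate
   usable by the ORR form of MOV) are all single-instruction immediates,
   whereas 0x12345678 needs a MOVZ/MOVK pair and so returns false.  */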
5720
5721
82614948
RR
5722static int
5723aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
ba1536da 5724 machine_mode mode)
43e9d192 5725{
43e9d192 5726 int i;
9a4865db
WD
5727 unsigned HOST_WIDE_INT val, val2, mask;
5728 int one_match, zero_match;
5729 int num_insns;
43e9d192 5730
ba1536da
WD
5731 gcc_assert (mode == SImode || mode == DImode);
5732
9a4865db
WD
5733 val = INTVAL (imm);
5734
5735 if (aarch64_move_imm (val, mode))
43e9d192 5736 {
82614948 5737 if (generate)
f7df4a84 5738 emit_insn (gen_rtx_SET (dest, imm));
9a4865db 5739 return 1;
43e9d192
IB
5740 }
5741
9a4865db 5742 if ((val >> 32) == 0 || mode == SImode)
43e9d192 5743 {
82614948
RR
5744 if (generate)
5745 {
9a4865db
WD
5746 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5747 if (mode == SImode)
5748 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5749 GEN_INT ((val >> 16) & 0xffff)));
5750 else
5751 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5752 GEN_INT ((val >> 16) & 0xffff)));
82614948 5753 }
9a4865db 5754 return 2;
43e9d192
IB
5755 }
5756
5757 /* Remaining cases are all for DImode. */
5758
43e9d192 5759 mask = 0xffff;
9a4865db
WD
5760 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5761 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5762 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5763 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
43e9d192 5764
ba1536da
WD
5765 /* Try a bitmask immediate and a movk to generate the immediate
5766 in 2 instructions. */
5767
a0960365 5768 if (zero_match < 2 && one_match < 2)
43e9d192 5769 {
a0960365 5770 for (i = 0; i < 64; i += 16)
ba1536da
WD
5771 {
5772 if (aarch64_check_bitmask (val, val2, mask << i))
5773 break;
5774
5775 val2 = val & ~(mask << i);
5776 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5777 break;
5778 }
5779
5780 if (i != 64)
5781 {
5782 if (generate)
5783 {
5784 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5785 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5786 GEN_INT ((val >> i) & 0xffff)));
5787 }
5788 return 2;
5789 }
a0960365
WD
5790 }
5791
5792 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5793 if (zero_match + one_match == 0)
5794 {
5795 for (i = 0; i < 48; i += 16)
5796 for (int j = i + 16; j < 64; j += 16)
5797 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
43e9d192 5798 {
a0960365
WD
5799 if (generate)
5800 {
5801 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5802 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5803 GEN_INT ((val >> i) & 0xffff)));
5804 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5805 GEN_INT ((val >> j) & 0xffff)));
5806 }
5807 return 3;
43e9d192 5808 }
43e9d192
IB
5809 }
5810
9a4865db
WD
5811 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5812 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5813 otherwise skip zero bits. */
2c274197 5814
9a4865db 5815 num_insns = 1;
43e9d192 5816 mask = 0xffff;
9a4865db
WD
5817 val2 = one_match > zero_match ? ~val : val;
5818 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5819
5820 if (generate)
5821 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5822 ? (val | ~(mask << i))
5823 : (val & (mask << i)))));
5824 for (i += 16; i < 64; i += 16)
43e9d192 5825 {
9a4865db
WD
5826 if ((val2 & (mask << i)) == 0)
5827 continue;
5828 if (generate)
5829 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5830 GEN_INT ((val >> i) & 0xffff)));
5831 num_insns ++;
82614948
RR
5832 }
5833
5834 return num_insns;
5835}
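/* For example, for 0x1234000000005678 in DImode, zero_match is 2, so both
   multi-instruction bitmask branches are skipped and the final loop emits a
   move of #0x5678 followed by a MOVK of #0x1234 at bit position 48,
   returning 2.  */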
5836
c0bb5bc5
WD
5837/* Return whether imm is a 128-bit immediate which is simple enough to
5838 expand inline. */
5839bool
5840aarch64_mov128_immediate (rtx imm)
5841{
3793ecc1 5842 if (CONST_INT_P (imm))
c0bb5bc5
WD
5843 return true;
5844
5845 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5846
5847 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5848 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5849
5850 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5851 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5852}
5853
5854
a0960365
WD
5855/* Return true if VAL can be encoded as a 12-bit unsigned immediate with
5856 a left shift of 0 or 12 bits. */
5857bool
ba1536da 5858aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
a0960365 5859{
ba1536da 5860 return val < 4096 || (val & 0xfff000) == val;
a0960365
WD
5861}
5862
5863/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5864 that can be created with a left shift of 0 or 12. */
5865static HOST_WIDE_INT
ba1536da 5866aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
a0960365
WD
5867{
5868 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5869 handle correctly. */
ba1536da 5870 gcc_assert (val < 0x1000000);
a0960365 5871
ba1536da 5872 if (val < 4096)
a0960365
WD
5873 return val;
5874
ba1536da 5875 return val & 0xfff000;
a0960365
WD
5876}
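/* For example, 0xabc and 0xabc000 are accepted by aarch64_uimm12_shift
   while 0x1001 and 0x1000000 are not; aarch64_clamp_to_uimm12_shift rounds
   an in-range value such as 0xabc123 down to 0xabc000.  */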
5877
5878
5879/* Test whether:
5880
5881 X = (X & AND_VAL) | IOR_VAL;
5882
5883 can be implemented using:
5884
5885 MOVK X, #(IOR_VAL >> shift), LSL #shift
5886
5887 Return the shift if so, otherwise return -1. */
5888int
5889aarch64_movk_shift (const wide_int_ref &and_val,
5890 const wide_int_ref &ior_val)
5891{
5892 unsigned int precision = and_val.get_precision ();
5893 unsigned HOST_WIDE_INT mask = 0xffff;
5894 for (unsigned int shift = 0; shift < precision; shift += 16)
5895 {
5896 if (and_val == ~mask && (ior_val & mask) == ior_val)
5897 return shift;
5898 mask <<= 16;
5899 }
5900 return -1;
5901}
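/* For example, with 64-bit precision, AND_VAL == 0xffffffff0000ffff and
   IOR_VAL == 0x12340000 give a shift of 16, i.e. the combination is a
   single MOVK of #0x1234 at bit position 16, whereas AND_VAL == 0 always
   gives -1.  */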
5902
5903/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5904 Assumed precondition: VAL_IN is not zero. */
5905
5906unsigned HOST_WIDE_INT
5907aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5908{
5909 int lowest_bit_set = ctz_hwi (val_in);
5910 int highest_bit_set = floor_log2 (val_in);
5911 gcc_assert (val_in != 0);
5912
5913 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5914 (HOST_WIDE_INT_1U << lowest_bit_set));
5915}
5916
5917/* Create a constant in which every bit outside the range from the lowest
5918 set bit to the highest set bit of VAL_IN is set to 1. */
5919
5920unsigned HOST_WIDE_INT
5921aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5922{
5923 return val_in | ~aarch64_and_split_imm1 (val_in);
5924}
5925
5926/* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5927
5928bool
5929aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5930{
5931 scalar_int_mode int_mode;
5932 if (!is_a <scalar_int_mode> (mode, &int_mode))
5933 return false;
5934
5935 if (aarch64_bitmask_imm (val_in, int_mode))
5936 return false;
5937
5938 if (aarch64_move_imm (val_in, int_mode))
5939 return false;
5940
5941 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5942
5943 return aarch64_bitmask_imm (imm2, int_mode);
5944}
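/* Worked example in SImode: VAL_IN == 0x0ff000f0 is neither a bitmask nor a
   MOV immediate, but aarch64_and_split_imm1 gives 0x0ffffff0 (a contiguous
   run of ones) and aarch64_and_split_imm2 gives 0xfff000ff (a rotated run),
   both valid bitmask immediates whose intersection is VAL_IN again, so the
   AND can be split into two AND-immediate instructions.  */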
5945
43cacb12
RS
5946/* Return the number of temporary registers that aarch64_add_offset_1
5947 would need to add OFFSET to a register. */
5948
5949static unsigned int
5950aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5951{
1bb3e2c0 5952 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
43cacb12
RS
5953}
5954
f5470a77
RS
5955/* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5956 a non-polynomial OFFSET. MODE is the mode of the addition.
5957 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5958 be set and CFA adjustments added to the generated instructions.
5959
5960 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5961 temporary if register allocation is already complete. This temporary
5962 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5963 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5964 the immediate again.
0100c5f9
RS
5965
5966 Since this function may be used to adjust the stack pointer, we must
5967 ensure that it cannot cause transient stack deallocation (for example
5968 by first incrementing SP and then decrementing when adjusting by a
5969 large immediate). */
5970
5971static void
f5470a77
RS
5972aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5973 rtx src, HOST_WIDE_INT offset, rtx temp1,
5974 bool frame_related_p, bool emit_move_imm)
0100c5f9 5975{
f5470a77
RS
5976 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5977 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5978
42bc589e 5979 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
0100c5f9
RS
5980 rtx_insn *insn;
5981
f5470a77
RS
5982 if (!moffset)
5983 {
5984 if (!rtx_equal_p (dest, src))
5985 {
5986 insn = emit_insn (gen_rtx_SET (dest, src));
5987 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5988 }
5989 return;
5990 }
0100c5f9
RS
5991
5992 /* Single instruction adjustment. */
f5470a77 5993 if (aarch64_uimm12_shift (moffset))
0100c5f9 5994 {
f5470a77 5995 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
0100c5f9
RS
5996 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5997 return;
5998 }
5999
f5470a77
RS
6000 /* Emit 2 additions/subtractions if the adjustment fits in 24 bits
6001 and either:
6002
6003 a) the offset cannot be loaded by a 16-bit move or
6004 b) there is no spare register into which we can move it. */
6005 if (moffset < 0x1000000
6006 && ((!temp1 && !can_create_pseudo_p ())
6007 || !aarch64_move_imm (moffset, mode)))
0100c5f9 6008 {
f5470a77 6009 HOST_WIDE_INT low_off = moffset & 0xfff;
0100c5f9 6010
f5470a77
RS
6011 low_off = offset < 0 ? -low_off : low_off;
6012 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
0100c5f9 6013 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77 6014 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
0100c5f9
RS
6015 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6016 return;
6017 }
6018
6019 /* Emit a move immediate if required and an addition/subtraction. */
0100c5f9 6020 if (emit_move_imm)
f5470a77
RS
6021 {
6022 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
7aa605c9
JJ
6023 temp1 = aarch64_force_temporary (mode, temp1,
6024 gen_int_mode (moffset, mode));
f5470a77
RS
6025 }
6026 insn = emit_insn (offset < 0
6027 ? gen_sub3_insn (dest, src, temp1)
6028 : gen_add3_insn (dest, src, temp1));
0100c5f9
RS
6029 if (frame_related_p)
6030 {
6031 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77
RS
6032 rtx adj = plus_constant (mode, src, offset);
6033 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
0100c5f9
RS
6034 }
6035}
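/* For example, adding 0x123456 (which is not a single-MOV immediate) takes
   the two-addition path above:
      add dest, src, #0x456
      add dest, dest, #0x123000
   whereas an adjustment of 0x1000000 or more first moves abs (OFFSET) into
   a temporary (unless EMIT_MOVE_IMM says it is already there) and then uses
   a single register add or subtract.  */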
6036
43cacb12
RS
6037/* Return the number of temporary registers that aarch64_add_offset
6038 would need to move OFFSET into a register or add OFFSET to a register;
6039 ADD_P is true if we want the latter rather than the former. */
6040
6041static unsigned int
6042aarch64_offset_temporaries (bool add_p, poly_int64 offset)
6043{
6044 /* This follows the same structure as aarch64_add_offset. */
6045 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
6046 return 0;
6047
6048 unsigned int count = 0;
6049 HOST_WIDE_INT factor = offset.coeffs[1];
6050 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6051 poly_int64 poly_offset (factor, factor);
6052 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6053 /* Need one register for the ADDVL/ADDPL result. */
6054 count += 1;
6055 else if (factor != 0)
6056 {
6057 factor = abs (factor);
6058 if (factor > 16 * (factor & -factor))
6059 /* Need one register for the CNT result and one for the multiplication
6060 factor. If necessary, the second temporary can be reused for the
6061 constant part of the offset. */
6062 return 2;
6063 /* Need one register for the CNT result (which might then
6064 be shifted). */
6065 count += 1;
6066 }
6067 return count + aarch64_add_offset_1_temporaries (constant);
6068}
6069
6070/* If X can be represented as a poly_int64, return the number
6071 of temporaries that are required to add it to a register.
6072 Return -1 otherwise. */
6073
6074int
6075aarch64_add_offset_temporaries (rtx x)
6076{
6077 poly_int64 offset;
6078 if (!poly_int_rtx_p (x, &offset))
6079 return -1;
6080 return aarch64_offset_temporaries (true, offset);
6081}
6082
f5470a77
RS
6083/* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
6084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
6085 be set and CFA adjustments added to the generated instructions.
6086
6087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
6088 temporary if register allocation is already complete. This temporary
43cacb12
RS
6089 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
6090 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6091 false to avoid emitting the immediate again.
6092
6093 TEMP2, if nonnull, is a second temporary register that doesn't
6094 overlap either DEST or REG.
f5470a77
RS
6095
6096 Since this function may be used to adjust the stack pointer, we must
6097 ensure that it cannot cause transient stack deallocation (for example
6098 by first incrementing SP and then decrementing when adjusting by a
6099 large immediate). */
6100
6101static void
6102aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
43cacb12
RS
6103 poly_int64 offset, rtx temp1, rtx temp2,
6104 bool frame_related_p, bool emit_move_imm = true)
0100c5f9 6105{
f5470a77
RS
6106 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6107 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
43cacb12
RS
6108 gcc_assert (temp1 == NULL_RTX
6109 || !frame_related_p
6110 || !reg_overlap_mentioned_p (temp1, dest));
6111 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6112
6113 /* Try using ADDVL or ADDPL to add the whole value. */
6114 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6115 {
6116 rtx offset_rtx = gen_int_mode (offset, mode);
6117 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6118 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6119 return;
6120 }
6121
6122 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6123 SVE vector register, over and above the minimum size of 128 bits.
6124 This is equivalent to half the value returned by CNTD with a
6125 vector shape of ALL. */
6126 HOST_WIDE_INT factor = offset.coeffs[1];
6127 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6128
6129 /* Try using ADDVL or ADDPL to add the VG-based part. */
6130 poly_int64 poly_offset (factor, factor);
6131 if (src != const0_rtx
6132 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6133 {
6134 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6135 if (frame_related_p)
6136 {
6137 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6138 RTX_FRAME_RELATED_P (insn) = true;
6139 src = dest;
6140 }
6141 else
6142 {
6143 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6144 src = aarch64_force_temporary (mode, temp1, addr);
6145 temp1 = temp2;
6146 temp2 = NULL_RTX;
6147 }
6148 }
6149 /* Otherwise use a CNT-based sequence. */
6150 else if (factor != 0)
6151 {
6152 /* Use a subtraction if we have a negative factor. */
6153 rtx_code code = PLUS;
6154 if (factor < 0)
6155 {
6156 factor = -factor;
6157 code = MINUS;
6158 }
6159
6160 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6161 into the multiplication. */
6162 rtx val;
6163 int shift = 0;
6164 if (factor & 1)
6165 /* Use a right shift by 1. */
6166 shift = -1;
6167 else
6168 factor /= 2;
6169 HOST_WIDE_INT low_bit = factor & -factor;
6170 if (factor <= 16 * low_bit)
6171 {
6172 if (factor > 16 * 8)
6173 {
6174 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6175 the value with the minimum multiplier and shift it into
6176 position. */
6177 int extra_shift = exact_log2 (low_bit);
6178 shift += extra_shift;
6179 factor >>= extra_shift;
6180 }
6181 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6182 }
6183 else
6184 {
7d8bdfa7
RS
6185 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6186 directly, since that should increase the chances of being
6187 able to use a shift and add sequence. If LOW_BIT itself
6188 is out of range, just use CNTD. */
6189 if (low_bit <= 16 * 8)
6190 factor /= low_bit;
6191 else
6192 low_bit = 1;
6193
6194 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
43cacb12
RS
6195 val = aarch64_force_temporary (mode, temp1, val);
6196
7d8bdfa7
RS
6197 if (can_create_pseudo_p ())
6198 {
6199 rtx coeff1 = gen_int_mode (factor, mode);
d7cea7ce 6200 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
7d8bdfa7
RS
6201 }
6202 else
43cacb12 6203 {
7d8bdfa7
RS
6204 /* Go back to using a negative multiplication factor if we have
6205 no register from which to subtract. */
6206 if (code == MINUS && src == const0_rtx)
6207 {
6208 factor = -factor;
6209 code = PLUS;
6210 }
6211 rtx coeff1 = gen_int_mode (factor, mode);
6212 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6213 val = gen_rtx_MULT (mode, val, coeff1);
43cacb12 6214 }
43cacb12
RS
6215 }
6216
6217 if (shift > 0)
6218 {
6219 /* Multiply by 1 << SHIFT. */
6220 val = aarch64_force_temporary (mode, temp1, val);
6221 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6222 }
6223 else if (shift == -1)
6224 {
6225 /* Divide by 2. */
6226 val = aarch64_force_temporary (mode, temp1, val);
6227 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6228 }
6229
6230 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6231 if (src != const0_rtx)
6232 {
6233 val = aarch64_force_temporary (mode, temp1, val);
6234 val = gen_rtx_fmt_ee (code, mode, src, val);
6235 }
6236 else if (code == MINUS)
6237 {
6238 val = aarch64_force_temporary (mode, temp1, val);
6239 val = gen_rtx_NEG (mode, val);
6240 }
6241
6242 if (constant == 0 || frame_related_p)
6243 {
6244 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6245 if (frame_related_p)
6246 {
6247 RTX_FRAME_RELATED_P (insn) = true;
6248 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6249 gen_rtx_SET (dest, plus_constant (Pmode, src,
6250 poly_offset)));
6251 }
6252 src = dest;
6253 if (constant == 0)
6254 return;
6255 }
6256 else
6257 {
6258 src = aarch64_force_temporary (mode, temp1, val);
6259 temp1 = temp2;
6260 temp2 = NULL_RTX;
6261 }
6262
6263 emit_move_imm = true;
6264 }
f5470a77 6265
f5470a77
RS
6266 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6267 frame_related_p, emit_move_imm);
0100c5f9
RS
6268}
6269
43cacb12
RS
6270/* Like aarch64_add_offset, but the offset is given as an rtx rather
6271 than a poly_int64. */
6272
6273void
6274aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6275 rtx offset_rtx, rtx temp1, rtx temp2)
6276{
6277 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6278 temp1, temp2, false);
6279}
6280
f5470a77
RS
6281/* Add DELTA to the stack pointer, marking the instructions frame-related.
6282 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6283 if TEMP1 already contains abs (DELTA). */
6284
0100c5f9 6285static inline void
43cacb12 6286aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
0100c5f9 6287{
f5470a77 6288 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
43cacb12 6289 temp1, temp2, true, emit_move_imm);
0100c5f9
RS
6290}
6291
f5470a77
RS
6292/* Subtract DELTA from the stack pointer, marking the instructions
6293 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6294 if nonnull. */
6295
0100c5f9 6296static inline void
cd1bef27
JL
6297aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6298 bool emit_move_imm = true)
0100c5f9 6299{
f5470a77 6300 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
cd1bef27 6301 temp1, temp2, frame_related_p, emit_move_imm);
0100c5f9 6302}
82614948 6303
43cacb12
RS
6304/* Set DEST to (vec_series BASE STEP). */
6305
6306static void
6307aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
82614948
RR
6308{
6309 machine_mode mode = GET_MODE (dest);
43cacb12
RS
6310 scalar_mode inner = GET_MODE_INNER (mode);
6311
6312 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6313 if (!aarch64_sve_index_immediate_p (base))
6314 base = force_reg (inner, base);
6315 if (!aarch64_sve_index_immediate_p (step))
6316 step = force_reg (inner, step);
6317
6318 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6319}
82614948 6320
4aeb1ba7
RS
6321/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6322 register of mode MODE. Use TARGET for the result if it's nonnull
6323 and convenient.
6324
6325 The two vector modes must have the same element mode. The behavior
6326 is to duplicate architectural lane N of SRC into architectural lanes
6327 N + I * STEP of the result. On big-endian targets, architectural
6328 lane 0 of an Advanced SIMD vector is the last element of the vector
6329 in memory layout, so for big-endian targets this operation has the
6330 effect of reversing SRC before duplicating it. Callers need to
6331 account for this. */
43cacb12 6332
4aeb1ba7
RS
6333rtx
6334aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6335{
6336 machine_mode src_mode = GET_MODE (src);
6337 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6338 insn_code icode = (BYTES_BIG_ENDIAN
6339 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6340 : code_for_aarch64_vec_duplicate_vq_le (mode));
6341
6342 unsigned int i = 0;
6343 expand_operand ops[3];
6344 create_output_operand (&ops[i++], target, mode);
6345 create_output_operand (&ops[i++], src, src_mode);
6346 if (BYTES_BIG_ENDIAN)
6347 {
6348 /* Create a PARALLEL describing the reversal of SRC. */
6349 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6350 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6351 nelts_per_vq - 1, -1);
6352 create_fixed_operand (&ops[i++], sel);
43cacb12 6353 }
4aeb1ba7
RS
6354 expand_insn (icode, i, ops);
6355 return ops[0].value;
6356}
6357
6358/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6359 the memory image into DEST. Return true on success. */
43cacb12 6360
4aeb1ba7
RS
6361static bool
6362aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6363{
6364 src = force_const_mem (GET_MODE (src), src);
43cacb12
RS
6365 if (!src)
6366 return false;
6367
6368 /* Make sure that the address is legitimate. */
4aeb1ba7 6369 if (!aarch64_sve_ld1rq_operand_p (src))
43cacb12
RS
6370 {
6371 rtx addr = force_reg (Pmode, XEXP (src, 0));
6372 src = replace_equiv_address (src, addr);
6373 }
6374
947b1372 6375 machine_mode mode = GET_MODE (dest);
cc68f7c2 6376 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
16de3637 6377 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4aeb1ba7 6378 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
43cacb12
RS
6379 return true;
6380}
6381
a065e0bb
RS
6382/* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6383 by N "background" values. Try to move it into TARGET using:
6384
6385 PTRUE PRED.<T>, VL<N>
6386 MOV TRUE.<T>, #<foreground>
6387 MOV FALSE.<T>, #<background>
6388 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6389
6390 The PTRUE is always a single instruction but the MOVs might need a
6391 longer sequence. If the background value is zero (as it often is),
6392 the sequence can sometimes collapse to a PTRUE followed by a
6393 zero-predicated move.
6394
6395 Return the target on success, otherwise return null. */
6396
6397static rtx
6398aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6399{
6400 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6401
6402 /* Make sure that the PTRUE is valid. */
6403 machine_mode mode = GET_MODE (src);
6404 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6405 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6406 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6407 == AARCH64_NUM_SVPATTERNS)
6408 return NULL_RTX;
6409
6410 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6411 rtx_vector_builder true_builder (mode, npatterns, 1);
6412 rtx_vector_builder false_builder (mode, npatterns, 1);
6413 for (unsigned int i = 0; i < npatterns; ++i)
6414 {
6415 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6416 pred_builder.quick_push (CONST1_RTX (BImode));
6417 }
6418 for (unsigned int i = 0; i < npatterns; ++i)
6419 {
6420 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6421 pred_builder.quick_push (CONST0_RTX (BImode));
6422 }
6423 expand_operand ops[4];
6424 create_output_operand (&ops[0], target, mode);
6425 create_input_operand (&ops[1], true_builder.build (), mode);
6426 create_input_operand (&ops[2], false_builder.build (), mode);
6427 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6428 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6429 return target;
6430}
6431
4aeb1ba7
RS
6432/* Return a register containing CONST_VECTOR SRC, given that SRC has an
6433 SVE data mode and isn't a legitimate constant. Use TARGET for the
6434 result if convenient.
43cacb12 6435
4aeb1ba7
RS
6436 The returned register can have whatever mode seems most natural
6437 given the contents of SRC. */
6438
6439static rtx
6440aarch64_expand_sve_const_vector (rtx target, rtx src)
43cacb12
RS
6441{
6442 machine_mode mode = GET_MODE (src);
6443 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6444 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4aeb1ba7
RS
6445 scalar_mode elt_mode = GET_MODE_INNER (mode);
6446 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
cc68f7c2
RS
6447 unsigned int container_bits = aarch64_sve_container_bits (mode);
6448 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6449
6450 if (nelts_per_pattern == 1
6451 && encoded_bits <= 128
6452 && container_bits != elt_bits)
6453 {
6454 /* We have a partial vector mode and a constant whose full-vector
6455 equivalent would occupy a repeating 128-bit sequence. Build that
6456 full-vector equivalent instead, so that we have the option of
6457 using LD1RQ and Advanced SIMD operations. */
6458 unsigned int repeat = container_bits / elt_bits;
6459 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6460 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6461 for (unsigned int i = 0; i < npatterns; ++i)
6462 for (unsigned int j = 0; j < repeat; ++j)
6463 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6464 target = aarch64_target_reg (target, full_mode);
6465 return aarch64_expand_sve_const_vector (target, builder.build ());
6466 }
4aeb1ba7
RS
6467
6468 if (nelts_per_pattern == 1 && encoded_bits == 128)
6469 {
6470 /* The constant is a duplicated quadword but can't be narrowed
6471 beyond a quadword. Get the memory image of the first quadword
6472 as a 128-bit vector and try using LD1RQ to load it from memory.
6473
6474 The effect for both endiannesses is to load memory lane N into
6475 architectural lanes N + I * STEP of the result. On big-endian
6476 targets, the layout of the 128-bit vector in an Advanced SIMD
6477 register would be different from its layout in an SVE register,
6478 but this 128-bit vector is a memory value only. */
6479 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6480 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6481 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6482 return target;
6483 }
6484
6485 if (nelts_per_pattern == 1 && encoded_bits < 128)
6486 {
6487 /* The vector is a repeating sequence of 64 bits or fewer.
6488 See if we can load them using an Advanced SIMD move and then
6489 duplicate it to fill a vector. This is better than using a GPR
6490 move because it keeps everything in the same register file. */
6491 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6492 rtx_vector_builder builder (vq_mode, npatterns, 1);
6493 for (unsigned int i = 0; i < npatterns; ++i)
6494 {
6495 /* We want memory lane N to go into architectural lane N,
6496 so reverse for big-endian targets. The DUP .Q pattern
6497 has a compensating reverse built-in. */
6498 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6499 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6500 }
6501 rtx vq_src = builder.build ();
6502 if (aarch64_simd_valid_immediate (vq_src, NULL))
6503 {
6504 vq_src = force_reg (vq_mode, vq_src);
6505 return aarch64_expand_sve_dupq (target, mode, vq_src);
6506 }
6507
6508 /* Get an integer representation of the repeating part of Advanced
6509 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6510 which for big-endian targets is lane-swapped wrt a normal
6511 Advanced SIMD vector. This means that for both endiannesses,
6512 memory lane N of SVE vector SRC corresponds to architectural
6513 lane N of a register holding VQ_SRC. This in turn means that
6514 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6515 as a single 128-bit value) and thus that memory lane 0 of SRC is
6516 in the lsb of the integer. Duplicating the integer therefore
6517 ensures that memory lane N of SRC goes into architectural lane
6518 N + I * INDEX of the SVE register. */
6519 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6520 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6521 if (elt_value)
6522 {
6523 /* Pretend that we had a vector of INT_MODE to start with. */
6524 elt_mode = int_mode;
6525 mode = aarch64_full_sve_mode (int_mode).require ();
6526
6527 /* If the integer can be moved into a general register by a
6528 single instruction, do that and duplicate the result. */
6529 if (CONST_INT_P (elt_value)
952c8a1d
WD
6530 && aarch64_move_imm (INTVAL (elt_value),
6531 encoded_bits <= 32 ? SImode : DImode))
4aeb1ba7
RS
6532 {
6533 elt_value = force_reg (elt_mode, elt_value);
6534 return expand_vector_broadcast (mode, elt_value);
6535 }
6536 }
6537 else if (npatterns == 1)
6538 /* We're duplicating a single value, but can't do better than
6539 force it to memory and load from there. This handles things
6540 like symbolic constants. */
6541 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
43cacb12 6542
4aeb1ba7 6543 if (elt_value)
8179efe0 6544 {
4aeb1ba7
RS
6545 /* Load the element from memory if we can, otherwise move it into
6546 a register and use a DUP. */
6547 rtx op = force_const_mem (elt_mode, elt_value);
6548 if (!op)
6549 op = force_reg (elt_mode, elt_value);
6550 return expand_vector_broadcast (mode, op);
8179efe0 6551 }
43cacb12
RS
6552 }
6553
4aeb1ba7
RS
6554 /* Try using INDEX. */
6555 rtx base, step;
6556 if (const_vec_series_p (src, &base, &step))
6557 {
6558 aarch64_expand_vec_series (target, base, step);
6559 return target;
6560 }
6561
6562 /* From here on, it's better to force the whole constant to memory
6563 if we can. */
6564 if (GET_MODE_NUNITS (mode).is_constant ())
6565 return NULL_RTX;
6566
a065e0bb
RS
6567 if (nelts_per_pattern == 2)
6568 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6569 return res;
6570
43cacb12 6571 /* Expand each pattern individually. */
4aeb1ba7 6572 gcc_assert (npatterns > 1);
43cacb12
RS
6573 rtx_vector_builder builder;
6574 auto_vec<rtx, 16> vectors (npatterns);
6575 for (unsigned int i = 0; i < npatterns; ++i)
6576 {
6577 builder.new_vector (mode, 1, nelts_per_pattern);
6578 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6579 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6580 vectors.quick_push (force_reg (mode, builder.build ()));
6581 }
6582
6583 /* Use permutes to interleave the separate vectors. */
6584 while (npatterns > 1)
6585 {
6586 npatterns /= 2;
6587 for (unsigned int i = 0; i < npatterns; ++i)
6588 {
4aeb1ba7 6589 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
43cacb12
RS
6590 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6591 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6592 vectors[i] = tmp;
6593 }
6594 }
4aeb1ba7
RS
6595 gcc_assert (vectors[0] == target);
6596 return target;
43cacb12
RS
6597}
6598
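/* Illustrative sketch, not part of GCC: the pairwise-ZIP1 loop above
   on plain arrays, with made-up data.  Four patterns holding elements
   { 0, 4, 8, 12 }, { 1, 5, 9, 13 }, { 2, 6, 10, 14 } and
   { 3, 7, 11, 15 } are interleaved back into 0, 1, 2, ..., 15 in
   log2 (npatterns) rounds, mirroring how the UNSPEC_ZIP1 permutes
   recombine the per-pattern vectors.  */
static void
sketch_zip_interleave (void)
{
  unsigned int npatterns = 4;
  unsigned int len = 4;         /* Elements currently valid per vector.  */
  int vectors[4][16] = {
    { 0, 4, 8, 12 }, { 1, 5, 9, 13 }, { 2, 6, 10, 14 }, { 3, 7, 11, 15 }
  };
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
        {
          /* "ZIP1" of vectors[i] and vectors[i + npatterns].  */
          int tmp[16];
          for (unsigned int j = 0; j < len; ++j)
            {
              tmp[2 * j] = vectors[i][j];
              tmp[2 * j + 1] = vectors[i + npatterns][j];
            }
          for (unsigned int j = 0; j < 2 * len; ++j)
            vectors[i][j] = tmp[j];
        }
      len *= 2;
    }
  /* vectors[0] is now { 0, 1, 2, ..., 15 }.  */
}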
678faefc
RS
6599/* Use WHILE to set a predicate register of mode MODE in which the first
6600 VL bits are set and the rest are clear. Use TARGET for the register
6601 if it's nonnull and convenient. */
0b1fe8cf 6602
678faefc
RS
6603static rtx
6604aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6605 unsigned int vl)
0b1fe8cf
RS
6606{
6607 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
678faefc 6608 target = aarch64_target_reg (target, mode);
6ad9571b 6609 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
624d0f07 6610 target, const0_rtx, limit));
678faefc
RS
6611 return target;
6612}
6613
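/* Illustrative sketch, not part of GCC: the predicate produced by the
   WHILELO above, expressed as a bitmask over up to 64 lanes.  Lane I
   is active iff I < VL, i.e. the first VL bits are set and the rest
   are clear.  */
static unsigned long long
sketch_whilelo_mask (unsigned int vl, unsigned int nlanes)
{
  unsigned long long mask = 0;
  for (unsigned int i = 0; i < nlanes && i < 64; ++i)
    if (i < vl)
      mask |= 1ull << i;
  return mask;
}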
2803bc3b
RS
6614static rtx
6615aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6616
6617/* BUILDER is a constant predicate in which the index of every set bit
6618 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6619 by inverting every element at a multiple of ELT_SIZE and EORing the
6620 result with an ELT_SIZE PTRUE.
6621
6622 Return a register that contains the constant on success, otherwise
6623 return null. Use TARGET as the register if it is nonnull and
6624 convenient. */
6625
6626static rtx
6627aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6628 unsigned int elt_size)
6629{
6630 /* Invert every element at a multiple of ELT_SIZE, keeping the
6631 other bits zero. */
6632 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6633 builder.nelts_per_pattern ());
6634 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6635 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6636 inv_builder.quick_push (const1_rtx);
6637 else
6638 inv_builder.quick_push (const0_rtx);
6639 inv_builder.finalize ();
6640
6641 /* See if we can load the constant cheaply. */
6642 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6643 if (!inv)
6644 return NULL_RTX;
6645
6646 /* EOR the result with an ELT_SIZE PTRUE. */
6647 rtx mask = aarch64_ptrue_all (elt_size);
6648 mask = force_reg (VNx16BImode, mask);
26bebf57 6649 inv = gen_lowpart (VNx16BImode, inv);
2803bc3b
RS
6650 target = aarch64_target_reg (target, VNx16BImode);
6651 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6652 return target;
6653}
6654
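/* Illustrative sketch, not part of GCC: the identity behind the EOR
   trick above, on a 16-lane bitmask standing in for a VNx16BI block.
   C must only have set bits at multiples of ELT_SIZE.  Inverting
   those lanes and EORing with an ELT_SIZE PTRUE recovers C, so if the
   inverted constant is cheap to load then so is C.  */
static bool
sketch_pred_eor_identity (unsigned int c, unsigned int elt_size)
{
  unsigned int ptrue = 0, inv = 0;
  for (unsigned int i = 0; i < 16; i += elt_size)
    {
      ptrue |= 1u << i;
      if ((c & (1u << i)) == 0)
        inv |= 1u << i;
    }
  return (inv ^ ptrue) == c;
}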
6655/* BUILDER is a constant predicate in which the index of every set bit
6656 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6657 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6658 register on success, otherwise return null. Use TARGET as the register
6659 if nonnull and convenient. */
6660
6661static rtx
6662aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6663 unsigned int elt_size,
6664 unsigned int permute_size)
6665{
6666 /* We're going to split the constant into two new constants A and B,
6667 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6668 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6669
6670 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6671 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6672
6673 where _ indicates elements that will be discarded by the permute.
6674
6675 First calculate the ELT_SIZEs for A and B. */
6676 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6677 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6678 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6679 if (INTVAL (builder.elt (i)) != 0)
6680 {
6681 if (i & permute_size)
6682 b_elt_size |= i - permute_size;
6683 else
6684 a_elt_size |= i;
6685 }
6686 a_elt_size &= -a_elt_size;
6687 b_elt_size &= -b_elt_size;
6688
6689 /* Now construct the vectors themselves. */
6690 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6691 builder.nelts_per_pattern ());
6692 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6693 builder.nelts_per_pattern ());
6694 unsigned int nelts = builder.encoded_nelts ();
6695 for (unsigned int i = 0; i < nelts; ++i)
6696 if (i & (elt_size - 1))
6697 {
6698 a_builder.quick_push (const0_rtx);
6699 b_builder.quick_push (const0_rtx);
6700 }
6701 else if ((i & permute_size) == 0)
6702 {
6703 /* The A and B elements are significant. */
6704 a_builder.quick_push (builder.elt (i));
6705 b_builder.quick_push (builder.elt (i + permute_size));
6706 }
6707 else
6708 {
6709 /* The A and B elements are going to be discarded, so pick whatever
6710 is likely to give a nice constant. We are targeting element
6711 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6712 with the aim of each being a sequence of ones followed by
6713 a sequence of zeros. So:
6714
6715 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6716 duplicate the last X_ELT_SIZE element, to extend the
6717 current sequence of ones or zeros.
6718
6719 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6720 zero, so that the constant really does have X_ELT_SIZE and
6721 not a smaller size. */
6722 if (a_elt_size > permute_size)
6723 a_builder.quick_push (const0_rtx);
6724 else
6725 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6726 if (b_elt_size > permute_size)
6727 b_builder.quick_push (const0_rtx);
6728 else
6729 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6730 }
6731 a_builder.finalize ();
6732 b_builder.finalize ();
6733
6734 /* Try loading A into a register. */
6735 rtx_insn *last = get_last_insn ();
6736 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6737 if (!a)
6738 return NULL_RTX;
6739
6740 /* Try loading B into a register. */
6741 rtx b = a;
6742 if (a_builder != b_builder)
6743 {
6744 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6745 if (!b)
6746 {
6747 delete_insns_since (last);
6748 return NULL_RTX;
6749 }
6750 }
6751
8535755a
TC
6752 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6753 operands but permutes them as though they had mode MODE. */
2803bc3b 6754 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
8535755a
TC
6755 target = aarch64_target_reg (target, GET_MODE (a));
6756 rtx type_reg = CONST0_RTX (mode);
6757 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
2803bc3b
RS
6758 return target;
6759}
6760
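/* Illustrative sketch, not part of GCC: why the A/B split above can be
   reassembled with a TRN1 at PERMUTE_SIZE granularity.  PRED is a
   16-lane 0/1 mask and PERMUTE_SIZE is a power of 2 <= 8.  The
   don't-care lanes of A and B (the "_" entries in the comment above)
   never reach the result because TRN1 only reads even-numbered
   PERMUTE_SIZE-sized groups of each input; the filler heuristic in
   the real code only matters for making A and B cheap to construct.  */
static bool
sketch_pred_trn_roundtrip (const unsigned char pred[16],
                           unsigned int permute_size)
{
  unsigned char a[16], b[16], res[16];
  for (unsigned int i = 0; i < 16; ++i)
    if ((i & permute_size) == 0)
      {
        a[i] = pred[i];
        b[i] = pred[i + permute_size];
      }
    else
      a[i] = b[i] = 0;  /* Discarded by the TRN1.  */

  /* TRN1 on groups of PERMUTE_SIZE lanes: even groups come from A,
     odd groups from the corresponding even groups of B.  */
  for (unsigned int i = 0; i < 16; ++i)
    if ((i & permute_size) == 0)
      {
        res[i] = a[i];
        res[i + permute_size] = b[i];
      }

  for (unsigned int i = 0; i < 16; ++i)
    if (res[i] != pred[i])
      return false;
  return true;
}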
678faefc
RS
6761/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6762 constant in BUILDER into an SVE predicate register. Return the register
6763 on success, otherwise return null. Use TARGET for the register if
2803bc3b
RS
6764 nonnull and convenient.
6765
6766 ALLOW_RECURSE_P is true if we can use methods that would call this
6767 function recursively. */
678faefc
RS
6768
6769static rtx
2803bc3b
RS
6770aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6771 bool allow_recurse_p)
678faefc
RS
6772{
6773 if (builder.encoded_nelts () == 1)
6774 /* A PFALSE or a PTRUE .B ALL. */
6775 return aarch64_emit_set_immediate (target, builder);
6776
6777 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6778 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6779 {
6780 /* If we can load the constant using PTRUE, use it as-is. */
6781 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6782 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6783 return aarch64_emit_set_immediate (target, builder);
6784
6785 /* Otherwise use WHILE to set the first VL bits. */
6786 return aarch64_sve_move_pred_via_while (target, mode, vl);
6787 }
6788
2803bc3b
RS
6789 if (!allow_recurse_p)
6790 return NULL_RTX;
6791
6792 /* Try inverting the vector in element size ELT_SIZE and then EORing
6793 the result with an ELT_SIZE PTRUE. */
6794 if (INTVAL (builder.elt (0)) == 0)
6795 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6796 elt_size))
6797 return res;
6798
6799 /* Try using TRN1 to permute two simpler constants. */
6800 for (unsigned int i = elt_size; i <= 8; i *= 2)
6801 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6802 elt_size, i))
6803 return res;
6804
678faefc
RS
6805 return NULL_RTX;
6806}
6807
6808/* Return an SVE predicate register that contains the VNx16BImode
6809 constant in BUILDER, without going through the move expanders.
6810
6811 The returned register can have whatever mode seems most natural
6812 given the contents of BUILDER. Use TARGET for the result if
6813 convenient. */
6814
6815static rtx
6816aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6817{
6818 /* Try loading the constant using pure predicate operations. */
2803bc3b 6819 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
678faefc
RS
6820 return res;
6821
6822 /* Try forcing the constant to memory. */
6823 if (builder.full_nelts ().is_constant ())
6824 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6825 {
6826 target = aarch64_target_reg (target, VNx16BImode);
6827 emit_move_insn (target, mem);
6828 return target;
6829 }
6830
6831 /* The last resort is to load the constant as an integer and then
6832 compare it against zero. Use -1 for set bits in order to increase
6833 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6834 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6835 builder.nelts_per_pattern ());
6836 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6837 int_builder.quick_push (INTVAL (builder.elt (i))
6838 ? constm1_rtx : const0_rtx);
6839 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6840 int_builder.build ());
0b1fe8cf
RS
6841}
6842
4aeb1ba7 6843/* Set DEST to immediate IMM. */
43cacb12
RS
6844
6845void
4aeb1ba7 6846aarch64_expand_mov_immediate (rtx dest, rtx imm)
43cacb12
RS
6847{
6848 machine_mode mode = GET_MODE (dest);
82614948
RR
6849
6850 /* Check on what type of symbol it is. */
77e994c9 6851 scalar_int_mode int_mode;
3793ecc1
AC
6852 if ((SYMBOL_REF_P (imm)
6853 || LABEL_REF_P (imm)
43cacb12
RS
6854 || GET_CODE (imm) == CONST
6855 || GET_CODE (imm) == CONST_POLY_INT)
77e994c9 6856 && is_a <scalar_int_mode> (mode, &int_mode))
82614948 6857 {
43cacb12
RS
6858 rtx mem;
6859 poly_int64 offset;
6860 HOST_WIDE_INT const_offset;
82614948
RR
6861 enum aarch64_symbol_type sty;
6862
6863 /* If we have (const (plus symbol offset)), separate out the offset
6864 before we start classifying the symbol. */
43cacb12 6865 rtx base = strip_offset (imm, &offset);
82614948 6866
43cacb12
RS
6867 /* We must always add an offset involving VL separately, rather than
6868 folding it into the relocation. */
6869 if (!offset.is_constant (&const_offset))
6870 {
c0e0174b
RS
6871 if (!TARGET_SVE)
6872 {
6873 aarch64_report_sve_required ();
6874 return;
6875 }
43cacb12
RS
6876 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6877 emit_insn (gen_rtx_SET (dest, imm));
6878 else
6879 {
6880 /* Do arithmetic on 32-bit values if the result is smaller
6881 than that. */
6882 if (partial_subreg_p (int_mode, SImode))
6883 {
6884 /* It is invalid to do symbol calculations in modes
6885 narrower than SImode. */
6886 gcc_assert (base == const0_rtx);
6887 dest = gen_lowpart (SImode, dest);
6888 int_mode = SImode;
6889 }
6890 if (base != const0_rtx)
6891 {
6892 base = aarch64_force_temporary (int_mode, dest, base);
6893 aarch64_add_offset (int_mode, dest, base, offset,
6894 NULL_RTX, NULL_RTX, false);
6895 }
6896 else
6897 aarch64_add_offset (int_mode, dest, base, offset,
6898 dest, NULL_RTX, false);
6899 }
6900 return;
6901 }
6902
6903 sty = aarch64_classify_symbol (base, const_offset);
82614948
RR
6904 switch (sty)
6905 {
6906 case SYMBOL_FORCE_TO_MEM:
e8beba1c
RS
6907 if (int_mode != ptr_mode)
6908 imm = convert_memory_address (ptr_mode, imm);
6909
43cacb12 6910 if (const_offset != 0
e8beba1c 6911 && targetm.cannot_force_const_mem (ptr_mode, imm))
82614948
RR
6912 {
6913 gcc_assert (can_create_pseudo_p ());
77e994c9 6914 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
6915 aarch64_add_offset (int_mode, dest, base, const_offset,
6916 NULL_RTX, NULL_RTX, false);
82614948
RR
6917 return;
6918 }
b4f50fd4 6919
82614948
RR
6920 mem = force_const_mem (ptr_mode, imm);
6921 gcc_assert (mem);
b4f50fd4
RR
6922
6923 /* If we aren't generating PC relative literals, then
6924 we need to expand the literal pool access carefully.
6925 This is something that needs to be done in a number
6926 of places, so could well live as a separate function. */
9ee6540a 6927 if (!aarch64_pcrelative_literal_loads)
b4f50fd4
RR
6928 {
6929 gcc_assert (can_create_pseudo_p ());
6930 base = gen_reg_rtx (ptr_mode);
6931 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
00eee3fa
WD
6932 if (ptr_mode != Pmode)
6933 base = convert_memory_address (Pmode, base);
b4f50fd4
RR
6934 mem = gen_rtx_MEM (ptr_mode, base);
6935 }
6936
77e994c9
RS
6937 if (int_mode != ptr_mode)
6938 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
b4f50fd4 6939
f7df4a84 6940 emit_insn (gen_rtx_SET (dest, mem));
b4f50fd4 6941
82614948
RR
6942 return;
6943
6944 case SYMBOL_SMALL_TLSGD:
6945 case SYMBOL_SMALL_TLSDESC:
79496620 6946 case SYMBOL_SMALL_TLSIE:
1b1e81f8 6947 case SYMBOL_SMALL_GOT_28K:
6642bdb4 6948 case SYMBOL_SMALL_GOT_4G:
82614948 6949 case SYMBOL_TINY_GOT:
5ae7caad 6950 case SYMBOL_TINY_TLSIE:
43cacb12 6951 if (const_offset != 0)
82614948
RR
6952 {
6953 gcc_assert(can_create_pseudo_p ());
77e994c9 6954 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
6955 aarch64_add_offset (int_mode, dest, base, const_offset,
6956 NULL_RTX, NULL_RTX, false);
82614948
RR
6957 return;
6958 }
6959 /* FALLTHRU */
6960
82614948
RR
6961 case SYMBOL_SMALL_ABSOLUTE:
6962 case SYMBOL_TINY_ABSOLUTE:
cbf5629e 6963 case SYMBOL_TLSLE12:
d18ba284 6964 case SYMBOL_TLSLE24:
cbf5629e
JW
6965 case SYMBOL_TLSLE32:
6966 case SYMBOL_TLSLE48:
82614948
RR
6967 aarch64_load_symref_appropriately (dest, imm, sty);
6968 return;
6969
6970 default:
6971 gcc_unreachable ();
6972 }
6973 }
6974
6975 if (!CONST_INT_P (imm))
6976 {
678faefc
RS
6977 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6978 {
6979 /* Only the low bit of each .H, .S and .D element is defined,
6980 so we can set the upper bits to whatever we like. If the
6981 predicate is all-true in MODE, prefer to set all the undefined
6982 bits as well, so that we can share a single .B predicate for
6983 all modes. */
6984 if (imm == CONSTM1_RTX (mode))
6985 imm = CONSTM1_RTX (VNx16BImode);
6986
6987 /* All methods for constructing predicate modes wider than VNx16BI
6988 will set the upper bits of each element to zero. Expose this
6989 by moving such constants as a VNx16BI, so that all bits are
6990 significant and so that constants for different modes can be
6991 shared. The wider constant will still be available as a
6992 REG_EQUAL note. */
6993 rtx_vector_builder builder;
6994 if (aarch64_get_sve_pred_bits (builder, imm))
6995 {
6996 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6997 if (dest != res)
6998 emit_move_insn (dest, gen_lowpart (mode, res));
6999 return;
7000 }
7001 }
7002
43cacb12
RS
7003 if (GET_CODE (imm) == HIGH
7004 || aarch64_simd_valid_immediate (imm, NULL))
43cacb12 7005 {
4aeb1ba7
RS
7006 emit_insn (gen_rtx_SET (dest, imm));
7007 return;
43e9d192 7008 }
82614948 7009
568b9c0e 7010 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
4aeb1ba7
RS
7011 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
7012 {
7013 if (dest != res)
7014 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
7015 return;
7016 }
7017
7018 rtx mem = force_const_mem (mode, imm);
7019 gcc_assert (mem);
7020 emit_move_insn (dest, mem);
82614948 7021 return;
43e9d192 7022 }
82614948 7023
ba1536da 7024 aarch64_internal_mov_immediate (dest, imm, true, mode);
43e9d192
IB
7025}
7026
74b27d8e
RS
7027/* Return the MEM rtx that provides the canary value that should be used
7028 for stack-smashing protection. MODE is the mode of the memory.
7029 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
7030 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
7031 indicates whether the caller is performing a SET or a TEST operation. */
7032
7033rtx
7034aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
7035 aarch64_salt_type salt_type)
7036{
7037 rtx addr;
7038 if (aarch64_stack_protector_guard == SSP_GLOBAL)
7039 {
7040 gcc_assert (MEM_P (decl_rtl));
7041 addr = XEXP (decl_rtl, 0);
7042 poly_int64 offset;
7043 rtx base = strip_offset_and_salt (addr, &offset);
7044 if (!SYMBOL_REF_P (base))
7045 return decl_rtl;
7046
7047 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
7048 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
7049 addr = gen_rtx_CONST (Pmode, addr);
7050 addr = plus_constant (Pmode, addr, offset);
7051 }
7052 else
7053 {
7054 /* Calculate the address from the system register. */
7055 rtx salt = GEN_INT (salt_type);
7056 addr = gen_reg_rtx (mode);
7057 if (mode == DImode)
7058 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
7059 else
7060 {
7061 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
7062 addr = convert_memory_address (Pmode, addr);
7063 }
7064 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
7065 }
7066 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
7067}
7068
43cacb12
RS
7069/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
7070 that is known to contain PTRUE. */
7071
7072void
7073aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
7074{
0c63a8ee
TC
7075 expand_operand ops[3];
7076 machine_mode mode = GET_MODE (dest);
7077 create_output_operand (&ops[0], dest, mode);
7078 create_input_operand (&ops[1], pred, GET_MODE(pred));
7079 create_input_operand (&ops[2], src, mode);
f2b29269 7080 temporary_volatile_ok v (true);
0c63a8ee 7081 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
43cacb12
RS
7082}
7083
7084/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
7085 operand is in memory. In this case we need to use the predicated LD1
7086 and ST1 instead of LDR and STR, both for correctness on big-endian
7087 targets and because LD1 and ST1 support a wider range of addressing modes.
7088 PRED_MODE is the mode of the predicate.
7089
7090 See the comment at the head of aarch64-sve.md for details about the
7091 big-endian handling. */
7092
7093void
7094aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7095{
7096 machine_mode mode = GET_MODE (dest);
16de3637 7097 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
7098 if (!register_operand (src, mode)
7099 && !register_operand (dest, mode))
7100 {
7101 rtx tmp = gen_reg_rtx (mode);
7102 if (MEM_P (src))
7103 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7104 else
7105 emit_move_insn (tmp, src);
7106 src = tmp;
7107 }
7108 aarch64_emit_sve_pred_move (dest, ptrue, src);
7109}
7110
002092be
RS
7111/* Called only on big-endian targets. See whether an SVE vector move
7112 from SRC to DEST is effectively a REV[BHW] instruction, because at
7113 least one operand is a subreg of an SVE vector that has wider or
7114 narrower elements. Return true and emit the instruction if so.
7115
7116 For example:
7117
7118 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7119
7120 represents a VIEW_CONVERT between the following vectors, viewed
7121 in memory order:
7122
7123 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7124 R1: { [0], [1], [2], [3], ... }
7125
7126 The high part of lane X in R2 should therefore correspond to lane X*2
7127 of R1, but the register representations are:
7128
7129 msb lsb
7130 R2: ...... [1].high [1].low [0].high [0].low
7131 R1: ...... [3] [2] [1] [0]
7132
7133 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7134 We therefore need a reverse operation to swap the high and low values
7135 around.
7136
7137 This is purely an optimization. Without it we would spill the
7138 subreg operand to the stack in one mode and reload it in the
7139 other mode, which has the same effect as the REV. */
7140
7141bool
7142aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7143{
7144 gcc_assert (BYTES_BIG_ENDIAN);
a4d9837e
RS
7145
7146 /* Do not try to optimize subregs that LRA has created for matched
7147 reloads. These subregs only exist as a temporary measure to make
7148 the RTL well-formed, but they are exempt from the usual
7149 TARGET_CAN_CHANGE_MODE_CLASS rules.
7150
7151 For example, if we have:
7152
7153 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7154
7155 and the constraints require R1 and R2 to be in the same register,
7156 LRA may need to create RTL such as:
7157
7158 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7159 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7160 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7161
7162 which forces both the input and output of the original instruction
7163 to use the same hard register. But for this to work, the normal
7164 rules have to be suppressed on the subreg input, otherwise LRA
7165 would need to reload that input too, meaning that the process
7166 would never terminate. To compensate for this, the normal rules
7167 are also suppressed for the subreg output of the first move.
7168 Ignoring the special case and handling the first move normally
7169 would therefore generate wrong code: we would reverse the elements
7170 for the first subreg but not reverse them back for the second subreg. */
7171 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
002092be 7172 dest = SUBREG_REG (dest);
a4d9837e 7173 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
002092be
RS
7174 src = SUBREG_REG (src);
7175
7176 /* The optimization handles two single SVE REGs with different element
7177 sizes. */
7178 if (!REG_P (dest)
7179 || !REG_P (src)
7180 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7181 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7182 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7183 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7184 return false;
7185
7186 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
16de3637 7187 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
002092be
RS
7188 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7189 UNSPEC_REV_SUBREG);
7190 emit_insn (gen_rtx_SET (dest, unspec));
7191 return true;
7192}
7193
7194/* Return a copy of X with mode MODE, without changing its other
7195 attributes. Unlike gen_lowpart, this doesn't care whether the
7196 mode change is valid. */
7197
624d0f07 7198rtx
002092be
RS
7199aarch64_replace_reg_mode (rtx x, machine_mode mode)
7200{
7201 if (GET_MODE (x) == mode)
7202 return x;
7203
7204 x = shallow_copy_rtx (x);
7205 set_mode_and_regno (x, mode, REGNO (x));
7206 return x;
7207}
7208
d7a09c44
RS
7209/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7210 stored in wider integer containers. */
7211
7212static unsigned int
7213aarch64_sve_rev_unspec (machine_mode mode)
7214{
7215 switch (GET_MODE_UNIT_SIZE (mode))
7216 {
7217 case 1: return UNSPEC_REVB;
7218 case 2: return UNSPEC_REVH;
7219 case 4: return UNSPEC_REVW;
7220 }
7221 gcc_unreachable ();
7222}
7223
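/* Illustrative sketch, not part of GCC: the byte-level effect of the
   REV[BHW] chosen above.  UNIT-byte quantities are reversed within
   each CONTAINER-byte container; REVB, REVH and REVW correspond to
   UNIT == 1, 2 and 4 respectively, with CONTAINER being the wider
   integer element size.  */
static void
sketch_rev_within_containers (unsigned char *bytes, unsigned int nbytes,
                              unsigned int unit, unsigned int container)
{
  for (unsigned int base = 0; base < nbytes; base += container)
    for (unsigned int i = 0; i < container / 2; i += unit)
      for (unsigned int j = 0; j < unit; ++j)
        {
          unsigned char tmp = bytes[base + i + j];
          bytes[base + i + j] = bytes[base + container - unit - i + j];
          bytes[base + container - unit - i + j] = tmp;
        }
}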
002092be
RS
7224/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7225 operands. */
7226
7227void
7228aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7229{
d7a09c44
RS
7230 /* Decide which REV operation we need. The mode with wider elements
7231 determines the mode of the operands and the mode with the narrower
002092be 7232 elements determines the reverse width. */
5c06093c
RS
7233 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7234 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
002092be
RS
7235 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7236 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7237 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7238
d7a09c44 7239 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
cc68f7c2 7240 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
002092be 7241
d7a09c44 7242 /* Get the operands in the appropriate modes and emit the instruction. */
002092be 7243 ptrue = gen_lowpart (pred_mode, ptrue);
d7a09c44
RS
7244 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7245 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7246 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7247 dest, ptrue, src));
002092be
RS
7248}
7249
43e9d192 7250static bool
c600df9a 7251aarch64_function_ok_for_sibcall (tree, tree exp)
43e9d192 7252{
c600df9a 7253 if (crtl->abi->id () != expr_callee_abi (exp).id ())
a0d0b980
SE
7254 return false;
7255
43e9d192
IB
7256 return true;
7257}
7258
38e62001
RS
7259/* Subroutine of aarch64_pass_by_reference for arguments that are not
7260 passed in SVE registers. */
43e9d192
IB
7261
7262static bool
56fe3ca3
RS
7263aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7264 const function_arg_info &arg)
43e9d192
IB
7265{
7266 HOST_WIDE_INT size;
ef4bddc2 7267 machine_mode dummymode;
43e9d192
IB
7268 int nregs;
7269
7270 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
52090e4d
RS
7271 if (arg.mode == BLKmode && arg.type)
7272 size = int_size_in_bytes (arg.type);
6a70badb
RS
7273 else
7274 /* No frontends can create types with variable-sized modes, so we
7275 shouldn't be asked to pass or return them. */
52090e4d 7276 size = GET_MODE_SIZE (arg.mode).to_constant ();
43e9d192 7277
aadc1c43 7278 /* Aggregates are passed by reference based on their size. */
52090e4d
RS
7279 if (arg.aggregate_type_p ())
7280 size = int_size_in_bytes (arg.type);
43e9d192
IB
7281
7282 /* Variable sized arguments are always returned by reference. */
7283 if (size < 0)
7284 return true;
7285
7286 /* Can this be a candidate to be passed in fp/simd register(s)? */
52090e4d 7287 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
56fe3ca3
RS
7288 &dummymode, &nregs, NULL,
7289 !pcum || pcum->silent_p))
43e9d192
IB
7290 return false;
7291
7292 /* Arguments which are variable sized or larger than 2 registers are
7293 passed by reference unless they are a homogeneous floating-point
7294 aggregate. */
7295 return size > 2 * UNITS_PER_WORD;
7296}
7297
38e62001
RS
7298/* Implement TARGET_PASS_BY_REFERENCE. */
7299
7300static bool
7301aarch64_pass_by_reference (cumulative_args_t pcum_v,
7302 const function_arg_info &arg)
7303{
7304 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7305
7306 if (!arg.type)
56fe3ca3 7307 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
7308
7309 pure_scalable_type_info pst_info;
7310 switch (pst_info.analyze (arg.type))
7311 {
7312 case pure_scalable_type_info::IS_PST:
7313 if (pcum && !pcum->silent_p && !TARGET_SVE)
7314 /* We can't gracefully recover at this point, so make this a
7315 fatal error. */
7316 fatal_error (input_location, "arguments of type %qT require"
7317 " the SVE ISA extension", arg.type);
7318
7319 /* Variadic SVE types are passed by reference. Normal non-variadic
7320 arguments are too if we've run out of registers. */
7321 return (!arg.named
7322 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7323 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7324
7325 case pure_scalable_type_info::DOESNT_MATTER:
56fe3ca3 7326 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
38e62001
RS
7327 return true;
7328
7329 case pure_scalable_type_info::NO_ABI_IDENTITY:
7330 case pure_scalable_type_info::ISNT_PST:
56fe3ca3 7331 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
7332 }
7333 gcc_unreachable ();
7334}
7335
43e9d192
IB
7336/* Return TRUE if VALTYPE is padded to its least significant bits. */
7337static bool
7338aarch64_return_in_msb (const_tree valtype)
7339{
ef4bddc2 7340 machine_mode dummy_mode;
43e9d192
IB
7341 int dummy_int;
7342
7343 /* Never happens in little-endian mode. */
7344 if (!BYTES_BIG_ENDIAN)
7345 return false;
7346
7347 /* Only composite types smaller than or equal to 16 bytes can
7348 potentially be returned in registers. */
7349 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7350 || int_size_in_bytes (valtype) <= 0
7351 || int_size_in_bytes (valtype) > 16)
7352 return false;
7353
7354 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7355 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7356 is always passed/returned in the least significant bits of fp/simd
7357 register(s). */
7358 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
56fe3ca3
RS
7359 &dummy_mode, &dummy_int, NULL,
7360 false))
43e9d192
IB
7361 return false;
7362
38e62001
RS
7363 /* Likewise pure scalable types for SVE vector and predicate registers. */
7364 pure_scalable_type_info pst_info;
7365 if (pst_info.analyze_registers (valtype))
7366 return false;
7367
43e9d192
IB
7368 return true;
7369}
7370
38e62001
RS
7371/* Implement TARGET_FUNCTION_VALUE.
7372 Define how to find the value returned by a function. */
7373
43e9d192 7374static rtx
38e62001
RS
7375aarch64_function_value (const_tree type, const_tree func,
7376 bool outgoing ATTRIBUTE_UNUSED)
43e9d192 7377{
38e62001
RS
7378 machine_mode mode;
7379 int unsignedp;
c600df9a 7380
38e62001
RS
7381 mode = TYPE_MODE (type);
7382 if (INTEGRAL_TYPE_P (type))
7383 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
c600df9a 7384
38e62001
RS
7385 pure_scalable_type_info pst_info;
7386 if (type && pst_info.analyze_registers (type))
7387 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
c600df9a 7388
38e62001
RS
7389 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7390 are returned in memory, not by value. */
7391 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7392 bool sve_p = (vec_flags & VEC_ANY_SVE);
c600df9a 7393
43e9d192
IB
7394 if (aarch64_return_in_msb (type))
7395 {
7396 HOST_WIDE_INT size = int_size_in_bytes (type);
7397
7398 if (size % UNITS_PER_WORD != 0)
7399 {
7400 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
f4b31647 7401 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
43e9d192
IB
7402 }
7403 }
7404
6aa5370c
RS
7405 int count;
7406 machine_mode ag_mode;
56fe3ca3
RS
7407 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7408 NULL, false))
43e9d192 7409 {
38e62001 7410 gcc_assert (!sve_p);
43e9d192
IB
7411 if (!aarch64_composite_type_p (type, mode))
7412 {
7413 gcc_assert (count == 1 && mode == ag_mode);
7414 return gen_rtx_REG (mode, V0_REGNUM);
7415 }
eb04ccf4
JW
7416 else if (aarch64_advsimd_full_struct_mode_p (mode)
7417 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7418 return gen_rtx_REG (mode, V0_REGNUM);
7419 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7420 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7421 return gen_rtx_REG (mode, V0_REGNUM);
43e9d192
IB
7422 else
7423 {
7424 int i;
7425 rtx par;
7426
7427 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7428 for (i = 0; i < count; i++)
7429 {
7430 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6a70badb
RS
7431 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7432 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
7433 XVECEXP (par, 0, i) = tmp;
7434 }
7435 return par;
7436 }
7437 }
7438 else
6aa5370c 7439 {
38e62001
RS
7440 if (sve_p)
7441 {
7442 /* Vector types can acquire a partial SVE mode using things like
7443 __attribute__((vector_size(N))), and this is potentially useful.
7444 However, the choice of mode doesn't affect the type's ABI
7445 identity, so we should treat the types as though they had
7446 the associated integer mode, just like they did before SVE
7447 was introduced.
7448
7449 We know that the vector must be 128 bits or smaller,
7450 otherwise we'd have returned it in memory instead. */
7451 gcc_assert (type
7452 && (aarch64_some_values_include_pst_objects_p (type)
7453 || (vec_flags & VEC_PARTIAL)));
7454
7455 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7456 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7457 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7458 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7459 }
7460 return gen_rtx_REG (mode, R0_REGNUM);
6aa5370c 7461 }
6aa5370c
RS
7462}
7463
43e9d192
IB
7464/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7465 Return true if REGNO is the number of a hard register in which the values
7466 of called function may come back. */
7467
7468static bool
7469aarch64_function_value_regno_p (const unsigned int regno)
7470{
7471 /* Maximum of 16 bytes can be returned in the general registers. Examples
7472 of 16-byte return values are: 128-bit integers and 16-byte small
7473 structures (excluding homogeneous floating-point aggregates). */
7474 if (regno == R0_REGNUM || regno == R1_REGNUM)
7475 return true;
7476
7477 /* Up to four fp/simd registers can return a function value, e.g. a
7478 homogeneous floating-point aggregate having four members. */
7479 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
d5726973 7480 return TARGET_FLOAT;
43e9d192
IB
7481
7482 return false;
7483}
7484
38e62001
RS
7485/* Subroutine for aarch64_return_in_memory for types that are not returned
7486 in SVE registers. */
43e9d192
IB
7487
7488static bool
38e62001 7489aarch64_return_in_memory_1 (const_tree type)
43e9d192
IB
7490{
7491 HOST_WIDE_INT size;
ef4bddc2 7492 machine_mode ag_mode;
43e9d192
IB
7493 int count;
7494
7495 if (!AGGREGATE_TYPE_P (type)
7496 && TREE_CODE (type) != COMPLEX_TYPE
7497 && TREE_CODE (type) != VECTOR_TYPE)
7498 /* Simple scalar types are always returned in registers. */
7499 return false;
7500
56fe3ca3
RS
7501 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7502 &ag_mode, &count, NULL, false))
43e9d192
IB
7503 return false;
7504
7505 /* Types larger than 2 registers are returned in memory. */
7506 size = int_size_in_bytes (type);
7507 return (size < 0 || size > 2 * UNITS_PER_WORD);
7508}
7509
38e62001
RS
7510/* Implement TARGET_RETURN_IN_MEMORY.
7511
7512 If the type T of the result of a function is such that
7513 void func (T arg)
7514 would require that arg be passed as a value in a register (or set of
7515 registers) according to the parameter passing rules, then the result
7516 is returned in the same registers as would be used for such an
7517 argument. */
7518
7519static bool
7520aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7521{
7522 pure_scalable_type_info pst_info;
7523 switch (pst_info.analyze (type))
7524 {
7525 case pure_scalable_type_info::IS_PST:
7526 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7527 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7528
7529 case pure_scalable_type_info::DOESNT_MATTER:
7530 gcc_assert (aarch64_return_in_memory_1 (type));
7531 return true;
7532
7533 case pure_scalable_type_info::NO_ABI_IDENTITY:
7534 case pure_scalable_type_info::ISNT_PST:
7535 return aarch64_return_in_memory_1 (type);
7536 }
7537 gcc_unreachable ();
7538}
7539
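/* Illustrative sketch, not part of GCC: the non-SVE decision made by
   aarch64_return_in_memory_1 above, with the type queries reduced to
   three inputs.  IS_SCALAR and IS_HFA_OR_HVA stand in for the
   AGGREGATE_TYPE_P and aarch64_vfp_is_call_or_return_candidate tests;
   SIZE is int_size_in_bytes.  Anything else larger than two 8-byte
   registers goes in memory.  */
static bool
sketch_return_in_memory (bool is_scalar, bool is_hfa_or_hva,
                         long long size)
{
  if (is_scalar || is_hfa_or_hva)
    return false;
  return size < 0 || size > 2 * 8;
}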
43e9d192 7540static bool
ef4bddc2 7541aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
43e9d192
IB
7542 const_tree type, int *nregs)
7543{
7544 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
56fe3ca3 7545 return aarch64_vfp_is_call_or_return_candidate (mode, type,
43e9d192 7546 &pcum->aapcs_vfp_rmode,
56fe3ca3 7547 nregs, NULL, pcum->silent_p);
43e9d192
IB
7548}
7549
985b8393 7550/* Given MODE and TYPE of a function argument, return the alignment in
43e9d192 7551 bits. The idea is to suppress any stronger alignment requested by
c590597c 7552 the user and opt for the natural alignment (specified in AAPCS64 \S
3df1a115 7553 4.1). ABI_BREAK is set to the old alignment if the alignment was
6610daa1
CL
7554 incorrectly calculated in versions of GCC prior to GCC-9.
7555 ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
7556 calculated in versions between GCC-9 and GCC-13. This is a helper
7557 function for local use only. */
43e9d192 7558
985b8393 7559static unsigned int
c590597c 7560aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6610daa1
CL
7561 unsigned int *abi_break,
7562 unsigned int *abi_break_packed)
43e9d192 7563{
49813aad 7564 *abi_break = 0;
6610daa1 7565 *abi_break_packed = 0;
75d6cc81 7566 if (!type)
985b8393 7567 return GET_MODE_ALIGNMENT (mode);
2ec07fa6 7568
75d6cc81 7569 if (integer_zerop (TYPE_SIZE (type)))
985b8393 7570 return 0;
43e9d192 7571
75d6cc81
AL
7572 gcc_assert (TYPE_MODE (type) == mode);
7573
7574 if (!AGGREGATE_TYPE_P (type))
985b8393 7575 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
75d6cc81
AL
7576
7577 if (TREE_CODE (type) == ARRAY_TYPE)
985b8393 7578 return TYPE_ALIGN (TREE_TYPE (type));
75d6cc81 7579
985b8393 7580 unsigned int alignment = 0;
6610daa1 7581 unsigned int bitfield_alignment_with_packed = 0;
c590597c 7582 unsigned int bitfield_alignment = 0;
75d6cc81 7583 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
985b8393 7584 if (TREE_CODE (field) == FIELD_DECL)
c590597c 7585 {
56fe3ca3
RS
7586 /* Note that we explicitly consider zero-sized fields here,
7587 even though they don't map to AAPCS64 machine types.
7588 For example, in:
7589
7590 struct __attribute__((aligned(8))) empty {};
7591
7592 struct s {
7593 [[no_unique_address]] empty e;
7594 int x;
7595 };
7596
7597 "s" contains only one Fundamental Data Type (the int field)
7598 but gains 8-byte alignment and size thanks to "e". */
c590597c
RE
7599 alignment = std::max (alignment, DECL_ALIGN (field));
7600 if (DECL_BIT_FIELD_TYPE (field))
6610daa1
CL
7601 {
7602 /* Take the bit-field type's alignment into account only
7603 if the user didn't reduce this field's alignment with
7604 the packed attribute. */
7605 if (!DECL_PACKED (field))
7606 bitfield_alignment
7607 = std::max (bitfield_alignment,
7608 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7609
7610 /* Compute the alignment even if the bit-field is
7611 packed, so that we can emit a warning in case the
7612 alignment changed between GCC versions. */
7613 bitfield_alignment_with_packed
7614 = std::max (bitfield_alignment_with_packed,
7615 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7616 }
c590597c
RE
7617 }
7618
6610daa1
CL
7619 /* Emit a warning if the alignment is different when taking the
7620 'packed' attribute into account. */
7621 if (bitfield_alignment != bitfield_alignment_with_packed
7622 && bitfield_alignment_with_packed > alignment)
7623 *abi_break_packed = bitfield_alignment_with_packed;
7624
c590597c
RE
7625 if (bitfield_alignment > alignment)
7626 {
49813aad 7627 *abi_break = alignment;
c590597c
RE
7628 return bitfield_alignment;
7629 }
43e9d192 7630
985b8393 7631 return alignment;
43e9d192
IB
7632}
7633
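/* Illustrative sketch, not part of GCC: the field walk above with each
   FIELD_DECL reduced to the data the walk actually looks at.  The
   structure and function names are made up for the example; the logic
   mirrors how ALIGNMENT, BITFIELD_ALIGNMENT and
   BITFIELD_ALIGNMENT_WITH_PACKED interact and when the two ABI-break
   values are reported.  */
struct sketch_field
{
  unsigned int decl_align;              /* DECL_ALIGN (field).  */
  bool is_bitfield;                     /* DECL_BIT_FIELD_TYPE != NULL.  */
  bool is_packed;                       /* DECL_PACKED (field).  */
  unsigned int bitfield_type_align;     /* TYPE_ALIGN of that type.  */
};

static unsigned int
sketch_arg_alignment (const sketch_field *fields, unsigned int nfields,
                      unsigned int *abi_break,
                      unsigned int *abi_break_packed)
{
  unsigned int alignment = 0;
  unsigned int bitfield_alignment = 0;
  unsigned int bitfield_alignment_with_packed = 0;
  *abi_break = 0;
  *abi_break_packed = 0;

  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (fields[i].decl_align > alignment)
        alignment = fields[i].decl_align;
      if (fields[i].is_bitfield)
        {
          if (!fields[i].is_packed
              && fields[i].bitfield_type_align > bitfield_alignment)
            bitfield_alignment = fields[i].bitfield_type_align;
          if (fields[i].bitfield_type_align > bitfield_alignment_with_packed)
            bitfield_alignment_with_packed = fields[i].bitfield_type_align;
        }
    }

  if (bitfield_alignment != bitfield_alignment_with_packed
      && bitfield_alignment_with_packed > alignment)
    *abi_break_packed = bitfield_alignment_with_packed;

  if (bitfield_alignment > alignment)
    {
      *abi_break = alignment;
      return bitfield_alignment;
    }
  return alignment;
}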
7634/* Layout a function argument according to the AAPCS64 rules. The rule
6aa5370c
RS
7635 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7636 mode that was originally given to us by the target hook, whereas the
7637 mode in ARG might be the result of replacing partial SVE modes with
7638 the equivalent integer mode. */
43e9d192
IB
7639
7640static void
38e62001 7641aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
7642{
7643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
c600df9a
RS
7644 tree type = arg.type;
7645 machine_mode mode = arg.mode;
43e9d192
IB
7646 int ncrn, nvrn, nregs;
7647 bool allocate_ncrn, allocate_nvrn;
3abf17cf 7648 HOST_WIDE_INT size;
49813aad 7649 unsigned int abi_break;
6610daa1 7650 unsigned int abi_break_packed;
43e9d192
IB
7651
7652 /* We need to do this once per argument. */
7653 if (pcum->aapcs_arg_processed)
7654 return;
7655
3df1a115
CL
7656 bool warn_pcs_change
7657 = (warn_psabi
7658 && !pcum->silent_p
7659 && (currently_expanding_function_start
7660 || currently_expanding_gimple_stmt));
7661
6610daa1
CL
7662 /* There are several things to note here:
7663
7664 - Both the C and AAPCS64 interpretations of a type's alignment should
7665 give a value that is no greater than the type's size.
7666
7667 - Types bigger than 16 bytes are passed indirectly.
7668
7669 - If an argument of type T is passed indirectly, TYPE and MODE describe
7670 a pointer to T rather than T itself.
7671
7672 It follows that the AAPCS64 alignment of TYPE must be no greater
7673 than 16 bytes.
7674
7675 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7676 and so could calculate an alignment that was too small. If this
7677 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7678
7679 Although GCC 9.1 fixed that bug, it introduced a different one:
7680 it would consider the alignment of a bitfield's underlying type even
7681 if the field was packed (which should have the effect of overriding
7682 the alignment of the underlying type). This was fixed in GCC 13.1.
7683
7684 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7685 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7686 this older, too-big alignment.
7687
7688 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7689 alignments meant they could calculate type alignments that were
7690 bigger than the type's size, contrary to the assumption above.
7691 The handling of register arguments was nevertheless (and justifiably)
7692 written to follow the assumption that the alignment can never be
7693 greater than the size. The same was not true for stack arguments;
7694 their alignment was instead handled by MIN bounds in
7695 aarch64_function_arg_boundary.
7696
7697 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7698 an alignment of more than 16 bytes for TYPE then:
7699
7700 - If the argument was passed in registers, these GCC versions
7701 would treat the alignment as though it was *less than* 16 bytes.
7702
7703 - If the argument was passed on the stack, these GCC versions
7704 would treat the alignment as though it was *equal to* 16 bytes.
7705
7706 Both behaviors were wrong, but in different cases. */
3df1a115 7707 unsigned int alignment
6610daa1
CL
7708 = aarch64_function_arg_alignment (mode, type, &abi_break,
7709 &abi_break_packed);
7710 gcc_assert (alignment <= 16 * BITS_PER_UNIT
7711 && (!alignment || abi_break < alignment)
7712 && (!abi_break_packed || alignment < abi_break_packed));
3df1a115 7713
43e9d192
IB
7714 pcum->aapcs_arg_processed = true;
7715
38e62001
RS
7716 pure_scalable_type_info pst_info;
7717 if (type && pst_info.analyze_registers (type))
c600df9a 7718 {
3df1a115
CL
7719 /* aarch64_function_arg_alignment has never had an effect on
7720 this case. */
7721
c600df9a
RS
7722 /* The PCS says that it is invalid to pass an SVE value to an
7723 unprototyped function. There is no ABI-defined location we
7724 can return in this case, so we have no real choice but to raise
7725 an error immediately, even though this is only a query function. */
7726 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7727 {
7728 gcc_assert (!pcum->silent_p);
7729 error ("SVE type %qT cannot be passed to an unprototyped function",
7730 arg.type);
7731 /* Avoid repeating the message, and avoid tripping the assert
7732 below. */
7733 pcum->pcs_variant = ARM_PCS_SVE;
7734 }
7735
7736 /* We would have converted the argument into pass-by-reference
7737 form if it didn't fit in registers. */
38e62001
RS
7738 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7739 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
c600df9a
RS
7740 gcc_assert (arg.named
7741 && pcum->pcs_variant == ARM_PCS_SVE
c600df9a
RS
7742 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7743 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
38e62001
RS
7744 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7745 P0_REGNUM + pcum->aapcs_nprn);
c600df9a
RS
7746 return;
7747 }
7748
38e62001
RS
7749 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7750 are passed by reference, not by value. */
7751 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7752 bool sve_p = (vec_flags & VEC_ANY_SVE);
7753 if (sve_p)
7754 /* Vector types can acquire a partial SVE mode using things like
7755 __attribute__((vector_size(N))), and this is potentially useful.
7756 However, the choice of mode doesn't affect the type's ABI
7757 identity, so we should treat the types as though they had
7758 the associated integer mode, just like they did before SVE
7759 was introduced.
7760
7761 We know that the vector must be 128 bits or smaller,
7762 otherwise we'd have passed it in memory instead. */
7763 gcc_assert (type
7764 && (aarch64_some_values_include_pst_objects_p (type)
7765 || (vec_flags & VEC_PARTIAL)));
c600df9a 7766
3abf17cf 7767 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
6a70badb
RS
7768 if (type)
7769 size = int_size_in_bytes (type);
7770 else
7771 /* No frontends can create types with variable-sized modes, so we
7772 shouldn't be asked to pass or return them. */
7773 size = GET_MODE_SIZE (mode).to_constant ();
7774 size = ROUND_UP (size, UNITS_PER_WORD);
3abf17cf 7775
43e9d192
IB
7776 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7777 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7778 mode,
7779 type,
7780 &nregs);
38e62001 7781 gcc_assert (!sve_p || !allocate_nvrn);
43e9d192
IB
7782
7783 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7784 The following code thus handles passing by SIMD/FP registers first. */
7785
7786 nvrn = pcum->aapcs_nvrn;
7787
7788 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7789 and homogeneous short-vector aggregates (HVA). */
7790 if (allocate_nvrn)
7791 {
3df1a115
CL
7792 /* aarch64_function_arg_alignment has never had an effect on
7793 this case. */
c600df9a 7794 if (!pcum->silent_p && !TARGET_FLOAT)
fc29dfc9 7795 aarch64_err_no_fpadvsimd (mode);
261fb553 7796
43e9d192
IB
7797 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7798 {
7799 pcum->aapcs_nextnvrn = nvrn + nregs;
7800 if (!aarch64_composite_type_p (type, mode))
7801 {
7802 gcc_assert (nregs == 1);
7803 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7804 }
eb04ccf4
JW
7805 else if (aarch64_advsimd_full_struct_mode_p (mode)
7806 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7807 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7808 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7809 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7810 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
43e9d192
IB
7811 else
7812 {
7813 rtx par;
7814 int i;
7815 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7816 for (i = 0; i < nregs; i++)
7817 {
7818 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7819 V0_REGNUM + nvrn + i);
6a70badb
RS
7820 rtx offset = gen_int_mode
7821 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7822 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
7823 XVECEXP (par, 0, i) = tmp;
7824 }
7825 pcum->aapcs_reg = par;
7826 }
7827 return;
7828 }
7829 else
7830 {
7831 /* C.3 NSRN is set to 8. */
7832 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7833 goto on_stack;
7834 }
7835 }
7836
7837 ncrn = pcum->aapcs_ncrn;
3abf17cf 7838 nregs = size / UNITS_PER_WORD;
43e9d192
IB
7839
7840 /* C6 - C9, though the sign and zero extension semantics are
7841 handled elsewhere. This is the case where the argument fits
7842 entirely in general registers. */
7843 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7844 {
43e9d192
IB
7845 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7846
7847 /* C.8 if the argument has an alignment of 16 then the NGRN is
c590597c 7848 rounded up to the next even number. */
985b8393 7849 if (nregs == 2
6610daa1
CL
7850 && ncrn % 2)
7851 {
7852 /* Emit a warning if the alignment changed when taking the
7853 'packed' attribute into account. */
7854 if (warn_pcs_change
7855 && abi_break_packed
7856 && ((abi_break_packed == 16 * BITS_PER_UNIT)
7857 != (alignment == 16 * BITS_PER_UNIT)))
7858 inform (input_location, "parameter passing for argument of type "
7859 "%qT changed in GCC 13.1", type);
7860
2ec07fa6 7861 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
985b8393 7862 comparison is there because for > 16 * BITS_PER_UNIT
2ec07fa6
RR
7863 alignment nregs should be > 2 and therefore it should be
7864 passed by reference rather than value. */
6610daa1
CL
7865 if (alignment == 16 * BITS_PER_UNIT)
7866 {
7867 if (warn_pcs_change && abi_break)
7868 inform (input_location, "parameter passing for argument of type "
7869 "%qT changed in GCC 9.1", type);
7870 ++ncrn;
7871 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7872 }
43e9d192 7873 }
2ec07fa6 7874
38e62001
RS
7875 /* If an argument with an SVE mode needs to be shifted up to the
7876 high part of the register, treat it as though it had an integer mode.
7877 Using the normal (parallel [...]) would suppress the shifting. */
7878 if (sve_p
7879 && BYTES_BIG_ENDIAN
7880 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7881 && aarch64_pad_reg_upward (mode, type, false))
7882 {
7883 mode = int_mode_for_mode (mode).require ();
7884 sve_p = false;
7885 }
7886
43e9d192 7887 /* NREGS can be 0 when e.g. an empty structure is to be passed.
c590597c 7888 A reg is still generated for it, but the caller should be smart
43e9d192 7889 enough not to use it. */
38e62001
RS
7890 if (nregs == 0
7891 || (nregs == 1 && !sve_p)
7892 || GET_MODE_CLASS (mode) == MODE_INT)
2ec07fa6 7893 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
43e9d192
IB
7894 else
7895 {
7896 rtx par;
7897 int i;
7898
7899 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7900 for (i = 0; i < nregs; i++)
7901 {
38e62001
RS
7902 scalar_int_mode reg_mode = word_mode;
7903 if (nregs == 1)
7904 reg_mode = int_mode_for_mode (mode).require ();
7905 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
43e9d192
IB
7906 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7907 GEN_INT (i * UNITS_PER_WORD));
7908 XVECEXP (par, 0, i) = tmp;
7909 }
7910 pcum->aapcs_reg = par;
7911 }
7912
7913 pcum->aapcs_nextncrn = ncrn + nregs;
7914 return;
7915 }
7916
7917 /* C.11 */
7918 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7919
7920 /* The argument is passed on stack; record the needed number of words for
3abf17cf 7921 this argument and align the total size if necessary. */
43e9d192 7922on_stack:
3abf17cf 7923 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2ec07fa6 7924
6610daa1
CL
7925 if (warn_pcs_change
7926 && abi_break_packed
7927 && ((abi_break_packed >= 16 * BITS_PER_UNIT)
7928 != (alignment >= 16 * BITS_PER_UNIT)))
7929 inform (input_location, "parameter passing for argument of type "
7930 "%qT changed in GCC 13.1", type);
7931
7932 if (alignment == 16 * BITS_PER_UNIT)
c590597c
RE
7933 {
7934 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7935 if (pcum->aapcs_stack_size != new_size)
7936 {
3df1a115 7937 if (warn_pcs_change && abi_break)
c590597c
RE
7938 inform (input_location, "parameter passing for argument of type "
7939 "%qT changed in GCC 9.1", type);
7940 pcum->aapcs_stack_size = new_size;
7941 }
7942 }
43e9d192
IB
7943 return;
7944}
7945
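/* Illustrative sketch, not part of GCC: rule C.8 as handled above,
   reduced to the general-register bookkeeping.  An argument that
   needs two registers and has 16-byte alignment skips an odd NGRN so
   that it starts on an even register; if the registers run out, rule
   C.11 sets NGRN to 8 and the argument goes on the stack.  The
   function name and the return convention (-1 for "on the stack")
   are made up for the example.  */
static int
sketch_allocate_gprs (unsigned int *ncrn, unsigned int nregs,
                      unsigned int alignment_bits)
{
  const unsigned int num_arg_regs = 8;  /* x0-x7.  */
  if (nregs == 2 && (*ncrn & 1) && alignment_bits == 16 * 8)
    ++*ncrn;                            /* C.8 */
  if (*ncrn + nregs > num_arg_regs)
    {
      *ncrn = num_arg_regs;             /* C.11 */
      return -1;
    }
  int first = (int) *ncrn;
  *ncrn += nregs;
  return first;
}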
7946/* Implement TARGET_FUNCTION_ARG. */
7947
7948static rtx
6783fdb7 7949aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
7950{
7951 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 7952 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
7953 || pcum->pcs_variant == ARM_PCS_SIMD
7954 || pcum->pcs_variant == ARM_PCS_SVE);
43e9d192 7955
6783fdb7 7956 if (arg.end_marker_p ())
08cc4d92 7957 return gen_int_mode (pcum->pcs_variant, DImode);
43e9d192 7958
38e62001 7959 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
7960 return pcum->aapcs_reg;
7961}
7962
7963void
7964aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
08cc4d92
RS
7965 const_tree fntype,
7966 rtx libname ATTRIBUTE_UNUSED,
7967 const_tree fndecl ATTRIBUTE_UNUSED,
c600df9a
RS
7968 unsigned n_named ATTRIBUTE_UNUSED,
7969 bool silent_p)
43e9d192
IB
7970{
7971 pcum->aapcs_ncrn = 0;
7972 pcum->aapcs_nvrn = 0;
c600df9a 7973 pcum->aapcs_nprn = 0;
43e9d192
IB
7974 pcum->aapcs_nextncrn = 0;
7975 pcum->aapcs_nextnvrn = 0;
c600df9a 7976 pcum->aapcs_nextnprn = 0;
08cc4d92
RS
7977 if (fntype)
7978 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7979 else
7980 pcum->pcs_variant = ARM_PCS_AAPCS64;
43e9d192
IB
7981 pcum->aapcs_reg = NULL_RTX;
7982 pcum->aapcs_arg_processed = false;
7983 pcum->aapcs_stack_words = 0;
7984 pcum->aapcs_stack_size = 0;
c600df9a 7985 pcum->silent_p = silent_p;
43e9d192 7986
c600df9a
RS
7987 if (!silent_p
7988 && !TARGET_FLOAT
261fb553
AL
7989 && fntype && fntype != error_mark_node)
7990 {
7991 const_tree type = TREE_TYPE (fntype);
7992 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7993 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7994 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
56fe3ca3 7995 &mode, &nregs, NULL, false))
fc29dfc9 7996 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
261fb553 7997 }
c600df9a
RS
7998
7999 if (!silent_p
8000 && !TARGET_SVE
8001 && pcum->pcs_variant == ARM_PCS_SVE)
8002 {
8003 /* We can't gracefully recover at this point, so make this a
8004 fatal error. */
8005 if (fndecl)
8006 fatal_error (input_location, "%qE requires the SVE ISA extension",
8007 fndecl);
8008 else
8009 fatal_error (input_location, "calls to functions of type %qT require"
8010 " the SVE ISA extension", fntype);
8011 }
43e9d192
IB
8012}
8013
8014static void
8015aarch64_function_arg_advance (cumulative_args_t pcum_v,
6930c98c 8016 const function_arg_info &arg)
43e9d192
IB
8017{
8018 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 8019 if (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
8020 || pcum->pcs_variant == ARM_PCS_SIMD
8021 || pcum->pcs_variant == ARM_PCS_SVE)
43e9d192 8022 {
38e62001 8023 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
8024 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
8025 != (pcum->aapcs_stack_words != 0));
8026 pcum->aapcs_arg_processed = false;
8027 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
8028 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
c600df9a 8029 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
43e9d192
IB
8030 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
8031 pcum->aapcs_stack_words = 0;
8032 pcum->aapcs_reg = NULL_RTX;
8033 }
8034}
8035
8036bool
8037aarch64_function_arg_regno_p (unsigned regno)
8038{
8039 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
8040 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
8041}
8042
8043/* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
8044 PARM_BOUNDARY bits of alignment, but will be given anything up
8045 to STACK_BOUNDARY bits if the type requires it. This makes sure
8046 that both before and after the layout of each argument, the Next
8047 Stacked Argument Address (NSAA) will have a minimum alignment of
8048 8 bytes. */
8049
8050static unsigned int
ef4bddc2 8051aarch64_function_arg_boundary (machine_mode mode, const_tree type)
43e9d192 8052{
49813aad 8053 unsigned int abi_break;
6610daa1 8054 unsigned int abi_break_packed;
c590597c 8055 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6610daa1
CL
8056 &abi_break,
8057 &abi_break_packed);
8058 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
8059 to emit warnings about ABI incompatibility. */
49813aad 8060 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
49813aad 8061 return alignment;
43e9d192
IB
8062}
8063
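/* Illustrative sketch, not part of GCC: the clamp applied above with
   AArch64's PARM_BOUNDARY of 64 bits and STACK_BOUNDARY of 128 bits.
   A 32-bit-aligned type still gets a 64-bit stack slot, while an
   over-aligned type is capped at 128 bits; the ABI warnings are left
   to aarch64_layout_arg as noted above.  */
static unsigned int
sketch_arg_boundary (unsigned int type_alignment_bits)
{
  const unsigned int parm_boundary = 64;
  const unsigned int stack_boundary = 128;
  unsigned int alignment = type_alignment_bits;
  if (alignment < parm_boundary)
    alignment = parm_boundary;
  if (alignment > stack_boundary)
    alignment = stack_boundary;
  return alignment;
}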
43cacb12
RS
8064/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
8065
8066static fixed_size_mode
8067aarch64_get_reg_raw_mode (int regno)
8068{
8069 if (TARGET_SVE && FP_REGNUM_P (regno))
8070 /* Don't use the SVE part of the register for __builtin_apply and
8071 __builtin_return. The SVE registers aren't used by the normal PCS,
8072 so using them there would be a waste of time. The PCS extensions
8073 for SVE types are fundamentally incompatible with the
8074 __builtin_return/__builtin_apply interface. */
8075 return as_a <fixed_size_mode> (V16QImode);
8076 return default_get_reg_raw_mode (regno);
8077}
8078
76b0cbf8 8079/* Implement TARGET_FUNCTION_ARG_PADDING.
43e9d192
IB
8080
8081 Small aggregate types are placed in the lowest memory address.
8082
8083 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
8084
76b0cbf8
RS
8085static pad_direction
8086aarch64_function_arg_padding (machine_mode mode, const_tree type)
43e9d192
IB
8087{
8088 /* On little-endian targets, the least significant byte of every stack
8089 argument is passed at the lowest byte address of the stack slot. */
8090 if (!BYTES_BIG_ENDIAN)
76b0cbf8 8091 return PAD_UPWARD;
43e9d192 8092
00edcfbe 8093 /* Otherwise, integral, floating-point and pointer types are padded downward:
43e9d192
IB
8094 the least significant byte of a stack argument is passed at the highest
8095 byte address of the stack slot. */
8096 if (type
00edcfbe
YZ
8097 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
8098 || POINTER_TYPE_P (type))
43e9d192 8099 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
76b0cbf8 8100 return PAD_DOWNWARD;
43e9d192
IB
8101
8102 /* Everything else padded upward, i.e. data in first byte of stack slot. */
76b0cbf8 8103 return PAD_UPWARD;
43e9d192
IB
8104}
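
/* Editorial sketch (not part of the port): the same decision expressed over
   plain flags.  On little-endian everything is padded upward; on big-endian,
   scalars (integers, floating-point values and pointers) are padded downward
   so that their least significant byte ends up at the highest address of the
   stack slot, while other types are padded upward.  */

enum example_pad { EXAMPLE_PAD_UPWARD, EXAMPLE_PAD_DOWNWARD };

static enum example_pad
example_arg_padding (int big_endian_p, int scalar_p)
{
  if (!big_endian_p)
    return EXAMPLE_PAD_UPWARD;
  return scalar_p ? EXAMPLE_PAD_DOWNWARD : EXAMPLE_PAD_UPWARD;
}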
8105
8106/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
8107
8108 It specifies padding for the last (possibly the only)
8109 element of a block move between registers and memory. Viewing
8110 the block as it sits in memory, padding upward means that the
8111 last element is padded after its most significant byte, while
8112 with downward padding the last element is padded on its least
8113 significant byte side.
8114
8115 Small aggregates and small complex types are always padded
8116 upwards.
8117
8118 We don't need to worry about homogeneous floating-point or
8119 short-vector aggregates; their move is not affected by the
8120 padding direction determined here. Regardless of endianness,
8121 each element of such an aggregate is put in the least
8122 significant bits of a fp/simd register.
8123
8124 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8125 register has useful data, and return the opposite if the most
8126 significant byte does. */
8127
8128bool
ef4bddc2 8129aarch64_pad_reg_upward (machine_mode mode, const_tree type,
43e9d192
IB
8130 bool first ATTRIBUTE_UNUSED)
8131{
8132
38e62001
RS
8133 /* Aside from pure scalable types, small composite types are always
8134 padded upward. */
43e9d192
IB
8135 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8136 {
6a70badb
RS
8137 HOST_WIDE_INT size;
8138 if (type)
8139 size = int_size_in_bytes (type);
8140 else
8141 /* No frontends can create types with variable-sized modes, so we
8142 shouldn't be asked to pass or return them. */
8143 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192 8144 if (size < 2 * UNITS_PER_WORD)
38e62001
RS
8145 {
8146 pure_scalable_type_info pst_info;
8147 if (pst_info.analyze_registers (type))
8148 return false;
8149 return true;
8150 }
43e9d192
IB
8151 }
8152
8153 /* Otherwise, use the default padding. */
8154 return !BYTES_BIG_ENDIAN;
8155}
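
/* Editorial sketch (not part of the port): the effect of the function above
   for a few common cases, ignoring the pure-scalable-type special case.
   A 12-byte struct returns true on both endiannesses (its data lives in the
   least significant bytes of the register), whereas a big-endian scalar or
   a large aggregate returns false.  */

static int
example_pad_reg_upward (int big_endian_p, int composite_p, long size_in_bytes)
{
  /* Small composites (< 2 * UNITS_PER_WORD == 16 bytes) keep their data in
     the least significant bytes of the register even on big-endian.  */
  if (big_endian_p && composite_p && size_in_bytes < 16)
    return 1;
  /* Everything else follows !BYTES_BIG_ENDIAN.  */
  return !big_endian_p;
}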
8156
095a2d76 8157static scalar_int_mode
43e9d192
IB
8158aarch64_libgcc_cmp_return_mode (void)
8159{
8160 return SImode;
8161}
8162
a3eb8a52
EB
8163#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8164
8165/* We use the 12-bit shifted immediate arithmetic instructions so values
8166 must be multiple of (1 << 12), i.e. 4096. */
8167#define ARITH_FACTOR 4096
8168
8169#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8170#error Cannot use simple address calculation for stack probing
8171#endif
8172
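/* Editorial sketch (not part of the port): AArch64 ADD/SUB immediates are
   12 bits wide, optionally shifted left by 12 bits, so an offset is directly
   encodable when it is either in 0..4095 or a multiple of 4096 no larger
   than 4095 * 4096.  Rounding probe offsets up to ARITH_FACTOR keeps them in
   the second category.  */

static int
example_uimm12_shift_p (long value)
{
  return ((value >= 0 && value <= 4095)
	  || (value >= 0 && value % 4096 == 0 && value / 4096 <= 4095));
}
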
6a70badb 8173/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
a3eb8a52
EB
8174 inclusive. These are offsets from the current stack pointer. */
8175
8176static void
6a70badb 8177aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
a3eb8a52 8178{
6a70badb
RS
8179 HOST_WIDE_INT size;
8180 if (!poly_size.is_constant (&size))
8181 {
8182 sorry ("stack probes for SVE frames");
8183 return;
8184 }
8185
5773855c 8186 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
a3eb8a52
EB
8187
8188 /* See the same assertion on PROBE_INTERVAL above. */
8189 gcc_assert ((first % ARITH_FACTOR) == 0);
8190
8191 /* See if we have a constant small number of probes to generate. If so,
8192 that's the easy case. */
8193 if (size <= PROBE_INTERVAL)
8194 {
8195 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8196
8197 emit_set_insn (reg1,
5f5c5e0f 8198 plus_constant (Pmode,
a3eb8a52 8199 stack_pointer_rtx, -(first + base)));
5f5c5e0f 8200 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
a3eb8a52
EB
8201 }
8202
8203 /* The run-time loop is made up of 8 insns in the generic case while the
8204 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
8205 else if (size <= 4 * PROBE_INTERVAL)
8206 {
8207 HOST_WIDE_INT i, rem;
8208
8209 emit_set_insn (reg1,
5f5c5e0f 8210 plus_constant (Pmode,
a3eb8a52
EB
8211 stack_pointer_rtx,
8212 -(first + PROBE_INTERVAL)));
8213 emit_stack_probe (reg1);
8214
8215 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8216 it exceeds SIZE. If only two probes are needed, this will not
8217 generate any code. Then probe at FIRST + SIZE. */
8218 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8219 {
8220 emit_set_insn (reg1,
5f5c5e0f 8221 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
a3eb8a52
EB
8222 emit_stack_probe (reg1);
8223 }
8224
8225 rem = size - (i - PROBE_INTERVAL);
8226 if (rem > 256)
8227 {
8228 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8229
5f5c5e0f
EB
8230 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8231 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
a3eb8a52
EB
8232 }
8233 else
5f5c5e0f 8234 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
a3eb8a52
EB
8235 }
8236
8237 /* Otherwise, do the same as above, but in a loop. Note that we must be
8238 extra careful with variables wrapping around because we might be at
8239 the very top (or the very bottom) of the address space and we have
8240 to be able to handle this case properly; in particular, we use an
8241 equality test for the loop condition. */
8242 else
8243 {
5773855c 8244 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
a3eb8a52
EB
8245
8246 /* Step 1: round SIZE to the previous multiple of the interval. */
8247
8248 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8249
8250
8251 /* Step 2: compute initial and final value of the loop counter. */
8252
8253 /* TEST_ADDR = SP + FIRST. */
8254 emit_set_insn (reg1,
5f5c5e0f 8255 plus_constant (Pmode, stack_pointer_rtx, -first));
a3eb8a52
EB
8256
8257 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
13f752b2
JL
8258 HOST_WIDE_INT adjustment = - (first + rounded_size);
8259 if (! aarch64_uimm12_shift (adjustment))
8260 {
8261 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8262 true, Pmode);
8263 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8264 }
8265 else
8dd64cdf
EB
8266 emit_set_insn (reg2,
8267 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8268
a3eb8a52
EB
8269 /* Step 3: the loop
8270
8271 do
8272 {
8273 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8274 probe at TEST_ADDR
8275 }
8276 while (TEST_ADDR != LAST_ADDR)
8277
8278 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8279 until it is equal to ROUNDED_SIZE. */
8280
5f5c5e0f 8281 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
a3eb8a52
EB
8282
8283
8284 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8285 that SIZE is equal to ROUNDED_SIZE. */
8286
8287 if (size != rounded_size)
8288 {
8289 HOST_WIDE_INT rem = size - rounded_size;
8290
8291 if (rem > 256)
8292 {
8293 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8294
5f5c5e0f
EB
8295 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8296 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
a3eb8a52
EB
8297 }
8298 else
5f5c5e0f 8299 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
a3eb8a52
EB
8300 }
8301 }
8302
8303 /* Make sure nothing is scheduled before we are done. */
8304 emit_insn (gen_blockage ());
8305}
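
/* Editorial sketch (not part of the port): the probe offsets that the
   function above produces for a constant SIZE, measured downwards from the
   incoming stack pointer.  OFFSETS must have room for
   SIZE / PROBE_INTERVAL + 1 entries.  For FIRST = 0 and
   SIZE = 3 * PROBE_INTERVAL the probes land at 4096, 8192 and 12288.  */

static int
example_probe_offsets (long first, long size, long *offsets)
{
  const long probe_interval = 4096;   /* assumed PROBE_INTERVAL */
  int n = 0;
  /* One probe per full interval below FIRST...  */
  for (long i = probe_interval; i < size; i += probe_interval)
    offsets[n++] = first + i;
  /* ...plus a final probe at FIRST + SIZE.  */
  offsets[n++] = first + size;
  return n;
}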
8306
8307/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8308 absolute addresses. */
8309
8310const char *
8311aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8312{
8313 static int labelno = 0;
8314 char loop_lab[32];
8315 rtx xops[2];
8316
8317 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8318
8319 /* Loop. */
8320 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8321
cd1bef27 8322 HOST_WIDE_INT stack_clash_probe_interval
028d4092 8323 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 8324
a3eb8a52
EB
8325 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8326 xops[0] = reg1;
cd1bef27
JL
8327 HOST_WIDE_INT interval;
8328 if (flag_stack_clash_protection)
8329 interval = stack_clash_probe_interval;
8330 else
8331 interval = PROBE_INTERVAL;
8332
8333 gcc_assert (aarch64_uimm12_shift (interval));
8334 xops[1] = GEN_INT (interval);
8335
a3eb8a52
EB
8336 output_asm_insn ("sub\t%0, %0, %1", xops);
8337
cd1bef27
JL
8338 /* If doing stack clash protection then we probe up by the ABI specified
8339 amount. We do this because we're dropping full pages at a time in the
8340 loop. But if we're doing non-stack clash probing, probe at offset 0. */
8341 if (flag_stack_clash_protection)
8342 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8343 else
8344 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8345
8346 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8347 by this amount for each iteration. */
8348 output_asm_insn ("str\txzr, [%0, %1]", xops);
a3eb8a52
EB
8349
8350 /* Test if TEST_ADDR == LAST_ADDR. */
8351 xops[1] = reg2;
8352 output_asm_insn ("cmp\t%0, %1", xops);
8353
8354 /* Branch. */
8355 fputs ("\tb.ne\t", asm_out_file);
8356 assemble_name_raw (asm_out_file, loop_lab);
8357 fputc ('\n', asm_out_file);
8358
8359 return "";
8360}
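
/* Editorial sketch (not part of the port): assuming the two probe registers
   happen to be x9 and x10, the loop emitted above looks roughly like

	.LPSRL0:
	sub	x9, x9, #4096		// PROBE_INTERVAL (or the guard size,
					// typically 64 KiB, with
					// -fstack-clash-protection)
	str	xzr, [x9, #0]		// #1024 (STACK_CLASH_CALLER_GUARD)
					// with -fstack-clash-protection
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. one probe per page (or per guard-sized block) until the test address
   reaches the final address held in the second register.  */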
8361
eb471ba3
TC
8362/* Emit the probe loop for doing stack clash probes and stack adjustments for
8363 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8364 of GUARD_SIZE. Each probe is emitted at most MIN_PROBE_THRESHOLD bytes
8365 from the current BASE, so consecutive probes are never more than
8366 MIN_PROBE_THRESHOLD bytes apart. By the end of this function
8367 BASE = BASE - ADJUSTMENT. */
8368
8369const char *
8370aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8371 rtx min_probe_threshold, rtx guard_size)
8372{
8373 /* This function is not allowed to use any instruction generation function
8374 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8375 so instead emit the code you want using output_asm_insn. */
8376 gcc_assert (flag_stack_clash_protection);
8377 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8378 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8379
8380 /* The minimum required allocation before the residual requires probing. */
8381 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8382
8383 /* Clamp the value down to the nearest value that can be used with a cmp. */
8384 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8385 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8386
8387 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8388 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8389
8390 static int labelno = 0;
8391 char loop_start_lab[32];
8392 char loop_end_lab[32];
8393 rtx xops[2];
8394
8395 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8396 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8397
8398 /* Emit loop start label. */
8399 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8400
8401 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8402 xops[0] = adjustment;
8403 xops[1] = probe_offset_value_rtx;
8404 output_asm_insn ("cmp\t%0, %1", xops);
8405
8406 /* Branch to end if not enough adjustment to probe. */
8407 fputs ("\tb.lt\t", asm_out_file);
8408 assemble_name_raw (asm_out_file, loop_end_lab);
8409 fputc ('\n', asm_out_file);
8410
8411 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8412 xops[0] = base;
8413 xops[1] = probe_offset_value_rtx;
8414 output_asm_insn ("sub\t%0, %0, %1", xops);
8415
8416 /* Probe at BASE. */
8417 xops[1] = const0_rtx;
8418 output_asm_insn ("str\txzr, [%0, %1]", xops);
8419
8420 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8421 xops[0] = adjustment;
8422 xops[1] = probe_offset_value_rtx;
8423 output_asm_insn ("sub\t%0, %0, %1", xops);
8424
8425 /* Branch to start if still more bytes to allocate. */
8426 fputs ("\tb\t", asm_out_file);
8427 assemble_name_raw (asm_out_file, loop_start_lab);
8428 fputc ('\n', asm_out_file);
8429
8430 /* No probe needed; leave the loop. */
8431 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8432
8433 /* BASE = BASE - ADJUSTMENT. */
8434 xops[0] = base;
8435 xops[1] = adjustment;
8436 output_asm_insn ("sub\t%0, %0, %1", xops);
8437 return "";
8438}
8439
d6cb6d6a
WD
8440/* Determine whether a frame chain needs to be generated. */
8441static bool
8442aarch64_needs_frame_chain (void)
8443{
8444 /* Force a frame chain for EH returns so the return address is at FP+8. */
8445 if (frame_pointer_needed || crtl->calls_eh_return)
8446 return true;
8447
8448 /* A leaf function cannot have calls or write LR. */
8449 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8450
8451 /* Don't use a frame chain in leaf functions if leaf frame pointers
8452 are disabled. */
8453 if (flag_omit_leaf_frame_pointer && is_leaf)
8454 return false;
8455
8456 return aarch64_use_frame_pointer;
8457}
8458
43e9d192
IB
8459/* Mark the registers that need to be saved by the callee and calculate
8460 the size of the callee-saved registers area and frame record (both FP
33a2e348 8461 and LR may be omitted). */
43e9d192
IB
8462static void
8463aarch64_layout_frame (void)
8464{
c600df9a 8465 poly_int64 offset = 0;
4b0685d9 8466 int regno, last_fp_reg = INVALID_REGNUM;
c600df9a
RS
8467 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8468 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8469 bool frame_related_fp_reg_p = false;
ab43763e 8470 aarch64_frame &frame = cfun->machine->frame;
43e9d192 8471
ab43763e 8472 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7040939b 8473
8c6e3b23
TC
8474 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8475 the mid-end is doing. */
8476 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8477
97826595
MS
8478#define SLOT_NOT_REQUIRED (-2)
8479#define SLOT_REQUIRED (-1)
8480
ce09ab17
DL
8481 frame.wb_push_candidate1 = INVALID_REGNUM;
8482 frame.wb_push_candidate2 = INVALID_REGNUM;
c600df9a 8483 frame.spare_pred_reg = INVALID_REGNUM;
363ffa50 8484
43e9d192 8485 /* First mark all the registers that really need to be saved... */
c600df9a 8486 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
ab43763e 8487 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
43e9d192
IB
8488
8489 /* ... that includes the eh data registers (if needed)... */
8490 if (crtl->calls_eh_return)
8491 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
ab43763e 8492 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
43e9d192
IB
8493
8494 /* ... and any callee saved register that dataflow says is live. */
8495 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8496 if (df_regs_ever_live_p (regno)
dcdd0f05 8497 && !fixed_regs[regno]
1c923b60 8498 && (regno == R30_REGNUM
dcdd0f05 8499 || !crtl->abi->clobbers_full_reg_p (regno)))
ab43763e 8500 frame.reg_offset[regno] = SLOT_REQUIRED;
43e9d192
IB
8501
8502 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8503 if (df_regs_ever_live_p (regno)
dcdd0f05
RS
8504 && !fixed_regs[regno]
8505 && !crtl->abi->clobbers_full_reg_p (regno))
4b0685d9 8506 {
ab43763e 8507 frame.reg_offset[regno] = SLOT_REQUIRED;
4b0685d9 8508 last_fp_reg = regno;
c600df9a
RS
8509 if (aarch64_emit_cfi_for_reg_p (regno))
8510 frame_related_fp_reg_p = true;
4b0685d9 8511 }
43e9d192 8512
c600df9a
RS
8513 /* Big-endian SVE frames need a spare predicate register in order
8514 to save Z8-Z15. Decide which register they should use. Prefer
8515 an unused argument register if possible, so that we don't force P4
8516 to be saved unnecessarily. */
8517 if (frame_related_fp_reg_p
8518 && crtl->abi->id () == ARM_PCS_SVE
8519 && BYTES_BIG_ENDIAN)
8520 {
8521 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8522 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8523 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8524 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8525 break;
8526 gcc_assert (regno <= P7_REGNUM);
8527 frame.spare_pred_reg = regno;
8528 df_set_regs_ever_live (regno, true);
8529 }
8530
8531 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8532 if (df_regs_ever_live_p (regno)
8533 && !fixed_regs[regno]
8534 && !crtl->abi->clobbers_full_reg_p (regno))
8535 frame.reg_offset[regno] = SLOT_REQUIRED;
8536
d6430e3c
TC
8537 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8538 LR counts as an implicit probe which allows us to maintain the invariant
8539 described in the comment at expand_prologue. */
c600df9a
RS
8540 gcc_assert (crtl->is_leaf
8541 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8542
8543 /* Now assign stack slots for the registers. Start with the predicate
8544 registers, since predicate LDR and STR have a relatively small
8545 offset range. These saves happen below the hard frame pointer. */
8546 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8547 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8548 {
8549 frame.reg_offset[regno] = offset;
8550 offset += BYTES_PER_SVE_PRED;
8551 }
8552
c600df9a
RS
8553 if (maybe_ne (offset, 0))
8554 {
cb26919c
RS
8555 /* If we have any vector registers to save above the predicate registers,
8556 the offset of the vector register save slots need to be a multiple
8557 of the vector size. This lets us use the immediate forms of LDR/STR
8558 (or LD1/ST1 for big-endian).
8559
8560 A vector register is 8 times the size of a predicate register,
8561 and we need to save a maximum of 12 predicate registers, so the
8562 first vector register will be at either #1, MUL VL or #2, MUL VL.
8563
8564 If we don't have any vector registers to save, and we know how
8565 big the predicate save area is, we can just round it up to the
8566 next 16-byte boundary. */
8567 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8568 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8569 else
8570 {
8571 if (known_le (offset, vector_save_size))
8572 offset = vector_save_size;
8573 else if (known_le (offset, vector_save_size * 2))
8574 offset = vector_save_size * 2;
8575 else
8576 gcc_unreachable ();
8577 }
c600df9a
RS
8578 }
8579
8580 /* If we need to save any SVE vector registers, add them next. */
8581 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8582 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8583 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8584 {
8585 frame.reg_offset[regno] = offset;
8586 offset += vector_save_size;
8587 }
8588
8589 /* OFFSET is now the offset of the hard frame pointer from the bottom
8590 of the callee save area. */
8591 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8592 frame.below_hard_fp_saved_regs_size = offset;
ab43763e 8593 if (frame.emit_frame_chain)
43e9d192 8594 {
2e1cdae5 8595 /* FP and LR are placed in the linkage record. */
c600df9a 8596 frame.reg_offset[R29_REGNUM] = offset;
ce09ab17 8597 frame.wb_push_candidate1 = R29_REGNUM;
c600df9a 8598 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
ce09ab17 8599 frame.wb_push_candidate2 = R30_REGNUM;
c600df9a 8600 offset += 2 * UNITS_PER_WORD;
1f7bffd0 8601 }
43e9d192 8602
2e1cdae5 8603 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
c600df9a 8604 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 8605 {
ab43763e 8606 frame.reg_offset[regno] = offset;
ce09ab17
DL
8607 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8608 frame.wb_push_candidate1 = regno;
8609 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8610 frame.wb_push_candidate2 = regno;
43e9d192
IB
8611 offset += UNITS_PER_WORD;
8612 }
8613
c600df9a
RS
8614 poly_int64 max_int_offset = offset;
8615 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8616 bool has_align_gap = maybe_ne (offset, max_int_offset);
4b0685d9 8617
43e9d192 8618 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
c600df9a 8619 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 8620 {
4b0685d9
WD
8621 /* If there is an alignment gap between integer and fp callee-saves,
8622 allocate the last fp register to it if possible. */
a0d0b980
SE
8623 if (regno == last_fp_reg
8624 && has_align_gap
c600df9a
RS
8625 && known_eq (vector_save_size, 8)
8626 && multiple_p (offset, 16))
4b0685d9 8627 {
ab43763e 8628 frame.reg_offset[regno] = max_int_offset;
4b0685d9
WD
8629 break;
8630 }
8631
ab43763e 8632 frame.reg_offset[regno] = offset;
ce09ab17
DL
8633 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8634 frame.wb_push_candidate1 = regno;
8635 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8636 && frame.wb_push_candidate1 >= V0_REGNUM)
8637 frame.wb_push_candidate2 = regno;
c600df9a 8638 offset += vector_save_size;
43e9d192
IB
8639 }
8640
c600df9a 8641 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192 8642
ab43763e 8643 frame.saved_regs_size = offset;
1c960e02 8644
c600df9a 8645 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
71bfb77a 8646
c600df9a 8647 poly_int64 above_outgoing_args
6a70badb
RS
8648 = aligned_upper_bound (varargs_and_saved_regs_size
8649 + get_frame_size (),
8650 STACK_BOUNDARY / BITS_PER_UNIT);
1c960e02 8651
c600df9a
RS
8652 frame.hard_fp_offset
8653 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8654
6a70badb
RS
8655 /* Both these values are already aligned. */
8656 gcc_assert (multiple_p (crtl->outgoing_args_size,
8657 STACK_BOUNDARY / BITS_PER_UNIT));
c600df9a 8658 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
1c960e02 8659
ab43763e 8660 frame.locals_offset = frame.saved_varargs_size;
71bfb77a 8661
ab43763e
RS
8662 frame.initial_adjust = 0;
8663 frame.final_adjust = 0;
8664 frame.callee_adjust = 0;
c600df9a 8665 frame.sve_callee_adjust = 0;
ab43763e 8666 frame.callee_offset = 0;
71bfb77a 8667
ce09ab17
DL
8668 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8669 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8670
8671 /* Shadow call stack only deals with functions where the LR is pushed
8672 onto the stack and without specifying the "no_sanitize" attribute
8673 with the argument "shadow-call-stack". */
8674 frame.is_scs_enabled
8675 = (!crtl->calls_eh_return
8676 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8677 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8678
8679 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8680 restore x30, and we don't need to pop x30 again in the traditional
8681 way. Pop candidates record the registers that need to be popped
8682 eventually. */
8683 if (frame.is_scs_enabled)
8684 {
8685 if (frame.wb_pop_candidate2 == R30_REGNUM)
8686 frame.wb_pop_candidate2 = INVALID_REGNUM;
8687 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8688 frame.wb_pop_candidate1 = INVALID_REGNUM;
8689 }
8690
8691 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8692 256 to ensure that the offset meets the requirements of emit_move_insn.
8693 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8694 max_push_offset to 0, because no registers are popped at this time,
8695 so callee_adjust cannot be adjusted. */
71bfb77a 8696 HOST_WIDE_INT max_push_offset = 0;
ce09ab17 8697 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
71bfb77a 8698 max_push_offset = 512;
ce09ab17 8699 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
71bfb77a
WD
8700 max_push_offset = 256;
8701
9b17a646 8702 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
c600df9a 8703 HOST_WIDE_INT const_saved_regs_size;
ab43763e 8704 if (frame.frame_size.is_constant (&const_size)
6a70badb 8705 && const_size < max_push_offset
c600df9a 8706 && known_eq (frame.hard_fp_offset, const_size))
71bfb77a
WD
8707 {
8708 /* Simple, small frame with no outgoing arguments:
c600df9a 8709
71bfb77a
WD
8710 stp reg1, reg2, [sp, -frame_size]!
8711 stp reg3, reg4, [sp, 16] */
ab43763e 8712 frame.callee_adjust = const_size;
71bfb77a 8713 }
9b17a646 8714 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
c600df9a
RS
8715 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8716 && const_outgoing_args_size + const_saved_regs_size < 512
8717 /* We could handle this case even with outgoing args, provided
8718 that the number of args left us with valid offsets for all
8719 predicate and vector save slots. It's such a rare case that
8720 it hardly seems worth the effort though. */
8721 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
71bfb77a 8722 && !(cfun->calls_alloca
9b17a646
RS
8723 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8724 && const_fp_offset < max_push_offset))
71bfb77a
WD
8725 {
8726 /* Frame with small outgoing arguments:
c600df9a 8727
71bfb77a
WD
8728 sub sp, sp, frame_size
8729 stp reg1, reg2, [sp, outgoing_args_size]
8730 stp reg3, reg4, [sp, outgoing_args_size + 16] */
ab43763e 8731 frame.initial_adjust = frame.frame_size;
9b17a646 8732 frame.callee_offset = const_outgoing_args_size;
71bfb77a 8733 }
c600df9a
RS
8734 else if (saves_below_hard_fp_p
8735 && known_eq (frame.saved_regs_size,
8736 frame.below_hard_fp_saved_regs_size))
8737 {
8738 /* Frame in which all saves are SVE saves:
8739
8740 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8741 save SVE registers relative to SP
8742 sub sp, sp, outgoing_args_size */
8743 frame.initial_adjust = (frame.hard_fp_offset
8744 + frame.below_hard_fp_saved_regs_size);
8745 frame.final_adjust = crtl->outgoing_args_size;
8746 }
ab43763e 8747 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6a70badb 8748 && const_fp_offset < max_push_offset)
71bfb77a 8749 {
c600df9a
RS
8750 /* Frame with large outgoing arguments or SVE saves, but with
8751 a small local area:
8752
71bfb77a
WD
8753 stp reg1, reg2, [sp, -hard_fp_offset]!
8754 stp reg3, reg4, [sp, 16]
c600df9a
RS
8755 [sub sp, sp, below_hard_fp_saved_regs_size]
8756 [save SVE registers relative to SP]
71bfb77a 8757 sub sp, sp, outgoing_args_size */
ab43763e 8758 frame.callee_adjust = const_fp_offset;
c600df9a 8759 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 8760 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a 8761 }
71bfb77a
WD
8762 else
8763 {
c600df9a
RS
8764 /* Frame with large local area and outgoing arguments or SVE saves,
8765 using frame pointer:
8766
71bfb77a
WD
8767 sub sp, sp, hard_fp_offset
8768 stp x29, x30, [sp, 0]
8769 add x29, sp, 0
8770 stp reg3, reg4, [sp, 16]
c600df9a
RS
8771 [sub sp, sp, below_hard_fp_saved_regs_size]
8772 [save SVE registers relative to SP]
71bfb77a 8773 sub sp, sp, outgoing_args_size */
ab43763e 8774 frame.initial_adjust = frame.hard_fp_offset;
c600df9a 8775 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 8776 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a
WD
8777 }
8778
8e66b377
RS
8779 /* Make sure the individual adjustments add up to the full frame size. */
8780 gcc_assert (known_eq (frame.initial_adjust
8781 + frame.callee_adjust
c600df9a 8782 + frame.sve_callee_adjust
8e66b377
RS
8783 + frame.final_adjust, frame.frame_size));
8784
59a3d73d
RS
8785 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8786 {
8787 /* We've decided not to associate any register saves with the initial
8788 stack allocation. */
ce09ab17
DL
8789 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8790 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
59a3d73d
RS
8791 }
8792
ab43763e 8793 frame.laid_out = true;
43e9d192
IB
8794}
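
/* Editorial worked example (not part of the port): consider a non-leaf
   function that saves x19 and x20, uses 16 bytes of locals and has no
   outgoing arguments or SVE state.  With the frame chain emitted,
   reg_offset[x29] = 0, reg_offset[x30] = 8, reg_offset[x19] = 16 and
   reg_offset[x20] = 24, so saved_regs_size = 32.  Adding the locals and
   rounding to 16 bytes gives hard_fp_offset = frame_size = 48, which is
   below max_push_offset, so the "simple, small frame" case above applies:

	stp	x29, x30, [sp, -48]!
	stp	x19, x20, [sp, 16]

   with callee_adjust = 48 and initial_adjust = final_adjust = 0; the
   16 bytes of locals sit between the register saves and the incoming
   stack pointer.  */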
8795
04ddfe06
KT
8796/* Return true if the register REGNO is saved on entry to
8797 the current function. */
8798
43e9d192
IB
8799static bool
8800aarch64_register_saved_on_entry (int regno)
8801{
c600df9a 8802 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
43e9d192
IB
8803}
8804
04ddfe06
KT
8805/* Return the next register up from REGNO up to LIMIT for the callee
8806 to save. */
8807
64dedd72
JW
8808static unsigned
8809aarch64_next_callee_save (unsigned regno, unsigned limit)
8810{
8811 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8812 regno ++;
8813 return regno;
8814}
43e9d192 8815
04ddfe06
KT
8816/* Push the register number REGNO of mode MODE to the stack with write-back
8817 adjusting the stack by ADJUSTMENT. */
8818
c5e1f66e 8819static void
ef4bddc2 8820aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
c5e1f66e
JW
8821 HOST_WIDE_INT adjustment)
8822 {
8823 rtx base_rtx = stack_pointer_rtx;
8824 rtx insn, reg, mem;
8825
8826 reg = gen_rtx_REG (mode, regno);
8827 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8828 plus_constant (Pmode, base_rtx, -adjustment));
30079dde 8829 mem = gen_frame_mem (mode, mem);
c5e1f66e
JW
8830
8831 insn = emit_move_insn (mem, reg);
8832 RTX_FRAME_RELATED_P (insn) = 1;
8833}
8834
04ddfe06
KT
8835/* Generate and return an instruction to store the pair of registers
8836 REG and REG2 of mode MODE to location BASE with write-back adjusting
8837 the stack location BASE by ADJUSTMENT. */
8838
80c11907 8839static rtx
ef4bddc2 8840aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
80c11907
JW
8841 HOST_WIDE_INT adjustment)
8842{
8843 switch (mode)
8844 {
4e10a5a7 8845 case E_DImode:
80c11907
JW
8846 return gen_storewb_pairdi_di (base, base, reg, reg2,
8847 GEN_INT (-adjustment),
8848 GEN_INT (UNITS_PER_WORD - adjustment));
4e10a5a7 8849 case E_DFmode:
80c11907
JW
8850 return gen_storewb_pairdf_di (base, base, reg, reg2,
8851 GEN_INT (-adjustment),
8852 GEN_INT (UNITS_PER_WORD - adjustment));
a0d0b980
SE
8853 case E_TFmode:
8854 return gen_storewb_pairtf_di (base, base, reg, reg2,
8855 GEN_INT (-adjustment),
8856 GEN_INT (UNITS_PER_VREG - adjustment));
80c11907
JW
8857 default:
8858 gcc_unreachable ();
8859 }
8860}
8861
04ddfe06
KT
8862/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8863 stack pointer by ADJUSTMENT. */
8864
80c11907 8865static void
89ac681e 8866aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
80c11907 8867{
5d8a22a5 8868 rtx_insn *insn;
c600df9a 8869 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e 8870
71bfb77a 8871 if (regno2 == INVALID_REGNUM)
89ac681e
WD
8872 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8873
80c11907
JW
8874 rtx reg1 = gen_rtx_REG (mode, regno1);
8875 rtx reg2 = gen_rtx_REG (mode, regno2);
8876
8877 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8878 reg2, adjustment));
8879 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
80c11907
JW
8880 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8881 RTX_FRAME_RELATED_P (insn) = 1;
8882}
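
/* Editorial sketch (not part of the port): for DImode saves, pushing x19
   and x20 while dropping the stack by 32 bytes becomes a single pre-indexed
   store pair, roughly

	stp	x19, x20, [sp, #-32]!

   which writes x19 at the new SP and x20 at SP + 8, leaving the upper
   16 bytes of the adjustment for the remaining callee saves.  */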
8883
04ddfe06
KT
8884/* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
8885 adjusting it by ADJUSTMENT afterwards. */
8886
159313d9 8887static rtx
ef4bddc2 8888aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
159313d9
JW
8889 HOST_WIDE_INT adjustment)
8890{
8891 switch (mode)
8892 {
4e10a5a7 8893 case E_DImode:
159313d9 8894 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 8895 GEN_INT (UNITS_PER_WORD));
4e10a5a7 8896 case E_DFmode:
159313d9 8897 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 8898 GEN_INT (UNITS_PER_WORD));
a0d0b980
SE
8899 case E_TFmode:
8900 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8901 GEN_INT (UNITS_PER_VREG));
159313d9
JW
8902 default:
8903 gcc_unreachable ();
8904 }
8905}
8906
04ddfe06
KT
8907/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8908 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8909 into CFI_OPS. */
8910
89ac681e
WD
8911static void
8912aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8913 rtx *cfi_ops)
8914{
c600df9a 8915 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e
WD
8916 rtx reg1 = gen_rtx_REG (mode, regno1);
8917
8918 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8919
71bfb77a 8920 if (regno2 == INVALID_REGNUM)
89ac681e
WD
8921 {
8922 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8923 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
30079dde 8924 emit_move_insn (reg1, gen_frame_mem (mode, mem));
89ac681e
WD
8925 }
8926 else
8927 {
8928 rtx reg2 = gen_rtx_REG (mode, regno2);
8929 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8930 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8931 reg2, adjustment));
8932 }
8933}
8934
04ddfe06
KT
8935/* Generate and return a store pair instruction of mode MODE to store
8936 register REG1 to MEM1 and register REG2 to MEM2. */
8937
72df5c1f 8938static rtx
ef4bddc2 8939aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
72df5c1f
JW
8940 rtx reg2)
8941{
8942 switch (mode)
8943 {
4e10a5a7 8944 case E_DImode:
dfe1da23 8945 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
72df5c1f 8946
4e10a5a7 8947 case E_DFmode:
dfe1da23 8948 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
72df5c1f 8949
a0d0b980
SE
8950 case E_TFmode:
8951 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8952
7cda9e08
SD
8953 case E_V4SImode:
8954 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8955
54bbde55
SD
8956 case E_V16QImode:
8957 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8958
72df5c1f
JW
8959 default:
8960 gcc_unreachable ();
8961 }
8962}
8963
04ddfe06
KT
8964/* Generate and return a load pair instruction of mode MODE to load register
8965 REG1 from MEM1 and register REG2 from MEM2. */
8966
72df5c1f 8967static rtx
ef4bddc2 8968aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
72df5c1f
JW
8969 rtx mem2)
8970{
8971 switch (mode)
8972 {
4e10a5a7 8973 case E_DImode:
dfe1da23 8974 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
72df5c1f 8975
4e10a5a7 8976 case E_DFmode:
dfe1da23 8977 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
72df5c1f 8978
a0d0b980
SE
8979 case E_TFmode:
8980 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8981
7cda9e08
SD
8982 case E_V4SImode:
8983 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8984
72df5c1f
JW
8985 default:
8986 gcc_unreachable ();
8987 }
8988}
8989
db58fd89
JW
8990/* Return TRUE if return address signing should be enabled for the current
8991 function, otherwise return FALSE. */
8992
8993bool
8994aarch64_return_address_signing_enabled (void)
8995{
8996 /* This function should only be called after frame laid out. */
8997 gcc_assert (cfun->machine->frame.laid_out);
8998
2bc95be3
SN
8999 /* Turn return address signing off in any function that uses
9000 __builtin_eh_return. The address passed to __builtin_eh_return
9001 is not signed so either it has to be signed (with original sp)
9002 or the code path that uses it has to avoid authenticating it.
9003 Currently eh return introduces a return to anywhere gadget, no
9004 matter what we do here since it uses ret with user provided
9005 address. An ideal fix for that is to use indirect branch which
9006 can be protected with BTI j (to some extent). */
9007 if (crtl->calls_eh_return)
9008 return false;
9009
db58fd89 9010 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8fc16d72 9011 if its LR is pushed onto stack. */
db58fd89
JW
9012 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
9013 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
c600df9a 9014 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
db58fd89
JW
9015}
9016
30afdf34
SD
9017/* Return TRUE if Branch Target Identification Mechanism is enabled. */
9018bool
9019aarch64_bti_enabled (void)
9020{
9021 return (aarch64_enable_bti == 1);
9022}
9023
c600df9a
RS
9024/* The caller is going to use ST1D or LD1D to save or restore an SVE
9025 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
9026 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
9027
9028 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
9029 or LD1D address
9030
9031 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
9032 if the variable isn't already nonnull
9033
9034 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
9035 Handle this case using a temporary base register that is suitable for
9036 all offsets in that range. Use ANCHOR_REG as this base register if it
9037 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
9038
9039static inline void
9040aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
9041 rtx &anchor_reg, poly_int64 &offset,
9042 rtx &ptrue)
9043{
9044 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
9045 {
9046 /* This is the maximum valid offset of the anchor from the base.
9047 Lower values would be valid too. */
9048 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
9049 if (!anchor_reg)
9050 {
9051 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9052 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9053 gen_int_mode (anchor_offset, Pmode)));
9054 }
9055 base_rtx = anchor_reg;
9056 offset -= anchor_offset;
9057 }
9058 if (!ptrue)
9059 {
9060 int pred_reg = cfun->machine->frame.spare_pred_reg;
9061 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
9062 CONSTM1_RTX (VNx16BImode));
9063 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
9064 }
9065}
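
/* Editorial sketch (not part of the port): the rebasing arithmetic above,
   expressed in whole vector lengths.  ST1D/LD1D immediates reach -8..+7
   vector lengths from their base, so a slot 8..16 vector lengths away is
   addressed from an anchor at BASE + 16 * VL; an offset of 12 * VL, for
   example, becomes -4 * VL from the anchor.  */

static long
example_rebase_sve_offset (long offset_in_vls, long *anchor_in_vls)
{
  *anchor_in_vls = 0;		/* no anchor needed by default */
  if (offset_in_vls >= 8)
    {
      *anchor_in_vls = 16;	/* anchor register = BASE + 16 * VL */
      offset_in_vls -= 16;	/* now within -8..0 */
    }
  return offset_in_vls;
}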
9066
9067/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
9068 is saved at BASE + OFFSET. */
9069
9070static void
9071aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
9072 rtx base, poly_int64 offset)
9073{
9074 rtx mem = gen_frame_mem (GET_MODE (reg),
9075 plus_constant (Pmode, base, offset));
9076 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
9077}
9078
04ddfe06
KT
9079/* Emit code to save the callee-saved registers from register number START
9080 to LIMIT to the stack at the location starting at offset START_OFFSET,
c600df9a
RS
9081 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
9082 is true if the hard frame pointer has been set up. */
43e9d192 9083
43e9d192 9084static void
c600df9a
RS
9085aarch64_save_callee_saves (poly_int64 start_offset,
9086 unsigned start, unsigned limit, bool skip_wb,
9087 bool hard_fp_valid_p)
43e9d192 9088{
5d8a22a5 9089 rtx_insn *insn;
43e9d192
IB
9090 unsigned regno;
9091 unsigned regno2;
c600df9a 9092 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
43e9d192 9093
0ec74a1e 9094 for (regno = aarch64_next_callee_save (start, limit);
64dedd72
JW
9095 regno <= limit;
9096 regno = aarch64_next_callee_save (regno + 1, limit))
43e9d192 9097 {
ae13fce3 9098 rtx reg, mem;
6a70badb 9099 poly_int64 offset;
c600df9a 9100 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
64dedd72 9101
ae13fce3 9102 if (skip_wb
ce09ab17
DL
9103 && (regno == cfun->machine->frame.wb_push_candidate1
9104 || regno == cfun->machine->frame.wb_push_candidate2))
ae13fce3
JW
9105 continue;
9106
827ab47a 9107 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 9108 continue;
827ab47a 9109
c600df9a 9110 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3
JW
9111 reg = gen_rtx_REG (mode, regno);
9112 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
9113 rtx base_rtx = stack_pointer_rtx;
9114 poly_int64 sp_offset = offset;
64dedd72 9115
c600df9a
RS
9116 HOST_WIDE_INT const_offset;
9117 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9118 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9119 offset, ptrue);
9120 else if (GP_REGNUM_P (regno)
9121 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9122 {
9123 gcc_assert (known_eq (start_offset, 0));
9124 poly_int64 fp_offset
9125 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9126 if (hard_fp_valid_p)
9127 base_rtx = hard_frame_pointer_rtx;
9128 else
9129 {
9130 if (!anchor_reg)
9131 {
9132 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9133 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9134 gen_int_mode (fp_offset, Pmode)));
9135 }
9136 base_rtx = anchor_reg;
9137 }
9138 offset -= fp_offset;
9139 }
9140 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9141 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
64dedd72 9142
c600df9a
RS
9143 if (!aarch64_sve_mode_p (mode)
9144 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 9145 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
9146 && known_eq (GET_MODE_SIZE (mode),
9147 cfun->machine->frame.reg_offset[regno2]
9148 - cfun->machine->frame.reg_offset[regno]))
43e9d192 9149 {
0ec74a1e 9150 rtx reg2 = gen_rtx_REG (mode, regno2);
64dedd72
JW
9151 rtx mem2;
9152
c600df9a
RS
9153 offset += GET_MODE_SIZE (mode);
9154 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62
JW
9155 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9156 reg2));
0b4a9743 9157
64dedd72
JW
9158 /* The first part of a frame-related parallel insn is
9159 always assumed to be relevant to the frame
9160 calculations; subsequent parts, are only
9161 frame-related if explicitly marked. */
c600df9a
RS
9162 if (aarch64_emit_cfi_for_reg_p (regno2))
9163 {
9164 if (need_cfa_note_p)
9165 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9166 sp_offset + GET_MODE_SIZE (mode));
9167 else
9168 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9169 }
9170
64dedd72
JW
9171 regno = regno2;
9172 }
c600df9a
RS
9173 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9174 {
9175 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9176 need_cfa_note_p = true;
9177 }
9178 else if (aarch64_sve_mode_p (mode))
9179 insn = emit_insn (gen_rtx_SET (mem, reg));
64dedd72 9180 else
8ed2fc62
JW
9181 insn = emit_move_insn (mem, reg);
9182
c600df9a
RS
9183 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9184 if (frame_related_p && need_cfa_note_p)
9185 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8ed2fc62
JW
9186 }
9187}
9188
c600df9a
RS
9189/* Emit code to restore the callee registers from register number START
9190 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9191 skipping any write-back candidates if SKIP_WB is true. Write the
9192 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
04ddfe06 9193
8ed2fc62 9194static void
c600df9a 9195aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
dd991abb 9196 unsigned limit, bool skip_wb, rtx *cfi_ops)
8ed2fc62 9197{
8ed2fc62
JW
9198 unsigned regno;
9199 unsigned regno2;
6a70badb 9200 poly_int64 offset;
c600df9a 9201 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8ed2fc62
JW
9202
9203 for (regno = aarch64_next_callee_save (start, limit);
9204 regno <= limit;
9205 regno = aarch64_next_callee_save (regno + 1, limit))
9206 {
c600df9a 9207 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
827ab47a 9208 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 9209 continue;
827ab47a 9210
ae13fce3 9211 rtx reg, mem;
8ed2fc62 9212
ae13fce3 9213 if (skip_wb
ce09ab17
DL
9214 && (regno == cfun->machine->frame.wb_pop_candidate1
9215 || regno == cfun->machine->frame.wb_pop_candidate2))
ae13fce3
JW
9216 continue;
9217
c600df9a 9218 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3 9219 reg = gen_rtx_REG (mode, regno);
8ed2fc62 9220 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
9221 rtx base_rtx = stack_pointer_rtx;
9222 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9223 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9224 offset, ptrue);
30079dde 9225 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62 9226
c600df9a
RS
9227 if (!aarch64_sve_mode_p (mode)
9228 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 9229 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
9230 && known_eq (GET_MODE_SIZE (mode),
9231 cfun->machine->frame.reg_offset[regno2]
9232 - cfun->machine->frame.reg_offset[regno]))
64dedd72 9233 {
8ed2fc62
JW
9234 rtx reg2 = gen_rtx_REG (mode, regno2);
9235 rtx mem2;
9236
c600df9a 9237 offset += GET_MODE_SIZE (mode);
30079dde 9238 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
dd991abb 9239 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8ed2fc62 9240
dd991abb 9241 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8ed2fc62 9242 regno = regno2;
43e9d192 9243 }
c600df9a
RS
9244 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9245 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9246 else if (aarch64_sve_mode_p (mode))
9247 emit_insn (gen_rtx_SET (reg, mem));
8ed2fc62 9248 else
dd991abb 9249 emit_move_insn (reg, mem);
c600df9a
RS
9250 if (frame_related_p)
9251 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
43e9d192 9252 }
43e9d192
IB
9253}
9254
43cacb12
RS
9255/* Return true if OFFSET is a signed 4-bit value multiplied by the size
9256 of MODE. */
9257
9258static inline bool
9259offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9260{
9261 HOST_WIDE_INT multiple;
9262 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9263 && IN_RANGE (multiple, -8, 7));
9264}
9265
ba15b0fa
RS
9266/* Return true if OFFSET is a signed 6-bit value multiplied by the size
9267 of MODE. */
9268
9269static inline bool
9270offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9271{
9272 HOST_WIDE_INT multiple;
9273 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9274 && IN_RANGE (multiple, -32, 31));
9275}
9276
9277/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
43cacb12
RS
9278 of MODE. */
9279
9280static inline bool
9281offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9282{
9283 HOST_WIDE_INT multiple;
9284 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9285 && IN_RANGE (multiple, 0, 63));
9286}
9287
9288/* Return true if OFFSET is a signed 7-bit value multiplied by the size
9289 of MODE. */
9290
9291bool
9292aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9293{
9294 HOST_WIDE_INT multiple;
9295 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9296 && IN_RANGE (multiple, -64, 63));
9297}
9298
9299/* Return true if OFFSET is a signed 9-bit value. */
9300
3c5af608
MM
9301bool
9302aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9303 poly_int64 offset)
827ab47a 9304{
6a70badb
RS
9305 HOST_WIDE_INT const_offset;
9306 return (offset.is_constant (&const_offset)
9307 && IN_RANGE (const_offset, -256, 255));
827ab47a
KT
9308}
9309
43cacb12
RS
9310/* Return true if OFFSET is a signed 9-bit value multiplied by the size
9311 of MODE. */
9312
827ab47a 9313static inline bool
43cacb12 9314offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 9315{
6a70badb
RS
9316 HOST_WIDE_INT multiple;
9317 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 9318 && IN_RANGE (multiple, -256, 255));
827ab47a
KT
9319}
9320
43cacb12
RS
9321/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9322 of MODE. */
9323
9324static inline bool
9325offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 9326{
6a70badb
RS
9327 HOST_WIDE_INT multiple;
9328 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 9329 && IN_RANGE (multiple, 0, 4095));
827ab47a
KT
9330}
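
/* Editorial sketch (not part of the port): what the scaled-offset checks
   above accept for an 8-byte element such as DImode.  The 7-bit signed
   scaled form (used by LDP/STP) covers -512..504 in steps of 8, while the
   12-bit unsigned scaled form (LDR/STR with an unsigned offset) covers
   0..32760 in steps of 8.  */

static int
example_offset_7bit_signed_scaled_p (long offset, long mode_size)
{
  return (offset % mode_size == 0
	  && offset / mode_size >= -64
	  && offset / mode_size <= 63);
}

static int
example_offset_12bit_unsigned_scaled_p (long offset, long mode_size)
{
  return (offset % mode_size == 0
	  && offset / mode_size >= 0
	  && offset / mode_size <= 4095);
}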
9331
9332/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9333
9334static sbitmap
9335aarch64_get_separate_components (void)
9336{
827ab47a
KT
9337 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9338 bitmap_clear (components);
9339
9340 /* The registers we need saved to the frame. */
9341 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9342 if (aarch64_register_saved_on_entry (regno))
9343 {
c600df9a
RS
9344 /* Punt on saves and restores that use ST1D and LD1D. We could
9345 try to be smarter, but it would involve making sure that the
9346 spare predicate register itself is safe to use at the save
9347 and restore points. Also, when a frame pointer is being used,
9348 the slots are often out of reach of ST1D and LD1D anyway. */
9349 machine_mode mode = aarch64_reg_save_mode (regno);
9350 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9351 continue;
9352
6a70badb 9353 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
9354
9355 /* If the register is saved in the first SVE save slot, we use
9356 it as a stack probe for -fstack-clash-protection. */
9357 if (flag_stack_clash_protection
9358 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9359 && known_eq (offset, 0))
9360 continue;
9361
9362 /* Get the offset relative to the register we'll use. */
9363 if (frame_pointer_needed)
9364 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9365 else
9366 offset += crtl->outgoing_args_size;
9367
827ab47a
KT
9368 /* Check that we can access the stack slot of the register with one
9369 direct load with no adjustments needed. */
c600df9a
RS
9370 if (aarch64_sve_mode_p (mode)
9371 ? offset_9bit_signed_scaled_p (mode, offset)
9372 : offset_12bit_unsigned_scaled_p (mode, offset))
827ab47a
KT
9373 bitmap_set_bit (components, regno);
9374 }
9375
9376 /* Don't mess with the hard frame pointer. */
9377 if (frame_pointer_needed)
9378 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9379
c600df9a
RS
9380 /* If the spare predicate register used by big-endian SVE code
9381 is call-preserved, it must be saved in the main prologue
9382 before any saves that use it. */
9383 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9384 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9385
ce09ab17
DL
9386 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9387 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
0795f659 9388 /* If registers have been chosen to be stored/restored with
827ab47a
KT
9389 writeback, don't interfere with them to avoid having to output explicit
9390 stack adjustment instructions. */
9391 if (reg2 != INVALID_REGNUM)
9392 bitmap_clear_bit (components, reg2);
9393 if (reg1 != INVALID_REGNUM)
9394 bitmap_clear_bit (components, reg1);
9395
9396 bitmap_clear_bit (components, LR_REGNUM);
9397 bitmap_clear_bit (components, SP_REGNUM);
9398
9399 return components;
9400}
9401
9402/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9403
9404static sbitmap
9405aarch64_components_for_bb (basic_block bb)
9406{
9407 bitmap in = DF_LIVE_IN (bb);
9408 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9409 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9410
9411 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9412 bitmap_clear (components);
9413
ce9d2a37
RS
9414 /* Clobbered registers don't generate values in any meaningful sense,
9415 since nothing after the clobber can rely on their value. And we can't
9416 say that partially-clobbered registers are unconditionally killed,
9417 because whether they're killed or not depends on the mode of the
9418 value they're holding. Thus partially call-clobbered registers
9419 appear in neither the kill set nor the gen set.
9420
9421 Check manually for any calls that clobber more of a register than the
9422 current function can. */
9423 function_abi_aggregator callee_abis;
9424 rtx_insn *insn;
9425 FOR_BB_INSNS (bb, insn)
9426 if (CALL_P (insn))
9427 callee_abis.note_callee_abi (insn_callee_abi (insn));
9428 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9429
827ab47a
KT
9430 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9431 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
dcdd0f05
RS
9432 if (!fixed_regs[regno]
9433 && !crtl->abi->clobbers_full_reg_p (regno)
ce9d2a37
RS
9434 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9435 || bitmap_bit_p (in, regno)
9436 || bitmap_bit_p (gen, regno)
9437 || bitmap_bit_p (kill, regno)))
3f26f054 9438 {
3f26f054
WD
9439 bitmap_set_bit (components, regno);
9440
9441 /* If there is a callee-save at an adjacent offset, add it too
9442 to increase the use of LDP/STP. */
c600df9a
RS
9443 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9444 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
3f26f054
WD
9445
9446 if (regno2 <= LAST_SAVED_REGNUM)
9447 {
c600df9a
RS
9448 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9449 if (regno < regno2
9450 ? known_eq (offset + 8, offset2)
9451 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
3f26f054
WD
9452 bitmap_set_bit (components, regno2);
9453 }
9454 }
827ab47a
KT
9455
9456 return components;
9457}
9458
9459/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9460 Nothing to do for aarch64. */
9461
9462static void
9463aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9464{
9465}
9466
9467/* Return the next set bit in BMP from START onwards. Return the total number
9468 of bits in BMP if no set bit is found at or after START. */
9469
9470static unsigned int
9471aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9472{
9473 unsigned int nbits = SBITMAP_SIZE (bmp);
9474 if (start == nbits)
9475 return start;
9476
9477 gcc_assert (start < nbits);
9478 for (unsigned int i = start; i < nbits; i++)
9479 if (bitmap_bit_p (bmp, i))
9480 return i;
9481
9482 return nbits;
9483}
9484
9485/* Do the work for aarch64_emit_prologue_components and
9486 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9487 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9488 for these components or the epilogue sequence. That is, it determines
9489 whether we should emit stores or loads and what kind of CFA notes to attach
9490 to the insns. Otherwise the logic for the two sequences is very
9491 similar. */
9492
9493static void
9494aarch64_process_components (sbitmap components, bool prologue_p)
9495{
9496 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9497 ? HARD_FRAME_POINTER_REGNUM
9498 : STACK_POINTER_REGNUM);
9499
9500 unsigned last_regno = SBITMAP_SIZE (components);
9501 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9502 rtx_insn *insn = NULL;
9503
9504 while (regno != last_regno)
9505 {
c600df9a
RS
9506 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9507 machine_mode mode = aarch64_reg_save_mode (regno);
a0d0b980 9508
827ab47a 9509 rtx reg = gen_rtx_REG (mode, regno);
6a70badb 9510 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
9511 if (frame_pointer_needed)
9512 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9513 else
9514 offset += crtl->outgoing_args_size;
9515
827ab47a
KT
9516 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9517 rtx mem = gen_frame_mem (mode, addr);
9518
9519 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9520 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9521 /* No more registers to handle after REGNO.
9522 Emit a single save/restore and exit. */
9523 if (regno2 == last_regno)
9524 {
9525 insn = emit_insn (set);
c600df9a
RS
9526 if (frame_related_p)
9527 {
9528 RTX_FRAME_RELATED_P (insn) = 1;
9529 if (prologue_p)
9530 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9531 else
9532 add_reg_note (insn, REG_CFA_RESTORE, reg);
9533 }
827ab47a
KT
9534 break;
9535 }
9536
6a70badb 9537 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
827ab47a
KT
9538 /* The next register is not of the same class or its offset is not
9539 mergeable with the current one into a pair. */
c600df9a
RS
9540 if (aarch64_sve_mode_p (mode)
9541 || !satisfies_constraint_Ump (mem)
827ab47a 9542 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
dcdd0f05 9543 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6a70badb
RS
9544 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9545 GET_MODE_SIZE (mode)))
827ab47a
KT
9546 {
9547 insn = emit_insn (set);
c600df9a
RS
9548 if (frame_related_p)
9549 {
9550 RTX_FRAME_RELATED_P (insn) = 1;
9551 if (prologue_p)
9552 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9553 else
9554 add_reg_note (insn, REG_CFA_RESTORE, reg);
9555 }
827ab47a
KT
9556
9557 regno = regno2;
9558 continue;
9559 }
9560
c600df9a
RS
9561 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9562
827ab47a
KT
9563 /* REGNO2 can be saved/restored in a pair with REGNO. */
9564 rtx reg2 = gen_rtx_REG (mode, regno2);
c600df9a
RS
9565 if (frame_pointer_needed)
9566 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9567 else
9568 offset2 += crtl->outgoing_args_size;
827ab47a
KT
9569 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9570 rtx mem2 = gen_frame_mem (mode, addr2);
9571 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9572 : gen_rtx_SET (reg2, mem2);
9573
9574 if (prologue_p)
9575 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9576 else
9577 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9578
c600df9a 9579 if (frame_related_p || frame_related2_p)
827ab47a 9580 {
c600df9a
RS
9581 RTX_FRAME_RELATED_P (insn) = 1;
9582 if (prologue_p)
9583 {
9584 if (frame_related_p)
9585 add_reg_note (insn, REG_CFA_OFFSET, set);
9586 if (frame_related2_p)
9587 add_reg_note (insn, REG_CFA_OFFSET, set2);
9588 }
9589 else
9590 {
9591 if (frame_related_p)
9592 add_reg_note (insn, REG_CFA_RESTORE, reg);
9593 if (frame_related2_p)
9594 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9595 }
827ab47a
KT
9596 }
9597
9598 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9599 }
9600}
9601
9602/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9603
9604static void
9605aarch64_emit_prologue_components (sbitmap components)
9606{
9607 aarch64_process_components (components, true);
9608}
9609
9610/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9611
9612static void
9613aarch64_emit_epilogue_components (sbitmap components)
9614{
9615 aarch64_process_components (components, false);
9616}
9617
9618/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9619
9620static void
9621aarch64_set_handled_components (sbitmap components)
9622{
9623 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9624 if (bitmap_bit_p (components, regno))
9625 cfun->machine->reg_is_wrapped_separately[regno] = true;
9626}
9627
8c6e3b23
TC
9628/* On AArch64 we have an ABI defined safe buffer.  This constant is used to
9629   determine the probe offset for alloca.  */
9630
9631static HOST_WIDE_INT
9632aarch64_stack_clash_protection_alloca_probe_range (void)
9633{
9634 return STACK_CLASH_CALLER_GUARD;
9635}
9636
9637
cd1bef27
JL
9638/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9639 registers. If POLY_SIZE is not large enough to require a probe this function
9640   will only adjust the stack.  When allocating the stack space,
9641   FRAME_RELATED_P indicates whether the allocation is frame related.
9642   FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9643   arguments.  If we are, then we ensure that any allocation larger than the
9644   ABI-defined buffer needs a probe, so that the invariant of having a 1KB
9645   buffer is maintained.
9646
9647 We emit barriers after each stack adjustment to prevent optimizations from
9648 breaking the invariant that we never drop the stack more than a page. This
9649 invariant is needed to make it easier to correctly handle asynchronous
9650   events: e.g. if we were to allow the stack to be dropped by more than a page
9651   and only then emit the probes, a signal taken in between would leave the
9652   signal handler not knowing the state of the stack, unable to make any
9653   assumption about which pages have been probed.  */
9654
9655static void
9656aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9657 poly_int64 poly_size,
9658 bool frame_related_p,
9659 bool final_adjustment_p)
9660{
9661 HOST_WIDE_INT guard_size
028d4092 9662 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 9663 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
cd1bef27 9664 HOST_WIDE_INT min_probe_threshold
c600df9a
RS
9665 = (final_adjustment_p
9666 ? guard_used_by_caller
9667 : guard_size - guard_used_by_caller);
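  /* Illustrative numbers, assuming the default 64KB guard and the 1KB
     STACK_CLASH_CALLER_GUARD: a prologue allocation only needs a probe once
     it reaches 64KB - 1KB = 63KB, whereas the final (outgoing arguments)
     allocation needs one as soon as it reaches the 1KB buffer.  */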
9668 /* When doing the final adjustment for the outgoing arguments, take into
9669 account any unprobed space there is above the current SP. There are
9670 two cases:
9671
9672 - When saving SVE registers below the hard frame pointer, we force
9673 the lowest save to take place in the prologue before doing the final
9674 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9675 This acts as a probe at SP, so there is no unprobed space.
9676
9677 - When there are no SVE register saves, we use the store of the link
9678 register as a probe. We can't assume that LR was saved at position 0
9679 though, so treat any space below it as unprobed. */
9680 if (final_adjustment_p
9681 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9682 {
9683 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9684 if (known_ge (lr_offset, 0))
9685 min_probe_threshold -= lr_offset.to_constant ();
9686 else
9687 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9688 }
cd1bef27
JL
9689
9690 poly_int64 frame_size = cfun->machine->frame.frame_size;
9691
9692 /* We should always have a positive probe threshold. */
9693 gcc_assert (min_probe_threshold > 0);
9694
9695 if (flag_stack_clash_protection && !final_adjustment_p)
9696 {
9697 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
c600df9a 9698 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
cd1bef27
JL
9699 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9700
9701 if (known_eq (frame_size, 0))
9702 {
9703 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9704 }
c600df9a
RS
9705 else if (known_lt (initial_adjust + sve_callee_adjust,
9706 guard_size - guard_used_by_caller)
cd1bef27
JL
9707 && known_lt (final_adjust, guard_used_by_caller))
9708 {
9709 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9710 }
9711 }
9712
cd1bef27
JL
9713 /* If SIZE is not large enough to require probing, just adjust the stack and
9714 exit. */
eb471ba3 9715 if (known_lt (poly_size, min_probe_threshold)
cd1bef27
JL
9716 || !flag_stack_clash_protection)
9717 {
9718 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9719 return;
9720 }
9721
eb471ba3
TC
9722 HOST_WIDE_INT size;
9723 /* Handle the SVE non-constant case first. */
9724 if (!poly_size.is_constant (&size))
9725 {
9726 if (dump_file)
9727 {
9728 fprintf (dump_file, "Stack clash SVE prologue: ");
9729 print_dec (poly_size, dump_file);
9730 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9731 }
9732
9733       /* First calculate the number of bytes we're actually spilling.  */
9734 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9735 poly_size, temp1, temp2, false, true);
9736
9737 rtx_insn *insn = get_last_insn ();
9738
9739 if (frame_related_p)
9740 {
9741 /* This is done to provide unwinding information for the stack
9742	     adjustments we're about to do; however, to prevent the optimizers
143d3b15 9743 from removing the R11 move and leaving the CFA note (which would be
eb471ba3
TC
9744 very wrong) we tie the old and new stack pointer together.
9745 The tie will expand to nothing but the optimizers will not touch
9746 the instruction. */
143d3b15 9747 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
eb471ba3
TC
9748 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9749 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9750
9751 /* We want the CFA independent of the stack pointer for the
9752 duration of the loop. */
9753 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9754 RTX_FRAME_RELATED_P (insn) = 1;
9755 }
9756
9757 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9758 rtx guard_const = gen_int_mode (guard_size, Pmode);
9759
9760 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9761 stack_pointer_rtx, temp1,
9762 probe_const, guard_const));
9763
9764 /* Now reset the CFA register if needed. */
9765 if (frame_related_p)
9766 {
9767 add_reg_note (insn, REG_CFA_DEF_CFA,
9768 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9769 gen_int_mode (poly_size, Pmode)));
9770 RTX_FRAME_RELATED_P (insn) = 1;
9771 }
9772
9773 return;
9774 }
9775
cd1bef27
JL
9776 if (dump_file)
9777 fprintf (dump_file,
eb471ba3
TC
9778 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9779 " bytes, probing will be required.\n", size);
cd1bef27
JL
9780
9781  /* Round size down to a multiple of guard_size, and calculate the
9782 residual as the difference between the original size and the rounded
9783 size. */
9784 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9785 HOST_WIDE_INT residual = size - rounded_size;
9786
9787 /* We can handle a small number of allocations/probes inline. Otherwise
9788 punt to a loop. */
9789 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9790 {
9791 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9792 {
9793 aarch64_sub_sp (NULL, temp2, guard_size, true);
9794 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9795 guard_used_by_caller));
9796 emit_insn (gen_blockage ());
9797 }
9798 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9799 }
9800 else
9801 {
9802 /* Compute the ending address. */
9803 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9804 temp1, NULL, false, true);
9805 rtx_insn *insn = get_last_insn ();
9806
9807 /* For the initial allocation, we don't have a frame pointer
9808 set up, so we always need CFI notes. If we're doing the
9809 final allocation, then we may have a frame pointer, in which
9810 case it is the CFA, otherwise we need CFI notes.
9811
9812 We can determine which allocation we are doing by looking at
9813 the value of FRAME_RELATED_P since the final allocations are not
9814 frame related. */
9815 if (frame_related_p)
9816 {
9817 /* We want the CFA independent of the stack pointer for the
9818 duration of the loop. */
9819 add_reg_note (insn, REG_CFA_DEF_CFA,
9820 plus_constant (Pmode, temp1, rounded_size));
9821 RTX_FRAME_RELATED_P (insn) = 1;
9822 }
9823
9824 /* This allocates and probes the stack. Note that this re-uses some of
9825	 the existing Ada stack protection code.  However, we are guaranteed not
9826	 to enter the non-loop or residual branches of that code.
9827
9828 The non-loop part won't be entered because if our allocation amount
9829 doesn't require a loop, the case above would handle it.
9830
9831	 The residual branch won't be entered because TEMP1 is a multiple of
9832 the allocation size. The residual will always be 0. As such, the only
9833 part we are actually using from that code is the loop setup. The
9834 actual probing is done in aarch64_output_probe_stack_range. */
9835 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9836 stack_pointer_rtx, temp1));
9837
9838 /* Now reset the CFA register if needed. */
9839 if (frame_related_p)
9840 {
9841 add_reg_note (insn, REG_CFA_DEF_CFA,
9842 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9843 RTX_FRAME_RELATED_P (insn) = 1;
9844 }
9845
9846 emit_insn (gen_blockage ());
9847 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9848 }
9849
9850 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9851 be probed. This maintains the requirement that each page is probed at
9852 least once. For initial probing we probe only if the allocation is
9853 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9854 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9855     GUARD_SIZE.  This ensures that for any allocation that is large enough to
9856     trigger a probe here, we'll have at least one, and for any allocation that
9857     is too small for this code to emit anything, the page will already have
9858     been probed by the saving of FP/LR, either by this function or any callees.
9859     If we don't have any callees then we won't have more stack adjustments and
9860     so are still safe.  */
9861 if (residual)
9862 {
9863 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9864 /* If we're doing final adjustments, and we've done any full page
9865 allocations then any residual needs to be probed. */
9866 if (final_adjustment_p && rounded_size != 0)
9867 min_probe_threshold = 0;
9868 /* If doing a small final adjustment, we always probe at offset 0.
9869 This is done to avoid issues when LR is not at position 0 or when
9870 the final adjustment is smaller than the probing offset. */
9871 else if (final_adjustment_p && rounded_size == 0)
9872 residual_probe_offset = 0;
9873
9874 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9875 if (residual >= min_probe_threshold)
9876 {
9877 if (dump_file)
9878 fprintf (dump_file,
9879 "Stack clash AArch64 prologue residuals: "
9880 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9881 "\n", residual);
9882
9883 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9884 residual_probe_offset));
9885 emit_insn (gen_blockage ());
9886 }
9887 }
9888}
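/* A worked example of the constant-size path above, assuming the default
   64KB guard: a 130KB initial allocation is split into rounded_size = 128KB,
   which is allocated and probed one guard-size page at a time, plus a 2KB
   residual.  The residual itself is only probed when it is at least
   MIN_PROBE_THRESHOLD; otherwise the later FP/LR save is relied on as the
   probe, as described in the comment above.  */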
9889
a0d0b980
SE
9890/* Return 1 if the register is used by the epilogue. We need to say the
9891 return register is used, but only after epilogue generation is complete.
9892 Note that in the case of sibcalls, the values "used by the epilogue" are
9893 considered live at the start of the called function.
9894
9895 For SIMD functions we need to return 1 for FP registers that are saved and
9896 restored by a function but are not zero in call_used_regs. If we do not do
9897   this, optimizations may remove the restore of the register.  */
9898
9899int
9900aarch64_epilogue_uses (int regno)
9901{
9902 if (epilogue_completed)
9903 {
9904 if (regno == LR_REGNUM)
9905 return 1;
a0d0b980
SE
9906 }
9907 return 0;
9908}
9909
43e9d192
IB
9910/* AArch64 stack frames generated by this compiler look like:
9911
9912 +-------------------------------+
9913 | |
9914 | incoming stack arguments |
9915 | |
34834420
MS
9916 +-------------------------------+
9917 | | <-- incoming stack pointer (aligned)
43e9d192
IB
9918 | callee-allocated save area |
9919 | for register varargs |
9920 | |
34834420
MS
9921 +-------------------------------+
9922 | local variables | <-- frame_pointer_rtx
43e9d192
IB
9923 | |
9924 +-------------------------------+
cd1bef27 9925 | padding | \
454fdba9 9926 +-------------------------------+ |
454fdba9 9927 | callee-saved registers | | frame.saved_regs_size
454fdba9
RL
9928 +-------------------------------+ |
9929 | LR' | |
9930 +-------------------------------+ |
c600df9a
RS
9931 | FP' | |
9932 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9933 | SVE vector registers | | \
9934 +-------------------------------+ | | below_hard_fp_saved_regs_size
9935 | SVE predicate registers | / /
9936 +-------------------------------+
43e9d192
IB
9937 | dynamic allocation |
9938 +-------------------------------+
34834420
MS
9939 | padding |
9940 +-------------------------------+
9941 | outgoing stack arguments | <-- arg_pointer
9942 | |
9943 +-------------------------------+
9944 | | <-- stack_pointer_rtx (aligned)
43e9d192 9945
34834420
MS
9946 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9947 but leave frame_pointer_rtx and hard_frame_pointer_rtx
cd1bef27
JL
9948 unchanged.
9949
9950 By default for stack-clash we assume the guard is at least 64KB, but this
9951 value is configurable to either 4KB or 64KB. We also force the guard size to
9952 be the same as the probing interval and both values are kept in sync.
9953
9954 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9955 on the guard size) of stack space without probing.
9956
9957 When probing is needed, we emit a probe at the start of the prologue
9958 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9959
9960   We have to track how much space has been allocated; the only stores
9961 to the stack we track as implicit probes are the FP/LR stores.
9962
9963 For outgoing arguments we probe if the size is larger than 1KB, such that
143d3b15
TC
9964 the ABI specified buffer is maintained for the next callee.
9965
9966 The following registers are reserved during frame layout and should not be
9967 used for any other purpose:
9968
c600df9a
RS
9969 - r11: Used by stack clash protection when SVE is enabled, and also
9970 as an anchor register when saving and restoring registers
143d3b15
TC
9971 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9972 - r14 and r15: Used for speculation tracking.
9973 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9974 - r30(LR), r29(FP): Used by standard frame layout.
9975
9976 These registers must be avoided in frame layout related code unless the
9977 explicit intention is to interact with one of the features listed above. */
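/* As a concrete illustration of the reservations above: the r12(EP0) and
   r13(EP1) temporaries are the tmp0_rtx/tmp1_rtx scratch registers that
   aarch64_expand_prologue and aarch64_expand_epilogue below pass to
   aarch64_allocate_and_probe_stack_space and aarch64_add_offset, and r11 is
   STACK_CLASH_SVE_CFA_REGNUM, used as the stable CFA copy while probing an
   SVE allocation.  */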
43e9d192
IB
9978
9979/* Generate the prologue instructions for entry into a function.
9980 Establish the stack frame by decreasing the stack pointer with a
9981 properly calculated size and, if necessary, create a frame record
9982 filled with the values of LR and previous frame pointer. The
6991c977 9983 current FP is also set up if it is in use. */
43e9d192
IB
9984
9985void
9986aarch64_expand_prologue (void)
9987{
6a70badb
RS
9988 poly_int64 frame_size = cfun->machine->frame.frame_size;
9989 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 9990 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
9991 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9992 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
9993 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9994 poly_int64 below_hard_fp_saved_regs_size
9995 = cfun->machine->frame.below_hard_fp_saved_regs_size;
ce09ab17
DL
9996 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9997 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
204d2c03 9998 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
71bfb77a 9999 rtx_insn *insn;
43e9d192 10000
c600df9a
RS
10001 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
10002 {
10003 /* Fold the SVE allocation into the initial allocation.
10004 We don't do this in aarch64_layout_arg to avoid pessimizing
10005 the epilogue code. */
10006 initial_adjust += sve_callee_adjust;
10007 sve_callee_adjust = 0;
10008 }
10009
db58fd89
JW
10010 /* Sign return address for functions. */
10011 if (aarch64_return_address_signing_enabled ())
27169e45 10012 {
8fc16d72
ST
10013 switch (aarch64_ra_sign_key)
10014 {
10015 case AARCH64_KEY_A:
10016 insn = emit_insn (gen_paciasp ());
10017 break;
10018 case AARCH64_KEY_B:
10019 insn = emit_insn (gen_pacibsp ());
10020 break;
10021 default:
10022 gcc_unreachable ();
10023 }
27169e45
JW
10024 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10025 RTX_FRAME_RELATED_P (insn) = 1;
10026 }
db58fd89 10027
ce09ab17
DL
10028 /* Push return address to shadow call stack. */
10029 if (cfun->machine->frame.is_scs_enabled)
10030 emit_insn (gen_scs_push ());
10031
dd991abb 10032 if (flag_stack_usage_info)
6a70badb 10033 current_function_static_stack_size = constant_lower_bound (frame_size);
43e9d192 10034
a3eb8a52
EB
10035 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10036 {
10037 if (crtl->is_leaf && !cfun->calls_alloca)
10038 {
6a70badb
RS
10039 if (maybe_gt (frame_size, PROBE_INTERVAL)
10040 && maybe_gt (frame_size, get_stack_check_protect ()))
8c1dd970
JL
10041 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10042 (frame_size
10043 - get_stack_check_protect ()));
a3eb8a52 10044 }
6a70badb 10045 else if (maybe_gt (frame_size, 0))
8c1dd970 10046 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
a3eb8a52
EB
10047 }
10048
901e66e0
SD
10049 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10050 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 10051
cd1bef27
JL
10052 /* In theory we should never have both an initial adjustment
10053 and a callee save adjustment. Verify that is the case since the
10054 code below does not handle it for -fstack-clash-protection. */
10055 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
10056
10057 /* Will only probe if the initial adjustment is larger than the guard
10058 less the amount of the guard reserved for use by the caller's
10059 outgoing args. */
901e66e0 10060 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
cd1bef27 10061 true, false);
43e9d192 10062
71bfb77a
WD
10063 if (callee_adjust != 0)
10064 aarch64_push_regs (reg1, reg2, callee_adjust);
43e9d192 10065
c600df9a
RS
10066 /* The offset of the frame chain record (if any) from the current SP. */
10067 poly_int64 chain_offset = (initial_adjust + callee_adjust
10068 - cfun->machine->frame.hard_fp_offset);
10069 gcc_assert (known_ge (chain_offset, 0));
10070
10071 /* The offset of the bottom of the save area from the current SP. */
10072 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
10073
204d2c03 10074 if (emit_frame_chain)
43e9d192 10075 {
71bfb77a 10076 if (callee_adjust == 0)
43cacb12
RS
10077 {
10078 reg1 = R29_REGNUM;
10079 reg2 = R30_REGNUM;
c600df9a
RS
10080 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
10081 false, false);
43cacb12 10082 }
c600df9a
RS
10083 else
10084 gcc_assert (known_eq (chain_offset, 0));
f5470a77 10085 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
c600df9a 10086 stack_pointer_rtx, chain_offset,
901e66e0 10087 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
43cacb12
RS
10088 if (frame_pointer_needed && !frame_size.is_constant ())
10089 {
10090 /* Variable-sized frames need to describe the save slot
10091 address using DW_CFA_expression rather than DW_CFA_offset.
10092 This means that, without taking further action, the
10093 locations of the registers that we've already saved would
10094 remain based on the stack pointer even after we redefine
10095 the CFA based on the frame pointer. We therefore need new
10096 DW_CFA_expressions to re-express the save slots with addresses
10097 based on the frame pointer. */
10098 rtx_insn *insn = get_last_insn ();
10099 gcc_assert (RTX_FRAME_RELATED_P (insn));
10100
10101 /* Add an explicit CFA definition if this was previously
10102 implicit. */
10103 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
10104 {
10105 rtx src = plus_constant (Pmode, stack_pointer_rtx,
10106 callee_offset);
10107 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10108 gen_rtx_SET (hard_frame_pointer_rtx, src));
10109 }
10110
10111 /* Change the save slot expressions for the registers that
10112 we've already saved. */
c600df9a
RS
10113 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
10114 hard_frame_pointer_rtx, UNITS_PER_WORD);
10115 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10116 hard_frame_pointer_rtx, 0);
43cacb12 10117 }
71bfb77a 10118 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
43e9d192 10119 }
71bfb77a 10120
c600df9a
RS
10121 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10122 callee_adjust != 0 || emit_frame_chain,
10123 emit_frame_chain);
10124 if (maybe_ne (sve_callee_adjust, 0))
10125 {
10126 gcc_assert (!flag_stack_clash_protection
10127 || known_eq (initial_adjust, 0));
10128 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10129 sve_callee_adjust,
10130 !frame_pointer_needed, false);
10131 saved_regs_offset += sve_callee_adjust;
10132 }
10133 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10134 false, emit_frame_chain);
10135 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10136 callee_adjust != 0 || emit_frame_chain,
10137 emit_frame_chain);
cd1bef27
JL
10138
10139 /* We may need to probe the final adjustment if it is larger than the guard
10140     that is assumed by the callee.  */
901e66e0 10141 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
cd1bef27 10142 !frame_pointer_needed, true);
43e9d192
IB
10143}
10144
4f942779
RL
10145/* Return TRUE if we can use a simple_return insn.
10146
10147 This function checks whether the callee saved stack is empty, which
10148   means no restore actions are needed.  The pro_and_epilogue pass will use
10149   this to check whether the shrink-wrapping optimization is feasible.  */
10150
10151bool
10152aarch64_use_return_insn_p (void)
10153{
10154 if (!reload_completed)
10155 return false;
10156
10157 if (crtl->profile)
10158 return false;
10159
6a70badb 10160 return known_eq (cfun->machine->frame.frame_size, 0);
4f942779
RL
10161}
10162
71bfb77a
WD
10163/* Generate the epilogue instructions for returning from a function.
10164 This is almost exactly the reverse of the prolog sequence, except
10165 that we need to insert barriers to avoid scheduling loads that read
10166 from a deallocated stack, and we optimize the unwind records by
10167 emitting them all together if possible. */
43e9d192
IB
10168void
10169aarch64_expand_epilogue (bool for_sibcall)
10170{
6a70badb 10171 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 10172 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
10173 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10174 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
10175 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10176 poly_int64 below_hard_fp_saved_regs_size
10177 = cfun->machine->frame.below_hard_fp_saved_regs_size;
ce09ab17
DL
10178 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10179 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10180 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10181 ? R29_REGNUM : R30_REGNUM);
71bfb77a
WD
10182 rtx cfi_ops = NULL;
10183 rtx_insn *insn;
901e66e0
SD
10184 /* A stack clash protection prologue may not have left EP0_REGNUM or
10185 EP1_REGNUM in a usable state. The same is true for allocations
43cacb12 10186 with an SVE component, since we then need both temporary registers
cd1bef27
JL
10187 for each allocation. For stack clash we are in a usable state if
10188 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10189 HOST_WIDE_INT guard_size
028d4092 10190 = 1 << param_stack_clash_protection_guard_size;
cd1bef27
JL
10191 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10192
c600df9a
RS
10193 /* We can re-use the registers when:
10194
10195 (a) the deallocation amount is the same as the corresponding
10196 allocation amount (which is false if we combine the initial
10197 and SVE callee save allocations in the prologue); and
10198
10199 (b) the allocation amount doesn't need a probe (which is false
10200 if the amount is guard_size - guard_used_by_caller or greater).
10201
10202 In such situations the register should remain live with the correct
cd1bef27 10203 value. */
43cacb12 10204 bool can_inherit_p = (initial_adjust.is_constant ()
c600df9a 10205 && final_adjust.is_constant ()
cd1bef27 10206 && (!flag_stack_clash_protection
c600df9a
RS
10207 || (known_lt (initial_adjust,
10208 guard_size - guard_used_by_caller)
10209 && known_eq (sve_callee_adjust, 0))));
44c0e7b9 10210
71bfb77a 10211 /* We need to add memory barrier to prevent read from deallocated stack. */
6a70badb
RS
10212 bool need_barrier_p
10213 = maybe_ne (get_frame_size ()
10214 + cfun->machine->frame.saved_varargs_size, 0);
43e9d192 10215
71bfb77a 10216 /* Emit a barrier to prevent loads from a deallocated stack. */
6a70badb
RS
10217 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10218 || cfun->calls_alloca
8144a493 10219 || crtl->calls_eh_return)
43e9d192 10220 {
71bfb77a
WD
10221 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10222 need_barrier_p = false;
10223 }
7e8c2bd5 10224
71bfb77a
WD
10225 /* Restore the stack pointer from the frame pointer if it may not
10226 be the same as the stack pointer. */
901e66e0
SD
10227 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10228 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6a70badb
RS
10229 if (frame_pointer_needed
10230 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
f5470a77
RS
10231 /* If writeback is used when restoring callee-saves, the CFA
10232 is restored on the instruction doing the writeback. */
10233 aarch64_add_offset (Pmode, stack_pointer_rtx,
c600df9a
RS
10234 hard_frame_pointer_rtx,
10235 -callee_offset - below_hard_fp_saved_regs_size,
901e66e0 10236 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
71bfb77a 10237 else
cd1bef27
JL
10238 /* The case where we need to re-use the register here is very rare, so
10239 avoid the complicated condition and just always emit a move if the
10240 immediate doesn't fit. */
901e66e0 10241 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
43e9d192 10242
c600df9a
RS
10243 /* Restore the vector registers before the predicate registers,
10244 so that we can use P4 as a temporary for big-endian SVE frames. */
10245 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10246 callee_adjust != 0, &cfi_ops);
10247 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10248 false, &cfi_ops);
10249 if (maybe_ne (sve_callee_adjust, 0))
10250 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
ce09ab17
DL
10251
10252 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10253     restore x30, so we don't need to restore x30 again in the traditional
10254 way. */
c600df9a 10255 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
ce09ab17 10256 R0_REGNUM, last_gpr,
71bfb77a 10257 callee_adjust != 0, &cfi_ops);
43e9d192 10258
71bfb77a
WD
10259 if (need_barrier_p)
10260 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10261
10262 if (callee_adjust != 0)
10263 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10264
1ccbfffb
RS
10265 /* If we have no register restore information, the CFA must have been
10266 defined in terms of the stack pointer since the end of the prologue. */
10267 gcc_assert (cfi_ops || !frame_pointer_needed);
10268
10269 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
71bfb77a
WD
10270 {
10271 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
89ac681e 10272 insn = get_last_insn ();
71bfb77a
WD
10273 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10274 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
43e9d192 10275 RTX_FRAME_RELATED_P (insn) = 1;
71bfb77a 10276 cfi_ops = NULL;
43e9d192
IB
10277 }
10278
901e66e0
SD
10279  /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10280     restrict the emit_move optimization to leaf functions.  */
10281 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10282 (!can_inherit_p || !crtl->is_leaf
10283 || df_regs_ever_live_p (EP0_REGNUM)));
7e8c2bd5 10284
71bfb77a
WD
10285 if (cfi_ops)
10286 {
10287 /* Emit delayed restores and reset the CFA to be SP. */
10288 insn = get_last_insn ();
10289 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10290 REG_NOTES (insn) = cfi_ops;
10291 RTX_FRAME_RELATED_P (insn) = 1;
dd991abb
RH
10292 }
10293
ce09ab17
DL
10294 /* Pop return address from shadow call stack. */
10295 if (cfun->machine->frame.is_scs_enabled)
10296 {
10297 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10298 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10299
10300 insn = emit_insn (gen_scs_pop ());
10301 add_reg_note (insn, REG_CFA_RESTORE, reg);
10302 RTX_FRAME_RELATED_P (insn) = 1;
10303 }
10304
db58fd89
JW
10305 /* We prefer to emit the combined return/authenticate instruction RETAA,
10306     however there are two cases in which we must instead emit an explicit
10307 authentication instruction.
10308
10309 1) Sibcalls don't return in a normal way, so if we're about to call one
10310 we must authenticate.
10311
10312 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10313 generating code for !TARGET_ARMV8_3 we can't use it and must
10314 explicitly authenticate.
db58fd89
JW
10315 */
10316 if (aarch64_return_address_signing_enabled ()
14d31404 10317 && (for_sibcall || !TARGET_ARMV8_3))
27169e45 10318 {
8fc16d72
ST
10319 switch (aarch64_ra_sign_key)
10320 {
10321 case AARCH64_KEY_A:
10322 insn = emit_insn (gen_autiasp ());
10323 break;
10324 case AARCH64_KEY_B:
10325 insn = emit_insn (gen_autibsp ());
10326 break;
10327 default:
10328 gcc_unreachable ();
10329 }
27169e45
JW
10330 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10331 RTX_FRAME_RELATED_P (insn) = 1;
10332 }
db58fd89 10333
dd991abb 10334 /* Stack adjustment for exception handler. */
b5b9147d 10335 if (crtl->calls_eh_return && !for_sibcall)
dd991abb
RH
10336 {
10337 /* We need to unwind the stack by the offset computed by
10338 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10339 to be SP; letting the CFA move during this adjustment
10340 is just as correct as retaining the CFA from the body
10341 of the function. Therefore, do nothing special. */
10342 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
43e9d192
IB
10343 }
10344
10345 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10346 if (!for_sibcall)
10347 emit_jump_insn (ret_rtx);
10348}
10349
8144a493
WD
10350/* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10351 normally or return to a previous frame after unwinding.
1c960e02 10352
8144a493
WD
10353 An EH return uses a single shared return sequence. The epilogue is
10354 exactly like a normal epilogue except that it has an extra input
10355 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10356 that must be applied after the frame has been destroyed. An extra label
10357 is inserted before the epilogue which initializes this register to zero,
10358 and this is the entry point for a normal return.
43e9d192 10359
8144a493
WD
10360 An actual EH return updates the return address, initializes the stack
10361 adjustment and jumps directly into the epilogue (bypassing the zeroing
10362 of the adjustment). Since the return address is typically saved on the
10363 stack when a function makes a call, the saved LR must be updated outside
10364 the epilogue.
43e9d192 10365
8144a493
WD
10366 This poses problems as the store is generated well before the epilogue,
10367 so the offset of LR is not known yet. Also optimizations will remove the
10368 store as it appears dead, even after the epilogue is generated (as the
10369 base or offset for loading LR is different in many cases).
43e9d192 10370
8144a493
WD
10371 To avoid these problems this implementation forces the frame pointer
10372 in eh_return functions so that the location of LR is fixed and known early.
10373 It also marks the store volatile, so no optimization is permitted to
10374 remove the store. */
10375rtx
10376aarch64_eh_return_handler_rtx (void)
10377{
10378 rtx tmp = gen_frame_mem (Pmode,
10379 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
43e9d192 10380
8144a493
WD
10381 /* Mark the store volatile, so no optimization is permitted to remove it. */
10382 MEM_VOLATILE_P (tmp) = true;
10383 return tmp;
43e9d192
IB
10384}
10385
43e9d192
IB
10386/* Output code to add DELTA to the first argument, and then jump
10387 to FUNCTION. Used for C++ multiple inheritance. */
10388static void
10389aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10390 HOST_WIDE_INT delta,
10391 HOST_WIDE_INT vcall_offset,
10392 tree function)
10393{
10394 /* The this pointer is always in x0. Note that this differs from
10395     Arm where the this pointer may be bumped to r1 if r0 is required
10396 to return a pointer to an aggregate. On AArch64 a result value
10397 pointer will be in x8. */
10398 int this_regno = R0_REGNUM;
5d8a22a5
DM
10399 rtx this_rtx, temp0, temp1, addr, funexp;
10400 rtx_insn *insn;
6b5777c6 10401 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
43e9d192 10402
c904388d
SD
10403 if (aarch64_bti_enabled ())
10404 emit_insn (gen_bti_c());
10405
75f1d6fc
SN
10406 reload_completed = 1;
10407 emit_note (NOTE_INSN_PROLOGUE_END);
43e9d192 10408
f5470a77 10409 this_rtx = gen_rtx_REG (Pmode, this_regno);
901e66e0
SD
10410 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10411 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 10412
43e9d192 10413 if (vcall_offset == 0)
43cacb12 10414 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
43e9d192
IB
10415 else
10416 {
28514dda 10417 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
43e9d192 10418
75f1d6fc
SN
10419 addr = this_rtx;
10420 if (delta != 0)
10421 {
10422 if (delta >= -256 && delta < 256)
10423 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10424 plus_constant (Pmode, this_rtx, delta));
10425 else
43cacb12
RS
10426 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10427 temp1, temp0, false);
43e9d192
IB
10428 }
10429
28514dda
YZ
10430 if (Pmode == ptr_mode)
10431 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10432 else
10433 aarch64_emit_move (temp0,
10434 gen_rtx_ZERO_EXTEND (Pmode,
10435 gen_rtx_MEM (ptr_mode, addr)));
75f1d6fc 10436
28514dda 10437 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
75f1d6fc 10438 addr = plus_constant (Pmode, temp0, vcall_offset);
43e9d192
IB
10439 else
10440 {
f43657b4
JW
10441 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10442 Pmode);
75f1d6fc 10443 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
43e9d192
IB
10444 }
10445
28514dda
YZ
10446 if (Pmode == ptr_mode)
10447 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10448 else
10449 aarch64_emit_move (temp1,
10450 gen_rtx_SIGN_EXTEND (Pmode,
10451 gen_rtx_MEM (ptr_mode, addr)));
10452
75f1d6fc 10453 emit_insn (gen_add2_insn (this_rtx, temp1));
43e9d192
IB
10454 }
10455
75f1d6fc
SN
10456 /* Generate a tail call to the target function. */
10457 if (!TREE_USED (function))
10458 {
10459 assemble_external (function);
10460 TREE_USED (function) = 1;
10461 }
10462 funexp = XEXP (DECL_RTL (function), 0);
10463 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
08cc4d92
RS
10464 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10465 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
75f1d6fc
SN
10466 SIBLING_CALL_P (insn) = 1;
10467
10468 insn = get_insns ();
10469 shorten_branches (insn);
6b5777c6
MF
10470
10471 assemble_start_function (thunk, fnname);
75f1d6fc
SN
10472 final_start_function (insn, file, 1);
10473 final (insn, file, 1);
43e9d192 10474 final_end_function ();
6b5777c6 10475 assemble_end_function (thunk, fnname);
75f1d6fc
SN
10476
10477 /* Stop pretending to be a post-reload pass. */
10478 reload_completed = 0;
43e9d192
IB
10479}
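/* A minimal C++ example (illustrative only) of where such a thunk arises:

     struct B1 { virtual void f (); int x; };
     struct B2 { virtual void f (); int y; };
     struct D : B1, B2 { void f () override; };

   Calling f through a B2* that points to a D object needs a thunk that
   subtracts the B2 base offset from "this" before jumping to D::f, which is
   exactly the DELTA adjustment emitted above.  */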
10480
43e9d192
IB
10481static bool
10482aarch64_tls_referenced_p (rtx x)
10483{
10484 if (!TARGET_HAVE_TLS)
10485 return false;
e7de8563
RS
10486 subrtx_iterator::array_type array;
10487 FOR_EACH_SUBRTX (iter, array, x, ALL)
10488 {
10489 const_rtx x = *iter;
3793ecc1 10490 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
e7de8563
RS
10491 return true;
10492 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10493 TLS offsets, not real symbol references. */
10494 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10495 iter.skip_subrtxes ();
10496 }
10497 return false;
43e9d192
IB
10498}
10499
10500
43e9d192 10501static bool
ef4bddc2 10502aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
43e9d192 10503{
43e9d192
IB
10504 if (GET_CODE (x) == HIGH)
10505 return true;
10506
43cacb12
RS
10507 /* There's no way to calculate VL-based values using relocations. */
10508 subrtx_iterator::array_type array;
10509 FOR_EACH_SUBRTX (iter, array, x, ALL)
10510 if (GET_CODE (*iter) == CONST_POLY_INT)
10511 return true;
10512
74b27d8e
RS
10513 poly_int64 offset;
10514 rtx base = strip_offset_and_salt (x, &offset);
3793ecc1 10515 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
28514dda 10516 {
74b27d8e
RS
10517 /* We checked for POLY_INT_CST offsets above. */
10518 if (aarch64_classify_symbol (base, offset.to_constant ())
28514dda
YZ
10519 != SYMBOL_FORCE_TO_MEM)
10520 return true;
10521 else
10522 /* Avoid generating a 64-bit relocation in ILP32; leave
10523 to aarch64_expand_mov_immediate to handle it properly. */
10524 return mode != ptr_mode;
10525 }
43e9d192
IB
10526
10527 return aarch64_tls_referenced_p (x);
10528}
10529
e79136e4
WD
10530/* Implement TARGET_CASE_VALUES_THRESHOLD.
10531 The expansion for a table switch is quite expensive due to the number
10532   of instructions, the table lookup and the hard-to-predict indirect jump.
10533   When optimizing for speed with -O3 enabled, use the per-core tuning if
9c751b88
WD
10534 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10535 performance. When optimizing for size, use 8 for smallest codesize. */
50487d79
EM
10536
10537static unsigned int
10538aarch64_case_values_threshold (void)
10539{
10540 /* Use the specified limit for the number of cases before using jump
10541 tables at higher optimization levels. */
10542 if (optimize > 2
ae54c1b0
WD
10543 && aarch64_tune_params.max_case_values != 0)
10544 return aarch64_tune_params.max_case_values;
50487d79 10545 else
9c751b88 10546 return optimize_size ? 8 : 11;
50487d79
EM
10547}
10548
43e9d192
IB
10549/* Return true if register REGNO is a valid index register.
10550 STRICT_P is true if REG_OK_STRICT is in effect. */
10551
10552bool
10553aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10554{
10555 if (!HARD_REGISTER_NUM_P (regno))
10556 {
10557 if (!strict_p)
10558 return true;
10559
10560 if (!reg_renumber)
10561 return false;
10562
10563 regno = reg_renumber[regno];
10564 }
10565 return GP_REGNUM_P (regno);
10566}
10567
10568/* Return true if register REGNO is a valid base register.
10569 STRICT_P is true if REG_OK_STRICT is in effect. */
10570
10571bool
10572aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10573{
10574 if (!HARD_REGISTER_NUM_P (regno))
10575 {
10576 if (!strict_p)
10577 return true;
10578
10579 if (!reg_renumber)
10580 return false;
10581
10582 regno = reg_renumber[regno];
10583 }
10584
10585 /* The fake registers will be eliminated to either the stack or
10586 hard frame pointer, both of which are usually valid base registers.
10587 Reload deals with the cases where the eliminated form isn't valid. */
10588 return (GP_REGNUM_P (regno)
10589 || regno == SP_REGNUM
10590 || regno == FRAME_POINTER_REGNUM
10591 || regno == ARG_POINTER_REGNUM);
10592}
10593
10594/* Return true if X is a valid base register.
10595 STRICT_P is true if REG_OK_STRICT is in effect. */
10596
10597static bool
10598aarch64_base_register_rtx_p (rtx x, bool strict_p)
10599{
76160199 10600 if (!strict_p
3793ecc1 10601 && SUBREG_P (x)
76160199 10602 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
43e9d192
IB
10603 x = SUBREG_REG (x);
10604
10605 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10606}
10607
10608/* Return true if address offset is a valid index. If it is, fill in INFO
10609 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10610
10611static bool
10612aarch64_classify_index (struct aarch64_address_info *info, rtx x,
ef4bddc2 10613 machine_mode mode, bool strict_p)
43e9d192
IB
10614{
10615 enum aarch64_address_type type;
10616 rtx index;
10617 int shift;
10618
10619 /* (reg:P) */
3793ecc1 10620 if ((REG_P (x) || SUBREG_P (x))
43e9d192
IB
10621 && GET_MODE (x) == Pmode)
10622 {
10623 type = ADDRESS_REG_REG;
10624 index = x;
10625 shift = 0;
10626 }
10627 /* (sign_extend:DI (reg:SI)) */
10628 else if ((GET_CODE (x) == SIGN_EXTEND
10629 || GET_CODE (x) == ZERO_EXTEND)
10630 && GET_MODE (x) == DImode
10631 && GET_MODE (XEXP (x, 0)) == SImode)
10632 {
10633 type = (GET_CODE (x) == SIGN_EXTEND)
10634 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10635 index = XEXP (x, 0);
10636 shift = 0;
10637 }
10638 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10639 else if (GET_CODE (x) == MULT
10640 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10641 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10642 && GET_MODE (XEXP (x, 0)) == DImode
10643 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10644 && CONST_INT_P (XEXP (x, 1)))
10645 {
10646 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10647 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10648 index = XEXP (XEXP (x, 0), 0);
10649 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10650 }
10651 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10652 else if (GET_CODE (x) == ASHIFT
10653 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10654 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10655 && GET_MODE (XEXP (x, 0)) == DImode
10656 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10657 && CONST_INT_P (XEXP (x, 1)))
10658 {
10659 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10660 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10661 index = XEXP (XEXP (x, 0), 0);
10662 shift = INTVAL (XEXP (x, 1));
10663 }
43e9d192
IB
10664 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10665 (const_int 0xffffffff<<shift)) */
10666 else if (GET_CODE (x) == AND
10667 && GET_MODE (x) == DImode
10668 && GET_CODE (XEXP (x, 0)) == MULT
10669 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10670 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10671 && CONST_INT_P (XEXP (x, 1)))
10672 {
10673 type = ADDRESS_REG_UXTW;
10674 index = XEXP (XEXP (x, 0), 0);
10675 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10676 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10677 shift = -1;
10678 }
43e9d192
IB
10679 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10680 (const_int 0xffffffff<<shift)) */
10681 else if (GET_CODE (x) == AND
10682 && GET_MODE (x) == DImode
10683 && GET_CODE (XEXP (x, 0)) == ASHIFT
10684 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10685 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10686 && CONST_INT_P (XEXP (x, 1)))
10687 {
10688 type = ADDRESS_REG_UXTW;
10689 index = XEXP (XEXP (x, 0), 0);
10690 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10691 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10692 shift = -1;
10693 }
10694 /* (mult:P (reg:P) (const_int scale)) */
10695 else if (GET_CODE (x) == MULT
10696 && GET_MODE (x) == Pmode
10697 && GET_MODE (XEXP (x, 0)) == Pmode
10698 && CONST_INT_P (XEXP (x, 1)))
10699 {
10700 type = ADDRESS_REG_REG;
10701 index = XEXP (x, 0);
10702 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10703 }
10704 /* (ashift:P (reg:P) (const_int shift)) */
10705 else if (GET_CODE (x) == ASHIFT
10706 && GET_MODE (x) == Pmode
10707 && GET_MODE (XEXP (x, 0)) == Pmode
10708 && CONST_INT_P (XEXP (x, 1)))
10709 {
10710 type = ADDRESS_REG_REG;
10711 index = XEXP (x, 0);
10712 shift = INTVAL (XEXP (x, 1));
10713 }
10714 else
10715 return false;
10716
76160199 10717 if (!strict_p
3793ecc1 10718 && SUBREG_P (index)
76160199 10719 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
43e9d192
IB
10720 index = SUBREG_REG (index);
10721
43cacb12
RS
10722 if (aarch64_sve_data_mode_p (mode))
10723 {
10724 if (type != ADDRESS_REG_REG
10725 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10726 return false;
10727 }
10728 else
10729 {
10730 if (shift != 0
10731 && !(IN_RANGE (shift, 1, 3)
10732 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10733 return false;
10734 }
10735
10736 if (REG_P (index)
43e9d192
IB
10737 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10738 {
10739 info->type = type;
10740 info->offset = index;
10741 info->shift = shift;
10742 return true;
10743 }
10744
10745 return false;
10746}
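/* For example, an index expression of the form
     (mult:DI (sign_extend:DI (reg:SI)) (const_int 4))
   is classified above as ADDRESS_REG_SXTW with shift = 2, which, once
   aarch64_classify_address pairs it with a base register, corresponds to an
   address such as [x0, w1, sxtw #2].  */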
10747
abc52318
KT
10748/* Return true if MODE is one of the modes for which we
10749 support LDP/STP operations. */
10750
10751static bool
10752aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10753{
10754 return mode == SImode || mode == DImode
10755 || mode == SFmode || mode == DFmode
0dc8e1e7 10756 || mode == SDmode || mode == DDmode
abc52318 10757 || (aarch64_vector_mode_supported_p (mode)
9f5361c8
KT
10758 && (known_eq (GET_MODE_SIZE (mode), 8)
10759 || (known_eq (GET_MODE_SIZE (mode), 16)
10760 && (aarch64_tune_params.extra_tuning_flags
10761 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
abc52318
KT
10762}
10763
9e0218fc
RH
10764/* Return true if REGNO is a virtual pointer register, or an eliminable
10765 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10766 include stack_pointer or hard_frame_pointer. */
10767static bool
10768virt_or_elim_regno_p (unsigned regno)
10769{
10770 return ((regno >= FIRST_VIRTUAL_REGISTER
10771 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10772 || regno == FRAME_POINTER_REGNUM
10773 || regno == ARG_POINTER_REGNUM);
10774}
10775
a97d8b98
RS
10776/* Return true if X is a valid address of type TYPE for machine mode MODE.
10777 If it is, fill in INFO appropriately. STRICT_P is true if
10778 REG_OK_STRICT is in effect. */
43e9d192 10779
a98824ac 10780bool
43e9d192 10781aarch64_classify_address (struct aarch64_address_info *info,
a97d8b98 10782 rtx x, machine_mode mode, bool strict_p,
a98824ac 10783 aarch64_addr_query_type type)
43e9d192
IB
10784{
10785 enum rtx_code code = GET_CODE (x);
10786 rtx op0, op1;
dc640181
RS
10787 poly_int64 offset;
10788
6a70badb 10789 HOST_WIDE_INT const_size;
2d8c6dc1 10790
550a3380
RS
10791 /* Whether a vector mode is partial doesn't affect address legitimacy.
10792 Partial vectors like VNx8QImode allow the same indexed addressing
10793 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10794 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10795 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10796 vec_flags &= ~VEC_PARTIAL;
10797
80d43579 10798 /* On BE, we use load/store pair for all large int mode load/stores.
0dc8e1e7 10799 TI/TF/TDmode may also use a load/store pair. */
43cacb12 10800 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
a97d8b98 10801 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
a25831ac 10802 || type == ADDR_QUERY_LDP_STP_N
80d43579
WD
10803 || mode == TImode
10804 || mode == TFmode
0dc8e1e7 10805 || mode == TDmode
721c0fb3
RS
10806 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10807 && advsimd_struct_p));
a25831ac
AV
10808 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10809 corresponds to the actual size of the memory being loaded/stored and the
10810 mode of the corresponding addressing mode is half of that. */
83d7e720
RS
10811 if (type == ADDR_QUERY_LDP_STP_N)
10812 {
10813 if (known_eq (GET_MODE_SIZE (mode), 16))
10814 mode = DFmode;
10815 else if (known_eq (GET_MODE_SIZE (mode), 8))
10816 mode = SFmode;
10817 else
10818 return false;
10819 }
a25831ac 10820
6a70badb 10821 bool allow_reg_index_p = (!load_store_pair_p
512b3835
KT
10822 && ((vec_flags == 0
10823 && known_lt (GET_MODE_SIZE (mode), 16))
43cacb12 10824 || vec_flags == VEC_ADVSIMD
fa9863e7 10825 || vec_flags & VEC_SVE_DATA));
43cacb12 10826
512b3835
KT
10827 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10828 The latter is not valid for SVE predicates, and that's rejected through
10829 allow_reg_index_p above. */
43cacb12
RS
10830 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10831 && (code != REG && code != PLUS))
10832 return false;
2d8c6dc1
AH
10833
10834 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10835 REG addressing. */
43cacb12 10836 if (advsimd_struct_p
721c0fb3 10837 && TARGET_SIMD
43cacb12 10838 && !BYTES_BIG_ENDIAN
43e9d192
IB
10839 && (code != POST_INC && code != REG))
10840 return false;
10841
43cacb12
RS
10842 gcc_checking_assert (GET_MODE (x) == VOIDmode
10843 || SCALAR_INT_MODE_P (GET_MODE (x)));
10844
43e9d192
IB
10845 switch (code)
10846 {
10847 case REG:
10848 case SUBREG:
10849 info->type = ADDRESS_REG_IMM;
10850 info->base = x;
10851 info->offset = const0_rtx;
dc640181 10852 info->const_offset = 0;
43e9d192
IB
10853 return aarch64_base_register_rtx_p (x, strict_p);
10854
10855 case PLUS:
10856 op0 = XEXP (x, 0);
10857 op1 = XEXP (x, 1);
15c0c5c9
JW
10858
10859 if (! strict_p
4aa81c2e 10860 && REG_P (op0)
9e0218fc 10861 && virt_or_elim_regno_p (REGNO (op0))
dc640181 10862 && poly_int_rtx_p (op1, &offset))
15c0c5c9
JW
10863 {
10864 info->type = ADDRESS_REG_IMM;
10865 info->base = op0;
10866 info->offset = op1;
dc640181 10867 info->const_offset = offset;
15c0c5c9
JW
10868
10869 return true;
10870 }
10871
6a70badb 10872 if (maybe_ne (GET_MODE_SIZE (mode), 0)
dc640181
RS
10873 && aarch64_base_register_rtx_p (op0, strict_p)
10874 && poly_int_rtx_p (op1, &offset))
43e9d192 10875 {
43e9d192
IB
10876 info->type = ADDRESS_REG_IMM;
10877 info->base = op0;
10878 info->offset = op1;
dc640181 10879 info->const_offset = offset;
43e9d192 10880
0dc8e1e7 10881 /* TImode, TFmode and TDmode values are allowed in both pairs of X
43e9d192
IB
10882 registers and individual Q registers. The available
10883 address modes are:
10884 X,X: 7-bit signed scaled offset
10885 Q: 9-bit signed offset
10886 We conservatively require an offset representable in either mode.
8ed49fab
KT
10887 When performing the check for pairs of X registers i.e. LDP/STP
10888 pass down DImode since that is the natural size of the LDP/STP
10889 instruction memory accesses. */
0dc8e1e7 10890 if (mode == TImode || mode == TFmode || mode == TDmode)
8ed49fab 10891 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3c5af608 10892 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8734dfac 10893 || offset_12bit_unsigned_scaled_p (mode, offset)));
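	    /* For instance (illustrative numbers): a TImode offset of 256 is
	       accepted because it is a multiple of 8 within the 7-bit signed
	       scaled LDP range and, although it exceeds the 9-bit unscaled
	       range, it is a multiple of 16 within the 12-bit unsigned scaled
	       LDR/STR range; an offset of 260 is rejected outright since it
	       is not 8-byte aligned.  */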
43e9d192 10894
fdcddba8
PW
10895 if (mode == V8DImode)
10896 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10897 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10898
2d8c6dc1 10899 /* A 7bit offset check because OImode will emit a ldp/stp
721c0fb3 10900 instruction (only !TARGET_SIMD or big endian will get here).
2d8c6dc1
AH
10901 For ldp/stp instructions, the offset is scaled for the size of a
10902 single element of the pair. */
66f206b8
JW
10903 if (aarch64_advsimd_partial_struct_mode_p (mode)
10904 && known_eq (GET_MODE_SIZE (mode), 16))
10905 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10906 if (aarch64_advsimd_full_struct_mode_p (mode)
10907 && known_eq (GET_MODE_SIZE (mode), 32))
2d8c6dc1
AH
10908 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10909
10910 /* Three 9/12 bit offsets checks because CImode will emit three
721c0fb3
RS
10911 ldr/str instructions (only !TARGET_SIMD or big endian will
10912 get here). */
66f206b8
JW
10913 if (aarch64_advsimd_partial_struct_mode_p (mode)
10914 && known_eq (GET_MODE_SIZE (mode), 24))
10915 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10916 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10917 offset + 16)
10918 || offset_12bit_unsigned_scaled_p (DImode,
10919 offset + 16)));
10920 if (aarch64_advsimd_full_struct_mode_p (mode)
10921 && known_eq (GET_MODE_SIZE (mode), 48))
2d8c6dc1 10922 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
66f206b8 10923 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
3c5af608 10924 offset + 32)
66f206b8 10925 || offset_12bit_unsigned_scaled_p (TImode,
2d8c6dc1
AH
10926 offset + 32)));
10927
10928 /* Two 7bit offsets checks because XImode will emit two ldp/stp
10929 instructions (only big endian will get here). */
66f206b8
JW
10930 if (aarch64_advsimd_partial_struct_mode_p (mode)
10931 && known_eq (GET_MODE_SIZE (mode), 32))
10932 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10933 && aarch64_offset_7bit_signed_scaled_p (DImode,
10934 offset + 16));
10935 if (aarch64_advsimd_full_struct_mode_p (mode)
10936 && known_eq (GET_MODE_SIZE (mode), 64))
2d8c6dc1
AH
10937 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10938 && aarch64_offset_7bit_signed_scaled_p (TImode,
10939 offset + 32));
10940
43cacb12
RS
10941 /* Make "m" use the LD1 offset range for SVE data modes, so
10942 that pre-RTL optimizers like ivopts will work to that
10943 instead of the wider LDR/STR range. */
10944 if (vec_flags == VEC_SVE_DATA)
10945 return (type == ADDR_QUERY_M
10946 ? offset_4bit_signed_scaled_p (mode, offset)
10947 : offset_9bit_signed_scaled_p (mode, offset));
10948
9f4cbab8
RS
10949 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10950 {
10951 poly_int64 end_offset = (offset
10952 + GET_MODE_SIZE (mode)
10953 - BYTES_PER_SVE_VECTOR);
10954 return (type == ADDR_QUERY_M
10955 ? offset_4bit_signed_scaled_p (mode, offset)
10956 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10957 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10958 end_offset)));
10959 }
10960
43cacb12
RS
10961 if (vec_flags == VEC_SVE_PRED)
10962 return offset_9bit_signed_scaled_p (mode, offset);
10963
2d8c6dc1 10964 if (load_store_pair_p)
6a70badb 10965 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
10966 || known_eq (GET_MODE_SIZE (mode), 8)
10967 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 10968 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 10969 else
3c5af608 10970 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
43e9d192
IB
10971 || offset_12bit_unsigned_scaled_p (mode, offset));
10972 }
10973
10974 if (allow_reg_index_p)
10975 {
10976 /* Look for base + (scaled/extended) index register. */
10977 if (aarch64_base_register_rtx_p (op0, strict_p)
10978 && aarch64_classify_index (info, op1, mode, strict_p))
10979 {
10980 info->base = op0;
10981 return true;
10982 }
10983 if (aarch64_base_register_rtx_p (op1, strict_p)
10984 && aarch64_classify_index (info, op0, mode, strict_p))
10985 {
10986 info->base = op1;
10987 return true;
10988 }
10989 }
10990
10991 return false;
10992
10993 case POST_INC:
10994 case POST_DEC:
10995 case PRE_INC:
10996 case PRE_DEC:
10997 info->type = ADDRESS_REG_WB;
10998 info->base = XEXP (x, 0);
10999 info->offset = NULL_RTX;
11000 return aarch64_base_register_rtx_p (info->base, strict_p);
11001
11002 case POST_MODIFY:
11003 case PRE_MODIFY:
11004 info->type = ADDRESS_REG_WB;
11005 info->base = XEXP (x, 0);
11006 if (GET_CODE (XEXP (x, 1)) == PLUS
dc640181 11007 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
43e9d192
IB
11008 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
11009 && aarch64_base_register_rtx_p (info->base, strict_p))
11010 {
43e9d192 11011 info->offset = XEXP (XEXP (x, 1), 1);
dc640181 11012 info->const_offset = offset;
43e9d192 11013
0dc8e1e7 11014 /* TImode, TFmode and TDmode values are allowed in both pairs of X
43e9d192
IB
11015 registers and individual Q registers. The available
11016 address modes are:
11017 X,X: 7-bit signed scaled offset
11018 Q: 9-bit signed offset
 11019 We conservatively require an offset representable in both modes.
11020 */
0dc8e1e7 11021 if (mode == TImode || mode == TFmode || mode == TDmode)
44707478 11022 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3c5af608 11023 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
43e9d192 11024
2d8c6dc1 11025 if (load_store_pair_p)
6a70badb 11026 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
11027 || known_eq (GET_MODE_SIZE (mode), 8)
11028 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 11029 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 11030 else
3c5af608 11031 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
43e9d192
IB
11032 }
11033 return false;
11034
11035 case CONST:
11036 case SYMBOL_REF:
11037 case LABEL_REF:
79517551
SN
11038 /* load literal: pc-relative constant pool entry. Only supported
11039 for SI mode or larger. */
43e9d192 11040 info->type = ADDRESS_SYMBOLIC;
2d8c6dc1 11041
6a70badb
RS
11042 if (!load_store_pair_p
11043 && GET_MODE_SIZE (mode).is_constant (&const_size)
11044 && const_size >= 4)
43e9d192 11045 {
74b27d8e
RS
11046 poly_int64 offset;
11047 rtx sym = strip_offset_and_salt (x, &offset);
3793ecc1
AC
11048 return ((LABEL_REF_P (sym)
11049 || (SYMBOL_REF_P (sym)
b4f50fd4 11050 && CONSTANT_POOL_ADDRESS_P (sym)
9ee6540a 11051 && aarch64_pcrelative_literal_loads)));
43e9d192
IB
11052 }
11053 return false;
11054
11055 case LO_SUM:
11056 info->type = ADDRESS_LO_SUM;
11057 info->base = XEXP (x, 0);
11058 info->offset = XEXP (x, 1);
11059 if (allow_reg_index_p
11060 && aarch64_base_register_rtx_p (info->base, strict_p))
11061 {
74b27d8e
RS
11062 poly_int64 offset;
11063 HOST_WIDE_INT const_offset;
11064 rtx sym = strip_offset_and_salt (info->offset, &offset);
3793ecc1 11065 if (SYMBOL_REF_P (sym)
74b27d8e
RS
11066 && offset.is_constant (&const_offset)
11067 && (aarch64_classify_symbol (sym, const_offset)
43cacb12 11068 == SYMBOL_SMALL_ABSOLUTE))
43e9d192
IB
11069 {
11070 /* The symbol and offset must be aligned to the access size. */
11071 unsigned int align;
43e9d192
IB
11072
11073 if (CONSTANT_POOL_ADDRESS_P (sym))
11074 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11075 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11076 {
11077 tree exp = SYMBOL_REF_DECL (sym);
11078 align = TYPE_ALIGN (TREE_TYPE (exp));
58e17cf8 11079 align = aarch64_constant_alignment (exp, align);
43e9d192
IB
11080 }
11081 else if (SYMBOL_REF_DECL (sym))
11082 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6c031d8d
KV
11083 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11084 && SYMBOL_REF_BLOCK (sym) != NULL)
11085 align = SYMBOL_REF_BLOCK (sym)->alignment;
43e9d192
IB
11086 else
11087 align = BITS_PER_UNIT;
11088
6a70badb
RS
11089 poly_int64 ref_size = GET_MODE_SIZE (mode);
11090 if (known_eq (ref_size, 0))
43e9d192
IB
11091 ref_size = GET_MODE_SIZE (DImode);
11092
74b27d8e 11093 return (multiple_p (const_offset, ref_size)
6a70badb 11094 && multiple_p (align / BITS_PER_UNIT, ref_size));
43e9d192
IB
11095 }
11096 }
11097 return false;
11098
11099 default:
11100 return false;
11101 }
11102}
11103
9bf2f779
KT
11104/* Return true if the address X is valid for a PRFM instruction.
11105 STRICT_P is true if we should do strict checking with
11106 aarch64_classify_address. */
11107
11108bool
11109aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11110{
11111 struct aarch64_address_info addr;
11112
11113 /* PRFM accepts the same addresses as DImode... */
a97d8b98 11114 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9bf2f779
KT
11115 if (!res)
11116 return false;
11117
11118 /* ... except writeback forms. */
11119 return addr.type != ADDRESS_REG_WB;
11120}
11121
43e9d192
IB
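/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly wrapped in a constant offset and/or pointer-authentication
   salt.  */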
11122bool
11123aarch64_symbolic_address_p (rtx x)
11124{
74b27d8e
RS
11125 poly_int64 offset;
11126 x = strip_offset_and_salt (x, &offset);
3793ecc1 11127 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
43e9d192
IB
11128}
11129
a6e0bfa7 11130/* Classify the base of symbolic expression X. */
da4f13a4
MS
11131
11132enum aarch64_symbol_type
a6e0bfa7 11133aarch64_classify_symbolic_expression (rtx x)
43e9d192
IB
11134{
11135 rtx offset;
da4f13a4 11136
43e9d192 11137 split_const (x, &x, &offset);
43cacb12 11138 return aarch64_classify_symbol (x, INTVAL (offset));
43e9d192
IB
11139}
11140
11141
11142/* Return TRUE if X is a legitimate address for accessing memory in
11143 mode MODE. */
11144static bool
ef4bddc2 11145aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
43e9d192
IB
11146{
11147 struct aarch64_address_info addr;
11148
a97d8b98 11149 return aarch64_classify_address (&addr, x, mode, strict_p);
43e9d192
IB
11150}
11151
a97d8b98
RS
11152/* Return TRUE if X is a legitimate address of type TYPE for accessing
11153 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
43e9d192 11154bool
a97d8b98
RS
11155aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11156 aarch64_addr_query_type type)
43e9d192
IB
11157{
11158 struct aarch64_address_info addr;
11159
a97d8b98 11160 return aarch64_classify_address (&addr, x, mode, strict_p, type);
43e9d192
IB
11161}
11162
9005477f
RS
11163/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11164
491ec060 11165static bool
9005477f
RS
11166aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11167 poly_int64 orig_offset,
11168 machine_mode mode)
491ec060 11169{
6a70badb
RS
11170 HOST_WIDE_INT size;
11171 if (GET_MODE_SIZE (mode).is_constant (&size))
11172 {
9005477f
RS
11173 HOST_WIDE_INT const_offset, second_offset;
11174
11175 /* A general SVE offset is A * VQ + B. Remove the A component from
11176 coefficient 0 in order to get the constant B. */
11177 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11178
11179 /* Split an out-of-range address displacement into a base and
11180 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
11181 range otherwise to increase opportunities for sharing the base
11182 address of different sizes. Unaligned accesses use the signed
0dc8e1e7 11183 9-bit range, TImode/TFmode/TDmode use the intersection of signed
9005477f 11184 scaled 7-bit and signed 9-bit offset. */
0dc8e1e7 11185 if (mode == TImode || mode == TFmode || mode == TDmode)
9005477f
RS
11186 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11187 else if ((const_offset & (size - 1)) != 0)
11188 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6a70badb 11189 else
9005477f 11190 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
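	 /* Illustrative example: an aligned DImode offset of 0x13008 is
	    outside the unsigned scaled 12-bit LDR/STR range, so second_offset
	    becomes 0x3008 and the split below rebuilds the address as
	    (base + 0x10000) + 0x3008, where 0x3008 is in range.  */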
491ec060 11191
9005477f
RS
11192 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11193 return false;
11194
11195 /* Split the offset into second_offset and the rest. */
11196 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11197 *offset2 = gen_int_mode (second_offset, Pmode);
11198 return true;
11199 }
11200 else
11201 {
11202 /* Get the mode we should use as the basis of the range. For structure
11203 modes this is the mode of one vector. */
11204 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11205 machine_mode step_mode
11206 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11207
11208 /* Get the "mul vl" multiplier we'd like to use. */
11209 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11210 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11211 if (vec_flags & VEC_SVE_DATA)
11212 /* LDR supports a 9-bit range, but the move patterns for
11213 structure modes require all vectors to be in range of the
 11214 same base. The simplest way of accommodating that while still
11215 promoting reuse of anchor points between different modes is
11216 to use an 8-bit range unconditionally. */
11217 vnum = ((vnum + 128) & 255) - 128;
11218 else
11219 /* Predicates are only handled singly, so we might as well use
11220 the full range. */
11221 vnum = ((vnum + 256) & 511) - 256;
11222 if (vnum == 0)
11223 return false;
11224
11225 /* Convert the "mul vl" multiplier into a byte offset. */
11226 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11227 if (known_eq (second_offset, orig_offset))
11228 return false;
11229
11230 /* Split the offset into second_offset and the rest. */
11231 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11232 *offset2 = gen_int_mode (second_offset, Pmode);
6a70badb
RS
11233 return true;
11234 }
491ec060
WD
11235}
11236
a2170965
TC
11237/* Return the binary representation of floating point constant VALUE in INTVAL.
11238 If the value cannot be converted, return false without setting INTVAL.
 11239 The conversion is done in the mode of VALUE. */
11240bool
11241aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11242{
11243
11244 /* We make a general exception for 0. */
11245 if (aarch64_float_const_zero_rtx_p (value))
11246 {
11247 *intval = 0;
11248 return true;
11249 }
11250
0d0e0188 11251 scalar_float_mode mode;
3793ecc1 11252 if (!CONST_DOUBLE_P (value)
0d0e0188 11253 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
a2170965
TC
11254 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11255 /* Only support up to DF mode. */
11256 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11257 return false;
11258
11259 unsigned HOST_WIDE_INT ival = 0;
11260
11261 long res[2];
11262 real_to_target (res,
11263 CONST_DOUBLE_REAL_VALUE (value),
11264 REAL_MODE_FORMAT (mode));
11265
0dc8e1e7 11266 if (mode == DFmode || mode == DDmode)
5c22bb48
TC
11267 {
11268 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11269 ival = zext_hwi (res[order], 32);
11270 ival |= (zext_hwi (res[1 - order], 32) << 32);
11271 }
11272 else
11273 ival = zext_hwi (res[0], 32);
a2170965
TC
11274
11275 *intval = ival;
11276 return true;
11277}
11278
11279/* Return TRUE if rtx X is an immediate constant that can be moved using a
11280 single MOV(+MOVK) followed by an FMOV. */
11281bool
11282aarch64_float_const_rtx_p (rtx x)
11283{
11284 machine_mode mode = GET_MODE (x);
11285 if (mode == VOIDmode)
11286 return false;
11287
11288 /* Determine whether it's cheaper to write float constants as
 11289 mov/movk pairs rather than as ldr/adrp pairs. */
11290 unsigned HOST_WIDE_INT ival;
11291
3793ecc1 11292 if (CONST_DOUBLE_P (x)
a2170965
TC
11293 && SCALAR_FLOAT_MODE_P (mode)
11294 && aarch64_reinterpret_float_as_int (x, &ival))
11295 {
ba1536da 11296 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
a2170965
TC
11297 int num_instr = aarch64_internal_mov_immediate
11298 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11299 return num_instr < 3;
11300 }
11301
11302 return false;
11303}
11304
0dc8e1e7
CL
11305/* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11306 Floating Point). */
43e9d192 11307bool
3520f7cc 11308aarch64_float_const_zero_rtx_p (rtx x)
43e9d192 11309{
0dc8e1e7
CL
11310 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11311 zr as our callers expect, so no need to check the actual
11312 value if X is of Decimal Floating Point type. */
11313 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
43e9d192
IB
11314 return false;
11315
34a72c33 11316 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
43e9d192 11317 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
34a72c33 11318 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
43e9d192
IB
11319}
11320
a2170965
TC
 11321/* Return TRUE if rtx X is an immediate constant that fits in a single
11322 MOVI immediate operation. */
11323bool
11324aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11325{
11326 if (!TARGET_SIMD)
11327 return false;
11328
77e994c9
RS
11329 machine_mode vmode;
11330 scalar_int_mode imode;
a2170965
TC
11331 unsigned HOST_WIDE_INT ival;
11332
3793ecc1 11333 if (CONST_DOUBLE_P (x)
a2170965
TC
11334 && SCALAR_FLOAT_MODE_P (mode))
11335 {
11336 if (!aarch64_reinterpret_float_as_int (x, &ival))
11337 return false;
11338
35c38fa6
TC
11339 /* We make a general exception for 0. */
11340 if (aarch64_float_const_zero_rtx_p (x))
11341 return true;
11342
304b9962 11343 imode = int_mode_for_mode (mode).require ();
a2170965 11344 }
3793ecc1 11345 else if (CONST_INT_P (x)
77e994c9
RS
11346 && is_a <scalar_int_mode> (mode, &imode))
11347 ival = INTVAL (x);
a2170965
TC
11348 else
11349 return false;
11350
0dc8e1e7 11351 /* Use a 64-bit mode for everything except DI/DF/DD mode, where we use
a2170965 11352 a 128-bit vector mode. */
77e994c9 11353 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
a2170965
TC
11354
11355 vmode = aarch64_simd_container_mode (imode, width);
11356 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11357
b187677b 11358 return aarch64_simd_valid_immediate (v_op, NULL);
a2170965
TC
11359}
11360
11361
70f09188
AP
11362/* Return the fixed registers used for condition codes. */
11363
11364static bool
11365aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11366{
11367 *p1 = CC_REGNUM;
11368 *p2 = INVALID_REGNUM;
11369 return true;
11370}
11371
47210a04
RL
11372/* This function is used by the call expanders of the machine description.
11373 RESULT is the register in which the result is returned. It's NULL for
11374 "call" and "sibcall".
11375 MEM is the location of the function call.
08cc4d92 11376 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
47210a04
RL
 11377 SIBCALL indicates whether this function call is a normal call or a sibling call.
 11378 A different pattern is generated accordingly. */
11379
11380void
08cc4d92 11381aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
47210a04
RL
11382{
11383 rtx call, callee, tmp;
11384 rtvec vec;
11385 machine_mode mode;
11386
11387 gcc_assert (MEM_P (mem));
11388 callee = XEXP (mem, 0);
11389 mode = GET_MODE (callee);
11390 gcc_assert (mode == Pmode);
11391
11392 /* Decide if we should generate indirect calls by loading the
11393 address of the callee into a register before performing
11394 the branch-and-link. */
11395 if (SYMBOL_REF_P (callee)
11396 ? (aarch64_is_long_call_p (callee)
11397 || aarch64_is_noplt_call_p (callee))
11398 : !REG_P (callee))
11399 XEXP (mem, 0) = force_reg (mode, callee);
11400
11401 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11402
11403 if (result != NULL_RTX)
11404 call = gen_rtx_SET (result, call);
11405
11406 if (sibcall)
11407 tmp = ret_rtx;
11408 else
11409 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11410
08cc4d92
RS
11411 gcc_assert (CONST_INT_P (callee_abi));
11412 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11413 UNSPEC_CALLEE_ABI);
11414
11415 vec = gen_rtvec (3, call, callee_abi, tmp);
47210a04
RL
11416 call = gen_rtx_PARALLEL (VOIDmode, vec);
11417
11418 aarch64_emit_call_insn (call);
11419}
11420
78607708
TV
11421/* Emit call insn with PAT and do aarch64-specific handling. */
11422
d07a3fed 11423void
78607708
TV
11424aarch64_emit_call_insn (rtx pat)
11425{
11426 rtx insn = emit_call_insn (pat);
11427
11428 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11429 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11430 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11431}
11432
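/* Return the CC mode that should be used for comparing X and Y with
   comparison code CODE.  */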
ef4bddc2 11433machine_mode
43e9d192
IB
11434aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11435{
f7343f20
RE
11436 machine_mode mode_x = GET_MODE (x);
11437 rtx_code code_x = GET_CODE (x);
11438
43e9d192
IB
11439 /* All floating point compares return CCFP if it is an equality
11440 comparison, and CCFPE otherwise. */
f7343f20 11441 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
43e9d192
IB
11442 {
11443 switch (code)
11444 {
11445 case EQ:
11446 case NE:
11447 case UNORDERED:
11448 case ORDERED:
11449 case UNLT:
11450 case UNLE:
11451 case UNGT:
11452 case UNGE:
11453 case UNEQ:
43e9d192
IB
11454 return CCFPmode;
11455
11456 case LT:
11457 case LE:
11458 case GT:
11459 case GE:
8332c5ee 11460 case LTGT:
43e9d192
IB
11461 return CCFPEmode;
11462
11463 default:
11464 gcc_unreachable ();
11465 }
11466 }
11467
2b8568fe
KT
11468 /* Equality comparisons of short modes against zero can be performed
11469 using the TST instruction with the appropriate bitmask. */
f73dc006 11470 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
2b8568fe 11471 && (code == EQ || code == NE)
f7343f20 11472 && (mode_x == HImode || mode_x == QImode))
1cccf644 11473 return CC_Zmode;
2b8568fe 11474
b06335f9
KT
11475 /* Similarly, comparisons of zero_extends from shorter modes can
11476 be performed using an ANDS with an immediate mask. */
f7343f20
RE
11477 if (y == const0_rtx && code_x == ZERO_EXTEND
11478 && (mode_x == SImode || mode_x == DImode)
b06335f9
KT
11479 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11480 && (code == EQ || code == NE))
1cccf644
WD
11481 return CC_Zmode;
11482
11483 /* Zero extracts support equality comparisons. */
11484 if ((mode_x == SImode || mode_x == DImode)
11485 && y == const0_rtx
11486 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11487 && CONST_INT_P (XEXP (x, 2)))
11488 && (code == EQ || code == NE))
11489 return CC_Zmode;
11490
11491 /* ANDS/BICS/TST support equality and all signed comparisons. */
11492 if ((mode_x == SImode || mode_x == DImode)
11493 && y == const0_rtx
11494 && (code_x == AND)
11495 && (code == EQ || code == NE || code == LT || code == GE
11496 || code == GT || code == LE))
11497 return CC_NZVmode;
b06335f9 11498
1cccf644 11499 /* ADDS/SUBS correctly set N and Z flags. */
f7343f20 11500 if ((mode_x == SImode || mode_x == DImode)
43e9d192
IB
11501 && y == const0_rtx
11502 && (code == EQ || code == NE || code == LT || code == GE)
1cccf644 11503 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
43e9d192
IB
11504 return CC_NZmode;
11505
1c992d1e 11506 /* A compare with a shifted operand. Because of canonicalization,
43e9d192
IB
11507 the comparison will have to be swapped when we emit the assembly
11508 code. */
f7343f20 11509 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 11510 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
f7343f20
RE
11511 && (code_x == ASHIFT || code_x == ASHIFTRT
11512 || code_x == LSHIFTRT
11513 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
43e9d192
IB
11514 return CC_SWPmode;
11515
1c992d1e
RE
11516 /* Similarly for a negated operand, but we can only do this for
11517 equalities. */
f7343f20 11518 if ((mode_x == SImode || mode_x == DImode)
3793ecc1 11519 && (REG_P (y) || SUBREG_P (y))
1c992d1e 11520 && (code == EQ || code == NE)
f7343f20 11521 && code_x == NEG)
1c992d1e
RE
11522 return CC_Zmode;
11523
f7343f20
RE
11524 /* A test for unsigned overflow from an addition. */
11525 if ((mode_x == DImode || mode_x == TImode)
11526 && (code == LTU || code == GEU)
11527 && code_x == PLUS
11528 && rtx_equal_p (XEXP (x, 0), y))
ef22810a
RH
11529 return CC_Cmode;
11530
f7343f20
RE
11531 /* A test for unsigned overflow from an add with carry. */
11532 if ((mode_x == DImode || mode_x == TImode)
11533 && (code == LTU || code == GEU)
11534 && code_x == PLUS
11535 && CONST_SCALAR_INT_P (y)
11536 && (rtx_mode_t (y, mode_x)
11537 == (wi::shwi (1, mode_x)
11538 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11539 return CC_ADCmode;
11540
30c46053 11541 /* A test for signed overflow. */
f7343f20 11542 if ((mode_x == DImode || mode_x == TImode)
30c46053 11543 && code == NE
f7343f20 11544 && code_x == PLUS
30c46053
MC
11545 && GET_CODE (y) == SIGN_EXTEND)
11546 return CC_Vmode;
11547
43e9d192
IB
11548 /* For everything else, return CCmode. */
11549 return CCmode;
11550}
11551
3dfa7055 11552static int
b8506a8a 11553aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
3dfa7055 11554
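/* Return the AARCH64_* condition code for comparison rtx X, or -1 if no
   single condition code can represent it.  */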
cd5660ab 11555int
43e9d192
IB
11556aarch64_get_condition_code (rtx x)
11557{
ef4bddc2 11558 machine_mode mode = GET_MODE (XEXP (x, 0));
43e9d192
IB
11559 enum rtx_code comp_code = GET_CODE (x);
11560
11561 if (GET_MODE_CLASS (mode) != MODE_CC)
11562 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3dfa7055
ZC
11563 return aarch64_get_condition_code_1 (mode, comp_code);
11564}
43e9d192 11565
3dfa7055 11566static int
b8506a8a 11567aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
3dfa7055 11568{
43e9d192
IB
11569 switch (mode)
11570 {
4e10a5a7
RS
11571 case E_CCFPmode:
11572 case E_CCFPEmode:
43e9d192
IB
11573 switch (comp_code)
11574 {
11575 case GE: return AARCH64_GE;
11576 case GT: return AARCH64_GT;
11577 case LE: return AARCH64_LS;
11578 case LT: return AARCH64_MI;
11579 case NE: return AARCH64_NE;
11580 case EQ: return AARCH64_EQ;
11581 case ORDERED: return AARCH64_VC;
11582 case UNORDERED: return AARCH64_VS;
11583 case UNLT: return AARCH64_LT;
11584 case UNLE: return AARCH64_LE;
11585 case UNGT: return AARCH64_HI;
11586 case UNGE: return AARCH64_PL;
cd5660ab 11587 default: return -1;
43e9d192
IB
11588 }
11589 break;
11590
4e10a5a7 11591 case E_CCmode:
43e9d192
IB
11592 switch (comp_code)
11593 {
11594 case NE: return AARCH64_NE;
11595 case EQ: return AARCH64_EQ;
11596 case GE: return AARCH64_GE;
11597 case GT: return AARCH64_GT;
11598 case LE: return AARCH64_LE;
11599 case LT: return AARCH64_LT;
11600 case GEU: return AARCH64_CS;
11601 case GTU: return AARCH64_HI;
11602 case LEU: return AARCH64_LS;
11603 case LTU: return AARCH64_CC;
cd5660ab 11604 default: return -1;
43e9d192
IB
11605 }
11606 break;
11607
4e10a5a7 11608 case E_CC_SWPmode:
43e9d192
IB
11609 switch (comp_code)
11610 {
11611 case NE: return AARCH64_NE;
11612 case EQ: return AARCH64_EQ;
11613 case GE: return AARCH64_LE;
11614 case GT: return AARCH64_LT;
11615 case LE: return AARCH64_GE;
11616 case LT: return AARCH64_GT;
11617 case GEU: return AARCH64_LS;
11618 case GTU: return AARCH64_CC;
11619 case LEU: return AARCH64_CS;
11620 case LTU: return AARCH64_HI;
cd5660ab 11621 default: return -1;
43e9d192
IB
11622 }
11623 break;
11624
57d6f4d0
RS
11625 case E_CC_NZCmode:
11626 switch (comp_code)
11627 {
11628 case NE: return AARCH64_NE; /* = any */
11629 case EQ: return AARCH64_EQ; /* = none */
11630 case GE: return AARCH64_PL; /* = nfrst */
11631 case LT: return AARCH64_MI; /* = first */
11632 case GEU: return AARCH64_CS; /* = nlast */
11633 case GTU: return AARCH64_HI; /* = pmore */
11634 case LEU: return AARCH64_LS; /* = plast */
11635 case LTU: return AARCH64_CC; /* = last */
11636 default: return -1;
11637 }
11638 break;
11639
1cccf644
WD
11640 case E_CC_NZVmode:
11641 switch (comp_code)
11642 {
11643 case NE: return AARCH64_NE;
11644 case EQ: return AARCH64_EQ;
11645 case GE: return AARCH64_PL;
11646 case LT: return AARCH64_MI;
11647 case GT: return AARCH64_GT;
11648 case LE: return AARCH64_LE;
11649 default: return -1;
11650 }
11651 break;
11652
4e10a5a7 11653 case E_CC_NZmode:
43e9d192
IB
11654 switch (comp_code)
11655 {
11656 case NE: return AARCH64_NE;
11657 case EQ: return AARCH64_EQ;
11658 case GE: return AARCH64_PL;
11659 case LT: return AARCH64_MI;
cd5660ab 11660 default: return -1;
43e9d192
IB
11661 }
11662 break;
11663
4e10a5a7 11664 case E_CC_Zmode:
1c992d1e
RE
11665 switch (comp_code)
11666 {
11667 case NE: return AARCH64_NE;
11668 case EQ: return AARCH64_EQ;
cd5660ab 11669 default: return -1;
1c992d1e
RE
11670 }
11671 break;
11672
4e10a5a7 11673 case E_CC_Cmode:
ef22810a
RH
11674 switch (comp_code)
11675 {
f7343f20
RE
11676 case LTU: return AARCH64_CS;
11677 case GEU: return AARCH64_CC;
11678 default: return -1;
11679 }
11680 break;
11681
11682 case E_CC_ADCmode:
11683 switch (comp_code)
11684 {
11685 case GEU: return AARCH64_CS;
11686 case LTU: return AARCH64_CC;
ef22810a
RH
11687 default: return -1;
11688 }
11689 break;
11690
30c46053
MC
11691 case E_CC_Vmode:
11692 switch (comp_code)
11693 {
11694 case NE: return AARCH64_VS;
11695 case EQ: return AARCH64_VC;
11696 default: return -1;
11697 }
11698 break;
11699
43e9d192 11700 default:
cd5660ab 11701 return -1;
43e9d192 11702 }
3dfa7055 11703
3dfa7055 11704 return -1;
43e9d192
IB
11705}
11706
ddeabd3e
AL
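/* Return true if X is a vector constant whose elements are all the same
   CONST_INT, with that value lying in [MINVAL, MAXVAL].  */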
11707bool
11708aarch64_const_vec_all_same_in_range_p (rtx x,
6a70badb
RS
11709 HOST_WIDE_INT minval,
11710 HOST_WIDE_INT maxval)
ddeabd3e 11711{
6a70badb
RS
11712 rtx elt;
11713 return (const_vec_duplicate_p (x, &elt)
11714 && CONST_INT_P (elt)
11715 && IN_RANGE (INTVAL (elt), minval, maxval));
ddeabd3e
AL
11716}
11717
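/* Return true if X is a vector constant whose elements are all equal
   to VAL.  */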
11718bool
11719aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11720{
11721 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11722}
11723
43cacb12
RS
11724/* Return true if VEC is a constant in which every element is in the range
11725 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11726
11727static bool
11728aarch64_const_vec_all_in_range_p (rtx vec,
11729 HOST_WIDE_INT minval,
11730 HOST_WIDE_INT maxval)
11731{
568b9c0e 11732 if (!CONST_VECTOR_P (vec)
43cacb12
RS
11733 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11734 return false;
11735
11736 int nunits;
11737 if (!CONST_VECTOR_STEPPED_P (vec))
11738 nunits = const_vector_encoded_nelts (vec);
11739 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11740 return false;
11741
11742 for (int i = 0; i < nunits; i++)
11743 {
11744 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11745 if (!CONST_INT_P (vec_elem)
11746 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11747 return false;
11748 }
11749 return true;
11750}
43e9d192 11751
cf670503
ZC
11752/* N Z C V. */
11753#define AARCH64_CC_V 1
11754#define AARCH64_CC_C (1 << 1)
11755#define AARCH64_CC_Z (1 << 2)
11756#define AARCH64_CC_N (1 << 3)
11757
c8012fbc
WD
11758/* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11759static const int aarch64_nzcv_codes[] =
11760{
11761 0, /* EQ, Z == 1. */
11762 AARCH64_CC_Z, /* NE, Z == 0. */
11763 0, /* CS, C == 1. */
11764 AARCH64_CC_C, /* CC, C == 0. */
11765 0, /* MI, N == 1. */
11766 AARCH64_CC_N, /* PL, N == 0. */
11767 0, /* VS, V == 1. */
11768 AARCH64_CC_V, /* VC, V == 0. */
 11769 0, /* HI, C == 1 && Z == 0. */
11770 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11771 AARCH64_CC_V, /* GE, N == V. */
11772 0, /* LT, N != V. */
11773 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11774 0, /* LE, !(Z == 0 && N == V). */
11775 0, /* AL, Any. */
11776 0 /* NV, Any. */
cf670503
ZC
11777};
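/* Each entry above gives an N/Z/C/V combination under which the condition
   it is indexed by evaluates to false; AL and NV always hold, so their
   entries are never significant.  */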
11778
43cacb12
RS
11779/* Print floating-point vector immediate operand X to F, negating it
11780 first if NEGATE is true. Return true on success, false if it isn't
11781 a constant we can handle. */
11782
11783static bool
11784aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11785{
11786 rtx elt;
11787
11788 if (!const_vec_duplicate_p (x, &elt))
11789 return false;
11790
11791 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11792 if (negate)
11793 r = real_value_negate (&r);
11794
d29f7dd5
RS
11795 /* Handle the SVE single-bit immediates specially, since they have a
11796 fixed form in the assembly syntax. */
43cacb12
RS
11797 if (real_equal (&r, &dconst0))
11798 asm_fprintf (f, "0.0");
a19ba9e1
RS
11799 else if (real_equal (&r, &dconst2))
11800 asm_fprintf (f, "2.0");
43cacb12
RS
11801 else if (real_equal (&r, &dconst1))
11802 asm_fprintf (f, "1.0");
11803 else if (real_equal (&r, &dconsthalf))
11804 asm_fprintf (f, "0.5");
11805 else
d29f7dd5
RS
11806 {
11807 const int buf_size = 20;
11808 char float_buf[buf_size] = {'\0'};
11809 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11810 1, GET_MODE (elt));
11811 asm_fprintf (f, "%s", float_buf);
11812 }
43cacb12
RS
11813
11814 return true;
11815}
11816
9f4cbab8
RS
11817/* Return the equivalent letter for size. */
11818static char
11819sizetochar (int size)
11820{
11821 switch (size)
11822 {
11823 case 64: return 'd';
11824 case 32: return 's';
11825 case 16: return 'h';
11826 case 8 : return 'b';
11827 default: gcc_unreachable ();
11828 }
11829}
11830
bcf19844
JW
11831/* Print operand X to file F in a target specific manner according to CODE.
11832 The acceptable formatting commands given by CODE are:
11833 'c': An integer or symbol address without a preceding #
11834 sign.
43cacb12
RS
11835 'C': Take the duplicated element in a vector constant
11836 and print it in hex.
11837 'D': Take the duplicated element in a vector constant
11838 and print it as an unsigned integer, in decimal.
bcf19844 11839 'e': Print the sign/zero-extend size as a character 8->b,
d113ece6
RS
11840 16->h, 32->w. Can also be used for masks:
11841 0xff->b, 0xffff->h, 0xffffffff->w.
d29f7dd5
RS
11842 'I': If the operand is a duplicated vector constant,
11843 replace it with the duplicated scalar. If the
11844 operand is then a floating-point constant, replace
11845 it with the integer bit representation. Print the
11846 transformed constant as a signed decimal number.
bcf19844
JW
11847 'p': Prints N such that 2^N == X (X must be power of 2 and
11848 const int).
11849 'P': Print the number of non-zero bits in X (a const_int).
11850 'H': Print the higher numbered register of a pair (TImode)
11851 of regs.
11852 'm': Print a condition (eq, ne, etc).
11853 'M': Same as 'm', but invert condition.
43cacb12
RS
11854 'N': Take the duplicated element in a vector constant
11855 and print the negative of it in decimal.
bcf19844
JW
11856 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11857 'S/T/U/V': Print a FP/SIMD register name for a register list.
11858 The register printed is the FP/SIMD register name
11859 of X + 0/1/2/3 for S/T/U/V.
e3f15286 11860 'R': Print a scalar Integer/FP/SIMD register name + 1.
bcf19844
JW
11861 'X': Print bottom 16 bits of integer constant in hex.
11862 'w/x': Print a general register name or the zero register
11863 (32-bit or 64-bit).
11864 '0': Print a normal operand, if it's a general register,
11865 then we assume DImode.
11866 'k': Print NZCV for conditional compare instructions.
11867 'A': Output address constant representing the first
11868 argument of X, specifying a relocation offset
11869 if appropriate.
11870 'L': Output constant address specified by X
11871 with a relocation offset if appropriate.
11872 'G': Prints address of X, specifying a PC relative
e69a816d
WD
11873 relocation mode if appropriate.
11874 'y': Output address of LDP or STP - this is used for
11875 some LDP/STPs which don't use a PARALLEL in their
11876 pattern (so the mode needs to be adjusted).
11877 'z': Output address of a typical LDP or STP. */
bcf19844 11878
cc8ca59e
JB
11879static void
11880aarch64_print_operand (FILE *f, rtx x, int code)
43e9d192 11881{
43cacb12 11882 rtx elt;
43e9d192
IB
11883 switch (code)
11884 {
f541a481 11885 case 'c':
74b27d8e
RS
11886 if (CONST_INT_P (x))
11887 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11888 else
f541a481 11889 {
74b27d8e
RS
11890 poly_int64 offset;
11891 rtx base = strip_offset_and_salt (x, &offset);
11892 if (SYMBOL_REF_P (base))
11893 output_addr_const (f, x);
11894 else
11895 output_operand_lossage ("unsupported operand for code '%c'", code);
f541a481
KT
11896 }
11897 break;
11898
43e9d192 11899 case 'e':
43e9d192 11900 {
d113ece6
RS
11901 x = unwrap_const_vec_duplicate (x);
11902 if (!CONST_INT_P (x))
43e9d192
IB
11903 {
11904 output_operand_lossage ("invalid operand for '%%%c'", code);
11905 return;
11906 }
11907
d113ece6
RS
11908 HOST_WIDE_INT val = INTVAL (x);
11909 if ((val & ~7) == 8 || val == 0xff)
11910 fputc ('b', f);
11911 else if ((val & ~7) == 16 || val == 0xffff)
11912 fputc ('h', f);
11913 else if ((val & ~7) == 32 || val == 0xffffffff)
11914 fputc ('w', f);
11915 else
43e9d192 11916 {
43e9d192
IB
11917 output_operand_lossage ("invalid operand for '%%%c'", code);
11918 return;
11919 }
11920 }
11921 break;
11922
11923 case 'p':
11924 {
11925 int n;
11926
4aa81c2e 11927 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
43e9d192
IB
11928 {
11929 output_operand_lossage ("invalid operand for '%%%c'", code);
11930 return;
11931 }
11932
11933 asm_fprintf (f, "%d", n);
11934 }
11935 break;
11936
11937 case 'P':
4aa81c2e 11938 if (!CONST_INT_P (x))
43e9d192
IB
11939 {
11940 output_operand_lossage ("invalid operand for '%%%c'", code);
11941 return;
11942 }
11943
8d55c61b 11944 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
43e9d192
IB
11945 break;
11946
11947 case 'H':
c0111dc4
RE
11948 if (x == const0_rtx)
11949 {
11950 asm_fprintf (f, "xzr");
11951 break;
11952 }
11953
4aa81c2e 11954 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
43e9d192
IB
11955 {
11956 output_operand_lossage ("invalid operand for '%%%c'", code);
11957 return;
11958 }
11959
01a3a324 11960 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
43e9d192
IB
11961 break;
11962
d29f7dd5
RS
11963 case 'I':
11964 {
11965 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11966 if (CONST_INT_P (x))
11967 asm_fprintf (f, "%wd", INTVAL (x));
11968 else
11969 {
11970 output_operand_lossage ("invalid operand for '%%%c'", code);
11971 return;
11972 }
11973 break;
11974 }
11975
43e9d192 11976 case 'M':
c8012fbc 11977 case 'm':
cd5660ab
KT
11978 {
11979 int cond_code;
c8012fbc
WD
11980 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11981 if (x == const_true_rtx)
cd5660ab 11982 {
c8012fbc
WD
11983 if (code == 'M')
11984 fputs ("nv", f);
cd5660ab
KT
11985 return;
11986 }
43e9d192 11987
cd5660ab
KT
11988 if (!COMPARISON_P (x))
11989 {
11990 output_operand_lossage ("invalid operand for '%%%c'", code);
11991 return;
11992 }
c8012fbc 11993
cd5660ab
KT
11994 cond_code = aarch64_get_condition_code (x);
11995 gcc_assert (cond_code >= 0);
c8012fbc
WD
11996 if (code == 'M')
11997 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
57d6f4d0
RS
11998 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11999 fputs (aarch64_sve_condition_codes[cond_code], f);
12000 else
12001 fputs (aarch64_condition_codes[cond_code], f);
cd5660ab 12002 }
43e9d192
IB
12003 break;
12004
43cacb12
RS
12005 case 'N':
12006 if (!const_vec_duplicate_p (x, &elt))
12007 {
12008 output_operand_lossage ("invalid vector constant");
12009 return;
12010 }
12011
12012 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
1c0c371d 12013 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
43cacb12
RS
12014 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12015 && aarch64_print_vector_float_operand (f, x, true))
12016 ;
12017 else
12018 {
12019 output_operand_lossage ("invalid vector constant");
12020 return;
12021 }
12022 break;
12023
43e9d192
IB
12024 case 'b':
12025 case 'h':
12026 case 's':
12027 case 'd':
12028 case 'q':
43e9d192
IB
12029 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12030 {
12031 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12032 return;
12033 }
50ce6f88 12034 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
43e9d192
IB
12035 break;
12036
12037 case 'S':
12038 case 'T':
12039 case 'U':
12040 case 'V':
43e9d192
IB
12041 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12042 {
12043 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12044 return;
12045 }
43cacb12
RS
12046 asm_fprintf (f, "%c%d",
12047 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12048 REGNO (x) - V0_REGNUM + (code - 'S'));
43e9d192
IB
12049 break;
12050
2d8c6dc1 12051 case 'R':
66f206b8
JW
12052 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12053 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12054 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12055 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
e3f15286
RH
12056 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12057 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12058 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12059 else
12060 output_operand_lossage ("incompatible register operand for '%%%c'",
12061 code);
2d8c6dc1
AH
12062 break;
12063
a05c0ddf 12064 case 'X':
4aa81c2e 12065 if (!CONST_INT_P (x))
a05c0ddf
IB
12066 {
12067 output_operand_lossage ("invalid operand for '%%%c'", code);
12068 return;
12069 }
50d38551 12070 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
a05c0ddf
IB
12071 break;
12072
43cacb12
RS
12073 case 'C':
12074 {
12075 /* Print a replicated constant in hex. */
12076 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12077 {
12078 output_operand_lossage ("invalid operand for '%%%c'", code);
12079 return;
12080 }
12081 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12082 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12083 }
12084 break;
12085
12086 case 'D':
12087 {
12088 /* Print a replicated constant in decimal, treating it as
12089 unsigned. */
12090 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12091 {
12092 output_operand_lossage ("invalid operand for '%%%c'", code);
12093 return;
12094 }
12095 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12096 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12097 }
12098 break;
12099
43e9d192
IB
12100 case 'w':
12101 case 'x':
3520f7cc
JG
12102 if (x == const0_rtx
12103 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
43e9d192 12104 {
50ce6f88 12105 asm_fprintf (f, "%czr", code);
43e9d192
IB
12106 break;
12107 }
12108
12109 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12110 {
50ce6f88 12111 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
43e9d192
IB
12112 break;
12113 }
12114
12115 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12116 {
50ce6f88 12117 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
43e9d192
IB
12118 break;
12119 }
12120
12121 /* Fall through */
12122
12123 case 0:
43e9d192
IB
12124 if (x == NULL)
12125 {
12126 output_operand_lossage ("missing operand");
12127 return;
12128 }
12129
12130 switch (GET_CODE (x))
12131 {
12132 case REG:
43cacb12 12133 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9f4cbab8
RS
12134 {
12135 if (REG_NREGS (x) == 1)
12136 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12137 else
12138 {
12139 char suffix
12140 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12141 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12142 REGNO (x) - V0_REGNUM, suffix,
12143 END_REGNO (x) - V0_REGNUM - 1, suffix);
12144 }
12145 }
43cacb12
RS
12146 else
12147 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
43e9d192
IB
12148 break;
12149
12150 case MEM:
cc8ca59e 12151 output_address (GET_MODE (x), XEXP (x, 0));
43e9d192
IB
12152 break;
12153
12154 case LABEL_REF:
12155 case SYMBOL_REF:
12156 output_addr_const (asm_out_file, x);
12157 break;
12158
12159 case CONST_INT:
12160 asm_fprintf (f, "%wd", INTVAL (x));
12161 break;
12162
43cacb12
RS
12163 case CONST:
12164 if (!VECTOR_MODE_P (GET_MODE (x)))
3520f7cc 12165 {
43cacb12
RS
12166 output_addr_const (asm_out_file, x);
12167 break;
3520f7cc 12168 }
43cacb12
RS
12169 /* fall through */
12170
12171 case CONST_VECTOR:
12172 if (!const_vec_duplicate_p (x, &elt))
3520f7cc 12173 {
43cacb12
RS
12174 output_operand_lossage ("invalid vector constant");
12175 return;
3520f7cc 12176 }
43cacb12
RS
12177
12178 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12179 asm_fprintf (f, "%wd", INTVAL (elt));
12180 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12181 && aarch64_print_vector_float_operand (f, x, false))
12182 ;
3520f7cc 12183 else
43cacb12
RS
12184 {
12185 output_operand_lossage ("invalid vector constant");
12186 return;
12187 }
43e9d192
IB
12188 break;
12189
3520f7cc 12190 case CONST_DOUBLE:
2ca5b430
KT
12191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12192 be getting CONST_DOUBLEs holding integers. */
12193 gcc_assert (GET_MODE (x) != VOIDmode);
12194 if (aarch64_float_const_zero_rtx_p (x))
3520f7cc
JG
12195 {
12196 fputc ('0', f);
12197 break;
12198 }
12199 else if (aarch64_float_const_representable_p (x))
12200 {
12201#define buf_size 20
12202 char float_buf[buf_size] = {'\0'};
34a72c33
RS
12203 real_to_decimal_for_mode (float_buf,
12204 CONST_DOUBLE_REAL_VALUE (x),
3520f7cc
JG
12205 buf_size, buf_size,
12206 1, GET_MODE (x));
12207 asm_fprintf (asm_out_file, "%s", float_buf);
12208 break;
12209#undef buf_size
12210 }
12211 output_operand_lossage ("invalid constant");
12212 return;
43e9d192
IB
12213 default:
12214 output_operand_lossage ("invalid operand");
12215 return;
12216 }
12217 break;
12218
12219 case 'A':
12220 if (GET_CODE (x) == HIGH)
12221 x = XEXP (x, 0);
12222
a6e0bfa7 12223 switch (aarch64_classify_symbolic_expression (x))
43e9d192 12224 {
6642bdb4 12225 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
12226 asm_fprintf (asm_out_file, ":got:");
12227 break;
12228
12229 case SYMBOL_SMALL_TLSGD:
12230 asm_fprintf (asm_out_file, ":tlsgd:");
12231 break;
12232
12233 case SYMBOL_SMALL_TLSDESC:
12234 asm_fprintf (asm_out_file, ":tlsdesc:");
12235 break;
12236
79496620 12237 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
12238 asm_fprintf (asm_out_file, ":gottprel:");
12239 break;
12240
d18ba284 12241 case SYMBOL_TLSLE24:
43e9d192
IB
12242 asm_fprintf (asm_out_file, ":tprel:");
12243 break;
12244
87dd8ab0
MS
12245 case SYMBOL_TINY_GOT:
12246 gcc_unreachable ();
12247 break;
12248
43e9d192
IB
12249 default:
12250 break;
12251 }
12252 output_addr_const (asm_out_file, x);
12253 break;
12254
12255 case 'L':
a6e0bfa7 12256 switch (aarch64_classify_symbolic_expression (x))
43e9d192 12257 {
6642bdb4 12258 case SYMBOL_SMALL_GOT_4G:
a195c727 12259 asm_fprintf (asm_out_file, ":got_lo12:");
43e9d192
IB
12260 break;
12261
12262 case SYMBOL_SMALL_TLSGD:
12263 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12264 break;
12265
12266 case SYMBOL_SMALL_TLSDESC:
12267 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12268 break;
12269
79496620 12270 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
12271 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12272 break;
12273
cbf5629e
JW
12274 case SYMBOL_TLSLE12:
12275 asm_fprintf (asm_out_file, ":tprel_lo12:");
12276 break;
12277
d18ba284 12278 case SYMBOL_TLSLE24:
43e9d192
IB
12279 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12280 break;
12281
87dd8ab0
MS
12282 case SYMBOL_TINY_GOT:
12283 asm_fprintf (asm_out_file, ":got:");
12284 break;
12285
5ae7caad
JW
12286 case SYMBOL_TINY_TLSIE:
12287 asm_fprintf (asm_out_file, ":gottprel:");
12288 break;
12289
43e9d192
IB
12290 default:
12291 break;
12292 }
12293 output_addr_const (asm_out_file, x);
12294 break;
12295
12296 case 'G':
a6e0bfa7 12297 switch (aarch64_classify_symbolic_expression (x))
43e9d192 12298 {
d18ba284 12299 case SYMBOL_TLSLE24:
43e9d192
IB
12300 asm_fprintf (asm_out_file, ":tprel_hi12:");
12301 break;
12302 default:
12303 break;
12304 }
12305 output_addr_const (asm_out_file, x);
12306 break;
12307
cf670503
ZC
12308 case 'k':
12309 {
c8012fbc 12310 HOST_WIDE_INT cond_code;
cf670503 12311
c8012fbc 12312 if (!CONST_INT_P (x))
cf670503
ZC
12313 {
12314 output_operand_lossage ("invalid operand for '%%%c'", code);
12315 return;
12316 }
12317
c8012fbc
WD
12318 cond_code = INTVAL (x);
12319 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12320 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
cf670503
ZC
12321 }
12322 break;
12323
e69a816d
WD
12324 case 'y':
12325 case 'z':
12326 {
12327 machine_mode mode = GET_MODE (x);
12328
3793ecc1 12329 if (!MEM_P (x)
83d7e720
RS
12330 || (code == 'y'
12331 && maybe_ne (GET_MODE_SIZE (mode), 8)
12332 && maybe_ne (GET_MODE_SIZE (mode), 16)))
e69a816d
WD
12333 {
12334 output_operand_lossage ("invalid operand for '%%%c'", code);
12335 return;
12336 }
12337
a25831ac
AV
12338 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12339 code == 'y'
12340 ? ADDR_QUERY_LDP_STP_N
12341 : ADDR_QUERY_LDP_STP))
c348cab0 12342 output_operand_lossage ("invalid operand prefix '%%%c'", code);
e69a816d
WD
12343 }
12344 break;
12345
43e9d192
IB
12346 default:
12347 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12348 return;
12349 }
12350}
12351
e69a816d
WD
12352/* Print address 'x' of a memory access with mode 'mode'.
 12353 TYPE is the aarch64_addr_query_type context required by
 12354 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP operand. */
c348cab0 12355static bool
a97d8b98
RS
12356aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12357 aarch64_addr_query_type type)
43e9d192
IB
12358{
12359 struct aarch64_address_info addr;
550a3380 12360 unsigned int size, vec_flags;
43e9d192 12361
e69a816d 12362 /* Check all addresses are Pmode - including ILP32. */
31460ed2
JJ
12363 if (GET_MODE (x) != Pmode
12364 && (!CONST_INT_P (x)
12365 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12366 {
12367 output_operand_lossage ("invalid address mode");
12368 return false;
12369 }
e69a816d 12370
a97d8b98 12371 if (aarch64_classify_address (&addr, x, mode, true, type))
43e9d192
IB
12372 switch (addr.type)
12373 {
12374 case ADDRESS_REG_IMM:
dc640181 12375 if (known_eq (addr.const_offset, 0))
43cacb12 12376 {
550a3380
RS
12377 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12378 return true;
43cacb12 12379 }
550a3380
RS
12380
12381 vec_flags = aarch64_classify_vector_mode (mode);
12382 if (vec_flags & VEC_ANY_SVE)
43cacb12
RS
12383 {
12384 HOST_WIDE_INT vnum
12385 = exact_div (addr.const_offset,
550a3380 12386 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
43cacb12
RS
12387 asm_fprintf (f, "[%s, #%wd, mul vl]",
12388 reg_names[REGNO (addr.base)], vnum);
550a3380 12389 return true;
43cacb12 12390 }
550a3380
RS
12391
12392 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12393 INTVAL (addr.offset));
c348cab0 12394 return true;
43e9d192
IB
12395
12396 case ADDRESS_REG_REG:
12397 if (addr.shift == 0)
16a3246f 12398 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
01a3a324 12399 reg_names [REGNO (addr.offset)]);
43e9d192 12400 else
16a3246f 12401 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
01a3a324 12402 reg_names [REGNO (addr.offset)], addr.shift);
c348cab0 12403 return true;
43e9d192
IB
12404
12405 case ADDRESS_REG_UXTW:
12406 if (addr.shift == 0)
16a3246f 12407 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
12408 REGNO (addr.offset) - R0_REGNUM);
12409 else
16a3246f 12410 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 12411 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 12412 return true;
43e9d192
IB
12413
12414 case ADDRESS_REG_SXTW:
12415 if (addr.shift == 0)
16a3246f 12416 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
12417 REGNO (addr.offset) - R0_REGNUM);
12418 else
16a3246f 12419 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 12420 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 12421 return true;
43e9d192
IB
12422
12423 case ADDRESS_REG_WB:
6a70badb
RS
12424 /* Writeback is only supported for fixed-width modes. */
12425 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192
IB
12426 switch (GET_CODE (x))
12427 {
12428 case PRE_INC:
6a70badb 12429 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
c348cab0 12430 return true;
43e9d192 12431 case POST_INC:
6a70badb 12432 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
c348cab0 12433 return true;
43e9d192 12434 case PRE_DEC:
6a70badb 12435 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
c348cab0 12436 return true;
43e9d192 12437 case POST_DEC:
6a70badb 12438 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
c348cab0 12439 return true;
43e9d192 12440 case PRE_MODIFY:
6a70badb 12441 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
43e9d192 12442 INTVAL (addr.offset));
c348cab0 12443 return true;
43e9d192 12444 case POST_MODIFY:
6a70badb 12445 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
43e9d192 12446 INTVAL (addr.offset));
c348cab0 12447 return true;
43e9d192
IB
12448 default:
12449 break;
12450 }
12451 break;
12452
12453 case ADDRESS_LO_SUM:
16a3246f 12454 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
43e9d192
IB
12455 output_addr_const (f, addr.offset);
12456 asm_fprintf (f, "]");
c348cab0 12457 return true;
43e9d192
IB
12458
12459 case ADDRESS_SYMBOLIC:
d6591257 12460 output_addr_const (f, x);
c348cab0 12461 return true;
43e9d192
IB
12462 }
12463
c348cab0 12464 return false;
43e9d192
IB
12465}
12466
e69a816d
WD
12467/* Print address 'x' of a memory access with mode 'mode'. */
12468static void
12469aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12470{
43cacb12 12471 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
c348cab0 12472 output_addr_const (f, x);
e69a816d
WD
12473}
12474
74b27d8e
RS
12475/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12476
12477static bool
12478aarch64_output_addr_const_extra (FILE *file, rtx x)
12479{
12480 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12481 {
12482 output_addr_const (file, XVECEXP (x, 0, 0));
12483 return true;
12484 }
12485 return false;
12486}
12487
43e9d192
IB
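/* Return true if X or one of its subexpressions mentions a LABEL_REF,
   other than the LABEL_REFs inside UNSPEC_TLS operands, which are constant
   offsets rather than real label references.  */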
12488bool
12489aarch64_label_mentioned_p (rtx x)
12490{
12491 const char *fmt;
12492 int i;
12493
3793ecc1 12494 if (LABEL_REF_P (x))
43e9d192
IB
12495 return true;
12496
12497 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12498 referencing instruction, but they are constant offsets, not
12499 symbols. */
12500 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12501 return false;
12502
12503 fmt = GET_RTX_FORMAT (GET_CODE (x));
12504 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12505 {
12506 if (fmt[i] == 'E')
12507 {
12508 int j;
12509
12510 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12511 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12512 return 1;
12513 }
12514 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12515 return 1;
12516 }
12517
12518 return 0;
12519}
12520
12521/* Implement REGNO_REG_CLASS. */
12522
12523enum reg_class
12524aarch64_regno_regclass (unsigned regno)
12525{
96b7f495
MM
12526 if (STUB_REGNUM_P (regno))
12527 return STUB_REGS;
12528
43e9d192 12529 if (GP_REGNUM_P (regno))
a4a182c6 12530 return GENERAL_REGS;
43e9d192
IB
12531
12532 if (regno == SP_REGNUM)
12533 return STACK_REG;
12534
12535 if (regno == FRAME_POINTER_REGNUM
12536 || regno == ARG_POINTER_REGNUM)
f24bb080 12537 return POINTER_REGS;
43e9d192
IB
12538
12539 if (FP_REGNUM_P (regno))
163b1f6a
RS
12540 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12541 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
43e9d192 12542
43cacb12
RS
12543 if (PR_REGNUM_P (regno))
12544 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12545
183bfdaf
RS
12546 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12547 return FFR_REGS;
12548
43e9d192
IB
12549 return NO_REGS;
12550}
12551
6a70badb
RS
12552/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12553 If OFFSET is out of range, return an offset of an anchor point
12554 that is in range. Return 0 otherwise. */
12555
12556static HOST_WIDE_INT
12557aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12558 machine_mode mode)
12559{
12560 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12561 if (size > 16)
12562 return (offset + 0x400) & ~0x7f0;
12563
12564 /* For offsets that aren't a multiple of the access size, the limit is
12565 -256...255. */
12566 if (offset & (size - 1))
12567 {
12568 /* BLKmode typically uses LDP of X-registers. */
12569 if (mode == BLKmode)
12570 return (offset + 512) & ~0x3ff;
12571 return (offset + 0x100) & ~0x1ff;
12572 }
12573
12574 /* Small negative offsets are supported. */
12575 if (IN_RANGE (offset, -256, 0))
12576 return 0;
12577
0dc8e1e7 12578 if (mode == TImode || mode == TFmode || mode == TDmode)
6a70badb
RS
12579 return (offset + 0x100) & ~0x1ff;
12580
12581 /* Use 12-bit offset by access size. */
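  /* For an aligned 4-byte access, for example, this is OFFSET & ~0x3fff:
     the remaining displacement then fits the unsigned 12-bit offset scaled
     by the access size.  */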
12582 return offset & (~0xfff * size);
12583}
12584
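/* Try to legitimize address X for accesses of mode MODE by splitting a
   large constant offset into an anchor plus an in-range remainder, so that
   the anchor can be shared between accesses (see the comment in the body
   below).  */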
0c4ec427 12585static rtx
ef4bddc2 12586aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
0c4ec427
RE
12587{
12588 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12589 where mask is selected by alignment and size of the offset.
12590 We try to pick as large a range for the offset as possible to
12591 maximize the chance of a CSE. However, for aligned addresses
12592 we limit the range to 4k so that structures with different sized
e8426e0a
BC
12593 elements are likely to use the same base. We need to be careful
12594 not to split a CONST for some forms of address expression, otherwise
12595 it will generate sub-optimal code. */
0c4ec427
RE
12596
12597 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12598 {
9e0218fc 12599 rtx base = XEXP (x, 0);
17d7bdd8 12600 rtx offset_rtx = XEXP (x, 1);
9e0218fc 12601 HOST_WIDE_INT offset = INTVAL (offset_rtx);
0c4ec427 12602
9e0218fc 12603 if (GET_CODE (base) == PLUS)
e8426e0a 12604 {
9e0218fc
RH
12605 rtx op0 = XEXP (base, 0);
12606 rtx op1 = XEXP (base, 1);
12607
12608 /* Force any scaling into a temp for CSE. */
12609 op0 = force_reg (Pmode, op0);
12610 op1 = force_reg (Pmode, op1);
12611
12612 /* Let the pointer register be in op0. */
12613 if (REG_POINTER (op1))
12614 std::swap (op0, op1);
12615
12616 /* If the pointer is virtual or frame related, then we know that
12617 virtual register instantiation or register elimination is going
12618 to apply a second constant. We want the two constants folded
12619 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12620 if (virt_or_elim_regno_p (REGNO (op0)))
e8426e0a 12621 {
9e0218fc
RH
12622 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12623 NULL_RTX, true, OPTAB_DIRECT);
12624 return gen_rtx_PLUS (Pmode, base, op1);
e8426e0a 12625 }
e8426e0a 12626
9e0218fc
RH
12627 /* Otherwise, in order to encourage CSE (and thence loop strength
12628 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12629 base = expand_binop (Pmode, add_optab, op0, op1,
12630 NULL_RTX, true, OPTAB_DIRECT);
12631 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
e8426e0a
BC
12632 }
12633
6a70badb
RS
12634 HOST_WIDE_INT size;
12635 if (GET_MODE_SIZE (mode).is_constant (&size))
ff0f3f1c 12636 {
6a70badb
RS
12637 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12638 mode);
12639 if (base_offset != 0)
12640 {
12641 base = plus_constant (Pmode, base, base_offset);
12642 base = force_operand (base, NULL_RTX);
12643 return plus_constant (Pmode, base, offset - base_offset);
12644 }
9e0218fc 12645 }
0c4ec427
RE
12646 }
12647
12648 return x;
12649}
12650
43e9d192
IB
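/* Implement TARGET_SECONDARY_RELOAD.  Return the class of any intermediate
   register needed to copy X into a register of class RCLASS in MODE, or
   NO_REGS, possibly recording a special reload pattern in SRI->icode.  */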
12651static reg_class_t
12652aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12653 reg_class_t rclass,
ef4bddc2 12654 machine_mode mode,
43e9d192
IB
12655 secondary_reload_info *sri)
12656{
cc68f7c2
RS
12657 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12658 LDR and STR. See the comment at the head of aarch64-sve.md for
12659 more details about the big-endian handling. */
721c0fb3 12660 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
cc68f7c2 12661 if (reg_class_subset_p (rclass, FP_REGS)
9a1b9cb4
RS
12662 && !((REG_P (x) && HARD_REGISTER_P (x))
12663 || aarch64_simd_valid_immediate (x, NULL))
721c0fb3
RS
12664 && mode != VNx16QImode
12665 && (vec_flags & VEC_SVE_DATA)
12666 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
43cacb12 12667 {
721c0fb3
RS
12668 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12669 return NO_REGS;
43cacb12 12670 }
b4f50fd4
RR
12671
12672 /* If we have to disable direct literal pool loads and stores because the
12673 function is too big, then we need a scratch register. */
3793ecc1 12674 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
b4f50fd4
RR
12675 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12676 || targetm.vector_mode_supported_p (GET_MODE (x)))
9ee6540a 12677 && !aarch64_pcrelative_literal_loads)
b4f50fd4 12678 {
0016d8d9 12679 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
b4f50fd4
RR
12680 return NO_REGS;
12681 }
12682
43e9d192
IB
12683 /* Without the TARGET_SIMD instructions we cannot move a Q register
12684 to a Q register directly. We need a scratch. */
0dc8e1e7 12685 if (REG_P (x)
721c0fb3
RS
12686 && (mode == TFmode
12687 || mode == TImode
12688 || mode == TDmode
12689 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
0dc8e1e7 12690 && mode == GET_MODE (x)
721c0fb3
RS
12691 && !TARGET_SIMD
12692 && FP_REGNUM_P (REGNO (x))
43e9d192
IB
12693 && reg_class_subset_p (rclass, FP_REGS))
12694 {
0016d8d9 12695 sri->icode = code_for_aarch64_reload_mov (mode);
43e9d192
IB
12696 return NO_REGS;
12697 }
12698
0dc8e1e7 12699 /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS,
43e9d192
IB
12700 because AArch64 has richer addressing modes for LDR/STR instructions
12701 than LDP/STP instructions. */
d5726973 12702 if (TARGET_FLOAT && rclass == GENERAL_REGS
6a70badb 12703 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
43e9d192
IB
12704 return FP_REGS;
12705
0dc8e1e7
CL
12706 if (rclass == FP_REGS
12707 && (mode == TImode || mode == TFmode || mode == TDmode)
 12708 && CONSTANT_P (x))
a4a182c6 12709 return GENERAL_REGS;
43e9d192
IB
12710
12711 return NO_REGS;
12712}
12713
721c0fb3
RS
12714/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12715
12716static bool
12717aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12718 reg_class_t class2)
12719{
12720 if (!TARGET_SIMD
12721 && reg_classes_intersect_p (class1, FP_REGS)
12722 && reg_classes_intersect_p (class2, FP_REGS))
12723 {
12724 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12725 so we can't easily split a move involving tuples of 128-bit
12726 vectors. Force the copy through memory instead.
12727
12728 (Tuples of 64-bit vectors are fine.) */
12729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12730 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12731 return true;
12732 }
12733 return false;
12734}
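/* For example, without TARGET_SIMD a copy between FP registers whose mode
   is an Advanced SIMD structure of 128-bit vectors (such as V2x16QImode)
   is forced through memory by the hook above, whereas tuples of 64-bit
   vectors are not.  */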
12735
43e9d192 12736static bool
6216fd90 12737aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
43e9d192 12738{
6216fd90 12739 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
43e9d192 12740
6216fd90
WD
12741 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12742 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
43e9d192 12743 if (frame_pointer_needed)
6216fd90 12744 return to == HARD_FRAME_POINTER_REGNUM;
43e9d192
IB
12745 return true;
12746}
12747
6a70badb 12748poly_int64
43e9d192
IB
12749aarch64_initial_elimination_offset (unsigned from, unsigned to)
12750{
78c29983
MS
12751 if (to == HARD_FRAME_POINTER_REGNUM)
12752 {
12753 if (from == ARG_POINTER_REGNUM)
71bfb77a 12754 return cfun->machine->frame.hard_fp_offset;
78c29983
MS
12755
12756 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
12757 return cfun->machine->frame.hard_fp_offset
12758 - cfun->machine->frame.locals_offset;
78c29983
MS
12759 }
12760
12761 if (to == STACK_POINTER_REGNUM)
12762 {
12763 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
12764 return cfun->machine->frame.frame_size
12765 - cfun->machine->frame.locals_offset;
78c29983
MS
12766 }
12767
1c960e02 12768 return cfun->machine->frame.frame_size;
43e9d192
IB
12769}
12770
463a54e5
SN
12771
12772/* Get return address without mangling. */
12773
12774rtx
12775aarch64_return_addr_rtx (void)
12776{
12777 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12778 /* Note: aarch64_return_address_signing_enabled only
12779 works after cfun->machine->frame.laid_out is set,
12780 so here we don't know if the return address will
12781 be signed or not. */
12782 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12783 emit_move_insn (lr, val);
12784 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12785 return lr;
12786}
12787
12788
43e9d192
IB
12789/* Implement RETURN_ADDR_RTX. We do not support moving back to a
12790 previous frame. */
12791
12792rtx
12793aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12794{
12795 if (count != 0)
12796 return const0_rtx;
463a54e5 12797 return aarch64_return_addr_rtx ();
43e9d192
IB
12798}
12799
43e9d192
IB
12800static void
12801aarch64_asm_trampoline_template (FILE *f)
12802{
be7c41a5
OT
12803 /* Even if the current function doesn't have branch protection, some
12804 later function might, so since this template is only generated once
12805 we have to add a BTI just in case. */
12806 asm_fprintf (f, "\thint\t34 // bti c\n");
b5f794b4 12807
28514dda
YZ
12808 if (TARGET_ILP32)
12809 {
be178ecd
MM
12810 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12811 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
28514dda
YZ
12812 }
12813 else
12814 {
be178ecd
MM
12815 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12816 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
28514dda 12817 }
01a3a324 12818 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
b5f794b4 12819
be178ecd
MM
12820 /* We always emit a speculation barrier.
12821 This is because the same trampoline template is used for every nested
 12822 function. Since nested functions are not particularly common or
 12823 performance-critical, we don't worry too much about the extra
 12824 instructions that get copied around.
12825 This is not yet a problem, since we have not yet implemented function
12826 specific attributes to choose between hardening against straight line
12827 speculation or not, but such function specific attributes are likely to
12828 happen in the future. */
12829 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12830
28514dda
YZ
12831 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12832 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
43e9d192
IB
12833}
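/* The template above is six instructions (24 bytes): the BTI hint, two
   loads into IP1 and the static chain register, the indirect branch and
   the DSB/ISB speculation barrier, followed by two zero-initialized
   pointer-sized slots.  aarch64_trampoline_init below copies this block
   and then fills the two slots with the target function's address and
   the static chain value.  */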
12834
12835static void
12836aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12837{
12838 rtx fnaddr, mem, a_tramp;
be178ecd 12839 const int tramp_code_sz = 24;
43e9d192
IB
12840
12841 /* Don't need to copy the trailing D-words, we fill those in below. */
be178ecd
MM
12842 /* We create our own memory address in Pmode so that `emit_block_move` can
12843 use parts of the backend which expect Pmode addresses. */
12844 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12845 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12846 assemble_trampoline_template (),
28514dda
YZ
12847 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12848 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
43e9d192 12849 fnaddr = XEXP (DECL_RTL (fndecl), 0);
28514dda
YZ
12850 if (GET_MODE (fnaddr) != ptr_mode)
12851 fnaddr = convert_memory_address (ptr_mode, fnaddr);
43e9d192
IB
12852 emit_move_insn (mem, fnaddr);
12853
28514dda 12854 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
43e9d192
IB
12855 emit_move_insn (mem, chain_value);
12856
12857 /* XXX We should really define a "clear_cache" pattern and use
12858 gen_clear_cache(). */
12859 a_tramp = XEXP (m_tramp, 0);
c05ece92
AO
12860 maybe_emit_call_builtin___clear_cache (a_tramp,
12861 plus_constant (ptr_mode,
12862 a_tramp,
12863 TRAMPOLINE_SIZE));
43e9d192
IB
12864}
12865
12866static unsigned char
ef4bddc2 12867aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
43e9d192 12868{
6a70badb
RS
12869 /* ??? Logically we should only need to provide a value when
12870 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12871 can hold MODE, but at the moment we need to handle all modes.
12872 Just ignore any runtime parts for registers that can't store them. */
12873 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
550a3380 12874 unsigned int nregs, vec_flags;
43e9d192
IB
12875 switch (regclass)
12876 {
96b7f495 12877 case STUB_REGS:
d677263e 12878 case TAILCALL_ADDR_REGS:
43e9d192
IB
12879 case POINTER_REGS:
12880 case GENERAL_REGS:
12881 case ALL_REGS:
f25a140b 12882 case POINTER_AND_FP_REGS:
43e9d192
IB
12883 case FP_REGS:
12884 case FP_LO_REGS:
163b1f6a 12885 case FP_LO8_REGS:
550a3380
RS
12886 vec_flags = aarch64_classify_vector_mode (mode);
12887 if ((vec_flags & VEC_SVE_DATA)
43cacb12 12888 && constant_multiple_p (GET_MODE_SIZE (mode),
550a3380 12889 aarch64_vl_bytes (mode, vec_flags), &nregs))
43cacb12 12890 return nregs;
550a3380 12891 return (vec_flags & VEC_ADVSIMD
6a70badb
RS
12892 ? CEIL (lowest_size, UNITS_PER_VREG)
12893 : CEIL (lowest_size, UNITS_PER_WORD));
43e9d192 12894 case STACK_REG:
43cacb12
RS
12895 case PR_REGS:
12896 case PR_LO_REGS:
12897 case PR_HI_REGS:
183bfdaf
RS
12898 case FFR_REGS:
12899 case PR_AND_FFR_REGS:
43e9d192
IB
12900 return 1;
12901
12902 case NO_REGS:
12903 return 0;
12904
12905 default:
12906 break;
12907 }
12908 gcc_unreachable ();
12909}
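/* For example, on the usual configuration (UNITS_PER_WORD == 8,
   UNITS_PER_VREG == 16) a 128-bit Advanced SIMD vector such as V4SImode
   needs one register (CEIL (16, 16)), TImode needs two (CEIL (16, 8)),
   and every predicate or FFR class reports a single register.  */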
12910
12911static reg_class_t
78d8b9f0 12912aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
43e9d192 12913{
51bb310d 12914 if (regclass == POINTER_REGS)
78d8b9f0
IB
12915 return GENERAL_REGS;
12916
51bb310d
MS
12917 if (regclass == STACK_REG)
12918 {
 12919 if (REG_P (x)
12920 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12921 return regclass;
12922
12923 return NO_REGS;
12924 }
12925
27bd251b
IB
 12926 /* Register elimination can result in a request for
 12927 SP+constant->FP_REGS. We cannot support such operations, which
 12928 use SP as the source and an FP_REG as the destination, so reject
 12929 them outright here. */
12930 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12931 {
12932 rtx lhs = XEXP (x, 0);
12933
12934 /* Look through a possible SUBREG introduced by ILP32. */
3793ecc1 12935 if (SUBREG_P (lhs))
27bd251b
IB
12936 lhs = SUBREG_REG (lhs);
12937
12938 gcc_assert (REG_P (lhs));
12939 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12940 POINTER_REGS));
12941 return NO_REGS;
12942 }
12943
78d8b9f0 12944 return regclass;
43e9d192
IB
12945}
12946
12947void
12948aarch64_asm_output_labelref (FILE* f, const char *name)
12949{
12950 asm_fprintf (f, "%U%s", name);
12951}
12952
12953static void
12954aarch64_elf_asm_constructor (rtx symbol, int priority)
12955{
12956 if (priority == DEFAULT_INIT_PRIORITY)
12957 default_ctor_section_asm_out_constructor (symbol, priority);
12958 else
12959 {
12960 section *s;
53d190c1
AT
 12961 /* Although priority is known to be in the range [0, 65535], and so
 12962 18 bytes would be enough, the compiler might not know that. To avoid
 12963 a -Wformat-truncation false positive, use a larger size. */
12964 char buf[23];
43e9d192 12965 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
fcef3abd 12966 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
12967 switch_to_section (s);
12968 assemble_align (POINTER_SIZE);
28514dda 12969 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
12970 }
12971}
12972
12973static void
12974aarch64_elf_asm_destructor (rtx symbol, int priority)
12975{
12976 if (priority == DEFAULT_INIT_PRIORITY)
12977 default_dtor_section_asm_out_destructor (symbol, priority);
12978 else
12979 {
12980 section *s;
53d190c1
AT
 12981 /* Although priority is known to be in the range [0, 65535], and so
 12982 18 bytes would be enough, the compiler might not know that. To avoid
 12983 a -Wformat-truncation false positive, use a larger size. */
12984 char buf[23];
43e9d192 12985 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
fcef3abd 12986 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
12987 switch_to_section (s);
12988 assemble_align (POINTER_SIZE);
28514dda 12989 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
12990 }
12991}
12992
12993const char*
12994aarch64_output_casesi (rtx *operands)
12995{
12996 char buf[100];
12997 char label[100];
b32d5189 12998 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
43e9d192
IB
12999 int index;
13000 static const char *const patterns[4][2] =
13001 {
13002 {
13003 "ldrb\t%w3, [%0,%w1,uxtw]",
13004 "add\t%3, %4, %w3, sxtb #2"
13005 },
13006 {
13007 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13008 "add\t%3, %4, %w3, sxth #2"
13009 },
13010 {
13011 "ldr\t%w3, [%0,%w1,uxtw #2]",
13012 "add\t%3, %4, %w3, sxtw #2"
13013 },
13014 /* We assume that DImode is only generated when not optimizing and
13015 that we don't really need 64-bit address offsets. That would
13016 imply an object file with 8GB of code in a single function! */
13017 {
13018 "ldr\t%w3, [%0,%w1,uxtw #2]",
13019 "add\t%3, %4, %w3, sxtw #2"
13020 }
13021 };
13022
13023 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13024
77e994c9
RS
13025 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13026 index = exact_log2 (GET_MODE_SIZE (mode));
43e9d192
IB
13027
13028 gcc_assert (index >= 0 && index <= 3);
13029
 13030 /* Need to implement table size reduction, by changing the code below. */
13031 output_asm_insn (patterns[index][0], operands);
13032 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13033 snprintf (buf, sizeof (buf),
13034 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13035 output_asm_insn (buf, operands);
13036 output_asm_insn (patterns[index][1], operands);
13037 output_asm_insn ("br\t%3", operands);
be178ecd
MM
13038 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13039 operands);
43e9d192
IB
13040 assemble_label (asm_out_file, label);
13041 return "";
13042}
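/* For a 16-bit dispatch table the sequence emitted above is roughly
   (with x0/w1/x3/x4 standing in for operands 0, 1, 3 and 4):

   ldrh w3, [x0, w1, uxtw #1]
   adr  x4, .Lrtx<N>
   add  x3, x4, w3, sxth #2
   br   x3

   plus a speculation barrier when SLS hardening of BR/RET instructions
   is enabled, with the .Lrtx<N> label emitted immediately afterwards.  */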
13043
13044
13045/* Return size in bits of an arithmetic operand which is shifted/scaled and
13046 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13047 operator. */
13048
13049int
13050aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13051{
13052 if (shift >= 0 && shift <= 3)
13053 {
13054 int size;
13055 for (size = 8; size <= 32; size *= 2)
13056 {
13057 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13058 if (mask == bits << shift)
13059 return size;
13060 }
13061 }
13062 return 0;
13063}
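/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1 (a byte mask shifted left by one, i.e. a UXTB
   extend), while aarch64_uxt_size (0, 0x1ff) returns 0 because 0x1ff
   is not a UXTB, UXTH or UXTW mask.  */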
13064
e78d485e
RR
13065/* Constant pools are per-function only when PC-relative
13066 literal loads are in use or we are using the large code
13067 model. */
13068
13069static inline bool
13070aarch64_can_use_per_function_literal_pools_p (void)
13071{
9ee6540a 13072 return (aarch64_pcrelative_literal_loads
e78d485e
RR
13073 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13074}
13075
43e9d192 13076static bool
e78d485e 13077aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
43e9d192 13078{
74a9301d
VM
13079 /* We can't use blocks for constants when we're using a per-function
13080 constant pool. */
13081 return !aarch64_can_use_per_function_literal_pools_p ();
43e9d192
IB
13082}
13083
e78d485e
RR
13084/* Select appropriate section for constants depending
13085 on where we place literal pools. */
13086
43e9d192 13087static section *
e78d485e
RR
13088aarch64_select_rtx_section (machine_mode mode,
13089 rtx x,
13090 unsigned HOST_WIDE_INT align)
43e9d192 13091{
e78d485e
RR
13092 if (aarch64_can_use_per_function_literal_pools_p ())
13093 return function_section (current_function_decl);
43e9d192 13094
e78d485e
RR
13095 return default_elf_select_rtx_section (mode, x, align);
13096}
43e9d192 13097
5fca7b66
RH
13098/* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13099void
13100aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13101 HOST_WIDE_INT offset)
13102{
13103 /* When using per-function literal pools, we must ensure that any code
13104 section is aligned to the minimal instruction length, lest we get
13105 errors from the assembler re "unaligned instructions". */
13106 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13107 ASM_OUTPUT_ALIGN (f, 2);
13108}
13109
43e9d192
IB
13110/* Costs. */
13111
13112/* Helper function for rtx cost calculation. Strip a shift expression
13113 from X. Returns the inner operand if successful, or the original
13114 expression on failure. */
13115static rtx
13116aarch64_strip_shift (rtx x)
13117{
13118 rtx op = x;
13119
57b77d46
RE
13120 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13121 we can convert both to ROR during final output. */
43e9d192
IB
13122 if ((GET_CODE (op) == ASHIFT
13123 || GET_CODE (op) == ASHIFTRT
57b77d46
RE
13124 || GET_CODE (op) == LSHIFTRT
13125 || GET_CODE (op) == ROTATERT
13126 || GET_CODE (op) == ROTATE)
43e9d192
IB
13127 && CONST_INT_P (XEXP (op, 1)))
13128 return XEXP (op, 0);
13129
13130 if (GET_CODE (op) == MULT
13131 && CONST_INT_P (XEXP (op, 1))
13132 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13133 return XEXP (op, 0);
13134
13135 return x;
13136}
13137
4745e701 13138/* Helper function for rtx cost calculation. Strip an extend
43e9d192
IB
13139 expression from X. Returns the inner operand if successful, or the
13140 original expression on failure. We deal with a number of possible
b10f1009
AP
13141 canonicalization variations here. If STRIP_SHIFT is true, then
13142 we can strip off a shift also. */
43e9d192 13143static rtx
b10f1009 13144aarch64_strip_extend (rtx x, bool strip_shift)
43e9d192 13145{
77e994c9 13146 scalar_int_mode mode;
43e9d192
IB
13147 rtx op = x;
13148
77e994c9
RS
13149 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13150 return op;
13151
43e9d192
IB
13152 if (GET_CODE (op) == AND
13153 && GET_CODE (XEXP (op, 0)) == MULT
13154 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13155 && CONST_INT_P (XEXP (op, 1))
13156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13157 INTVAL (XEXP (op, 1))) != 0)
13158 return XEXP (XEXP (op, 0), 0);
13159
13160 /* Now handle extended register, as this may also have an optional
13161 left shift by 1..4. */
b10f1009
AP
13162 if (strip_shift
13163 && GET_CODE (op) == ASHIFT
43e9d192
IB
13164 && CONST_INT_P (XEXP (op, 1))
13165 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13166 op = XEXP (op, 0);
13167
13168 if (GET_CODE (op) == ZERO_EXTEND
13169 || GET_CODE (op) == SIGN_EXTEND)
13170 op = XEXP (op, 0);
13171
13172 if (op != x)
13173 return op;
13174
4745e701
JG
13175 return x;
13176}
13177
63834c84
JW
13178/* Helper function for rtx cost calculation. Strip extension as well as any
13179 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13180 successful, or the original expression on failure. */
13181static rtx
13182aarch64_strip_extend_vec_half (rtx x)
13183{
13184 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13185 {
13186 x = XEXP (x, 0);
13187 if (GET_CODE (x) == VEC_SELECT
13188 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13189 XEXP (x, 1)))
13190 x = XEXP (x, 0);
13191 }
13192 return x;
13193}
1d65c9d2
JW
13194
13195/* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13196 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13197 operand if successful, or the original expression on failure. */
13198static rtx
13199aarch64_strip_duplicate_vec_elt (rtx x)
13200{
13201 if (GET_CODE (x) == VEC_DUPLICATE
13202 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13203 {
13204 x = XEXP (x, 0);
13205 if (GET_CODE (x) == VEC_SELECT)
13206 x = XEXP (x, 0);
13207 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13208 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13209 x = XEXP (XEXP (x, 0), 0);
13210 }
13211 return x;
13212}
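/* For example, aarch64_strip_duplicate_vec_elt reduces
   (vec_duplicate:V4SI (vec_select:SI (reg:V4SI v) (parallel [1])))
   to (reg:V4SI v), so that the by-element form is costed like the
   plain three-operand form.  */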
13213
0a78ebe4
KT
13214/* Return true iff CODE is a shift supported in combination
13215 with arithmetic instructions. */
4d1919ed 13216
0a78ebe4
KT
13217static bool
13218aarch64_shift_p (enum rtx_code code)
13219{
13220 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13221}
13222
b10f1009
AP
13223
13224/* Return true iff X is a cheap shift without a sign extend. */
13225
13226static bool
13227aarch64_cheap_mult_shift_p (rtx x)
13228{
13229 rtx op0, op1;
13230
13231 op0 = XEXP (x, 0);
13232 op1 = XEXP (x, 1);
13233
13234 if (!(aarch64_tune_params.extra_tuning_flags
13235 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13236 return false;
13237
13238 if (GET_CODE (op0) == SIGN_EXTEND)
13239 return false;
13240
13241 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13242 && UINTVAL (op1) <= 4)
13243 return true;
13244
13245 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13246 return false;
13247
13248 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13249
13250 if (l2 > 0 && l2 <= 4)
13251 return true;
13252
13253 return false;
13254}
13255
4745e701 13256/* Helper function for rtx cost calculation. Calculate the cost of
0a78ebe4
KT
13257 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13258 Return the calculated cost of the expression, recursing manually in to
4745e701
JG
13259 operands where needed. */
13260
13261static int
e548c9df 13262aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
4745e701
JG
13263{
13264 rtx op0, op1;
13265 const struct cpu_cost_table *extra_cost
b175b679 13266 = aarch64_tune_params.insn_extra_cost;
4745e701 13267 int cost = 0;
0a78ebe4 13268 bool compound_p = (outer == PLUS || outer == MINUS);
ef4bddc2 13269 machine_mode mode = GET_MODE (x);
4745e701
JG
13270
13271 gcc_checking_assert (code == MULT);
13272
13273 op0 = XEXP (x, 0);
13274 op1 = XEXP (x, 1);
13275
13276 if (VECTOR_MODE_P (mode))
df81764b
TC
13277 {
13278 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
721c0fb3 13279 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
df81764b 13280 {
63834c84
JW
13281 /* The select-operand-high-half versions of the instruction have the
13282 same cost as the three vector version - don't add the costs of the
13283 extension or selection into the costs of the multiply. */
13284 op0 = aarch64_strip_extend_vec_half (op0);
13285 op1 = aarch64_strip_extend_vec_half (op1);
df81764b 13286 /* The by-element versions of the instruction have the same costs as
1d65c9d2
JW
13287 the normal 3-vector version. We make an assumption that the input
13288 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13289 costing of a MUL by element pre RA is a bit optimistic. */
13290 op0 = aarch64_strip_duplicate_vec_elt (op0);
13291 op1 = aarch64_strip_duplicate_vec_elt (op1);
df81764b 13292 }
a11ef532
AV
13293 cost += rtx_cost (op0, mode, MULT, 0, speed);
13294 cost += rtx_cost (op1, mode, MULT, 1, speed);
13295 if (speed)
13296 {
13297 if (GET_CODE (x) == MULT)
13298 cost += extra_cost->vect.mult;
13299 /* This is to catch the SSRA costing currently flowing here. */
13300 else
13301 cost += extra_cost->vect.alu;
13302 }
13303 return cost;
df81764b 13304 }
4745e701
JG
13305
13306 /* Integer multiply/fma. */
13307 if (GET_MODE_CLASS (mode) == MODE_INT)
13308 {
13309 /* The multiply will be canonicalized as a shift, cost it as such. */
0a78ebe4
KT
13310 if (aarch64_shift_p (GET_CODE (x))
13311 || (CONST_INT_P (op1)
13312 && exact_log2 (INTVAL (op1)) > 0))
4745e701 13313 {
0a78ebe4
KT
13314 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13315 || GET_CODE (op0) == SIGN_EXTEND;
4745e701
JG
13316 if (speed)
13317 {
0a78ebe4
KT
13318 if (compound_p)
13319 {
b10f1009
AP
13320 /* If the shift is considered cheap,
13321 then don't add any cost. */
13322 if (aarch64_cheap_mult_shift_p (x))
13323 ;
13324 else if (REG_P (op1))
0a78ebe4
KT
13325 /* ARITH + shift-by-register. */
13326 cost += extra_cost->alu.arith_shift_reg;
13327 else if (is_extend)
13328 /* ARITH + extended register. We don't have a cost field
13329 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13330 cost += extra_cost->alu.extend_arith;
13331 else
13332 /* ARITH + shift-by-immediate. */
13333 cost += extra_cost->alu.arith_shift;
13334 }
4745e701
JG
13335 else
13336 /* LSL (immediate). */
0a78ebe4
KT
13337 cost += extra_cost->alu.shift;
13338
4745e701 13339 }
0a78ebe4
KT
13340 /* Strip extends as we will have costed them in the case above. */
13341 if (is_extend)
b10f1009 13342 op0 = aarch64_strip_extend (op0, true);
4745e701 13343
e548c9df 13344 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
4745e701
JG
13345
13346 return cost;
13347 }
13348
d2ac256b
KT
13349 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13350 compound and let the below cases handle it. After all, MNEG is a
13351 special-case alias of MSUB. */
13352 if (GET_CODE (op0) == NEG)
13353 {
13354 op0 = XEXP (op0, 0);
13355 compound_p = true;
13356 }
13357
4745e701
JG
13358 /* Integer multiplies or FMAs have zero/sign extending variants. */
13359 if ((GET_CODE (op0) == ZERO_EXTEND
13360 && GET_CODE (op1) == ZERO_EXTEND)
13361 || (GET_CODE (op0) == SIGN_EXTEND
13362 && GET_CODE (op1) == SIGN_EXTEND))
13363 {
e548c9df
AM
13364 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13365 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
4745e701
JG
13366
13367 if (speed)
13368 {
0a78ebe4 13369 if (compound_p)
d2ac256b 13370 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
4745e701
JG
13371 cost += extra_cost->mult[0].extend_add;
13372 else
13373 /* MUL/SMULL/UMULL. */
13374 cost += extra_cost->mult[0].extend;
13375 }
13376
13377 return cost;
13378 }
13379
d2ac256b 13380 /* This is either an integer multiply or a MADD. In both cases
4745e701 13381 we want to recurse and cost the operands. */
e548c9df
AM
13382 cost += rtx_cost (op0, mode, MULT, 0, speed);
13383 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
13384
13385 if (speed)
13386 {
0a78ebe4 13387 if (compound_p)
d2ac256b 13388 /* MADD/MSUB. */
4745e701
JG
13389 cost += extra_cost->mult[mode == DImode].add;
13390 else
13391 /* MUL. */
13392 cost += extra_cost->mult[mode == DImode].simple;
13393 }
13394
13395 return cost;
13396 }
13397 else
13398 {
13399 if (speed)
13400 {
3d840f7d 13401 /* Floating-point FMA/FMUL can also support negations of the
d318517d
SN
13402 operands, unless the rounding mode is upward or downward in
13403 which case FNMUL is different than FMUL with operand negation. */
13404 bool neg0 = GET_CODE (op0) == NEG;
13405 bool neg1 = GET_CODE (op1) == NEG;
13406 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13407 {
13408 if (neg0)
13409 op0 = XEXP (op0, 0);
13410 if (neg1)
13411 op1 = XEXP (op1, 0);
13412 }
4745e701 13413
0a78ebe4 13414 if (compound_p)
4745e701
JG
13415 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13416 cost += extra_cost->fp[mode == DFmode].fma;
13417 else
3d840f7d 13418 /* FMUL/FNMUL. */
4745e701
JG
13419 cost += extra_cost->fp[mode == DFmode].mult;
13420 }
13421
e548c9df
AM
13422 cost += rtx_cost (op0, mode, MULT, 0, speed);
13423 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
13424 return cost;
13425 }
43e9d192
IB
13426}
13427
67747367
JG
13428static int
13429aarch64_address_cost (rtx x,
ef4bddc2 13430 machine_mode mode,
67747367
JG
13431 addr_space_t as ATTRIBUTE_UNUSED,
13432 bool speed)
13433{
13434 enum rtx_code c = GET_CODE (x);
b175b679 13435 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
67747367
JG
13436 struct aarch64_address_info info;
13437 int cost = 0;
13438 info.shift = 0;
13439
a97d8b98 13440 if (!aarch64_classify_address (&info, x, mode, false))
67747367 13441 {
3793ecc1 13442 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
67747367
JG
13443 {
13444 /* This is a CONST or SYMBOL ref which will be split
13445 in a different way depending on the code model in use.
13446 Cost it through the generic infrastructure. */
e548c9df 13447 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
67747367
JG
13448 /* Divide through by the cost of one instruction to
13449 bring it to the same units as the address costs. */
13450 cost_symbol_ref /= COSTS_N_INSNS (1);
13451 /* The cost is then the cost of preparing the address,
13452 followed by an immediate (possibly 0) offset. */
13453 return cost_symbol_ref + addr_cost->imm_offset;
13454 }
13455 else
13456 {
13457 /* This is most likely a jump table from a case
13458 statement. */
13459 return addr_cost->register_offset;
13460 }
13461 }
13462
13463 switch (info.type)
13464 {
13465 case ADDRESS_LO_SUM:
13466 case ADDRESS_SYMBOLIC:
13467 case ADDRESS_REG_IMM:
13468 cost += addr_cost->imm_offset;
13469 break;
13470
13471 case ADDRESS_REG_WB:
13472 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13473 cost += addr_cost->pre_modify;
13474 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6b8b0c8e 13475 {
05783fe6
RS
13476 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13477 if (nvectors == 3)
6b8b0c8e 13478 cost += addr_cost->post_modify_ld3_st3;
05783fe6 13479 else if (nvectors == 4)
6b8b0c8e
RS
13480 cost += addr_cost->post_modify_ld4_st4;
13481 else
13482 cost += addr_cost->post_modify;
13483 }
67747367
JG
13484 else
13485 gcc_unreachable ();
13486
13487 break;
13488
13489 case ADDRESS_REG_REG:
13490 cost += addr_cost->register_offset;
13491 break;
13492
67747367 13493 case ADDRESS_REG_SXTW:
783879e6
EM
13494 cost += addr_cost->register_sextend;
13495 break;
13496
13497 case ADDRESS_REG_UXTW:
13498 cost += addr_cost->register_zextend;
67747367
JG
13499 break;
13500
13501 default:
13502 gcc_unreachable ();
13503 }
13504
13505
13506 if (info.shift > 0)
13507 {
13508 /* For the sake of calculating the cost of the shifted register
13509 component, we can treat same sized modes in the same way. */
6a70badb
RS
13510 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13511 cost += addr_cost->addr_scale_costs.hi;
13512 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13513 cost += addr_cost->addr_scale_costs.si;
13514 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13515 cost += addr_cost->addr_scale_costs.di;
13516 else
13517 /* We can't tell, or this is a 128-bit vector. */
13518 cost += addr_cost->addr_scale_costs.ti;
67747367
JG
13519 }
13520
13521 return cost;
13522}
13523
b9066f5a
MW
13524/* Return the cost of a branch. If SPEED_P is true then the compiler is
13525 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13526 to be taken. */
13527
13528int
13529aarch64_branch_cost (bool speed_p, bool predictable_p)
13530{
13531 /* When optimizing for speed, use the cost of unpredictable branches. */
13532 const struct cpu_branch_cost *branch_costs =
b175b679 13533 aarch64_tune_params.branch_costs;
b9066f5a
MW
13534
13535 if (!speed_p || predictable_p)
13536 return branch_costs->predictable;
13537 else
13538 return branch_costs->unpredictable;
13539}
13540
7de23b8c 13541/* Return true if X is a zero or sign extract
7cc2145f
JG
13542 usable in an ADD or SUB (extended register) instruction. */
13543static bool
7de23b8c 13544aarch64_rtx_arith_op_extract_p (rtx x)
7cc2145f 13545{
e47c4031
KT
13546 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13547 No shift. */
7de23b8c
AC
13548 if (GET_CODE (x) == SIGN_EXTEND
13549 || GET_CODE (x) == ZERO_EXTEND)
e47c4031 13550 return REG_P (XEXP (x, 0));
7cc2145f
JG
13551
13552 return false;
13553}
13554
61263118
KT
13555static bool
13556aarch64_frint_unspec_p (unsigned int u)
13557{
13558 switch (u)
13559 {
13560 case UNSPEC_FRINTZ:
13561 case UNSPEC_FRINTP:
13562 case UNSPEC_FRINTM:
13563 case UNSPEC_FRINTA:
13564 case UNSPEC_FRINTN:
13565 case UNSPEC_FRINTX:
13566 case UNSPEC_FRINTI:
13567 return true;
13568
13569 default:
13570 return false;
13571 }
13572}
13573
fb0cb7fa
KT
13574/* Return true iff X is an rtx that will match an extr instruction
13575 i.e. as described in the *extr<mode>5_insn family of patterns.
13576 OP0 and OP1 will be set to the operands of the shifts involved
13577 on success and will be NULL_RTX otherwise. */
13578
13579static bool
13580aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13581{
13582 rtx op0, op1;
77e994c9
RS
13583 scalar_int_mode mode;
13584 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13585 return false;
fb0cb7fa
KT
13586
13587 *res_op0 = NULL_RTX;
13588 *res_op1 = NULL_RTX;
13589
13590 if (GET_CODE (x) != IOR)
13591 return false;
13592
13593 op0 = XEXP (x, 0);
13594 op1 = XEXP (x, 1);
13595
13596 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13597 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13598 {
13599 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13600 if (GET_CODE (op1) == ASHIFT)
13601 std::swap (op0, op1);
13602
13603 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13604 return false;
13605
13606 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13607 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13608
13609 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13610 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13611 {
13612 *res_op0 = XEXP (op0, 0);
13613 *res_op1 = XEXP (op1, 0);
13614 return true;
13615 }
13616 }
13617
13618 return false;
13619}
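/* For example, in SImode
   (ior (ashift (reg x) (const_int 10)) (lshiftrt (reg y) (const_int 22)))
   matches, since 10 + 22 == GET_MODE_BITSIZE (SImode); *res_op0 is set
   to x and *res_op1 to y, and the whole expression can be emitted as a
   single EXTR instruction.  */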
13620
2d5ffe46
AP
13621/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13622 storing it in *COST. Result is true if the total cost of the operation
13623 has now been calculated. */
13624static bool
13625aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13626{
b9e3afe9
AP
13627 rtx inner;
13628 rtx comparator;
13629 enum rtx_code cmpcode;
e2a14bec
RS
13630 const struct cpu_cost_table *extra_cost
13631 = aarch64_tune_params.insn_extra_cost;
b9e3afe9
AP
13632
13633 if (COMPARISON_P (op0))
13634 {
13635 inner = XEXP (op0, 0);
13636 comparator = XEXP (op0, 1);
13637 cmpcode = GET_CODE (op0);
13638 }
13639 else
13640 {
13641 inner = op0;
13642 comparator = const0_rtx;
13643 cmpcode = NE;
13644 }
13645
2d5ffe46
AP
13646 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13647 {
13648 /* Conditional branch. */
b9e3afe9 13649 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46
AP
13650 return true;
13651 else
13652 {
b9e3afe9 13653 if (cmpcode == NE || cmpcode == EQ)
2d5ffe46 13654 {
2d5ffe46
AP
13655 if (comparator == const0_rtx)
13656 {
13657 /* TBZ/TBNZ/CBZ/CBNZ. */
13658 if (GET_CODE (inner) == ZERO_EXTRACT)
13659 /* TBZ/TBNZ. */
e548c9df
AM
13660 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13661 ZERO_EXTRACT, 0, speed);
13662 else
13663 /* CBZ/CBNZ. */
13664 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
2d5ffe46 13665
e2a14bec
RS
13666 return true;
13667 }
13668 if (register_operand (inner, VOIDmode)
13669 && aarch64_imm24 (comparator, VOIDmode))
13670 {
13671 /* SUB and SUBS. */
13672 *cost += COSTS_N_INSNS (2);
13673 if (speed)
13674 *cost += extra_cost->alu.arith * 2;
13675 return true;
13676 }
2d5ffe46 13677 }
b9e3afe9 13678 else if (cmpcode == LT || cmpcode == GE)
2d5ffe46 13679 {
2d5ffe46
AP
13680 /* TBZ/TBNZ. */
13681 if (comparator == const0_rtx)
13682 return true;
13683 }
13684 }
13685 }
b9e3afe9 13686 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46 13687 {
786298dc 13688 /* CCMP. */
6dfeb7ce 13689 if (GET_CODE (op1) == COMPARE)
786298dc
WD
13690 {
13691 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13692 if (XEXP (op1, 1) == const0_rtx)
13693 *cost += 1;
13694 if (speed)
13695 {
13696 machine_mode mode = GET_MODE (XEXP (op1, 0));
786298dc
WD
13697
13698 if (GET_MODE_CLASS (mode) == MODE_INT)
13699 *cost += extra_cost->alu.arith;
13700 else
13701 *cost += extra_cost->fp[mode == DFmode].compare;
13702 }
13703 return true;
13704 }
13705
2d5ffe46
AP
13706 /* It's a conditional operation based on the status flags,
13707 so it must be some flavor of CSEL. */
13708
13709 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13710 if (GET_CODE (op1) == NEG
13711 || GET_CODE (op1) == NOT
13712 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13713 op1 = XEXP (op1, 0);
bad00732
KT
13714 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13715 {
13716 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13717 op1 = XEXP (op1, 0);
13718 op2 = XEXP (op2, 0);
13719 }
d572ad49
AC
13720 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13721 {
13722 inner = XEXP (op1, 0);
13723 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13724 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13725 op1 = XEXP (inner, 0);
13726 }
2d5ffe46 13727
e548c9df
AM
13728 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13729 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
2d5ffe46
AP
13730 return true;
13731 }
13732
13733 /* We don't know what this is, cost all operands. */
13734 return false;
13735}
13736
283b6c85
KT
13737/* Check whether X is a bitfield operation of the form shift + extend that
13738 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13739 operand to which the bitfield operation is applied. Otherwise return
13740 NULL_RTX. */
13741
13742static rtx
13743aarch64_extend_bitfield_pattern_p (rtx x)
13744{
13745 rtx_code outer_code = GET_CODE (x);
13746 machine_mode outer_mode = GET_MODE (x);
13747
13748 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13749 && outer_mode != SImode && outer_mode != DImode)
13750 return NULL_RTX;
13751
13752 rtx inner = XEXP (x, 0);
13753 rtx_code inner_code = GET_CODE (inner);
13754 machine_mode inner_mode = GET_MODE (inner);
13755 rtx op = NULL_RTX;
13756
13757 switch (inner_code)
13758 {
13759 case ASHIFT:
13760 if (CONST_INT_P (XEXP (inner, 1))
13761 && (inner_mode == QImode || inner_mode == HImode))
13762 op = XEXP (inner, 0);
13763 break;
13764 case LSHIFTRT:
13765 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13766 && (inner_mode == QImode || inner_mode == HImode))
13767 op = XEXP (inner, 0);
13768 break;
13769 case ASHIFTRT:
13770 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13771 && (inner_mode == QImode || inner_mode == HImode))
13772 op = XEXP (inner, 0);
13773 break;
13774 default:
13775 break;
13776 }
13777
13778 return op;
13779}
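/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   returns x (the combination is a single UBFX), and
   (sign_extend:SI (ashift:HI (reg:HI x) (const_int 3))) also returns x
   (an SBFIZ).  */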
13780
8c83f71d
KT
13781/* Return true if the mask and a shift amount from an RTX of the form
13782 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13783 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13784
13785bool
77e994c9
RS
13786aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13787 rtx shft_amnt)
8c83f71d
KT
13788{
13789 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
17ad8cde
JJ
13790 && INTVAL (mask) > 0
13791 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13792 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13793 && (UINTVAL (mask)
13794 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
8c83f71d
KT
13795}
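/* For example, with mode == SImode, mask == 0xff0 and shft_amnt == 4 the
   function returns true: (x << 4) & 0xff0 inserts the low eight bits of
   x at bit position 4, which is UBFIZ wd, wn, #4, #8.  */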
13796
6a0d3939
SE
13797/* Return true if the masks and a shift amount from an RTX of the form
13798 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13799 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
13800
13801bool
13802aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13803 unsigned HOST_WIDE_INT mask1,
13804 unsigned HOST_WIDE_INT shft_amnt,
13805 unsigned HOST_WIDE_INT mask2)
13806{
13807 unsigned HOST_WIDE_INT t;
13808
13809 /* Verify that there is no overlap in what bits are set in the two masks. */
13810 if (mask1 != ~mask2)
13811 return false;
13812
13813 /* Verify that mask2 is not all zeros or ones. */
13814 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13815 return false;
13816
13817 /* The shift amount should always be less than the mode size. */
13818 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13819
13820 /* Verify that the mask being shifted is contiguous and would be in the
13821 least significant bits after shifting by shft_amnt. */
13822 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13823 return (t == (t & -t));
13824}
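/* For example, mask2 == 0xff00, shft_amnt == 8 and mask1 == ~0xff00
   pass all of the checks above (t == 0x10000, a power of two); the
   corresponding expression (x & ~0xff00) | ((y << 8) & 0xff00) is a
   single BFI of an 8-bit field at bit 8.  */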
13825
43e9d192
IB
13826/* Calculate the cost of calculating X, storing it in *COST. Result
13827 is true if the total cost of the operation has now been calculated. */
13828static bool
e548c9df 13829aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
43e9d192
IB
13830 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13831{
a8eecd00 13832 rtx op0, op1, op2;
73250c4c 13833 const struct cpu_cost_table *extra_cost
b175b679 13834 = aarch64_tune_params.insn_extra_cost;
1d5c43db 13835 rtx_code code = GET_CODE (x);
b4206259 13836 scalar_int_mode int_mode;
43e9d192 13837
7fc5ef02
JG
13838 /* By default, assume that everything has equivalent cost to the
13839 cheapest instruction. Any additional costs are applied as a delta
13840 above this default. */
13841 *cost = COSTS_N_INSNS (1);
13842
43e9d192
IB
13843 switch (code)
13844 {
13845 case SET:
ba123b0d
JG
13846 /* The cost depends entirely on the operands to SET. */
13847 *cost = 0;
43e9d192
IB
13848 op0 = SET_DEST (x);
13849 op1 = SET_SRC (x);
13850
13851 switch (GET_CODE (op0))
13852 {
13853 case MEM:
13854 if (speed)
2961177e
JG
13855 {
13856 rtx address = XEXP (op0, 0);
b6875aac
KV
13857 if (VECTOR_MODE_P (mode))
13858 *cost += extra_cost->ldst.storev;
13859 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e 13860 *cost += extra_cost->ldst.store;
0dc8e1e7 13861 else if (mode == SFmode || mode == SDmode)
2961177e 13862 *cost += extra_cost->ldst.storef;
0dc8e1e7 13863 else if (mode == DFmode || mode == DDmode)
2961177e
JG
13864 *cost += extra_cost->ldst.stored;
13865
13866 *cost +=
13867 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13868 0, speed));
13869 }
43e9d192 13870
e548c9df 13871 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
13872 return true;
13873
13874 case SUBREG:
13875 if (! REG_P (SUBREG_REG (op0)))
e548c9df 13876 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
ba123b0d 13877
43e9d192
IB
13878 /* Fall through. */
13879 case REG:
b6875aac
KV
13880 /* The cost is one per vector-register copied. */
13881 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13882 {
fe1447a1
RS
13883 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13884 *cost = COSTS_N_INSNS (nregs);
b6875aac 13885 }
ba123b0d
JG
13886 /* const0_rtx is in general free, but we will use an
13887 instruction to set a register to 0. */
b6875aac
KV
13888 else if (REG_P (op1) || op1 == const0_rtx)
13889 {
13890 /* The cost is 1 per register copied. */
fe1447a1
RS
13891 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13892 *cost = COSTS_N_INSNS (nregs);
b6875aac 13893 }
ba123b0d
JG
13894 else
13895 /* Cost is just the cost of the RHS of the set. */
e548c9df 13896 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
13897 return true;
13898
ba123b0d 13899 case ZERO_EXTRACT:
43e9d192 13900 case SIGN_EXTRACT:
ba123b0d
JG
13901 /* Bit-field insertion. Strip any redundant widening of
13902 the RHS to meet the width of the target. */
568b9c0e 13903 if (SUBREG_P (op1))
43e9d192
IB
13904 op1 = SUBREG_REG (op1);
13905 if ((GET_CODE (op1) == ZERO_EXTEND
13906 || GET_CODE (op1) == SIGN_EXTEND)
4aa81c2e 13907 && CONST_INT_P (XEXP (op0, 1))
77e994c9
RS
13908 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13909 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
43e9d192 13910 op1 = XEXP (op1, 0);
ba123b0d
JG
13911
13912 if (CONST_INT_P (op1))
13913 {
13914 /* MOV immediate is assumed to always be cheap. */
13915 *cost = COSTS_N_INSNS (1);
13916 }
13917 else
13918 {
13919 /* BFM. */
13920 if (speed)
13921 *cost += extra_cost->alu.bfi;
e548c9df 13922 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
ba123b0d
JG
13923 }
13924
43e9d192
IB
13925 return true;
13926
13927 default:
ba123b0d
JG
13928 /* We can't make sense of this, assume default cost. */
13929 *cost = COSTS_N_INSNS (1);
61263118 13930 return false;
43e9d192
IB
13931 }
13932 return false;
13933
9dfc162c
JG
13934 case CONST_INT:
13935 /* If an instruction can incorporate a constant within the
13936 instruction, the instruction's expression avoids calling
13937 rtx_cost() on the constant. If rtx_cost() is called on a
13938 constant, then it is usually because the constant must be
13939 moved into a register by one or more instructions.
13940
13941 The exception is constant 0, which can be expressed
13942 as XZR/WZR and is therefore free. The exception to this is
13943 if we have (set (reg) (const0_rtx)) in which case we must cost
13944 the move. However, we can catch that when we cost the SET, so
13945 we don't need to consider that here. */
13946 if (x == const0_rtx)
13947 *cost = 0;
13948 else
13949 {
13950 /* To an approximation, building any other constant is
13951 proportionally expensive to the number of instructions
13952 required to build that constant. This is true whether we
13953 are compiling for SPEED or otherwise. */
ba1536da
WD
13954 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13955 ? SImode : DImode;
82614948 13956 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
ba1536da 13957 (NULL_RTX, x, false, imode));
9dfc162c
JG
13958 }
13959 return true;
13960
13961 case CONST_DOUBLE:
a2170965
TC
13962
13963 /* First determine number of instructions to do the move
13964 as an integer constant. */
13965 if (!aarch64_float_const_representable_p (x)
13966 && !aarch64_can_const_movi_rtx_p (x, mode)
13967 && aarch64_float_const_rtx_p (x))
13968 {
13969 unsigned HOST_WIDE_INT ival;
13970 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13971 gcc_assert (succeed);
13972
ba1536da
WD
13973 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13974 ? DImode : SImode;
a2170965
TC
13975 int ncost = aarch64_internal_mov_immediate
13976 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13977 *cost += COSTS_N_INSNS (ncost);
13978 return true;
13979 }
13980
9dfc162c
JG
13981 if (speed)
13982 {
13983 /* mov[df,sf]_aarch64. */
13984 if (aarch64_float_const_representable_p (x))
13985 /* FMOV (scalar immediate). */
0dc8e1e7 13986 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
9dfc162c
JG
13987 else if (!aarch64_float_const_zero_rtx_p (x))
13988 {
13989 /* This will be a load from memory. */
0dc8e1e7 13990 if (mode == DFmode || mode == DDmode)
9dfc162c
JG
13991 *cost += extra_cost->ldst.loadd;
13992 else
13993 *cost += extra_cost->ldst.loadf;
13994 }
13995 else
13996 /* Otherwise this is +0.0. We get this using MOVI d0, #0
 13997 or MOV v0.s[0], wzr - neither of which is modeled by the
13998 cost tables. Just use the default cost. */
13999 {
14000 }
14001 }
14002
14003 return true;
14004
43e9d192
IB
14005 case MEM:
14006 if (speed)
2961177e
JG
14007 {
14008 /* For loads we want the base cost of a load, plus an
14009 approximation for the additional cost of the addressing
14010 mode. */
14011 rtx address = XEXP (x, 0);
b6875aac
KV
14012 if (VECTOR_MODE_P (mode))
14013 *cost += extra_cost->ldst.loadv;
14014 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e 14015 *cost += extra_cost->ldst.load;
0dc8e1e7 14016 else if (mode == SFmode || mode == SDmode)
2961177e 14017 *cost += extra_cost->ldst.loadf;
0dc8e1e7 14018 else if (mode == DFmode || mode == DDmode)
2961177e
JG
14019 *cost += extra_cost->ldst.loadd;
14020
14021 *cost +=
14022 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14023 0, speed));
14024 }
43e9d192
IB
14025
14026 return true;
14027
14028 case NEG:
4745e701
JG
14029 op0 = XEXP (x, 0);
14030
b6875aac
KV
14031 if (VECTOR_MODE_P (mode))
14032 {
14033 if (speed)
14034 {
14035 /* FNEG. */
14036 *cost += extra_cost->vect.alu;
14037 }
14038 return false;
14039 }
14040
e548c9df
AM
14041 if (GET_MODE_CLASS (mode) == MODE_INT)
14042 {
4745e701
JG
14043 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14044 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14045 {
14046 /* CSETM. */
e548c9df 14047 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
4745e701
JG
14048 return true;
14049 }
14050
14051 /* Cost this as SUB wzr, X. */
e548c9df 14052 op0 = CONST0_RTX (mode);
4745e701
JG
14053 op1 = XEXP (x, 0);
14054 goto cost_minus;
14055 }
14056
e548c9df 14057 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
4745e701
JG
14058 {
14059 /* Support (neg(fma...)) as a single instruction only if
14060 sign of zeros is unimportant. This matches the decision
14061 making in aarch64.md. */
14062 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14063 {
14064 /* FNMADD. */
e548c9df 14065 *cost = rtx_cost (op0, mode, NEG, 0, speed);
4745e701
JG
14066 return true;
14067 }
d318517d
SN
14068 if (GET_CODE (op0) == MULT)
14069 {
14070 /* FNMUL. */
14071 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14072 return true;
14073 }
4745e701
JG
14074 if (speed)
14075 /* FNEG. */
14076 *cost += extra_cost->fp[mode == DFmode].neg;
14077 return false;
14078 }
14079
14080 return false;
43e9d192 14081
781aeb73
KT
14082 case CLRSB:
14083 case CLZ:
14084 if (speed)
b6875aac
KV
14085 {
14086 if (VECTOR_MODE_P (mode))
14087 *cost += extra_cost->vect.alu;
14088 else
14089 *cost += extra_cost->alu.clz;
14090 }
781aeb73
KT
14091
14092 return false;
14093
5bfc8303
WD
14094 case CTZ:
14095 *cost = COSTS_N_INSNS (2);
14096
14097 if (speed)
14098 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14099 return false;
14100
43e9d192
IB
14101 case COMPARE:
14102 op0 = XEXP (x, 0);
14103 op1 = XEXP (x, 1);
14104
14105 if (op1 == const0_rtx
14106 && GET_CODE (op0) == AND)
14107 {
14108 x = op0;
e548c9df 14109 mode = GET_MODE (op0);
43e9d192
IB
14110 goto cost_logic;
14111 }
14112
a8eecd00
JG
14113 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14114 {
 14115 /* TODO: A write to the CC flags possibly costs extra; this
14116 needs encoding in the cost tables. */
14117
e548c9df 14118 mode = GET_MODE (op0);
a8eecd00
JG
14119 /* ANDS. */
14120 if (GET_CODE (op0) == AND)
14121 {
14122 x = op0;
14123 goto cost_logic;
14124 }
14125
14126 if (GET_CODE (op0) == PLUS)
14127 {
14128 /* ADDS (and CMN alias). */
14129 x = op0;
14130 goto cost_plus;
14131 }
14132
14133 if (GET_CODE (op0) == MINUS)
14134 {
14135 /* SUBS. */
14136 x = op0;
14137 goto cost_minus;
14138 }
14139
345854d8
KT
14140 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14141 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14142 && CONST_INT_P (XEXP (op0, 2)))
14143 {
14144 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14145 Handle it here directly rather than going to cost_logic
14146 since we know the immediate generated for the TST is valid
14147 so we can avoid creating an intermediate rtx for it only
14148 for costing purposes. */
14149 if (speed)
14150 *cost += extra_cost->alu.logical;
14151
14152 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14153 ZERO_EXTRACT, 0, speed);
14154 return true;
14155 }
14156
a8eecd00
JG
14157 if (GET_CODE (op1) == NEG)
14158 {
14159 /* CMN. */
14160 if (speed)
14161 *cost += extra_cost->alu.arith;
14162
e548c9df
AM
14163 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14164 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
a8eecd00
JG
14165 return true;
14166 }
14167
14168 /* CMP.
14169
14170 Compare can freely swap the order of operands, and
14171 canonicalization puts the more complex operation first.
14172 But the integer MINUS logic expects the shift/extend
14173 operation in op1. */
14174 if (! (REG_P (op0)
568b9c0e 14175 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
a8eecd00
JG
14176 {
14177 op0 = XEXP (x, 1);
14178 op1 = XEXP (x, 0);
14179 }
14180 goto cost_minus;
14181 }
14182
14183 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14184 {
14185 /* FCMP. */
14186 if (speed)
14187 *cost += extra_cost->fp[mode == DFmode].compare;
14188
14189 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14190 {
e548c9df 14191 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
a8eecd00
JG
14192 /* FCMP supports constant 0.0 for no extra cost. */
14193 return true;
14194 }
14195 return false;
14196 }
14197
b6875aac
KV
14198 if (VECTOR_MODE_P (mode))
14199 {
14200 /* Vector compare. */
14201 if (speed)
14202 *cost += extra_cost->vect.alu;
14203
14204 if (aarch64_float_const_zero_rtx_p (op1))
14205 {
14206 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14207 cost. */
14208 return true;
14209 }
14210 return false;
14211 }
a8eecd00 14212 return false;
43e9d192
IB
14213
14214 case MINUS:
4745e701
JG
14215 {
14216 op0 = XEXP (x, 0);
14217 op1 = XEXP (x, 1);
14218
14219cost_minus:
0c3aab7f
JW
14220 if (VECTOR_MODE_P (mode))
14221 {
14222 /* SUBL2 and SUBW2. */
14223 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
721c0fb3 14224 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
0c3aab7f
JW
14225 {
14226 /* The select-operand-high-half versions of the sub instruction
14227 have the same cost as the regular three vector version -
14228 don't add the costs of the select into the costs of the sub.
14229 */
14230 op0 = aarch64_strip_extend_vec_half (op0);
14231 op1 = aarch64_strip_extend_vec_half (op1);
14232 }
14233 }
14234
e548c9df 14235 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
23cb6618 14236
4745e701
JG
14237 /* Detect valid immediates. */
14238 if ((GET_MODE_CLASS (mode) == MODE_INT
14239 || (GET_MODE_CLASS (mode) == MODE_CC
14240 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14241 && CONST_INT_P (op1)
14242 && aarch64_uimm12_shift (INTVAL (op1)))
14243 {
4745e701
JG
14244 if (speed)
14245 /* SUB(S) (immediate). */
14246 *cost += extra_cost->alu.arith;
14247 return true;
4745e701
JG
14248 }
14249
7cc2145f 14250 /* Look for SUB (extended register). */
7de23b8c
AC
14251 if (is_a <scalar_int_mode> (mode)
14252 && aarch64_rtx_arith_op_extract_p (op1))
7cc2145f
JG
14253 {
14254 if (speed)
2533c820 14255 *cost += extra_cost->alu.extend_arith;
7cc2145f 14256
b10f1009 14257 op1 = aarch64_strip_extend (op1, true);
e47c4031 14258 *cost += rtx_cost (op1, VOIDmode,
e548c9df 14259 (enum rtx_code) GET_CODE (op1), 0, speed);
7cc2145f
JG
14260 return true;
14261 }
14262
b10f1009 14263 rtx new_op1 = aarch64_strip_extend (op1, false);
4745e701
JG
14264
14265 /* Cost this as an FMA-alike operation. */
14266 if ((GET_CODE (new_op1) == MULT
0a78ebe4 14267 || aarch64_shift_p (GET_CODE (new_op1)))
4745e701
JG
14268 && code != COMPARE)
14269 {
14270 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14271 (enum rtx_code) code,
14272 speed);
4745e701
JG
14273 return true;
14274 }
43e9d192 14275
e548c9df 14276 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
43e9d192 14277
4745e701
JG
14278 if (speed)
14279 {
b6875aac
KV
14280 if (VECTOR_MODE_P (mode))
14281 {
14282 /* Vector SUB. */
14283 *cost += extra_cost->vect.alu;
14284 }
14285 else if (GET_MODE_CLASS (mode) == MODE_INT)
14286 {
14287 /* SUB(S). */
14288 *cost += extra_cost->alu.arith;
14289 }
4745e701 14290 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
14291 {
14292 /* FSUB. */
14293 *cost += extra_cost->fp[mode == DFmode].addsub;
14294 }
4745e701
JG
14295 }
14296 return true;
14297 }
43e9d192
IB
14298
14299 case PLUS:
4745e701
JG
14300 {
14301 rtx new_op0;
43e9d192 14302
4745e701
JG
14303 op0 = XEXP (x, 0);
14304 op1 = XEXP (x, 1);
43e9d192 14305
a8eecd00 14306cost_plus:
8cd27a3b
JW
14307 if (VECTOR_MODE_P (mode))
14308 {
14309 /* ADDL2 and ADDW2. */
14310 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
721c0fb3 14311 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
8cd27a3b
JW
14312 {
14313 /* The select-operand-high-half versions of the add instruction
14314 have the same cost as the regular three vector version -
14315 don't add the costs of the select into the costs of the add.
14316 */
14317 op0 = aarch64_strip_extend_vec_half (op0);
14318 op1 = aarch64_strip_extend_vec_half (op1);
14319 }
14320 }
14321
4745e701
JG
14322 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14323 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14324 {
14325 /* CSINC. */
e548c9df
AM
14326 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14327 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
4745e701
JG
14328 return true;
14329 }
43e9d192 14330
4745e701 14331 if (GET_MODE_CLASS (mode) == MODE_INT
835d50c6 14332 && (aarch64_plus_immediate (op1, mode)
43cacb12 14333 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
4745e701 14334 {
e548c9df 14335 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
43e9d192 14336
4745e701 14337 if (speed)
a65b9ad8
KT
14338 {
14339 /* ADD (immediate). */
14340 *cost += extra_cost->alu.arith;
14341
14342 /* Some tunings prefer to not use the VL-based scalar ops.
14343 Increase the cost of the poly immediate to prevent their
14344 formation. */
14345 if (GET_CODE (op1) == CONST_POLY_INT
14346 && (aarch64_tune_params.extra_tuning_flags
14347 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14348 *cost += COSTS_N_INSNS (1);
14349 }
4745e701
JG
14350 return true;
14351 }
14352
2d7c73ee
WD
14353 if (aarch64_pluslong_immediate (op1, mode))
14354 {
14355 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14356 if ((INTVAL (op1) & 0xfff) != 0)
14357 *cost += COSTS_N_INSNS (1);
14358
14359 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14360 return true;
14361 }
14362
e548c9df 14363 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
23cb6618 14364
7cc2145f 14365 /* Look for ADD (extended register). */
7de23b8c
AC
14366 if (is_a <scalar_int_mode> (mode)
14367 && aarch64_rtx_arith_op_extract_p (op0))
7cc2145f
JG
14368 {
14369 if (speed)
2533c820 14370 *cost += extra_cost->alu.extend_arith;
7cc2145f 14371
b10f1009 14372 op0 = aarch64_strip_extend (op0, true);
e47c4031 14373 *cost += rtx_cost (op0, VOIDmode,
e548c9df 14374 (enum rtx_code) GET_CODE (op0), 0, speed);
7cc2145f
JG
14375 return true;
14376 }
14377
4745e701
JG
14378 /* Strip any extend, leave shifts behind as we will
14379 cost them through mult_cost. */
b10f1009 14380 new_op0 = aarch64_strip_extend (op0, false);
4745e701
JG
14381
14382 if (GET_CODE (new_op0) == MULT
0a78ebe4 14383 || aarch64_shift_p (GET_CODE (new_op0)))
4745e701
JG
14384 {
14385 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14386 speed);
4745e701
JG
14387 return true;
14388 }
14389
e548c9df 14390 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
4745e701
JG
14391
14392 if (speed)
14393 {
b6875aac
KV
14394 if (VECTOR_MODE_P (mode))
14395 {
14396 /* Vector ADD. */
14397 *cost += extra_cost->vect.alu;
14398 }
14399 else if (GET_MODE_CLASS (mode) == MODE_INT)
14400 {
14401 /* ADD. */
14402 *cost += extra_cost->alu.arith;
14403 }
4745e701 14404 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
14405 {
14406 /* FADD. */
14407 *cost += extra_cost->fp[mode == DFmode].addsub;
14408 }
4745e701
JG
14409 }
14410 return true;
14411 }
43e9d192 14412
18b42b2a
KT
14413 case BSWAP:
14414 *cost = COSTS_N_INSNS (1);
14415
14416 if (speed)
b6875aac
KV
14417 {
14418 if (VECTOR_MODE_P (mode))
14419 *cost += extra_cost->vect.alu;
14420 else
14421 *cost += extra_cost->alu.rev;
14422 }
18b42b2a
KT
14423 return false;
14424
43e9d192 14425 case IOR:
f7d5cf8d
KT
14426 if (aarch_rev16_p (x))
14427 {
14428 *cost = COSTS_N_INSNS (1);
14429
b6875aac
KV
14430 if (speed)
14431 {
14432 if (VECTOR_MODE_P (mode))
14433 *cost += extra_cost->vect.alu;
14434 else
14435 *cost += extra_cost->alu.rev;
14436 }
14437 return true;
f7d5cf8d 14438 }
fb0cb7fa
KT
14439
14440 if (aarch64_extr_rtx_p (x, &op0, &op1))
14441 {
e548c9df
AM
14442 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14443 *cost += rtx_cost (op1, mode, IOR, 1, speed);
fb0cb7fa
KT
14444 if (speed)
14445 *cost += extra_cost->alu.shift;
14446
14447 return true;
14448 }
f7d5cf8d 14449 /* Fall through. */
43e9d192
IB
14450 case XOR:
14451 case AND:
14452 cost_logic:
14453 op0 = XEXP (x, 0);
14454 op1 = XEXP (x, 1);
14455
b6875aac
KV
14456 if (VECTOR_MODE_P (mode))
14457 {
14458 if (speed)
14459 *cost += extra_cost->vect.alu;
14460 return true;
14461 }
14462
268c3b47
JG
14463 if (code == AND
14464 && GET_CODE (op0) == MULT
14465 && CONST_INT_P (XEXP (op0, 1))
14466 && CONST_INT_P (op1)
14467 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14468 INTVAL (op1)) != 0)
14469 {
14470 /* This is a UBFM/SBFM. */
e548c9df 14471 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
268c3b47
JG
14472 if (speed)
14473 *cost += extra_cost->alu.bfx;
14474 return true;
14475 }
14476
b4206259 14477 if (is_int_mode (mode, &int_mode))
43e9d192 14478 {
8c83f71d 14479 if (CONST_INT_P (op1))
43e9d192 14480 {
8c83f71d
KT
14481 /* We have a mask + shift version of a UBFIZ
14482 i.e. the *andim_ashift<mode>_bfiz pattern. */
14483 if (GET_CODE (op0) == ASHIFT
b4206259
RS
14484 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14485 XEXP (op0, 1)))
8c83f71d 14486 {
b4206259 14487 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8c83f71d
KT
14488 (enum rtx_code) code, 0, speed);
14489 if (speed)
14490 *cost += extra_cost->alu.bfx;
268c3b47 14491
8c83f71d
KT
14492 return true;
14493 }
b4206259 14494 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8c83f71d
KT
14495 {
14496 /* We possibly get the immediate for free, this is not
14497 modelled. */
b4206259
RS
14498 *cost += rtx_cost (op0, int_mode,
14499 (enum rtx_code) code, 0, speed);
8c83f71d
KT
14500 if (speed)
14501 *cost += extra_cost->alu.logical;
268c3b47 14502
8c83f71d
KT
14503 return true;
14504 }
43e9d192
IB
14505 }
14506 else
14507 {
268c3b47
JG
14508 rtx new_op0 = op0;
14509
14510 /* Handle ORN, EON, or BIC. */
43e9d192
IB
14511 if (GET_CODE (op0) == NOT)
14512 op0 = XEXP (op0, 0);
268c3b47
JG
14513
14514 new_op0 = aarch64_strip_shift (op0);
14515
14516 /* If we had a shift on op0 then this is a logical-shift-
14517 by-register/immediate operation. Otherwise, this is just
14518 a logical operation. */
14519 if (speed)
14520 {
14521 if (new_op0 != op0)
14522 {
14523 /* Shift by immediate. */
14524 if (CONST_INT_P (XEXP (op0, 1)))
14525 *cost += extra_cost->alu.log_shift;
14526 else
14527 *cost += extra_cost->alu.log_shift_reg;
14528 }
14529 else
14530 *cost += extra_cost->alu.logical;
14531 }
14532
14533 /* In both cases we want to cost both operands. */
b4206259
RS
14534 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14535 0, speed);
14536 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14537 1, speed);
268c3b47
JG
14538
14539 return true;
43e9d192 14540 }
43e9d192
IB
14541 }
14542 return false;
14543
268c3b47 14544 case NOT:
6365da9e
KT
14545 x = XEXP (x, 0);
14546 op0 = aarch64_strip_shift (x);
14547
b6875aac
KV
14548 if (VECTOR_MODE_P (mode))
14549 {
14550 /* Vector NOT. */
14551 *cost += extra_cost->vect.alu;
14552 return false;
14553 }
14554
6365da9e
KT
14555 /* MVN-shifted-reg. */
14556 if (op0 != x)
14557 {
e548c9df 14558 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6365da9e
KT
14559
14560 if (speed)
14561 *cost += extra_cost->alu.log_shift;
14562
14563 return true;
14564 }
14565 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14566 Handle the second form here taking care that 'a' in the above can
14567 be a shift. */
14568 else if (GET_CODE (op0) == XOR)
14569 {
14570 rtx newop0 = XEXP (op0, 0);
14571 rtx newop1 = XEXP (op0, 1);
14572 rtx op0_stripped = aarch64_strip_shift (newop0);
14573
e548c9df
AM
14574 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14575 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6365da9e
KT
14576
14577 if (speed)
14578 {
14579 if (op0_stripped != newop0)
14580 *cost += extra_cost->alu.log_shift;
14581 else
14582 *cost += extra_cost->alu.logical;
14583 }
14584
14585 return true;
14586 }
268c3b47
JG
14587 /* MVN. */
14588 if (speed)
14589 *cost += extra_cost->alu.logical;
14590
268c3b47
JG
14591 return false;
14592
43e9d192 14593 case ZERO_EXTEND:
b1685e62
JG
14594
14595 op0 = XEXP (x, 0);
14596 /* If a value is written in SI mode, then zero extended to DI
14597 mode, the operation will in general be free as a write to
14598 a 'w' register implicitly zeroes the upper bits of an 'x'
14599 register. However, if this is
14600
14601 (set (reg) (zero_extend (reg)))
14602
14603 we must cost the explicit register move. */
14604 if (mode == DImode
1d5c43db 14605 && GET_MODE (op0) == SImode)
b1685e62 14606 {
e548c9df 14607 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
b1685e62 14608
dde23f43
KM
14609 /* If OP_COST is non-zero, then the cost of the zero extend
14610 is effectively the cost of the inner operation. Otherwise
14611 we have a MOV instruction and we take the cost from the MOV
14612 itself. This is true independently of whether we are
14613 optimizing for space or time. */
14614 if (op_cost)
b1685e62
JG
14615 *cost = op_cost;
14616
14617 return true;
14618 }
e548c9df 14619 else if (MEM_P (op0))
43e9d192 14620 {
b1685e62 14621 /* All loads can zero extend to any size for free. */
e548c9df 14622 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
43e9d192
IB
14623 return true;
14624 }
b1685e62 14625
283b6c85
KT
14626 op0 = aarch64_extend_bitfield_pattern_p (x);
14627 if (op0)
14628 {
14629 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14630 if (speed)
14631 *cost += extra_cost->alu.bfx;
14632 return true;
14633 }
14634
b1685e62 14635 if (speed)
b6875aac
KV
14636 {
14637 if (VECTOR_MODE_P (mode))
14638 {
14639 /* UMOV. */
14640 *cost += extra_cost->vect.alu;
14641 }
14642 else
14643 {
63715e5e
WD
14644 /* We generate an AND instead of UXTB/UXTH. */
14645 *cost += extra_cost->alu.logical;
b6875aac
KV
14646 }
14647 }
43e9d192
IB
14648 return false;
14649
14650 case SIGN_EXTEND:
b1685e62 14651 if (MEM_P (XEXP (x, 0)))
43e9d192 14652 {
b1685e62
JG
14653 /* LDRSH. */
14654 if (speed)
14655 {
14656 rtx address = XEXP (XEXP (x, 0), 0);
14657 *cost += extra_cost->ldst.load_sign_extend;
14658
14659 *cost +=
14660 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14661 0, speed));
14662 }
43e9d192
IB
14663 return true;
14664 }
b1685e62 14665
283b6c85
KT
14666 op0 = aarch64_extend_bitfield_pattern_p (x);
14667 if (op0)
14668 {
14669 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14670 if (speed)
14671 *cost += extra_cost->alu.bfx;
14672 return true;
14673 }
14674
b1685e62 14675 if (speed)
b6875aac
KV
14676 {
14677 if (VECTOR_MODE_P (mode))
14678 *cost += extra_cost->vect.alu;
14679 else
14680 *cost += extra_cost->alu.extend;
14681 }
43e9d192
IB
14682 return false;
14683
ba0cfa17
JG
14684 case ASHIFT:
14685 op0 = XEXP (x, 0);
14686 op1 = XEXP (x, 1);
14687
14688 if (CONST_INT_P (op1))
14689 {
ba0cfa17 14690 if (speed)
b6875aac
KV
14691 {
14692 if (VECTOR_MODE_P (mode))
14693 {
14694 /* Vector shift (immediate). */
14695 *cost += extra_cost->vect.alu;
14696 }
14697 else
14698 {
14699 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
14700 aliases. */
14701 *cost += extra_cost->alu.shift;
14702 }
14703 }
ba0cfa17
JG
14704
14705 /* We can incorporate zero/sign extend for free. */
14706 if (GET_CODE (op0) == ZERO_EXTEND
14707 || GET_CODE (op0) == SIGN_EXTEND)
14708 op0 = XEXP (op0, 0);
14709
e548c9df 14710 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
ba0cfa17
JG
14711 return true;
14712 }
14713 else
14714 {
7813b280 14715 if (VECTOR_MODE_P (mode))
b6875aac 14716 {
7813b280
KT
14717 if (speed)
14718 /* Vector shift (register). */
14719 *cost += extra_cost->vect.alu;
14720 }
14721 else
14722 {
14723 if (speed)
14724 /* LSLV. */
14725 *cost += extra_cost->alu.shift_reg;
14726
14727 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14728 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
14729 && known_eq (INTVAL (XEXP (op1, 1)),
14730 GET_MODE_BITSIZE (mode) - 1))
b6875aac 14731 {
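		  /* The AND is redundant: AArch64 variable shifts already
		     use the shift amount modulo the register width, so
		     masking with bitsize - 1 is free.  */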
7813b280
KT
14732 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14733 /* We already demanded XEXP (op1, 0) to be REG_P, so
14734 don't recurse into it. */
14735 return true;
b6875aac
KV
14736 }
14737 }
ba0cfa17
JG
14738 return false; /* All arguments need to be in registers. */
14739 }
14740
43e9d192 14741 case ROTATE:
43e9d192
IB
14742 case ROTATERT:
14743 case LSHIFTRT:
43e9d192 14744 case ASHIFTRT:
ba0cfa17
JG
14745 op0 = XEXP (x, 0);
14746 op1 = XEXP (x, 1);
43e9d192 14747
ba0cfa17
JG
14748 if (CONST_INT_P (op1))
14749 {
14750 /* ASR (immediate) and friends. */
14751 if (speed)
b6875aac
KV
14752 {
14753 if (VECTOR_MODE_P (mode))
14754 *cost += extra_cost->vect.alu;
14755 else
14756 *cost += extra_cost->alu.shift;
14757 }
43e9d192 14758
e548c9df 14759 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
ba0cfa17
JG
14760 return true;
14761 }
14762 else
14763 {
7813b280 14764 if (VECTOR_MODE_P (mode))
b6875aac 14765 {
7813b280
KT
14766 if (speed)
14767 /* Vector shift (register). */
b6875aac 14768 *cost += extra_cost->vect.alu;
7813b280
KT
14769 }
14770 else
14771 {
14772 if (speed)
14773 /* ASR (register) and friends. */
b6875aac 14774 *cost += extra_cost->alu.shift_reg;
7813b280
KT
14775
14776 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14777 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
14778 && known_eq (INTVAL (XEXP (op1, 1)),
14779 GET_MODE_BITSIZE (mode) - 1))
7813b280
KT
14780 {
14781 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14782 /* We already demanded XEXP (op1, 0) to be REG_P, so
14783 don't recurse into it. */
14784 return true;
14785 }
b6875aac 14786 }
ba0cfa17
JG
14787 return false; /* All arguments need to be in registers. */
14788 }
43e9d192 14789
909734be
JG
14790 case SYMBOL_REF:
14791
1b1e81f8
JW
14792 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14793 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
909734be
JG
14794 {
14795 /* LDR. */
14796 if (speed)
14797 *cost += extra_cost->ldst.load;
14798 }
14799 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14800 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14801 {
14802 /* ADRP, followed by ADD. */
14803 *cost += COSTS_N_INSNS (1);
14804 if (speed)
14805 *cost += 2 * extra_cost->alu.arith;
14806 }
14807 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14808 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14809 {
14810 /* ADR. */
14811 if (speed)
14812 *cost += extra_cost->alu.arith;
14813 }
14814
14815 if (flag_pic)
14816 {
14817 /* One extra load instruction, after accessing the GOT. */
14818 *cost += COSTS_N_INSNS (1);
14819 if (speed)
14820 *cost += extra_cost->ldst.load;
14821 }
43e9d192
IB
14822 return true;
14823
909734be 14824 case HIGH:
43e9d192 14825 case LO_SUM:
909734be
JG
14826 /* ADRP/ADD (immediate). */
14827 if (speed)
14828 *cost += extra_cost->alu.arith;
43e9d192
IB
14829 return true;
14830
14831 case ZERO_EXTRACT:
14832 case SIGN_EXTRACT:
7cc2145f
JG
14833 /* UBFX/SBFX. */
14834 if (speed)
b6875aac
KV
14835 {
14836 if (VECTOR_MODE_P (mode))
14837 *cost += extra_cost->vect.alu;
14838 else
14839 *cost += extra_cost->alu.bfx;
14840 }
7cc2145f
JG
14841
14842 /* We can trust that the immediates used will be correct (there
14843 are no by-register forms), so we need only cost op0. */
e548c9df 14844 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
43e9d192
IB
14845 return true;
14846
14847 case MULT:
4745e701
JG
14848 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14849 /* aarch64_rtx_mult_cost always handles recursion to its
14850 operands. */
14851 return true;
43e9d192
IB
14852
14853 case MOD:
4f58fe36
KT
14854 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14855 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14856 an unconditional negate. This case should only ever be reached through
e53b6e56 14857 the set_smod_pow2_cheap check in expmed.cc. */
4f58fe36
KT
14858 if (CONST_INT_P (XEXP (x, 1))
14859 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14860 && (mode == SImode || mode == DImode))
14861 {
14862 /* We expand to 4 instructions. Reset the baseline. */
14863 *cost = COSTS_N_INSNS (4);
14864
14865 if (speed)
14866 *cost += 2 * extra_cost->alu.logical
14867 + 2 * extra_cost->alu.arith;
14868
14869 return true;
14870 }
14871
14872 /* Fall-through. */
43e9d192 14873 case UMOD:
43e9d192
IB
14874 if (speed)
14875 {
cb9ac430 14876 /* Slightly prefer UMOD over SMOD. */
b6875aac
KV
14877 if (VECTOR_MODE_P (mode))
14878 *cost += extra_cost->vect.alu;
e548c9df
AM
14879 else if (GET_MODE_CLASS (mode) == MODE_INT)
14880 *cost += (extra_cost->mult[mode == DImode].add
cb9ac430
TC
14881 + extra_cost->mult[mode == DImode].idiv
14882 + (code == MOD ? 1 : 0));
43e9d192
IB
14883 }
14884 return false; /* All arguments need to be in registers. */
14885
14886 case DIV:
14887 case UDIV:
4105fe38 14888 case SQRT:
43e9d192
IB
14889 if (speed)
14890 {
b6875aac
KV
14891 if (VECTOR_MODE_P (mode))
14892 *cost += extra_cost->vect.alu;
14893 else if (GET_MODE_CLASS (mode) == MODE_INT)
4105fe38
JG
14894 /* There is no integer SQRT, so only DIV and UDIV can get
14895 here. */
cb9ac430
TC
14896 *cost += (extra_cost->mult[mode == DImode].idiv
14897 /* Slightly prefer UDIV over SDIV. */
14898 + (code == DIV ? 1 : 0));
4105fe38
JG
14899 else
14900 *cost += extra_cost->fp[mode == DFmode].div;
43e9d192
IB
14901 }
14902 return false; /* All arguments need to be in registers. */
14903
a8eecd00 14904 case IF_THEN_ELSE:
2d5ffe46
AP
14905 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14906 XEXP (x, 2), cost, speed);
a8eecd00
JG
14907
14908 case EQ:
14909 case NE:
14910 case GT:
14911 case GTU:
14912 case LT:
14913 case LTU:
14914 case GE:
14915 case GEU:
14916 case LE:
14917 case LEU:
14918
14919 return false; /* All arguments must be in registers. */
14920
b292109f
JG
14921 case FMA:
14922 op0 = XEXP (x, 0);
14923 op1 = XEXP (x, 1);
14924 op2 = XEXP (x, 2);
14925
14926 if (speed)
b6875aac
KV
14927 {
14928 if (VECTOR_MODE_P (mode))
14929 *cost += extra_cost->vect.alu;
14930 else
14931 *cost += extra_cost->fp[mode == DFmode].fma;
14932 }
b292109f
JG
14933
14934 /* FMSUB, FNMADD, and FNMSUB are free. */
14935 if (GET_CODE (op0) == NEG)
14936 op0 = XEXP (op0, 0);
14937
14938 if (GET_CODE (op2) == NEG)
14939 op2 = XEXP (op2, 0);
14940
14941 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14942 and the by-element operand as operand 0. */
14943 if (GET_CODE (op1) == NEG)
14944 op1 = XEXP (op1, 0);
14945
14946 /* Catch vector-by-element operations. The by-element operand can
14947 either be (vec_duplicate (vec_select (x))) or just
14948 (vec_select (x)), depending on whether we are multiplying by
14949 a vector or a scalar.
14950
14951 Canonicalization is not very good in these cases, FMA4 will put the
14952 by-element operand as operand 0, FNMA4 will have it as operand 1. */
14953 if (GET_CODE (op0) == VEC_DUPLICATE)
14954 op0 = XEXP (op0, 0);
14955 else if (GET_CODE (op1) == VEC_DUPLICATE)
14956 op1 = XEXP (op1, 0);
14957
14958 if (GET_CODE (op0) == VEC_SELECT)
14959 op0 = XEXP (op0, 0);
14960 else if (GET_CODE (op1) == VEC_SELECT)
14961 op1 = XEXP (op1, 0);
14962
14963 /* If the remaining parameters are not registers,
14964 get the cost to put them into registers. */
e548c9df
AM
14965 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14966 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14967 *cost += rtx_cost (op2, mode, FMA, 2, speed);
b292109f
JG
14968 return true;
14969
5e2a765b
KT
14970 case FLOAT:
14971 case UNSIGNED_FLOAT:
14972 if (speed)
14973 *cost += extra_cost->fp[mode == DFmode].fromint;
14974 return false;
14975
b292109f
JG
14976 case FLOAT_EXTEND:
14977 if (speed)
b6875aac
KV
14978 {
14979 if (VECTOR_MODE_P (mode))
14980 {
14981 /* Vector widening conversion. */
14982 *cost += extra_cost->vect.alu;
14983 }
14984 else
14985 *cost += extra_cost->fp[mode == DFmode].widen;
14986 }
b292109f
JG
14987 return false;
14988
14989 case FLOAT_TRUNCATE:
14990 if (speed)
b6875aac
KV
14991 {
14992 if (VECTOR_MODE_P (mode))
14993 {
14994 /* Vector narrowing conversion. */
14995 *cost += extra_cost->vect.alu;
14996 }
14997 else
14998 *cost += extra_cost->fp[mode == DFmode].narrow;
14999 }
b292109f
JG
15000 return false;
15001
61263118
KT
15002 case FIX:
15003 case UNSIGNED_FIX:
15004 x = XEXP (x, 0);
15005 /* Strip the rounding part. They will all be implemented
15006 by the fcvt* family of instructions anyway. */
15007 if (GET_CODE (x) == UNSPEC)
15008 {
15009 unsigned int uns_code = XINT (x, 1);
15010
15011 if (uns_code == UNSPEC_FRINTA
15012 || uns_code == UNSPEC_FRINTM
15013 || uns_code == UNSPEC_FRINTN
15014 || uns_code == UNSPEC_FRINTP
15015 || uns_code == UNSPEC_FRINTZ)
15016 x = XVECEXP (x, 0, 0);
15017 }
15018
15019 if (speed)
b6875aac
KV
15020 {
15021 if (VECTOR_MODE_P (mode))
15022 *cost += extra_cost->vect.alu;
15023 else
15024 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15025 }
39252973
KT
15026
15027 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15028 fixed-point fcvt. */
15029 if (GET_CODE (x) == MULT
15030 && ((VECTOR_MODE_P (mode)
15031 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15032 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15033 {
15034 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15035 0, speed);
15036 return true;
15037 }
15038
e548c9df 15039 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
61263118
KT
15040 return true;
15041
b292109f 15042 case ABS:
b6875aac
KV
15043 if (VECTOR_MODE_P (mode))
15044 {
15045 /* ABS (vector). */
15046 if (speed)
15047 *cost += extra_cost->vect.alu;
15048 }
15049 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b292109f 15050 {
19261b99
KT
15051 op0 = XEXP (x, 0);
15052
15053 /* FABD, which is analogous to FADD. */
15054 if (GET_CODE (op0) == MINUS)
15055 {
e548c9df
AM
15056 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15057 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
19261b99
KT
15058 if (speed)
15059 *cost += extra_cost->fp[mode == DFmode].addsub;
15060
15061 return true;
15062 }
15063 /* Simple FABS is analogous to FNEG. */
b292109f
JG
15064 if (speed)
15065 *cost += extra_cost->fp[mode == DFmode].neg;
15066 }
15067 else
15068 {
15069 /* Integer ABS will either be split to
15070 two arithmetic instructions, or will be an ABS
15071 (scalar), which we don't model. */
15072 *cost = COSTS_N_INSNS (2);
15073 if (speed)
15074 *cost += 2 * extra_cost->alu.arith;
15075 }
15076 return false;
15077
15078 case SMAX:
15079 case SMIN:
15080 if (speed)
15081 {
b6875aac
KV
15082 if (VECTOR_MODE_P (mode))
15083 *cost += extra_cost->vect.alu;
15084 else
15085 {
15086 /* FMAXNM/FMINNM/FMAX/FMIN.
15087 TODO: This may not be accurate for all implementations, but
15088 we do not model this in the cost tables. */
15089 *cost += extra_cost->fp[mode == DFmode].addsub;
15090 }
b292109f
JG
15091 }
15092 return false;
15093
61263118
KT
15094 case UNSPEC:
15095 /* The floating point round to integer frint* instructions. */
15096 if (aarch64_frint_unspec_p (XINT (x, 1)))
15097 {
15098 if (speed)
15099 *cost += extra_cost->fp[mode == DFmode].roundint;
15100
15101 return false;
15102 }
781aeb73
KT
15103
15104 if (XINT (x, 1) == UNSPEC_RBIT)
15105 {
15106 if (speed)
15107 *cost += extra_cost->alu.rev;
15108
15109 return false;
15110 }
61263118
KT
15111 break;
15112
fb620c4a
JG
15113 case TRUNCATE:
15114
15115 /* Decompose <su>muldi3_highpart. */
15116 if (/* (truncate:DI */
15117 mode == DImode
15118 /* (lshiftrt:TI */
15119 && GET_MODE (XEXP (x, 0)) == TImode
15120 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15121 /* (mult:TI */
15122 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15123 /* (ANY_EXTEND:TI (reg:DI))
15124 (ANY_EXTEND:TI (reg:DI))) */
15125 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15126 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15127 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15128 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15129 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15130 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15131 /* (const_int 64) */
15132 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15133 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15134 {
15135 /* UMULH/SMULH. */
15136 if (speed)
15137 *cost += extra_cost->mult[mode == DImode].extend;
e548c9df
AM
15138 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15139 mode, MULT, 0, speed);
15140 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15141 mode, MULT, 1, speed);
fb620c4a
JG
15142 return true;
15143 }
1d5c43db
TC
15144 break;
15145 case CONST_VECTOR:
15146 {
15147 /* Load using MOVI/MVNI. */
15148 if (aarch64_simd_valid_immediate (x, NULL))
15149 *cost = extra_cost->vect.movi;
15150 else /* Load using constant pool. */
15151 *cost = extra_cost->ldst.load;
15152 break;
15153 }
15154 case VEC_CONCAT:
15155 /* Depending on the operation, either DUP or INS.
15156 For now, keep default costing. */
15157 break;
15158 case VEC_DUPLICATE:
15159 /* Load using a DUP. */
15160 *cost = extra_cost->vect.dup;
15161 return false;
15162 case VEC_SELECT:
15163 {
15164 rtx op0 = XEXP (x, 0);
15165 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
fb620c4a 15166
1d5c43db
TC
15167 /* Selecting the low half of the input is free; the high half is costed as a DUP and any other selection as an extract. */
15168 rtx op1 = XEXP (x, 1);
15169 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15170 ;
15171 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15172 *cost = extra_cost->vect.dup;
15173 else
15174 *cost = extra_cost->vect.extract;
15175 return true;
15176 }
43e9d192 15177 default:
61263118 15178 break;
43e9d192 15179 }
61263118 15180
c10e3d7f
AP
15181 if (dump_file
15182 && flag_aarch64_verbose_cost)
61263118
KT
15183 fprintf (dump_file,
15184 "\nFailed to cost RTX. Assuming default cost.\n");
15185
15186 return true;
43e9d192
IB
15187}
15188
0ee859b5
JG
15189/* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15190 calculated for X. This cost is stored in *COST. Returns true
15191 if the total cost of X was calculated. */
15192static bool
e548c9df 15193aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
0ee859b5
JG
15194 int param, int *cost, bool speed)
15195{
e548c9df 15196 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
0ee859b5 15197
c10e3d7f
AP
15198 if (dump_file
15199 && flag_aarch64_verbose_cost)
0ee859b5
JG
15200 {
15201 print_rtl_single (dump_file, x);
15202 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15203 speed ? "Hot" : "Cold",
15204 *cost, result ? "final" : "partial");
15205 }
15206
15207 return result;
15208}
15209
43e9d192 15210static int
ef4bddc2 15211aarch64_register_move_cost (machine_mode mode,
8a3a7e67 15212 reg_class_t from_i, reg_class_t to_i)
43e9d192 15213{
8a3a7e67
RH
15214 enum reg_class from = (enum reg_class) from_i;
15215 enum reg_class to = (enum reg_class) to_i;
43e9d192 15216 const struct cpu_regmove_cost *regmove_cost
b175b679 15217 = aarch64_tune_params.regmove_cost;
43e9d192 15218
3be07662 15219 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
96b7f495
MM
15220 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15221 || to == STUB_REGS)
3be07662
WD
15222 to = GENERAL_REGS;
15223
96b7f495
MM
15224 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15225 || from == STUB_REGS)
3be07662
WD
15226 from = GENERAL_REGS;
15227
183bfdaf
RS
15228 /* Make RDFFR very expensive. In particular, if we know that the FFR
15229 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15230 as a way of obtaining a PTRUE. */
15231 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15232 && hard_reg_set_subset_p (reg_class_contents[from_i],
15233 reg_class_contents[FFR_REGS]))
15234 return 80;
15235
6ee70f81
AP
15236 /* Moving between GPR and stack cost is the same as GP2GP. */
15237 if ((from == GENERAL_REGS && to == STACK_REG)
15238 || (to == GENERAL_REGS && from == STACK_REG))
15239 return regmove_cost->GP2GP;
15240
15241 /* To/From the stack register, we move via the gprs. */
15242 if (to == STACK_REG || from == STACK_REG)
15243 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15244 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15245
721c0fb3
RS
15246 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15247 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15248 && known_eq (GET_MODE_SIZE (mode), 16))
8919453c
WD
15249 {
15250 /* 128-bit operations on general registers require 2 instructions. */
15251 if (from == GENERAL_REGS && to == GENERAL_REGS)
15252 return regmove_cost->GP2GP * 2;
15253 else if (from == GENERAL_REGS)
15254 return regmove_cost->GP2FP * 2;
15255 else if (to == GENERAL_REGS)
15256 return regmove_cost->FP2GP * 2;
15257
15258 /* When AdvSIMD instructions are disabled it is not possible to move
15259 a 128-bit value directly between Q registers. This is handled in
15260 secondary reload. A general register is used as a scratch to move
15261 the upper DI value and the lower DI value is moved directly,
15262 hence the cost is the sum of three moves. */
15263 if (! TARGET_SIMD)
15264 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15265
15266 return regmove_cost->FP2FP;
15267 }
15268
43e9d192
IB
15269 if (from == GENERAL_REGS && to == GENERAL_REGS)
15270 return regmove_cost->GP2GP;
15271 else if (from == GENERAL_REGS)
15272 return regmove_cost->GP2FP;
15273 else if (to == GENERAL_REGS)
15274 return regmove_cost->FP2GP;
15275
721c0fb3
RS
15276 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15277 {
15278 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15279 The cost must be greater than 2 units to indicate that direct
15280 moves aren't possible. */
15281 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15282 + aarch64_tune_params.memmov_cost.store_fp);
15283 return MIN (CEIL (per_vector, 2), 4);
15284 }
15285
43e9d192
IB
15286 return regmove_cost->FP2FP;
15287}
15288
b074fa69 15289/* Implements TARGET_MEMORY_MOVE_COST. */
43e9d192 15290static int
b074fa69 15291aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
43e9d192 15292{
b074fa69
AV
15293 enum reg_class rclass = (enum reg_class) rclass_i;
15294 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15295 ? reg_classes_intersect_p (rclass, PR_REGS)
15296 : reg_class_subset_p (rclass, PR_REGS))
15297 return (in
15298 ? aarch64_tune_params.memmov_cost.load_pred
15299 : aarch64_tune_params.memmov_cost.store_pred);
15300
15301 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15302 ? reg_classes_intersect_p (rclass, FP_REGS)
15303 : reg_class_subset_p (rclass, FP_REGS))
15304 return (in
15305 ? aarch64_tune_params.memmov_cost.load_fp
15306 : aarch64_tune_params.memmov_cost.store_fp);
15307
15308 return (in
15309 ? aarch64_tune_params.memmov_cost.load_int
15310 : aarch64_tune_params.memmov_cost.store_int);
43e9d192
IB
15311}
15312
6d4d616a
RS
15313/* Implement TARGET_INIT_BUILTINS. */
15314static void
15315aarch64_init_builtins ()
15316{
15317 aarch64_general_init_builtins ();
624d0f07 15318 aarch64_sve::init_builtins ();
f9d4544d
MR
15319#ifdef SUBTARGET_INIT_BUILTINS
15320 SUBTARGET_INIT_BUILTINS;
15321#endif
6d4d616a
RS
15322}
15323
15324/* Implement TARGET_FOLD_BUILTIN. */
15325static tree
15326aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15327{
15328 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15329 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15330 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15331 switch (code & AARCH64_BUILTIN_CLASS)
15332 {
15333 case AARCH64_BUILTIN_GENERAL:
15334 return aarch64_general_fold_builtin (subcode, type, nargs, args);
624d0f07
RS
15335
15336 case AARCH64_BUILTIN_SVE:
15337 return NULL_TREE;
6d4d616a
RS
15338 }
15339 gcc_unreachable ();
15340}
15341
15342/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15343static bool
15344aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15345{
15346 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15347 tree fndecl = gimple_call_fndecl (stmt);
15348 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15349 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15350 gimple *new_stmt = NULL;
15351 switch (code & AARCH64_BUILTIN_CLASS)
15352 {
15353 case AARCH64_BUILTIN_GENERAL:
ad44c6a5 15354 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
6d4d616a 15355 break;
624d0f07
RS
15356
15357 case AARCH64_BUILTIN_SVE:
15358 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15359 break;
6d4d616a
RS
15360 }
15361
15362 if (!new_stmt)
15363 return false;
15364
3893c9c0 15365 gsi_replace (gsi, new_stmt, false);
6d4d616a
RS
15366 return true;
15367}
15368
15369/* Implement TARGET_EXPAND_BUILTIN. */
15370static rtx
c5dc215d 15371aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
6d4d616a
RS
15372{
15373 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15374 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15375 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15376 switch (code & AARCH64_BUILTIN_CLASS)
15377 {
15378 case AARCH64_BUILTIN_GENERAL:
c5dc215d 15379 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
624d0f07
RS
15380
15381 case AARCH64_BUILTIN_SVE:
15382 return aarch64_sve::expand_builtin (subcode, exp, target);
6d4d616a
RS
15383 }
15384 gcc_unreachable ();
15385}
15386
15387/* Implement TARGET_BUILTIN_DECL. */
15388static tree
15389aarch64_builtin_decl (unsigned int code, bool initialize_p)
15390{
15391 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15392 switch (code & AARCH64_BUILTIN_CLASS)
15393 {
15394 case AARCH64_BUILTIN_GENERAL:
15395 return aarch64_general_builtin_decl (subcode, initialize_p);
624d0f07
RS
15396
15397 case AARCH64_BUILTIN_SVE:
15398 return aarch64_sve::builtin_decl (subcode, initialize_p);
6d4d616a
RS
15399 }
15400 gcc_unreachable ();
15401}
15402
0c30e0f3
EM
15403/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15404 to optimize 1.0/sqrt. */
ee62a5a6
RS
15405
15406static bool
9acc9cbe 15407use_rsqrt_p (machine_mode mode)
ee62a5a6
RS
15408{
15409 return (!flag_trapping_math
15410 && flag_unsafe_math_optimizations
9acc9cbe
EM
15411 && ((aarch64_tune_params.approx_modes->recip_sqrt
15412 & AARCH64_APPROX_MODE (mode))
1a33079e 15413 || flag_mrecip_low_precision_sqrt));
ee62a5a6
RS
15414}
15415
0c30e0f3
EM
15416/* Function to decide when to use the approximate reciprocal square root
15417 builtin. */
a6fc00da
BH
15418
15419static tree
ee62a5a6 15420aarch64_builtin_reciprocal (tree fndecl)
a6fc00da 15421{
9acc9cbe
EM
15422 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15423
15424 if (!use_rsqrt_p (mode))
a6fc00da 15425 return NULL_TREE;
6d4d616a
RS
15426 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15427 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15428 switch (code & AARCH64_BUILTIN_CLASS)
15429 {
15430 case AARCH64_BUILTIN_GENERAL:
15431 return aarch64_general_builtin_rsqrt (subcode);
624d0f07
RS
15432
15433 case AARCH64_BUILTIN_SVE:
15434 return NULL_TREE;
6d4d616a
RS
15435 }
15436 gcc_unreachable ();
a6fc00da
BH
15437}
15438
04f307cb
RS
15439/* Emit code to perform the floating-point operation:
15440
15441 DST = SRC1 * SRC2
15442
15443 where all three operands are already known to be registers.
15444 If the operation is an SVE one, PTRUE is a suitable all-true
15445 predicate. */
15446
15447static void
15448aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15449{
15450 if (ptrue)
15451 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15452 dst, ptrue, src1, src2,
15453 gen_int_mode (SVE_RELAXED_GP, SImode)));
15454 else
15455 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15456}
15457
98daafa0
EM
15458/* Emit instruction sequence to compute either the approximate square root
15459 or its approximate reciprocal, depending on the flag RECP, and return
15460 whether the sequence was emitted or not. */
a6fc00da 15461
98daafa0
EM
15462bool
15463aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
a6fc00da 15464{
98daafa0 15465 machine_mode mode = GET_MODE (dst);
daef0a8c
JW
15466
15467 if (GET_MODE_INNER (mode) == HFmode)
2e19adc8
RE
15468 {
15469 gcc_assert (!recp);
15470 return false;
15471 }
15472
2e19adc8
RE
15473 if (!recp)
15474 {
15475 if (!(flag_mlow_precision_sqrt
15476 || (aarch64_tune_params.approx_modes->sqrt
15477 & AARCH64_APPROX_MODE (mode))))
15478 return false;
15479
902d28bd 15480 if (!flag_finite_math_only
2e19adc8
RE
15481 || flag_trapping_math
15482 || !flag_unsafe_math_optimizations
15483 || optimize_function_for_size_p (cfun))
15484 return false;
15485 }
15486 else
15487 /* Caller assumes we cannot fail. */
15488 gcc_assert (use_rsqrt_p (mode));
daef0a8c 15489
a0ee8352
RS
15490 rtx pg = NULL_RTX;
15491 if (aarch64_sve_mode_p (mode))
15492 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
d7814449 15493 machine_mode mmsk = (VECTOR_MODE_P (mode)
d083ee47 15494 ? related_int_vector_mode (mode).require ()
d7814449 15495 : int_mode_for_mode (mode).require ());
0df28e68 15496 rtx xmsk = NULL_RTX;
98daafa0 15497 if (!recp)
0df28e68
RS
15498 {
15499 /* When calculating the approximate square root, compare the
15500 argument with 0.0 and create a mask. */
a0ee8352
RS
15501 rtx zero = CONST0_RTX (mode);
15502 if (pg)
15503 {
15504 xmsk = gen_reg_rtx (GET_MODE (pg));
15505 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15506 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15507 xmsk, pg, hint, src, zero));
15508 }
15509 else
15510 {
15511 xmsk = gen_reg_rtx (mmsk);
15512 emit_insn (gen_rtx_SET (xmsk,
15513 gen_rtx_NEG (mmsk,
15514 gen_rtx_EQ (mmsk, src, zero))));
15515 }
0df28e68 15516 }
a6fc00da 15517
98daafa0
EM
15518 /* Estimate the approximate reciprocal square root. */
15519 rtx xdst = gen_reg_rtx (mode);
0016d8d9 15520 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
a6fc00da 15521
98daafa0
EM
15522 /* Iterate over the series twice for SF and thrice for DF. */
15523 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
a6fc00da 15524
98daafa0
EM
15525 /* Optionally iterate over the series once less for faster performance
15526 while sacrificing some accuracy. */
15527 if ((recp && flag_mrecip_low_precision_sqrt)
15528 || (!recp && flag_mlow_precision_sqrt))
a6fc00da
BH
15529 iterations--;
15530
98daafa0
EM
15531 /* Iterate over the series to calculate the approximate reciprocal square
15532 root. */
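  /* Each step is a Newton-Raphson refinement: with current estimate E
     and input S, FRSQRTS (S, E*E) computes (3 - S*E*E) / 2, and
     multiplying that result into E moves E closer to 1/sqrt (S).  */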
15533 rtx x1 = gen_reg_rtx (mode);
15534 while (iterations--)
a6fc00da 15535 {
a6fc00da 15536 rtx x2 = gen_reg_rtx (mode);
a0ee8352 15537 aarch64_emit_mult (x2, pg, xdst, xdst);
98daafa0 15538
0016d8d9 15539 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
a6fc00da 15540
98daafa0 15541 if (iterations > 0)
a0ee8352 15542 aarch64_emit_mult (xdst, pg, xdst, x1);
98daafa0
EM
15543 }
15544
15545 if (!recp)
15546 {
a0ee8352
RS
15547 if (pg)
15548 /* Multiply nonzero source values by the corresponding intermediate
15549 result elements, so that the final calculation is the approximate
15550 square root rather than its reciprocal. Select a zero result for
15551 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15552 otherwise. */
15553 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15554 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15555 else
15556 {
15557 /* Qualify the approximate reciprocal square root when the
15558 argument is 0.0 by squashing the intermediate result to 0.0. */
15559 rtx xtmp = gen_reg_rtx (mmsk);
15560 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15561 gen_rtx_SUBREG (mmsk, xdst, 0)));
15562 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
a6fc00da 15563
a0ee8352
RS
15564 /* Calculate the approximate square root. */
15565 aarch64_emit_mult (xdst, pg, xdst, src);
15566 }
a6fc00da
BH
15567 }
15568
98daafa0 15569 /* Finalize the approximation. */
a0ee8352 15570 aarch64_emit_mult (dst, pg, xdst, x1);
98daafa0
EM
15571
15572 return true;
a6fc00da
BH
15573}
15574
79a2bc2d
EM
15575/* Emit the instruction sequence to compute the approximation for the division
15576 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15577
15578bool
15579aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15580{
15581 machine_mode mode = GET_MODE (quo);
33d72b63
JW
15582
15583 if (GET_MODE_INNER (mode) == HFmode)
15584 return false;
15585
79a2bc2d
EM
15586 bool use_approx_division_p = (flag_mlow_precision_div
15587 || (aarch64_tune_params.approx_modes->division
15588 & AARCH64_APPROX_MODE (mode)));
15589
15590 if (!flag_finite_math_only
15591 || flag_trapping_math
15592 || !flag_unsafe_math_optimizations
15593 || optimize_function_for_size_p (cfun)
15594 || !use_approx_division_p)
15595 return false;
15596
1be49a38
RR
15597 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15598 return false;
15599
04f307cb
RS
15600 rtx pg = NULL_RTX;
15601 if (aarch64_sve_mode_p (mode))
15602 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15603
79a2bc2d
EM
15604 /* Estimate the approximate reciprocal. */
15605 rtx xrcp = gen_reg_rtx (mode);
0016d8d9 15606 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
79a2bc2d
EM
15607
15608 /* Iterate over the series twice for SF and thrice for DF. */
15609 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15610
dbf3dc75
BL
15611 /* Optionally iterate over the series less for faster performance,
15612 while sacrificing some accuracy. The default is 2 for DF and 1 for SF. */
79a2bc2d 15613 if (flag_mlow_precision_div)
dbf3dc75
BL
15614 iterations = (GET_MODE_INNER (mode) == DFmode
15615 ? aarch64_double_recp_precision
15616 : aarch64_float_recp_precision);
79a2bc2d
EM
15617
15618 /* Iterate over the series to calculate the approximate reciprocal. */
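  /* Each step is a Newton-Raphson refinement: with current estimate E
     and divisor D, FRECPS (E, D) computes 2 - E*D, and multiplying that
     result into E moves E closer to 1/D.  */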
15619 rtx xtmp = gen_reg_rtx (mode);
15620 while (iterations--)
15621 {
0016d8d9 15622 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
79a2bc2d
EM
15623
15624 if (iterations > 0)
04f307cb 15625 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
79a2bc2d
EM
15626 }
15627
15628 if (num != CONST1_RTX (mode))
15629 {
15630 /* As the approximate reciprocal of DEN is already calculated, only
15631 calculate the approximate division when NUM is not 1.0. */
15632 rtx xnum = force_reg (mode, num);
04f307cb 15633 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
79a2bc2d
EM
15634 }
15635
15636 /* Finalize the approximation. */
04f307cb 15637 aarch64_emit_mult (quo, pg, xrcp, xtmp);
79a2bc2d
EM
15638 return true;
15639}
15640
d126a4ae
AP
15641/* Return the number of instructions that can be issued per cycle. */
15642static int
15643aarch64_sched_issue_rate (void)
15644{
b175b679 15645 return aarch64_tune_params.issue_rate;
d126a4ae
AP
15646}
15647
d0bc0cb6
RS
15648/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15649static int
15650aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15651{
15652 if (DEBUG_INSN_P (insn))
15653 return more;
15654
15655 rtx_code code = GET_CODE (PATTERN (insn));
15656 if (code == USE || code == CLOBBER)
15657 return more;
15658
15659 if (get_attr_type (insn) == TYPE_NO_INSN)
15660 return more;
15661
15662 return more - 1;
15663}
15664
d03f7e44
MK
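/* Return the depth to use for the first-cycle multipass DFA lookahead:
   the issue rate when it is greater than one and we are not scheduling
   for fusion, otherwise 0 (no lookahead).  */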
15665static int
15666aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15667{
15668 int issue_rate = aarch64_sched_issue_rate ();
15669
15670 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15671}
15672
2d6bc7fa
KT
15673
15674/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
e53b6e56 15675 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
2d6bc7fa
KT
15676 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15677
15678static int
15679aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15680 int ready_index)
15681{
15682 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15683}
15684
15685
8990e73a
TB
15686/* Vectorizer cost model target hooks. */
15687
1205a8ca
RS
15688/* Information about how the CPU would issue the scalar, Advanced SIMD
15689 or SVE version of a vector loop, using the scheme defined by the
15690 aarch64_base_vec_issue_info hierarchy of structures. */
15aba5a6 15691class aarch64_vec_op_count
1205a8ca 15692{
15aba5a6 15693public:
1a5288fe 15694 aarch64_vec_op_count () = default;
2e1886ea
RS
15695 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15696 unsigned int = 1);
15aba5a6
RS
15697
15698 unsigned int vec_flags () const { return m_vec_flags; }
2e1886ea
RS
15699 unsigned int vf_factor () const { return m_vf_factor; }
15700
15aba5a6
RS
15701 const aarch64_base_vec_issue_info *base_issue_info () const;
15702 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15703 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15704
a82ffd43
RS
15705 fractional_cost rename_cycles_per_iter () const;
15706 fractional_cost min_nonpred_cycles_per_iter () const;
15707 fractional_cost min_pred_cycles_per_iter () const;
15708 fractional_cost min_cycles_per_iter () const;
15709
1205a8ca
RS
15710 void dump () const;
15711
15712 /* The number of individual "general" operations. See the comments
15713 in aarch64_base_vec_issue_info for details. */
15714 unsigned int general_ops = 0;
15715
15716 /* The number of load and store operations, under the same scheme
15717 as above. */
15718 unsigned int loads = 0;
15719 unsigned int stores = 0;
15720
15721 /* The minimum number of cycles needed to execute all loop-carried
15722 operations, which in the vector code become associated with
15723 reductions. */
15724 unsigned int reduction_latency = 0;
1205a8ca
RS
15725
15726 /* The number of individual predicate operations. See the comments
15727 in aarch64_sve_vec_issue_info for details. */
15728 unsigned int pred_ops = 0;
15aba5a6
RS
15729
15730private:
15731 /* The issue information for the core. */
1a5288fe 15732 const aarch64_vec_issue_info *m_issue_info = nullptr;
15aba5a6
RS
15733
15734 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15735 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15736 Advanced SIMD code.
15737 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15738 SVE code. */
1a5288fe 15739 unsigned int m_vec_flags = 0;
2e1886ea
RS
15740
15741 /* Assume that, when the code is executing on the core described
15742 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15743 times more data than the vectorizer anticipates.
15744
15745 This is only ever different from 1 for SVE. It allows us to consider
15746 what would happen on a 256-bit SVE target even when the -mtune
15747 parameters say that the “likely” SVE length is 128 bits. */
15748 unsigned int m_vf_factor = 1;
1205a8ca
RS
15749};
15750
15aba5a6
RS
15751aarch64_vec_op_count::
15752aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
2e1886ea 15753 unsigned int vec_flags, unsigned int vf_factor)
15aba5a6 15754 : m_issue_info (issue_info),
2e1886ea
RS
15755 m_vec_flags (vec_flags),
15756 m_vf_factor (vf_factor)
15aba5a6
RS
15757{
15758}
15759
15760/* Return the base issue information (i.e. the parts that make sense
15761 for both scalar and vector code). Return null if we have no issue
15762 information. */
15763const aarch64_base_vec_issue_info *
15764aarch64_vec_op_count::base_issue_info () const
15765{
15766 if (auto *ret = simd_issue_info ())
15767 return ret;
1a5288fe 15768 return m_issue_info->scalar;
15aba5a6
RS
15769}
15770
15771/* If the structure describes vector code and we have associated issue
15772 information, return that issue information, otherwise return null. */
15773const aarch64_simd_vec_issue_info *
15774aarch64_vec_op_count::simd_issue_info () const
15775{
15776 if (auto *ret = sve_issue_info ())
15777 return ret;
1a5288fe 15778 if (m_vec_flags)
15aba5a6
RS
15779 return m_issue_info->advsimd;
15780 return nullptr;
15781}
15782
15783/* If the structure describes SVE code and we have associated issue
15784 information, return that issue information, otherwise return null. */
15785const aarch64_sve_vec_issue_info *
15786aarch64_vec_op_count::sve_issue_info () const
15787{
1a5288fe 15788 if (m_vec_flags & VEC_ANY_SVE)
15aba5a6
RS
15789 return m_issue_info->sve;
15790 return nullptr;
15791}
15792
a82ffd43
RS
15793/* Estimate the minimum number of cycles per iteration needed to rename
15794 the instructions.
15795
15796 ??? For now this is done inline rather than via cost tables, since it
15797 isn't clear how it should be parameterized for the general case. */
15798fractional_cost
15799aarch64_vec_op_count::rename_cycles_per_iter () const
15800{
a8509301 15801 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
27d8748d 15802 || sve_issue_info () == &neoversen2_sve_issue_info
14d4b4fb 15803 || sve_issue_info () == &neoversev2_sve_issue_info)
a82ffd43
RS
15804 /* + 1 for an addition. We've already counted a general op for each
15805 store, so we don't need to account for stores separately. The branch
15806 reads no registers and so does not need to be counted either.
15807
15808 ??? This value is very much on the pessimistic side, but seems to work
15809 pretty well in practice. */
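    /* For example, 6 general ops, 2 loads and no predicate ops give
       (6 + 2 + 0 + 1) / 5 = 1.8 rename cycles per iteration.  */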
15810 return { general_ops + loads + pred_ops + 1, 5 };
15811
15812 return 0;
15813}
15814
15815/* Like min_cycles_per_iter, but excluding predicate operations. */
15816fractional_cost
15817aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15818{
15819 auto *issue_info = base_issue_info ();
15820
15821 fractional_cost cycles = MAX (reduction_latency, 1);
15822 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15823 cycles = std::max (cycles, { loads + stores,
15824 issue_info->loads_stores_per_cycle });
15825 cycles = std::max (cycles, { general_ops,
15826 issue_info->general_ops_per_cycle });
15827 cycles = std::max (cycles, rename_cycles_per_iter ());
15828 return cycles;
15829}
15830
15831/* Like min_cycles_per_iter, but including only the predicate operations. */
15832fractional_cost
15833aarch64_vec_op_count::min_pred_cycles_per_iter () const
15834{
15835 if (auto *issue_info = sve_issue_info ())
15836 return { pred_ops, issue_info->pred_ops_per_cycle };
15837 return 0;
15838}
15839
15840/* Estimate the minimum number of cycles needed to issue the operations.
15841 This is a very simplistic model! */
15842fractional_cost
15843aarch64_vec_op_count::min_cycles_per_iter () const
15844{
15845 return std::max (min_nonpred_cycles_per_iter (),
15846 min_pred_cycles_per_iter ());
15847}
15848
15849/* Dump information about the structure. */
15850void
15851aarch64_vec_op_count::dump () const
15852{
15853 dump_printf_loc (MSG_NOTE, vect_location,
15854 " load operations = %d\n", loads);
15855 dump_printf_loc (MSG_NOTE, vect_location,
15856 " store operations = %d\n", stores);
15857 dump_printf_loc (MSG_NOTE, vect_location,
15858 " general operations = %d\n", general_ops);
15859 if (sve_issue_info ())
15860 dump_printf_loc (MSG_NOTE, vect_location,
15861 " predicate operations = %d\n", pred_ops);
15862 dump_printf_loc (MSG_NOTE, vect_location,
15863 " reduction latency = %d\n", reduction_latency);
15864 if (auto rcpi = rename_cycles_per_iter ())
15865 dump_printf_loc (MSG_NOTE, vect_location,
15866 " estimated cycles per iteration to rename = %f\n",
15867 rcpi.as_double ());
15868 if (auto pred_cpi = min_pred_cycles_per_iter ())
15869 {
15870 dump_printf_loc (MSG_NOTE, vect_location,
15871 " estimated min cycles per iteration"
15872 " without predication = %f\n",
15873 min_nonpred_cycles_per_iter ().as_double ());
15874 dump_printf_loc (MSG_NOTE, vect_location,
15875 " estimated min cycles per iteration"
15876 " for predication = %f\n", pred_cpi.as_double ());
15877 }
15878 if (auto cpi = min_cycles_per_iter ())
15879 dump_printf_loc (MSG_NOTE, vect_location,
15880 " estimated min cycles per iteration = %f\n",
15881 cpi.as_double ());
15882}
15883
50a525b5 15884/* Information about vector code that we're in the process of costing. */
d43fc1df 15885class aarch64_vector_costs : public vector_costs
50a525b5 15886{
d43fc1df 15887public:
15aba5a6 15888 aarch64_vector_costs (vec_info *, bool);
6239dd05
RS
15889
15890 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
f24dfc76 15891 stmt_vec_info stmt_info, slp_tree, tree vectype,
6239dd05
RS
15892 int misalign,
15893 vect_cost_model_location where) override;
0612883d 15894 void finish_cost (const vector_costs *) override;
c6c5c5eb 15895 bool better_main_loop_than_p (const vector_costs *other) const override;
3b924b0d 15896
d43fc1df
RS
15897private:
15898 void record_potential_advsimd_unrolling (loop_vec_info);
15899 void analyze_loop_vinfo (loop_vec_info);
87fcff96
RS
15900 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15901 aarch64_vec_op_count *);
1a5288fe 15902 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
c6c5c5eb
RS
15903 fractional_cost, unsigned int,
15904 unsigned int *, bool *);
6756706e
RS
15905 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15906 unsigned int);
c6c5c5eb 15907 bool prefer_unrolled_loop () const;
2636660b 15908 unsigned int determine_suggested_unroll_factor ();
d43fc1df
RS
15909
15910 /* True if we have performed one-time initialization based on the
15911 vec_info. */
15912 bool m_analyzed_vinfo = false;
3b924b0d 15913
40d643d8
AV
15914 /* This loop uses an average operation that is not supported by SVE, but is
15915 supported by Advanced SIMD and SVE2. */
15916 bool m_has_avg = false;
15917
d43fc1df
RS
15918 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15919 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
3b924b0d 15920 SIMD code.
d43fc1df
RS
15921 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15922 unsigned int m_vec_flags = 0;
3b924b0d 15923
49630797
RS
15924 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15925 This means that code such as:
15926
15927 a[0] = x;
15928 a[1] = x;
15929
15930 will be costed as two scalar instructions and two vector instructions
15931 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15932 wins if the costs are equal, because of the fact that the vector costs
15933 include constant initializations whereas the scalar costs don't.
15934 We would therefore tend to vectorize the code above, even though
15935 the scalar version can use a single STP.
15936
15937 We should eventually fix this and model LDP and STP in the main costs;
15938 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15939 Until then, we look specifically for code that does nothing more than
15940 STP-like operations. We cost them on that basis in addition to the
15941 normal latency-based costs.
15942
15943 If the scalar or vector code could be a sequence of STPs +
15944 initialization, this variable counts the cost of the sequence,
15945 with 2 units per instruction. The variable is ~0U for other
15946 kinds of code. */
15947 unsigned int m_stp_sequence_cost = 0;
15948
3b924b0d
RS
15949 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15950 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15951 situations, we try to predict whether an Advanced SIMD implementation
15952 of the loop could be completely unrolled and become straight-line code.
15953 If so, it is generally better to use the Advanced SIMD version rather
15954 than length-agnostic SVE, since the SVE loop would execute an unknown
15955 number of times and so could not be completely unrolled in the same way.
15956
d43fc1df 15957 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
3b924b0d 15958 number of Advanced SIMD loop iterations that would be unrolled and
d43fc1df 15959 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
3b924b0d
RS
15960 in the unrolled loop. Both values are zero if we're not applying
15961 the heuristic. */
d43fc1df
RS
15962 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15963 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
1205a8ca
RS
15964
15965 /* If we're vectorizing a loop that executes a constant number of times,
15966 this variable gives the number of times that the vector loop would
15967 iterate, otherwise it is zero. */
d43fc1df 15968 uint64_t m_num_vector_iterations = 0;
1205a8ca 15969
6756706e
RS
15970 /* Used only when vectorizing loops. Estimates the number and kind of
15971 operations that would be needed by one iteration of the scalar
1a5288fe
RS
15972 or vector loop. There is one entry for each tuning option of
15973 interest. */
15974 auto_vec<aarch64_vec_op_count, 2> m_ops;
50a525b5
RS
15975};
15976
15aba5a6
RS
15977aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15978 bool costing_for_scalar)
15979 : vector_costs (vinfo, costing_for_scalar),
6756706e 15980 m_vec_flags (costing_for_scalar ? 0
1a5288fe 15981 : aarch64_classify_vector_mode (vinfo->vector_mode))
15aba5a6 15982{
1a5288fe
RS
15983 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15984 {
15985 m_ops.quick_push ({ issue_info, m_vec_flags });
1a5288fe 15986 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
2e1886ea
RS
15987 {
15988 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15989 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15990 vf_factor });
15991 }
1a5288fe 15992 }
15aba5a6
RS
15993}
15994
6239dd05
RS
15995/* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15996vector_costs *
15997aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
50a525b5 15998{
6239dd05 15999 return new aarch64_vector_costs (vinfo, costing_for_scalar);
50a525b5
RS
16000}
16001
e253bb8b
RS
16002/* Return true if the current CPU should use the new costs defined
16003 in GCC 11. This should be removed for GCC 12 and above, with the
16004 costs applying to all CPUs instead. */
16005static bool
16006aarch64_use_new_vector_costs_p ()
16007{
16008 return (aarch64_tune_params.extra_tuning_flags
16009 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16010}
16011
16012/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16013static const simd_vec_cost *
16014aarch64_simd_vec_costs (tree vectype)
16015{
16016 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16017 if (vectype != NULL
16018 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16019 && costs->sve != NULL)
16020 return costs->sve;
16021 return costs->advsimd;
16022}
16023
1205a8ca
RS
16024/* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16025static const simd_vec_cost *
16026aarch64_simd_vec_costs_for_flags (unsigned int flags)
16027{
16028 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16029 if ((flags & VEC_ANY_SVE) && costs->sve)
16030 return costs->sve;
16031 return costs->advsimd;
16032}
16033
902b7c9e
RS
16034/* If STMT_INFO is a memory reference, return the scalar memory type,
16035 otherwise return null. */
16036static tree
16037aarch64_dr_type (stmt_vec_info stmt_info)
16038{
16039 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16040 return TREE_TYPE (DR_REF (dr));
16041 return NULL_TREE;
16042}
16043
3b924b0d 16044/* Decide whether to use the unrolling heuristic described above
d43fc1df
RS
16045 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16046 describes the loop that we're vectorizing. */
16047void
16048aarch64_vector_costs::
16049record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
3b924b0d
RS
16050{
16051 /* The heuristic only makes sense on targets that have the same
16052 vector throughput for SVE and Advanced SIMD. */
16053 if (!(aarch64_tune_params.extra_tuning_flags
16054 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16055 return;
16056
16057 /* We only want to apply the heuristic if LOOP_VINFO is being
16058 vectorized for SVE. */
d43fc1df 16059 if (!(m_vec_flags & VEC_ANY_SVE))
3b924b0d
RS
16060 return;
16061
16062 /* Check whether it is possible in principle to use Advanced SIMD
16063 instead. */
16064 if (aarch64_autovec_preference == 2)
16065 return;
16066
16067 /* We don't want to apply the heuristic to outer loops, since it's
16068 harder to track two levels of unrolling. */
16069 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16070 return;
16071
16072 /* Only handle cases in which the number of Advanced SIMD iterations
16073 would be known at compile time but the number of SVE iterations
16074 would not. */
16075 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16076 || aarch64_sve_vg.is_constant ())
16077 return;
16078
16079 /* Guess how many times the Advanced SIMD loop would iterate and make
16080 sure that it is within the complete unrolling limit. Even if the
16081 number of iterations is small enough, the number of statements might
16082 not be, which is why we need to estimate the number of statements too. */
16083 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16084 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16085 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16086 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16087 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16088 return;
16089
16090 /* Record that we're applying the heuristic and should try to estimate
16091 the number of statements in the Advanced SIMD loop. */
d43fc1df 16092 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
3b924b0d
RS
16093}
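/* Illustrative, self-contained sketch (not part of GCC): the arithmetic
   behind the heuristic above, using plain integers in place of the
   loop_vec_info and --param machinery.  The names sve_vf, estimated_vq,
   niters and max_peel_times are hypothetical stand-ins for the values
   read from LOOP_VINFO and the complete-peeling parameter.  */

#include <cstdint>

/* Return the estimated number of unrolled Advanced SIMD iterations,
   or 0 if the heuristic should not apply because the unroll count
   would exceed the complete-peeling limit.  */
static inline uint64_t
estimate_unrolled_advsimd_niters (uint64_t sve_vf, uint64_t estimated_vq,
				  uint64_t niters, uint64_t max_peel_times)
{
  /* advsimd_vf = CEIL (sve_vf, estimated_vq), as above.  */
  uint64_t advsimd_vf = (sve_vf + estimated_vq - 1) / estimated_vq;
  uint64_t unrolled_niters = niters / advsimd_vf;
  return unrolled_niters > max_peel_times ? 0 : unrolled_niters;
}

/* Example: an SVE VF of 8 on a target estimated at 2 quadwords gives an
   Advanced SIMD VF of 4, so 60 scalar iterations unroll to 15 copies,
   which fits a hypothetical limit of 16.  */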
16094
d43fc1df
RS
16095/* Do one-time initialization of the aarch64_vector_costs given that we're
16096 costing the loop vectorization described by LOOP_VINFO. */
16097void
16098aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
3b924b0d 16099{
1205a8ca
RS
16100 /* Record the number of times that the vector loop would execute,
16101 if known. */
16102 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16103 auto scalar_niters = max_stmt_executions_int (loop);
16104 if (scalar_niters >= 0)
16105 {
16106 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16107 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
d43fc1df 16108 m_num_vector_iterations = scalar_niters / vf;
1205a8ca 16109 else
d43fc1df 16110 m_num_vector_iterations = CEIL (scalar_niters, vf);
1205a8ca
RS
16111 }
16112
d43fc1df
RS
16113 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16114 heuristic described above m_unrolled_advsimd_niters. */
16115 record_potential_advsimd_unrolling (loop_vinfo);
1205a8ca
RS
16116
16117 /* Record the issue information for any SVE WHILE instructions that the
16118 loop needs. */
1a5288fe 16119 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1205a8ca
RS
16120 {
16121 unsigned int num_masks = 0;
16122 rgroup_controls *rgm;
16123 unsigned int num_vectors_m1;
16124 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16125 if (rgm->type)
16126 num_masks += num_vectors_m1 + 1;
1a5288fe
RS
16127 for (auto &ops : m_ops)
16128 if (auto *issue = ops.sve_issue_info ())
16129 ops.pred_ops += num_masks * issue->while_pred_ops;
1205a8ca 16130 }
3b924b0d
RS
16131}
16132
8990e73a
TB
16133/* Implement targetm.vectorize.builtin_vectorization_cost. */
16134static int
16135aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16136 tree vectype,
16137 int misalign ATTRIBUTE_UNUSED)
16138{
16139 unsigned elements;
cd8ae5ed
AP
16140 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16141 bool fp = false;
16142
16143 if (vectype != NULL)
16144 fp = FLOAT_TYPE_P (vectype);
8990e73a 16145
e253bb8b 16146 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
76e4f444 16147
8990e73a
TB
16148 switch (type_of_cost)
16149 {
16150 case scalar_stmt:
cd8ae5ed 16151 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8990e73a
TB
16152
16153 case scalar_load:
cd8ae5ed 16154 return costs->scalar_load_cost;
8990e73a
TB
16155
16156 case scalar_store:
cd8ae5ed 16157 return costs->scalar_store_cost;
8990e73a
TB
16158
16159 case vector_stmt:
76e4f444
KT
16160 return fp ? simd_costs->fp_stmt_cost
16161 : simd_costs->int_stmt_cost;
8990e73a
TB
16162
16163 case vector_load:
76e4f444 16164 return simd_costs->align_load_cost;
8990e73a
TB
16165
16166 case vector_store:
76e4f444 16167 return simd_costs->store_cost;
8990e73a
TB
16168
16169 case vec_to_scalar:
76e4f444 16170 return simd_costs->vec_to_scalar_cost;
8990e73a
TB
16171
16172 case scalar_to_vec:
76e4f444 16173 return simd_costs->scalar_to_vec_cost;
8990e73a
TB
16174
16175 case unaligned_load:
cc9fe6bb 16176 case vector_gather_load:
76e4f444 16177 return simd_costs->unalign_load_cost;
8990e73a
TB
16178
16179 case unaligned_store:
cc9fe6bb 16180 case vector_scatter_store:
76e4f444 16181 return simd_costs->unalign_store_cost;
8990e73a
TB
16182
16183 case cond_branch_taken:
cd8ae5ed 16184 return costs->cond_taken_branch_cost;
8990e73a
TB
16185
16186 case cond_branch_not_taken:
cd8ae5ed 16187 return costs->cond_not_taken_branch_cost;
8990e73a
TB
16188
16189 case vec_perm:
76e4f444 16190 return simd_costs->permute_cost;
c428f91c 16191
8990e73a 16192 case vec_promote_demote:
76e4f444
KT
16193 return fp ? simd_costs->fp_stmt_cost
16194 : simd_costs->int_stmt_cost;
8990e73a
TB
16195
16196 case vec_construct:
6a70badb 16197 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
8990e73a
TB
16198 return elements / 2 + 1;
16199
16200 default:
16201 gcc_unreachable ();
16202 }
16203}
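/* Illustrative sketch (not part of GCC): the vec_construct fallback above
   charges elements / 2 + 1, where ELEMENTS is the estimated number of
   vector lanes.  */

static inline int
vec_construct_estimate (unsigned int elements)
{
  /* e.g. 4 lanes -> 3, 8 lanes -> 5.  */
  return elements / 2 + 1;
}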
16204
b1a831f0
RS
16205/* If an access of kind KIND for STMT_INFO represents one vector of an
16206 LD[234] or ST[234] operation, return the total number of vectors
16207 involved (2, 3 or 4), otherwise return 0. */
16208static int
16209aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16210{
16211 if ((kind == vector_load
16212 || kind == unaligned_load
16213 || kind == vector_store
16214 || kind == unaligned_store)
16215 && STMT_VINFO_DATA_REF (stmt_info))
16216 {
16217 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16218 if (stmt_info
16219 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16220 return DR_GROUP_SIZE (stmt_info);
16221 }
16222 return 0;
16223}
16224
8b50d7a4
RS
16225/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16226 vectors would produce a series of LDP or STP operations. KIND is the
16227 kind of statement that STMT_INFO represents. */
16228static bool
16229aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16230 stmt_vec_info stmt_info)
16231{
16232 switch (kind)
16233 {
16234 case vector_load:
16235 case vector_store:
16236 case unaligned_load:
16237 case unaligned_store:
16238 break;
16239
16240 default:
16241 return false;
16242 }
16243
16244 if (aarch64_tune_params.extra_tuning_flags
16245 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16246 return false;
16247
16248 return is_gimple_assign (stmt_info->stmt);
16249}
16250
1205a8ca
RS
16251/* Return true if STMT_INFO is the second part of a two-statement multiply-add
16252 or multiply-subtract sequence that might be suitable for fusing into a
028059b4
RS
16253 single instruction. If VEC_FLAGS is zero, analyze the operation as
16254 a scalar one, otherwise analyze it as an operation on vectors with those
16255 VEC_* flags. */
1205a8ca 16256static bool
028059b4
RS
16257aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16258 unsigned int vec_flags)
1205a8ca
RS
16259{
16260 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16261 if (!assign)
16262 return false;
16263 tree_code code = gimple_assign_rhs_code (assign);
16264 if (code != PLUS_EXPR && code != MINUS_EXPR)
16265 return false;
16266
16267 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16268 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16269 return false;
16270
16271 for (int i = 1; i < 3; ++i)
16272 {
16273 tree rhs = gimple_op (assign, i);
16274 /* ??? Should we try to check for a single use as well? */
16275 if (TREE_CODE (rhs) != SSA_NAME)
16276 continue;
16277
16278 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16279 if (!def_stmt_info
16280 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16281 continue;
16282 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16283 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16284 continue;
16285
028059b4
RS
16286 if (vec_flags & VEC_ADVSIMD)
16287 {
16288 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16289 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16290 only supports MLA forms, so will require a move if the result
16291 cannot be tied to the accumulator. The most important case in
16292 which this is true is when the accumulator input is invariant. */
16293 rhs = gimple_op (assign, 3 - i);
16294 if (TREE_CODE (rhs) != SSA_NAME)
16295 return false;
16296 def_stmt_info = vinfo->lookup_def (rhs);
16297 if (!def_stmt_info
16298 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16299 return false;
16300 }
16301
1205a8ca
RS
16302 return true;
16303 }
16304 return false;
16305}
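/* Illustrative example (not part of GCC): the kind of source loop whose
   add statement the function above recognizes.  The multiply and the
   accumulate are separate gimple statements, but they are expected to
   fuse into a single FMLA/MLA, which is why count_ops (below) does not
   count the add separately.  */

void
fma_like_loop (float *acc, const float *a, const float *b, int n)
{
  for (int i = 0; i < n; ++i)
    acc[i] += a[i] * b[i];	/* MULT_EXPR feeding a PLUS_EXPR */
}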
16306
26122469
RS
16307/* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16308 in-loop reduction that SVE supports directly, return its latency in cycles,
16309 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16310 instructions. */
1282988b
RS
16311static unsigned int
16312aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16313 stmt_vec_info stmt_info,
1282988b
RS
16314 const sve_vec_cost *sve_costs)
16315{
783d809f 16316 switch (vect_reduc_type (vinfo, stmt_info))
1282988b
RS
16317 {
16318 case EXTRACT_LAST_REDUCTION:
16319 return sve_costs->clast_cost;
16320
16321 case FOLD_LEFT_REDUCTION:
26122469 16322 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
1282988b
RS
16323 {
16324 case E_HFmode:
16325 case E_BFmode:
16326 return sve_costs->fadda_f16_cost;
16327
16328 case E_SFmode:
16329 return sve_costs->fadda_f32_cost;
16330
16331 case E_DFmode:
16332 return sve_costs->fadda_f64_cost;
16333
16334 default:
16335 break;
16336 }
16337 break;
16338 }
16339
16340 return 0;
16341}
16342
1205a8ca
RS
16343/* STMT_INFO describes a loop-carried operation in the original scalar code
16344 that we are considering implementing as a reduction. Return one of the
16345 following values, depending on VEC_FLAGS:
16346
16347 - If VEC_FLAGS is zero, return the loop carry latency of the original
16348 scalar operation.
16349
16350 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
eb55b5b0 16351 Advanced SIMD implementation.
1205a8ca
RS
16352
16353 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
26122469 16354 SVE implementation. */
1205a8ca
RS
16355static unsigned int
16356aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
26122469 16357 unsigned int vec_flags)
1205a8ca
RS
16358{
16359 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16360 const sve_vec_cost *sve_costs = nullptr;
16361 if (vec_flags & VEC_ANY_SVE)
16362 sve_costs = aarch64_tune_params.vec_costs->sve;
16363
16364 /* If the caller is asking for the SVE latency, check for forms of reduction
16365 that only SVE can handle directly. */
16366 if (sve_costs)
16367 {
16368 unsigned int latency
26122469 16369 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
1205a8ca
RS
16370 if (latency)
16371 return latency;
16372 }
16373
16374 /* Handle scalar costs. */
26122469 16375 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
1205a8ca
RS
16376 if (vec_flags == 0)
16377 {
26122469 16378 if (is_float)
1205a8ca
RS
16379 return vec_costs->scalar_fp_stmt_cost;
16380 return vec_costs->scalar_int_stmt_cost;
16381 }
16382
16383 /* Otherwise, the loop body just contains normal integer or FP operations,
16384 with a vector reduction outside the loop. */
16385 const simd_vec_cost *simd_costs
16386 = aarch64_simd_vec_costs_for_flags (vec_flags);
26122469 16387 if (is_float)
1205a8ca
RS
16388 return simd_costs->fp_stmt_cost;
16389 return simd_costs->int_stmt_cost;
16390}
16391
ed17ad5e
RS
16392/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16393 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16394 try to subdivide the target-independent categorization provided by KIND
16395 to get a more accurate cost. */
83d796d3 16396static fractional_cost
ed17ad5e
RS
16397aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16398 stmt_vec_info stmt_info,
83d796d3 16399 fractional_cost stmt_cost)
ed17ad5e
RS
16400{
16401 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16402 the extension with the load. */
783d809f 16403 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
ed17ad5e
RS
16404 return 0;
16405
16406 return stmt_cost;
16407}
16408
e253bb8b
RS
16409/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16410 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16411 when vectorized would operate on vector type VECTYPE. Try to subdivide
16412 the target-independent categorization provided by KIND to get a more
16413 accurate cost. WHERE specifies where the cost associated with KIND
16414 occurs. */
83d796d3 16415static fractional_cost
1282988b 16416aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
e253bb8b
RS
16417 stmt_vec_info stmt_info, tree vectype,
16418 enum vect_cost_model_location where,
83d796d3 16419 fractional_cost stmt_cost)
e253bb8b
RS
16420{
16421 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
1282988b
RS
16422 const sve_vec_cost *sve_costs = nullptr;
16423 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16424 sve_costs = aarch64_tune_params.vec_costs->sve;
16425
e4180ab2
RS
16426 /* It's generally better to avoid costing inductions, since the induction
16427 will usually be hidden by other operations. This is particularly true
16428 for things like COND_REDUCTIONS. */
16429 if (is_a<gphi *> (stmt_info->stmt))
16430 return 0;
16431
d1ff0847
RS
16432 /* Detect cases in which vec_to_scalar is describing the extraction of a
16433 vector element in preparation for a scalar store. The store itself is
16434 costed separately. */
783d809f 16435 if (vect_is_store_elt_extraction (kind, stmt_info))
d1ff0847
RS
16436 return simd_costs->store_elt_extra_cost;
16437
78770e0e
RS
16438 /* Detect SVE gather loads, which are costed as a single scalar_load
16439 for each element. We therefore need to divide the full-instruction
16440 cost by the number of elements in the vector. */
16441 if (kind == scalar_load
16442 && sve_costs
16443 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16444 {
16445 unsigned int nunits = vect_nunits_for_cost (vectype);
16446 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16447 return { sve_costs->gather_load_x64_cost, nunits };
16448 return { sve_costs->gather_load_x32_cost, nunits };
16449 }
16450
7c679969
RS
16451 /* Detect cases in which a scalar_store is really storing one element
16452 in a scatter operation. */
16453 if (kind == scalar_store
16454 && sve_costs
16455 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16456 return sve_costs->scatter_store_elt_cost;
16457
1282988b
RS
16458 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16459 if (kind == vec_to_scalar
16460 && where == vect_body
16461 && sve_costs)
16462 {
16463 unsigned int latency
26122469 16464 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
1282988b
RS
16465 if (latency)
16466 return latency;
16467 }
e253bb8b
RS
16468
16469 /* Detect cases in which vec_to_scalar represents a single reduction
16470 instruction like FADDP or MAXV. */
16471 if (kind == vec_to_scalar
16472 && where == vect_epilogue
783d809f 16473 && vect_is_reduction (stmt_info))
e253bb8b
RS
16474 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16475 {
16476 case E_QImode:
16477 return simd_costs->reduc_i8_cost;
16478
16479 case E_HImode:
16480 return simd_costs->reduc_i16_cost;
16481
16482 case E_SImode:
16483 return simd_costs->reduc_i32_cost;
16484
16485 case E_DImode:
16486 return simd_costs->reduc_i64_cost;
16487
16488 case E_HFmode:
16489 case E_BFmode:
16490 return simd_costs->reduc_f16_cost;
16491
16492 case E_SFmode:
16493 return simd_costs->reduc_f32_cost;
16494
16495 case E_DFmode:
16496 return simd_costs->reduc_f64_cost;
16497
16498 default:
16499 break;
16500 }
16501
16502 /* Otherwise stick with the original categorization. */
16503 return stmt_cost;
16504}
16505
217ccab8 16506/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
8b50d7a4
RS
16507 for STMT_INFO, which has cost kind KIND and which when vectorized would
16508 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16509 targets. */
83d796d3 16510static fractional_cost
308bc496 16511aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
8b50d7a4 16512 stmt_vec_info stmt_info, tree vectype,
83d796d3 16513 fractional_cost stmt_cost)
217ccab8
RS
16514{
16515 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16516 vector register size or number of units. Integer promotions of this
16517 type therefore map to SXT[BHW] or UXT[BHW].
16518
16519 Most loads have extending forms that can do the sign or zero extension
16520 on the fly. Optimistically assume that a load followed by an extension
16521 will fold to this form during combine, and that the extension therefore
16522 comes for free. */
783d809f 16523 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
217ccab8
RS
16524 stmt_cost = 0;
16525
2d56600c
RS
16526 /* For similar reasons, vector_stmt integer truncations are a no-op,
16527 because we can just ignore the unused upper bits of the source. */
783d809f 16528 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
2d56600c
RS
16529 stmt_cost = 0;
16530
8b50d7a4
RS
16531 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16532 but there are no equivalent instructions for SVE. This means that
16533 (all other things being equal) 128-bit SVE needs twice as many load
16534 and store instructions as Advanced SIMD in order to process vector pairs.
16535
16536 Also, scalar code can often use LDP and STP to access pairs of values,
16537 so it is too simplistic to say that one SVE load or store replaces
16538 VF scalar loads and stores.
16539
16540 Ideally we would account for this in the scalar and Advanced SIMD
16541 costs by making suitable load/store pairs as cheap as a single
16542 load/store. However, that would be a very invasive change and in
16543 practice it tends to stress other parts of the cost model too much.
16544 E.g. stores of scalar constants currently count just a store,
16545 whereas stores of vector constants count a store and a vec_init.
16546 This is an artificial distinction for AArch64, where stores of
16547 nonzero scalar constants need the same kind of register invariant
16548 as vector stores.
16549
16550 An alternative would be to double the cost of any SVE loads and stores
16551 that could be paired in Advanced SIMD (and possibly also paired in
16552 scalar code). But this tends to stress other parts of the cost model
16553 in the same way. It also means that we can fall back to Advanced SIMD
16554 even if full-loop predication would have been useful.
16555
16556 Here we go for a more conservative version: double the costs of SVE
16557 loads and stores if one iteration of the scalar loop processes enough
16558 elements for it to use a whole number of Advanced SIMD LDP or STP
16559 instructions. This makes it very likely that the VF would be 1 for
16560 Advanced SIMD, and so no epilogue should be needed. */
16561 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16562 {
16563 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16564 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16565 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16566 if (multiple_p (count * elt_bits, 256)
16567 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16568 stmt_cost *= 2;
16569 }
16570
217ccab8
RS
16571 return stmt_cost;
16572}
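/* Standalone sketch (not part of GCC) of the final check above: the SVE
   load/store cost is doubled when one scalar iteration of a grouped access
   covers a whole number of Advanced SIMD LDP/STP Q-register pairs, i.e. a
   multiple of 256 bits.  COUNT and ELT_BITS stand in for the DR_GROUP
   values used above, assuming constant element sizes.  */

static inline bool
covers_whole_ldp_stp_pairs (unsigned int count, unsigned int elt_bits)
{
  return (count * elt_bits) % 256 == 0;	/* multiple_p (count * elt_bits, 256) */
}

/* Example: a group of 8 x float (8 * 32 = 256 bits) forms exactly one STP
   of two 128-bit registers, so 128-bit SVE stores for it are costed x2.  */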
16573
b1a831f0
RS
16574/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16575 and which when vectorized would operate on vector type VECTYPE. Add the
16576 cost of any embedded operations. */
83d796d3 16577static fractional_cost
b1a831f0 16578aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
83d796d3 16579 tree vectype, fractional_cost stmt_cost)
b1a831f0
RS
16580{
16581 if (vectype)
16582 {
16583 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16584
16585 /* Detect cases in which a vector load or store represents an
16586 LD[234] or ST[234] instruction. */
16587 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16588 {
16589 case 2:
16590 stmt_cost += simd_costs->ld2_st2_permute_cost;
16591 break;
16592
16593 case 3:
16594 stmt_cost += simd_costs->ld3_st3_permute_cost;
16595 break;
16596
16597 case 4:
16598 stmt_cost += simd_costs->ld4_st4_permute_cost;
16599 break;
16600 }
99f94ae5
RS
16601
16602 if (kind == vector_stmt || kind == vec_to_scalar)
783d809f 16603 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
99f94ae5
RS
16604 {
16605 if (FLOAT_TYPE_P (cmp_type))
16606 stmt_cost += simd_costs->fp_stmt_cost;
16607 else
16608 stmt_cost += simd_costs->int_stmt_cost;
16609 }
b1a831f0
RS
16610 }
16611
99f94ae5 16612 if (kind == scalar_stmt)
783d809f 16613 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
99f94ae5
RS
16614 {
16615 if (FLOAT_TYPE_P (cmp_type))
16616 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16617 else
16618 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16619 }
16620
b1a831f0
RS
16621 return stmt_cost;
16622}
16623
87fcff96
RS
16624/* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16625 and they describe an operation in the body of a vector loop. Record issue
16626 information relating to the vector operation in OPS. */
d43fc1df
RS
16627void
16628aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
87fcff96
RS
16629 stmt_vec_info stmt_info,
16630 aarch64_vec_op_count *ops)
1205a8ca 16631{
15aba5a6
RS
16632 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16633 if (!base_issue)
1205a8ca 16634 return;
15aba5a6
RS
16635 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16636 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
1205a8ca
RS
16637
16638 /* Calculate the minimum cycles per iteration imposed by a reduction
16639 operation. */
6756706e 16640 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
783d809f 16641 && vect_is_reduction (stmt_info))
1205a8ca
RS
16642 {
16643 unsigned int base
87fcff96 16644 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
1205a8ca 16645
87fcff96
RS
16646 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16647 that's not yet the case. */
16648 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
1205a8ca
RS
16649 }
16650
16651 /* Assume that multiply-adds will become a single operation. */
87fcff96 16652 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
1205a8ca
RS
16653 return;
16654
1205a8ca
RS
16655 /* Count the basic operation cost associated with KIND. */
16656 switch (kind)
16657 {
16658 case cond_branch_taken:
16659 case cond_branch_not_taken:
16660 case vector_gather_load:
16661 case vector_scatter_store:
16662 /* We currently don't expect these to be used in a loop body. */
16663 break;
16664
16665 case vec_perm:
16666 case vec_promote_demote:
16667 case vec_construct:
16668 case vec_to_scalar:
16669 case scalar_to_vec:
1205a8ca
RS
16670 case vector_stmt:
16671 case scalar_stmt:
87fcff96 16672 ops->general_ops += count;
1205a8ca
RS
16673 break;
16674
16675 case scalar_load:
16676 case vector_load:
16677 case unaligned_load:
87fcff96
RS
16678 ops->loads += count;
16679 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16680 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
1205a8ca
RS
16681 break;
16682
16683 case vector_store:
16684 case unaligned_store:
16685 case scalar_store:
87fcff96
RS
16686 ops->stores += count;
16687 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16688 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
1205a8ca
RS
16689 break;
16690 }
16691
16692 /* Add any embedded comparison operations. */
16693 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
783d809f 16694 && vect_embedded_comparison_type (stmt_info))
87fcff96 16695 ops->general_ops += count;
1205a8ca 16696
87fcff96 16697 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
1205a8ca 16698 have only accounted for one. */
87fcff96
RS
16699 if ((kind == vector_stmt || kind == vec_to_scalar)
16700 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16701 ops->general_ops += count;
1205a8ca
RS
16702
16703 /* Count the predicate operations needed by an SVE comparison. */
16704 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
783d809f 16705 if (tree type = vect_comparison_type (stmt_info))
1205a8ca
RS
16706 {
16707 unsigned int base = (FLOAT_TYPE_P (type)
16708 ? sve_issue->fp_cmp_pred_ops
16709 : sve_issue->int_cmp_pred_ops);
87fcff96 16710 ops->pred_ops += base * count;
1205a8ca
RS
16711 }
16712
16713 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16714 if (simd_issue)
16715 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16716 {
16717 case 2:
87fcff96 16718 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
1205a8ca
RS
16719 break;
16720
16721 case 3:
87fcff96 16722 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
1205a8ca
RS
16723 break;
16724
16725 case 4:
87fcff96 16726 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
1205a8ca
RS
16727 break;
16728 }
16729
16730 /* Add any overhead associated with gather loads and scatter stores. */
16731 if (sve_issue
16732 && (kind == scalar_load || kind == scalar_store)
16733 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16734 {
16735 unsigned int pairs = CEIL (count, 2);
15aba5a6 16736 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
1205a8ca
RS
16737 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16738 }
16739}
16740
49630797
RS
16741/* Return true if STMT_INFO contains a memory access and if the constant
16742 component of the memory address is aligned to SIZE bytes. */
16743static bool
16744aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16745 poly_uint64 size)
16746{
16747 if (!STMT_VINFO_DATA_REF (stmt_info))
16748 return false;
16749
16750 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16751 stmt_info = first_stmt;
16752 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16753 /* Needed for gathers & scatters, for example. */
16754 if (!constant_offset)
16755 return false;
16756
16757 return multiple_p (wi::to_poly_offset (constant_offset), size);
16758}
16759
16760/* Check if a scalar or vector stmt could be part of a region of code
16761 that does nothing more than store values to memory, in the scalar
16762 case using STP. Return the cost of the stmt if so, counting 2 for
16763 one instruction. Return ~0U otherwise.
16764
16765 The arguments are a subset of those passed to add_stmt_cost. */
16766unsigned int
16767aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16768 stmt_vec_info stmt_info, tree vectype)
16769{
16770 /* Code that stores vector constants uses a vector_load to create
16771 the constant. We don't apply the heuristic to that case for two
16772 main reasons:
16773
16774 - At the moment, STPs are only formed via peephole2, and the
16775 constant scalar moves would often come between STRs and so
16776 prevent STP formation.
16777
16778 - The scalar code also has to load the constant somehow, and that
16779 isn't costed. */
16780 switch (kind)
16781 {
16782 case scalar_to_vec:
16783 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16784 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16785
16786 case vec_construct:
16787 if (FLOAT_TYPE_P (vectype))
16788 /* Count 1 insn for the maximum number of FP->SIMD INS
16789 instructions. */
16790 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16791
16792 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16793 maximum number of GPR->SIMD INS instructions. */
16794 return vect_nunits_for_cost (vectype) * 4 * count;
16795
16796 case vector_store:
16797 case unaligned_store:
16798 /* Count 1 insn per vector if we can't form STP Q pairs. */
16799 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16800 return count * 2;
16801 if (aarch64_tune_params.extra_tuning_flags
16802 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16803 return count * 2;
16804
16805 if (stmt_info)
16806 {
16807 /* Assume we won't be able to use STP if the constant offset
16808 component of the address is misaligned. ??? This could be
16809 removed if we formed STP pairs earlier, rather than relying
16810 on peephole2. */
16811 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16812 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16813 return count * 2;
16814 }
16815 return CEIL (count, 2) * 2;
16816
16817 case scalar_store:
16818 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16819 {
16820 /* Check for a mode in which STP pairs can be formed. */
16821 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16822 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16823 return ~0U;
16824
16825 /* Assume we won't be able to use STP if the constant offset
16826 component of the address is misaligned. ??? This could be
16827 removed if we formed STP pairs earlier, rather than relying
16828 on peephole2. */
16829 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16830 return ~0U;
16831 }
16832 return count;
16833
16834 default:
16835 return ~0U;
16836 }
16837}
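/* Standalone sketch (not part of GCC) of the convention used above: one
   instruction costs 2 units, so COUNT pairable stores that are expected to
   merge into STPs cost CEIL (COUNT, 2) * 2.  */

static inline unsigned int
paired_store_cost (unsigned int count)
{
  /* Each STP covers two stores; a leftover store needs its own insn.  */
  return ((count + 1) / 2) * 2;
}

/* Example: 3 stores -> 2 instructions -> cost 4; without STP formation the
   same stores cost 3 * 2 = 6.  */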
16838
6239dd05
RS
16839unsigned
16840aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
f24dfc76
RB
16841 stmt_vec_info stmt_info, slp_tree,
16842 tree vectype, int misalign,
6239dd05 16843 vect_cost_model_location where)
8990e73a 16844{
f837785c
RS
16845 fractional_cost stmt_cost
16846 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
3b924b0d 16847
f837785c
RS
16848 bool in_inner_loop_p = (where == vect_body
16849 && stmt_info
6239dd05 16850 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
9690309b 16851
f837785c 16852 /* Do one-time initialization based on the vinfo. */
6239dd05 16853 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
d43fc1df 16854 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
f837785c
RS
16855 {
16856 if (loop_vinfo)
d43fc1df
RS
16857 analyze_loop_vinfo (loop_vinfo);
16858
16859 m_analyzed_vinfo = true;
f837785c
RS
16860 }
16861
49630797
RS
16862 /* Apply the heuristic described above m_stp_sequence_cost. */
16863 if (m_stp_sequence_cost != ~0U)
16864 {
16865 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16866 stmt_info, vectype);
16867 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16868 }
16869
f837785c
RS
16870 /* Try to get a more accurate cost by looking at STMT_INFO instead
16871 of just looking at KIND. */
16872 if (stmt_info && aarch64_use_new_vector_costs_p ())
16873 {
f837785c
RS
16874 /* If we scalarize a strided store, the vectorizer costs one
16875 vec_to_scalar for each element. However, we can store the first
16876 element using an FP store without a separate extract step. */
16877 if (vect_is_store_elt_extraction (kind, stmt_info))
16878 count -= 1;
16879
d43fc1df
RS
16880 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16881 stmt_info, stmt_cost);
f837785c 16882
d43fc1df 16883 if (vectype && m_vec_flags)
6239dd05 16884 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
f837785c
RS
16885 stmt_info, vectype,
16886 where, stmt_cost);
16887 }
16888
16889 /* Do any SVE-specific adjustments to the cost. */
16890 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
6239dd05 16891 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
f837785c
RS
16892 vectype, stmt_cost);
16893
16894 if (stmt_info && aarch64_use_new_vector_costs_p ())
16895 {
16896 /* Account for any extra "embedded" costs that apply additively
16897 to the base cost calculated above. */
16898 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16899 stmt_cost);
16900
16901 /* If we're recording a nonzero vector loop body cost for the
16902 innermost loop, also estimate the operations that would need
16903 to be issued by all relevant implementations of the loop. */
f837785c 16904 if (loop_vinfo
6756706e 16905 && (m_costing_for_scalar || where == vect_body)
f837785c 16906 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
f837785c 16907 && stmt_cost != 0)
87fcff96
RS
16908 for (auto &ops : m_ops)
16909 count_ops (count, kind, stmt_info, &ops);
b1a831f0 16910
f837785c
RS
16911 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16912 estimate the number of statements in the unrolled Advanced SIMD
16913 loop. For simplicity, we assume that one iteration of the
16914 Advanced SIMD loop would need the same number of statements
16915 as one iteration of the SVE loop. */
d43fc1df
RS
16916 if (where == vect_body && m_unrolled_advsimd_niters)
16917 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
40d643d8
AV
16918
16919 /* Detect the use of an averaging operation. */
16920 gimple *stmt = stmt_info->stmt;
16921 if (is_gimple_call (stmt)
16922 && gimple_call_internal_p (stmt))
16923 {
16924 switch (gimple_call_internal_fn (stmt))
16925 {
16926 case IFN_AVG_FLOOR:
16927 case IFN_AVG_CEIL:
16928 m_has_avg = true;
16929 default:
16930 break;
16931 }
16932 }
f837785c 16933 }
6239dd05 16934 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
8990e73a
TB
16935}
16936
c6c5c5eb
RS
16937/* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16938 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16939 says that we should prefer the Advanced SIMD loop. */
16940bool
16941aarch64_vector_costs::prefer_unrolled_loop () const
16942{
16943 if (!m_unrolled_advsimd_stmts)
16944 return false;
16945
16946 if (dump_enabled_p ())
16947 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
b98c5262
TC
16948 " unrolled Advanced SIMD loop = "
16949 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
c6c5c5eb
RS
16950 m_unrolled_advsimd_stmts);
16951
16952 /* The balance here is tricky. On the one hand, we can't be sure whether
16953 the code is vectorizable with Advanced SIMD or not. However, even if
16954 it isn't vectorizable with Advanced SIMD, there's a possibility that
16955 the scalar code could also be unrolled. Some of the code might then
16956 benefit from SLP, or from using LDP and STP. We therefore apply
16957 the heuristic regardless of can_use_advsimd_p. */
16958 return (m_unrolled_advsimd_stmts
16959 && (m_unrolled_advsimd_stmts
16960 <= (unsigned int) param_max_completely_peeled_insns));
16961}
16962
d43fc1df
RS
16963/* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16964 how fast the SVE code can be issued and compare it to the equivalent value
16965 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16966 also compare it to the issue rate of Advanced SIMD code
16967 (ADVSIMD_CYCLES_PER_ITER).
b585f011 16968
d43fc1df
RS
16969 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16970 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16971 is true if we think the loop body is too expensive. */
b585f011 16972
d43fc1df
RS
16973fractional_cost
16974aarch64_vector_costs::
1a5288fe 16975adjust_body_cost_sve (const aarch64_vec_op_count *ops,
d43fc1df 16976 fractional_cost scalar_cycles_per_iter,
c6c5c5eb
RS
16977 unsigned int orig_body_cost, unsigned int *body_cost,
16978 bool *should_disparage)
b585f011 16979{
a82ffd43
RS
16980 if (dump_enabled_p ())
16981 ops->dump ();
048039c4 16982
a82ffd43
RS
16983 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16984 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
b585f011
RS
16985
16986 /* If the scalar version of the loop could issue at least as
16987 quickly as the predicate parts of the SVE loop, make the SVE loop
16988 prohibitively expensive. In this case vectorization is adding an
16989 overhead that the original scalar code didn't have.
16990
16991 This is mostly intended to detect cases in which WHILELOs dominate
16992 for very tight loops, which is something that normal latency-based
16993 costs would not model. Adding this kind of cliffedge would be
16994 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16995 code in the caller handles that case in a more conservative way. */
a82ffd43 16996 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
b585f011
RS
16997 if (scalar_cycles_per_iter < sve_estimate)
16998 {
16999 unsigned int min_cost
17000 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17001 if (*body_cost < min_cost)
17002 {
17003 if (dump_enabled_p ())
17004 dump_printf_loc (MSG_NOTE, vect_location,
17005 "Increasing body cost to %d because the"
17006 " scalar code could issue within the limit"
17007 " imposed by predicate operations\n",
17008 min_cost);
17009 *body_cost = min_cost;
17010 *should_disparage = true;
17011 }
17012 }
17013
b585f011
RS
17014 return sve_cycles_per_iter;
17015}
17016
40d643d8 17017unsigned int
2636660b 17018aarch64_vector_costs::determine_suggested_unroll_factor ()
40d643d8
AV
17019{
17020 bool sve = m_vec_flags & VEC_ANY_SVE;
17021 /* If we are trying to unroll an Advanced SIMD main loop that contains
17022 an averaging operation that we do not support with SVE and we might use a
17023 predicated epilogue, we need to be conservative and block unrolling as
17024 this might lead to a less optimal loop for the first and only epilogue
17025 using the original loop's vectorization factor.
17026 TODO: Remove this constraint when we add support for multiple epilogue
17027 vectorization. */
17028 if (!sve && !TARGET_SVE2 && m_has_avg)
17029 return 1;
17030
17031 unsigned int max_unroll_factor = 1;
17032 for (auto vec_ops : m_ops)
17033 {
17034 aarch64_simd_vec_issue_info const *vec_issue
17035 = vec_ops.simd_issue_info ();
17036 if (!vec_issue)
17037 return 1;
17038 /* Limit unroll factor to a value adjustable by the user, the default
17039 value is 4. */
2636660b 17040 unsigned int unroll_factor = aarch64_vect_unroll_limit;
40d643d8
AV
17041 unsigned int factor
17042 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17043 unsigned int temp;
17044
17045 /* Sanity check, this should never happen. */
17046 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17047 return 1;
17048
17049 /* Check stores. */
17050 if (vec_ops.stores > 0)
17051 {
17052 temp = CEIL (factor * vec_issue->stores_per_cycle,
17053 vec_ops.stores);
17054 unroll_factor = MIN (unroll_factor, temp);
17055 }
17056
17057 /* Check loads + stores. */
17058 if (vec_ops.loads > 0)
17059 {
17060 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17061 vec_ops.loads + vec_ops.stores);
17062 unroll_factor = MIN (unroll_factor, temp);
17063 }
17064
17065 /* Check general ops. */
17066 if (vec_ops.general_ops > 0)
17067 {
17068 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17069 vec_ops.general_ops);
17070 unroll_factor = MIN (unroll_factor, temp);
17071 }
17072 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17073 }
17074
17075 /* Make sure unroll factor is power of 2. */
17076 return 1 << ceil_log2 (max_unroll_factor);
17077}
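/* Standalone sketch (not part of GCC) of the throughput arithmetic above:
   for each resource the unroll factor is capped at
   CEIL (reduction_latency * per_cycle_throughput, ops_of_that_kind), and
   the result is rounded up to a power of two.  The structures below are
   hypothetical simplifications of aarch64_vec_op_count and the issue-info
   tables, covering a single m_ops entry.  */

#include <algorithm>

struct issue_rates_t
{
  unsigned int stores_per_cycle;
  unsigned int loads_stores_per_cycle;
  unsigned int general_ops_per_cycle;
};

struct op_counts_t
{
  unsigned int loads, stores, general_ops, reduction_latency;
};

static inline unsigned int
ceil_pow2 (unsigned int x)
{
  unsigned int p = 1;
  while (p < x)
    p <<= 1;
  return p;
}

static inline unsigned int
suggested_unroll (const op_counts_t &ops, const issue_rates_t &rates,
		  unsigned int user_limit)
{
  unsigned int factor = std::max (ops.reduction_latency, 1u);
  unsigned int unroll = user_limit;
  auto cap = [&unroll, factor] (unsigned int per_cycle, unsigned int nops)
    {
      if (nops)
	unroll = std::min (unroll, (factor * per_cycle + nops - 1) / nops);
    };
  cap (rates.stores_per_cycle, ops.stores);
  cap (rates.loads_stores_per_cycle, ops.loads + ops.stores);
  cap (rates.general_ops_per_cycle, ops.general_ops);
  return ceil_pow2 (unroll);
}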
17078
d43fc1df
RS
17079/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17080 and return the new cost. */
17081unsigned int
6756706e
RS
17082aarch64_vector_costs::
17083adjust_body_cost (loop_vec_info loop_vinfo,
17084 const aarch64_vector_costs *scalar_costs,
17085 unsigned int body_cost)
3b924b0d 17086{
1a5288fe
RS
17087 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17088 return body_cost;
17089
17090 const auto &scalar_ops = scalar_costs->m_ops[0];
17091 const auto &vector_ops = m_ops[0];
6756706e 17092 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
3b924b0d 17093 unsigned int orig_body_cost = body_cost;
1205a8ca
RS
17094 bool should_disparage = false;
17095
17096 if (dump_enabled_p ())
17097 dump_printf_loc (MSG_NOTE, vect_location,
17098 "Original vector body cost = %d\n", body_cost);
3b924b0d 17099
83d796d3 17100 fractional_cost scalar_cycles_per_iter
a82ffd43 17101 = scalar_ops.min_cycles_per_iter () * estimated_vf;
83d796d3 17102
a82ffd43 17103 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
1205a8ca
RS
17104
17105 if (dump_enabled_p ())
17106 {
d43fc1df 17107 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
1205a8ca
RS
17108 dump_printf_loc (MSG_NOTE, vect_location,
17109 "Vector loop iterates at most %wd times\n",
d43fc1df 17110 m_num_vector_iterations);
1205a8ca 17111 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
6756706e 17112 scalar_ops.dump ();
1205a8ca 17113 dump_printf_loc (MSG_NOTE, vect_location,
6756706e
RS
17114 " estimated cycles per vector iteration"
17115 " (for VF %d) = %f\n",
17116 estimated_vf, scalar_cycles_per_iter.as_double ());
1205a8ca
RS
17117 }
17118
1a5288fe 17119 if (vector_ops.sve_issue_info ())
1205a8ca 17120 {
1205a8ca 17121 if (dump_enabled_p ())
c6c5c5eb 17122 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
b585f011 17123 vector_cycles_per_iter
1a5288fe 17124 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
d43fc1df 17125 orig_body_cost, &body_cost, &should_disparage);
048039c4
RS
17126
17127 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17128 {
17129 /* Also take Neoverse V1 tuning into account, doubling the
17130 scalar and Advanced SIMD estimates to account for the
17131 doubling in SVE vector length. */
17132 if (dump_enabled_p ())
17133 dump_printf_loc (MSG_NOTE, vect_location,
17134 "Neoverse V1 estimate:\n");
2e1886ea
RS
17135 auto vf_factor = m_ops[1].vf_factor ();
17136 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
c6c5c5eb 17137 orig_body_cost, &body_cost, &should_disparage);
048039c4 17138 }
1205a8ca 17139 }
6756706e
RS
17140 else
17141 {
17142 if (dump_enabled_p ())
17143 {
17144 dump_printf_loc (MSG_NOTE, vect_location,
17145 "Vector issue estimate:\n");
1a5288fe 17146 vector_ops.dump ();
6756706e
RS
17147 }
17148 }
1205a8ca
RS
17149
17150 /* Decide whether to stick to latency-based costs or whether to try to
17151 take issue rates into account. */
17152 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
d43fc1df 17153 if (m_vec_flags & VEC_ANY_SVE)
1205a8ca
RS
17154 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17155
d43fc1df
RS
17156 if (m_num_vector_iterations >= 1
17157 && m_num_vector_iterations < threshold)
1205a8ca
RS
17158 {
17159 if (dump_enabled_p ())
17160 dump_printf_loc (MSG_NOTE, vect_location,
17161 "Low iteration count, so using pure latency"
17162 " costs\n");
17163 }
17164 /* Increase the cost of the vector code if it looks like the scalar code
17165 could issue more quickly. These values are only rough estimates,
17166 so minor differences should only result in minor changes. */
17167 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17168 {
83d796d3
RS
17169 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17170 scalar_cycles_per_iter);
1205a8ca
RS
17171 if (dump_enabled_p ())
17172 dump_printf_loc (MSG_NOTE, vect_location,
17173 "Increasing body cost to %d because scalar code"
17174 " would issue more quickly\n", body_cost);
17175 }
17176 /* In general, it's expected that the proposed vector code would be able
17177 to issue more quickly than the original scalar code. This should
17178 already be reflected to some extent in the latency-based costs.
17179
17180 However, the latency-based costs effectively assume that the scalar
17181 code and the vector code execute serially, which tends to underplay
17182 one important case: if the real (non-serialized) execution time of
17183 a scalar iteration is dominated by loop-carried dependencies,
17184 and if the vector code is able to reduce both the length of
17185 the loop-carried dependencies *and* the number of cycles needed
17186 to issue the code in general, we can be more confident that the
17187 vector code is an improvement, even if adding the other (non-loop-carried)
17188 latencies tends to hide this saving. We therefore reduce the cost of the
17189 vector loop body in proportion to the saving. */
1a5288fe 17190 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
6756706e 17191 && scalar_ops.reduction_latency == scalar_cycles_per_iter
1205a8ca
RS
17192 && scalar_cycles_per_iter > vector_cycles_per_iter
17193 && !should_disparage)
17194 {
83d796d3
RS
17195 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17196 scalar_cycles_per_iter);
1205a8ca
RS
17197 if (dump_enabled_p ())
17198 dump_printf_loc (MSG_NOTE, vect_location,
17199 "Decreasing body cost to %d account for smaller"
17200 " reduction latency\n", body_cost);
17201 }
17202
3b924b0d
RS
17203 return body_cost;
17204}
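/* Standalone sketch (not part of GCC) of the scaling used above.  It assumes
   fractional_cost::scale (cost, a, b) behaves like cost * a / b, approximated
   here with plain integer arithmetic; SCALAR_CYCLES is assumed nonzero.  */

static inline unsigned int
scale_body_cost (unsigned int body_cost, unsigned int vector_cycles,
		 unsigned int scalar_cycles)
{
  /* A ratio above 1 (scalar issues faster) increases the cost; a ratio
     below 1 in the reduction-latency case decreases it.  */
  return body_cost * vector_cycles / scalar_cycles;
}

/* Example: body_cost 20 with vector at 6 cycles/iter vs scalar at 4 becomes
   30; with vector at 2 cycles/iter vs scalar at 4 it becomes 10.  */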
17205
6239dd05 17206void
6756706e 17207aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
50a525b5 17208{
6756706e
RS
17209 auto *scalar_costs
17210 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
d43fc1df
RS
17211 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17212 if (loop_vinfo
17213 && m_vec_flags
3b924b0d 17214 && aarch64_use_new_vector_costs_p ())
40d643d8
AV
17215 {
17216 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17217 m_costs[vect_body]);
2636660b 17218 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
40d643d8 17219 }
50a525b5 17220
49630797
RS
17221 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17222 the scalar code in the event of a tie, since there is more chance
17223 of scalar code being optimized with surrounding operations. */
17224 if (!loop_vinfo
17225 && scalar_costs
17226 && m_stp_sequence_cost != ~0U
17227 && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17228 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17229
0612883d 17230 vector_costs::finish_cost (scalar_costs);
50a525b5
RS
17231}
17232
c6c5c5eb
RS
17233bool
17234aarch64_vector_costs::
17235better_main_loop_than_p (const vector_costs *uncast_other) const
17236{
17237 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17238
17239 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17240 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17241
17242 if (dump_enabled_p ())
17243 dump_printf_loc (MSG_NOTE, vect_location,
17244 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17245 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17246 vect_vf_for_cost (this_loop_vinfo),
17247 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17248 vect_vf_for_cost (other_loop_vinfo));
17249
17250 /* Apply the unrolling heuristic described above
17251 m_unrolled_advsimd_niters. */
17252 if (bool (m_unrolled_advsimd_stmts)
17253 != bool (other->m_unrolled_advsimd_stmts))
17254 {
17255 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17256 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17257 if (this_prefer_unrolled != other_prefer_unrolled)
17258 {
17259 if (dump_enabled_p ())
17260 dump_printf_loc (MSG_NOTE, vect_location,
17261 "Preferring Advanced SIMD loop because"
17262 " it can be unrolled\n");
17263 return other_prefer_unrolled;
17264 }
17265 }
17266
17267 for (unsigned int i = 0; i < m_ops.length (); ++i)
17268 {
17269 if (dump_enabled_p ())
17270 {
17271 if (i)
17272 dump_printf_loc (MSG_NOTE, vect_location,
17273 "Reconsidering with subtuning %d\n", i);
17274 dump_printf_loc (MSG_NOTE, vect_location,
17275 "Issue info for %s loop:\n",
17276 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17277 this->m_ops[i].dump ();
17278 dump_printf_loc (MSG_NOTE, vect_location,
17279 "Issue info for %s loop:\n",
17280 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17281 other->m_ops[i].dump ();
17282 }
17283
17284 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17285 * this->m_ops[i].vf_factor ());
17286 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17287 * other->m_ops[i].vf_factor ());
17288
17289 /* If it appears that one loop could process the same amount of data
17290 in fewer cycles, prefer that loop over the other one. */
17291 fractional_cost this_cost
17292 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17293 fractional_cost other_cost
17294 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17295 if (dump_enabled_p ())
17296 {
17297 dump_printf_loc (MSG_NOTE, vect_location,
17298 "Weighted cycles per iteration of %s loop ~= %f\n",
17299 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17300 this_cost.as_double ());
17301 dump_printf_loc (MSG_NOTE, vect_location,
17302 "Weighted cycles per iteration of %s loop ~= %f\n",
17303 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17304 other_cost.as_double ());
17305 }
17306 if (this_cost != other_cost)
17307 {
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE, vect_location,
17310 "Preferring loop with lower cycles"
17311 " per iteration\n");
17312 return this_cost < other_cost;
17313 }
17314
17315 /* If the issue rate of SVE code is limited by predicate operations
17316 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17317 and if Advanced SIMD code could issue within the limit imposed
17318 by the predicate operations, the predicate operations are adding an
17319 overhead that the original code didn't have and so we should prefer
17320 the Advanced SIMD version. */
17321 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17322 const aarch64_vec_op_count &b) -> bool
17323 {
17324 if (a.pred_ops == 0
17325 && (b.min_pred_cycles_per_iter ()
17326 > b.min_nonpred_cycles_per_iter ()))
17327 {
17328 if (dump_enabled_p ())
17329 dump_printf_loc (MSG_NOTE, vect_location,
17330 "Preferring Advanced SIMD loop since"
17331 " SVE loop is predicate-limited\n");
17332 return true;
17333 }
17334 return false;
17335 };
17336 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17337 return true;
17338 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17339 return false;
17340 }
17341
17342 return vector_costs::better_main_loop_than_p (other);
17343}
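/* Standalone sketch (not part of GCC) of the comparison above: loop A beats
   loop B when cycles_a / vf_a < cycles_b / vf_b, which (for positive VFs) is
   equivalent to the division-free cross-product test
   cycles_a * vf_b < cycles_b * vf_a used for the weighted cycles.  */

static inline int
compare_weighted_cycles (unsigned int cycles_a, unsigned int vf_a,
			 unsigned int cycles_b, unsigned int vf_b)
{
  unsigned long long wa = 1ull * cycles_a * vf_b;
  unsigned long long wb = 1ull * cycles_b * vf_a;
  return wa < wb ? -1 : wa > wb ? 1 : 0;	/* negative: prefer loop A */
}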
17344
0cfff2a1 17345static void initialize_aarch64_code_model (struct gcc_options *);
43e9d192 17346
0cfff2a1
KT
17347/* Parse the TO_PARSE string and put the architecture struct that it
17348 selects into RES and the architectural features into ISA_FLAGS.
17349 Return an aarch64_parse_opt_result describing the parse result.
c7887347
ML
17350 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17351 When the TO_PARSE string contains an invalid extension,
17352 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 17353
0cfff2a1
KT
17354static enum aarch64_parse_opt_result
17355aarch64_parse_arch (const char *to_parse, const struct processor **res,
fed55a60
RS
17356 aarch64_feature_flags *isa_flags,
17357 std::string *invalid_extension)
43e9d192 17358{
ff150bc4 17359 const char *ext;
43e9d192 17360 const struct processor *arch;
43e9d192
IB
17361 size_t len;
17362
ff150bc4 17363 ext = strchr (to_parse, '+');
43e9d192
IB
17364
17365 if (ext != NULL)
ff150bc4 17366 len = ext - to_parse;
43e9d192 17367 else
ff150bc4 17368 len = strlen (to_parse);
43e9d192
IB
17369
17370 if (len == 0)
0cfff2a1
KT
17371 return AARCH64_PARSE_MISSING_ARG;
17372
43e9d192 17373
0cfff2a1 17374 /* Loop through the list of supported ARCHes to find a match. */
43e9d192
IB
17375 for (arch = all_architectures; arch->name != NULL; arch++)
17376 {
ff150bc4
ML
17377 if (strlen (arch->name) == len
17378 && strncmp (arch->name, to_parse, len) == 0)
43e9d192 17379 {
fed55a60 17380 auto isa_temp = arch->flags;
43e9d192
IB
17381
17382 if (ext != NULL)
17383 {
0cfff2a1
KT
17384 /* TO_PARSE string contains at least one extension. */
17385 enum aarch64_parse_opt_result ext_res
c7887347 17386 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 17387
0cfff2a1
KT
17388 if (ext_res != AARCH64_PARSE_OK)
17389 return ext_res;
ffee7aa9 17390 }
0cfff2a1
KT
17391 /* Extension parsing was successful. Confirm the result
17392 arch and ISA flags. */
17393 *res = arch;
17394 *isa_flags = isa_temp;
17395 return AARCH64_PARSE_OK;
43e9d192
IB
17396 }
17397 }
17398
17399 /* ARCH name not found in list. */
0cfff2a1 17400 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
17401}
17402
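/* Illustrative, self-contained sketch (not part of GCC) of the split used
   by aarch64_parse_arch above and its CPU counterpart below: everything
   before the first '+' names the architecture or CPU, and the remainder
   (starting at the '+') is handed to the extension parser.  */

#include <string>
#include <utility>

static inline std::pair<std::string, std::string>
split_arch_string (const std::string &to_parse)
{
  std::string::size_type plus = to_parse.find ('+');
  if (plus == std::string::npos)
    return { to_parse, "" };
  /* "armv8.2-a+sve+nofp16" -> { "armv8.2-a", "+sve+nofp16" }.  */
  return { to_parse.substr (0, plus), to_parse.substr (plus) };
}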
0cfff2a1
KT
17403/* Parse the TO_PARSE string and put the result tuning in RES and the
17404 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
17405 describing the parse result. If there is an error parsing, RES and
c7887347
ML
17406 ISA_FLAGS are left unchanged.
17407 When the TO_PARSE string contains an invalid extension,
17408 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 17409
0cfff2a1
KT
17410static enum aarch64_parse_opt_result
17411aarch64_parse_cpu (const char *to_parse, const struct processor **res,
fed55a60
RS
17412 aarch64_feature_flags *isa_flags,
17413 std::string *invalid_extension)
43e9d192 17414{
ff150bc4 17415 const char *ext;
43e9d192 17416 const struct processor *cpu;
43e9d192
IB
17417 size_t len;
17418
ff150bc4 17419 ext = strchr (to_parse, '+');
43e9d192
IB
17420
17421 if (ext != NULL)
ff150bc4 17422 len = ext - to_parse;
43e9d192 17423 else
ff150bc4 17424 len = strlen (to_parse);
43e9d192
IB
17425
17426 if (len == 0)
0cfff2a1
KT
17427 return AARCH64_PARSE_MISSING_ARG;
17428
43e9d192
IB
17429
17430 /* Loop through the list of supported CPUs to find a match. */
17431 for (cpu = all_cores; cpu->name != NULL; cpu++)
17432 {
ff150bc4 17433 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
43e9d192 17434 {
fed55a60 17435 auto isa_temp = cpu->flags;
43e9d192
IB
17436
17437 if (ext != NULL)
17438 {
0cfff2a1
KT
17439 /* TO_PARSE string contains at least one extension. */
17440 enum aarch64_parse_opt_result ext_res
c7887347 17441 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 17442
0cfff2a1
KT
17443 if (ext_res != AARCH64_PARSE_OK)
17444 return ext_res;
17445 }
17446 /* Extension parsing was successful. Confirm the result
17447 cpu and ISA flags. */
17448 *res = cpu;
17449 *isa_flags = isa_temp;
17450 return AARCH64_PARSE_OK;
43e9d192
IB
17451 }
17452 }
17453
17454 /* CPU name not found in list. */
0cfff2a1 17455 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
17456}
17457
0cfff2a1
KT
17458/* Parse the TO_PARSE string and put the cpu it selects into RES.
17459 Return an aarch64_parse_opt_result describing the parse result.
17460 If the parsing fails the RES does not change. */
43e9d192 17461
0cfff2a1
KT
17462static enum aarch64_parse_opt_result
17463aarch64_parse_tune (const char *to_parse, const struct processor **res)
43e9d192
IB
17464{
17465 const struct processor *cpu;
43e9d192
IB
17466
17467 /* Loop through the list of supported CPUs to find a match. */
17468 for (cpu = all_cores; cpu->name != NULL; cpu++)
17469 {
ff150bc4 17470 if (strcmp (cpu->name, to_parse) == 0)
43e9d192 17471 {
0cfff2a1
KT
17472 *res = cpu;
17473 return AARCH64_PARSE_OK;
43e9d192
IB
17474 }
17475 }
17476
17477 /* CPU name not found in list. */
0cfff2a1 17478 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
17479}
17480
8dec06f2
JG
 17481/* Parse TOKEN, which has length LENGTH, to see if it is an option
17482 described in FLAG. If it is, return the index bit for that fusion type.
17483 If not, error (printing OPTION_NAME) and return zero. */
17484
17485static unsigned int
17486aarch64_parse_one_option_token (const char *token,
17487 size_t length,
17488 const struct aarch64_flag_desc *flag,
17489 const char *option_name)
17490{
17491 for (; flag->name != NULL; flag++)
17492 {
17493 if (length == strlen (flag->name)
17494 && !strncmp (flag->name, token, length))
17495 return flag->flag;
17496 }
17497
a3f9f006 17498 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
8dec06f2
JG
17499 return 0;
17500}
17501
17502/* Parse OPTION which is a comma-separated list of flags to enable.
17503 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17504 default state we inherit from the CPU tuning structures. OPTION_NAME
17505 gives the top-level option we are parsing in the -moverride string,
17506 for use in error messages. */
17507
17508static unsigned int
17509aarch64_parse_boolean_options (const char *option,
17510 const struct aarch64_flag_desc *flags,
17511 unsigned int initial_state,
17512 const char *option_name)
17513{
17514 const char separator = '.';
17515 const char* specs = option;
17516 const char* ntoken = option;
17517 unsigned int found_flags = initial_state;
17518
17519 while ((ntoken = strchr (specs, separator)))
17520 {
17521 size_t token_length = ntoken - specs;
17522 unsigned token_ops = aarch64_parse_one_option_token (specs,
17523 token_length,
17524 flags,
17525 option_name);
17526 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17527 in the token stream, reset the supported operations. So:
17528
17529 adrp+add.cmp+branch.none.adrp+add
17530
17531 would have the result of turning on only adrp+add fusion. */
17532 if (!token_ops)
17533 found_flags = 0;
17534
17535 found_flags |= token_ops;
17536 specs = ++ntoken;
17537 }
17538
 17539  /* The string ended with a trailing separator; report an error.  */
17540 if (!(*specs))
17541 {
03a1a86b 17542 error ("%qs string ill-formed", option_name);
8dec06f2
JG
17543 return 0;
17544 }
17545
17546 /* We still have one more token to parse. */
17547 size_t token_length = strlen (specs);
17548 unsigned token_ops = aarch64_parse_one_option_token (specs,
17549 token_length,
17550 flags,
17551 option_name);
17552 if (!token_ops)
17553 found_flags = 0;
17554
17555 found_flags |= token_ops;
17556 return found_flags;
17557}
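/* Worked example (illustrative, not from the original sources): with
   OPTION_NAME "fuse=", the OPTION string "adrp+add.cmp+branch" ORs the
   index bits of both fusion types into the inherited INITIAL_STATE,
   while a "none" token anywhere in the '.'-separated list first clears
   the accumulated flags, as described in the loop above.  */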
17558
17559/* Support for overriding instruction fusion. */
17560
17561static void
17562aarch64_parse_fuse_string (const char *fuse_string,
17563 struct tune_params *tune)
17564{
17565 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17566 aarch64_fusible_pairs,
17567 tune->fusible_ops,
17568 "fuse=");
17569}
17570
17571/* Support for overriding other tuning flags. */
17572
17573static void
17574aarch64_parse_tune_string (const char *tune_string,
17575 struct tune_params *tune)
17576{
17577 tune->extra_tuning_flags
17578 = aarch64_parse_boolean_options (tune_string,
17579 aarch64_tuning_flags,
17580 tune->extra_tuning_flags,
17581 "tune=");
17582}
17583
886f092f
KT
17584/* Parse the sve_width tuning moverride string in TUNE_STRING.
17585 Accept the valid SVE vector widths allowed by
17586 aarch64_sve_vector_bits_enum and use it to override sve_width
17587 in TUNE. */
17588
17589static void
17590aarch64_parse_sve_width_string (const char *tune_string,
17591 struct tune_params *tune)
17592{
17593 int width = -1;
17594
17595 int n = sscanf (tune_string, "%d", &width);
17596 if (n == EOF)
17597 {
03a1a86b 17598 error ("invalid format for %<sve_width%>");
886f092f
KT
17599 return;
17600 }
17601 switch (width)
17602 {
17603 case SVE_128:
17604 case SVE_256:
17605 case SVE_512:
17606 case SVE_1024:
17607 case SVE_2048:
17608 break;
17609 default:
03a1a86b 17610 error ("invalid %<sve_width%> value: %d", width);
886f092f
KT
17611 }
17612 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17613}
17614
8dec06f2
JG
 17615/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
 17616   we understand.  If it is, extract the option string and hand off to
17617 the appropriate function. */
17618
17619void
17620aarch64_parse_one_override_token (const char* token,
17621 size_t length,
17622 struct tune_params *tune)
17623{
17624 const struct aarch64_tuning_override_function *fn
17625 = aarch64_tuning_override_functions;
17626
17627 const char *option_part = strchr (token, '=');
17628 if (!option_part)
17629 {
17630 error ("tuning string missing in option (%s)", token);
17631 return;
17632 }
17633
17634 /* Get the length of the option name. */
17635 length = option_part - token;
17636 /* Skip the '=' to get to the option string. */
17637 option_part++;
17638
17639 for (; fn->name != NULL; fn++)
17640 {
17641 if (!strncmp (fn->name, token, length))
17642 {
17643 fn->parse_override (option_part, tune);
17644 return;
17645 }
17646 }
17647
17648 error ("unknown tuning option (%s)",token);
17649 return;
17650}
17651
5eee3c34
JW
 17652/* Validate and clamp aarch64_tls_size for the code model selected in OPTS.  */
17653
17654static void
17655initialize_aarch64_tls_size (struct gcc_options *opts)
17656{
17657 if (aarch64_tls_size == 0)
17658 aarch64_tls_size = 24;
17659
17660 switch (opts->x_aarch64_cmodel_var)
17661 {
17662 case AARCH64_CMODEL_TINY:
 17663      /* Both the default and maximum TLS size allowed under tiny are 1M, which
 17664	 needs two instructions to address, so we clamp the size to 24.  */
17665 if (aarch64_tls_size > 24)
17666 aarch64_tls_size = 24;
17667 break;
17668 case AARCH64_CMODEL_SMALL:
17669 /* The maximum TLS size allowed under small is 4G. */
17670 if (aarch64_tls_size > 32)
17671 aarch64_tls_size = 32;
17672 break;
17673 case AARCH64_CMODEL_LARGE:
17674 /* The maximum TLS size allowed under large is 16E.
17675 FIXME: 16E should be 64bit, we only support 48bit offset now. */
17676 if (aarch64_tls_size > 48)
17677 aarch64_tls_size = 48;
17678 break;
17679 default:
17680 gcc_unreachable ();
17681 }
17682
17683 return;
17684}
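/* For example (illustrative, not from the original sources): with
   -mcmodel=small and -mtls-size=48, the request is clamped to 32 above,
   and under the tiny model any request larger than 24 is clamped to 24.  */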
17685
ae54c1b0
WD
17686/* Return the CPU corresponding to the enum CPU. */
17687
17688static const struct processor *
17689aarch64_get_tune_cpu (enum aarch64_processor cpu)
17690{
17691 gcc_assert (cpu != aarch64_none);
17692
17693 return &all_cores[cpu];
17694}
17695
17696/* Return the architecture corresponding to the enum ARCH. */
17697
17698static const struct processor *
17699aarch64_get_arch (enum aarch64_arch arch)
17700{
17701 gcc_assert (arch != aarch64_no_arch);
17702
17703 return &all_architectures[arch];
17704}
17705
8dec06f2
JG
17706/* Parse STRING looking for options in the format:
17707 string :: option:string
17708 option :: name=substring
17709 name :: {a-z}
17710 substring :: defined by option. */
17711
17712static void
17713aarch64_parse_override_string (const char* input_string,
17714 struct tune_params* tune)
17715{
17716 const char separator = ':';
17717 size_t string_length = strlen (input_string) + 1;
17718 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17719 char *string = string_root;
17720 strncpy (string, input_string, string_length);
17721 string[string_length - 1] = '\0';
17722
17723 char* ntoken = string;
17724
17725 while ((ntoken = strchr (string, separator)))
17726 {
17727 size_t token_length = ntoken - string;
 17728      /* NUL-terminate this substring so it can be handled as a string of its own.  */
17729 *ntoken = '\0';
17730 aarch64_parse_one_override_token (string, token_length, tune);
17731 string = ++ntoken;
17732 }
17733
17734 /* One last option to parse. */
17735 aarch64_parse_one_override_token (string, strlen (string), tune);
17736 free (string_root);
17737}
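/* Illustrative example (not from the original sources): an override
   string such as "fuse=adrp+add:sve_width=256" is split at ':' above
   into the tokens "fuse=adrp+add" and "sve_width=256", each of which is
   handed to aarch64_parse_one_override_token.  */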
43e9d192 17738
8f0c9d53
KT
17739/* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17740 are best for a generic target with the currently-enabled architecture
17741 extensions. */
17742static void
17743aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17744{
17745 /* Neoverse V1 is the only core that is known to benefit from
17746 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17747 point enabling it for SVE2 and above. */
17748 if (TARGET_SVE2)
17749 current_tune.extra_tuning_flags
17750 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17751}
43e9d192
IB
17752
17753static void
0cfff2a1 17754aarch64_override_options_after_change_1 (struct gcc_options *opts)
43e9d192 17755{
efac62a3
ST
17756 if (accepted_branch_protection_string)
17757 {
17758 opts->x_aarch64_branch_protection_string
17759 = xstrdup (accepted_branch_protection_string);
17760 }
17761
acea40ac
WD
17762 /* PR 70044: We have to be careful about being called multiple times for the
17763 same function. This means all changes should be repeatable. */
17764
d6cb6d6a
WD
17765 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17766 Disable the frame pointer flag so the mid-end will not use a frame
17767 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17768 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17769 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17770 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
acea40ac 17771 if (opts->x_flag_omit_frame_pointer == 0)
a3dc8760 17772 opts->x_flag_omit_frame_pointer = 2;
43e9d192 17773
1be34295 17774 /* If not optimizing for size, set the default
0cfff2a1
KT
17775 alignment to what the target wants. */
17776 if (!opts->x_optimize_size)
43e9d192 17777 {
c518c102
ML
17778 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17779 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17780 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17781 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17782 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17783 opts->x_str_align_functions = aarch64_tune_params.function_align;
43e9d192 17784 }
b4f50fd4 17785
9ee6540a
WD
17786 /* We default to no pc-relative literal loads. */
17787
17788 aarch64_pcrelative_literal_loads = false;
17789
17790 /* If -mpc-relative-literal-loads is set on the command line, this
b4f50fd4 17791 implies that the user asked for PC relative literal loads. */
9ee6540a
WD
17792 if (opts->x_pcrelative_literal_loads == 1)
17793 aarch64_pcrelative_literal_loads = true;
b4f50fd4 17794
9ee6540a
WD
17795 /* In the tiny memory model it makes no sense to disallow PC relative
17796 literal pool loads. */
17797 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17798 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17799 aarch64_pcrelative_literal_loads = true;
98daafa0
EM
17800
17801 /* When enabling the lower precision Newton series for the square root, also
17802 enable it for the reciprocal square root, since the latter is an
17803 intermediary step for the former. */
17804 if (flag_mlow_precision_sqrt)
17805 flag_mrecip_low_precision_sqrt = true;
0cfff2a1 17806}
43e9d192 17807
0cfff2a1
KT
17808/* 'Unpack' up the internal tuning structs and update the options
17809 in OPTS. The caller must have set up selected_tune and selected_arch
17810 as all the other target-specific codegen decisions are
17811 derived from them. */
17812
e4ea20c8 17813void
0cfff2a1
KT
17814aarch64_override_options_internal (struct gcc_options *opts)
17815{
ae54c1b0
WD
17816 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17817 aarch64_tune_flags = tune->flags;
17818 aarch64_tune = tune->sched_core;
0cfff2a1
KT
17819 /* Make a copy of the tuning parameters attached to the core, which
17820 we may later overwrite. */
ae54c1b0
WD
17821 aarch64_tune_params = *(tune->tune);
17822 if (tune->tune == &generic_tunings)
8f0c9d53 17823 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
0cfff2a1
KT
17824
17825 if (opts->x_aarch64_override_tune_string)
17826 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
ae54c1b0 17827 &aarch64_tune_params);
0cfff2a1
KT
17828
17829 /* This target defaults to strict volatile bitfields. */
17830 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17831 opts->x_flag_strict_volatile_bitfields = 1;
17832
cd0b2d36
RR
17833 if (aarch64_stack_protector_guard == SSP_GLOBAL
17834 && opts->x_aarch64_stack_protector_guard_offset_str)
17835 {
41804907 17836 error ("incompatible options %<-mstack-protector-guard=global%> and "
63d42e89 17837 "%<-mstack-protector-guard-offset=%s%>",
cd0b2d36
RR
17838 aarch64_stack_protector_guard_offset_str);
17839 }
17840
17841 if (aarch64_stack_protector_guard == SSP_SYSREG
17842 && !(opts->x_aarch64_stack_protector_guard_offset_str
17843 && opts->x_aarch64_stack_protector_guard_reg_str))
17844 {
a3f9f006
ML
17845 error ("both %<-mstack-protector-guard-offset%> and "
17846 "%<-mstack-protector-guard-reg%> must be used "
17847 "with %<-mstack-protector-guard=sysreg%>");
cd0b2d36
RR
17848 }
17849
17850 if (opts->x_aarch64_stack_protector_guard_reg_str)
17851 {
17852 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
03a1a86b 17853 error ("specify a system register with a small string length");
cd0b2d36
RR
17854 }
17855
17856 if (opts->x_aarch64_stack_protector_guard_offset_str)
17857 {
17858 char *end;
17859 const char *str = aarch64_stack_protector_guard_offset_str;
17860 errno = 0;
17861 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17862 if (!*str || *end || errno)
17863 error ("%qs is not a valid offset in %qs", str,
63d42e89 17864 "-mstack-protector-guard-offset=");
cd0b2d36
RR
17865 aarch64_stack_protector_guard_offset = offs;
17866 }
17867
ce09ab17
DL
17868 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17869 && !fixed_regs[R18_REGNUM])
17870 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17871
0cfff2a1 17872 initialize_aarch64_code_model (opts);
5eee3c34 17873 initialize_aarch64_tls_size (opts);
63892fa2 17874
2d6bc7fa
KT
17875 int queue_depth = 0;
17876 switch (aarch64_tune_params.autoprefetcher_model)
17877 {
17878 case tune_params::AUTOPREFETCHER_OFF:
17879 queue_depth = -1;
17880 break;
17881 case tune_params::AUTOPREFETCHER_WEAK:
17882 queue_depth = 0;
17883 break;
17884 case tune_params::AUTOPREFETCHER_STRONG:
17885 queue_depth = max_insn_queue_index + 1;
17886 break;
17887 default:
17888 gcc_unreachable ();
17889 }
17890
17891 /* We don't mind passing in global_options_set here as we don't use
17892 the *options_set structs anyway. */
028d4092
ML
17893 SET_OPTION_IF_UNSET (opts, &global_options_set,
17894 param_sched_autopref_queue_depth, queue_depth);
2d6bc7fa 17895
5f29f3d5
KT
17896 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17897 comparison. */
17898 if (aarch64_autovec_preference == 1)
17899 SET_OPTION_IF_UNSET (opts, &global_options_set,
17900 aarch64_sve_compare_costs, 0);
17901
9d2c6e2e
MK
 17902  /* Set up parameters to be used in the prefetching algorithm.  Do not
17903 override the defaults unless we are tuning for a core we have
17904 researched values for. */
17905 if (aarch64_tune_params.prefetch->num_slots > 0)
028d4092
ML
17906 SET_OPTION_IF_UNSET (opts, &global_options_set,
17907 param_simultaneous_prefetches,
17908 aarch64_tune_params.prefetch->num_slots);
9d2c6e2e 17909 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
028d4092
ML
17910 SET_OPTION_IF_UNSET (opts, &global_options_set,
17911 param_l1_cache_size,
17912 aarch64_tune_params.prefetch->l1_cache_size);
9d2c6e2e 17913 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
028d4092
ML
17914 SET_OPTION_IF_UNSET (opts, &global_options_set,
17915 param_l1_cache_line_size,
17916 aarch64_tune_params.prefetch->l1_cache_line_size);
76b75018
JM
17917
17918 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17919 {
17920 SET_OPTION_IF_UNSET (opts, &global_options_set,
17921 param_destruct_interfere_size,
17922 aarch64_tune_params.prefetch->l1_cache_line_size);
17923 SET_OPTION_IF_UNSET (opts, &global_options_set,
17924 param_construct_interfere_size,
17925 aarch64_tune_params.prefetch->l1_cache_line_size);
17926 }
17927 else
17928 {
17929 /* For a generic AArch64 target, cover the current range of cache line
17930 sizes. */
17931 SET_OPTION_IF_UNSET (opts, &global_options_set,
17932 param_destruct_interfere_size,
17933 256);
17934 SET_OPTION_IF_UNSET (opts, &global_options_set,
17935 param_construct_interfere_size,
17936 64);
17937 }
17938
9d2c6e2e 17939 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
028d4092
ML
17940 SET_OPTION_IF_UNSET (opts, &global_options_set,
17941 param_l2_cache_size,
17942 aarch64_tune_params.prefetch->l2_cache_size);
d2ff35c0 17943 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
028d4092
ML
17944 SET_OPTION_IF_UNSET (opts, &global_options_set,
17945 param_prefetch_dynamic_strides, 0);
59100dfc 17946 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
028d4092
ML
17947 SET_OPTION_IF_UNSET (opts, &global_options_set,
17948 param_prefetch_minimum_stride,
17949 aarch64_tune_params.prefetch->minimum_stride);
50487d79 17950
13494fcb 17951 /* Use the alternative scheduling-pressure algorithm by default. */
028d4092
ML
17952 SET_OPTION_IF_UNSET (opts, &global_options_set,
17953 param_sched_pressure_algorithm,
17954 SCHED_PRESSURE_MODEL);
13494fcb 17955
fbe9af50 17956 /* Validate the guard size. */
028d4092 17957 int guard_size = param_stack_clash_protection_guard_size;
fbe9af50 17958
8100e93b
ML
17959 if (guard_size != 12 && guard_size != 16)
17960 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17961 "size. Given value %d (%llu KB) is out of range",
17962 guard_size, (1ULL << guard_size) / 1024ULL);
17963
fbe9af50
TC
 17964  /* Enforce that the probing interval is the same as the guard size so the
 17965     mid-end does the right thing.  */
028d4092
ML
17966 SET_OPTION_IF_UNSET (opts, &global_options_set,
17967 param_stack_clash_protection_probe_interval,
17968 guard_size);
fbe9af50
TC
17969
 17970  /* The maybe_set calls won't update the value if the user has explicitly set
 17971     one, which means we need to validate that the probing interval and guard
 17972     size are equal.  */
17973 int probe_interval
028d4092 17974 = param_stack_clash_protection_probe_interval;
fbe9af50 17975 if (guard_size != probe_interval)
904f3daa
ML
17976 error ("stack clash guard size %<%d%> must be equal to probing interval "
17977 "%<%d%>", guard_size, probe_interval);
fbe9af50 17978
16b2cafd
MK
 17979  /* Enable software prefetching at the specified optimization level for
 17980     CPUs that have prefetch.  Lower the optimization level threshold by 1
17981 when profiling is enabled. */
17982 if (opts->x_flag_prefetch_loop_arrays < 0
17983 && !opts->x_optimize_size
17984 && aarch64_tune_params.prefetch->default_opt_level >= 0
17985 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17986 opts->x_flag_prefetch_loop_arrays = 1;
17987
0cfff2a1
KT
17988 aarch64_override_options_after_change_1 (opts);
17989}
43e9d192 17990
01f44038
KT
17991/* Print a hint with a suggestion for a core or architecture name that
17992 most closely resembles what the user passed in STR. ARCH is true if
17993 the user is asking for an architecture name. ARCH is false if the user
17994 is asking for a core name. */
17995
17996static void
17997aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17998{
17999 auto_vec<const char *> candidates;
18000 const struct processor *entry = arch ? all_architectures : all_cores;
18001 for (; entry->name != NULL; entry++)
18002 candidates.safe_push (entry->name);
a08b5429
ML
18003
18004#ifdef HAVE_LOCAL_CPU_DETECT
18005 /* Add also "native" as possible value. */
18006 if (arch)
18007 candidates.safe_push ("native");
18008#endif
18009
01f44038
KT
18010 char *s;
18011 const char *hint = candidates_list_and_hint (str, s, candidates);
18012 if (hint)
18013 inform (input_location, "valid arguments are: %s;"
18014 " did you mean %qs?", s, hint);
6285e915
ML
18015 else
18016 inform (input_location, "valid arguments are: %s", s);
18017
01f44038
KT
18018 XDELETEVEC (s);
18019}
18020
18021/* Print a hint with a suggestion for a core name that most closely resembles
18022 what the user passed in STR. */
18023
18024inline static void
18025aarch64_print_hint_for_core (const char *str)
18026{
18027 aarch64_print_hint_for_core_or_arch (str, false);
18028}
18029
18030/* Print a hint with a suggestion for an architecture name that most closely
18031 resembles what the user passed in STR. */
18032
18033inline static void
18034aarch64_print_hint_for_arch (const char *str)
18035{
18036 aarch64_print_hint_for_core_or_arch (str, true);
18037}
18038
c7887347
ML
18039
18040/* Print a hint with a suggestion for an extension name
18041 that most closely resembles what the user passed in STR. */
18042
18043void
18044aarch64_print_hint_for_extensions (const std::string &str)
18045{
18046 auto_vec<const char *> candidates;
18047 aarch64_get_all_extension_candidates (&candidates);
18048 char *s;
18049 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18050 if (hint)
18051 inform (input_location, "valid arguments are: %s;"
18052 " did you mean %qs?", s, hint);
18053 else
03a1a86b 18054 inform (input_location, "valid arguments are: %s", s);
c7887347
ML
18055
18056 XDELETEVEC (s);
18057}
18058
0cfff2a1
KT
18059/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
 18060   specified in STR and throw errors if appropriate.  Put the results, if
361fb3ee
KT
 18061   they are valid, in RES and ISA_FLAGS.  Return whether the option is
18062 valid. */
43e9d192 18063
361fb3ee 18064static bool
0cfff2a1 18065aarch64_validate_mcpu (const char *str, const struct processor **res,
fed55a60 18066 aarch64_feature_flags *isa_flags)
0cfff2a1 18067{
c7887347 18068 std::string invalid_extension;
0cfff2a1 18069 enum aarch64_parse_opt_result parse_res
c7887347 18070 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
18071
18072 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 18073 return true;
0cfff2a1
KT
18074
18075 switch (parse_res)
18076 {
18077 case AARCH64_PARSE_MISSING_ARG:
fb241da2 18078 error ("missing cpu name in %<-mcpu=%s%>", str);
0cfff2a1
KT
18079 break;
18080 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 18081 error ("unknown value %qs for %<-mcpu%>", str);
01f44038 18082 aarch64_print_hint_for_core (str);
0cfff2a1
KT
18083 break;
18084 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
18085 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18086 invalid_extension.c_str (), str);
18087 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
18088 break;
18089 default:
18090 gcc_unreachable ();
18091 }
361fb3ee
KT
18092
18093 return false;
0cfff2a1
KT
18094}
18095
a9ba2a9b
MM
18096/* Straight line speculation indicators. */
18097enum aarch64_sls_hardening_type
18098{
18099 SLS_NONE = 0,
18100 SLS_RETBR = 1,
18101 SLS_BLR = 2,
18102 SLS_ALL = 3,
18103};
18104static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18105
 18106/* Return whether we should mitigate Straight Line Speculation for the RET
18107 and BR instructions. */
18108bool
18109aarch64_harden_sls_retbr_p (void)
18110{
18111 return aarch64_sls_hardening & SLS_RETBR;
18112}
18113
 18114/* Return whether we should mitigate Straight Line Speculation for the BLR
18115 instruction. */
18116bool
18117aarch64_harden_sls_blr_p (void)
18118{
18119 return aarch64_sls_hardening & SLS_BLR;
18120}
18121
 18122/* For now we only allow setting these options globally; in the future we may
18123 allow setting them per function. */
18124static void
18125aarch64_validate_sls_mitigation (const char *const_str)
18126{
18127 char *token_save = NULL;
18128 char *str = NULL;
18129
18130 if (strcmp (const_str, "none") == 0)
18131 {
18132 aarch64_sls_hardening = SLS_NONE;
18133 return;
18134 }
18135 if (strcmp (const_str, "all") == 0)
18136 {
18137 aarch64_sls_hardening = SLS_ALL;
18138 return;
18139 }
18140
18141 char *str_root = xstrdup (const_str);
18142 str = strtok_r (str_root, ",", &token_save);
18143 if (!str)
18144 error ("invalid argument given to %<-mharden-sls=%>");
18145
18146 int temp = SLS_NONE;
18147 while (str)
18148 {
18149 if (strcmp (str, "blr") == 0)
18150 temp |= SLS_BLR;
18151 else if (strcmp (str, "retbr") == 0)
18152 temp |= SLS_RETBR;
18153 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18154 {
03a1a86b 18155 error ("%qs must be by itself for %<-mharden-sls=%>", str);
a9ba2a9b
MM
18156 break;
18157 }
18158 else
18159 {
18160 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18161 break;
18162 }
18163 str = strtok_r (NULL, ",", &token_save);
18164 }
18165 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18166 free (str_root);
18167}
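/* Illustrative note (not from the original sources): -mharden-sls=
   accepts "none", "all", or a comma-separated list of "retbr" and
   "blr"; e.g. "-mharden-sls=retbr,blr" enables both mitigations, while
   combining "none" or "all" with other tokens is rejected above.  */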
18168
efac62a3
ST
18169/* Parses CONST_STR for branch protection features specified in
 18170   aarch64_branch_protect_types, and sets any global variables required.  Returns
18171 the parsing result and assigns LAST_STR to the last processed token from
18172 CONST_STR so that it can be used for error reporting. */
18173
18174static enum
18175aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
18176 char** last_str)
18177{
18178 char *str_root = xstrdup (const_str);
18179 char* token_save = NULL;
18180 char *str = strtok_r (str_root, "+", &token_save);
18181 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18182 if (!str)
18183 res = AARCH64_PARSE_MISSING_ARG;
18184 else
18185 {
18186 char *next_str = strtok_r (NULL, "+", &token_save);
18187 /* Reset the branch protection features to their defaults. */
18188 aarch64_handle_no_branch_protection (NULL, NULL);
18189
18190 while (str && res == AARCH64_PARSE_OK)
18191 {
18192 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18193 bool found = false;
18194 /* Search for this type. */
18195 while (type && type->name && !found && res == AARCH64_PARSE_OK)
18196 {
18197 if (strcmp (str, type->name) == 0)
18198 {
18199 found = true;
18200 res = type->handler (str, next_str);
18201 str = next_str;
18202 next_str = strtok_r (NULL, "+", &token_save);
18203 }
18204 else
18205 type++;
18206 }
18207 if (found && res == AARCH64_PARSE_OK)
18208 {
18209 bool found_subtype = true;
18210 /* Loop through each token until we find one that isn't a
18211 subtype. */
18212 while (found_subtype)
18213 {
18214 found_subtype = false;
18215 const aarch64_branch_protect_type *subtype = type->subtypes;
18216 /* Search for the subtype. */
18217 while (str && subtype && subtype->name && !found_subtype
18218 && res == AARCH64_PARSE_OK)
18219 {
18220 if (strcmp (str, subtype->name) == 0)
18221 {
18222 found_subtype = true;
18223 res = subtype->handler (str, next_str);
18224 str = next_str;
18225 next_str = strtok_r (NULL, "+", &token_save);
18226 }
18227 else
18228 subtype++;
18229 }
18230 }
18231 }
18232 else if (!found)
18233 res = AARCH64_PARSE_INVALID_ARG;
18234 }
18235 }
18236 /* Copy the last processed token into the argument to pass it back.
18237 Used by option and attribute validation to print the offending token. */
18238 if (last_str)
18239 {
18240 if (str) strcpy (*last_str, str);
18241 else *last_str = NULL;
18242 }
18243 if (res == AARCH64_PARSE_OK)
18244 {
18245 /* If needed, alloc the accepted string then copy in const_str.
18246 Used by override_option_after_change_1. */
18247 if (!accepted_branch_protection_string)
18248 accepted_branch_protection_string = (char *) xmalloc (
18249 BRANCH_PROTECT_STR_MAX
18250 + 1);
18251 strncpy (accepted_branch_protection_string, const_str,
18252 BRANCH_PROTECT_STR_MAX + 1);
18253 /* Forcibly null-terminate. */
18254 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18255 }
18256 return res;
18257}
18258
18259static bool
18260aarch64_validate_mbranch_protection (const char *const_str)
18261{
  /* Room for the longest possible token plus its terminating NUL.  */
 18262  char *str = (char *) xmalloc (strlen (const_str) + 1);
18263 enum aarch64_parse_opt_result res =
18264 aarch64_parse_branch_protection (const_str, &str);
18265 if (res == AARCH64_PARSE_INVALID_ARG)
a9c697b8 18266 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
efac62a3 18267 else if (res == AARCH64_PARSE_MISSING_ARG)
a9c697b8 18268 error ("missing argument for %<-mbranch-protection=%>");
efac62a3
ST
18269 free (str);
18270 return res == AARCH64_PARSE_OK;
18271}
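/* Illustrative note (not from the original sources, and relying on the
   documented -mbranch-protection syntax): typical arguments look like
   "none", "standard", "bti" or "pac-ret+leaf"; the tokens are split at
   '+' by aarch64_parse_branch_protection above and matched against
   aarch64_branch_protect_types and their subtypes.  */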
18272
0cfff2a1
KT
18273/* Validate a command-line -march option. Parse the arch and extensions
18274 (if any) specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
18275 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18276 option is valid. */
0cfff2a1 18277
361fb3ee 18278static bool
0cfff2a1 18279aarch64_validate_march (const char *str, const struct processor **res,
fed55a60 18280 aarch64_feature_flags *isa_flags)
0cfff2a1 18281{
c7887347 18282 std::string invalid_extension;
0cfff2a1 18283 enum aarch64_parse_opt_result parse_res
c7887347 18284 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
18285
18286 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 18287 return true;
0cfff2a1
KT
18288
18289 switch (parse_res)
18290 {
18291 case AARCH64_PARSE_MISSING_ARG:
fb241da2 18292 error ("missing arch name in %<-march=%s%>", str);
0cfff2a1
KT
18293 break;
18294 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 18295 error ("unknown value %qs for %<-march%>", str);
01f44038 18296 aarch64_print_hint_for_arch (str);
48b9c7d5
KT
18297 /* A common user error is confusing -march and -mcpu.
18298 If the -march string matches a known CPU suggest -mcpu. */
18299 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18300 if (parse_res == AARCH64_PARSE_OK)
18301 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
0cfff2a1
KT
18302 break;
18303 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
18304 error ("invalid feature modifier %qs in %<-march=%s%>",
18305 invalid_extension.c_str (), str);
18306 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
18307 break;
18308 default:
18309 gcc_unreachable ();
18310 }
361fb3ee
KT
18311
18312 return false;
0cfff2a1
KT
18313}
18314
18315/* Validate a command-line -mtune option. Parse the cpu
18316 specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
18317 result, if it is valid, in RES. Return whether the option is
18318 valid. */
0cfff2a1 18319
361fb3ee 18320static bool
0cfff2a1
KT
18321aarch64_validate_mtune (const char *str, const struct processor **res)
18322{
18323 enum aarch64_parse_opt_result parse_res
18324 = aarch64_parse_tune (str, res);
18325
18326 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 18327 return true;
0cfff2a1
KT
18328
18329 switch (parse_res)
18330 {
18331 case AARCH64_PARSE_MISSING_ARG:
fb241da2 18332 error ("missing cpu name in %<-mtune=%s%>", str);
0cfff2a1
KT
18333 break;
18334 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 18335 error ("unknown value %qs for %<-mtune%>", str);
01f44038 18336 aarch64_print_hint_for_core (str);
0cfff2a1
KT
18337 break;
18338 default:
18339 gcc_unreachable ();
18340 }
361fb3ee
KT
18341 return false;
18342}
18343
43cacb12
RS
18344/* Return the VG value associated with -msve-vector-bits= value VALUE. */
18345
18346static poly_uint16
18347aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18348{
9b070057
RS
18349 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18350 on big-endian targets, so we would need to forbid subregs that convert
18351 from one to the other. By default a reinterpret sequence would then
18352 involve a store to memory in one mode and a load back in the other.
18353 Even if we optimize that sequence using reverse instructions,
18354 it would still be a significant potential overhead.
18355
18356 For now, it seems better to generate length-agnostic code for that
18357 case instead. */
18358 if (value == SVE_SCALABLE
18359 || (value == SVE_128 && BYTES_BIG_ENDIAN))
43cacb12
RS
18360 return poly_uint16 (2, 2);
18361 else
18362 return (int) value / 64;
18363}
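/* Worked example (illustrative, not from the original sources): SVE_256
   maps to a VG of 256 / 64 = 4, i.e. four 64-bit granules per vector,
   while SVE_SCALABLE (and SVE_128 on big-endian) yields the
   length-agnostic poly_uint16 (2, 2).  */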
18364
2a269bda
RS
18365/* Set the global aarch64_asm_isa_flags to FLAGS and update
18366 aarch64_isa_flags accordingly. */
18367
18368void
18369aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18370{
18371 aarch64_set_asm_isa_flags (&global_options, flags);
18372}
18373
0cfff2a1
KT
18374/* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18375 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18376 tuning structs. In particular it must set selected_tune and
2a269bda 18377 aarch64_asm_isa_flags that define the available ISA features and tuning
0cfff2a1
KT
18378 decisions. It must also set selected_arch as this will be used to
18379 output the .arch asm tags for each function. */
18380
18381static void
18382aarch64_override_options (void)
18383{
fed55a60
RS
18384 aarch64_feature_flags cpu_isa = 0;
18385 aarch64_feature_flags arch_isa = 0;
2a269bda 18386 aarch64_set_asm_isa_flags (0);
0cfff2a1 18387
ae54c1b0
WD
18388 const struct processor *cpu = NULL;
18389 const struct processor *arch = NULL;
18390 const struct processor *tune = NULL;
0cfff2a1 18391
a9ba2a9b
MM
18392 if (aarch64_harden_sls_string)
18393 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18394
efac62a3
ST
18395 if (aarch64_branch_protection_string)
18396 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18397
0cfff2a1
KT
18398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18399 If either of -march or -mtune is given, they override their
18400 respective component of -mcpu. */
18401 if (aarch64_cpu_string)
ae54c1b0 18402 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
0cfff2a1
KT
18403
18404 if (aarch64_arch_string)
ae54c1b0 18405 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
0cfff2a1
KT
18406
18407 if (aarch64_tune_string)
ae54c1b0 18408 aarch64_validate_mtune (aarch64_tune_string, &tune);
43e9d192 18409
6881e3c1
OH
18410#ifdef SUBTARGET_OVERRIDE_OPTIONS
18411 SUBTARGET_OVERRIDE_OPTIONS;
18412#endif
18413
ae54c1b0 18414 if (cpu && arch)
0cfff2a1 18415 {
1be715f3
WD
18416 /* If both -mcpu and -march are specified, warn if they are not
18417 architecturally compatible and prefer the -march ISA flags. */
ae54c1b0 18418 if (arch->arch != cpu->arch)
0cfff2a1 18419 {
a3f9f006 18420 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
349297b6
JH
18421 aarch64_cpu_string,
18422 aarch64_arch_string);
0cfff2a1 18423 }
1be715f3 18424
ae54c1b0 18425 selected_arch = arch->arch;
2a269bda 18426 aarch64_set_asm_isa_flags (arch_isa);
0cfff2a1 18427 }
ae54c1b0 18428 else if (cpu)
0cfff2a1 18429 {
ae54c1b0 18430 selected_arch = cpu->arch;
2a269bda 18431 aarch64_set_asm_isa_flags (cpu_isa);
43e9d192 18432 }
ae54c1b0 18433 else if (arch)
0cfff2a1 18434 {
ae54c1b0
WD
18435 cpu = &all_cores[arch->ident];
18436 selected_arch = arch->arch;
2a269bda 18437 aarch64_set_asm_isa_flags (arch_isa);
1be715f3
WD
18438 }
18439 else
18440 {
18441 /* No -mcpu or -march specified, so use the default CPU. */
ae54c1b0
WD
18442 cpu = &all_cores[TARGET_CPU_DEFAULT];
18443 selected_arch = cpu->arch;
2a269bda 18444 aarch64_set_asm_isa_flags (cpu->flags);
0cfff2a1 18445 }
43e9d192 18446
ae54c1b0 18447 selected_tune = tune ? tune->ident : cpu->ident;
43e9d192 18448
c7ff4f0f
SD
18449 if (aarch64_enable_bti == 2)
18450 {
18451#ifdef TARGET_ENABLE_BTI
18452 aarch64_enable_bti = 1;
18453#else
18454 aarch64_enable_bti = 0;
18455#endif
18456 }
18457
18458 /* Return address signing is currently not supported for ILP32 targets. For
18459 LP64 targets use the configured option in the absence of a command-line
18460 option for -mbranch-protection. */
18461 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18462 {
18463#ifdef TARGET_ENABLE_PAC_RET
18464 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
c7ff4f0f
SD
18465#else
18466 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18467#endif
18468 }
18469
0cfff2a1
KT
18470#ifndef HAVE_AS_MABI_OPTION
18471 /* The compiler may have been configured with 2.23.* binutils, which does
18472 not have support for ILP32. */
18473 if (TARGET_ILP32)
a3f9f006 18474 error ("assembler does not support %<-mabi=ilp32%>");
0cfff2a1 18475#endif
43e9d192 18476
43cacb12
RS
18477 /* Convert -msve-vector-bits to a VG count. */
18478 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18479
db58fd89 18480 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
a3f9f006 18481 sorry ("return address signing is only supported for %<-mabi=lp64%>");
db58fd89 18482
5f7dbaa0
RE
18483 /* The pass to insert speculation tracking runs before
18484 shrink-wrapping and the latter does not know how to update the
18485 tracking status. So disable it in this case. */
18486 if (aarch64_track_speculation)
18487 flag_shrink_wrap = 0;
18488
0cfff2a1
KT
18489 aarch64_override_options_internal (&global_options);
18490
18491 /* Save these options as the default ones in case we push and pop them later
18492 while processing functions with potential target attributes. */
18493 target_option_default_node = target_option_current_node
ba948b37 18494 = build_target_option_node (&global_options, &global_options_set);
43e9d192
IB
18495}
18496
18497/* Implement targetm.override_options_after_change. */
18498
18499static void
18500aarch64_override_options_after_change (void)
18501{
0cfff2a1 18502 aarch64_override_options_after_change_1 (&global_options);
43e9d192
IB
18503}
18504
29a14a1a
MK
18505/* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18506static char *
18507aarch64_offload_options (void)
18508{
18509 if (TARGET_ILP32)
18510 return xstrdup ("-foffload-abi=ilp32");
18511 else
18512 return xstrdup ("-foffload-abi=lp64");
18513}
18514
43e9d192
IB
18515static struct machine_function *
18516aarch64_init_machine_status (void)
18517{
18518 struct machine_function *machine;
766090c2 18519 machine = ggc_cleared_alloc<machine_function> ();
43e9d192
IB
18520 return machine;
18521}
18522
18523void
18524aarch64_init_expanders (void)
18525{
18526 init_machine_status = aarch64_init_machine_status;
18527}
18528
 18529/* Validate the code model selected in OPTS and pick the variant to use.  */
18530static void
0cfff2a1 18531initialize_aarch64_code_model (struct gcc_options *opts)
43e9d192 18532{
6c0ab626
X
18533 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18534 switch (opts->x_aarch64_cmodel_var)
18535 {
18536 case AARCH64_CMODEL_TINY:
18537 if (opts->x_flag_pic)
18538 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18539 break;
18540 case AARCH64_CMODEL_SMALL:
18541 if (opts->x_flag_pic)
18542 {
34ecdb0f 18543#ifdef HAVE_AS_SMALL_PIC_RELOCS
6c0ab626
X
18544 aarch64_cmodel = (flag_pic == 2
18545 ? AARCH64_CMODEL_SMALL_PIC
18546 : AARCH64_CMODEL_SMALL_SPIC);
34ecdb0f 18547#else
6c0ab626 18548 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
34ecdb0f 18549#endif
6c0ab626
X
18550 }
18551 break;
18552 case AARCH64_CMODEL_LARGE:
18553 if (opts->x_flag_pic)
18554 sorry ("code model %qs with %<-f%s%>", "large",
18555 opts->x_flag_pic > 1 ? "PIC" : "pic");
18556 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18557 sorry ("code model %qs not supported in ilp32 mode", "large");
18558 break;
18559 case AARCH64_CMODEL_TINY_PIC:
18560 case AARCH64_CMODEL_SMALL_PIC:
18561 case AARCH64_CMODEL_SMALL_SPIC:
18562 gcc_unreachable ();
18563 }
43e9d192
IB
18564}
18565
361fb3ee
KT
18566/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18567 using the information saved in PTR. */
18568
18569static void
ba948b37 18570aarch64_option_restore (struct gcc_options *opts,
ae54c1b0
WD
18571 struct gcc_options * /* opts_set */,
18572 struct cl_target_option * /* ptr */)
361fb3ee 18573{
361fb3ee
KT
18574 aarch64_override_options_internal (opts);
18575}
18576
18577/* Implement TARGET_OPTION_PRINT. */
18578
18579static void
18580aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18581{
18582 const struct processor *cpu
ae54c1b0
WD
18583 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18584 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
054b4005 18585 std::string extension
2a269bda 18586 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
ae54c1b0 18587 arch->flags);
361fb3ee
KT
18588
18589 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
054b4005
JG
18590 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18591 arch->name, extension.c_str ());
361fb3ee
KT
18592}
18593
d78006d9
KT
18594static GTY(()) tree aarch64_previous_fndecl;
18595
e4ea20c8
KT
18596void
18597aarch64_reset_previous_fndecl (void)
18598{
18599 aarch64_previous_fndecl = NULL;
18600}
18601
acfc1ac1
KT
18602/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18603 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18604 make sure optab availability predicates are recomputed when necessary. */
18605
18606void
18607aarch64_save_restore_target_globals (tree new_tree)
18608{
18609 if (TREE_TARGET_GLOBALS (new_tree))
18610 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18611 else if (new_tree == target_option_default_node)
18612 restore_target_globals (&default_target_globals);
18613 else
18614 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18615}
18616
d78006d9
KT
18617/* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18618 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18619 of the function, if such exists. This function may be called multiple
18620 times on a single function so use aarch64_previous_fndecl to avoid
18621 setting up identical state. */
18622
18623static void
18624aarch64_set_current_function (tree fndecl)
18625{
acfc1ac1
KT
18626 if (!fndecl || fndecl == aarch64_previous_fndecl)
18627 return;
18628
d78006d9
KT
18629 tree old_tree = (aarch64_previous_fndecl
18630 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18631 : NULL_TREE);
18632
acfc1ac1 18633 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
d78006d9 18634
acfc1ac1
KT
18635 /* If current function has no attributes but the previous one did,
18636 use the default node. */
18637 if (!new_tree && old_tree)
18638 new_tree = target_option_default_node;
d78006d9 18639
acfc1ac1
KT
18640 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18641 the default have been handled by aarch64_save_restore_target_globals from
18642 aarch64_pragma_target_parse. */
18643 if (old_tree == new_tree)
18644 return;
d78006d9 18645
acfc1ac1 18646 aarch64_previous_fndecl = fndecl;
6e17a23b 18647
acfc1ac1 18648 /* First set the target options. */
ba948b37
JJ
18649 cl_target_option_restore (&global_options, &global_options_set,
18650 TREE_TARGET_OPTION (new_tree));
6e17a23b 18651
acfc1ac1 18652 aarch64_save_restore_target_globals (new_tree);
d78006d9 18653}
361fb3ee 18654
5a2c8331
KT
18655/* Enum describing the various ways we can handle attributes.
18656 In many cases we can reuse the generic option handling machinery. */
18657
18658enum aarch64_attr_opt_type
18659{
18660 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18661 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18662 aarch64_attr_enum, /* Attribute sets an enum variable. */
18663 aarch64_attr_custom /* Attribute requires a custom handling function. */
18664};
18665
18666/* All the information needed to handle a target attribute.
18667 NAME is the name of the attribute.
9c582551 18668 ATTR_TYPE specifies the type of behavior of the attribute as described
5a2c8331
KT
18669 in the definition of enum aarch64_attr_opt_type.
18670 ALLOW_NEG is true if the attribute supports a "no-" form.
ab93e9b7
SE
18671 HANDLER is the function that takes the attribute string as an argument
18672 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
5a2c8331 18673 OPT_NUM is the enum specifying the option that the attribute modifies.
9c582551 18674 This is needed for attributes that mirror the behavior of a command-line
5a2c8331
KT
18675 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18676 aarch64_attr_enum. */
18677
18678struct aarch64_attribute_info
18679{
18680 const char *name;
18681 enum aarch64_attr_opt_type attr_type;
18682 bool allow_neg;
ab93e9b7 18683 bool (*handler) (const char *);
5a2c8331
KT
18684 enum opt_code opt_num;
18685};
18686
ab93e9b7 18687/* Handle the ARCH_STR argument to the arch= target attribute. */
5a2c8331
KT
18688
18689static bool
ab93e9b7 18690aarch64_handle_attr_arch (const char *str)
5a2c8331
KT
18691{
18692 const struct processor *tmp_arch = NULL;
c7887347 18693 std::string invalid_extension;
2a269bda 18694 aarch64_feature_flags tmp_flags;
5a2c8331 18695 enum aarch64_parse_opt_result parse_res
2a269bda 18696 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
5a2c8331
KT
18697
18698 if (parse_res == AARCH64_PARSE_OK)
18699 {
18700 gcc_assert (tmp_arch);
ae54c1b0 18701 selected_arch = tmp_arch->arch;
2a269bda 18702 aarch64_set_asm_isa_flags (tmp_flags);
5a2c8331
KT
18703 return true;
18704 }
18705
18706 switch (parse_res)
18707 {
18708 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 18709 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
5a2c8331
KT
18710 break;
18711 case AARCH64_PARSE_INVALID_ARG:
c6e75a4a 18712 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
01f44038 18713 aarch64_print_hint_for_arch (str);
5a2c8331
KT
18714 break;
18715 case AARCH64_PARSE_INVALID_FEATURE:
c6e75a4a 18716 error ("invalid feature modifier %s of value %qs in "
c7887347
ML
18717 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18718 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
18719 break;
18720 default:
18721 gcc_unreachable ();
18722 }
18723
18724 return false;
18725}
18726
ab93e9b7 18727/* Handle the argument CPU_STR to the cpu= target attribute. */
5a2c8331
KT
18728
18729static bool
ab93e9b7 18730aarch64_handle_attr_cpu (const char *str)
5a2c8331
KT
18731{
18732 const struct processor *tmp_cpu = NULL;
c7887347 18733 std::string invalid_extension;
2a269bda 18734 aarch64_feature_flags tmp_flags;
5a2c8331 18735 enum aarch64_parse_opt_result parse_res
2a269bda 18736 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
5a2c8331
KT
18737
18738 if (parse_res == AARCH64_PARSE_OK)
18739 {
18740 gcc_assert (tmp_cpu);
ae54c1b0
WD
18741 selected_tune = tmp_cpu->ident;
18742 selected_arch = tmp_cpu->arch;
2a269bda 18743 aarch64_set_asm_isa_flags (tmp_flags);
5a2c8331
KT
18744 return true;
18745 }
18746
18747 switch (parse_res)
18748 {
18749 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 18750 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
5a2c8331
KT
18751 break;
18752 case AARCH64_PARSE_INVALID_ARG:
c6e75a4a 18753 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
01f44038 18754 aarch64_print_hint_for_core (str);
5a2c8331
KT
18755 break;
18756 case AARCH64_PARSE_INVALID_FEATURE:
c6e75a4a 18757 error ("invalid feature modifier %qs of value %qs in "
c7887347
ML
18758 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18759 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
18760 break;
18761 default:
18762 gcc_unreachable ();
18763 }
18764
18765 return false;
18766}
18767
efac62a3
ST
18768/* Handle the argument STR to the branch-protection= attribute. */
18769
18770 static bool
18771 aarch64_handle_attr_branch_protection (const char* str)
18772 {
81e40f3a 18773 char *err_str = (char *) xmalloc (strlen (str) + 1);
efac62a3
ST
18774 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18775 &err_str);
18776 bool success = false;
18777 switch (res)
18778 {
18779 case AARCH64_PARSE_MISSING_ARG:
18780 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18781 " attribute");
18782 break;
18783 case AARCH64_PARSE_INVALID_ARG:
c6e75a4a 18784 error ("invalid protection type %qs in %<target(\"branch-protection"
efac62a3
ST
18785 "=\")%> pragma or attribute", err_str);
18786 break;
18787 case AARCH64_PARSE_OK:
18788 success = true;
18789 /* Fall through. */
18790 case AARCH64_PARSE_INVALID_FEATURE:
18791 break;
18792 default:
18793 gcc_unreachable ();
18794 }
18795 free (err_str);
18796 return success;
18797 }
18798
ab93e9b7 18799/* Handle the argument STR to the tune= target attribute. */
5a2c8331
KT
18800
18801static bool
ab93e9b7 18802aarch64_handle_attr_tune (const char *str)
5a2c8331
KT
18803{
18804 const struct processor *tmp_tune = NULL;
18805 enum aarch64_parse_opt_result parse_res
18806 = aarch64_parse_tune (str, &tmp_tune);
18807
18808 if (parse_res == AARCH64_PARSE_OK)
18809 {
18810 gcc_assert (tmp_tune);
ae54c1b0 18811 selected_tune = tmp_tune->ident;
5a2c8331
KT
18812 return true;
18813 }
18814
18815 switch (parse_res)
18816 {
18817 case AARCH64_PARSE_INVALID_ARG:
c6e75a4a 18818 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
01f44038 18819 aarch64_print_hint_for_core (str);
5a2c8331
KT
18820 break;
18821 default:
18822 gcc_unreachable ();
18823 }
18824
18825 return false;
18826}
18827
18828/* Parse an architecture extensions target attribute string specified in STR.
18829 For example "+fp+nosimd". Show any errors if needed. Return TRUE
18830 if successful. Update aarch64_isa_flags to reflect the ISA features
ab93e9b7 18831 modified. */
5a2c8331
KT
18832
18833static bool
ab93e9b7 18834aarch64_handle_attr_isa_flags (char *str)
5a2c8331
KT
18835{
18836 enum aarch64_parse_opt_result parse_res;
2a269bda 18837 auto isa_flags = aarch64_asm_isa_flags;
5a2c8331 18838
e4ea20c8
KT
18839 /* We allow "+nothing" in the beginning to clear out all architectural
18840 features if the user wants to handpick specific features. */
18841 if (strncmp ("+nothing", str, 8) == 0)
18842 {
18843 isa_flags = 0;
18844 str += 8;
18845 }
18846
c7887347
ML
18847 std::string invalid_extension;
18848 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
5a2c8331
KT
18849
18850 if (parse_res == AARCH64_PARSE_OK)
18851 {
2a269bda 18852 aarch64_set_asm_isa_flags (isa_flags);
5a2c8331
KT
18853 return true;
18854 }
18855
18856 switch (parse_res)
18857 {
18858 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 18859 error ("missing value in %<target()%> pragma or attribute");
5a2c8331
KT
18860 break;
18861
18862 case AARCH64_PARSE_INVALID_FEATURE:
c6e75a4a 18863 error ("invalid feature modifier %qs of value %qs in "
c7887347 18864 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
5a2c8331
KT
18865 break;
18866
18867 default:
18868 gcc_unreachable ();
18869 }
18870
18871 return false;
18872}
18873
18874/* The target attributes that we support. On top of these we also support just
18875 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18876 handled explicitly in aarch64_process_one_target_attr. */
18877
18878static const struct aarch64_attribute_info aarch64_attributes[] =
18879{
18880 { "general-regs-only", aarch64_attr_mask, false, NULL,
18881 OPT_mgeneral_regs_only },
18882 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18883 OPT_mfix_cortex_a53_835769 },
48bb1a55
CL
18884 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18885 OPT_mfix_cortex_a53_843419 },
5a2c8331 18886 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
675d044c 18887 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
5a2c8331
KT
18888 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18889 OPT_momit_leaf_frame_pointer },
18890 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18891 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18892 OPT_march_ },
18893 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18894 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18895 OPT_mtune_ },
efac62a3
ST
18896 { "branch-protection", aarch64_attr_custom, false,
18897 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
db58fd89
JW
18898 { "sign-return-address", aarch64_attr_enum, false, NULL,
18899 OPT_msign_return_address_ },
9e02b45f
ML
18900 { "outline-atomics", aarch64_attr_bool, true, NULL,
18901 OPT_moutline_atomics},
5a2c8331
KT
18902 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18903};
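/* Illustrative usage (not from the original sources): the table above
   covers attribute strings such as

     __attribute__ ((target ("arch=armv8.2-a+crc")))
     __attribute__ ((target ("cpu=cortex-a57")))
     __attribute__ ((target ("tune=cortex-a57,no-strict-align")))

   where multiple attributes are separated by ',' and a "no-" prefix is
   only accepted when allow_neg is true for that entry.  */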
18904
18905/* Parse ARG_STR which contains the definition of one target attribute.
ab93e9b7 18906 Show appropriate errors if any or return true if the attribute is valid. */
5a2c8331
KT
18907
18908static bool
ab93e9b7 18909aarch64_process_one_target_attr (char *arg_str)
5a2c8331
KT
18910{
18911 bool invert = false;
18912
18913 size_t len = strlen (arg_str);
18914
18915 if (len == 0)
18916 {
ab93e9b7 18917 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
18918 return false;
18919 }
18920
18921 char *str_to_check = (char *) alloca (len + 1);
18922 strcpy (str_to_check, arg_str);
18923
5a2c8331
KT
18924 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18925 It is easier to detect and handle it explicitly here rather than going
18926 through the machinery for the rest of the target attributes in this
18927 function. */
18928 if (*str_to_check == '+')
ab93e9b7 18929 return aarch64_handle_attr_isa_flags (str_to_check);
5a2c8331 18930
c0129e2d 18931 if (len > 3 && startswith (str_to_check, "no-"))
5a2c8331
KT
18932 {
18933 invert = true;
18934 str_to_check += 3;
18935 }
18936 char *arg = strchr (str_to_check, '=');
18937
18938 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18939 and point ARG to "foo". */
18940 if (arg)
18941 {
18942 *arg = '\0';
18943 arg++;
18944 }
18945 const struct aarch64_attribute_info *p_attr;
16d12992 18946 bool found = false;
5a2c8331
KT
18947 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18948 {
18949 /* If the names don't match up, or the user has given an argument
18950 to an attribute that doesn't accept one, or didn't give an argument
18951 to an attribute that expects one, fail to match. */
18952 if (strcmp (str_to_check, p_attr->name) != 0)
18953 continue;
18954
16d12992 18955 found = true;
5a2c8331
KT
18956 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18957 || p_attr->attr_type == aarch64_attr_enum;
18958
18959 if (attr_need_arg_p ^ (arg != NULL))
18960 {
ab93e9b7 18961 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
5a2c8331
KT
18962 return false;
18963 }
18964
18965 /* If the name matches but the attribute does not allow "no-" versions
18966 then we can't match. */
18967 if (invert && !p_attr->allow_neg)
18968 {
ab93e9b7 18969 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
5a2c8331
KT
18970 return false;
18971 }
18972
18973 switch (p_attr->attr_type)
18974 {
18975 /* Has a custom handler registered.
18976 For example, cpu=, arch=, tune=. */
18977 case aarch64_attr_custom:
18978 gcc_assert (p_attr->handler);
ab93e9b7 18979 if (!p_attr->handler (arg))
5a2c8331
KT
18980 return false;
18981 break;
18982
18983 /* Either set or unset a boolean option. */
18984 case aarch64_attr_bool:
18985 {
18986 struct cl_decoded_option decoded;
18987
18988 generate_option (p_attr->opt_num, NULL, !invert,
18989 CL_TARGET, &decoded);
18990 aarch64_handle_option (&global_options, &global_options_set,
18991 &decoded, input_location);
18992 break;
18993 }
18994 /* Set or unset a bit in the target_flags. aarch64_handle_option
18995 should know what mask to apply given the option number. */
18996 case aarch64_attr_mask:
18997 {
18998 struct cl_decoded_option decoded;
18999 /* We only need to specify the option number.
19000 aarch64_handle_option will know which mask to apply. */
19001 decoded.opt_index = p_attr->opt_num;
19002 decoded.value = !invert;
19003 aarch64_handle_option (&global_options, &global_options_set,
19004 &decoded, input_location);
19005 break;
19006 }
19007 /* Use the option setting machinery to set an option to an enum. */
19008 case aarch64_attr_enum:
19009 {
19010 gcc_assert (arg);
19011 bool valid;
19012 int value;
19013 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19014 &value, CL_TARGET);
19015 if (valid)
19016 {
19017 set_option (&global_options, NULL, p_attr->opt_num, value,
19018 NULL, DK_UNSPECIFIED, input_location,
19019 global_dc);
19020 }
19021 else
19022 {
ab93e9b7 19023 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
5a2c8331
KT
19024 }
19025 break;
19026 }
19027 default:
19028 gcc_unreachable ();
19029 }
19030 }
19031
16d12992
KT
19032 /* If we reached here we have either found an attribute and validated
19033 it or matched none. If we matched an attribute but its arguments
19034 were malformed we will have returned false already. */
19035 return found;
5a2c8331
KT
19036}
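/* For illustration, some single-attribute strings this function is expected
   to handle (attribute names per the documented AArch64 target attributes;
   treat these as examples rather than an exhaustive list):

     __attribute__ ((target ("arch=armv8.2-a")))             // aarch64_attr_custom, takes an argument
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  // "no-" prefix negates a boolean option
     __attribute__ ((target ("+crc+nosimd")))                 // leading '+' goes to the ISA-flags handler  */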
19037
19038/* Count how many times the character C appears in
19039 NULL-terminated string STR. */
19040
19041static unsigned int
19042num_occurences_in_str (char c, char *str)
19043{
19044 unsigned int res = 0;
19045 while (*str != '\0')
19046 {
19047 if (*str == c)
19048 res++;
19049
19050 str++;
19051 }
19052
19053 return res;
19054}
19055
19056/* Parse the tree in ARGS that contains the target attribute information
ab93e9b7 19057 and update the global target options space. */
5a2c8331
KT
19058
19059bool
ab93e9b7 19060aarch64_process_target_attr (tree args)
5a2c8331
KT
19061{
19062 if (TREE_CODE (args) == TREE_LIST)
19063 {
19064 do
19065 {
19066 tree head = TREE_VALUE (args);
19067 if (head)
19068 {
ab93e9b7 19069 if (!aarch64_process_target_attr (head))
5a2c8331
KT
19070 return false;
19071 }
19072 args = TREE_CHAIN (args);
19073 } while (args);
19074
19075 return true;
19076 }
3b6cb9e3
ML
19077
19078 if (TREE_CODE (args) != STRING_CST)
19079 {
19080 error ("attribute %<target%> argument not a string");
19081 return false;
19082 }
5a2c8331
KT
19083
19084 size_t len = strlen (TREE_STRING_POINTER (args));
19085 char *str_to_check = (char *) alloca (len + 1);
19086 strcpy (str_to_check, TREE_STRING_POINTER (args));
19087
19088 if (len == 0)
19089 {
ab93e9b7 19090 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
19091 return false;
19092 }
19093
19094 /* Used to catch empty entries between commas, i.e.
19095 attribute ((target ("attr1,,attr2"))). */
19096 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19097
19098 /* Handle multiple target attributes separated by ','. */
7185a4eb 19099 char *token = strtok_r (str_to_check, ",", &str_to_check);
5a2c8331
KT
19100
19101 unsigned int num_attrs = 0;
19102 while (token)
19103 {
19104 num_attrs++;
ab93e9b7 19105 if (!aarch64_process_one_target_attr (token))
5a2c8331 19106 {
145be5ef
PK
19107 /* Check if token is possibly an arch extension without
19108 leading '+'. */
fed55a60 19109 aarch64_feature_flags isa_temp = 0;
145be5ef
PK
19110 auto with_plus = std::string ("+") + token;
19111 enum aarch64_parse_opt_result ext_res
19112 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19113
19114 if (ext_res == AARCH64_PARSE_OK)
19115 error ("arch extension %<%s%> should be prefixed by %<+%>",
19116 token);
19117 else
19118 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
5a2c8331
KT
19119 return false;
19120 }
19121
7185a4eb 19122 token = strtok_r (NULL, ",", &str_to_check);
5a2c8331
KT
19123 }
19124
19125 if (num_attrs != num_commas + 1)
19126 {
ab93e9b7 19127 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
5a2c8331
KT
19128 return false;
19129 }
19130
19131 return true;
19132}
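/* For illustration: a string such as "arch=armv8-a,no-omit-leaf-frame-pointer"
   is split on ',' and each token is passed to aarch64_process_one_target_attr,
   whereas a string with an empty entry such as "attr1,,attr2" fails the
   num_attrs != num_commas + 1 check above and is reported as malformed.  */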
19133
19134/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19135 process attribute ((target ("..."))). */
19136
19137static bool
19138aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19139{
19140 struct cl_target_option cur_target;
19141 bool ret;
19142 tree old_optimize;
19143 tree new_target, new_optimize;
19144 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
91d0e8de
KT
19145
19146 /* If what we're processing is the current pragma string then the
19147 target option node is already stored in target_option_current_node
e53b6e56 19148 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
91d0e8de
KT
19149 having to re-parse the string. This is especially useful to keep
19150 arm_neon.h compile times down since that header contains a lot
19151 of intrinsics enclosed in pragmas. */
19152 if (!existing_target && args == current_target_pragma)
19153 {
19154 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19155 return true;
19156 }
5a2c8331
KT
19157 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19158
ba948b37
JJ
19159 old_optimize
19160 = build_optimization_node (&global_options, &global_options_set);
5a2c8331
KT
19161 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19162
19163 /* If the function changed the optimization levels as well as setting
19164 target options, start with the optimizations specified. */
19165 if (func_optimize && func_optimize != old_optimize)
ba948b37 19166 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
19167 TREE_OPTIMIZATION (func_optimize));
19168
19169 /* Save the current target options to restore at the end. */
ba948b37 19170 cl_target_option_save (&cur_target, &global_options, &global_options_set);
5a2c8331
KT
19171
19172 /* If fndecl already has some target attributes applied to it, unpack
19173 them so that we add this attribute on top of them, rather than
19174 overwriting them. */
19175 if (existing_target)
19176 {
19177 struct cl_target_option *existing_options
19178 = TREE_TARGET_OPTION (existing_target);
19179
19180 if (existing_options)
ba948b37
JJ
19181 cl_target_option_restore (&global_options, &global_options_set,
19182 existing_options);
5a2c8331
KT
19183 }
19184 else
ba948b37
JJ
19185 cl_target_option_restore (&global_options, &global_options_set,
19186 TREE_TARGET_OPTION (target_option_current_node));
5a2c8331 19187
ab93e9b7 19188 ret = aarch64_process_target_attr (args);
5a2c8331
KT
19189
19190 /* Set up any additional state. */
19191 if (ret)
19192 {
19193 aarch64_override_options_internal (&global_options);
ba948b37
JJ
19194 new_target = build_target_option_node (&global_options,
19195 &global_options_set);
5a2c8331
KT
19196 }
19197 else
19198 new_target = NULL;
19199
ba948b37
JJ
19200 new_optimize = build_optimization_node (&global_options,
19201 &global_options_set);
5a2c8331
KT
19202
19203 if (fndecl && ret)
19204 {
19205 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19206
19207 if (old_optimize != new_optimize)
19208 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19209 }
19210
ba948b37 19211 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
5a2c8331
KT
19212
19213 if (old_optimize != new_optimize)
ba948b37 19214 cl_optimization_restore (&global_options, &global_options_set,
5a2c8331
KT
19215 TREE_OPTIMIZATION (old_optimize));
19216 return ret;
19217}
19218
1fd8d40c
KT
19219/* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19220 tri-bool options (yes, no, don't care) and the default value is
19221 DEF, determine whether inlining should be allowed. */
19222
19223static bool
19224aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19225 int dont_care, int def)
19226{
19227 /* If the callee doesn't care, always allow inlining. */
19228 if (callee == dont_care)
19229 return true;
19230
19231 /* If the caller doesn't care, always allow inlining. */
19232 if (caller == dont_care)
19233 return true;
19234
19235 /* Otherwise, allow inlining if the callee and caller values
19236 agree, or if the callee is using the default value. */
19237 return (callee == caller || callee == def);
19238}
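/* Worked example based on the -momit-leaf-frame-pointer call site below,
   where DONT_CARE is 2 and DEF is 1: a callee that leaves the option
   unspecified (2) or that uses the default (1) can be inlined into any
   caller; inlining is rejected only when the callee explicitly chooses a
   non-default value that differs from the caller's setting.  */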
19239
19240/* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19241 to inline CALLEE into CALLER based on target-specific info.
19242 Make sure that the caller and callee have compatible architectural
19243 features. Then go through the other possible target attributes
19244 and see if they can block inlining. Try not to reject always_inline
19245 callees unless they are incompatible architecturally. */
19246
19247static bool
19248aarch64_can_inline_p (tree caller, tree callee)
19249{
19250 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19251 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19252
1fd8d40c
KT
19253 struct cl_target_option *caller_opts
19254 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19255 : target_option_default_node);
19256
675d044c
SD
19257 struct cl_target_option *callee_opts
19258 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19259 : target_option_default_node);
1fd8d40c
KT
19260
19261 /* Callee's ISA flags should be a subset of the caller's. */
2a269bda
RS
19262 if ((caller_opts->x_aarch64_asm_isa_flags
19263 & callee_opts->x_aarch64_asm_isa_flags)
19264 != callee_opts->x_aarch64_asm_isa_flags)
19265 return false;
1fd8d40c 19266 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
2a269bda 19267 != callee_opts->x_aarch64_isa_flags)
1fd8d40c
KT
19268 return false;
19269
19270 /* Allow non-strict aligned functions inlining into strict
19271 aligned ones. */
19272 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19273 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19274 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19275 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19276 return false;
19277
19278 bool always_inline = lookup_attribute ("always_inline",
19279 DECL_ATTRIBUTES (callee));
19280
19281 /* If the architectural features match up and the callee is always_inline
19282 then the other attributes don't matter. */
19283 if (always_inline)
19284 return true;
19285
19286 if (caller_opts->x_aarch64_cmodel_var
19287 != callee_opts->x_aarch64_cmodel_var)
19288 return false;
19289
19290 if (caller_opts->x_aarch64_tls_dialect
19291 != callee_opts->x_aarch64_tls_dialect)
19292 return false;
19293
19294 /* Honour explicit requests to workaround errata. */
19295 if (!aarch64_tribools_ok_for_inlining_p (
19296 caller_opts->x_aarch64_fix_a53_err835769,
19297 callee_opts->x_aarch64_fix_a53_err835769,
19298 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19299 return false;
19300
48bb1a55
CL
19301 if (!aarch64_tribools_ok_for_inlining_p (
19302 caller_opts->x_aarch64_fix_a53_err843419,
19303 callee_opts->x_aarch64_fix_a53_err843419,
19304 2, TARGET_FIX_ERR_A53_843419))
19305 return false;
19306
1fd8d40c
KT
19307 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19308 caller and callee and they don't match up, reject inlining. */
19309 if (!aarch64_tribools_ok_for_inlining_p (
19310 caller_opts->x_flag_omit_leaf_frame_pointer,
19311 callee_opts->x_flag_omit_leaf_frame_pointer,
19312 2, 1))
19313 return false;
19314
19315 /* If the callee has specific tuning overrides, respect them. */
19316 if (callee_opts->x_aarch64_override_tune_string != NULL
19317 && caller_opts->x_aarch64_override_tune_string == NULL)
19318 return false;
19319
19320 /* If the user specified tuning override strings for the
19321 caller and callee and they don't match up, reject inlining.
19322 We just do a string compare here, we don't analyze the meaning
19323 of the string, as it would be too costly for little gain. */
19324 if (callee_opts->x_aarch64_override_tune_string
19325 && caller_opts->x_aarch64_override_tune_string
19326 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19327 caller_opts->x_aarch64_override_tune_string) != 0))
19328 return false;
19329
19330 return true;
19331}
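/* An illustrative reading of the ISA-subset checks above: a callee built
   with the default ISA flags can be inlined into a caller compiled with
   target ("+sve"), but a "+sve" callee cannot be inlined into a caller
   without SVE, because its ISA flags would not be a subset of the
   caller's.  */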
19332
bb6ce448
RS
19333/* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19334 hasn't been already. */
19335
19336unsigned int
19337aarch64_tlsdesc_abi_id ()
19338{
19339 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19340 if (!tlsdesc_abi.initialized_p ())
19341 {
19342 HARD_REG_SET full_reg_clobbers;
19343 CLEAR_HARD_REG_SET (full_reg_clobbers);
19344 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19345 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19346 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19347 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19348 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19349 }
19350 return tlsdesc_abi.id ();
19351}
19352
43e9d192
IB
19353/* Return true if SYMBOL_REF X binds locally. */
19354
19355static bool
19356aarch64_symbol_binds_local_p (const_rtx x)
19357{
19358 return (SYMBOL_REF_DECL (x)
19359 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19360 : SYMBOL_REF_LOCAL_P (x));
19361}
19362
19363/* Return true if SYMBOL_REF X is thread local */
19364static bool
19365aarch64_tls_symbol_p (rtx x)
19366{
19367 if (! TARGET_HAVE_TLS)
19368 return false;
19369
74b27d8e 19370 x = strip_salt (x);
3793ecc1 19371 if (!SYMBOL_REF_P (x))
43e9d192
IB
19372 return false;
19373
19374 return SYMBOL_REF_TLS_MODEL (x) != 0;
19375}
19376
19377/* Classify a TLS symbol into one of the TLS kinds. */
19378enum aarch64_symbol_type
19379aarch64_classify_tls_symbol (rtx x)
19380{
19381 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19382
19383 switch (tls_kind)
19384 {
19385 case TLS_MODEL_GLOBAL_DYNAMIC:
19386 case TLS_MODEL_LOCAL_DYNAMIC:
19387 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19388
19389 case TLS_MODEL_INITIAL_EXEC:
5ae7caad
JW
19390 switch (aarch64_cmodel)
19391 {
19392 case AARCH64_CMODEL_TINY:
19393 case AARCH64_CMODEL_TINY_PIC:
19394 return SYMBOL_TINY_TLSIE;
19395 default:
79496620 19396 return SYMBOL_SMALL_TLSIE;
5ae7caad 19397 }
43e9d192
IB
19398
19399 case TLS_MODEL_LOCAL_EXEC:
cbf5629e
JW
19400 if (aarch64_tls_size == 12)
19401 return SYMBOL_TLSLE12;
19402 else if (aarch64_tls_size == 24)
19403 return SYMBOL_TLSLE24;
19404 else if (aarch64_tls_size == 32)
19405 return SYMBOL_TLSLE32;
19406 else if (aarch64_tls_size == 48)
19407 return SYMBOL_TLSLE48;
19408 else
19409 gcc_unreachable ();
43e9d192
IB
19410
19411 case TLS_MODEL_EMULATED:
19412 case TLS_MODEL_NONE:
19413 return SYMBOL_FORCE_TO_MEM;
19414
19415 default:
19416 gcc_unreachable ();
19417 }
19418}
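/* Illustrative mapping, assuming the default small code model:
     global-/local-dynamic with TLS descriptors -> SYMBOL_SMALL_TLSDESC
     initial-exec                               -> SYMBOL_SMALL_TLSIE
     local-exec with -mtls-size=24              -> SYMBOL_TLSLE24  */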
19419
43cacb12
RS
19420/* Return the correct method for accessing X + OFFSET, where X is either
19421 a SYMBOL_REF or LABEL_REF. */
17f4d4bf 19422
43e9d192 19423enum aarch64_symbol_type
43cacb12 19424aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
43e9d192 19425{
74b27d8e
RS
19426 x = strip_salt (x);
19427
3793ecc1 19428 if (LABEL_REF_P (x))
43e9d192
IB
19429 {
19430 switch (aarch64_cmodel)
19431 {
19432 case AARCH64_CMODEL_LARGE:
19433 return SYMBOL_FORCE_TO_MEM;
19434
19435 case AARCH64_CMODEL_TINY_PIC:
19436 case AARCH64_CMODEL_TINY:
a5350ddc
CSS
19437 return SYMBOL_TINY_ABSOLUTE;
19438
1b1e81f8 19439 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
19440 case AARCH64_CMODEL_SMALL_PIC:
19441 case AARCH64_CMODEL_SMALL:
19442 return SYMBOL_SMALL_ABSOLUTE;
19443
19444 default:
19445 gcc_unreachable ();
19446 }
19447 }
19448
3793ecc1 19449 if (SYMBOL_REF_P (x))
43e9d192 19450 {
43e9d192
IB
19451 if (aarch64_tls_symbol_p (x))
19452 return aarch64_classify_tls_symbol (x);
19453
17f4d4bf
CSS
19454 switch (aarch64_cmodel)
19455 {
fb0746f3 19456 case AARCH64_CMODEL_TINY_PIC:
17f4d4bf 19457 case AARCH64_CMODEL_TINY:
fb0746f3
WD
19458 /* With -fPIC non-local symbols use the GOT. For orthogonality
19459 always use the GOT for extern weak symbols. */
19460 if ((flag_pic || SYMBOL_REF_WEAK (x))
19461 && !aarch64_symbol_binds_local_p (x))
19462 return SYMBOL_TINY_GOT;
19463
15f6e0da 19464 /* When we retrieve symbol + offset address, we have to make sure
f8b756b7
TB
19465 the offset does not cause overflow of the final address. But
19466 we have no way of knowing the address of the symbol at compile time
19467 so we can't accurately say if the distance between the PC and
7d3b27ff
WD
19468 symbol + offset is outside the addressable range of +/-1MB in the
19469 TINY code model. So we limit the maximum offset to +/-64KB and
19470 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
fb0746f3 19471 If offset_within_block_p is true we allow larger offsets. */
7d3b27ff
WD
19472 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19473 || offset_within_block_p (x, offset)))
19474 return SYMBOL_FORCE_TO_MEM;
19475
a5350ddc
CSS
19476 return SYMBOL_TINY_ABSOLUTE;
19477
fb0746f3
WD
19478
19479 case AARCH64_CMODEL_SMALL_SPIC:
19480 case AARCH64_CMODEL_SMALL_PIC:
17f4d4bf 19481 case AARCH64_CMODEL_SMALL:
fb0746f3
WD
19482 if ((flag_pic || SYMBOL_REF_WEAK (x))
19483 && !aarch64_symbol_binds_local_p (x))
19484 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19485 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19486
f8b756b7 19487 /* Same reasoning as the tiny code model, but the offset cap here is
7d3b27ff 19488 1MB, allowing +/-3.9GB for the offset to the symbol. */
7d3b27ff
WD
19489 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19490 || offset_within_block_p (x, offset)))
19491 return SYMBOL_FORCE_TO_MEM;
19492
17f4d4bf 19493 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 19494
9ee6540a
WD
19495 case AARCH64_CMODEL_LARGE:
19496 /* This is alright even in PIC code as the constant
19497 pool reference is always PC relative and within
19498 the same translation unit. */
d47d34bb 19499 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
9ee6540a
WD
19500 return SYMBOL_SMALL_ABSOLUTE;
19501 else
19502 return SYMBOL_FORCE_TO_MEM;
19503
17f4d4bf
CSS
19504 default:
19505 gcc_unreachable ();
19506 }
43e9d192 19507 }
17f4d4bf 19508
43e9d192
IB
19509 /* By default push everything into the constant pool. */
19510 return SYMBOL_FORCE_TO_MEM;
19511}
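/* For example, under the small code model a reference such as sym + 0x1000
   is classified as SYMBOL_SMALL_ABSOLUTE, whereas sym + 0x200000 exceeds the
   1MB offset cap above and is forced to the constant pool unless
   offset_within_block_p shows that the offset stays within sym's block.  */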
19512
43e9d192
IB
19513bool
19514aarch64_constant_address_p (rtx x)
19515{
19516 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19517}
19518
19519bool
19520aarch64_legitimate_pic_operand_p (rtx x)
19521{
74b27d8e
RS
19522 poly_int64 offset;
19523 x = strip_offset_and_salt (x, &offset);
3793ecc1 19524 if (SYMBOL_REF_P (x))
74b27d8e 19525 return false;
43e9d192
IB
19526
19527 return true;
19528}
19529
26895c21
WD
19530/* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19531 that should be rematerialized rather than spilled. */
3520f7cc 19532
43e9d192 19533static bool
ef4bddc2 19534aarch64_legitimate_constant_p (machine_mode mode, rtx x)
43e9d192 19535{
26895c21 19536 /* Support CSE and rematerialization of common constants. */
c0bb5bc5 19537 if (CONST_INT_P (x)
0dc8e1e7 19538 || CONST_DOUBLE_P (x))
26895c21
WD
19539 return true;
19540
1b5f74e8
RS
19541 /* Only accept variable-length vector constants if they can be
19542 handled directly.
19543
19544 ??? It would be possible (but complex) to handle rematerialization
19545 of other constants via secondary reloads. */
19546 if (!GET_MODE_SIZE (mode).is_constant ())
19547 return aarch64_simd_valid_immediate (x, NULL);
19548
19549 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19550 least be forced to memory and loaded from there. */
568b9c0e 19551 if (CONST_VECTOR_P (x))
1b5f74e8
RS
19552 return !targetm.cannot_force_const_mem (mode, x);
19553
43cacb12
RS
19554 /* Do not allow vector struct mode constants for Advanced SIMD.
19555 We could support 0 and -1 easily, but they need support in
19556 aarch64-simd.md. */
19557 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19558 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
43e9d192
IB
19559 return false;
19560
509bb9b6
RS
19561 if (GET_CODE (x) == HIGH)
19562 x = XEXP (x, 0);
19563
43cacb12
RS
19564 /* Accept polynomial constants that can be calculated by using the
19565 destination of a move as the sole temporary. Constants that
19566 require a second temporary cannot be rematerialized (they can't be
19567 forced to memory and also aren't legitimate constants). */
19568 poly_int64 offset;
19569 if (poly_int_rtx_p (x, &offset))
19570 return aarch64_offset_temporaries (false, offset) <= 1;
19571
19572 /* If an offset is being added to something else, we need to allow the
19573 base to be moved into the destination register, meaning that there
19574 are no free temporaries for the offset. */
74b27d8e 19575 x = strip_offset_and_salt (x, &offset);
43cacb12
RS
19576 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19577 return false;
26895c21 19578
43cacb12
RS
19579 /* Do not allow const (plus (anchor_symbol, const_int)). */
19580 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19581 return false;
26895c21 19582
f28e54bd
WD
19583 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19584 so spilling them is better than rematerialization. */
19585 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19586 return true;
19587
26895c21 19588 /* Label references are always constant. */
3793ecc1 19589 if (LABEL_REF_P (x))
26895c21
WD
19590 return true;
19591
19592 return false;
43e9d192
IB
19593}
19594
a5bc806c 19595rtx
43e9d192
IB
19596aarch64_load_tp (rtx target)
19597{
19598 if (!target
19599 || GET_MODE (target) != Pmode
19600 || !register_operand (target, Pmode))
19601 target = gen_reg_rtx (Pmode);
19602
19603 /* Can return in any reg. */
19604 emit_insn (gen_aarch64_load_tp_hard (target));
19605 return target;
19606}
19607
43e9d192
IB
19608/* On AAPCS systems, this is the "struct __va_list". */
19609static GTY(()) tree va_list_type;
19610
19611/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19612 Return the type to use as __builtin_va_list.
19613
19614 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19615
19616 struct __va_list
19617 {
19618 void *__stack;
19619 void *__gr_top;
19620 void *__vr_top;
19621 int __gr_offs;
19622 int __vr_offs;
19623 }; */
19624
19625static tree
19626aarch64_build_builtin_va_list (void)
19627{
19628 tree va_list_name;
19629 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19630
19631 /* Create the type. */
19632 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19633 /* Give it the required name. */
19634 va_list_name = build_decl (BUILTINS_LOCATION,
19635 TYPE_DECL,
19636 get_identifier ("__va_list"),
19637 va_list_type);
19638 DECL_ARTIFICIAL (va_list_name) = 1;
19639 TYPE_NAME (va_list_type) = va_list_name;
665c56c6 19640 TYPE_STUB_DECL (va_list_type) = va_list_name;
43e9d192
IB
19641
19642 /* Create the fields. */
19643 f_stack = build_decl (BUILTINS_LOCATION,
19644 FIELD_DECL, get_identifier ("__stack"),
19645 ptr_type_node);
19646 f_grtop = build_decl (BUILTINS_LOCATION,
19647 FIELD_DECL, get_identifier ("__gr_top"),
19648 ptr_type_node);
19649 f_vrtop = build_decl (BUILTINS_LOCATION,
19650 FIELD_DECL, get_identifier ("__vr_top"),
19651 ptr_type_node);
19652 f_groff = build_decl (BUILTINS_LOCATION,
19653 FIELD_DECL, get_identifier ("__gr_offs"),
19654 integer_type_node);
19655 f_vroff = build_decl (BUILTINS_LOCATION,
19656 FIELD_DECL, get_identifier ("__vr_offs"),
19657 integer_type_node);
19658
88e3bdd1 19659 /* Tell tree-stdarg pass about our internal offset fields.
3fd6b9cc
JW
19660 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19661 purposes, to identify whether the code is updating va_list internal
19662 offset fields in an irregular way. */
19663 va_list_gpr_counter_field = f_groff;
19664 va_list_fpr_counter_field = f_vroff;
19665
43e9d192
IB
19666 DECL_ARTIFICIAL (f_stack) = 1;
19667 DECL_ARTIFICIAL (f_grtop) = 1;
19668 DECL_ARTIFICIAL (f_vrtop) = 1;
19669 DECL_ARTIFICIAL (f_groff) = 1;
19670 DECL_ARTIFICIAL (f_vroff) = 1;
19671
19672 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19673 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19674 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19675 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19676 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19677
19678 TYPE_FIELDS (va_list_type) = f_stack;
19679 DECL_CHAIN (f_stack) = f_grtop;
19680 DECL_CHAIN (f_grtop) = f_vrtop;
19681 DECL_CHAIN (f_vrtop) = f_groff;
19682 DECL_CHAIN (f_groff) = f_vroff;
19683
19684 /* Compute its layout. */
19685 layout_type (va_list_type);
19686
19687 return va_list_type;
19688}
19689
19690/* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19691static void
19692aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19693{
19694 const CUMULATIVE_ARGS *cum;
19695 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19696 tree stack, grtop, vrtop, groff, vroff;
19697 tree t;
88e3bdd1
JW
19698 int gr_save_area_size = cfun->va_list_gpr_size;
19699 int vr_save_area_size = cfun->va_list_fpr_size;
43e9d192
IB
19700 int vr_offset;
19701
19702 cum = &crtl->args.info;
88e3bdd1
JW
19703 if (cfun->va_list_gpr_size)
19704 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19705 cfun->va_list_gpr_size);
19706 if (cfun->va_list_fpr_size)
19707 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19708 * UNITS_PER_VREG, cfun->va_list_fpr_size);
43e9d192 19709
d5726973 19710 if (!TARGET_FLOAT)
43e9d192 19711 {
261fb553 19712 gcc_assert (cum->aapcs_nvrn == 0);
43e9d192
IB
19713 vr_save_area_size = 0;
19714 }
19715
19716 f_stack = TYPE_FIELDS (va_list_type_node);
19717 f_grtop = DECL_CHAIN (f_stack);
19718 f_vrtop = DECL_CHAIN (f_grtop);
19719 f_groff = DECL_CHAIN (f_vrtop);
19720 f_vroff = DECL_CHAIN (f_groff);
19721
19722 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19723 NULL_TREE);
19724 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19725 NULL_TREE);
19726 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19727 NULL_TREE);
19728 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19729 NULL_TREE);
19730 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19731 NULL_TREE);
19732
19733 /* Emit code to initialize STACK, which points to the next varargs stack
19734 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19735 by named arguments. STACK is 8-byte aligned. */
19736 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19737 if (cum->aapcs_stack_size > 0)
19738 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19739 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19740 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19741
19742 /* Emit code to initialize GRTOP, the top of the GR save area.
19743 virtual_incoming_args_rtx should have been 16 byte aligned. */
19744 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19745 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19746 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19747
19748 /* Emit code to initialize VRTOP, the top of the VR save area.
19749 This address is gr_save_area_bytes below GRTOP, rounded
19750 down to the next 16-byte boundary. */
19751 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
4f59f9f2
UB
19752 vr_offset = ROUND_UP (gr_save_area_size,
19753 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
19754
19755 if (vr_offset)
19756 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19757 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19758 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19759
19760 /* Emit code to initialize GROFF, the offset from GRTOP of the
19761 next GPR argument. */
19762 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19763 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19764 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19765
19766 /* Likewise emit code to initialize VROFF, the offset from FTOP
19767 of the next VR argument. */
19768 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19769 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19770 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19771}
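/* A rough sketch of the layout set up above, relative to
   virtual_incoming_args_rtx (higher addresses at the top):

       anonymous stack arguments   <- __stack
       named stack arguments       (cum->aapcs_stack_size words)
                                   <- __gr_top == virtual_incoming_args_rtx
       GR save area                accessed at __gr_top + __gr_offs (negative)
                                   <- __vr_top == __gr_top - ROUND_UP (gr size, 16)
       VR save area                accessed at __vr_top + __vr_offs (negative)  */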
19772
19773/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19774
19775static tree
19776aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19777 gimple_seq *post_p ATTRIBUTE_UNUSED)
19778{
19779 tree addr;
19780 bool indirect_p;
19781 bool is_ha; /* is HFA or HVA. */
19782 bool dw_align; /* double-word align. */
ef4bddc2 19783 machine_mode ag_mode = VOIDmode;
43e9d192 19784 int nregs;
ef4bddc2 19785 machine_mode mode;
43e9d192
IB
19786
19787 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19788 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19789 HOST_WIDE_INT size, rsize, adjust, align;
19790 tree t, u, cond1, cond2;
19791
fde65a89 19792 indirect_p = pass_va_arg_by_reference (type);
43e9d192
IB
19793 if (indirect_p)
19794 type = build_pointer_type (type);
19795
19796 mode = TYPE_MODE (type);
19797
19798 f_stack = TYPE_FIELDS (va_list_type_node);
19799 f_grtop = DECL_CHAIN (f_stack);
19800 f_vrtop = DECL_CHAIN (f_grtop);
19801 f_groff = DECL_CHAIN (f_vrtop);
19802 f_vroff = DECL_CHAIN (f_groff);
19803
19804 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19805 f_stack, NULL_TREE);
19806 size = int_size_in_bytes (type);
c590597c 19807
49813aad 19808 unsigned int abi_break;
6610daa1 19809 unsigned int abi_break_packed;
c590597c 19810 align
6610daa1
CL
19811 = aarch64_function_arg_alignment (mode, type, &abi_break, &abi_break_packed)
19812 / BITS_PER_UNIT;
43e9d192
IB
19813
19814 dw_align = false;
19815 adjust = 0;
56fe3ca3
RS
19816 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19817 &is_ha, false))
43e9d192 19818 {
6a70badb
RS
19819 /* No frontends can create types with variable-sized modes, so we
19820 shouldn't be asked to pass or return them. */
19821 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19822
43e9d192 19823 /* TYPE passed in fp/simd registers. */
d5726973 19824 if (!TARGET_FLOAT)
fc29dfc9 19825 aarch64_err_no_fpadvsimd (mode);
43e9d192
IB
19826
19827 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19828 unshare_expr (valist), f_vrtop, NULL_TREE);
19829 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19830 unshare_expr (valist), f_vroff, NULL_TREE);
19831
19832 rsize = nregs * UNITS_PER_VREG;
19833
19834 if (is_ha)
19835 {
6a70badb
RS
19836 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19837 adjust = UNITS_PER_VREG - ag_size;
43e9d192 19838 }
76b0cbf8 19839 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
19840 && size < UNITS_PER_VREG)
19841 {
19842 adjust = UNITS_PER_VREG - size;
19843 }
19844 }
19845 else
19846 {
19847 /* TYPE passed in general registers. */
19848 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19849 unshare_expr (valist), f_grtop, NULL_TREE);
19850 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19851 unshare_expr (valist), f_groff, NULL_TREE);
4f59f9f2 19852 rsize = ROUND_UP (size, UNITS_PER_WORD);
43e9d192
IB
19853 nregs = rsize / UNITS_PER_WORD;
19854
6610daa1
CL
19855 if (align <= 8 && abi_break_packed && warn_psabi)
19856 inform (input_location, "parameter passing for argument of type "
19857 "%qT changed in GCC 13.1", type);
19858
43e9d192 19859 if (align > 8)
c590597c
RE
19860 {
19861 if (abi_break && warn_psabi)
19862 inform (input_location, "parameter passing for argument of type "
19863 "%qT changed in GCC 9.1", type);
19864 dw_align = true;
19865 }
43e9d192 19866
76b0cbf8 19867 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
19868 && size < UNITS_PER_WORD)
19869 {
19870 adjust = UNITS_PER_WORD - size;
19871 }
19872 }
19873
19874 /* Get a local temporary for the field value. */
19875 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19876
19877 /* Emit code to branch if off >= 0. */
19878 t = build2 (GE_EXPR, boolean_type_node, off,
19879 build_int_cst (TREE_TYPE (off), 0));
19880 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19881
19882 if (dw_align)
19883 {
19884 /* Emit: offs = (offs + 15) & -16. */
19885 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19886 build_int_cst (TREE_TYPE (off), 15));
19887 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19888 build_int_cst (TREE_TYPE (off), -16));
19889 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19890 }
19891 else
19892 roundup = NULL;
19893
19894 /* Update ap.__[g|v]r_offs */
19895 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19896 build_int_cst (TREE_TYPE (off), rsize));
19897 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19898
19899 /* String up. */
19900 if (roundup)
19901 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19902
19903 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19904 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19905 build_int_cst (TREE_TYPE (f_off), 0));
19906 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19907
19908 /* String up: make sure the assignment happens before the use. */
19909 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19910 COND_EXPR_ELSE (cond1) = t;
19911
19912 /* Prepare the trees handling the argument that is passed on the stack;
19913 the top level node will store in ON_STACK. */
19914 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19915 if (align > 8)
19916 {
19917 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
4bdc2738 19918 t = fold_build_pointer_plus_hwi (arg, 15);
43e9d192
IB
19919 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19920 build_int_cst (TREE_TYPE (t), -16));
43e9d192
IB
19921 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19922 }
19923 else
19924 roundup = NULL;
19925 /* Advance ap.__stack */
4bdc2738 19926 t = fold_build_pointer_plus_hwi (arg, size + 7);
43e9d192
IB
19927 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19928 build_int_cst (TREE_TYPE (t), -8));
43e9d192
IB
19929 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19930 /* String up roundup and advance. */
19931 if (roundup)
19932 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19933 /* String up with arg */
19934 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19935 /* Big-endianness related address adjustment. */
76b0cbf8 19936 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
19937 && size < UNITS_PER_WORD)
19938 {
19939 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19940 size_int (UNITS_PER_WORD - size));
19941 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19942 }
19943
19944 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19945 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19946
19947 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19948 t = off;
19949 if (adjust)
19950 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19951 build_int_cst (TREE_TYPE (off), adjust));
19952
19953 t = fold_convert (sizetype, t);
19954 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19955
19956 if (is_ha)
19957 {
19958 /* type ha; // treat as "struct {ftype field[n];}"
19959 ... [computing offs]
19960 for (i = 0; i <nregs; ++i, offs += 16)
19961 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19962 return ha; */
19963 int i;
19964 tree tmp_ha, field_t, field_ptr_t;
19965
19966 /* Declare a local variable. */
19967 tmp_ha = create_tmp_var_raw (type, "ha");
19968 gimple_add_tmp_var (tmp_ha);
19969
19970 /* Establish the base type. */
19971 switch (ag_mode)
19972 {
4e10a5a7 19973 case E_SFmode:
43e9d192
IB
19974 field_t = float_type_node;
19975 field_ptr_t = float_ptr_type_node;
19976 break;
4e10a5a7 19977 case E_DFmode:
43e9d192
IB
19978 field_t = double_type_node;
19979 field_ptr_t = double_ptr_type_node;
19980 break;
4e10a5a7 19981 case E_TFmode:
43e9d192
IB
19982 field_t = long_double_type_node;
19983 field_ptr_t = long_double_ptr_type_node;
19984 break;
67d399d5 19985 case E_SDmode:
0dc8e1e7
CL
19986 field_t = dfloat32_type_node;
19987 field_ptr_t = build_pointer_type (dfloat32_type_node);
19988 break;
67d399d5 19989 case E_DDmode:
0dc8e1e7
CL
19990 field_t = dfloat64_type_node;
19991 field_ptr_t = build_pointer_type (dfloat64_type_node);
19992 break;
67d399d5 19993 case E_TDmode:
0dc8e1e7
CL
19994 field_t = dfloat128_type_node;
19995 field_ptr_t = build_pointer_type (dfloat128_type_node);
19996 break;
4e10a5a7 19997 case E_HFmode:
1b62ed4f
JG
19998 field_t = aarch64_fp16_type_node;
19999 field_ptr_t = aarch64_fp16_ptr_type_node;
43e9d192 20000 break;
abbe1ed2
SMW
20001 case E_BFmode:
20002 field_t = aarch64_bf16_type_node;
20003 field_ptr_t = aarch64_bf16_ptr_type_node;
20004 break;
4e10a5a7
RS
20005 case E_V2SImode:
20006 case E_V4SImode:
43e9d192
IB
20007 {
20008 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
20009 field_t = build_vector_type_for_mode (innertype, ag_mode);
20010 field_ptr_t = build_pointer_type (field_t);
20011 }
20012 break;
20013 default:
20014 gcc_assert (0);
20015 }
20016
20017 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
ab563903 20018 TREE_ADDRESSABLE (tmp_ha) = 1;
43e9d192
IB
20019 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
20020 addr = t;
20021 t = fold_convert (field_ptr_t, addr);
20022 t = build2 (MODIFY_EXPR, field_t,
20023 build1 (INDIRECT_REF, field_t, tmp_ha),
20024 build1 (INDIRECT_REF, field_t, t));
20025
20026 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
20027 for (i = 1; i < nregs; ++i)
20028 {
20029 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
20030 u = fold_convert (field_ptr_t, addr);
20031 u = build2 (MODIFY_EXPR, field_t,
20032 build2 (MEM_REF, field_t, tmp_ha,
20033 build_int_cst (field_ptr_t,
20034 (i *
20035 int_size_in_bytes (field_t)))),
20036 build1 (INDIRECT_REF, field_t, u));
20037 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
20038 }
20039
20040 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
20041 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
20042 }
20043
20044 COND_EXPR_ELSE (cond2) = t;
20045 addr = fold_convert (build_pointer_type (type), cond1);
20046 addr = build_va_arg_indirect_ref (addr);
20047
20048 if (indirect_p)
20049 addr = build_va_arg_indirect_ref (addr);
20050
20051 return addr;
20052}
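/* Rough pseudo-C for the trees built above, shown for the general-register
   case (the FP/SIMD case has the same shape using __vr_top/__vr_offs):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                  // register save area already consumed
     if (alignof (type) > 8)
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                  // this argument does not fit either
     addr = ap.__gr_top + off;         // plus big-endian / HFA adjustments
     ...
   on_stack:
     addr = ap.__stack;                // rounded up to 16 if alignof > 8
     ap.__stack = (addr + size + 7) & -8;  */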
20053
20054/* Implement TARGET_SETUP_INCOMING_VARARGS. */
20055
20056static void
e7056ca4
RS
20057aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
20058 const function_arg_info &arg,
20059 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
43e9d192
IB
20060{
20061 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
20062 CUMULATIVE_ARGS local_cum;
88e3bdd1
JW
20063 int gr_saved = cfun->va_list_gpr_size;
20064 int vr_saved = cfun->va_list_fpr_size;
43e9d192
IB
20065
20066 /* The caller has advanced CUM up to, but not beyond, the last named
20067 argument. Advance a local copy of CUM past the last "real" named
20068 argument, to find out how many registers are left over. */
20069 local_cum = *cum;
4fe34cdc
JM
20070 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
20071 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), arg);
43e9d192 20072
88e3bdd1
JW
20073 /* Find out how many registers we need to save.
20074 Honor tree-stdarg analysis results. */
20075 if (cfun->va_list_gpr_size)
20076 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
20077 cfun->va_list_gpr_size / UNITS_PER_WORD);
20078 if (cfun->va_list_fpr_size)
20079 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20080 cfun->va_list_fpr_size / UNITS_PER_VREG);
43e9d192 20081
d5726973 20082 if (!TARGET_FLOAT)
43e9d192 20083 {
261fb553 20084 gcc_assert (local_cum.aapcs_nvrn == 0);
43e9d192
IB
20085 vr_saved = 0;
20086 }
20087
20088 if (!no_rtl)
20089 {
20090 if (gr_saved > 0)
20091 {
20092 rtx ptr, mem;
20093
20094 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
20095 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20096 - gr_saved * UNITS_PER_WORD);
20097 mem = gen_frame_mem (BLKmode, ptr);
20098 set_mem_alias_set (mem, get_varargs_alias_set ());
20099
20100 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20101 mem, gr_saved);
20102 }
20103 if (vr_saved > 0)
20104 {
20105 /* We can't use move_block_from_reg, because it will use
20106 the wrong mode, storing D regs only. */
ef4bddc2 20107 machine_mode mode = TImode;
88e3bdd1 20108 int off, i, vr_start;
43e9d192
IB
20109
20110 /* Set OFF to the offset from virtual_incoming_args_rtx of
20111 the first vector register. The VR save area lies below
20112 the GR one, and is aligned to 16 bytes. */
4f59f9f2
UB
20113 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20114 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
20115 off -= vr_saved * UNITS_PER_VREG;
20116
88e3bdd1
JW
20117 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20118 for (i = 0; i < vr_saved; ++i)
43e9d192
IB
20119 {
20120 rtx ptr, mem;
20121
20122 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20123 mem = gen_frame_mem (mode, ptr);
20124 set_mem_alias_set (mem, get_varargs_alias_set ());
88e3bdd1 20125 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
43e9d192
IB
20126 off += UNITS_PER_VREG;
20127 }
20128 }
20129 }
20130
20131 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20132 any complication of having crtl->args.pretend_args_size changed. */
8799637a 20133 cfun->machine->frame.saved_varargs_size
4f59f9f2
UB
20134 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20135 STACK_BOUNDARY / BITS_PER_UNIT)
43e9d192
IB
20136 + vr_saved * UNITS_PER_VREG);
20137}
20138
20139static void
20140aarch64_conditional_register_usage (void)
20141{
20142 int i;
20143 if (!TARGET_FLOAT)
20144 {
20145 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20146 {
20147 fixed_regs[i] = 1;
20148 call_used_regs[i] = 1;
f58d5545 20149 CLEAR_HARD_REG_BIT (operand_reg_set, i);
43e9d192
IB
20150 }
20151 }
43cacb12
RS
20152 if (!TARGET_SVE)
20153 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20154 {
20155 fixed_regs[i] = 1;
20156 call_used_regs[i] = 1;
20157 }
3751345d 20158
183bfdaf
RS
20159 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20160 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20161 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20162
3751345d
RE
20163 /* When tracking speculation, we need a couple of call-clobbered registers
20164 to track the speculation state. It would be nice to just use
20165 IP0 and IP1, but currently there are numerous places that just
20166 assume these registers are free for other uses (eg pointer
20167 authentication). */
20168 if (aarch64_track_speculation)
20169 {
20170 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20171 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20172 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20173 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20174 }
43e9d192
IB
20175}
20176
38e62001
RS
20177/* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20178
20179bool
20180aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20181{
20182 /* For records we're passed a FIELD_DECL, for arrays we're passed
20183 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20184 const_tree type = TREE_TYPE (field_or_array);
20185
20186 /* Assign BLKmode to anything that contains multiple SVE predicates.
20187 For structures, the "multiple" case is indicated by MODE being
20188 VOIDmode. */
20189 unsigned int num_zr, num_pr;
20190 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20191 {
20192 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20193 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20194 TYPE_SIZE (type));
20195 return mode == VOIDmode;
20196 }
20197
20198 return default_member_type_forces_blk (field_or_array, mode);
20199}
20200
56fe3ca3
RS
20201/* Bitmasks that indicate whether earlier versions of GCC would have
20202 taken a different path through the ABI logic. This should result in
20203 a -Wpsabi warning if the earlier path led to a different ABI decision.
20204
20205 WARN_PSABI_EMPTY_CXX17_BASE
20206 Indicates that the type includes an artificial empty C++17 base field
20207 that, prior to GCC 10.1, would prevent the type from being treated as
20208 a HFA or HVA. See PR94383 for details.
20209
20210 WARN_PSABI_NO_UNIQUE_ADDRESS
20211 Indicates that the type includes an empty [[no_unique_address]] field
20212 that, prior to GCC 10.1, would prevent the type from being treated as
20213 a HFA or HVA. */
20214const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20215const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
b243ad1a 20216const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
56fe3ca3 20217
43e9d192
IB
20218/* Walk down the type tree of TYPE counting consecutive base elements.
20219 If *MODEP is VOIDmode, then set it to the first valid floating point
20220 type. If a non-floating point type is found, or if a floating point
20221 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
e73a32d6
MM
20222 otherwise return the count in the sub-tree.
20223
56fe3ca3
RS
20224 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20225 function has changed its behavior relative to earlier versions of GCC.
20226 Normally the argument should be nonnull and point to a zero-initialized
20227 variable. The function then records whether the ABI decision might
20228 be affected by a known fix to the ABI logic, setting the associated
20229 WARN_PSABI_* bits if so.
20230
20231 When the argument is instead a null pointer, the function tries to
20232 simulate the behavior of GCC before all such ABI fixes were made.
20233 This is useful to check whether the function returns something
20234 different after the ABI fixes. */
43e9d192 20235static int
e73a32d6 20236aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
56fe3ca3 20237 unsigned int *warn_psabi_flags)
43e9d192 20238{
ef4bddc2 20239 machine_mode mode;
43e9d192
IB
20240 HOST_WIDE_INT size;
20241
38e62001
RS
20242 if (aarch64_sve::builtin_type_p (type))
20243 return -1;
c600df9a 20244
43e9d192
IB
20245 switch (TREE_CODE (type))
20246 {
20247 case REAL_TYPE:
20248 mode = TYPE_MODE (type);
1b62ed4f 20249 if (mode != DFmode && mode != SFmode
0dc8e1e7
CL
20250 && mode != TFmode && mode != HFmode
20251 && mode != SDmode && mode != DDmode && mode != TDmode)
43e9d192
IB
20252 return -1;
20253
20254 if (*modep == VOIDmode)
20255 *modep = mode;
20256
20257 if (*modep == mode)
20258 return 1;
20259
20260 break;
20261
20262 case COMPLEX_TYPE:
20263 mode = TYPE_MODE (TREE_TYPE (type));
1b62ed4f
JG
20264 if (mode != DFmode && mode != SFmode
20265 && mode != TFmode && mode != HFmode)
43e9d192
IB
20266 return -1;
20267
20268 if (*modep == VOIDmode)
20269 *modep = mode;
20270
20271 if (*modep == mode)
20272 return 2;
20273
20274 break;
20275
20276 case VECTOR_TYPE:
20277 /* Use V2SImode and V4SImode as representatives of all 64-bit
20278 and 128-bit vector types. */
20279 size = int_size_in_bytes (type);
20280 switch (size)
20281 {
20282 case 8:
20283 mode = V2SImode;
20284 break;
20285 case 16:
20286 mode = V4SImode;
20287 break;
20288 default:
20289 return -1;
20290 }
20291
20292 if (*modep == VOIDmode)
20293 *modep = mode;
20294
20295 /* Vector modes are considered to be opaque: two vectors are
20296 equivalent for the purposes of being homogeneous aggregates
20297 if they are the same size. */
20298 if (*modep == mode)
20299 return 1;
20300
20301 break;
20302
20303 case ARRAY_TYPE:
20304 {
20305 int count;
20306 tree index = TYPE_DOMAIN (type);
20307
807e902e
KZ
20308 /* Can't handle incomplete types nor sizes that are not
20309 fixed. */
20310 if (!COMPLETE_TYPE_P (type)
20311 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
20312 return -1;
20313
e73a32d6 20314 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
56fe3ca3 20315 warn_psabi_flags);
43e9d192
IB
20316 if (count == -1
20317 || !index
20318 || !TYPE_MAX_VALUE (index)
cc269bb6 20319 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
43e9d192 20320 || !TYPE_MIN_VALUE (index)
cc269bb6 20321 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
43e9d192
IB
20322 || count < 0)
20323 return -1;
20324
ae7e9ddd
RS
20325 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20326 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
43e9d192
IB
20327
20328 /* There must be no padding. */
6a70badb
RS
20329 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20330 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
20331 return -1;
20332
20333 return count;
20334 }
20335
20336 case RECORD_TYPE:
20337 {
20338 int count = 0;
20339 int sub_count;
20340 tree field;
20341
807e902e
KZ
20342 /* Can't handle incomplete types nor sizes that are not
20343 fixed. */
20344 if (!COMPLETE_TYPE_P (type)
20345 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
20346 return -1;
20347
20348 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20349 {
20350 if (TREE_CODE (field) != FIELD_DECL)
20351 continue;
20352
56fe3ca3 20353 if (DECL_FIELD_ABI_IGNORED (field))
e73a32d6 20354 {
56fe3ca3
RS
20355 /* See whether this is something that earlier versions of
20356 GCC failed to ignore. */
20357 unsigned int flag;
20358 if (lookup_attribute ("no_unique_address",
20359 DECL_ATTRIBUTES (field)))
20360 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20361 else if (cxx17_empty_base_field_p (field))
20362 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20363 else
20364 /* No compatibility problem. */
20365 continue;
20366
20367 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20368 if (warn_psabi_flags)
20369 {
20370 *warn_psabi_flags |= flag;
20371 continue;
20372 }
e73a32d6 20373 }
b243ad1a
RE
20374 /* A zero-width bitfield may affect layout in some
20375 circumstances, but adds no members. The determination
20376 of whether or not a type is an HFA is performed after
20377 layout is complete, so if the type still looks like an
20378 HFA afterwards, it is still classed as one. This is
20379 potentially an ABI break for the hard-float ABI. */
20380 else if (DECL_BIT_FIELD (field)
20381 && integer_zerop (DECL_SIZE (field)))
20382 {
20383 /* Prior to GCC-12 these fields were stripped early,
20384 hiding them from the back-end entirely and
20385 resulting in the correct behaviour for argument
20386 passing. Simulate that old behaviour without
20387 generating a warning. */
20388 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20389 continue;
20390 if (warn_psabi_flags)
20391 {
20392 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20393 continue;
20394 }
20395 }
e73a32d6
MM
20396
20397 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 20398 warn_psabi_flags);
43e9d192
IB
20399 if (sub_count < 0)
20400 return -1;
20401 count += sub_count;
20402 }
20403
20404 /* There must be no padding. */
6a70badb
RS
20405 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20406 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
20407 return -1;
20408
20409 return count;
20410 }
20411
20412 case UNION_TYPE:
20413 case QUAL_UNION_TYPE:
20414 {
20415 /* These aren't very interesting except in a degenerate case. */
20416 int count = 0;
20417 int sub_count;
20418 tree field;
20419
807e902e
KZ
20420 /* Can't handle incomplete types nor sizes that are not
20421 fixed. */
20422 if (!COMPLETE_TYPE_P (type)
20423 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
20424 return -1;
20425
20426 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20427 {
20428 if (TREE_CODE (field) != FIELD_DECL)
20429 continue;
20430
e73a32d6 20431 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 20432 warn_psabi_flags);
43e9d192
IB
20433 if (sub_count < 0)
20434 return -1;
20435 count = count > sub_count ? count : sub_count;
20436 }
20437
20438 /* There must be no padding. */
6a70badb
RS
20439 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20440 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
20441 return -1;
20442
20443 return count;
20444 }
20445
20446 default:
20447 break;
20448 }
20449
20450 return -1;
20451}
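/* Illustrative results, assuming arm_neon.h's float32x4_t:
     struct { float x, y, z; }                 -> 3, *modep == SFmode  (HFA)
     struct { float32x4_t a; float32x4_t b; }  -> 2, *modep == V4SImode (HVA)
     struct { float f; double d; }             -> -1 (mixed element modes)  */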
20452
b6ec6215
KT
20453/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20454 type as described in AAPCS64 \S 4.1.2.
20455
20456 See the comment above aarch64_composite_type_p for the notes on MODE. */
20457
20458static bool
20459aarch64_short_vector_p (const_tree type,
20460 machine_mode mode)
20461{
6a70badb 20462 poly_int64 size = -1;
b6ec6215
KT
20463
20464 if (type && TREE_CODE (type) == VECTOR_TYPE)
38e62001
RS
20465 {
20466 if (aarch64_sve::builtin_type_p (type))
20467 return false;
20468 size = int_size_in_bytes (type);
20469 }
b6ec6215 20470 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
38e62001
RS
20471 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20472 {
73c3dace
RS
20473 /* The containing "else if" is too loose: it means that we look at TYPE
20474 if the type is a vector type (good), but that we otherwise ignore TYPE
20475 and look only at the mode. This is wrong because the type describes
20476 the language-level information whereas the mode is purely an internal
20477 GCC concept. We can therefore reach here for types that are not
20478 vectors in the AAPCS64 sense.
20479
20480 We can't "fix" that for the traditional Advanced SIMD vector modes
20481 without breaking backwards compatibility. However, there's no such
20482 baggage for the structure modes, which were introduced in GCC 12. */
20483 if (aarch64_advsimd_struct_mode_p (mode))
20484 return false;
20485
20486 /* For similar reasons, rely only on the type, not the mode, when
20487 processing SVE types. */
38e62001 20488 if (type && aarch64_some_values_include_pst_objects_p (type))
b2672dd6
FY
20489 /* Leave later code to report an error if SVE is disabled. */
20490 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
38e62001
RS
20491 else
20492 size = GET_MODE_SIZE (mode);
20493 }
20494 if (known_eq (size, 8) || known_eq (size, 16))
20495 {
20496 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20497 they are being treated as scalable AAPCS64 types. */
73c3dace
RS
20498 gcc_assert (!aarch64_sve_mode_p (mode)
20499 && !aarch64_advsimd_struct_mode_p (mode));
38e62001
RS
20500 return true;
20501 }
20502 return false;
b6ec6215
KT
20503}
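/* Illustrative reading of the checks above: a 64-bit or 128-bit vector type
   such as arm_neon.h's int32x4_t is a short vector in the AAPCS64 sense,
   while SVE builtin types are rejected outright and the GCC-internal
   Advanced SIMD structure modes are excluded when only a mode is
   available.  */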
20504
43e9d192
IB
20505/* Return TRUE if the type, as described by TYPE and MODE, is a composite
20506 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20507 array types. The C99 floating-point complex types are also considered
20508 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20509 types, which are GCC extensions and out of the scope of AAPCS64, are
20510 treated as composite types here as well.
20511
20512 Note that MODE itself is not sufficient in determining whether a type
20513 is such a composite type or not. This is because
e53b6e56 20514 stor-layout.cc:compute_record_mode may have already changed the MODE
43e9d192
IB
20515 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20516 structure with only one field may have its MODE set to the mode of the
20517 field. Also an integer mode whose size matches the size of the
20518 RECORD_TYPE type may be used to substitute the original mode
20519 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20520 solely relied on. */
20521
20522static bool
20523aarch64_composite_type_p (const_tree type,
ef4bddc2 20524 machine_mode mode)
43e9d192 20525{
b6ec6215
KT
20526 if (aarch64_short_vector_p (type, mode))
20527 return false;
20528
43e9d192
IB
20529 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20530 return true;
20531
20532 if (mode == BLKmode
20533 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20534 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20535 return true;
20536
20537 return false;
20538}
20539
43e9d192
IB
20540/* Return TRUE if an argument, whose type is described by TYPE and MODE,
20541 shall be passed or returned in simd/fp register(s) (providing these
20542 parameter passing registers are available).
20543
20544 Upon successful return, *COUNT returns the number of needed registers,
b6073c9f 20545 *BASE_MODE returns the mode of the individual register and when IS_HA
43e9d192 20546 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
56fe3ca3
RS
20547 floating-point aggregate or a homogeneous short-vector aggregate.
20548
20549 SILENT_P is true if the function should refrain from reporting any
20550 diagnostics. This should only be used if the caller is certain that
20551 any ABI decisions would eventually come through this function with
20552 SILENT_P set to false. */
43e9d192
IB
20553
20554static bool
ef4bddc2 20555aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
43e9d192 20556 const_tree type,
ef4bddc2 20557 machine_mode *base_mode,
43e9d192 20558 int *count,
56fe3ca3
RS
20559 bool *is_ha,
20560 bool silent_p)
43e9d192 20561{
c600df9a
RS
20562 if (is_ha != NULL) *is_ha = false;
20563
ef4bddc2 20564 machine_mode new_mode = VOIDmode;
43e9d192
IB
20565 bool composite_p = aarch64_composite_type_p (type, mode);
20566
0dc8e1e7
CL
20567 if ((!composite_p
20568 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20569 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
43e9d192
IB
20570 || aarch64_short_vector_p (type, mode))
20571 {
20572 *count = 1;
20573 new_mode = mode;
20574 }
20575 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20576 {
20577 if (is_ha != NULL) *is_ha = true;
20578 *count = 2;
20579 new_mode = GET_MODE_INNER (mode);
20580 }
20581 else if (type && composite_p)
20582 {
56fe3ca3
RS
20583 unsigned int warn_psabi_flags = 0;
20584 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20585 &warn_psabi_flags);
43e9d192
IB
20586 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20587 {
e73a32d6
MM
20588 static unsigned last_reported_type_uid;
20589 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20590 int alt;
56fe3ca3
RS
20591 if (!silent_p
20592 && warn_psabi
20593 && warn_psabi_flags
e73a32d6
MM
20594 && uid != last_reported_type_uid
20595 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20596 != ag_count))
20597 {
b243ad1a 20598 const char *url10
e33a1eae 20599 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
b243ad1a
RE
20600 const char *url12
20601 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
e73a32d6
MM
20602 gcc_assert (alt == -1);
20603 last_reported_type_uid = uid;
56fe3ca3
RS
20604 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20605 qualification. */
20606 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20607 inform (input_location, "parameter passing for argument of "
20608 "type %qT with %<[[no_unique_address]]%> members "
691eeb65 20609 "changed %{in GCC 10.1%}",
b243ad1a 20610 TYPE_MAIN_VARIANT (type), url10);
56fe3ca3
RS
20611 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20612 inform (input_location, "parameter passing for argument of "
20613 "type %qT when C++17 is enabled changed to match "
691eeb65 20614 "C++14 %{in GCC 10.1%}",
b243ad1a
RE
20615 TYPE_MAIN_VARIANT (type), url10);
20616 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20617 inform (input_location, "parameter passing for argument of "
20618 "type %qT changed %{in GCC 12.1%}",
20619 TYPE_MAIN_VARIANT (type), url12);
e73a32d6
MM
20620 }
20621
43e9d192
IB
20622 if (is_ha != NULL) *is_ha = true;
20623 *count = ag_count;
20624 }
20625 else
20626 return false;
20627 }
20628 else
20629 return false;
20630
38e62001 20631 gcc_assert (!aarch64_sve_mode_p (new_mode));
43e9d192
IB
20632 *base_mode = new_mode;
20633 return true;
20634}
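/* For illustration: a _Complex float argument yields *COUNT == 2 with
   *BASE_MODE == SFmode and *IS_HA set, while a struct of four floats is a
   homogeneous floating-point aggregate and yields *COUNT == 4 with
   *BASE_MODE == SFmode.  A struct with more than HA_MAX_NUM_FLDS (four)
   floating-point members is not a candidate.  */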
20635
20636/* Implement TARGET_STRUCT_VALUE_RTX. */
20637
20638static rtx
20639aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20640 int incoming ATTRIBUTE_UNUSED)
20641{
20642 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20643}
20644
20645/* Implements target hook vector_mode_supported_p. */
20646static bool
ef4bddc2 20647aarch64_vector_mode_supported_p (machine_mode mode)
43e9d192 20648{
43cacb12 20649 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
cc68f7c2 20650 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
43e9d192
IB
20651}
20652
4aeb1ba7
RS
20653/* Return the full-width SVE vector mode for element mode MODE, if one
20654 exists. */
20655opt_machine_mode
20656aarch64_full_sve_mode (scalar_mode mode)
20657{
20658 switch (mode)
20659 {
20660 case E_DFmode:
20661 return VNx2DFmode;
20662 case E_SFmode:
20663 return VNx4SFmode;
20664 case E_HFmode:
20665 return VNx8HFmode;
02fcd8ac
RS
20666 case E_BFmode:
20667 return VNx8BFmode;
4aeb1ba7 20668 case E_DImode:
02fcd8ac 20669 return VNx2DImode;
4aeb1ba7
RS
20670 case E_SImode:
20671 return VNx4SImode;
20672 case E_HImode:
20673 return VNx8HImode;
20674 case E_QImode:
20675 return VNx16QImode;
20676 default:
20677 return opt_machine_mode ();
20678 }
20679}
20680
20681/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20682 if it exists. */
20683opt_machine_mode
20684aarch64_vq_mode (scalar_mode mode)
20685{
20686 switch (mode)
20687 {
20688 case E_DFmode:
20689 return V2DFmode;
20690 case E_SFmode:
20691 return V4SFmode;
20692 case E_HFmode:
20693 return V8HFmode;
abbe1ed2
SMW
20694 case E_BFmode:
20695 return V8BFmode;
4aeb1ba7
RS
20696 case E_SImode:
20697 return V4SImode;
20698 case E_HImode:
20699 return V8HImode;
20700 case E_QImode:
20701 return V16QImode;
20702 case E_DImode:
20703 return V2DImode;
20704 default:
20705 return opt_machine_mode ();
20706 }
20707}
20708
b7342d25
IB
20709/* Return appropriate SIMD container
20710 for MODE within a vector of WIDTH bits. */
ef4bddc2 20711static machine_mode
43cacb12 20712aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
43e9d192 20713{
9b070057
RS
20714 if (TARGET_SVE
20715 && maybe_ne (width, 128)
20716 && known_eq (width, BITS_PER_SVE_VECTOR))
4aeb1ba7 20717 return aarch64_full_sve_mode (mode).else_mode (word_mode);
43cacb12
RS
20718
20719 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
43e9d192 20720 if (TARGET_SIMD)
b7342d25 20721 {
43cacb12 20722 if (known_eq (width, 128))
4aeb1ba7 20723 return aarch64_vq_mode (mode).else_mode (word_mode);
b7342d25
IB
20724 else
20725 switch (mode)
20726 {
4e10a5a7 20727 case E_SFmode:
b7342d25 20728 return V2SFmode;
4e10a5a7 20729 case E_HFmode:
b719f884 20730 return V4HFmode;
abbe1ed2
SMW
20731 case E_BFmode:
20732 return V4BFmode;
4e10a5a7 20733 case E_SImode:
b7342d25 20734 return V2SImode;
4e10a5a7 20735 case E_HImode:
b7342d25 20736 return V4HImode;
4e10a5a7 20737 case E_QImode:
b7342d25
IB
20738 return V8QImode;
20739 default:
20740 break;
20741 }
20742 }
43e9d192
IB
20743 return word_mode;
20744}
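/* For illustration: (SImode, 128) maps to V4SImode and (SImode, 64) to
   V2SImode, while (SImode, BITS_PER_SVE_VECTOR) maps to VNx4SImode when
   SVE is enabled and the SVE vector width is not known to be exactly
   128 bits.  Unsupported combinations fall back to word_mode.  */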
20745
5f29f3d5
KT
20746/* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20747 and return whether the SVE mode should be preferred over the
20748 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20749static bool
20750aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20751{
20752 /* Take into account the aarch64-autovec-preference param if non-zero. */
20753 bool only_asimd_p = aarch64_autovec_preference == 1;
20754 bool only_sve_p = aarch64_autovec_preference == 2;
20755
20756 if (only_asimd_p)
20757 return false;
20758 if (only_sve_p)
20759 return true;
20760
20761 /* The preference in case of a tie in costs. */
20762 bool prefer_asimd = aarch64_autovec_preference == 3;
20763 bool prefer_sve = aarch64_autovec_preference == 4;
20764
5f29f3d5
KT
20765 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20766 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
 20767   /* If the CPU information does not have an SVE width registered, use the
 20768      generic poly_int comparison that prefers SVE.  If a preference is
 20769      explicitly requested, avoid this path. */
fa3ca615 20770 if (aarch64_tune_params.sve_width == SVE_SCALABLE
5f29f3d5
KT
20771 && !prefer_asimd
20772 && !prefer_sve)
20773 return maybe_gt (nunits_sve, nunits_asimd);
20774
20775 /* Otherwise estimate the runtime width of the modes involved. */
64432b68
KT
20776 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20777 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
5f29f3d5
KT
20778
20779 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20780 is clearly wider. */
20781 if (prefer_sve)
20782 return est_sve >= est_asimd;
20783 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20784 is clearly wider. */
20785 if (prefer_asimd)
20786 return est_sve > est_asimd;
20787
20788 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20789 return est_sve > est_asimd;
20790}
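/* For illustration: --param=aarch64-autovec-preference=1 always returns
   false here (Advanced SIMD only) and =2 always returns true (SVE only).
   With the default of 0 and no SVE width registered for the tuned CPU,
   the poly_int comparison prefers the scalable SVE mode; with a
   registered width the estimated widths decide, and ties go to
   Advanced SIMD.  */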
20791
b7342d25 20792/* Return 128-bit container as the preferred SIMD mode for MODE. */
ef4bddc2 20793static machine_mode
005ba29c 20794aarch64_preferred_simd_mode (scalar_mode mode)
b7342d25 20795{
5f29f3d5
KT
20796 /* Take into account explicit auto-vectorization ISA preferences through
20797 aarch64_cmp_autovec_modes. */
7ff5706f
RS
20798 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20799 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20800 if (TARGET_SIMD)
20801 return aarch64_vq_mode (mode).else_mode (word_mode);
20802 return word_mode;
b7342d25
IB
20803}
20804
86e36728 20805/* Return a list of possible vector sizes for the vectorizer
3b357264 20806 to iterate over. */
bcc7e346 20807static unsigned int
e021fb86 20808aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
3b357264 20809{
cc68f7c2
RS
20810 static const machine_mode sve_modes[] = {
20811 /* Try using full vectors for all element types. */
20812 VNx16QImode,
20813
20814 /* Try using 16-bit containers for 8-bit elements and full vectors
20815 for wider elements. */
20816 VNx8QImode,
20817
20818 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20819 full vectors for wider elements. */
20820 VNx4QImode,
74166aab 20821
cc68f7c2
RS
20822 /* Try using 64-bit containers for all element types. */
20823 VNx2QImode
20824 };
20825
20826 static const machine_mode advsimd_modes[] = {
20827 /* Try using 128-bit vectors for all element types. */
20828 V16QImode,
20829
20830 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20831 for wider elements. */
20832 V8QImode,
20833
20834 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20835 for wider elements.
20836
20837 TODO: We could support a limited form of V4QImode too, so that
20838 we use 32-bit vectors for 8-bit elements. */
20839 V4HImode,
20840
20841 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20842 for 64-bit elements.
74166aab 20843
cc68f7c2
RS
20844 TODO: We could similarly support limited forms of V2QImode and V2HImode
20845 for this case. */
20846 V2SImode
20847 };
74166aab 20848
cc68f7c2
RS
20849 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20850 This is because:
74166aab 20851
cc68f7c2
RS
20852 - If we can't use N-byte Advanced SIMD vectors then the placement
20853 doesn't matter; we'll just continue as though the Advanced SIMD
20854 entry didn't exist.
74166aab 20855
cc68f7c2
RS
20856 - If an SVE main loop with N bytes ends up being cheaper than an
20857 Advanced SIMD main loop with N bytes then by default we'll replace
20858 the Advanced SIMD version with the SVE one.
74166aab 20859
cc68f7c2
RS
20860 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20861 than an SVE main loop with N bytes then by default we'll try to
20862 use the SVE loop to vectorize the epilogue instead. */
5f29f3d5
KT
20863
20864 bool only_asimd_p = aarch64_autovec_preference == 1;
20865 bool only_sve_p = aarch64_autovec_preference == 2;
20866
20867 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
cc68f7c2 20868 unsigned int advsimd_i = 0;
5f29f3d5
KT
20869
20870 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
cc68f7c2
RS
20871 {
20872 if (sve_i < ARRAY_SIZE (sve_modes)
5f29f3d5
KT
20873 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20874 advsimd_modes[advsimd_i]))
cc68f7c2
RS
20875 modes->safe_push (sve_modes[sve_i++]);
20876 else
20877 modes->safe_push (advsimd_modes[advsimd_i++]);
20878 }
20879 while (sve_i < ARRAY_SIZE (sve_modes))
5f29f3d5 20880 modes->safe_push (sve_modes[sve_i++]);
bcc7e346 20881
eb23241b
RS
20882 unsigned int flags = 0;
20883 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20884 can compare SVE against Advanced SIMD and so that we can compare
20885 multiple SVE vectorization approaches against each other. There's
20886 not really any point doing this for Advanced SIMD only, since the
20887 first mode that works should always be the best. */
20888 if (TARGET_SVE && aarch64_sve_compare_costs)
20889 flags |= VECT_COMPARE_COSTS;
20890 return flags;
3b357264
JG
20891}
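/* For illustration: with SVE enabled, --param=aarch64-autovec-preference=2
   skips the Advanced SIMD loop above entirely and pushes only the four SVE
   modes, while =1 pushes only the four Advanced SIMD modes.  */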
20892
ac2b960f
YZ
20893/* Implement TARGET_MANGLE_TYPE. */
20894
6f549691 20895static const char *
ac2b960f
YZ
20896aarch64_mangle_type (const_tree type)
20897{
20898 /* The AArch64 ABI documents say that "__va_list" has to be
17f8ace2 20899 mangled as if it is in the "std" namespace. */
ac2b960f
YZ
20900 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20901 return "St9__va_list";
20902
abbe1ed2 20903 /* Half-precision floating point types. */
c2ec330c 20904 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
abbe1ed2 20905 {
e564021e
JJ
20906 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20907 return NULL;
abbe1ed2
SMW
20908 if (TYPE_MODE (type) == BFmode)
20909 return "u6__bf16";
20910 else
20911 return "Dh";
20912 }
c2ec330c 20913
f9d53c27
TB
20914 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20915 builtin types. */
20916 if (TYPE_NAME (type) != NULL)
624d0f07
RS
20917 {
20918 const char *res;
20919 if ((res = aarch64_general_mangle_builtin_type (type))
20920 || (res = aarch64_sve::mangle_builtin_type (type)))
20921 return res;
20922 }
c6fc9e43 20923
ac2b960f
YZ
20924 /* Use the default mangling. */
20925 return NULL;
20926}
20927
65ef05d0
RS
20928/* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20929
20930static bool
20931aarch64_verify_type_context (location_t loc, type_context_kind context,
20932 const_tree type, bool silent_p)
20933{
20934 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20935}
20936
75cf1494
KT
20937/* Find the first rtx_insn before insn that will generate an assembly
20938 instruction. */
20939
20940static rtx_insn *
20941aarch64_prev_real_insn (rtx_insn *insn)
20942{
20943 if (!insn)
20944 return NULL;
20945
20946 do
20947 {
20948 insn = prev_real_insn (insn);
20949 }
20950 while (insn && recog_memoized (insn) < 0);
20951
20952 return insn;
20953}
20954
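/* Return true if T1 is a multiply-accumulate instruction type, for the
   purposes of the Cortex-A53 erratum 835769 workaround below.  */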
20955static bool
20956is_madd_op (enum attr_type t1)
20957{
20958 unsigned int i;
20959 /* A number of these may be AArch32 only. */
20960 enum attr_type mlatypes[] = {
20961 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20962 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
 20963     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20964 };
20965
ca32b29e 20966 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
75cf1494
KT
20967 {
20968 if (t1 == mlatypes[i])
20969 return true;
20970 }
20971
20972 return false;
20973}
20974
20975/* Check if there is a register dependency between a load and the insn
20976 for which we hold recog_data. */
20977
20978static bool
20979dep_between_memop_and_curr (rtx memop)
20980{
20981 rtx load_reg;
20982 int opno;
20983
8baff86e 20984 gcc_assert (GET_CODE (memop) == SET);
75cf1494
KT
20985
20986 if (!REG_P (SET_DEST (memop)))
20987 return false;
20988
20989 load_reg = SET_DEST (memop);
8baff86e 20990 for (opno = 1; opno < recog_data.n_operands; opno++)
75cf1494
KT
20991 {
20992 rtx operand = recog_data.operand[opno];
20993 if (REG_P (operand)
20994 && reg_overlap_mentioned_p (load_reg, operand))
20995 return true;
20996
20997 }
20998 return false;
20999}
21000
8baff86e
KT
21001
21002/* When working around the Cortex-A53 erratum 835769,
21003 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
21004 instruction and has a preceding memory instruction such that a NOP
21005 should be inserted between them. */
21006
75cf1494
KT
21007bool
21008aarch64_madd_needs_nop (rtx_insn* insn)
21009{
21010 enum attr_type attr_type;
21011 rtx_insn *prev;
21012 rtx body;
21013
b32c1043 21014 if (!TARGET_FIX_ERR_A53_835769)
75cf1494
KT
21015 return false;
21016
e322d6e3 21017 if (!INSN_P (insn) || recog_memoized (insn) < 0)
75cf1494
KT
21018 return false;
21019
21020 attr_type = get_attr_type (insn);
21021 if (!is_madd_op (attr_type))
21022 return false;
21023
21024 prev = aarch64_prev_real_insn (insn);
3fea1a75
KT
21025 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
21026 Restore recog state to INSN to avoid state corruption. */
21027 extract_constrain_insn_cached (insn);
21028
550e2205 21029 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
75cf1494
KT
21030 return false;
21031
21032 body = single_set (prev);
21033
21034 /* If the previous insn is a memory op and there is no dependency between
8baff86e
KT
21035 it and the DImode madd, emit a NOP between them. If body is NULL then we
21036 have a complex memory operation, probably a load/store pair.
21037 Be conservative for now and emit a NOP. */
21038 if (GET_MODE (recog_data.operand[0]) == DImode
21039 && (!body || !dep_between_memop_and_curr (body)))
75cf1494
KT
21040 return true;
21041
21042 return false;
21043
21044}
21045
8baff86e
KT
21046
21047/* Implement FINAL_PRESCAN_INSN. */
21048
75cf1494
KT
21049void
21050aarch64_final_prescan_insn (rtx_insn *insn)
21051{
21052 if (aarch64_madd_needs_nop (insn))
21053 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
21054}
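/* For illustration (an example sequence, not taken from GCC's testsuite):
   with -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that directly
   follows an unrelated memory operation, such as

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   is emitted with a separating

     nop // between mem op and mult-accumulate

   whereas no NOP is needed if the madd depends on the loaded register.  */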
21055
21056
43cacb12
RS
21057/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
21058 instruction. */
21059
21060bool
21061aarch64_sve_index_immediate_p (rtx base_or_step)
21062{
21063 return (CONST_INT_P (base_or_step)
21064 && IN_RANGE (INTVAL (base_or_step), -16, 15));
21065}
21066
f3582fda
RS
21067/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
21068 when applied to mode MODE. Negate X first if NEGATE_P is true. */
43cacb12
RS
21069
21070bool
f3582fda 21071aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
43cacb12 21072{
f3582fda
RS
21073 rtx elt = unwrap_const_vec_duplicate (x);
21074 if (!CONST_INT_P (elt))
43cacb12
RS
21075 return false;
21076
21077 HOST_WIDE_INT val = INTVAL (elt);
21078 if (negate_p)
21079 val = -val;
f3582fda 21080 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
43cacb12
RS
21081
21082 if (val & 0xff)
21083 return IN_RANGE (val, 0, 0xff);
21084 return IN_RANGE (val, 0, 0xff00);
21085}
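/* For illustration: the immediate must fit in 8 bits, either unshifted or
   shifted left by 8, so 255 (0xff) and 256 (0x100) are accepted while
   257 (0x101) is not.  */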
21086
624d0f07 21087/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
f3582fda
RS
21088 instructions when applied to mode MODE. Negate X first if NEGATE_P
21089 is true. */
624d0f07
RS
21090
21091bool
f3582fda 21092aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
624d0f07 21093{
f3582fda 21094 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
624d0f07
RS
21095 return false;
21096
21097 /* After the optional negation, the immediate must be nonnegative.
21098 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21099 instead of SQADD Zn.B, Zn.B, #129. */
f3582fda 21100 rtx elt = unwrap_const_vec_duplicate (x);
624d0f07
RS
21101 return negate_p == (INTVAL (elt) < 0);
21102}
21103
43cacb12
RS
21104/* Return true if X is a valid immediate operand for an SVE logical
21105 instruction such as AND. */
21106
21107bool
21108aarch64_sve_bitmask_immediate_p (rtx x)
21109{
21110 rtx elt;
21111
21112 return (const_vec_duplicate_p (x, &elt)
21113 && CONST_INT_P (elt)
21114 && aarch64_bitmask_imm (INTVAL (elt),
21115 GET_MODE_INNER (GET_MODE (x))));
21116}
21117
21118/* Return true if X is a valid immediate for the SVE DUP and CPY
21119 instructions. */
21120
21121bool
21122aarch64_sve_dup_immediate_p (rtx x)
21123{
d29f7dd5
RS
21124 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21125 if (!CONST_INT_P (x))
43cacb12
RS
21126 return false;
21127
d29f7dd5 21128 HOST_WIDE_INT val = INTVAL (x);
43cacb12
RS
21129 if (val & 0xff)
21130 return IN_RANGE (val, -0x80, 0x7f);
21131 return IN_RANGE (val, -0x8000, 0x7f00);
21132}
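/* For illustration: DUP accepts a signed 8-bit immediate, optionally
   shifted left by 8, so -128 and 0x7f00 are valid while 0x7f01 is not.  */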
21133
21134/* Return true if X is a valid immediate operand for an SVE CMP instruction.
21135 SIGNED_P says whether the operand is signed rather than unsigned. */
21136
21137bool
21138aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21139{
6bc67182
RS
21140 x = unwrap_const_vec_duplicate (x);
21141 return (CONST_INT_P (x)
43cacb12 21142 && (signed_p
6bc67182
RS
21143 ? IN_RANGE (INTVAL (x), -16, 15)
21144 : IN_RANGE (INTVAL (x), 0, 127)));
43cacb12
RS
21145}
21146
21147/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21148 instruction. Negate X first if NEGATE_P is true. */
21149
21150bool
21151aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21152{
21153 rtx elt;
21154 REAL_VALUE_TYPE r;
21155
21156 if (!const_vec_duplicate_p (x, &elt)
3793ecc1 21157 || !CONST_DOUBLE_P (elt))
43cacb12
RS
21158 return false;
21159
21160 r = *CONST_DOUBLE_REAL_VALUE (elt);
21161
21162 if (negate_p)
21163 r = real_value_negate (&r);
21164
21165 if (real_equal (&r, &dconst1))
21166 return true;
21167 if (real_equal (&r, &dconsthalf))
21168 return true;
21169 return false;
21170}
21171
21172/* Return true if X is a valid immediate operand for an SVE FMUL
21173 instruction. */
21174
21175bool
21176aarch64_sve_float_mul_immediate_p (rtx x)
21177{
21178 rtx elt;
21179
43cacb12 21180 return (const_vec_duplicate_p (x, &elt)
3793ecc1 21181 && CONST_DOUBLE_P (elt)
a19ba9e1
RS
21182 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21183 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
43cacb12
RS
21184}
21185
b187677b
RS
21186/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21187 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21188 is nonnull, use it to describe valid immediates. */
3520f7cc 21189static bool
b187677b
RS
21190aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21191 simd_immediate_info *info,
21192 enum simd_immediate_check which,
21193 simd_immediate_info::insn_type insn)
21194{
21195 /* Try a 4-byte immediate with LSL. */
21196 for (unsigned int shift = 0; shift < 32; shift += 8)
21197 if ((val32 & (0xff << shift)) == val32)
21198 {
21199 if (info)
21200 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21201 simd_immediate_info::LSL, shift);
21202 return true;
21203 }
3520f7cc 21204
b187677b
RS
21205 /* Try a 2-byte immediate with LSL. */
21206 unsigned int imm16 = val32 & 0xffff;
21207 if (imm16 == (val32 >> 16))
21208 for (unsigned int shift = 0; shift < 16; shift += 8)
21209 if ((imm16 & (0xff << shift)) == imm16)
48063b9d 21210 {
b187677b
RS
21211 if (info)
21212 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21213 simd_immediate_info::LSL, shift);
21214 return true;
48063b9d 21215 }
3520f7cc 21216
b187677b
RS
21217 /* Try a 4-byte immediate with MSL, except for cases that MVN
21218 can handle. */
21219 if (which == AARCH64_CHECK_MOV)
21220 for (unsigned int shift = 8; shift < 24; shift += 8)
21221 {
21222 unsigned int low = (1 << shift) - 1;
21223 if (((val32 & (0xff << shift)) | low) == val32)
21224 {
21225 if (info)
21226 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21227 simd_immediate_info::MSL, shift);
21228 return true;
21229 }
21230 }
43e9d192 21231
b187677b
RS
21232 return false;
21233}
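/* For illustration: VAL32 == 0x00ab0000 is accepted as the 8-bit value
   0xab with LSL #16, and VAL32 == 0x0000abff is accepted for MOV as 0xab
   with MSL #8 (the bits below the shift must be all-ones for MSL).  */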
21234
21235/* Return true if replicating VAL64 is a valid immediate for the
21236 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21237 use it to describe valid immediates. */
21238static bool
21239aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21240 simd_immediate_info *info,
21241 enum simd_immediate_check which)
21242{
21243 unsigned int val32 = val64 & 0xffffffff;
21244 unsigned int val16 = val64 & 0xffff;
21245 unsigned int val8 = val64 & 0xff;
21246
21247 if (val32 == (val64 >> 32))
43e9d192 21248 {
b187677b
RS
21249 if ((which & AARCH64_CHECK_ORR) != 0
21250 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21251 simd_immediate_info::MOV))
21252 return true;
43e9d192 21253
b187677b
RS
21254 if ((which & AARCH64_CHECK_BIC) != 0
21255 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21256 simd_immediate_info::MVN))
21257 return true;
ee78df47 21258
b187677b
RS
21259 /* Try using a replicated byte. */
21260 if (which == AARCH64_CHECK_MOV
21261 && val16 == (val32 >> 16)
21262 && val8 == (val16 >> 8))
ee78df47 21263 {
b187677b
RS
21264 if (info)
21265 *info = simd_immediate_info (QImode, val8);
21266 return true;
ee78df47 21267 }
43e9d192
IB
21268 }
21269
b187677b
RS
21270 /* Try using a bit-to-bytemask. */
21271 if (which == AARCH64_CHECK_MOV)
43e9d192 21272 {
b187677b
RS
21273 unsigned int i;
21274 for (i = 0; i < 64; i += 8)
ab6501d7 21275 {
b187677b
RS
21276 unsigned char byte = (val64 >> i) & 0xff;
21277 if (byte != 0 && byte != 0xff)
21278 break;
ab6501d7 21279 }
b187677b 21280 if (i == 64)
ab6501d7 21281 {
b187677b
RS
21282 if (info)
21283 *info = simd_immediate_info (DImode, val64);
21284 return true;
ab6501d7 21285 }
43e9d192 21286 }
b187677b
RS
21287 return false;
21288}
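/* As a rough standalone illustration (not part of GCC), the sketch below
   mirrors the bit-to-bytemask test above: a replicated 64-bit value can be
   loaded with a single 64-bit MOVI if every one of its bytes is 0x00 or
   0xff.  The helper name bytemask_immediate_p is made up for the sketch.  */

#include <stdint.h>
#include <stdio.h>

/* Return 1 if every byte of VAL64 is 0x00 or 0xff, mirroring the
   bit-to-bytemask check in aarch64_advsimd_valid_immediate.  */
static int
bytemask_immediate_p (uint64_t val64)
{
  for (unsigned int i = 0; i < 64; i += 8)
    {
      unsigned char byte = (val64 >> i) & 0xff;
      if (byte != 0 && byte != 0xff)
	return 0;
    }
  return 1;
}

int
main (void)
{
  printf ("%d\n", bytemask_immediate_p (0x00ff00ff00ff00ffULL)); /* 1 */
  printf ("%d\n", bytemask_immediate_p (0x0102030405060708ULL)); /* 0 */
  return 0;
}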
43e9d192 21289
43cacb12
RS
21290/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21291 instruction. If INFO is nonnull, use it to describe valid immediates. */
21292
21293static bool
21294aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21295 simd_immediate_info *info)
21296{
21297 scalar_int_mode mode = DImode;
21298 unsigned int val32 = val64 & 0xffffffff;
21299 if (val32 == (val64 >> 32))
21300 {
21301 mode = SImode;
21302 unsigned int val16 = val32 & 0xffff;
21303 if (val16 == (val32 >> 16))
21304 {
21305 mode = HImode;
21306 unsigned int val8 = val16 & 0xff;
21307 if (val8 == (val16 >> 8))
21308 mode = QImode;
21309 }
21310 }
21311 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21312 if (IN_RANGE (val, -0x80, 0x7f))
21313 {
21314 /* DUP with no shift. */
21315 if (info)
21316 *info = simd_immediate_info (mode, val);
21317 return true;
21318 }
21319 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21320 {
21321 /* DUP with LSL #8. */
21322 if (info)
21323 *info = simd_immediate_info (mode, val);
21324 return true;
21325 }
21326 if (aarch64_bitmask_imm (val64, mode))
21327 {
21328 /* DUPM. */
21329 if (info)
21330 *info = simd_immediate_info (mode, val);
21331 return true;
21332 }
21333 return false;
21334}
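/* For illustration: VAL64 == 0x0101010101010101 narrows all the way to
   QImode with value 1 and is matched as a DUP with no shift, while a
   bitmask pattern such as 0x00ff00ff00ff00ff is matched via DUPM.  */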
21335
624d0f07
RS
21336/* Return true if X is an UNSPEC_PTRUE constant of the form:
21337
21338 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21339
21340 where PATTERN is the svpattern as a CONST_INT and where ZERO
21341 is a zero constant of the required PTRUE mode (which can have
21342 fewer elements than X's mode, if zero bits are significant).
21343
21344 If so, and if INFO is nonnull, describe the immediate in INFO. */
21345bool
21346aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21347{
21348 if (GET_CODE (x) != CONST)
21349 return false;
21350
21351 x = XEXP (x, 0);
21352 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21353 return false;
21354
21355 if (info)
21356 {
21357 aarch64_svpattern pattern
21358 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21359 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21360 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21361 *info = simd_immediate_info (int_mode, pattern);
21362 }
21363 return true;
21364}
21365
0b1fe8cf
RS
21366/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21367 it to describe valid immediates. */
21368
21369static bool
21370aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21371{
624d0f07
RS
21372 if (aarch64_sve_ptrue_svpattern_p (x, info))
21373 return true;
21374
0b1fe8cf
RS
21375 if (x == CONST0_RTX (GET_MODE (x)))
21376 {
21377 if (info)
21378 *info = simd_immediate_info (DImode, 0);
21379 return true;
21380 }
21381
21382 /* Analyze the value as a VNx16BImode. This should be relatively
21383 efficient, since rtx_vector_builder has enough built-in capacity
21384 to store all VLA predicate constants without needing the heap. */
21385 rtx_vector_builder builder;
21386 if (!aarch64_get_sve_pred_bits (builder, x))
21387 return false;
21388
21389 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21390 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21391 {
21392 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21393 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21394 if (pattern != AARCH64_NUM_SVPATTERNS)
21395 {
21396 if (info)
21397 {
21398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21399 *info = simd_immediate_info (int_mode, pattern);
21400 }
21401 return true;
21402 }
21403 }
21404 return false;
21405}
21406
b187677b
RS
21407/* Return true if OP is a valid SIMD immediate for the operation
21408 described by WHICH. If INFO is nonnull, use it to describe valid
21409 immediates. */
21410bool
21411aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21412 enum simd_immediate_check which)
21413{
43cacb12
RS
21414 machine_mode mode = GET_MODE (op);
21415 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21416 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21417 return false;
21418
721c0fb3
RS
21419 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21420 return false;
21421
0b1fe8cf
RS
21422 if (vec_flags & VEC_SVE_PRED)
21423 return aarch64_sve_pred_valid_immediate (op, info);
21424
43cacb12 21425 scalar_mode elt_mode = GET_MODE_INNER (mode);
f9093f23 21426 rtx base, step;
b187677b 21427 unsigned int n_elts;
568b9c0e 21428 if (CONST_VECTOR_P (op)
f9093f23
RS
21429 && CONST_VECTOR_DUPLICATE_P (op))
21430 n_elts = CONST_VECTOR_NPATTERNS (op);
43cacb12
RS
21431 else if ((vec_flags & VEC_SVE_DATA)
21432 && const_vec_series_p (op, &base, &step))
21433 {
21434 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21435 if (!aarch64_sve_index_immediate_p (base)
21436 || !aarch64_sve_index_immediate_p (step))
21437 return false;
21438
21439 if (info)
cc68f7c2
RS
21440 {
21441 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21442 should yield two integer values per 128-bit block, meaning
21443 that we need to treat it in the same way as V2DI and then
21444 ignore the upper 32 bits of each element. */
21445 elt_mode = aarch64_sve_container_int_mode (mode);
21446 *info = simd_immediate_info (elt_mode, base, step);
21447 }
43cacb12
RS
21448 return true;
21449 }
568b9c0e 21450 else if (CONST_VECTOR_P (op)
6a70badb
RS
21451 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21452 /* N_ELTS set above. */;
b187677b 21453 else
d8edd899 21454 return false;
43e9d192 21455
b187677b 21456 scalar_float_mode elt_float_mode;
f9093f23
RS
21457 if (n_elts == 1
21458 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
43e9d192 21459 {
f9093f23
RS
21460 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21461 if (aarch64_float_const_zero_rtx_p (elt)
21462 || aarch64_float_const_representable_p (elt))
21463 {
21464 if (info)
21465 *info = simd_immediate_info (elt_float_mode, elt);
21466 return true;
21467 }
b187677b 21468 }
43e9d192 21469
b23c6a2c
RS
21470 /* If all elements in an SVE vector have the same value, we have a free
21471 choice between using the element mode and using the container mode.
21472 Using the element mode means that unused parts of the vector are
21473 duplicates of the used elements, while using the container mode means
21474 that the unused parts are an extension of the used elements. Using the
21475 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21476 for its container mode VNx4SI while 0x00000101 isn't.
21477
21478 If not all elements in an SVE vector have the same value, we need the
21479 transition from one element to the next to occur at container boundaries.
21480 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21481 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21482 scalar_int_mode elt_int_mode;
21483 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21484 elt_int_mode = aarch64_sve_container_int_mode (mode);
21485 else
21486 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21487
21488 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
b187677b
RS
21489 if (elt_size > 8)
21490 return false;
e4f0f84d 21491
b187677b
RS
21492 /* Expand the vector constant out into a byte vector, with the least
21493 significant byte of the register first. */
21494 auto_vec<unsigned char, 16> bytes;
21495 bytes.reserve (n_elts * elt_size);
21496 for (unsigned int i = 0; i < n_elts; i++)
21497 {
f9093f23
RS
21498 /* The vector is provided in gcc endian-neutral fashion.
21499 For aarch64_be Advanced SIMD, it must be laid out in the vector
21500 register in reverse order. */
21501 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21502 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
43e9d192 21503
b187677b
RS
21504 if (elt_mode != elt_int_mode)
21505 elt = gen_lowpart (elt_int_mode, elt);
43e9d192 21506
b187677b
RS
21507 if (!CONST_INT_P (elt))
21508 return false;
43e9d192 21509
b187677b
RS
21510 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21511 for (unsigned int byte = 0; byte < elt_size; byte++)
48063b9d 21512 {
b187677b
RS
21513 bytes.quick_push (elt_val & 0xff);
21514 elt_val >>= BITS_PER_UNIT;
48063b9d 21515 }
43e9d192
IB
21516 }
21517
b187677b
RS
21518 /* The immediate must repeat every eight bytes. */
21519 unsigned int nbytes = bytes.length ();
21520 for (unsigned i = 8; i < nbytes; ++i)
21521 if (bytes[i] != bytes[i - 8])
21522 return false;
21523
21524 /* Get the repeating 8-byte value as an integer. No endian correction
21525 is needed here because bytes is already in lsb-first order. */
21526 unsigned HOST_WIDE_INT val64 = 0;
21527 for (unsigned int i = 0; i < 8; i++)
21528 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21529 << (i * BITS_PER_UNIT));
21530
43cacb12
RS
21531 if (vec_flags & VEC_SVE_DATA)
21532 return aarch64_sve_valid_immediate (val64, info);
21533 else
21534 return aarch64_advsimd_valid_immediate (val64, info, which);
21535}
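/* As a rough standalone illustration (not part of GCC), the sketch below
   mirrors the byte-serialisation step above: the constant's bytes are
   collected least-significant first, required to repeat every eight bytes,
   and folded into the VAL64 that the SVE and Advanced SIMD helpers then
   classify.  The helper name fold_to_val64 is made up for the sketch.  */

#include <stdint.h>
#include <stdio.h>

/* Serialise N_ELTS elements of ELT_SIZE bytes each, least significant
   byte first, check that the byte sequence repeats every 8 bytes and,
   if so, fold it into a single 64-bit value.  Returns 1 on success.  */
static int
fold_to_val64 (const uint64_t *elts, unsigned int n_elts,
	       unsigned int elt_size, uint64_t *val64)
{
  unsigned char bytes[128];
  unsigned int nbytes = 0;
  for (unsigned int i = 0; i < n_elts; i++)
    {
      uint64_t elt_val = elts[i];
      for (unsigned int byte = 0; byte < elt_size; byte++)
	{
	  bytes[nbytes++] = elt_val & 0xff;
	  elt_val >>= 8;
	}
    }
  /* The immediate must repeat every eight bytes.  */
  for (unsigned int i = 8; i < nbytes; ++i)
    if (bytes[i] != bytes[i - 8])
      return 0;
  *val64 = 0;
  for (unsigned int i = 0; i < 8; i++)
    *val64 |= (uint64_t) bytes[i % nbytes] << (i * 8);
  return 1;
}

int
main (void)
{
  /* A V4HI constant { 0x0102, 0x0102, 0x0102, 0x0102 } folds to
     0x0102010201020102.  */
  uint64_t elts[4] = { 0x0102, 0x0102, 0x0102, 0x0102 };
  uint64_t val64;
  if (fold_to_val64 (elts, 4, 2, &val64))
    printf ("0x%016llx\n", (unsigned long long) val64);
  return 0;
}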
21536
21537/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21538 has a step in the range of INDEX. Return the index expression if so,
21539 otherwise return null. */
21540rtx
21541aarch64_check_zero_based_sve_index_immediate (rtx x)
21542{
21543 rtx base, step;
21544 if (const_vec_series_p (x, &base, &step)
21545 && base == const0_rtx
21546 && aarch64_sve_index_immediate_p (step))
21547 return step;
21548 return NULL_RTX;
43e9d192
IB
21549}
21550
43e9d192
IB
 21551/* Check that immediate shift constants are within range. */
21552bool
ef4bddc2 21553aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
43e9d192 21554{
6bc67182
RS
21555 x = unwrap_const_vec_duplicate (x);
21556 if (!CONST_INT_P (x))
21557 return false;
43e9d192
IB
21558 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21559 if (left)
6bc67182 21560 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
43e9d192 21561 else
6bc67182 21562 return IN_RANGE (INTVAL (x), 1, bit_width);
43e9d192
IB
21563}
21564
7325d85a
KT
21565/* Return the bitmask CONST_INT to select the bits required by a zero extract
21566 operation of width WIDTH at bit position POS. */
21567
21568rtx
21569aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21570{
21571 gcc_assert (CONST_INT_P (width));
21572 gcc_assert (CONST_INT_P (pos));
21573
21574 unsigned HOST_WIDE_INT mask
21575 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21576 return GEN_INT (mask << UINTVAL (pos));
21577}
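/* For illustration: WIDTH == 8 and POS == 16 yield the mask 0xff0000,
   selecting bits 16..23.  */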
21578
83f8c414 21579bool
a6e0bfa7 21580aarch64_mov_operand_p (rtx x, machine_mode mode)
83f8c414 21581{
83f8c414
CSS
21582 if (GET_CODE (x) == HIGH
21583 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21584 return true;
21585
82614948 21586 if (CONST_INT_P (x))
83f8c414
CSS
21587 return true;
21588
43cacb12 21589 if (VECTOR_MODE_P (GET_MODE (x)))
678faefc
RS
21590 {
21591 /* Require predicate constants to be VNx16BI before RA, so that we
21592 force everything to have a canonical form. */
21593 if (!lra_in_progress
21594 && !reload_completed
21595 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21596 && GET_MODE (x) != VNx16BImode)
21597 return false;
21598
21599 return aarch64_simd_valid_immediate (x, NULL);
21600 }
43cacb12 21601
b33b2678
WD
21602 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21603 x = strip_salt (x);
21604
a195c727
WD
21605 /* GOT accesses are valid moves. */
21606 if (SYMBOL_REF_P (x)
21607 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21608 return true;
21609
3793ecc1 21610 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
83f8c414
CSS
21611 return true;
21612
c0e0174b 21613 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
43cacb12
RS
21614 return true;
21615
a6e0bfa7 21616 return aarch64_classify_symbolic_expression (x)
a5350ddc 21617 == SYMBOL_TINY_ABSOLUTE;
83f8c414
CSS
21618}
21619
9b8830b6
TC
21620/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21621 the constant creation. */
21622
21623rtx
21624aarch64_gen_shareable_zero (machine_mode mode)
21625{
21626 machine_mode zmode = V4SImode;
21627 rtx tmp = gen_reg_rtx (zmode);
21628 emit_move_insn (tmp, CONST0_RTX (zmode));
21629 return lowpart_subreg (mode, tmp, zmode);
21630}
21631
43e9d192
IB
21632/* Return a const_int vector of VAL. */
21633rtx
ab014eb3 21634aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
43e9d192 21635{
59d06c05
RS
21636 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21637 return gen_const_vec_duplicate (mode, c);
43e9d192
IB
21638}
21639
051d0e2f
SN
21640/* Check OP is a legal scalar immediate for the MOVI instruction. */
21641
21642bool
77e994c9 21643aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
051d0e2f 21644{
ef4bddc2 21645 machine_mode vmode;
051d0e2f 21646
43cacb12 21647 vmode = aarch64_simd_container_mode (mode, 64);
051d0e2f 21648 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
b187677b 21649 return aarch64_simd_valid_immediate (op_v, NULL);
051d0e2f
SN
21650}
21651
988fa693
JG
21652/* Construct and return a PARALLEL RTX vector with elements numbering the
21653 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21654 the vector - from the perspective of the architecture. This does not
21655 line up with GCC's perspective on lane numbers, so we end up with
21656 different masks depending on our target endian-ness. The diagram
21657 below may help. We must draw the distinction when building masks
21658 which select one half of the vector. An instruction selecting
 21659   architectural low-lanes for a big-endian target must be described using
21660 a mask selecting GCC high-lanes.
21661
21662 Big-Endian Little-Endian
21663
21664GCC 0 1 2 3 3 2 1 0
21665 | x | x | x | x | | x | x | x | x |
21666Architecture 3 2 1 0 3 2 1 0
21667
21668Low Mask: { 2, 3 } { 0, 1 }
21669High Mask: { 0, 1 } { 2, 3 }
f5cbabc1
RS
21670
21671 MODE Is the mode of the vector and NUNITS is the number of units in it. */
988fa693 21672
43e9d192 21673rtx
f5cbabc1 21674aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
43e9d192 21675{
43e9d192 21676 rtvec v = rtvec_alloc (nunits / 2);
988fa693
JG
21677 int high_base = nunits / 2;
21678 int low_base = 0;
21679 int base;
43e9d192
IB
21680 rtx t1;
21681 int i;
21682
988fa693
JG
21683 if (BYTES_BIG_ENDIAN)
21684 base = high ? low_base : high_base;
21685 else
21686 base = high ? high_base : low_base;
21687
21688 for (i = 0; i < nunits / 2; i++)
43e9d192
IB
21689 RTVEC_ELT (v, i) = GEN_INT (base + i);
21690
21691 t1 = gen_rtx_PARALLEL (mode, v);
21692 return t1;
21693}
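/* For illustration: for V4SImode (NUNITS == 4), HIGH == true yields the
   PARALLEL { 2, 3 } on little-endian and { 0, 1 } on big-endian, matching
   the diagram above.  */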
21694
988fa693
JG
21695/* Check OP for validity as a PARALLEL RTX vector with elements
21696 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21697 from the perspective of the architecture. See the diagram above
21698 aarch64_simd_vect_par_cnst_half for more details. */
21699
21700bool
ef4bddc2 21701aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
988fa693
JG
21702 bool high)
21703{
6a70badb
RS
21704 int nelts;
21705 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
f5cbabc1
RS
21706 return false;
21707
6a70badb 21708 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
988fa693
JG
21709 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21710 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21711 int i = 0;
21712
988fa693
JG
21713 if (count_op != count_ideal)
21714 return false;
21715
21716 for (i = 0; i < count_ideal; i++)
21717 {
21718 rtx elt_op = XVECEXP (op, 0, i);
21719 rtx elt_ideal = XVECEXP (ideal, 0, i);
21720
4aa81c2e 21721 if (!CONST_INT_P (elt_op)
988fa693
JG
21722 || INTVAL (elt_ideal) != INTVAL (elt_op))
21723 return false;
21724 }
21725 return true;
21726}
21727
4aeb1ba7
RS
21728/* Return a PARALLEL containing NELTS elements, with element I equal
21729 to BASE + I * STEP. */
21730
21731rtx
21732aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21733{
21734 rtvec vec = rtvec_alloc (nelts);
21735 for (unsigned int i = 0; i < nelts; ++i)
21736 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21737 return gen_rtx_PARALLEL (VOIDmode, vec);
21738}
21739
21740/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21741 series with step STEP. */
21742
21743bool
21744aarch64_stepped_int_parallel_p (rtx op, int step)
21745{
21746 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21747 return false;
21748
21749 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21750 for (int i = 1; i < XVECLEN (op, 0); ++i)
21751 if (!CONST_INT_P (XVECEXP (op, 0, i))
21752 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21753 return false;
21754
21755 return true;
21756}
21757
43e9d192
IB
21758/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21759 HIGH (exclusive). */
21760void
46ed6024
CB
21761aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21762 const_tree exp)
43e9d192
IB
21763{
21764 HOST_WIDE_INT lane;
4aa81c2e 21765 gcc_assert (CONST_INT_P (operand));
43e9d192
IB
21766 lane = INTVAL (operand);
21767
21768 if (lane < low || lane >= high)
46ed6024
CB
21769 {
21770 if (exp)
06357071
MS
21771 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21772 lane, low, high - 1);
46ed6024 21773 else
cf0c27ef 21774 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
46ed6024 21775 }
43e9d192
IB
21776}
21777
7ac29c0f
RS
 21778/* Perform endian correction on lane number N, which indexes a vector
21779 of mode MODE, and return the result as an SImode rtx. */
21780
21781rtx
21782aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21783{
21784 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21785}
21786
43e9d192 21787/* Return TRUE if OP is a valid vector addressing mode. */
43cacb12 21788
43e9d192
IB
21789bool
21790aarch64_simd_mem_operand_p (rtx op)
21791{
21792 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
4aa81c2e 21793 || REG_P (XEXP (op, 0)));
43e9d192
IB
21794}
21795
43cacb12
RS
21796/* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21797
21798bool
21799aarch64_sve_ld1r_operand_p (rtx op)
21800{
21801 struct aarch64_address_info addr;
21802 scalar_mode mode;
21803
21804 return (MEM_P (op)
21805 && is_a <scalar_mode> (GET_MODE (op), &mode)
21806 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21807 && addr.type == ADDRESS_REG_IMM
21808 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21809}
21810
9ceec73f
MM
21811/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21812 where the size of the read data is specified by `mode` and the size of the
21813 vector elements are specified by `elem_mode`. */
4aeb1ba7 21814bool
9ceec73f
MM
21815aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21816 scalar_mode elem_mode)
4aeb1ba7
RS
21817{
21818 struct aarch64_address_info addr;
4aeb1ba7
RS
21819 if (!MEM_P (op)
21820 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21821 return false;
21822
21823 if (addr.type == ADDRESS_REG_IMM)
9ceec73f 21824 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
4aeb1ba7
RS
21825
21826 if (addr.type == ADDRESS_REG_REG)
21827 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21828
21829 return false;
21830}
21831
9ceec73f
MM
21832/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21833bool
21834aarch64_sve_ld1rq_operand_p (rtx op)
21835{
21836 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21837 GET_MODE_INNER (GET_MODE (op)));
21838}
21839
21840/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21841 accessing a vector where the element size is specified by `elem_mode`. */
21842bool
21843aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21844{
21845 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21846}
21847
624d0f07
RS
21848/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21849bool
21850aarch64_sve_ldff1_operand_p (rtx op)
21851{
21852 if (!MEM_P (op))
21853 return false;
21854
21855 struct aarch64_address_info addr;
21856 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21857 return false;
21858
21859 if (addr.type == ADDRESS_REG_IMM)
21860 return known_eq (addr.const_offset, 0);
21861
21862 return addr.type == ADDRESS_REG_REG;
21863}
21864
21865/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21866bool
21867aarch64_sve_ldnf1_operand_p (rtx op)
21868{
21869 struct aarch64_address_info addr;
21870
21871 return (MEM_P (op)
21872 && aarch64_classify_address (&addr, XEXP (op, 0),
21873 GET_MODE (op), false)
21874 && addr.type == ADDRESS_REG_IMM);
21875}
21876
43cacb12
RS
21877/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21878 The conditions for STR are the same. */
21879bool
21880aarch64_sve_ldr_operand_p (rtx op)
21881{
21882 struct aarch64_address_info addr;
21883
21884 return (MEM_P (op)
21885 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21886 false, ADDR_QUERY_ANY)
21887 && addr.type == ADDRESS_REG_IMM);
21888}
21889
624d0f07
RS
21890/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21891 addressing memory of mode MODE. */
21892bool
21893aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21894{
21895 struct aarch64_address_info addr;
ba15b0fa 21896 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
624d0f07
RS
21897 return false;
21898
21899 if (addr.type == ADDRESS_REG_IMM)
ba15b0fa 21900 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
624d0f07
RS
21901
21902 return addr.type == ADDRESS_REG_REG;
21903}
21904
9f4cbab8
RS
21905/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21906 We need to be able to access the individual pieces, so the range
21907 is different from LD[234] and ST[234]. */
21908bool
21909aarch64_sve_struct_memory_operand_p (rtx op)
21910{
21911 if (!MEM_P (op))
21912 return false;
21913
21914 machine_mode mode = GET_MODE (op);
21915 struct aarch64_address_info addr;
21916 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21917 ADDR_QUERY_ANY)
21918 || addr.type != ADDRESS_REG_IMM)
21919 return false;
21920
21921 poly_int64 first = addr.const_offset;
21922 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21923 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21924 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21925}
21926
2d8c6dc1
AH
21927/* Emit a register copy from operand to operand, taking care not to
21928 early-clobber source registers in the process.
43e9d192 21929
2d8c6dc1
AH
21930 COUNT is the number of components into which the copy needs to be
21931 decomposed. */
43e9d192 21932void
b8506a8a 21933aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
2d8c6dc1 21934 unsigned int count)
43e9d192
IB
21935{
21936 unsigned int i;
2d8c6dc1
AH
21937 int rdest = REGNO (operands[0]);
21938 int rsrc = REGNO (operands[1]);
43e9d192
IB
21939
21940 if (!reg_overlap_mentioned_p (operands[0], operands[1])
2d8c6dc1
AH
21941 || rdest < rsrc)
21942 for (i = 0; i < count; i++)
21943 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21944 gen_rtx_REG (mode, rsrc + i));
43e9d192 21945 else
2d8c6dc1
AH
21946 for (i = 0; i < count; i++)
21947 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21948 gen_rtx_REG (mode, rsrc + count - i - 1));
43e9d192
IB
21949}
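/* For illustration: moving a two-register tuple from { v1, v2 } to
   { v2, v3 } overlaps with REGNO (dest) > REGNO (src), so the copies are
   emitted in reverse order (v3 <- v2, then v2 <- v1) to avoid clobbering
   v2 before it has been read.  */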
21950
668046d1 21951/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
6ec0e5b9 21952 one of VSTRUCT modes: OI, CI, or XI. */
668046d1 21953int
b8506a8a 21954aarch64_simd_attr_length_rglist (machine_mode mode)
668046d1 21955{
6a70badb
RS
21956 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21957 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
668046d1
DS
21958}
21959
db0253a4 21960/* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
43cacb12
RS
21961 alignment of a vector to 128 bits. SVE predicates have an alignment of
21962 16 bits. */
db0253a4
TB
21963static HOST_WIDE_INT
21964aarch64_simd_vector_alignment (const_tree type)
21965{
07108a9e
RS
21966 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21967 be set for non-predicate vectors of booleans. Modes are the most
21968 direct way we have of identifying real SVE predicate types. */
21969 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21970 return 16;
cc68f7c2
RS
21971 widest_int min_size
21972 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21973 return wi::umin (min_size, 128).to_uhwi ();
db0253a4
TB
21974}
21975
43cacb12 21976/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
ca31798e 21977static poly_uint64
43cacb12
RS
21978aarch64_vectorize_preferred_vector_alignment (const_tree type)
21979{
21980 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21981 {
1393938e
RS
21982 /* If the length of the vector is a fixed power of 2, try to align
21983 to that length, otherwise don't try to align at all. */
43cacb12 21984 HOST_WIDE_INT result;
1393938e
RS
21985 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21986 || !pow2p_hwi (result))
43cacb12
RS
21987 result = TYPE_ALIGN (TREE_TYPE (type));
21988 return result;
21989 }
21990 return TYPE_ALIGN (type);
21991}
21992
db0253a4
TB
21993/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21994static bool
21995aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21996{
21997 if (is_packed)
21998 return false;
21999
43cacb12
RS
22000 /* For fixed-length vectors, check that the vectorizer will aim for
22001 full-vector alignment. This isn't true for generic GCC vectors
22002 that are wider than the ABI maximum of 128 bits. */
ca31798e
AV
22003 poly_uint64 preferred_alignment =
22004 aarch64_vectorize_preferred_vector_alignment (type);
43cacb12 22005 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
ca31798e
AV
22006 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
22007 preferred_alignment))
db0253a4
TB
22008 return false;
22009
22010 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
22011 return true;
22012}
22013
7df76747
N
22014/* Return true if the vector misalignment factor is supported by the
22015 target. */
22016static bool
22017aarch64_builtin_support_vector_misalignment (machine_mode mode,
22018 const_tree type, int misalignment,
22019 bool is_packed)
22020{
22021 if (TARGET_SIMD && STRICT_ALIGNMENT)
22022 {
22023 /* Return if movmisalign pattern is not supported for this mode. */
22024 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
22025 return false;
22026
a509c571 22027 /* Misalignment factor is unknown at compile time. */
7df76747 22028 if (misalignment == -1)
a509c571 22029 return false;
7df76747
N
22030 }
22031 return default_builtin_support_vector_misalignment (mode, type, misalignment,
22032 is_packed);
22033}
22034
4369c11e
TB
22035/* If VALS is a vector constant that can be loaded into a register
22036 using DUP, generate instructions to do so and return an RTX to
22037 assign to the register. Otherwise return NULL_RTX. */
22038static rtx
22039aarch64_simd_dup_constant (rtx vals)
22040{
ef4bddc2
RS
22041 machine_mode mode = GET_MODE (vals);
22042 machine_mode inner_mode = GET_MODE_INNER (mode);
4369c11e 22043 rtx x;
4369c11e 22044
92695fbb 22045 if (!const_vec_duplicate_p (vals, &x))
4369c11e
TB
22046 return NULL_RTX;
22047
22048 /* We can load this constant by using DUP and a constant in a
22049 single ARM register. This will be cheaper than a vector
22050 load. */
92695fbb 22051 x = copy_to_mode_reg (inner_mode, x);
59d06c05 22052 return gen_vec_duplicate (mode, x);
4369c11e
TB
22053}
22054
22055
22056/* Generate code to load VALS, which is a PARALLEL containing only
22057 constants (for vec_init) or CONST_VECTOR, efficiently into a
22058 register. Returns an RTX to copy into the register, or NULL_RTX
67914693 22059 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
1df3f464 22060static rtx
4369c11e
TB
22061aarch64_simd_make_constant (rtx vals)
22062{
ef4bddc2 22063 machine_mode mode = GET_MODE (vals);
4369c11e
TB
22064 rtx const_dup;
22065 rtx const_vec = NULL_RTX;
4369c11e
TB
22066 int n_const = 0;
22067 int i;
22068
568b9c0e 22069 if (CONST_VECTOR_P (vals))
4369c11e
TB
22070 const_vec = vals;
22071 else if (GET_CODE (vals) == PARALLEL)
22072 {
22073 /* A CONST_VECTOR must contain only CONST_INTs and
22074 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
22075 Only store valid constants in a CONST_VECTOR. */
6a70badb 22076 int n_elts = XVECLEN (vals, 0);
4369c11e
TB
22077 for (i = 0; i < n_elts; ++i)
22078 {
22079 rtx x = XVECEXP (vals, 0, i);
22080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22081 n_const++;
22082 }
22083 if (n_const == n_elts)
22084 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22085 }
22086 else
22087 gcc_unreachable ();
22088
22089 if (const_vec != NULL_RTX
b187677b 22090 && aarch64_simd_valid_immediate (const_vec, NULL))
4369c11e
TB
22091 /* Load using MOVI/MVNI. */
22092 return const_vec;
22093 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22094 /* Loaded using DUP. */
22095 return const_dup;
22096 else if (const_vec != NULL_RTX)
67914693 22097 /* Load from constant pool. We cannot take advantage of single-cycle
4369c11e
TB
22098 LD1 because we need a PC-relative addressing mode. */
22099 return const_vec;
22100 else
22101 /* A PARALLEL containing something not valid inside CONST_VECTOR.
67914693 22102 We cannot construct an initializer. */
4369c11e
TB
22103 return NULL_RTX;
22104}
22105
35a093b6
JG
22106/* Expand a vector initialisation sequence, such that TARGET is
22107 initialised to contain VALS. */
22108
4369c11e
TB
22109void
22110aarch64_expand_vector_init (rtx target, rtx vals)
22111{
ef4bddc2 22112 machine_mode mode = GET_MODE (target);
146c2e3a 22113 scalar_mode inner_mode = GET_MODE_INNER (mode);
35a093b6 22114 /* The number of vector elements. */
6a70badb 22115 int n_elts = XVECLEN (vals, 0);
35a093b6 22116 /* The number of vector elements which are not constant. */
8b66a2d4
AL
22117 int n_var = 0;
22118 rtx any_const = NULL_RTX;
35a093b6
JG
22119 /* The first element of vals. */
22120 rtx v0 = XVECEXP (vals, 0, 0);
4369c11e 22121 bool all_same = true;
4369c11e 22122
41dab855
KT
22123 /* This is a special vec_init<M><N> where N is not an element mode but a
22124 vector mode with half the elements of M. We expect to find two entries
 22125     of mode N in VALS and we must put their concatenation into TARGET.  */
22126 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22127 {
4057266c
RS
22128 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22129 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22130 && known_eq (GET_MODE_SIZE (mode),
22131 2 * GET_MODE_SIZE (narrow_mode)));
22132 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22133 XVECEXP (vals, 0, 0),
22134 XVECEXP (vals, 0, 1)));
41dab855
KT
22135 return;
22136 }
22137
35a093b6 22138 /* Count the number of variable elements to initialise. */
8b66a2d4 22139 for (int i = 0; i < n_elts; ++i)
4369c11e 22140 {
8b66a2d4 22141 rtx x = XVECEXP (vals, 0, i);
35a093b6 22142 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
8b66a2d4
AL
22143 ++n_var;
22144 else
22145 any_const = x;
4369c11e 22146
35a093b6 22147 all_same &= rtx_equal_p (x, v0);
4369c11e
TB
22148 }
22149
35a093b6
JG
22150 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22151 how best to handle this. */
4369c11e
TB
22152 if (n_var == 0)
22153 {
22154 rtx constant = aarch64_simd_make_constant (vals);
22155 if (constant != NULL_RTX)
22156 {
22157 emit_move_insn (target, constant);
22158 return;
22159 }
22160 }
22161
22162 /* Splat a single non-constant element if we can. */
22163 if (all_same)
22164 {
35a093b6 22165 rtx x = copy_to_mode_reg (inner_mode, v0);
59d06c05 22166 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
4369c11e
TB
22167 return;
22168 }
22169
769370f3
PK
22170 /* Check for interleaving case.
22171 For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
22172 Generate following code:
22173 dup v0.h, x
22174 dup v1.h, y
22175 zip1 v0.h, v0.h, v1.h
22176 for "large enough" initializer. */
22177
22178 if (n_elts >= 8)
22179 {
22180 int i;
22181 for (i = 2; i < n_elts; i++)
22182 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22183 break;
22184
22185 if (i == n_elts)
22186 {
22187 machine_mode mode = GET_MODE (target);
22188 rtx dest[2];
22189
22190 for (int i = 0; i < 2; i++)
22191 {
22192 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22193 dest[i] = force_reg (mode, x);
22194 }
22195
22196 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22197 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22198 return;
22199 }
22200 }
22201
85c1b6d7
AP
22202 enum insn_code icode = optab_handler (vec_set_optab, mode);
22203 gcc_assert (icode != CODE_FOR_nothing);
22204
22205 /* If there are only variable elements, try to optimize
22206 the insertion using dup for the most common element
22207 followed by insertions. */
22208
22209 /* The algorithm will fill matches[*][0] with the earliest matching element,
22210 and matches[X][1] with the count of duplicate elements (if X is the
22211 earliest element which has duplicates). */
22212
22213 if (n_var == n_elts && n_elts <= 16)
22214 {
22215 int matches[16][2] = {0};
22216 for (int i = 0; i < n_elts; i++)
22217 {
22218 for (int j = 0; j <= i; j++)
22219 {
22220 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22221 {
22222 matches[i][0] = j;
22223 matches[j][1]++;
22224 break;
22225 }
22226 }
22227 }
22228 int maxelement = 0;
22229 int maxv = 0;
22230 for (int i = 0; i < n_elts; i++)
22231 if (matches[i][1] > maxv)
22232 {
22233 maxelement = i;
22234 maxv = matches[i][1];
22235 }
22236
b4e2cd5b
JG
22237 /* Create a duplicate of the most common element, unless all elements
22238 are equally useless to us, in which case just immediately set the
22239 vector register using the first element. */
22240
22241 if (maxv == 1)
22242 {
22243 /* For vectors of two 64-bit elements, we can do even better. */
22244 if (n_elts == 2
22245 && (inner_mode == E_DImode
22246 || inner_mode == E_DFmode))
22247
22248 {
22249 rtx x0 = XVECEXP (vals, 0, 0);
22250 rtx x1 = XVECEXP (vals, 0, 1);
22251 /* Combine can pick up this case, but handling it directly
22252 here leaves clearer RTL.
22253
22254 This is load_pair_lanes<mode>, and also gives us a clean-up
22255 for store_pair_lanes<mode>. */
22256 if (memory_operand (x0, inner_mode)
22257 && memory_operand (x1, inner_mode)
958448a9 22258 && aarch64_mergeable_load_pair_p (mode, x0, x1))
b4e2cd5b
JG
22259 {
22260 rtx t;
22261 if (inner_mode == DFmode)
22262 t = gen_load_pair_lanesdf (target, x0, x1);
22263 else
22264 t = gen_load_pair_lanesdi (target, x0, x1);
22265 emit_insn (t);
22266 return;
22267 }
22268 }
22269 /* The subreg-move sequence below will move into lane zero of the
22270 vector register. For big-endian we want that position to hold
22271 the last element of VALS. */
22272 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22273 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22274 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22275 }
22276 else
22277 {
22278 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22279 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22280 }
85c1b6d7
AP
22281
22282 /* Insert the rest. */
22283 for (int i = 0; i < n_elts; i++)
22284 {
22285 rtx x = XVECEXP (vals, 0, i);
22286 if (matches[i][0] == maxelement)
22287 continue;
22288 x = copy_to_mode_reg (inner_mode, x);
22289 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22290 }
22291 return;
22292 }
22293
35a093b6
JG
22294 /* Initialise a vector which is part-variable. We want to first try
22295 to build those lanes which are constant in the most efficient way we
22296 can. */
22297 if (n_var != n_elts)
4369c11e
TB
22298 {
22299 rtx copy = copy_rtx (vals);
4369c11e 22300
8b66a2d4
AL
22301 /* Load constant part of vector. We really don't care what goes into the
22302 parts we will overwrite, but we're more likely to be able to load the
22303 constant efficiently if it has fewer, larger, repeating parts
22304 (see aarch64_simd_valid_immediate). */
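	 /* For example, with VALS = {x, 1, x, 1} the substitution below
	    turns the copy into {1, 1, 1, 1}, a simple repeating constant;
	    the variable lanes are overwritten again afterwards anyway.  */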
22305 for (int i = 0; i < n_elts; i++)
22306 {
22307 rtx x = XVECEXP (vals, 0, i);
22308 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22309 continue;
22310 rtx subst = any_const;
22311 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22312 {
22313 /* Look in the copied vector, as more elements are const. */
22314 rtx test = XVECEXP (copy, 0, i ^ bit);
22315 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22316 {
22317 subst = test;
22318 break;
22319 }
22320 }
22321 XVECEXP (copy, 0, i) = subst;
22322 }
4369c11e 22323 aarch64_expand_vector_init (target, copy);
35a093b6 22324 }
4369c11e 22325
35a093b6 22326 /* Insert the variable lanes directly. */
8b66a2d4 22327 for (int i = 0; i < n_elts; i++)
35a093b6
JG
22328 {
22329 rtx x = XVECEXP (vals, 0, i);
22330 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22331 continue;
22332 x = copy_to_mode_reg (inner_mode, x);
22333 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22334 }
4369c11e
TB
22335}
22336
3a0afad0
PK
22337/* Emit RTL corresponding to:
22338 insr TARGET, ELEM. */
22339
22340static void
22341emit_insr (rtx target, rtx elem)
22342{
22343 machine_mode mode = GET_MODE (target);
22344 scalar_mode elem_mode = GET_MODE_INNER (mode);
22345 elem = force_reg (elem_mode, elem);
22346
22347 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22348 gcc_assert (icode != CODE_FOR_nothing);
22349 emit_insn (GEN_FCN (icode) (target, target, elem));
22350}
22351
22352/* Subroutine of aarch64_sve_expand_vector_init for handling
22353 trailing constants.
22354 This function works as follows:
22355 (a) Create a new vector consisting of trailing constants.
22356 (b) Initialize TARGET with the constant vector using emit_move_insn.
22357 (c) Insert remaining elements in TARGET using insr.
 22358 NELTS is the total number of elements in the original vector, while
 22359 NELTS_REQD is the number of elements that are actually
 22360 significant.
22361
 22362 ??? The heuristic used is to do the above only if the number of constants
 22363 is at least half the total number of elements. May need fine-tuning. */
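/* As a rough illustration, for BUILDER = {x, y, 1, 2} with
   NELTS == NELTS_REQD == 4 this moves the constant vector {1, 2, 0, 0}
   into TARGET and then emits an INSR of y followed by an INSR of x,
   leaving TARGET = {x, y, 1, 2}.  */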
22364
22365static bool
22366aarch64_sve_expand_vector_init_handle_trailing_constants
22367 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22368{
22369 machine_mode mode = GET_MODE (target);
22370 scalar_mode elem_mode = GET_MODE_INNER (mode);
22371 int n_trailing_constants = 0;
22372
22373 for (int i = nelts_reqd - 1;
5da301cb 22374 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
3a0afad0
PK
22375 i--)
22376 n_trailing_constants++;
22377
22378 if (n_trailing_constants >= nelts_reqd / 2)
22379 {
5da301cb
RS
22380 /* Try to use the natural pattern of BUILDER to extend the trailing
22381 constant elements to a full vector. Replace any variables in the
22382 extra elements with zeros.
22383
22384 ??? It would be better if the builders supported "don't care"
22385 elements, with the builder filling in whichever elements
22386 give the most compact encoding. */
22387 rtx_vector_builder v (mode, nelts, 1);
3a0afad0 22388 for (int i = 0; i < nelts; i++)
5da301cb
RS
22389 {
22390 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22391 if (!valid_for_const_vector_p (elem_mode, x))
41582f88 22392 x = CONST0_RTX (elem_mode);
5da301cb
RS
22393 v.quick_push (x);
22394 }
3a0afad0
PK
22395 rtx const_vec = v.build ();
22396 emit_move_insn (target, const_vec);
22397
22398 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22399 emit_insr (target, builder.elt (i));
22400
22401 return true;
22402 }
22403
22404 return false;
22405}
22406
22407/* Subroutine of aarch64_sve_expand_vector_init.
22408 Works as follows:
22409 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22410 (b) Skip trailing elements from BUILDER, which are the same as
22411 element NELTS_REQD - 1.
22412 (c) Insert earlier elements in reverse order in TARGET using insr. */
22413
22414static void
22415aarch64_sve_expand_vector_init_insert_elems (rtx target,
22416 const rtx_vector_builder &builder,
22417 int nelts_reqd)
22418{
22419 machine_mode mode = GET_MODE (target);
22420 scalar_mode elem_mode = GET_MODE_INNER (mode);
22421
22422 struct expand_operand ops[2];
22423 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22424 gcc_assert (icode != CODE_FOR_nothing);
22425
22426 create_output_operand (&ops[0], target, mode);
22427 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22428 expand_insn (icode, 2, ops);
22429
22430 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22431 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22432 emit_insr (target, builder.elt (i));
22433}
22434
 22435/* Subroutine of aarch64_sve_expand_vector_init to handle the case
 22436 when all trailing elements of BUILDER are the same.
 22437 This works as follows:
 22438 (a) Use the expand_insn interface to broadcast the last vector element into TARGET.
 22439 (b) Insert the remaining elements into TARGET using insr.
 22440
 22441 ??? The heuristic used is to do the above if the number of identical trailing
 22442 elements is at least 3/4 of the total number of elements, loosely based on
 22443 the heuristic from mostly_zeros_p. May need fine-tuning. */
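/* For example, for BUILDER = {a, b, c, c, c, c, c, c} (NELTS_REQD == 8)
   the trailing c is broadcast into TARGET and then b and a are inserted
   with INSR, giving {a, b, c, c, c, c, c, c}.  */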
22444
22445static bool
22446aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22448{
22449 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22450 if (ndups >= (3 * nelts_reqd) / 4)
22451 {
22452 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22453 nelts_reqd - ndups + 1);
22454 return true;
22455 }
22456
22457 return false;
22458}
22459
22460/* Initialize register TARGET from BUILDER. NELTS is the constant number
22461 of elements in BUILDER.
22462
22463 The function tries to initialize TARGET from BUILDER if it fits one
22464 of the special cases outlined below.
22465
22466 Failing that, the function divides BUILDER into two sub-vectors:
22467 v_even = even elements of BUILDER;
22468 v_odd = odd elements of BUILDER;
22469
22470 and recursively calls itself with v_even and v_odd.
22471
22472 if (recursive call succeeded for v_even or v_odd)
22473 TARGET = zip (v_even, v_odd)
22474
22475 The function returns true if it managed to build TARGET from BUILDER
22476 with one of the special cases, false otherwise.
22477
22478 Example: {a, 1, b, 2, c, 3, d, 4}
22479
22480 The vector gets divided into:
22481 v_even = {a, b, c, d}
22482 v_odd = {1, 2, 3, 4}
22483
22484 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
 22485 initializes tmp2 from the constant vector v_odd using emit_move_insn.
22486
22487 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22488 4 elements, so we construct tmp1 from v_even using insr:
22489 tmp1 = dup(d)
22490 insr tmp1, c
22491 insr tmp1, b
22492 insr tmp1, a
22493
22494 And finally:
22495 TARGET = zip (tmp1, tmp2)
22496 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22497
22498static bool
22499aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22500 int nelts, int nelts_reqd)
22501{
22502 machine_mode mode = GET_MODE (target);
22503
22504 /* Case 1: Vector contains trailing constants. */
22505
22506 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22507 (target, builder, nelts, nelts_reqd))
22508 return true;
22509
22510 /* Case 2: Vector contains leading constants. */
22511
5da301cb 22512 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
3a0afad0
PK
22513 for (int i = 0; i < nelts_reqd; i++)
22514 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22515 rev_builder.finalize ();
22516
22517 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22518 (target, rev_builder, nelts, nelts_reqd))
22519 {
22520 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22521 return true;
22522 }
22523
22524 /* Case 3: Vector contains trailing same element. */
22525
22526 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22527 (target, builder, nelts_reqd))
22528 return true;
22529
22530 /* Case 4: Vector contains leading same element. */
22531
22532 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22533 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22534 {
22535 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22536 return true;
22537 }
22538
22539 /* Avoid recursing below 4-elements.
22540 ??? The threshold 4 may need fine-tuning. */
22541
22542 if (nelts_reqd <= 4)
22543 return false;
22544
5da301cb
RS
22545 rtx_vector_builder v_even (mode, nelts, 1);
22546 rtx_vector_builder v_odd (mode, nelts, 1);
3a0afad0
PK
22547
22548 for (int i = 0; i < nelts * 2; i += 2)
22549 {
22550 v_even.quick_push (builder.elt (i));
22551 v_odd.quick_push (builder.elt (i + 1));
22552 }
22553
22554 v_even.finalize ();
22555 v_odd.finalize ();
22556
22557 rtx tmp1 = gen_reg_rtx (mode);
22558 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22559 nelts, nelts_reqd / 2);
22560
22561 rtx tmp2 = gen_reg_rtx (mode);
22562 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22563 nelts, nelts_reqd / 2);
22564
22565 if (!did_even_p && !did_odd_p)
22566 return false;
22567
22568 /* Initialize v_even and v_odd using INSR if it didn't match any of the
22569 special cases and zip v_even, v_odd. */
22570
22571 if (!did_even_p)
22572 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22573
22574 if (!did_odd_p)
22575 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22576
22577 rtvec v = gen_rtvec (2, tmp1, tmp2);
22578 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22579 return true;
22580}
22581
22582/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22583
22584void
22585aarch64_sve_expand_vector_init (rtx target, rtx vals)
22586{
22587 machine_mode mode = GET_MODE (target);
22588 int nelts = XVECLEN (vals, 0);
22589
5da301cb 22590 rtx_vector_builder v (mode, nelts, 1);
3a0afad0
PK
22591 for (int i = 0; i < nelts; i++)
22592 v.quick_push (XVECEXP (vals, 0, i));
22593 v.finalize ();
22594
 22595 /* If neither sub-vector of v could be initialized specially,
 22596 then use INSR to insert all elements from v into TARGET.
 22597 ??? This might not be optimal for vectors with large
 22598 initializers of 16 elements or more.
22599 For nelts < 4, it probably isn't useful to handle specially. */
22600
22601 if (nelts < 4
22602 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22603 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22604}
22605
b6c3aea1
RS
22606/* Check whether VALUE is a vector constant in which every element
22607 is either a power of 2 or a negated power of 2. If so, return
22608 a constant vector of log2s, and flip CODE between PLUS and MINUS
22609 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
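/* For example, with CODE == PLUS and VALUE == {-4, -4, -4, -4} this
   returns {2, 2, 2, 2} and flips CODE to MINUS, so that A + B * -4
   can be emitted as A - (B << 2).  */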
22610
22611static rtx
22612aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22613{
568b9c0e 22614 if (!CONST_VECTOR_P (value))
b6c3aea1
RS
22615 return NULL_RTX;
22616
22617 rtx_vector_builder builder;
22618 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22619 return NULL_RTX;
22620
22621 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22622 /* 1 if the result of the multiplication must be negated,
22623 0 if it mustn't, or -1 if we don't yet care. */
22624 int negate = -1;
22625 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22626 for (unsigned int i = 0; i < encoded_nelts; ++i)
22627 {
22628 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22629 if (!CONST_SCALAR_INT_P (elt))
22630 return NULL_RTX;
22631 rtx_mode_t val (elt, int_mode);
22632 wide_int pow2 = wi::neg (val);
22633 if (val != pow2)
22634 {
22635 /* It matters whether we negate or not. Make that choice,
22636 and make sure that it's consistent with previous elements. */
22637 if (negate == !wi::neg_p (val))
22638 return NULL_RTX;
22639 negate = wi::neg_p (val);
22640 if (!negate)
22641 pow2 = val;
22642 }
22643 /* POW2 is now the value that we want to be a power of 2. */
22644 int shift = wi::exact_log2 (pow2);
22645 if (shift < 0)
22646 return NULL_RTX;
22647 builder.quick_push (gen_int_mode (shift, int_mode));
22648 }
22649 if (negate == -1)
22650 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22651 code = PLUS;
22652 else if (negate == 1)
22653 code = code == PLUS ? MINUS : PLUS;
22654 return builder.build ();
22655}
22656
22657/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22658 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22659 operands array, in the same order as for fma_optab. Return true if
22660 the function emitted all the necessary instructions, false if the caller
22661 should generate the pattern normally with the new OPERANDS array. */
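/* For example, an integer multiply-add A = B * 4 + C is emitted here
   as a shift followed by an add, A = (B << 2) + C, instead of a
   multiplication followed by an add.  */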
22662
22663bool
22664aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22665{
22666 machine_mode mode = GET_MODE (operands[0]);
22667 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22668 {
22669 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22670 NULL_RTX, true, OPTAB_DIRECT);
22671 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22672 operands[3], product, operands[0], true,
22673 OPTAB_DIRECT);
22674 return true;
22675 }
22676 operands[2] = force_reg (mode, operands[2]);
22677 return false;
22678}
22679
22680/* Likewise, but for a conditional pattern. */
22681
22682bool
22683aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22684{
22685 machine_mode mode = GET_MODE (operands[0]);
22686 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22687 {
22688 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22689 NULL_RTX, true, OPTAB_DIRECT);
22690 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22691 operands[4], product, operands[5]));
22692 return true;
22693 }
22694 operands[3] = force_reg (mode, operands[3]);
22695 return false;
22696}
22697
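/* Implement TARGET_SHIFT_TRUNCATION_MASK: return the mask that shift
   counts in MODE are implicitly truncated to, or 0 if shift counts are
   not truncated (!SHIFT_COUNT_TRUNCATED or vector data modes).  */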
43e9d192 22698static unsigned HOST_WIDE_INT
ef4bddc2 22699aarch64_shift_truncation_mask (machine_mode mode)
43e9d192 22700{
43cacb12
RS
22701 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22702 return 0;
22703 return GET_MODE_UNIT_BITSIZE (mode) - 1;
43e9d192
IB
22704}
22705
43e9d192
IB
22706/* Select a format to encode pointers in exception handling data. */
22707int
22708aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22709{
22710 int type;
22711 switch (aarch64_cmodel)
22712 {
22713 case AARCH64_CMODEL_TINY:
22714 case AARCH64_CMODEL_TINY_PIC:
22715 case AARCH64_CMODEL_SMALL:
22716 case AARCH64_CMODEL_SMALL_PIC:
1b1e81f8 22717 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
22718 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22719 for everything. */
22720 type = DW_EH_PE_sdata4;
22721 break;
22722 default:
22723 /* No assumptions here. 8-byte relocs required. */
22724 type = DW_EH_PE_sdata8;
22725 break;
22726 }
22727 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22728}
22729
b07fc91c
SN
22730/* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22731
22732static void
22733aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22734{
c600df9a 22735 if (TREE_CODE (decl) == FUNCTION_DECL)
b07fc91c 22736 {
c600df9a
RS
22737 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22738 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22739 {
22740 fprintf (stream, "\t.variant_pcs\t");
22741 assemble_name (stream, name);
22742 fprintf (stream, "\n");
22743 }
b07fc91c
SN
22744 }
22745}
22746
e1c1ecb0
KT
22747/* The last .arch and .tune assembly strings that we printed. */
22748static std::string aarch64_last_printed_arch_string;
22749static std::string aarch64_last_printed_tune_string;
22750
361fb3ee
KT
22751/* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22752 by the function fndecl. */
22753
22754void
22755aarch64_declare_function_name (FILE *stream, const char* name,
22756 tree fndecl)
22757{
22758 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22759
22760 struct cl_target_option *targ_options;
22761 if (target_parts)
22762 targ_options = TREE_TARGET_OPTION (target_parts);
22763 else
22764 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22765 gcc_assert (targ_options);
22766
22767 const struct processor *this_arch
ae54c1b0 22768 = aarch64_get_arch (targ_options->x_selected_arch);
361fb3ee 22769
2a269bda 22770 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
054b4005 22771 std::string extension
04a99ebe
JG
22772 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22773 this_arch->flags);
e1c1ecb0
KT
22774 /* Only update the assembler .arch string if it is distinct from the last
22775 such string we printed. */
22776 std::string to_print = this_arch->name + extension;
22777 if (to_print != aarch64_last_printed_arch_string)
22778 {
22779 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22780 aarch64_last_printed_arch_string = to_print;
22781 }
361fb3ee
KT
22782
22783 /* Print the cpu name we're tuning for in the comments, might be
e1c1ecb0
KT
22784 useful to readers of the generated asm. Do it only when it changes
22785 from function to function and verbose assembly is requested. */
361fb3ee 22786 const struct processor *this_tune
ae54c1b0 22787 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
361fb3ee 22788
e1c1ecb0
KT
22789 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22790 {
22791 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22792 this_tune->name);
22793 aarch64_last_printed_tune_string = this_tune->name;
22794 }
361fb3ee 22795
b07fc91c
SN
22796 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22797
361fb3ee
KT
22798 /* Don't forget the type directive for ELF. */
22799 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22800 ASM_OUTPUT_LABEL (stream, name);
c292cfe5
SN
22801
22802 cfun->machine->label_is_assembled = true;
22803}
22804
09c91cae 22805/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
c292cfe5
SN
22806
22807void
22808aarch64_print_patchable_function_entry (FILE *file,
22809 unsigned HOST_WIDE_INT patch_area_size,
22810 bool record_p)
22811{
09c91cae 22812 if (!cfun->machine->label_is_assembled)
c292cfe5 22813 {
09c91cae
SP
22814 /* Emit the patching area before the entry label, if any. */
22815 default_print_patchable_function_entry (file, patch_area_size,
22816 record_p);
22817 return;
22818 }
22819
22820 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22821 GEN_INT (record_p));
22822 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22823
22824 if (!aarch64_bti_enabled ()
22825 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22826 {
22827 /* Emit the patchable_area at the beginning of the function. */
22828 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22829 INSN_ADDRESSES_NEW (insn, -1);
22830 return;
22831 }
22832
22833 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22834 if (!insn
22835 || !INSN_P (insn)
22836 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22837 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22838 {
22839 /* Emit a BTI_C. */
22840 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
c292cfe5
SN
22841 }
22842
09c91cae
SP
22843 /* Emit the patchable_area after BTI_C. */
22844 insn = emit_insn_after (pa, insn);
22845 INSN_ADDRESSES_NEW (insn, -1);
22846}
22847
22848/* Output patchable area. */
22849
22850void
22851aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22852{
22853 default_print_patchable_function_entry (asm_out_file, patch_area_size,
22854 record_p);
361fb3ee
KT
22855}
22856
b07fc91c
SN
22857/* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22858
22859void
22860aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22861{
22862 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22863 const char *value = IDENTIFIER_POINTER (target);
22864 aarch64_asm_output_variant_pcs (stream, decl, name);
22865 ASM_OUTPUT_DEF (stream, name, value);
22866}
22867
22868/* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22869 function symbol references. */
22870
22871void
e8c47069 22872aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
b07fc91c 22873{
e8c47069 22874 default_elf_asm_output_external (stream, decl, name);
b07fc91c
SN
22875 aarch64_asm_output_variant_pcs (stream, decl, name);
22876}
22877
8fc16d72
ST
22878/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22879 Used to output the .cfi_b_key_frame directive when signing the current
22880 function with the B key. */
22881
22882void
22883aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22884{
2bdc7dcb 22885 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
8fc16d72
ST
22886 && aarch64_ra_sign_key == AARCH64_KEY_B)
22887 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22888}
22889
e1c1ecb0
KT
22890/* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22891
22892static void
22893aarch64_start_file (void)
22894{
22895 struct cl_target_option *default_options
22896 = TREE_TARGET_OPTION (target_option_default_node);
22897
22898 const struct processor *default_arch
ae54c1b0 22899 = aarch64_get_arch (default_options->x_selected_arch);
2a269bda 22900 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
e1c1ecb0 22901 std::string extension
04a99ebe
JG
22902 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22903 default_arch->flags);
e1c1ecb0
KT
22904
22905 aarch64_last_printed_arch_string = default_arch->name + extension;
22906 aarch64_last_printed_tune_string = "";
22907 asm_fprintf (asm_out_file, "\t.arch %s\n",
22908 aarch64_last_printed_arch_string.c_str ());
22909
22910 default_file_start ();
22911}
22912
0462169c
SN
22913/* Emit load exclusive. */
22914
22915static void
ef4bddc2 22916aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
0462169c
SN
22917 rtx mem, rtx model_rtx)
22918{
4a2095eb
RH
22919 if (mode == TImode)
22920 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22921 gen_highpart (DImode, rval),
22922 mem, model_rtx));
22923 else
22924 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
0462169c
SN
22925}
22926
22927/* Emit store exclusive. */
22928
22929static void
ef4bddc2 22930aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
4a2095eb 22931 rtx mem, rtx rval, rtx model_rtx)
0462169c 22932{
4a2095eb
RH
22933 if (mode == TImode)
22934 emit_insn (gen_aarch64_store_exclusive_pair
22935 (bval, mem, operand_subword (rval, 0, 0, TImode),
22936 operand_subword (rval, 1, 0, TImode), model_rtx));
22937 else
22938 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
0462169c
SN
22939}
22940
 22941/* Emit jump pattern INSN and mark the jump as unlikely to be taken. */
22942
22943static void
22944aarch64_emit_unlikely_jump (rtx insn)
22945{
f370536c 22946 rtx_insn *jump = emit_jump_insn (insn);
5fa396ad 22947 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
0462169c
SN
22948}
22949
bc25483c 22950/* We store the names of the various atomic helpers in a 5x5 array.
3950b229
RH
22951 Return the libcall function given MODE, MODEL and NAMES. */
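/* For example, with the CAS names defined below, SImode and
   MEMMODEL_ACQUIRE map to "__aarch64_cas4_acq".  */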
22952
22953rtx
22954aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22955 const atomic_ool_names *names)
22956{
bc25483c 22957 memmodel model = memmodel_from_int (INTVAL (model_rtx));
3950b229
RH
22958 int mode_idx, model_idx;
22959
22960 switch (mode)
22961 {
22962 case E_QImode:
22963 mode_idx = 0;
22964 break;
22965 case E_HImode:
22966 mode_idx = 1;
22967 break;
22968 case E_SImode:
22969 mode_idx = 2;
22970 break;
22971 case E_DImode:
22972 mode_idx = 3;
22973 break;
22974 case E_TImode:
22975 mode_idx = 4;
22976 break;
22977 default:
22978 gcc_unreachable ();
22979 }
22980
22981 switch (model)
22982 {
22983 case MEMMODEL_RELAXED:
22984 model_idx = 0;
22985 break;
22986 case MEMMODEL_CONSUME:
22987 case MEMMODEL_ACQUIRE:
22988 model_idx = 1;
22989 break;
22990 case MEMMODEL_RELEASE:
22991 model_idx = 2;
22992 break;
22993 case MEMMODEL_ACQ_REL:
22994 case MEMMODEL_SEQ_CST:
22995 model_idx = 3;
22996 break;
bc25483c
SP
22997 case MEMMODEL_SYNC_ACQUIRE:
22998 case MEMMODEL_SYNC_RELEASE:
22999 case MEMMODEL_SYNC_SEQ_CST:
23000 model_idx = 4;
23001 break;
3950b229
RH
23002 default:
23003 gcc_unreachable ();
23004 }
23005
23006 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
23007 VISIBILITY_HIDDEN);
23008}
23009
23010#define DEF0(B, N) \
23011 { "__aarch64_" #B #N "_relax", \
23012 "__aarch64_" #B #N "_acq", \
23013 "__aarch64_" #B #N "_rel", \
bc25483c
SP
23014 "__aarch64_" #B #N "_acq_rel", \
23015 "__aarch64_" #B #N "_sync" }
3950b229
RH
23016
23017#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
23018 { NULL, NULL, NULL, NULL }
23019#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
23020
23021static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
23022const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
23023const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
23024const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
23025const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
23026const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
23027
23028#undef DEF0
23029#undef DEF4
23030#undef DEF5
23031
0462169c
SN
23032/* Expand a compare and swap pattern. */
23033
23034void
23035aarch64_expand_compare_and_swap (rtx operands[])
23036{
d400fda3
RH
23037 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
23038 machine_mode mode, r_mode;
0462169c
SN
23039
23040 bval = operands[0];
23041 rval = operands[1];
23042 mem = operands[2];
23043 oldval = operands[3];
23044 newval = operands[4];
23045 is_weak = operands[5];
23046 mod_s = operands[6];
23047 mod_f = operands[7];
23048 mode = GET_MODE (mem);
0462169c
SN
23049
23050 /* Normally the succ memory model must be stronger than fail, but in the
23051 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
23052 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
46b35980
AM
23053 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
23054 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
0462169c
SN
23055 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
23056
d400fda3
RH
23057 r_mode = mode;
23058 if (mode == QImode || mode == HImode)
0462169c 23059 {
d400fda3
RH
23060 r_mode = SImode;
23061 rval = gen_reg_rtx (r_mode);
0462169c
SN
23062 }
23063
b0770c0f 23064 if (TARGET_LSE)
77f33f44
RH
23065 {
23066 /* The CAS insn requires oldval and rval overlap, but we need to
23067 have a copy of oldval saved across the operation to tell if
23068 the operation is successful. */
d400fda3
RH
23069 if (reg_overlap_mentioned_p (rval, oldval))
23070 rval = copy_to_mode_reg (r_mode, oldval);
77f33f44 23071 else
d400fda3
RH
23072 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
23073
77f33f44
RH
23074 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
23075 newval, mod_s));
d400fda3 23076 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
77f33f44 23077 }
3950b229
RH
23078 else if (TARGET_OUTLINE_ATOMICS)
23079 {
23080 /* Oldval must satisfy compare afterward. */
23081 if (!aarch64_plus_operand (oldval, mode))
23082 oldval = force_reg (mode, oldval);
23083 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23084 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23085 oldval, mode, newval, mode,
23086 XEXP (mem, 0), Pmode);
23087 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23088 }
b0770c0f 23089 else
d400fda3
RH
23090 {
23091 /* The oldval predicate varies by mode. Test it and force to reg. */
23092 insn_code code = code_for_aarch64_compare_and_swap (mode);
23093 if (!insn_data[code].operand[2].predicate (oldval, mode))
23094 oldval = force_reg (mode, oldval);
0462169c 23095
d400fda3
RH
23096 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23097 is_weak, mod_s, mod_f));
23098 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23099 }
23100
23101 if (r_mode != mode)
77f33f44
RH
23102 rval = gen_lowpart (mode, rval);
23103 emit_move_insn (operands[1], rval);
0462169c 23104
d400fda3 23105 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
f7df4a84 23106 emit_insn (gen_rtx_SET (bval, x));
0462169c
SN
23107}
23108
f70fb3b6
MW
 23109/* Emit a barrier appropriate for memory model MODEL at the end of a
 23110 sequence implementing an atomic operation. */
23111
23112static void
23113aarch64_emit_post_barrier (enum memmodel model)
23114{
23115 const enum memmodel base_model = memmodel_base (model);
23116
23117 if (is_mm_sync (model)
23118 && (base_model == MEMMODEL_ACQUIRE
23119 || base_model == MEMMODEL_ACQ_REL
23120 || base_model == MEMMODEL_SEQ_CST))
23121 {
23122 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23123 }
23124}
23125
0462169c
SN
23126/* Split a compare and swap pattern. */
23127
23128void
23129aarch64_split_compare_and_swap (rtx operands[])
23130{
e5e07b68
WD
23131 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23132 gcc_assert (epilogue_completed);
23133
b7e560de 23134 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
ef4bddc2 23135 machine_mode mode;
0462169c 23136 bool is_weak;
5d8a22a5 23137 rtx_code_label *label1, *label2;
ab876106 23138 enum memmodel model;
0462169c
SN
23139
23140 rval = operands[0];
23141 mem = operands[1];
23142 oldval = operands[2];
23143 newval = operands[3];
23144 is_weak = (operands[4] != const0_rtx);
ab876106 23145 model_rtx = operands[5];
0462169c
SN
23146 scratch = operands[7];
23147 mode = GET_MODE (mem);
ab876106 23148 model = memmodel_from_int (INTVAL (model_rtx));
0462169c 23149
17f47f86
KT
23150 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23151 loop:
23152 .label1:
23153 LD[A]XR rval, [mem]
23154 CBNZ rval, .label2
23155 ST[L]XR scratch, newval, [mem]
23156 CBNZ scratch, .label1
23157 .label2:
23158 CMP rval, 0. */
b7e560de
RH
23159 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23160 oldval == const0_rtx && mode != TImode);
17f47f86 23161
5d8a22a5 23162 label1 = NULL;
0462169c
SN
23163 if (!is_weak)
23164 {
23165 label1 = gen_label_rtx ();
23166 emit_label (label1);
23167 }
23168 label2 = gen_label_rtx ();
23169
ab876106
MW
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_mm_sync (model))
b7e560de 23173 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
ab876106
MW
23174 else
23175 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
0462169c 23176
17f47f86 23177 if (strong_zero_p)
b7e560de 23178 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17f47f86
KT
23179 else
23180 {
b7e560de
RH
23181 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23182 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17f47f86 23183 }
b7e560de
RH
23184 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23185 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23186 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c 23187
ab876106 23188 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
0462169c
SN
23189
23190 if (!is_weak)
23191 {
6e1eaca9
RE
23192 if (aarch64_track_speculation)
23193 {
23194 /* Emit an explicit compare instruction, so that we can correctly
23195 track the condition codes. */
23196 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23197 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23198 }
23199 else
23200 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23201
0462169c
SN
23202 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23203 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
f7df4a84 23204 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c
SN
23205 }
23206 else
b7e560de 23207 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
0462169c
SN
23208
23209 emit_label (label2);
b7e560de 23210
17f47f86
KT
23211 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
23212 to set the condition flags. If this is not used it will be removed by
23213 later passes. */
23214 if (strong_zero_p)
b7e560de
RH
23215 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23216
ab876106
MW
23217 /* Emit any final barrier needed for a __sync operation. */
23218 if (is_mm_sync (model))
23219 aarch64_emit_post_barrier (model);
0462169c 23220}
9cd7b720 23221
0462169c
SN
23222/* Split an atomic operation. */
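/* The emitted sequence is, roughly:
     .label:
	ld[a]xr	old_out, [mem]
	<op>	new_out, old_out, value
	st[l]xr	cond, new_out, [mem]
	cbnz	cond, .label
   followed by a final barrier for __sync memory models.  */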
23223
23224void
23225aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9cd7b720 23226 rtx value, rtx model_rtx, rtx cond)
0462169c 23227{
e5e07b68
WD
23228 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23229 gcc_assert (epilogue_completed);
23230
ef4bddc2
RS
23231 machine_mode mode = GET_MODE (mem);
23232 machine_mode wmode = (mode == DImode ? DImode : SImode);
f70fb3b6
MW
23233 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23234 const bool is_sync = is_mm_sync (model);
5d8a22a5
DM
23235 rtx_code_label *label;
23236 rtx x;
0462169c 23237
9cd7b720 23238 /* Split the atomic operation into a sequence. */
0462169c
SN
23239 label = gen_label_rtx ();
23240 emit_label (label);
23241
23242 if (new_out)
23243 new_out = gen_lowpart (wmode, new_out);
23244 if (old_out)
23245 old_out = gen_lowpart (wmode, old_out);
23246 else
23247 old_out = new_out;
23248 value = simplify_gen_subreg (wmode, value, mode, 0);
23249
f70fb3b6
MW
23250 /* The initial load can be relaxed for a __sync operation since a final
23251 barrier will be emitted to stop code hoisting. */
23252 if (is_sync)
23253 aarch64_emit_load_exclusive (mode, old_out, mem,
23254 GEN_INT (MEMMODEL_RELAXED));
23255 else
23256 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
0462169c
SN
23257
23258 switch (code)
23259 {
23260 case SET:
23261 new_out = value;
23262 break;
23263
23264 case NOT:
23265 x = gen_rtx_AND (wmode, old_out, value);
f7df4a84 23266 emit_insn (gen_rtx_SET (new_out, x));
0462169c 23267 x = gen_rtx_NOT (wmode, new_out);
f7df4a84 23268 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
23269 break;
23270
23271 case MINUS:
23272 if (CONST_INT_P (value))
23273 {
618ae596 23274 value = GEN_INT (-UINTVAL (value));
0462169c
SN
23275 code = PLUS;
23276 }
23277 /* Fall through. */
23278
23279 default:
23280 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
f7df4a84 23281 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
23282 break;
23283 }
23284
23285 aarch64_emit_store_exclusive (mode, cond, mem,
23286 gen_lowpart (mode, new_out), model_rtx);
23287
6e1eaca9
RE
23288 if (aarch64_track_speculation)
23289 {
23290 /* Emit an explicit compare instruction, so that we can correctly
23291 track the condition codes. */
23292 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23293 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23294 }
23295 else
23296 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23297
0462169c
SN
23298 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23299 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
f7df4a84 23300 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
f70fb3b6
MW
23301
23302 /* Emit any final barrier needed for a __sync operation. */
23303 if (is_sync)
23304 aarch64_emit_post_barrier (model);
0462169c
SN
23305}
23306
c2ec330c
AL
23307static void
23308aarch64_init_libfuncs (void)
23309{
23310 /* Half-precision float operations. The compiler handles all operations
23311 with NULL libfuncs by converting to SFmode. */
23312
23313 /* Conversions. */
23314 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23315 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23316
23317 /* Arithmetic. */
23318 set_optab_libfunc (add_optab, HFmode, NULL);
23319 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23320 set_optab_libfunc (smul_optab, HFmode, NULL);
23321 set_optab_libfunc (neg_optab, HFmode, NULL);
23322 set_optab_libfunc (sub_optab, HFmode, NULL);
23323
23324 /* Comparisons. */
23325 set_optab_libfunc (eq_optab, HFmode, NULL);
23326 set_optab_libfunc (ne_optab, HFmode, NULL);
23327 set_optab_libfunc (lt_optab, HFmode, NULL);
23328 set_optab_libfunc (le_optab, HFmode, NULL);
23329 set_optab_libfunc (ge_optab, HFmode, NULL);
23330 set_optab_libfunc (gt_optab, HFmode, NULL);
23331 set_optab_libfunc (unord_optab, HFmode, NULL);
23332}
23333
43e9d192 23334/* Target hook for c_mode_for_suffix. */
ef4bddc2 23335static machine_mode
43e9d192
IB
23336aarch64_c_mode_for_suffix (char suffix)
23337{
23338 if (suffix == 'q')
23339 return TFmode;
23340
23341 return VOIDmode;
23342}
23343
3520f7cc
JG
23344/* We can only represent floating point constants which will fit in
23345 "quarter-precision" values. These values are characterised by
 23346 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
 23347 by:
23348
23349 (-1)^s * (n/16) * 2^r
23350
23351 Where:
23352 's' is the sign bit.
23353 'n' is an integer in the range 16 <= n <= 31.
23354 'r' is an integer in the range -3 <= r <= 4. */
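/* For example, 1.5 = (24/16) * 2^0 and -0.25 = -(16/16) * 2^-2 are
   representable, whereas 0.0 and 1.0/3.0 are not.  */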
23355
23356/* Return true iff X can be represented by a quarter-precision
 23357 floating point immediate operand. Note, we cannot represent 0.0. */
23358bool
23359aarch64_float_const_representable_p (rtx x)
23360{
23361 /* This represents our current view of how many bits
23362 make up the mantissa. */
23363 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
ba96cdfb 23364 int exponent;
3520f7cc 23365 unsigned HOST_WIDE_INT mantissa, mask;
3520f7cc 23366 REAL_VALUE_TYPE r, m;
807e902e 23367 bool fail;
3520f7cc 23368
d29f7dd5 23369 x = unwrap_const_vec_duplicate (x);
3520f7cc
JG
23370 if (!CONST_DOUBLE_P (x))
23371 return false;
23372
a4518821
RS
23373 if (GET_MODE (x) == VOIDmode
23374 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
94bfa2da
TV
23375 return false;
23376
34a72c33 23377 r = *CONST_DOUBLE_REAL_VALUE (x);
3520f7cc
JG
23378
23379 /* We cannot represent infinities, NaNs or +/-zero. We won't
23380 know if we have +zero until we analyse the mantissa, but we
23381 can reject the other invalid values. */
23382 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23383 || REAL_VALUE_MINUS_ZERO (r))
23384 return false;
23385
ba96cdfb 23386 /* Extract exponent. */
3520f7cc
JG
23387 r = real_value_abs (&r);
23388 exponent = REAL_EXP (&r);
23389
23390 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23391 highest (sign) bit, with a fixed binary point at bit point_pos.
23392 m1 holds the low part of the mantissa, m2 the high part.
23393 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23394 bits for the mantissa, this can fail (low bits will be lost). */
23395 real_ldexp (&m, &r, point_pos - exponent);
807e902e 23396 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
3520f7cc
JG
23397
23398 /* If the low part of the mantissa has bits set we cannot represent
23399 the value. */
d9074b29 23400 if (w.ulow () != 0)
3520f7cc
JG
23401 return false;
23402 /* We have rejected the lower HOST_WIDE_INT, so update our
23403 understanding of how many bits lie in the mantissa and
23404 look only at the high HOST_WIDE_INT. */
807e902e 23405 mantissa = w.elt (1);
3520f7cc
JG
23406 point_pos -= HOST_BITS_PER_WIDE_INT;
23407
23408 /* We can only represent values with a mantissa of the form 1.xxxx. */
23409 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23410 if ((mantissa & mask) != 0)
23411 return false;
23412
23413 /* Having filtered unrepresentable values, we may now remove all
23414 but the highest 5 bits. */
23415 mantissa >>= point_pos - 5;
23416
23417 /* We cannot represent the value 0.0, so reject it. This is handled
23418 elsewhere. */
23419 if (mantissa == 0)
23420 return false;
23421
23422 /* Then, as bit 4 is always set, we can mask it off, leaving
23423 the mantissa in the range [0, 15]. */
23424 mantissa &= ~(1 << 4);
23425 gcc_assert (mantissa <= 15);
23426
23427 /* GCC internally does not use IEEE754-like encoding (where normalized
e53b6e56 23428 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.cc).
3520f7cc
JG
23429 Our mantissa values are shifted 4 places to the left relative to
23430 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23431 by 5 places to correct for GCC's representation. */
23432 exponent = 5 - exponent;
23433
23434 return (exponent >= 0 && exponent <= 7);
23435}
23436
ab6501d7
SD
23437/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23438 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23439 output MOVI/MVNI, ORR or BIC immediate. */
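/* For example, a V4SI vector in which every element is 0x12, checked
   with WHICH == AARCH64_CHECK_MOV, produces something like
   "movi\t%0.4s, 0x12".  */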
3520f7cc 23440char*
b187677b 23441aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
ab6501d7 23442 enum simd_immediate_check which)
3520f7cc 23443{
3ea63f60 23444 bool is_valid;
3520f7cc 23445 static char templ[40];
3520f7cc 23446 const char *mnemonic;
e4f0f84d 23447 const char *shift_op;
3520f7cc 23448 unsigned int lane_count = 0;
81c2dfb9 23449 char element_char;
3520f7cc 23450
b187677b 23451 struct simd_immediate_info info;
48063b9d
IB
23452
23453 /* This will return true to show const_vector is legal for use as either
ab6501d7
SD
 23454 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23455 It will also update INFO to show how the immediate should be generated.
23456 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
b187677b 23457 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
3520f7cc
JG
23458 gcc_assert (is_valid);
23459
b187677b
RS
23460 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23461 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
48063b9d 23462
b187677b 23463 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
3520f7cc 23464 {
1da83cce
RS
23465 gcc_assert (info.insn == simd_immediate_info::MOV
23466 && info.u.mov.shift == 0);
0d8e1702
KT
23467 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23468 move immediate path. */
1da83cce
RS
23469 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23470 info.u.mov.value = GEN_INT (0);
48063b9d
IB
23471 else
23472 {
83faf7d0 23473 const unsigned int buf_size = 20;
48063b9d 23474 char float_buf[buf_size] = {'\0'};
34a72c33 23475 real_to_decimal_for_mode (float_buf,
1da83cce 23476 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
b187677b 23477 buf_size, buf_size, 1, info.elt_mode);
48063b9d
IB
23478
23479 if (lane_count == 1)
23480 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23481 else
23482 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
81c2dfb9 23483 lane_count, element_char, float_buf);
48063b9d
IB
23484 return templ;
23485 }
3520f7cc 23486 }
3520f7cc 23487
1da83cce 23488 gcc_assert (CONST_INT_P (info.u.mov.value));
ab6501d7
SD
23489
23490 if (which == AARCH64_CHECK_MOV)
23491 {
b187677b 23492 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
1da83cce
RS
23493 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23494 ? "msl" : "lsl");
ab6501d7
SD
23495 if (lane_count == 1)
23496 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
1da83cce
RS
23497 mnemonic, UINTVAL (info.u.mov.value));
23498 else if (info.u.mov.shift)
ab6501d7
SD
23499 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23500 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
1da83cce
RS
23501 element_char, UINTVAL (info.u.mov.value), shift_op,
23502 info.u.mov.shift);
ab6501d7
SD
23503 else
23504 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23505 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
1da83cce 23506 element_char, UINTVAL (info.u.mov.value));
ab6501d7 23507 }
3520f7cc 23508 else
ab6501d7
SD
23509 {
23510 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
b187677b 23511 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
1da83cce 23512 if (info.u.mov.shift)
ab6501d7
SD
23513 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23514 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
1da83cce
RS
23515 element_char, UINTVAL (info.u.mov.value), "lsl",
23516 info.u.mov.shift);
ab6501d7
SD
23517 else
23518 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23519 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
1da83cce 23520 element_char, UINTVAL (info.u.mov.value));
ab6501d7 23521 }
3520f7cc
JG
23522 return templ;
23523}
23524
b7342d25 23525char*
77e994c9 23526aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
b7342d25 23527{
a2170965
TC
23528
23529 /* If a floating point number was passed and we desire to use it in an
23530 integer mode do the conversion to integer. */
23531 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23532 {
23533 unsigned HOST_WIDE_INT ival;
23534 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23535 gcc_unreachable ();
23536 immediate = gen_int_mode (ival, mode);
23537 }
23538
ef4bddc2 23539 machine_mode vmode;
0dc8e1e7 23540 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
a2170965
TC
 23541 a 128-bit vector mode. */
23542 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
b7342d25 23543
a2170965 23544 vmode = aarch64_simd_container_mode (mode, width);
b7342d25 23545 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
b187677b 23546 return aarch64_output_simd_mov_immediate (v_op, width);
b7342d25
IB
23547}
23548
43cacb12
RS
23549/* Return the output string to use for moving immediate CONST_VECTOR
23550 into an SVE register. */
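/* For example, a VNx4SI constant with every element equal to 1 gives
   "mov\t%0.s, #1", while an INDEX constant {0, 1, 2, ...} gives
   "index\t%0.s, #0, #1".  */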
23551
23552char *
23553aarch64_output_sve_mov_immediate (rtx const_vector)
23554{
23555 static char templ[40];
23556 struct simd_immediate_info info;
23557 char element_char;
23558
23559 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23560 gcc_assert (is_valid);
23561
23562 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23563
1044fa32
RS
23564 machine_mode vec_mode = GET_MODE (const_vector);
23565 if (aarch64_sve_pred_mode_p (vec_mode))
23566 {
23567 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
0b1fe8cf
RS
23568 if (info.insn == simd_immediate_info::MOV)
23569 {
23570 gcc_assert (info.u.mov.value == const0_rtx);
23571 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23572 }
1044fa32 23573 else
0b1fe8cf
RS
23574 {
23575 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23576 unsigned int total_bytes;
23577 if (info.u.pattern == AARCH64_SV_ALL
23578 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23579 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23580 total_bytes / GET_MODE_SIZE (info.elt_mode));
23581 else
23582 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23583 svpattern_token (info.u.pattern));
23584 }
1044fa32
RS
23585 return buf;
23586 }
23587
1da83cce 23588 if (info.insn == simd_immediate_info::INDEX)
43cacb12
RS
23589 {
23590 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23591 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
1da83cce
RS
23592 element_char, INTVAL (info.u.index.base),
23593 INTVAL (info.u.index.step));
43cacb12
RS
23594 return templ;
23595 }
23596
23597 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23598 {
1da83cce
RS
23599 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23600 info.u.mov.value = GEN_INT (0);
43cacb12
RS
23601 else
23602 {
23603 const int buf_size = 20;
23604 char float_buf[buf_size] = {};
23605 real_to_decimal_for_mode (float_buf,
1da83cce 23606 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
43cacb12
RS
23607 buf_size, buf_size, 1, info.elt_mode);
23608
23609 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23610 element_char, float_buf);
23611 return templ;
23612 }
23613 }
23614
23615 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
1da83cce 23616 element_char, INTVAL (info.u.mov.value));
43cacb12
RS
23617 return templ;
23618}
23619
624d0f07
RS
23620/* Return the asm template for a PTRUES. CONST_UNSPEC is the
23621 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23622 pattern. */
23623
23624char *
23625aarch64_output_sve_ptrues (rtx const_unspec)
23626{
23627 static char templ[40];
23628
23629 struct simd_immediate_info info;
23630 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23631 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23632
23633 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23634 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23635 svpattern_token (info.u.pattern));
23636 return templ;
23637}
23638
88b08073
JG
23639/* Split operands into moves from op[1] + op[2] into op[0]. */
23640
23641void
23642aarch64_split_combinev16qi (rtx operands[3])
23643{
23644 unsigned int dest = REGNO (operands[0]);
23645 unsigned int src1 = REGNO (operands[1]);
23646 unsigned int src2 = REGNO (operands[2]);
ef4bddc2 23647 machine_mode halfmode = GET_MODE (operands[1]);
462a99aa 23648 unsigned int halfregs = REG_NREGS (operands[1]);
88b08073
JG
23649 rtx destlo, desthi;
23650
23651 gcc_assert (halfmode == V16QImode);
23652
23653 if (src1 == dest && src2 == dest + halfregs)
23654 {
23655 /* No-op move. Can't split to nothing; emit something. */
23656 emit_note (NOTE_INSN_DELETED);
23657 return;
23658 }
23659
23660 /* Preserve register attributes for variable tracking. */
23661 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23662 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23663 GET_MODE_SIZE (halfmode));
23664
23665 /* Special case of reversed high/low parts. */
23666 if (reg_overlap_mentioned_p (operands[2], destlo)
23667 && reg_overlap_mentioned_p (operands[1], desthi))
23668 {
23669 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23670 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23671 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23672 }
23673 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23674 {
23675 /* Try to avoid unnecessary moves if part of the result
23676 is in the right place already. */
23677 if (src1 != dest)
23678 emit_move_insn (destlo, operands[1]);
23679 if (src2 != dest + halfregs)
23680 emit_move_insn (desthi, operands[2]);
23681 }
23682 else
23683 {
23684 if (src2 != dest + halfregs)
23685 emit_move_insn (desthi, operands[2]);
23686 if (src1 != dest)
23687 emit_move_insn (destlo, operands[1]);
23688 }
23689}
23690
23691/* vec_perm support. */
23692
88b08073
JG
23693struct expand_vec_perm_d
23694{
23695 rtx target, op0, op1;
e3342de4 23696 vec_perm_indices perm;
ef4bddc2 23697 machine_mode vmode;
494bec02 23698 machine_mode op_mode;
43cacb12 23699 unsigned int vec_flags;
494bec02 23700 unsigned int op_vec_flags;
88b08073
JG
23701 bool one_vector_p;
23702 bool testing_p;
23703};
23704
7efc03fd
DP
23705static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23706
88b08073
JG
23707/* Generate a variable permutation. */
23708
23709static void
23710aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23711{
ef4bddc2 23712 machine_mode vmode = GET_MODE (target);
88b08073
JG
23713 bool one_vector_p = rtx_equal_p (op0, op1);
23714
23715 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23716 gcc_checking_assert (GET_MODE (op0) == vmode);
23717 gcc_checking_assert (GET_MODE (op1) == vmode);
23718 gcc_checking_assert (GET_MODE (sel) == vmode);
23719 gcc_checking_assert (TARGET_SIMD);
23720
23721 if (one_vector_p)
23722 {
23723 if (vmode == V8QImode)
23724 {
23725 /* Expand the argument to a V16QI mode by duplicating it. */
23726 rtx pair = gen_reg_rtx (V16QImode);
23727 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
b7e450c9 23728 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
88b08073
JG
23729 }
23730 else
23731 {
b7e450c9 23732 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
88b08073
JG
23733 }
23734 }
23735 else
23736 {
23737 rtx pair;
23738
23739 if (vmode == V8QImode)
23740 {
23741 pair = gen_reg_rtx (V16QImode);
23742 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
b7e450c9 23743 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
88b08073
JG
23744 }
23745 else
23746 {
66f206b8 23747 pair = gen_reg_rtx (V2x16QImode);
88b08073 23748 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
b7e450c9 23749 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
88b08073
JG
23750 }
23751 }
23752}
23753
80940017
RS
23754/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23755 NELT is the number of elements in the vector. */
23756
88b08073 23757void
80940017
RS
23758aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23759 unsigned int nelt)
88b08073 23760{
ef4bddc2 23761 machine_mode vmode = GET_MODE (target);
88b08073 23762 bool one_vector_p = rtx_equal_p (op0, op1);
f7c4e5b8 23763 rtx mask;
88b08073
JG
23764
23765 /* The TBL instruction does not use a modulo index, so we must take care
23766 of that ourselves. */
f7c4e5b8
AL
23767 mask = aarch64_simd_gen_const_vector_dup (vmode,
23768 one_vector_p ? nelt - 1 : 2 * nelt - 1);
88b08073
JG
23769 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
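  /* For example, with V16QImode and two distinct input vectors the mask is
     31, so an out-of-range index such as 40 is reduced to 8 before it
     reaches TBL, matching the modulo semantics of vec_perm.  */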
23770
f7c4e5b8
AL
23771 /* For big-endian, we also need to reverse the index within the vector
23772 (but not which vector). */
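  /* For example, with 16 elements per vector, index 3 becomes 12, and
     index 19 (element 3 of the second vector) becomes 28 (element 12 of
     the second vector).  */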
23773 if (BYTES_BIG_ENDIAN)
23774 {
23775 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23776 if (!one_vector_p)
23777 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23778 sel = expand_simple_binop (vmode, XOR, sel, mask,
23779 NULL, 0, OPTAB_LIB_WIDEN);
23780 }
88b08073
JG
23781 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23782}
23783
43cacb12
RS
23784/* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23785
23786static void
23787emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23788{
23789 emit_insn (gen_rtx_SET (target,
23790 gen_rtx_UNSPEC (GET_MODE (target),
23791 gen_rtvec (2, op0, op1), code)));
23792}
23793
23794/* Expand an SVE vec_perm with the given operands. */
23795
23796void
23797aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23798{
23799 machine_mode data_mode = GET_MODE (target);
23800 machine_mode sel_mode = GET_MODE (sel);
23801 /* Enforced by the pattern condition. */
23802 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23803
23804 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23805 size of the two value vectors, i.e. the upper bits of the indices
23806 are effectively ignored. SVE TBL instead produces 0 for any
23807 out-of-range indices, so we need to modulo all the vec_perm indices
23808 to ensure they are all in range. */
23809 rtx sel_reg = force_reg (sel_mode, sel);
23810
23811 /* Check if the sel only references the first values vector. */
568b9c0e 23812 if (CONST_VECTOR_P (sel)
43cacb12
RS
23813 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23814 {
23815 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23816 return;
23817 }
23818
23819 /* Check if the two values vectors are the same. */
23820 if (rtx_equal_p (op0, op1))
23821 {
23822 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23823 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23824 NULL, 0, OPTAB_DIRECT);
23825 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23826 return;
23827 }
23828
 23829 /* Run TBL for each value vector and combine the results. */
23830
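  /* TBL yields zero for any out-of-range index, so the first TBL produces
     the elements selected from OP0 (and zero elsewhere), while rebasing the
     indices by -NUNITS makes the second TBL produce the elements selected
     from OP1; ORing the two results gives the final permutation.  */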
23831 rtx res0 = gen_reg_rtx (data_mode);
23832 rtx res1 = gen_reg_rtx (data_mode);
23833 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
568b9c0e 23834 if (!CONST_VECTOR_P (sel)
43cacb12
RS
23835 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23836 {
23837 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23838 2 * nunits - 1);
23839 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23840 NULL, 0, OPTAB_DIRECT);
23841 }
23842 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23843 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23844 NULL, 0, OPTAB_DIRECT);
23845 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23846 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23847 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23848 else
23849 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23850}
23851
cc4d934f
JG
23852/* Recognize patterns suitable for the TRN instructions. */
23853static bool
23854aarch64_evpc_trn (struct expand_vec_perm_d *d)
23855{
6a70badb
RS
23856 HOST_WIDE_INT odd;
23857 poly_uint64 nelt = d->perm.length ();
06039e71 23858 rtx out, in0, in1;
ef4bddc2 23859 machine_mode vmode = d->vmode;
cc4d934f
JG
23860
23861 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23862 return false;
23863
23864 /* Note that these are little-endian tests.
23865 We correct for big-endian later. */
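  /* For example, for V4SImode inputs {a0,a1,a2,a3} and {b0,b1,b2,b3},
     TRN1 corresponds to the permutation {0, 4, 2, 6} = {a0, b0, a2, b2}
     and TRN2 to {1, 5, 3, 7} = {a1, b1, a3, b3}.  */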
6a70badb
RS
23866 if (!d->perm[0].is_constant (&odd)
23867 || (odd != 0 && odd != 1)
326ac20e
RS
23868 || !d->perm.series_p (0, 2, odd, 2)
23869 || !d->perm.series_p (1, 2, nelt + odd, 2))
cc4d934f 23870 return false;
cc4d934f
JG
23871
23872 /* Success! */
23873 if (d->testing_p)
23874 return true;
23875
23876 in0 = d->op0;
23877 in1 = d->op1;
43cacb12
RS
23878 /* We don't need a big-endian lane correction for SVE; see the comment
23879 at the head of aarch64-sve.md for details. */
23880 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f 23881 {
06039e71 23882 std::swap (in0, in1);
cc4d934f
JG
23883 odd = !odd;
23884 }
23885 out = d->target;
23886
3f8334a5
RS
23887 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23888 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
cc4d934f
JG
23889 return true;
23890}
23891
7efc03fd
DP
23892/* Try to re-encode the PERM constant so it combines odd and even elements.
23893 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23894 We retry with this new constant with the full suite of patterns. */
23895static bool
23896aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23897{
23898 expand_vec_perm_d newd;
23899 unsigned HOST_WIDE_INT nelt;
23900
23901 if (d->vec_flags != VEC_ADVSIMD)
23902 return false;
23903
23904 /* Get the new mode. Always twice the size of the inner
23905 and half the elements. */
23906 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23907 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23908 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23909 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23910
23911 if (new_mode == word_mode)
23912 return false;
23913
23914 /* to_constant is safe since this routine is specific to Advanced SIMD
23915 vectors. */
23916 nelt = d->perm.length ().to_constant ();
23917
23918 vec_perm_builder newpermconst;
23919 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23920
23921 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23922 for (unsigned int i = 0; i < nelt; i += 2)
23923 {
23924 poly_int64 elt0 = d->perm[i];
23925 poly_int64 elt1 = d->perm[i + 1];
23926 poly_int64 newelt;
23927 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23928 return false;
23929 newpermconst.quick_push (newelt.to_constant ());
23930 }
23931 newpermconst.finalize ();
23932
23933 newd.vmode = new_mode;
23934 newd.vec_flags = VEC_ADVSIMD;
494bec02
PK
23935 newd.op_mode = newd.vmode;
23936 newd.op_vec_flags = newd.vec_flags;
7efc03fd
DP
23937 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23938 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23939 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23940 newd.testing_p = d->testing_p;
23941 newd.one_vector_p = d->one_vector_p;
23942
23943 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23944 return aarch64_expand_vec_perm_const_1 (&newd);
23945}
23946
cc4d934f
JG
23947/* Recognize patterns suitable for the UZP instructions. */
23948static bool
23949aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23950{
6a70badb 23951 HOST_WIDE_INT odd;
06039e71 23952 rtx out, in0, in1;
ef4bddc2 23953 machine_mode vmode = d->vmode;
cc4d934f
JG
23954
23955 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23956 return false;
23957
23958 /* Note that these are little-endian tests.
23959 We correct for big-endian later. */
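  /* For example, for V4SImode inputs {a0,a1,a2,a3} and {b0,b1,b2,b3},
     UZP1 corresponds to the permutation {0, 2, 4, 6} = {a0, a2, b0, b2}
     and UZP2 to {1, 3, 5, 7} = {a1, a3, b1, b3}.  */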
6a70badb
RS
23960 if (!d->perm[0].is_constant (&odd)
23961 || (odd != 0 && odd != 1)
326ac20e 23962 || !d->perm.series_p (0, 1, odd, 2))
cc4d934f 23963 return false;
cc4d934f
JG
23964
23965 /* Success! */
23966 if (d->testing_p)
23967 return true;
23968
23969 in0 = d->op0;
23970 in1 = d->op1;
43cacb12
RS
23971 /* We don't need a big-endian lane correction for SVE; see the comment
23972 at the head of aarch64-sve.md for details. */
23973 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f 23974 {
06039e71 23975 std::swap (in0, in1);
cc4d934f
JG
23976 odd = !odd;
23977 }
23978 out = d->target;
23979
3f8334a5
RS
23980 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23981 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
cc4d934f
JG
23982 return true;
23983}
23984
23985/* Recognize patterns suitable for the ZIP instructions. */
23986static bool
23987aarch64_evpc_zip (struct expand_vec_perm_d *d)
23988{
6a70badb
RS
23989 unsigned int high;
23990 poly_uint64 nelt = d->perm.length ();
06039e71 23991 rtx out, in0, in1;
ef4bddc2 23992 machine_mode vmode = d->vmode;
cc4d934f
JG
23993
23994 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23995 return false;
23996
23997 /* Note that these are little-endian tests.
23998 We correct for big-endian later. */
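  /* For example, for V4SImode inputs {a0,a1,a2,a3} and {b0,b1,b2,b3},
     ZIP1 corresponds to the permutation {0, 4, 1, 5} = {a0, b0, a1, b1}
     and ZIP2 to {2, 6, 3, 7} = {a2, b2, a3, b3}.  */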
6a70badb
RS
23999 poly_uint64 first = d->perm[0];
24000 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
24001 || !d->perm.series_p (0, 2, first, 1)
24002 || !d->perm.series_p (1, 2, first + nelt, 1))
cc4d934f 24003 return false;
6a70badb 24004 high = maybe_ne (first, 0U);
cc4d934f
JG
24005
24006 /* Success! */
24007 if (d->testing_p)
24008 return true;
24009
24010 in0 = d->op0;
24011 in1 = d->op1;
43cacb12
RS
24012 /* We don't need a big-endian lane correction for SVE; see the comment
24013 at the head of aarch64-sve.md for details. */
24014 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f 24015 {
06039e71 24016 std::swap (in0, in1);
cc4d934f
JG
24017 high = !high;
24018 }
24019 out = d->target;
24020
3f8334a5
RS
24021 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
24022 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
cc4d934f
JG
24023 return true;
24024}
24025
ae0533da
AL
24026/* Recognize patterns for the EXT insn. */
24027
24028static bool
24029aarch64_evpc_ext (struct expand_vec_perm_d *d)
24030{
6a70badb 24031 HOST_WIDE_INT location;
ae0533da
AL
24032 rtx offset;
24033
6a70badb
RS
24034 /* The first element always refers to the first vector.
24035 Check if the extracted indices are increasing by one. */
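  /* For example, a V4SImode permutation of {1, 2, 3, 4} matches EXT with
     an offset of one element: the top three elements of the first vector
     followed by the bottom element of the second.  */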
43cacb12
RS
24036 if (d->vec_flags == VEC_SVE_PRED
24037 || !d->perm[0].is_constant (&location)
6a70badb 24038 || !d->perm.series_p (0, 1, location, 1))
326ac20e 24039 return false;
ae0533da 24040
ae0533da
AL
24041 /* Success! */
24042 if (d->testing_p)
24043 return true;
24044
b31e65bb 24045 /* The case where (location == 0) is a no-op for both big- and little-endian,
43cacb12 24046 and is removed by the mid-end at optimization levels -O1 and higher.
b31e65bb 24047
43cacb12
RS
24048 We don't need a big-endian lane correction for SVE; see the comment
24049 at the head of aarch64-sve.md for details. */
24050 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
ae0533da
AL
24051 {
24052 /* After setup, we want the high elements of the first vector (stored
24053 at the LSB end of the register), and the low elements of the second
24054 vector (stored at the MSB end of the register). So swap. */
cb5c6c29 24055 std::swap (d->op0, d->op1);
6a70badb
RS
24056 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
24057 to_constant () is safe since this is restricted to Advanced SIMD
24058 vectors. */
24059 location = d->perm.length ().to_constant () - location;
ae0533da
AL
24060 }
24061
24062 offset = GEN_INT (location);
3f8334a5
RS
24063 emit_set_insn (d->target,
24064 gen_rtx_UNSPEC (d->vmode,
24065 gen_rtvec (3, d->op0, d->op1, offset),
24066 UNSPEC_EXT));
ae0533da
AL
24067 return true;
24068}
24069
43cacb12
RS
24070/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
24071 within each 64-bit, 32-bit or 16-bit granule. */
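/* For example, a V8HImode permutation of {1, 0, 3, 2, 5, 4, 7, 6} swaps
   adjacent 16-bit elements within each 32-bit granule and so maps to
   REV32.  */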
923fcec3
AL
24072
24073static bool
43cacb12 24074aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
923fcec3 24075{
6a70badb
RS
24076 HOST_WIDE_INT diff;
24077 unsigned int i, size, unspec;
43cacb12 24078 machine_mode pred_mode;
923fcec3 24079
43cacb12
RS
24080 if (d->vec_flags == VEC_SVE_PRED
24081 || !d->one_vector_p
98452668
AC
24082 || !d->perm[0].is_constant (&diff)
24083 || !diff)
923fcec3
AL
24084 return false;
24085
6c3ce63b
RS
24086 if (d->vec_flags & VEC_SVE_DATA)
24087 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24088 else
24089 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24090 if (size == 64)
43cacb12
RS
24091 {
24092 unspec = UNSPEC_REV64;
24093 pred_mode = VNx2BImode;
24094 }
6c3ce63b 24095 else if (size == 32)
43cacb12
RS
24096 {
24097 unspec = UNSPEC_REV32;
24098 pred_mode = VNx4BImode;
24099 }
6c3ce63b 24100 else if (size == 16)
43cacb12
RS
24101 {
24102 unspec = UNSPEC_REV16;
24103 pred_mode = VNx8BImode;
24104 }
3f8334a5
RS
24105 else
24106 return false;
923fcec3 24107
326ac20e
RS
24108 unsigned int step = diff + 1;
24109 for (i = 0; i < step; ++i)
24110 if (!d->perm.series_p (i, step, diff - i, step))
24111 return false;
923fcec3
AL
24112
24113 /* Success! */
24114 if (d->testing_p)
24115 return true;
24116
6c3ce63b
RS
24117 if (d->vec_flags & VEC_SVE_DATA)
24118 {
24119 rtx pred = aarch64_ptrue_reg (pred_mode);
24120 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24121 d->target, pred, d->op0));
d7a09c44 24122 return true;
43cacb12 24123 }
d7a09c44 24124 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
43cacb12
RS
24125 emit_set_insn (d->target, src);
24126 return true;
24127}
24128
24129/* Recognize patterns for the REV insn, which reverses elements within
24130 a full vector. */
24131
24132static bool
24133aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24134{
24135 poly_uint64 nelt = d->perm.length ();
24136
28350fd1 24137 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
43cacb12
RS
24138 return false;
24139
24140 if (!d->perm.series_p (0, 1, nelt - 1, -1))
24141 return false;
24142
24143 /* Success! */
24144 if (d->testing_p)
24145 return true;
24146
24147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24148 emit_set_insn (d->target, src);
923fcec3
AL
24149 return true;
24150}
24151
91bd4114
JG
24152static bool
24153aarch64_evpc_dup (struct expand_vec_perm_d *d)
24154{
91bd4114
JG
24155 rtx out = d->target;
24156 rtx in0;
6a70badb 24157 HOST_WIDE_INT elt;
ef4bddc2 24158 machine_mode vmode = d->vmode;
91bd4114
JG
24159 rtx lane;
24160
43cacb12
RS
24161 if (d->vec_flags == VEC_SVE_PRED
24162 || d->perm.encoding ().encoded_nelts () != 1
6a70badb 24163 || !d->perm[0].is_constant (&elt))
326ac20e
RS
24164 return false;
24165
6c3ce63b
RS
24166 if ((d->vec_flags & VEC_SVE_DATA)
24167 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
43cacb12
RS
24168 return false;
24169
326ac20e
RS
24170 /* Success! */
24171 if (d->testing_p)
24172 return true;
24173
91bd4114
JG
24174 /* The generic preparation in aarch64_expand_vec_perm_const_1
24175 swaps the operand order and the permute indices if it finds
24176 d->perm[0] to be in the second operand. Thus, we can always
24177 use d->op0 and need not do any extra arithmetic to get the
24178 correct lane number. */
24179 in0 = d->op0;
f901401e 24180 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
91bd4114 24181
3f8334a5
RS
24182 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24183 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24184 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
91bd4114
JG
24185 return true;
24186}
24187
88b08073
JG
24188static bool
24189aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24190{
43cacb12 24191 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
ef4bddc2 24192 machine_mode vmode = d->vmode;
6a70badb
RS
24193
24194 /* Make sure that the indices are constant. */
24195 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24196 for (unsigned int i = 0; i < encoded_nelts; ++i)
24197 if (!d->perm[i].is_constant ())
24198 return false;
88b08073 24199
88b08073
JG
24200 if (d->testing_p)
24201 return true;
24202
24203 /* Generic code will try constant permutation twice. Once with the
24204 original mode and again with the elements lowered to QImode.
24205 So wait and don't do the selector expansion ourselves. */
24206 if (vmode != V8QImode && vmode != V16QImode)
24207 return false;
24208
6a70badb
RS
24209 /* to_constant is safe since this routine is specific to Advanced SIMD
24210 vectors. */
24211 unsigned int nelt = d->perm.length ().to_constant ();
24212 for (unsigned int i = 0; i < nelt; ++i)
24213 /* If big-endian and two vectors we end up with a weird mixed-endian
24214 mode on NEON. Reverse the index within each word but not the word
24215 itself. to_constant is safe because we checked is_constant above. */
24216 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24217 ? d->perm[i].to_constant () ^ (nelt - 1)
24218 : d->perm[i].to_constant ());
bbcc9c00 24219
88b08073
JG
24220 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24221 sel = force_reg (vmode, sel);
24222
24223 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24224 return true;
24225}
24226
43cacb12
RS
24227/* Try to implement D using an SVE TBL instruction. */
24228
24229static bool
24230aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24231{
24232 unsigned HOST_WIDE_INT nelt;
24233
24234 /* Permuting two variable-length vectors could overflow the
24235 index range. */
24236 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24237 return false;
24238
24239 if (d->testing_p)
24240 return true;
24241
d083ee47 24242 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
43cacb12 24243 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
e25c95ef
RS
24244 if (d->one_vector_p)
24245 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24246 else
24247 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
43cacb12
RS
24248 return true;
24249}
24250
494bec02
PK
24251/* Try to implement D using SVE dup instruction. */
24252
24253static bool
24254aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24255{
24256 if (BYTES_BIG_ENDIAN
24257 || !d->one_vector_p
24258 || d->vec_flags != VEC_SVE_DATA
24259 || d->op_vec_flags != VEC_ADVSIMD
24260 || d->perm.encoding ().nelts_per_pattern () != 1
24261 || !known_eq (d->perm.encoding ().npatterns (),
24262 GET_MODE_NUNITS (d->op_mode))
24263 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24264 return false;
24265
24266 int npatterns = d->perm.encoding ().npatterns ();
24267 for (int i = 0; i < npatterns; i++)
24268 if (!known_eq (d->perm[i], i))
24269 return false;
24270
24271 if (d->testing_p)
24272 return true;
24273
24274 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24275 return true;
24276}
24277
9556ef20
PK
24278/* Try to implement D using SVE SEL instruction. */
24279
24280static bool
24281aarch64_evpc_sel (struct expand_vec_perm_d *d)
24282{
24283 machine_mode vmode = d->vmode;
24284 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24285
24286 if (d->vec_flags != VEC_SVE_DATA
24287 || unit_size > 8)
24288 return false;
24289
24290 int n_patterns = d->perm.encoding ().npatterns ();
24291 poly_int64 vec_len = d->perm.length ();
24292
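  /* Accept only permutations in which each element I selects either lane I
     of OP0 (index I) or lane I of OP1 (index VEC_LEN + I); such a blend
     maps directly onto a predicated SEL.  */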
24293 for (int i = 0; i < n_patterns; ++i)
24294 if (!known_eq (d->perm[i], i)
24295 && !known_eq (d->perm[i], vec_len + i))
24296 return false;
24297
24298 for (int i = n_patterns; i < n_patterns * 2; i++)
24299 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24300 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24301 return false;
24302
24303 if (d->testing_p)
24304 return true;
24305
cc68f7c2 24306 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
9556ef20 24307
b2f5b380 24308 /* Build a predicate that is true when op0 elements should be used. */
9556ef20
PK
24309 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24310 for (int i = 0; i < n_patterns * 2; i++)
24311 {
24312 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24313 : CONST0_RTX (BImode);
24314 builder.quick_push (elem);
24315 }
24316
24317 rtx const_vec = builder.build ();
24318 rtx pred = force_reg (pred_mode, const_vec);
b2f5b380
RS
24319 /* TARGET = PRED ? OP0 : OP1. */
24320 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
9556ef20
PK
24321 return true;
24322}
24323
c9c87e6f
DP
24324/* Recognize patterns suitable for the INS instructions. */
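/* For example, a V4SImode permutation of {0, 1, 6, 3} copies lane 2 of the
   second vector into lane 2 of the first and leaves the other lanes
   untouched, which is a single INS.  */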
24325static bool
24326aarch64_evpc_ins (struct expand_vec_perm_d *d)
24327{
24328 machine_mode mode = d->vmode;
24329 unsigned HOST_WIDE_INT nelt;
24330
24331 if (d->vec_flags != VEC_ADVSIMD)
24332 return false;
24333
24334 /* to_constant is safe since this routine is specific to Advanced SIMD
24335 vectors. */
24336 nelt = d->perm.length ().to_constant ();
24337 rtx insv = d->op0;
24338
24339 HOST_WIDE_INT idx = -1;
24340
24341 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24342 {
24343 HOST_WIDE_INT elt;
24344 if (!d->perm[i].is_constant (&elt))
24345 return false;
24346 if (elt == (HOST_WIDE_INT) i)
24347 continue;
24348 if (idx != -1)
24349 {
24350 idx = -1;
24351 break;
24352 }
24353 idx = i;
24354 }
24355
24356 if (idx == -1)
24357 {
24358 insv = d->op1;
24359 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24360 {
24361 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24362 continue;
24363 if (idx != -1)
24364 return false;
24365 idx = i;
24366 }
24367
24368 if (idx == -1)
24369 return false;
24370 }
24371
24372 if (d->testing_p)
24373 return true;
24374
24375 gcc_assert (idx != -1);
24376
24377 unsigned extractindex = d->perm[idx].to_constant ();
24378 rtx extractv = d->op0;
24379 if (extractindex >= nelt)
24380 {
24381 extractv = d->op1;
24382 extractindex -= nelt;
24383 }
24384 gcc_assert (extractindex < nelt);
24385
c9c87e6f
DP
24386 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24387 expand_operand ops[5];
24388 create_output_operand (&ops[0], d->target, mode);
52fa7717 24389 create_input_operand (&ops[1], insv, mode);
c9c87e6f
DP
24390 create_integer_operand (&ops[2], 1 << idx);
24391 create_input_operand (&ops[3], extractv, mode);
24392 create_integer_operand (&ops[4], extractindex);
24393 expand_insn (icode, 5, ops);
24394
24395 return true;
24396}
24397
88b08073
JG
24398static bool
24399aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24400{
494bec02
PK
24401 gcc_assert (d->op_mode != E_VOIDmode);
24402
88b08073
JG
24403 /* The pattern matching functions above are written to look for a small
24404 number to begin the sequence (0, 1, N/2). If we begin with an index
24405 from the second operand, we can swap the operands. */
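  /* For example, {4, 5, 0, 1} on V4SImode becomes {0, 1, 4, 5} with the
     two input operands swapped.  */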
6a70badb
RS
24406 poly_int64 nelt = d->perm.length ();
24407 if (known_ge (d->perm[0], nelt))
88b08073 24408 {
e3342de4 24409 d->perm.rotate_inputs (1);
cb5c6c29 24410 std::swap (d->op0, d->op1);
88b08073
JG
24411 }
24412
721c0fb3 24413 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
43cacb12 24414 || d->vec_flags == VEC_SVE_DATA
6c3ce63b 24415 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
43cacb12
RS
24416 || d->vec_flags == VEC_SVE_PRED)
24417 && known_gt (nelt, 1))
cc4d934f 24418 {
494bec02
PK
24419 if (d->vmode == d->op_mode)
24420 {
24421 if (aarch64_evpc_rev_local (d))
24422 return true;
24423 else if (aarch64_evpc_rev_global (d))
24424 return true;
24425 else if (aarch64_evpc_ext (d))
24426 return true;
24427 else if (aarch64_evpc_dup (d))
24428 return true;
24429 else if (aarch64_evpc_zip (d))
24430 return true;
24431 else if (aarch64_evpc_uzp (d))
24432 return true;
24433 else if (aarch64_evpc_trn (d))
24434 return true;
24435 else if (aarch64_evpc_sel (d))
24436 return true;
24437 else if (aarch64_evpc_ins (d))
24438 return true;
24439 else if (aarch64_evpc_reencode (d))
24440 return true;
24441
24442 if (d->vec_flags == VEC_SVE_DATA)
24443 return aarch64_evpc_sve_tbl (d);
24444 else if (d->vec_flags == VEC_ADVSIMD)
24445 return aarch64_evpc_tbl (d);
24446 }
24447 else
24448 {
24449 if (aarch64_evpc_sve_dup (d))
24450 return true;
24451 }
cc4d934f 24452 }
88b08073
JG
24453 return false;
24454}
24455
f151c9e1 24456/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
88b08073 24457
f151c9e1 24458static bool
ae8decf1
PK
24459aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24460 rtx target, rtx op0, rtx op1,
24461 const vec_perm_indices &sel)
88b08073
JG
24462{
24463 struct expand_vec_perm_d d;
88b08073 24464
326ac20e 24465 /* Check whether the mask can be applied to a single vector. */
e25c95ef
RS
24466 if (sel.ninputs () == 1
24467 || (op0 && rtx_equal_p (op0, op1)))
326ac20e
RS
24468 d.one_vector_p = true;
24469 else if (sel.all_from_input_p (0))
88b08073 24470 {
326ac20e
RS
24471 d.one_vector_p = true;
24472 op1 = op0;
88b08073 24473 }
326ac20e 24474 else if (sel.all_from_input_p (1))
88b08073 24475 {
88b08073 24476 d.one_vector_p = true;
326ac20e 24477 op0 = op1;
88b08073 24478 }
326ac20e
RS
24479 else
24480 d.one_vector_p = false;
88b08073 24481
326ac20e
RS
24482 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24483 sel.nelts_per_input ());
24484 d.vmode = vmode;
43cacb12 24485 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
494bec02
PK
24486 d.op_mode = op_mode;
24487 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
326ac20e 24488 d.target = target;
4cbebddc 24489 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
b1d1e2b5
JJ
24490 if (op0 == op1)
24491 d.op1 = d.op0;
24492 else
4cbebddc 24493 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
326ac20e 24494 d.testing_p = !target;
e3342de4 24495
f151c9e1
RS
24496 if (!d.testing_p)
24497 return aarch64_expand_vec_perm_const_1 (&d);
88b08073 24498
326ac20e 24499 rtx_insn *last = get_last_insn ();
f151c9e1 24500 bool ret = aarch64_expand_vec_perm_const_1 (&d);
326ac20e 24501 gcc_assert (last == get_last_insn ());
88b08073
JG
24502
24503 return ret;
24504}
24505
c98aabc1
TC
24506/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */
24507
24508bool
24509aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
24510 tree vectype, wide_int cst,
24511 rtx *output, rtx in0, rtx in1)
24512{
24513 if (code != TRUNC_DIV_EXPR
24514 || !TYPE_UNSIGNED (vectype))
24515 return false;
24516
8c2451ba
TC
24517 machine_mode mode = TYPE_MODE (vectype);
24518 unsigned int flags = aarch64_classify_vector_mode (mode);
c98aabc1
TC
24519 if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
24520 return false;
24521
71f3036b
TC
24522 int pow = wi::exact_log2 (cst + 1);
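  /* The bitmask division pattern only handles unsigned divisors of the
     form 2^(precision/2) - 1, e.g. dividing 16-bit elements by 0xff.  */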
24523 auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
24524 /* SVE actually has a div operator, we may have gotten here through
24525 that route. */
24526 if (pow != (int) (element_precision (vectype) / 2)
24527 || insn_code == CODE_FOR_nothing)
24528 return false;
24529
24530 /* We can use the optimized pattern. */
c98aabc1 24531 if (in0 == NULL_RTX && in1 == NULL_RTX)
71f3036b 24532 return true;
c98aabc1 24533
c98aabc1
TC
24534 gcc_assert (output);
24535
8c2451ba
TC
24536 expand_operand ops[3];
24537 create_output_operand (&ops[0], *output, mode);
24538 create_input_operand (&ops[1], in0, mode);
24539 create_fixed_operand (&ops[2], in1);
24540 expand_insn (insn_code, 3, ops);
24541 *output = ops[0].value;
c98aabc1
TC
24542 return true;
24543}
24544
73e3da51
RS
24545/* Generate a byte permute mask for a register of mode MODE,
24546 which has NUNITS units. */
24547
668046d1 24548rtx
73e3da51 24549aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
668046d1
DS
24550{
 24551 /* We have to reverse each vector because we don't have
24552 a permuted load that can reverse-load according to ABI rules. */
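  /* For example, for V4SImode the mask bytes are { 3, 2, 1, 0, 7, 6, 5, 4,
     11, 10, 9, 8, 15, 14, 13, 12 }, reversing the bytes within each 32-bit
     element.  */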
24553 rtx mask;
24554 rtvec v = rtvec_alloc (16);
73e3da51
RS
24555 unsigned int i, j;
24556 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
668046d1
DS
24557
24558 gcc_assert (BYTES_BIG_ENDIAN);
24559 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24560
24561 for (i = 0; i < nunits; i++)
24562 for (j = 0; j < usize; j++)
24563 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24564 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24565 return force_reg (V16QImode, mask);
24566}
24567
4a942af6 24568/* Expand an SVE integer comparison using the SVE equivalent of:
f22d7973 24569
4a942af6
RS
24570 (set TARGET (CODE OP0 OP1)). */
24571
24572void
24573aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
f22d7973 24574{
4a942af6
RS
24575 machine_mode pred_mode = GET_MODE (target);
24576 machine_mode data_mode = GET_MODE (op0);
00fa90d9
RS
24577 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24578 op0, op1);
24579 if (!rtx_equal_p (target, res))
24580 emit_move_insn (target, res);
f22d7973
RS
24581}
24582
43cacb12
RS
24583/* Return the UNSPEC_COND_* code for comparison CODE. */
24584
24585static unsigned int
24586aarch64_unspec_cond_code (rtx_code code)
24587{
24588 switch (code)
24589 {
24590 case NE:
cb18e86d 24591 return UNSPEC_COND_FCMNE;
43cacb12 24592 case EQ:
cb18e86d 24593 return UNSPEC_COND_FCMEQ;
43cacb12 24594 case LT:
cb18e86d 24595 return UNSPEC_COND_FCMLT;
43cacb12 24596 case GT:
cb18e86d 24597 return UNSPEC_COND_FCMGT;
43cacb12 24598 case LE:
cb18e86d 24599 return UNSPEC_COND_FCMLE;
43cacb12 24600 case GE:
cb18e86d 24601 return UNSPEC_COND_FCMGE;
4a942af6
RS
24602 case UNORDERED:
24603 return UNSPEC_COND_FCMUO;
43cacb12
RS
24604 default:
24605 gcc_unreachable ();
24606 }
24607}
24608
f22d7973 24609/* Emit:
43cacb12 24610
4a942af6 24611 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 24612
4a942af6
RS
24613 where <X> is the operation associated with comparison CODE.
24614 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
f22d7973
RS
24615
24616static void
4a942af6
RS
24617aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24618 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 24619{
4a942af6 24620 rtx flag = gen_int_mode (known_ptrue_p, SImode);
f22d7973 24621 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
4a942af6 24622 gen_rtvec (4, pred, flag, op0, op1),
f22d7973
RS
24623 aarch64_unspec_cond_code (code));
24624 emit_set_insn (target, unspec);
43cacb12
RS
24625}
24626
f22d7973 24627/* Emit the SVE equivalent of:
43cacb12 24628
4a942af6
RS
24629 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24630 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
f22d7973 24631 (set TARGET (ior:PRED_MODE TMP1 TMP2))
43cacb12 24632
4a942af6
RS
24633 where <Xi> is the operation associated with comparison CODEi.
24634 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
24635
24636static void
4a942af6
RS
24637aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24638 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 24639{
4a942af6 24640 machine_mode pred_mode = GET_MODE (pred);
43cacb12 24641 rtx tmp1 = gen_reg_rtx (pred_mode);
4a942af6 24642 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
43cacb12 24643 rtx tmp2 = gen_reg_rtx (pred_mode);
4a942af6 24644 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
f22d7973 24645 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
43cacb12
RS
24646}
24647
f22d7973 24648/* Emit the SVE equivalent of:
43cacb12 24649
4a942af6 24650 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 24651 (set TARGET (not TMP))
43cacb12 24652
4a942af6
RS
24653 where <X> is the operation associated with comparison CODE.
24654 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
24655
24656static void
4a942af6
RS
24657aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24658 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 24659{
4a942af6 24660 machine_mode pred_mode = GET_MODE (pred);
f22d7973 24661 rtx tmp = gen_reg_rtx (pred_mode);
4a942af6 24662 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
f22d7973 24663 aarch64_emit_unop (target, one_cmpl_optab, tmp);
43cacb12
RS
24664}
24665
f22d7973 24666/* Expand an SVE floating-point comparison using the SVE equivalent of:
43cacb12 24667
f22d7973 24668 (set TARGET (CODE OP0 OP1))
43cacb12
RS
24669
24670 If CAN_INVERT_P is true, the caller can also handle inverted results;
24671 return true if the result is in fact inverted. */
24672
24673bool
24674aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24675 rtx op0, rtx op1, bool can_invert_p)
24676{
24677 machine_mode pred_mode = GET_MODE (target);
24678 machine_mode data_mode = GET_MODE (op0);
24679
16de3637 24680 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
24681 switch (code)
24682 {
24683 case UNORDERED:
24684 /* UNORDERED has no immediate form. */
24685 op1 = force_reg (data_mode, op1);
f22d7973 24686 /* fall through */
43cacb12
RS
24687 case LT:
24688 case LE:
24689 case GT:
24690 case GE:
24691 case EQ:
24692 case NE:
f22d7973
RS
24693 {
24694 /* There is native support for the comparison. */
4a942af6 24695 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
24696 return false;
24697 }
43cacb12
RS
24698
24699 case LTGT:
24700 /* This is a trapping operation (LT or GT). */
4a942af6 24701 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
43cacb12
RS
24702 return false;
24703
24704 case UNEQ:
24705 if (!flag_trapping_math)
24706 {
24707 /* This would trap for signaling NaNs. */
24708 op1 = force_reg (data_mode, op1);
4a942af6
RS
24709 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24710 ptrue, true, op0, op1);
43cacb12
RS
24711 return false;
24712 }
24713 /* fall through */
43cacb12
RS
24714 case UNLT:
24715 case UNLE:
24716 case UNGT:
24717 case UNGE:
f22d7973
RS
24718 if (flag_trapping_math)
24719 {
24720 /* Work out which elements are ordered. */
24721 rtx ordered = gen_reg_rtx (pred_mode);
24722 op1 = force_reg (data_mode, op1);
4a942af6
RS
24723 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24724 ptrue, true, op0, op1);
f22d7973
RS
24725
24726 /* Test the opposite condition for the ordered elements,
24727 then invert the result. */
24728 if (code == UNEQ)
24729 code = NE;
24730 else
24731 code = reverse_condition_maybe_unordered (code);
24732 if (can_invert_p)
24733 {
4a942af6
RS
24734 aarch64_emit_sve_fp_cond (target, code,
24735 ordered, false, op0, op1);
f22d7973
RS
24736 return true;
24737 }
4a942af6
RS
24738 aarch64_emit_sve_invert_fp_cond (target, code,
24739 ordered, false, op0, op1);
f22d7973
RS
24740 return false;
24741 }
24742 break;
24743
24744 case ORDERED:
24745 /* ORDERED has no immediate form. */
24746 op1 = force_reg (data_mode, op1);
24747 break;
43cacb12
RS
24748
24749 default:
24750 gcc_unreachable ();
24751 }
f22d7973
RS
24752
24753 /* There is native support for the inverse comparison. */
24754 code = reverse_condition_maybe_unordered (code);
24755 if (can_invert_p)
24756 {
4a942af6 24757 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
24758 return true;
24759 }
4a942af6 24760 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973 24761 return false;
43cacb12
RS
24762}
24763
24764/* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24765 of the data being selected and CMP_MODE is the mode of the values being
24766 compared. */
24767
24768void
24769aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24770 rtx *ops)
24771{
10116ec1 24772 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
43cacb12
RS
24773 rtx pred = gen_reg_rtx (pred_mode);
24774 if (FLOAT_MODE_P (cmp_mode))
24775 {
24776 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24777 ops[4], ops[5], true))
24778 std::swap (ops[1], ops[2]);
24779 }
24780 else
24781 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24782
d29f7dd5
RS
24783 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24784 ops[1] = force_reg (data_mode, ops[1]);
24785 /* The "false" value can only be zero if the "true" value is a constant. */
24786 if (register_operand (ops[1], data_mode)
24787 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24788 ops[2] = force_reg (data_mode, ops[2]);
24789
43cacb12
RS
24790 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24791 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24792}
24793
99e1629f
RS
24794/* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24795 true. However due to issues with register allocation it is preferable
 24796 to avoid tying integer scalar and FP scalar modes. Executing integer
24797 operations in general registers is better than treating them as scalar
24798 vector operations. This reduces latency and avoids redundant int<->FP
24799 moves. So tie modes if they are either the same class, or vector modes
24800 with other vector modes, vector structs or any scalar mode. */
97e1ad78 24801
99e1629f 24802static bool
ef4bddc2 24803aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
97e1ad78 24804{
66f206b8
JW
24805 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24806 != aarch64_advsimd_partial_struct_mode_p (mode2))
24807 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24808 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24809 return false;
24810
97e1ad78
JG
24811 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24812 return true;
24813
24814 /* We specifically want to allow elements of "structure" modes to
24815 be tieable to the structure. This more general condition allows
43cacb12
RS
24816 other rarer situations too. The reason we don't extend this to
24817 predicate modes is that there are no predicate structure modes
24818 nor any specific instructions for extracting part of a predicate
24819 register. */
24820 if (aarch64_vector_data_mode_p (mode1)
24821 && aarch64_vector_data_mode_p (mode2))
61f17a5c
WD
24822 return true;
24823
24824 /* Also allow any scalar modes with vectors. */
24825 if (aarch64_vector_mode_supported_p (mode1)
24826 || aarch64_vector_mode_supported_p (mode2))
97e1ad78
JG
24827 return true;
24828
24829 return false;
24830}
24831
e2c75eea
JG
24832/* Return a new RTX holding the result of moving POINTER forward by
24833 AMOUNT bytes. */
24834
24835static rtx
6a70badb 24836aarch64_move_pointer (rtx pointer, poly_int64 amount)
e2c75eea
JG
24837{
24838 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24839
24840 return adjust_automodify_address (pointer, GET_MODE (pointer),
24841 next, amount);
24842}
24843
24844/* Return a new RTX holding the result of moving POINTER forward by the
24845 size of the mode it points to. */
24846
24847static rtx
24848aarch64_progress_pointer (rtx pointer)
24849{
6a70badb 24850 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
e2c75eea
JG
24851}
24852
24853/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24854 MODE bytes. */
24855
24856static void
24857aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
ef4bddc2 24858 machine_mode mode)
e2c75eea 24859{
7cda9e08
SD
 24860 /* Handle 256-bit memcpy separately. We do this by making two adjacent
 24861 copies in V4SImode so that we can use Q registers. */
24862 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24863 {
24864 mode = V4SImode;
24865 rtx reg1 = gen_reg_rtx (mode);
24866 rtx reg2 = gen_reg_rtx (mode);
24867 /* "Cast" the pointers to the correct mode. */
24868 *src = adjust_address (*src, mode, 0);
24869 *dst = adjust_address (*dst, mode, 0);
24870 /* Emit the memcpy. */
24871 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24872 aarch64_progress_pointer (*src)));
24873 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24874 aarch64_progress_pointer (*dst), reg2));
24875 /* Move the pointers forward. */
24876 *src = aarch64_move_pointer (*src, 32);
24877 *dst = aarch64_move_pointer (*dst, 32);
24878 return;
24879 }
24880
e2c75eea
JG
24881 rtx reg = gen_reg_rtx (mode);
24882
24883 /* "Cast" the pointers to the correct mode. */
24884 *src = adjust_address (*src, mode, 0);
24885 *dst = adjust_address (*dst, mode, 0);
24886 /* Emit the memcpy. */
24887 emit_move_insn (reg, *src);
24888 emit_move_insn (*dst, reg);
24889 /* Move the pointers forward. */
24890 *src = aarch64_progress_pointer (*src);
24891 *dst = aarch64_progress_pointer (*dst);
24892}
24893
0caf592d
KT
24894/* Expand a cpymem using the MOPS extension. OPERANDS are taken
24895 from the cpymem pattern. Return true iff we succeeded. */
24896static bool
24897aarch64_expand_cpymem_mops (rtx *operands)
24898{
24899 if (!TARGET_MOPS)
24900 return false;
65b77d0e
RS
24901
24902 /* All three registers are changed by the instruction, so each one
24903 must be a fresh pseudo. */
24904 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24905 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24906 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24907 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24908 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24909 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
0caf592d
KT
24910
24911 return true;
24912}
24913
76715c32 24914/* Expand cpymem, as if from a __builtin_memcpy. Return true if
a459ee44
KT
24915 we succeed, otherwise return false, indicating that a libcall to
24916 memcpy should be emitted. */
e2c75eea
JG
24917
24918bool
76715c32 24919aarch64_expand_cpymem (rtx *operands)
e2c75eea 24920{
1d77928f 24921 int mode_bits;
e2c75eea
JG
24922 rtx dst = operands[0];
24923 rtx src = operands[1];
24924 rtx base;
1d77928f 24925 machine_mode cur_mode = BLKmode;
e2c75eea 24926
0caf592d 24927 /* Variable-sized memcpy can go through the MOPS expansion if available. */
e2c75eea 24928 if (!CONST_INT_P (operands[2]))
0caf592d 24929 return aarch64_expand_cpymem_mops (operands);
e2c75eea 24930
1d77928f 24931 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
e2c75eea 24932
0caf592d
KT
24933 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24934 unsigned HOST_WIDE_INT max_copy_size
24935 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
1d77928f 24936
a459ee44
KT
24937 bool size_p = optimize_function_for_size_p (cfun);
24938
0caf592d
KT
24939 /* Large constant-sized cpymem should go through MOPS when possible.
24940 It should be a win even for size optimization in the general case.
24941 For speed optimization the choice between MOPS and the SIMD sequence
24942 depends on the size of the copy, rather than number of instructions,
24943 alignment etc. */
a459ee44 24944 if (size > max_copy_size)
0caf592d 24945 return aarch64_expand_cpymem_mops (operands);
e2c75eea 24946
1d77928f
WD
24947 int copy_bits = 256;
24948
 24949 /* Default to 256-bit LDP/STP on large copies; however, for small copies, no SIMD
 24950 support, or slow 256-bit LDP/STP, fall back to 128-bit chunks. */
24951 if (size <= 24
24952 || !TARGET_SIMD
24953 || (aarch64_tune_params.extra_tuning_flags
24954 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
a459ee44 24955 copy_bits = 128;
1d77928f 24956
a459ee44
KT
24957 /* Emit an inline load+store sequence and count the number of operations
24958 involved. We use a simple count of just the loads and stores emitted
24959 rather than rtx_insn count as all the pointer adjustments and reg copying
24960 in this function will get optimized away later in the pipeline. */
24961 start_sequence ();
24962 unsigned nops = 0;
0f801e0b 24963
e2c75eea
JG
24964 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24965 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24966
24967 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24968 src = adjust_automodify_address (src, VOIDmode, base, 0);
24969
1d77928f
WD
24970 /* Convert size to bits to make the rest of the code simpler. */
24971 int n = size * BITS_PER_UNIT;
f7e1d19d 24972
89c52e5e 24973 while (n > 0)
e2c75eea 24974 {
89c52e5e
TC
 24975 /* Find the largest mode in which to do the copy without over reading
24976 or writing. */
24977 opt_scalar_int_mode mode_iter;
24978 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
1d77928f 24979 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
89c52e5e 24980 cur_mode = mode_iter.require ();
e2c75eea 24981
89c52e5e 24982 gcc_assert (cur_mode != BLKmode);
e2c75eea 24983
89c52e5e 24984 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
1d77928f
WD
24985
24986 /* Prefer Q-register accesses for the last bytes. */
24987 if (mode_bits == 128 && copy_bits == 256)
24988 cur_mode = V4SImode;
24989
89c52e5e 24990 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
a459ee44
KT
24991 /* A single block copy is 1 load + 1 store. */
24992 nops += 2;
89c52e5e 24993 n -= mode_bits;
e2c75eea 24994
0caf592d
KT
24995 /* Emit trailing copies using overlapping unaligned accesses
24996 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
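      /* For example, with 3 bytes left the tail is widened to an SImode
	 copy and the pointers are stepped back one byte, so the final
	 4-byte access overlaps a byte that has already been copied.  */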
24997 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
89c52e5e 24998 {
1d77928f 24999 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
f7e1d19d 25000 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
1d77928f 25001 gcc_assert (n_bits <= mode_bits);
89c52e5e
TC
25002 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
25003 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25004 n = n_bits;
e2c75eea
JG
25005 }
25006 }
a459ee44
KT
25007 rtx_insn *seq = get_insns ();
25008 end_sequence ();
0caf592d
KT
25009 /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
25010 the constant size into a register. */
25011 unsigned mops_cost = 3 + 1;
25012
25013 /* If MOPS is available at this point we don't consider the libcall as it's
25014 not a win even on code size. At this point only consider MOPS if
25015 optimizing for size. For speed optimizations we will have chosen between
25016 the two based on copy size already. */
25017 if (TARGET_MOPS)
25018 {
25019 if (size_p && mops_cost < nops)
25020 return aarch64_expand_cpymem_mops (operands);
25021 emit_insn (seq);
25022 return true;
25023 }
a459ee44
KT
25024
25025 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
0caf592d
KT
25026 arguments + 1 for the call. When MOPS is not available and we're
25027 optimizing for size a libcall may be preferable. */
a459ee44
KT
25028 unsigned libcall_cost = 4;
25029 if (size_p && libcall_cost < nops)
25030 return false;
e2c75eea 25031
a459ee44 25032 emit_insn (seq);
e2c75eea
JG
25033 return true;
25034}
25035
54bbde55
SD
25036/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
25037 SRC is a register we have created with the duplicated value to be set. */
25038static void
25039aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
25040 machine_mode mode)
25041{
 25042 /* If we are copying 128 bits or 256 bits, we can do that straight from
25043 the SIMD register we prepared. */
25044 if (known_eq (GET_MODE_BITSIZE (mode), 256))
25045 {
25046 mode = GET_MODE (src);
25047 /* "Cast" the *dst to the correct mode. */
25048 *dst = adjust_address (*dst, mode, 0);
25049 /* Emit the memset. */
25050 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
25051 aarch64_progress_pointer (*dst), src));
25052
25053 /* Move the pointers forward. */
25054 *dst = aarch64_move_pointer (*dst, 32);
25055 return;
25056 }
25057 if (known_eq (GET_MODE_BITSIZE (mode), 128))
25058 {
25059 /* "Cast" the *dst to the correct mode. */
25060 *dst = adjust_address (*dst, GET_MODE (src), 0);
25061 /* Emit the memset. */
25062 emit_move_insn (*dst, src);
25063 /* Move the pointers forward. */
25064 *dst = aarch64_move_pointer (*dst, 16);
25065 return;
25066 }
25067 /* For copying less, we have to extract the right amount from src. */
25068 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
25069
25070 /* "Cast" the *dst to the correct mode. */
25071 *dst = adjust_address (*dst, mode, 0);
25072 /* Emit the memset. */
25073 emit_move_insn (*dst, reg);
25074 /* Move the pointer forward. */
25075 *dst = aarch64_progress_pointer (*dst);
25076}
25077
d3bd985e
KT
25078/* Expand a setmem using the MOPS instructions. OPERANDS are the same
25079 as for the setmem pattern. Return true iff we succeed. */
25080static bool
25081aarch64_expand_setmem_mops (rtx *operands)
25082{
25083 if (!TARGET_MOPS)
25084 return false;
25085
65b77d0e
RS
25086 /* The first two registers are changed by the instruction, so both
25087 of them must be a fresh pseudo. */
25088 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
25089 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
25090 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
d3bd985e 25091 rtx val = operands[2];
65b77d0e
RS
25092 if (val != CONST0_RTX (QImode))
25093 val = force_reg (QImode, val);
25094 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
d3bd985e
KT
25095 return true;
25096}
25097
54bbde55
SD
25098/* Expand setmem, as if from a __builtin_memset. Return true if
25099 we succeed, otherwise return false. */
25100
25101bool
25102aarch64_expand_setmem (rtx *operands)
25103{
25104 int n, mode_bits;
25105 unsigned HOST_WIDE_INT len;
25106 rtx dst = operands[0];
25107 rtx val = operands[2], src;
25108 rtx base;
25109 machine_mode cur_mode = BLKmode, next_mode;
25110
d3bd985e
KT
25111 /* If we don't have SIMD registers or the size is variable use the MOPS
25112 inlined sequence if possible. */
25113 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
25114 return aarch64_expand_setmem_mops (operands);
54bbde55 25115
8f95e3c0 25116 bool size_p = optimize_function_for_size_p (cfun);
54bbde55 25117
d3bd985e
KT
 25118 /* Default the maximum to 256 bytes when considering only a libcall vs the
25119 SIMD broadcast sequence. */
54bbde55
SD
25120 unsigned max_set_size = 256;
25121
54bbde55 25122 len = INTVAL (operands[1]);
d3bd985e 25123 if (len > max_set_size && !TARGET_MOPS)
54bbde55
SD
25124 return false;
25125
d3bd985e
KT
25126 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
25127 /* The MOPS sequence takes:
25128 3 instructions for the memory storing
25129 + 1 to move the constant size into a reg
25130 + 1 if VAL is a non-zero constant to move into a reg
25131 (zero constants can use XZR directly). */
25132 unsigned mops_cost = 3 + 1 + cst_val;
25133 /* A libcall to memset in the worst case takes 3 instructions to prepare
25134 the arguments + 1 for the call. */
25135 unsigned libcall_cost = 4;
25136
25137 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25138 when available. */
25139 if (TARGET_MOPS
25140 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25141 return aarch64_expand_setmem_mops (operands);
25142
8f95e3c0 25143 /* Attempt a sequence with a vector broadcast followed by stores.
d3bd985e
KT
25144 Count the number of operations involved to see if it's worth it
25145 against the alternatives. A simple counter simd_ops on the
25146 algorithmically-relevant operations is used rather than an rtx_insn count
 25147 as all the pointer adjustments and mode reinterprets will be optimized
25148 away later. */
8f95e3c0 25149 start_sequence ();
d3bd985e
KT
25150 unsigned simd_ops = 0;
25151
54bbde55
SD
25152 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25153 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25154
25155 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25156 src = expand_vector_broadcast (V16QImode, val);
25157 src = force_reg (V16QImode, src);
d3bd985e 25158 simd_ops++;
54bbde55
SD
25159 /* Convert len to bits to make the rest of the code simpler. */
25160 n = len * BITS_PER_UNIT;
25161
25162 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
d3bd985e 25163 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
8f95e3c0
KT
25164 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25165 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
54bbde55
SD
25166 ? GET_MODE_BITSIZE (TImode) : 256;
25167
25168 while (n > 0)
25169 {
25170 /* Find the largest mode in which to do the copy without
25171 over writing. */
25172 opt_scalar_int_mode mode_iter;
25173 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25174 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25175 cur_mode = mode_iter.require ();
25176
25177 gcc_assert (cur_mode != BLKmode);
25178
25179 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25180 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
d3bd985e 25181 simd_ops++;
54bbde55
SD
25182 n -= mode_bits;
25183
25184 /* Do certain trailing copies as overlapping if it's going to be
 25185 cheaper, i.e. when it takes fewer instructions. For instance, doing a 15
25186 byte copy it's more efficient to do two overlapping 8 byte copies than
a45786e9
AP
25187 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25188 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
54bbde55
SD
25189 {
25190 next_mode = smallest_mode_for_size (n, MODE_INT);
25191 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25192 gcc_assert (n_bits <= mode_bits);
25193 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25194 n = n_bits;
25195 }
25196 }
8f95e3c0
KT
25197 rtx_insn *seq = get_insns ();
25198 end_sequence ();
54bbde55 25199
d3bd985e
KT
25200 if (size_p)
25201 {
25202 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
 25203 a call to memset, or the MOPS expansion. */
25204 if (TARGET_MOPS
25205 && mops_cost <= libcall_cost
25206 && mops_cost <= simd_ops)
25207 return aarch64_expand_setmem_mops (operands);
 25208 /* If MOPS is not available or not shorter, pick a libcall if the SIMD
25209 sequence is too long. */
25210 else if (libcall_cost < simd_ops)
25211 return false;
25212 emit_insn (seq);
25213 return true;
25214 }
25215
25216 /* At this point the SIMD broadcast sequence is the best choice when
25217 optimizing for speed. */
8f95e3c0 25218 emit_insn (seq);
54bbde55
SD
25219 return true;
25220}
25221
25222
141a3ccf
KT
25223/* Split a DImode store of a CONST_INT SRC to MEM DST as two
25224 SImode stores. Handle the case when the constant has identical
25225 bottom and top halves. This is beneficial when the two stores can be
25226 merged into an STP and we avoid synthesising potentially expensive
25227 immediates twice. Return true if such a split is possible. */
25228
25229bool
25230aarch64_split_dimode_const_store (rtx dst, rtx src)
25231{
25232 rtx lo = gen_lowpart (SImode, src);
25233 rtx hi = gen_highpart_mode (SImode, DImode, src);
25234
25235 bool size_p = optimize_function_for_size_p (cfun);
25236
25237 if (!rtx_equal_p (lo, hi))
25238 return false;
25239
25240 unsigned int orig_cost
25241 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25242 unsigned int lo_cost
25243 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25244
25245 /* We want to transform:
25246 MOV x1, 49370
25247 MOVK x1, 0x140, lsl 16
25248 MOVK x1, 0xc0da, lsl 32
25249 MOVK x1, 0x140, lsl 48
25250 STR x1, [x0]
25251 into:
25252 MOV w1, 49370
25253 MOVK w1, 0x140, lsl 16
25254 STP w1, w1, [x0]
25255 So we want to perform this only when we save two instructions
25256 or more. When optimizing for size, however, accept any code size
25257 savings we can. */
25258 if (size_p && orig_cost <= lo_cost)
25259 return false;
25260
25261 if (!size_p
25262 && (orig_cost <= lo_cost + 1))
25263 return false;
25264
25265 rtx mem_lo = adjust_address (dst, SImode, 0);
25266 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25267 return false;
25268
25269 rtx tmp_reg = gen_reg_rtx (SImode);
25270 aarch64_expand_mov_immediate (tmp_reg, lo);
25271 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25272 /* Don't emit an explicit store pair as this may not be always profitable.
25273 Let the sched-fusion logic decide whether to merge them. */
25274 emit_move_insn (mem_lo, tmp_reg);
25275 emit_move_insn (mem_hi, tmp_reg);
25276
25277 return true;
25278}
25279
30c46053
MC
25280/* Generate RTL for a conditional branch with rtx comparison CODE in
25281 mode CC_MODE. The destination of the unlikely conditional branch
25282 is LABEL_REF. */
25283
25284void
25285aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25286 rtx label_ref)
25287{
25288 rtx x;
25289 x = gen_rtx_fmt_ee (code, VOIDmode,
25290 gen_rtx_REG (cc_mode, CC_REGNUM),
25291 const0_rtx);
25292
25293 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25294 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25295 pc_rtx);
25296 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25297}
25298
25299/* Generate DImode scratch registers for 128-bit (TImode) addition.
25300
25301 OP1 represents the TImode destination operand 1
25302 OP2 represents the TImode destination operand 2
25303 LOW_DEST represents the low half (DImode) of TImode operand 0
25304 LOW_IN1 represents the low half (DImode) of TImode operand 1
25305 LOW_IN2 represents the low half (DImode) of TImode operand 2
25306 HIGH_DEST represents the high half (DImode) of TImode operand 0
25307 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25308 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25309
25310void
25311aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25312 rtx *low_in1, rtx *low_in2,
25313 rtx *high_dest, rtx *high_in1,
25314 rtx *high_in2)
25315{
25316 *low_dest = gen_reg_rtx (DImode);
25317 *low_in1 = gen_lowpart (DImode, op1);
25318 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25319 subreg_lowpart_offset (DImode, TImode));
25320 *high_dest = gen_reg_rtx (DImode);
25321 *high_in1 = gen_highpart (DImode, op1);
25322 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25323 subreg_highpart_offset (DImode, TImode));
25324}
25325
25326/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25327
25328   This function differs from 'aarch64_addti_scratch_regs' in that
25329 OP1 can be an immediate constant (zero). We must call
25330   subreg_highpart_offset with DImode and TImode arguments, otherwise
25331   VOIDmode will be used for the const_int, which generates an internal
25332   error from subreg_size_highpart_offset, which does not expect a size of zero.
25333
25334 OP1 represents the TImode destination operand 1
25335 OP2 represents the TImode destination operand 2
25336 LOW_DEST represents the low half (DImode) of TImode operand 0
25337 LOW_IN1 represents the low half (DImode) of TImode operand 1
25338 LOW_IN2 represents the low half (DImode) of TImode operand 2
25339 HIGH_DEST represents the high half (DImode) of TImode operand 0
25340 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25341 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25342
25343
25344void
25345aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25346 rtx *low_in1, rtx *low_in2,
25347 rtx *high_dest, rtx *high_in1,
25348 rtx *high_in2)
25349{
25350 *low_dest = gen_reg_rtx (DImode);
25351 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25352 subreg_lowpart_offset (DImode, TImode));
25353
25354 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25355 subreg_lowpart_offset (DImode, TImode));
25356 *high_dest = gen_reg_rtx (DImode);
25357
25358 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25359 subreg_highpart_offset (DImode, TImode));
25360 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25361 subreg_highpart_offset (DImode, TImode));
25362}
25363
25364/* Generate RTL for 128-bit (TImode) subtraction with overflow.
25365
25366 OP0 represents the TImode destination operand 0
25367 LOW_DEST represents the low half (DImode) of TImode operand 0
25368 LOW_IN1 represents the low half (DImode) of TImode operand 1
25369 LOW_IN2 represents the low half (DImode) of TImode operand 2
25370 HIGH_DEST represents the high half (DImode) of TImode operand 0
25371 HIGH_IN1 represents the high half (DImode) of TImode operand 1
a58fe3c5
RE
25372 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25373 UNSIGNED_P is true if the operation is being performed on unsigned
25374 values. */
30c46053
MC
25375void
25376aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25377 rtx low_in2, rtx high_dest, rtx high_in1,
a58fe3c5 25378 rtx high_in2, bool unsigned_p)
30c46053
MC
25379{
25380 if (low_in2 == const0_rtx)
25381 {
25382 low_dest = low_in1;
a58fe3c5
RE
25383 high_in2 = force_reg (DImode, high_in2);
25384 if (unsigned_p)
25385 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25386 else
25387 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
30c46053
MC
25388 }
25389 else
25390 {
d80f0a8d
JJ
25391 if (aarch64_plus_immediate (low_in2, DImode))
25392 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
618ae596 25393 GEN_INT (-UINTVAL (low_in2))));
d80f0a8d 25394 else
30c46053 25395 {
d80f0a8d
JJ
25396 low_in2 = force_reg (DImode, low_in2);
25397 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
30c46053 25398 }
d80f0a8d 25399 high_in2 = force_reg (DImode, high_in2);
a58fe3c5
RE
25400
25401 if (unsigned_p)
25402 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25403 else
25404 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
30c46053
MC
25405 }
25406
25407 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25408 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25409
25410}
25411
a3125fc2
CL
25412/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
25413
25414static unsigned HOST_WIDE_INT
25415aarch64_asan_shadow_offset (void)
25416{
10078f3e
AP
25417 if (TARGET_ILP32)
25418 return (HOST_WIDE_INT_1 << 29);
25419 else
25420 return (HOST_WIDE_INT_1 << 36);
a3125fc2
CL
25421}
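/* Illustrative note: with the usual ASan shadow granularity of 8 bytes
   (a shift of 3, the generic ASAN_SHADOW_SHIFT), an address ADDR is
   checked via shadow = (ADDR >> 3) + offset, where the offset returned
   above is 1 << 36 for LP64 and 1 << 29 for ILP32.  */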
25422
5f3bc026 25423static rtx
cb4347e8 25424aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
5f3bc026
ZC
25425 int code, tree treeop0, tree treeop1)
25426{
c8012fbc
WD
25427 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25428 rtx op0, op1;
5f3bc026 25429 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 25430 insn_code icode;
5f3bc026
ZC
25431 struct expand_operand ops[4];
25432
5f3bc026
ZC
25433 start_sequence ();
25434 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25435
25436 op_mode = GET_MODE (op0);
25437 if (op_mode == VOIDmode)
25438 op_mode = GET_MODE (op1);
25439
25440 switch (op_mode)
25441 {
4e10a5a7
RS
25442 case E_QImode:
25443 case E_HImode:
25444 case E_SImode:
5f3bc026
ZC
25445 cmp_mode = SImode;
25446 icode = CODE_FOR_cmpsi;
25447 break;
25448
4e10a5a7 25449 case E_DImode:
5f3bc026
ZC
25450 cmp_mode = DImode;
25451 icode = CODE_FOR_cmpdi;
25452 break;
25453
4e10a5a7 25454 case E_SFmode:
786e3c06
WD
25455 cmp_mode = SFmode;
25456 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25457 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25458 break;
25459
4e10a5a7 25460 case E_DFmode:
786e3c06
WD
25461 cmp_mode = DFmode;
25462 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25463 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25464 break;
25465
5f3bc026
ZC
25466 default:
25467 end_sequence ();
25468 return NULL_RTX;
25469 }
25470
c8012fbc
WD
25471 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25472 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
5f3bc026
ZC
25473 if (!op0 || !op1)
25474 {
25475 end_sequence ();
25476 return NULL_RTX;
25477 }
25478 *prep_seq = get_insns ();
25479 end_sequence ();
25480
c8012fbc
WD
25481 create_fixed_operand (&ops[0], op0);
25482 create_fixed_operand (&ops[1], op1);
5f3bc026
ZC
25483
25484 start_sequence ();
c8012fbc 25485 if (!maybe_expand_insn (icode, 2, ops))
5f3bc026
ZC
25486 {
25487 end_sequence ();
25488 return NULL_RTX;
25489 }
25490 *gen_seq = get_insns ();
25491 end_sequence ();
25492
c8012fbc
WD
25493 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25494 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
5f3bc026
ZC
25495}
25496
25497static rtx
cb4347e8
TS
25498aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25499 int cmp_code, tree treeop0, tree treeop1, int bit_code)
5f3bc026 25500{
c8012fbc
WD
25501 rtx op0, op1, target;
25502 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
5f3bc026 25503 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 25504 insn_code icode;
5f3bc026 25505 struct expand_operand ops[6];
c8012fbc 25506 int aarch64_cond;
5f3bc026 25507
cb4347e8 25508 push_to_sequence (*prep_seq);
5f3bc026
ZC
25509 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25510
25511 op_mode = GET_MODE (op0);
25512 if (op_mode == VOIDmode)
25513 op_mode = GET_MODE (op1);
25514
25515 switch (op_mode)
25516 {
4e10a5a7
RS
25517 case E_QImode:
25518 case E_HImode:
25519 case E_SImode:
5f3bc026 25520 cmp_mode = SImode;
5f3bc026
ZC
25521 break;
25522
4e10a5a7 25523 case E_DImode:
5f3bc026 25524 cmp_mode = DImode;
5f3bc026
ZC
25525 break;
25526
4e10a5a7 25527 case E_SFmode:
786e3c06
WD
25528 cmp_mode = SFmode;
25529 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
25530 break;
25531
4e10a5a7 25532 case E_DFmode:
786e3c06
WD
25533 cmp_mode = DFmode;
25534 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
25535 break;
25536
5f3bc026
ZC
25537 default:
25538 end_sequence ();
25539 return NULL_RTX;
25540 }
25541
865257c4
RS
25542 icode = code_for_ccmp (cc_mode, cmp_mode);
25543
5f3bc026
ZC
25544 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25545 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25546 if (!op0 || !op1)
25547 {
25548 end_sequence ();
25549 return NULL_RTX;
25550 }
25551 *prep_seq = get_insns ();
25552 end_sequence ();
25553
25554 target = gen_rtx_REG (cc_mode, CC_REGNUM);
c8012fbc 25555 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
5f3bc026 25556
c8012fbc
WD
25557 if (bit_code != AND)
25558 {
865257c4
RS
25559 /* Treat the ccmp patterns as canonical and use them where possible,
25560 but fall back to ccmp_rev patterns if there's no other option. */
25561 rtx_code prev_code = GET_CODE (prev);
25562 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25563 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25564 && !(prev_code == EQ
25565 || prev_code == NE
25566 || prev_code == ORDERED
25567 || prev_code == UNORDERED))
25568 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25569 else
25570 {
25571 rtx_code code = reverse_condition (prev_code);
25572 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25573 }
c8012fbc
WD
25574 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25575 }
25576
25577 create_fixed_operand (&ops[0], XEXP (prev, 0));
5f3bc026
ZC
25578 create_fixed_operand (&ops[1], target);
25579 create_fixed_operand (&ops[2], op0);
25580 create_fixed_operand (&ops[3], op1);
c8012fbc
WD
25581 create_fixed_operand (&ops[4], prev);
25582 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
5f3bc026 25583
cb4347e8 25584 push_to_sequence (*gen_seq);
5f3bc026
ZC
25585 if (!maybe_expand_insn (icode, 6, ops))
25586 {
25587 end_sequence ();
25588 return NULL_RTX;
25589 }
25590
25591 *gen_seq = get_insns ();
25592 end_sequence ();
25593
c8012fbc 25594 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
5f3bc026
ZC
25595}
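/* For illustration: for a condition such as (a == 0 && b > 5) the two hooks
   above let the middle end emit one flag-setting compare for the first term
   and a conditional compare for the second, roughly

     cmp   w0, 0
     ccmp  w1, 5, <nzcv>, eq
     b.gt  <target>

   instead of two compares and two branches.  The exact NZCV immediate is
   chosen by the ccmp patterns themselves.  */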
25596
25597#undef TARGET_GEN_CCMP_FIRST
25598#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25599
25600#undef TARGET_GEN_CCMP_NEXT
25601#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25602
6a569cdd
KT
25603/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25604 instruction fusion of some sort. */
25605
25606static bool
25607aarch64_macro_fusion_p (void)
25608{
b175b679 25609 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
6a569cdd
KT
25610}
25611
25612
25613/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25614 should be kept together during scheduling. */
25615
25616static bool
25617aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25618{
25619 rtx set_dest;
25620 rtx prev_set = single_set (prev);
25621 rtx curr_set = single_set (curr);
25622 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
25623 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25624
25625 if (!aarch64_macro_fusion_p ())
25626 return false;
25627
d7b03373 25628 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
6a569cdd
KT
25629 {
25630 /* We are trying to match:
25631 prev (mov) == (set (reg r0) (const_int imm16))
25632 curr (movk) == (set (zero_extract (reg r0)
25633 (const_int 16)
25634 (const_int 16))
25635 (const_int imm16_1)) */
25636
25637 set_dest = SET_DEST (curr_set);
25638
25639 if (GET_CODE (set_dest) == ZERO_EXTRACT
25640 && CONST_INT_P (SET_SRC (curr_set))
25641 && CONST_INT_P (SET_SRC (prev_set))
25642 && CONST_INT_P (XEXP (set_dest, 2))
25643 && INTVAL (XEXP (set_dest, 2)) == 16
25644 && REG_P (XEXP (set_dest, 0))
25645 && REG_P (SET_DEST (prev_set))
25646 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25647 {
25648 return true;
25649 }
25650 }
25651
d7b03373 25652 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
9bbe08fe
KT
25653 {
25654
25655 /* We're trying to match:
25656 prev (adrp) == (set (reg r1)
25657 (high (symbol_ref ("SYM"))))
25658 curr (add) == (set (reg r0)
25659 (lo_sum (reg r1)
25660 (symbol_ref ("SYM"))))
25661 Note that r0 need not necessarily be the same as r1, especially
25662 during pre-regalloc scheduling. */
25663
25664 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25665 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25666 {
25667 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25668 && REG_P (XEXP (SET_SRC (curr_set), 0))
25669 && REGNO (XEXP (SET_SRC (curr_set), 0))
25670 == REGNO (SET_DEST (prev_set))
25671 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25672 XEXP (SET_SRC (curr_set), 1)))
25673 return true;
25674 }
25675 }
25676
d7b03373 25677 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
cd0cb232
KT
25678 {
25679
25680 /* We're trying to match:
25681 prev (movk) == (set (zero_extract (reg r0)
25682 (const_int 16)
25683 (const_int 32))
25684 (const_int imm16_1))
25685 curr (movk) == (set (zero_extract (reg r0)
25686 (const_int 16)
25687 (const_int 48))
25688 (const_int imm16_2)) */
25689
25690 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25691 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25692 && REG_P (XEXP (SET_DEST (prev_set), 0))
25693 && REG_P (XEXP (SET_DEST (curr_set), 0))
25694 && REGNO (XEXP (SET_DEST (prev_set), 0))
25695 == REGNO (XEXP (SET_DEST (curr_set), 0))
25696 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25697 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25698 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25699 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25700 && CONST_INT_P (SET_SRC (prev_set))
25701 && CONST_INT_P (SET_SRC (curr_set)))
25702 return true;
25703
25704 }
d7b03373 25705 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
d8354ad7
KT
25706 {
25707 /* We're trying to match:
25708 prev (adrp) == (set (reg r0)
25709 (high (symbol_ref ("SYM"))))
25710 curr (ldr) == (set (reg r1)
25711 (mem (lo_sum (reg r0)
25712 (symbol_ref ("SYM")))))
25713 or
25714 curr (ldr) == (set (reg r1)
25715 (zero_extend (mem
25716 (lo_sum (reg r0)
25717 (symbol_ref ("SYM")))))) */
25718 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25719 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25720 {
25721 rtx curr_src = SET_SRC (curr_set);
25722
25723 if (GET_CODE (curr_src) == ZERO_EXTEND)
25724 curr_src = XEXP (curr_src, 0);
25725
25726 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25727 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25728 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25729 == REGNO (SET_DEST (prev_set))
25730 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25731 XEXP (SET_SRC (prev_set), 0)))
25732 return true;
25733 }
25734 }
cd0cb232 25735
a4f3fa71 25736 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
d7b03373 25737 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
a4f3fa71
WD
25738 && prev_set && curr_set && any_condjump_p (curr)
25739 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25740 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25741 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25742 return true;
25743
25744 /* Fuse flag-setting ALU instructions and conditional branch. */
25745 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
3759108f
AP
25746 && any_condjump_p (curr))
25747 {
509f819a
N
25748 unsigned int condreg1, condreg2;
25749 rtx cc_reg_1;
25750 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25751 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25752
25753 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25754 && prev
25755 && modified_in_p (cc_reg_1, prev))
25756 {
f8a27206
AP
25757 enum attr_type prev_type = get_attr_type (prev);
25758
509f819a
N
25759	  /* FIXME: this misses some instructions that are considered simple
25760	     arithmetic for ThunderX.  Simple shifts are missed here.  */
25761 if (prev_type == TYPE_ALUS_SREG
25762 || prev_type == TYPE_ALUS_IMM
25763 || prev_type == TYPE_LOGICS_REG
25764 || prev_type == TYPE_LOGICS_IMM)
25765 return true;
25766 }
3759108f
AP
25767 }
25768
a4f3fa71 25769 /* Fuse ALU instructions and CBZ/CBNZ. */
bee7e0fc
AP
25770 if (prev_set
25771 && curr_set
a4f3fa71 25772 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
00c7c57f
JB
25773 && any_condjump_p (curr))
25774 {
25775 /* We're trying to match:
25776 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25777 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25778 (const_int 0))
25779 (label_ref ("SYM"))
25780 (pc)) */
25781 if (SET_DEST (curr_set) == (pc_rtx)
25782 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25783 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25784 && REG_P (SET_DEST (prev_set))
25785 && REGNO (SET_DEST (prev_set))
25786 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25787 {
25788 /* Fuse ALU operations followed by conditional branch instruction. */
25789 switch (get_attr_type (prev))
25790 {
25791 case TYPE_ALU_IMM:
25792 case TYPE_ALU_SREG:
25793 case TYPE_ADC_REG:
25794 case TYPE_ADC_IMM:
25795 case TYPE_ADCS_REG:
25796 case TYPE_ADCS_IMM:
25797 case TYPE_LOGIC_REG:
25798 case TYPE_LOGIC_IMM:
25799 case TYPE_CSEL:
25800 case TYPE_ADR:
25801 case TYPE_MOV_IMM:
25802 case TYPE_SHIFT_REG:
25803 case TYPE_SHIFT_IMM:
25804 case TYPE_BFM:
25805 case TYPE_RBIT:
25806 case TYPE_REV:
25807 case TYPE_EXTEND:
25808 return true;
25809
25810 default:;
25811 }
25812 }
25813 }
25814
590a06af
PT
25815 /* Fuse A+B+1 and A-B-1 */
25816 if (simple_sets_p
25817 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25818 {
25819 /* We're trying to match:
25820 prev == (set (r0) (plus (r0) (r1)))
25821 curr == (set (r0) (plus (r0) (const_int 1)))
25822 or:
25823 prev == (set (r0) (minus (r0) (r1)))
25824 curr == (set (r0) (plus (r0) (const_int -1))) */
25825
25826 rtx prev_src = SET_SRC (prev_set);
25827 rtx curr_src = SET_SRC (curr_set);
25828
25829 int polarity = 1;
25830 if (GET_CODE (prev_src) == MINUS)
25831 polarity = -1;
25832
25833 if (GET_CODE (curr_src) == PLUS
25834 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25835 && CONST_INT_P (XEXP (curr_src, 1))
25836 && INTVAL (XEXP (curr_src, 1)) == polarity
25837 && REG_P (XEXP (curr_src, 0))
25838 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25839 return true;
25840 }
25841
6a569cdd
KT
25842 return false;
25843}
25844
f2879a90
KT
25845/* Return true iff the instruction fusion described by OP is enabled. */
25846
25847bool
25848aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25849{
25850 return (aarch64_tune_params.fusible_ops & op) != 0;
25851}
25852
350013bc
BC
25853/* If MEM is in the form of [base+offset], extract the two parts
25854   of the address into BASE and OFFSET; otherwise return false
25855   after clearing BASE and OFFSET.  */
25856
25857bool
25858extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25859{
25860 rtx addr;
25861
25862 gcc_assert (MEM_P (mem));
25863
25864 addr = XEXP (mem, 0);
25865
25866 if (REG_P (addr))
25867 {
25868 *base = addr;
25869 *offset = const0_rtx;
25870 return true;
25871 }
25872
25873 if (GET_CODE (addr) == PLUS
25874 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25875 {
25876 *base = XEXP (addr, 0);
25877 *offset = XEXP (addr, 1);
25878 return true;
25879 }
25880
25881 *base = NULL_RTX;
25882 *offset = NULL_RTX;
25883
25884 return false;
25885}
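/* For example: (mem (reg x1)) yields BASE = x1 and OFFSET = 0, and
   (mem (plus (reg x1) (const_int 16))) yields BASE = x1 and OFFSET = 16;
   any other address form (auto-increment, reg+reg, ...) clears both
   outputs and returns false.  */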
25886
25887/* Types for scheduling fusion. */
25888enum sched_fusion_type
25889{
25890 SCHED_FUSION_NONE = 0,
25891 SCHED_FUSION_LD_SIGN_EXTEND,
25892 SCHED_FUSION_LD_ZERO_EXTEND,
25893 SCHED_FUSION_LD,
25894 SCHED_FUSION_ST,
25895 SCHED_FUSION_NUM
25896};
25897
25898/* If INSN is a load or store whose address is in the form [base+offset],
25899   extract the two parts into BASE and OFFSET.  Return the scheduling
25900   fusion type of this INSN.  */
25901
25902static enum sched_fusion_type
25903fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25904{
25905 rtx x, dest, src;
25906 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25907
25908 gcc_assert (INSN_P (insn));
25909 x = PATTERN (insn);
25910 if (GET_CODE (x) != SET)
25911 return SCHED_FUSION_NONE;
25912
25913 src = SET_SRC (x);
25914 dest = SET_DEST (x);
25915
abc52318
KT
25916 machine_mode dest_mode = GET_MODE (dest);
25917
25918 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
350013bc
BC
25919 return SCHED_FUSION_NONE;
25920
25921 if (GET_CODE (src) == SIGN_EXTEND)
25922 {
25923 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25924 src = XEXP (src, 0);
3793ecc1 25925 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
25926 return SCHED_FUSION_NONE;
25927 }
25928 else if (GET_CODE (src) == ZERO_EXTEND)
25929 {
25930 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25931 src = XEXP (src, 0);
3793ecc1 25932 if (!MEM_P (src) || GET_MODE (src) != SImode)
350013bc
BC
25933 return SCHED_FUSION_NONE;
25934 }
25935
3793ecc1 25936 if (MEM_P (src) && REG_P (dest))
350013bc 25937 extract_base_offset_in_addr (src, base, offset);
3793ecc1 25938 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
350013bc
BC
25939 {
25940 fusion = SCHED_FUSION_ST;
25941 extract_base_offset_in_addr (dest, base, offset);
25942 }
25943 else
25944 return SCHED_FUSION_NONE;
25945
25946 if (*base == NULL_RTX || *offset == NULL_RTX)
25947 fusion = SCHED_FUSION_NONE;
25948
25949 return fusion;
25950}
25951
25952/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25953
25954   Currently we only support fusing ldr and str instructions, so FUSION_PRI
25955   and PRI are only calculated for these instructions.  For other instructions,
25956   FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
25957   types of instruction fusion can be added by returning different priorities.
25958
25959 It's important that irrelevant instructions get the largest FUSION_PRI. */
25960
25961static void
25962aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25963 int *fusion_pri, int *pri)
25964{
25965 int tmp, off_val;
25966 rtx base, offset;
25967 enum sched_fusion_type fusion;
25968
25969 gcc_assert (INSN_P (insn));
25970
25971 tmp = max_pri - 1;
25972 fusion = fusion_load_store (insn, &base, &offset);
25973 if (fusion == SCHED_FUSION_NONE)
25974 {
25975 *pri = tmp;
25976 *fusion_pri = tmp;
25977 return;
25978 }
25979
25980 /* Set FUSION_PRI according to fusion type and base register. */
25981 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25982
25983 /* Calculate PRI. */
25984 tmp /= 2;
25985
25986 /* INSN with smaller offset goes first. */
25987 off_val = (int)(INTVAL (offset));
25988 if (off_val >= 0)
25989 tmp -= (off_val & 0xfffff);
25990 else
25991 tmp += ((- off_val) & 0xfffff);
25992
25993 *pri = tmp;
25994 return;
25995}
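/* For illustration: two loads from [x1, 8] and [x1, 12] get the same
   FUSION_PRI (same fusion type and same base register), while the one with
   the smaller offset gets the larger PRI (tmp - 8 vs. tmp - 12) and is
   therefore scheduled first, which is the order the ldp/stp peepholes
   expect.  */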
25996
9bca63d4
WD
25997/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25998 Adjust priority of sha1h instructions so they are scheduled before
25999 other SHA1 instructions. */
26000
26001static int
26002aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
26003{
26004 rtx x = PATTERN (insn);
26005
26006 if (GET_CODE (x) == SET)
26007 {
26008 x = SET_SRC (x);
26009
26010 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
26011 return priority + 10;
26012 }
26013
26014 return priority;
26015}
26016
958448a9
RS
26017/* If REVERSED is null, return true if memory reference *MEM2 comes
26018 immediately after memory reference *MEM1. Do not change the references
26019 in this case.
26020
26021 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
526e1639
RS
26022 if they are, try to make them use constant offsets from the same base
26023 register. Return true on success. When returning true, set *REVERSED
26024 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
26025static bool
26026aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
26027{
958448a9
RS
26028 if (reversed)
26029 *reversed = false;
26030
526e1639
RS
26031 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
26032 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
26033 return false;
26034
26035 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
26036 return false;
26037
26038 auto size1 = MEM_SIZE (*mem1);
26039 auto size2 = MEM_SIZE (*mem2);
26040
26041 rtx base1, base2, offset1, offset2;
26042 extract_base_offset_in_addr (*mem1, &base1, &offset1);
26043 extract_base_offset_in_addr (*mem2, &base2, &offset2);
26044
26045 /* Make sure at least one memory is in base+offset form. */
26046 if (!(base1 && offset1) && !(base2 && offset2))
26047 return false;
26048
26049 /* If both mems already use the same base register, just check the
26050 offsets. */
26051 if (base1 && base2 && rtx_equal_p (base1, base2))
26052 {
26053 if (!offset1 || !offset2)
26054 return false;
26055
26056 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
26057 return true;
26058
958448a9 26059 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
526e1639
RS
26060 {
26061 *reversed = true;
26062 return true;
26063 }
26064
26065 return false;
26066 }
26067
26068 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
26069 guarantee that the values are consecutive. */
26070 if (MEM_EXPR (*mem1)
26071 && MEM_EXPR (*mem2)
26072 && MEM_OFFSET_KNOWN_P (*mem1)
26073 && MEM_OFFSET_KNOWN_P (*mem2))
26074 {
26075 poly_int64 expr_offset1;
26076 poly_int64 expr_offset2;
26077 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
26078 &expr_offset1);
26079 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
26080 &expr_offset2);
26081 if (!expr_base1
26082 || !expr_base2
38ec23fa 26083 || !DECL_P (expr_base1)
526e1639
RS
26084 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
26085 return false;
26086
26087 expr_offset1 += MEM_OFFSET (*mem1);
26088 expr_offset2 += MEM_OFFSET (*mem2);
26089
26090 if (known_eq (expr_offset1 + size1, expr_offset2))
26091 ;
958448a9 26092 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
526e1639
RS
26093 *reversed = true;
26094 else
26095 return false;
26096
958448a9 26097 if (reversed)
526e1639 26098 {
958448a9
RS
26099 if (base2)
26100 {
26101 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
26102 expr_offset1 - expr_offset2);
26103 *mem1 = replace_equiv_address_nv (*mem1, addr1);
26104 }
26105 else
26106 {
26107 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
26108 expr_offset2 - expr_offset1);
26109 *mem2 = replace_equiv_address_nv (*mem2, addr2);
26110 }
526e1639
RS
26111 }
26112 return true;
26113 }
26114
26115 return false;
26116}
26117
958448a9
RS
26118/* Return true if MEM1 and MEM2 can be combined into a single access
26119 of mode MODE, with the combined access having the same address as MEM1. */
26120
26121bool
26122aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
26123{
26124 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
26125 return false;
26126 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
26127}
26128
350013bc
BC
26129/* Given OPERANDS of consecutive load/store, check if we can merge
26130 them into ldp/stp. LOAD is true if they are load instructions.
26131 MODE is the mode of memory operands. */
26132
26133bool
26134aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
b8506a8a 26135 machine_mode mode)
350013bc 26136{
350013bc 26137 enum reg_class rclass_1, rclass_2;
526e1639 26138 rtx mem_1, mem_2, reg_1, reg_2;
350013bc
BC
26139
26140 if (load)
26141 {
26142 mem_1 = operands[1];
26143 mem_2 = operands[3];
26144 reg_1 = operands[0];
26145 reg_2 = operands[2];
26146 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26147 if (REGNO (reg_1) == REGNO (reg_2))
26148 return false;
526e1639
RS
26149 if (reg_overlap_mentioned_p (reg_1, mem_2))
26150 return false;
350013bc
BC
26151 }
26152 else
26153 {
26154 mem_1 = operands[0];
26155 mem_2 = operands[2];
26156 reg_1 = operands[1];
26157 reg_2 = operands[3];
26158 }
26159
bf84ac44
AP
26160 /* The mems cannot be volatile. */
26161 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26162 return false;
26163
54700e2e
AP
26164  /* If we have SImode and slow unaligned ldp,
26165     check that the alignment is at least 8 bytes.  */
26166 if (mode == SImode
26167 && (aarch64_tune_params.extra_tuning_flags
26168 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26169 && !optimize_size
26170 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26171 return false;
26172
350013bc 26173 /* Check if the addresses are in the form of [base+offset]. */
526e1639
RS
26174 bool reversed = false;
26175 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
350013bc
BC
26176 return false;
26177
dfe1da23
JW
26178 /* The operands must be of the same size. */
26179 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
526e1639 26180 GET_MODE_SIZE (GET_MODE (mem_2))));
350013bc 26181
9b56ec11
JW
26182 /* One of the memory accesses must be a mempair operand.
26183 If it is not the first one, they need to be swapped by the
26184 peephole. */
26185 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26186 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26187 return false;
26188
350013bc
BC
26189 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26190 rclass_1 = FP_REGS;
26191 else
26192 rclass_1 = GENERAL_REGS;
26193
26194 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26195 rclass_2 = FP_REGS;
26196 else
26197 rclass_2 = GENERAL_REGS;
26198
26199  /* Check if the registers are of the same class.  */
26200 if (rclass_1 != rclass_2)
26201 return false;
26202
26203 return true;
26204}
26205
9b56ec11
JW
26206/* Given OPERANDS of consecutive load/store that can be merged,
26207 swap them if they are not in ascending order. */
26208void
26209aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26210{
526e1639
RS
26211 int mem_op = load ? 1 : 0;
26212 bool reversed = false;
26213 if (!aarch64_check_consecutive_mems (operands + mem_op,
26214 operands + mem_op + 2, &reversed))
26215 gcc_unreachable ();
9b56ec11 26216
526e1639 26217 if (reversed)
9b56ec11
JW
26218 {
26219 /* Irrespective of whether this is a load or a store,
26220 we do the same swap. */
26221 std::swap (operands[0], operands[2]);
26222 std::swap (operands[1], operands[3]);
26223 }
26224}
26225
d0b51297
JW
26226/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26227 comparison between the two. */
26228int
26229aarch64_host_wide_int_compare (const void *x, const void *y)
26230{
26231 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26232 * ((const HOST_WIDE_INT *) y));
26233}
26234
26235/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26236 other pointing to a REG rtx containing an offset, compare the offsets
26237 of the two pairs.
26238
26239 Return:
26240
26241 1 iff offset (X) > offset (Y)
26242 0 iff offset (X) == offset (Y)
26243 -1 iff offset (X) < offset (Y) */
26244int
26245aarch64_ldrstr_offset_compare (const void *x, const void *y)
26246{
26247 const rtx * operands_1 = (const rtx *) x;
26248 const rtx * operands_2 = (const rtx *) y;
26249 rtx mem_1, mem_2, base, offset_1, offset_2;
26250
26251 if (MEM_P (operands_1[0]))
26252 mem_1 = operands_1[0];
26253 else
26254 mem_1 = operands_1[1];
26255
26256 if (MEM_P (operands_2[0]))
26257 mem_2 = operands_2[0];
26258 else
26259 mem_2 = operands_2[1];
26260
26261 /* Extract the offsets. */
26262 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26263 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26264
26265 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26266
26267 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26268}
26269
350013bc
BC
26270/* Given OPERANDS of consecutive load/store, check if we can merge
26271 them into ldp/stp by adjusting the offset. LOAD is true if they
26272 are load instructions. MODE is the mode of memory operands.
26273
26274   Given the following consecutive stores:
26275
26276 str w1, [xb, 0x100]
26277 str w1, [xb, 0x104]
26278 str w1, [xb, 0x108]
26279 str w1, [xb, 0x10c]
26280
26281 Though the offsets are out of the range supported by stp, we can
26282 still pair them after adjusting the offset, like:
26283
26284 add scratch, xb, 0x100
26285 stp w1, w1, [scratch]
26286 stp w1, w1, [scratch, 0x8]
26287
26288 The peephole patterns detecting this opportunity should guarantee
26289   the scratch register is available.  */
26290
26291bool
26292aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
cd91a084 26293 machine_mode mode)
350013bc 26294{
34d7854d
JW
26295 const int num_insns = 4;
26296 enum reg_class rclass;
26297 HOST_WIDE_INT offvals[num_insns], msize;
26298 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
350013bc
BC
26299
26300 if (load)
26301 {
34d7854d
JW
26302 for (int i = 0; i < num_insns; i++)
26303 {
26304 reg[i] = operands[2 * i];
26305 mem[i] = operands[2 * i + 1];
26306
26307 gcc_assert (REG_P (reg[i]));
26308 }
d0b51297
JW
26309
26310 /* Do not attempt to merge the loads if the loads clobber each other. */
26311 for (int i = 0; i < 8; i += 2)
26312 for (int j = i + 2; j < 8; j += 2)
26313 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26314 return false;
350013bc
BC
26315 }
26316 else
34d7854d
JW
26317 for (int i = 0; i < num_insns; i++)
26318 {
26319 mem[i] = operands[2 * i];
26320 reg[i] = operands[2 * i + 1];
26321 }
350013bc 26322
34d7854d
JW
26323 /* Skip if memory operand is by itself valid for ldp/stp. */
26324 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
bf84ac44
AP
26325 return false;
26326
34d7854d
JW
26327 for (int i = 0; i < num_insns; i++)
26328 {
26329 /* The mems cannot be volatile. */
26330 if (MEM_VOLATILE_P (mem[i]))
26331 return false;
26332
26333 /* Check if the addresses are in the form of [base+offset]. */
26334 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26335 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26336 return false;
26337 }
26338
363b395b
JW
26339  /* Check if the registers are of the same class.  */
26340 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26341 ? FP_REGS : GENERAL_REGS;
26342
26343 for (int i = 1; i < num_insns; i++)
26344 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26345 {
26346 if (rclass != FP_REGS)
26347 return false;
26348 }
26349 else
26350 {
26351 if (rclass != GENERAL_REGS)
26352 return false;
26353 }
26354
26355 /* Only the last register in the order in which they occur
26356 may be clobbered by the load. */
26357 if (rclass == GENERAL_REGS && load)
26358 for (int i = 0; i < num_insns - 1; i++)
34d7854d
JW
26359 if (reg_mentioned_p (reg[i], mem[i]))
26360 return false;
350013bc
BC
26361
26362  /* Check if the bases are the same.  */
34d7854d
JW
26363 for (int i = 0; i < num_insns - 1; i++)
26364 if (!rtx_equal_p (base[i], base[i + 1]))
26365 return false;
26366
26367 for (int i = 0; i < num_insns; i++)
26368 offvals[i] = INTVAL (offset[i]);
350013bc 26369
cd91a084 26370 msize = GET_MODE_SIZE (mode).to_constant ();
d0b51297
JW
26371
26372 /* Check if the offsets can be put in the right order to do a ldp/stp. */
34d7854d
JW
26373 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26374 aarch64_host_wide_int_compare);
d0b51297
JW
26375
26376 if (!(offvals[1] == offvals[0] + msize
26377 && offvals[3] == offvals[2] + msize))
350013bc
BC
26378 return false;
26379
d0b51297
JW
26380 /* Check that offsets are within range of each other. The ldp/stp
26381     instructions have 7-bit immediate offsets, so use 0x80.  */
26382 if (offvals[2] - offvals[0] >= msize * 0x80)
26383 return false;
350013bc 26384
d0b51297
JW
26385 /* The offsets must be aligned with respect to each other. */
26386 if (offvals[0] % msize != offvals[2] % msize)
26387 return false;
26388
54700e2e
AP
26389  /* If we have SImode and slow unaligned ldp,
26390     check that the alignment is at least 8 bytes.  */
26391 if (mode == SImode
26392 && (aarch64_tune_params.extra_tuning_flags
34d7854d 26393 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
54700e2e 26394 && !optimize_size
34d7854d 26395 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
54700e2e
AP
26396 return false;
26397
350013bc
BC
26398 return true;
26399}
26400
26401/* Given OPERANDS of consecutive load/store, this function pairs them
d0b51297
JW
26402 into LDP/STP after adjusting the offset. It depends on the fact
26403 that the operands can be sorted so the offsets are correct for STP.
350013bc
BC
26404 MODE is the mode of memory operands. CODE is the rtl operator
26405 which should be applied to all memory operands, it's SIGN_EXTEND,
26406 ZERO_EXTEND or UNKNOWN. */
26407
26408bool
26409aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
cd91a084 26410 machine_mode mode, RTX_CODE code)
350013bc 26411{
d0b51297 26412 rtx base, offset_1, offset_3, t1, t2;
350013bc 26413 rtx mem_1, mem_2, mem_3, mem_4;
d0b51297
JW
26414 rtx temp_operands[8];
26415 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26416 stp_off_upper_limit, stp_off_lower_limit, msize;
9b56ec11 26417
d0b51297
JW
26418 /* We make changes on a copy as we may still bail out. */
26419 for (int i = 0; i < 8; i ++)
26420 temp_operands[i] = operands[i];
9b56ec11 26421
b662250c
BC
26422 /* Sort the operands. Note for cases as below:
26423 [base + 0x310] = A
26424 [base + 0x320] = B
26425 [base + 0x330] = C
26426 [base + 0x320] = D
26427     We need stable sorting, otherwise wrong data may be stored to offset 0x320.
26428     Also note that the dead store in the above case should be optimized away,
26429     but there are no guarantees here.  */
26430  gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26431 aarch64_ldrstr_offset_compare);
9b56ec11 26432
f6af9c21
RE
26433 /* Copy the memory operands so that if we have to bail for some
26434 reason the original addresses are unchanged. */
350013bc
BC
26435 if (load)
26436 {
f6af9c21
RE
26437 mem_1 = copy_rtx (temp_operands[1]);
26438 mem_2 = copy_rtx (temp_operands[3]);
26439 mem_3 = copy_rtx (temp_operands[5]);
26440 mem_4 = copy_rtx (temp_operands[7]);
350013bc
BC
26441 }
26442 else
26443 {
f6af9c21
RE
26444 mem_1 = copy_rtx (temp_operands[0]);
26445 mem_2 = copy_rtx (temp_operands[2]);
26446 mem_3 = copy_rtx (temp_operands[4]);
26447 mem_4 = copy_rtx (temp_operands[6]);
350013bc
BC
26448 gcc_assert (code == UNKNOWN);
26449 }
26450
9b56ec11 26451 extract_base_offset_in_addr (mem_1, &base, &offset_1);
d0b51297
JW
26452 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26453 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26454 && offset_3 != NULL_RTX);
350013bc 26455
d0b51297 26456 /* Adjust offset so it can fit in LDP/STP instruction. */
cd91a084 26457 msize = GET_MODE_SIZE (mode).to_constant();
d0b51297
JW
26458 stp_off_upper_limit = msize * (0x40 - 1);
26459 stp_off_lower_limit = - msize * 0x40;
350013bc 26460
d0b51297
JW
26461 off_val_1 = INTVAL (offset_1);
26462 off_val_3 = INTVAL (offset_3);
26463
26464 /* The base offset is optimally half way between the two STP/LDP offsets. */
26465 if (msize <= 4)
26466 base_off = (off_val_1 + off_val_3) / 2;
26467 else
26468    /* However, due to issues with negative LDP/STP offset generation for
0dc8e1e7 26469       larger modes (DF, DD, DI and vector modes), we must not use negative
d0b51297
JW
26470       addresses smaller than what 9 signed unadjusted bits can store.  This
26471 provides the most range in this case. */
26472 base_off = off_val_1;
26473
26474 /* Adjust the base so that it is aligned with the addresses but still
26475 optimal. */
26476 if (base_off % msize != off_val_1 % msize)
26477 /* Fix the offset, bearing in mind we want to make it bigger not
26478 smaller. */
26479 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26480 else if (msize <= 4)
26481 /* The negative range of LDP/STP is one larger than the positive range. */
26482 base_off += msize;
26483
26484 /* Check if base offset is too big or too small. We can attempt to resolve
26485 this issue by setting it to the maximum value and seeing if the offsets
26486 still fit. */
26487 if (base_off >= 0x1000)
350013bc 26488 {
d0b51297
JW
26489 base_off = 0x1000 - 1;
26490 /* We must still make sure that the base offset is aligned with respect
700d4cb0 26491 to the address. But it may not be made any bigger. */
d0b51297 26492 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
26493 }
26494
d0b51297
JW
26495 /* Likewise for the case where the base is too small. */
26496 if (base_off <= -0x1000)
350013bc 26497 {
d0b51297
JW
26498 base_off = -0x1000 + 1;
26499 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
26500 }
26501
d0b51297
JW
26502 /* Offset of the first STP/LDP. */
26503 new_off_1 = off_val_1 - base_off;
26504
26505 /* Offset of the second STP/LDP. */
26506 new_off_3 = off_val_3 - base_off;
350013bc 26507
d0b51297
JW
26508 /* The offsets must be within the range of the LDP/STP instructions. */
26509 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26510 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
350013bc
BC
26511 return false;
26512
d0b51297
JW
26513 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26514 new_off_1), true);
26515 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26516 new_off_1 + msize), true);
26517 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26518 new_off_3), true);
26519 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26520 new_off_3 + msize), true);
26521
26522 if (!aarch64_mem_pair_operand (mem_1, mode)
26523 || !aarch64_mem_pair_operand (mem_3, mode))
26524 return false;
350013bc
BC
26525
26526 if (code == ZERO_EXTEND)
26527 {
26528 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26529 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26530 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26531 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26532 }
26533 else if (code == SIGN_EXTEND)
26534 {
26535 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26536 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26537 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26538 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26539 }
26540
26541 if (load)
26542 {
d0b51297 26543 operands[0] = temp_operands[0];
350013bc 26544 operands[1] = mem_1;
d0b51297 26545 operands[2] = temp_operands[2];
350013bc 26546 operands[3] = mem_2;
d0b51297 26547 operands[4] = temp_operands[4];
350013bc 26548 operands[5] = mem_3;
d0b51297 26549 operands[6] = temp_operands[6];
350013bc
BC
26550 operands[7] = mem_4;
26551 }
26552 else
26553 {
26554 operands[0] = mem_1;
d0b51297 26555 operands[1] = temp_operands[1];
350013bc 26556 operands[2] = mem_2;
d0b51297 26557 operands[3] = temp_operands[3];
350013bc 26558 operands[4] = mem_3;
d0b51297 26559 operands[5] = temp_operands[5];
350013bc 26560 operands[6] = mem_4;
d0b51297 26561 operands[7] = temp_operands[7];
350013bc
BC
26562 }
26563
26564 /* Emit adjusting instruction. */
d0b51297 26565 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
350013bc 26566 /* Emit ldp/stp instructions. */
f7df4a84
RS
26567 t1 = gen_rtx_SET (operands[0], operands[1]);
26568 t2 = gen_rtx_SET (operands[2], operands[3]);
350013bc 26569 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
f7df4a84
RS
26570 t1 = gen_rtx_SET (operands[4], operands[5]);
26571 t2 = gen_rtx_SET (operands[6], operands[7]);
350013bc
BC
26572 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26573 return true;
26574}
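/* Worked example (using the stores from the comment above
   aarch64_operands_adjust_ok_for_ldpstp, msize == 4): off_val_1 == 0x100
   and off_val_3 == 0x108, so base_off starts at 0x104 and, being already
   aligned with the addresses, is bumped by msize to 0x108.  The new
   offsets become -8 and 0, both well inside the ldp/stp range, and the
   emitted sequence is roughly

     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]  */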
26575
76a34e3f
RS
26576/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26577 it isn't worth branching around empty masked ops (including masked
26578 stores). */
26579
26580static bool
26581aarch64_empty_mask_is_expensive (unsigned)
26582{
26583 return false;
26584}
26585
1b1e81f8
JW
26586/* Return true if a pseudo register should be created and used to hold
26587   the GOT address for PIC code.  */
26588
26589bool
26590aarch64_use_pseudo_pic_reg (void)
26591{
26592 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26593}
26594
7b841a12
JW
26595/* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26596
26597static int
26598aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26599{
26600 switch (XINT (x, 1))
26601 {
26602 case UNSPEC_GOTSMALLPIC:
26603 case UNSPEC_GOTSMALLPIC28K:
26604 case UNSPEC_GOTTINYPIC:
26605 return 0;
26606 default:
26607 break;
26608 }
26609
26610 return default_unspec_may_trap_p (x, flags);
26611}
26612
39252973
KT
26613
26614/* If X is a positive CONST_DOUBLE with a value that is a power of 2
26615 return the log2 of that value. Otherwise return -1. */
26616
26617int
26618aarch64_fpconst_pow_of_2 (rtx x)
26619{
26620 const REAL_VALUE_TYPE *r;
26621
26622 if (!CONST_DOUBLE_P (x))
26623 return -1;
26624
26625 r = CONST_DOUBLE_REAL_VALUE (x);
26626
26627 if (REAL_VALUE_NEGATIVE (*r)
26628 || REAL_VALUE_ISNAN (*r)
26629 || REAL_VALUE_ISINF (*r)
26630 || !real_isinteger (r, DFmode))
26631 return -1;
26632
26633 return exact_log2 (real_to_integer (r));
26634}
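/* For example: 8.0 -> 3 and 1.0 -> 0, while 3.0 (not a power of 2),
   0.5 (not an integer) and -4.0 (negative) all return -1.  */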
26635
188d0079
JH
26636/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26637   power of 2 (i.e. 1/2^n for some integer n in [1, 32]), return n.
26638   Otherwise return -1.  */
26639
26640int
26641aarch64_fpconst_pow2_recip (rtx x)
26642{
26643 REAL_VALUE_TYPE r0;
26644
26645 if (!CONST_DOUBLE_P (x))
26646 return -1;
26647
26648 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26649 if (exact_real_inverse (DFmode, &r0)
26650 && !REAL_VALUE_NEGATIVE (r0))
26651 {
26652 int ret = exact_log2 (real_to_integer (&r0));
26653 if (ret >= 1 && ret <= 32)
26654 return ret;
26655 }
26656 return -1;
26657}
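/* For example: 0.25 == 1/2^2 -> 2 and 0.125 -> 3, whereas 1.0 -> -1
   (the exponent 0 is outside the accepted [1, 32] range) and 0.3 -> -1
   (its reciprocal is not an exact power of 2).  */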
26658
39252973
KT
26659/* If X is a vector of equal CONST_DOUBLE values and that value is
26660 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26661
26662int
26663aarch64_vec_fpconst_pow_of_2 (rtx x)
26664{
6a70badb 26665 int nelts;
568b9c0e 26666 if (!CONST_VECTOR_P (x)
6a70badb 26667 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
39252973
KT
26668 return -1;
26669
26670 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26671 return -1;
26672
26673 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26674 if (firstval <= 0)
26675 return -1;
26676
6a70badb 26677 for (int i = 1; i < nelts; i++)
39252973
KT
26678 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26679 return -1;
26680
26681 return firstval;
26682}
26683
11e554b3
JG
26684/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26685 to float.
26686
26687 __fp16 always promotes through this hook.
26688 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26689 through the generic excess precision logic rather than here. */
26690
c2ec330c
AL
26691static tree
26692aarch64_promoted_type (const_tree t)
26693{
11e554b3
JG
26694 if (SCALAR_FLOAT_TYPE_P (t)
26695 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
c2ec330c 26696 return float_type_node;
11e554b3 26697
c2ec330c
AL
26698 return NULL_TREE;
26699}
ee62a5a6
RS
26700
26701/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26702
26703static bool
9acc9cbe 26704aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
ee62a5a6
RS
26705 optimization_type opt_type)
26706{
26707 switch (op)
26708 {
26709 case rsqrt_optab:
9acc9cbe 26710 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
ee62a5a6
RS
26711
26712 default:
26713 return true;
26714 }
26715}
26716
43cacb12
RS
26717/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26718
26719static unsigned int
26720aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26721 int *offset)
26722{
26723 /* Polynomial invariant 1 == (VG / 2) - 1. */
26724 gcc_assert (i == 1);
26725 *factor = 2;
26726 *offset = 1;
26727 return AARCH64_DWARF_VG;
26728}
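/* Illustrative example: with 256-bit SVE vectors the DWARF VG register
   holds 4 (the number of 64-bit granules per vector), so the runtime value
   of the indeterminate is VG / 2 - 1 == 1, and a poly_int such as 16 + 16x
   (the byte size of a full SVE data vector) evaluates to 32.  */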
26729
11e554b3
JG
26730/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26731 if MODE is HFmode, and punt to the generic implementation otherwise. */
26732
26733static bool
7c5bd57a 26734aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
11e554b3
JG
26735{
26736 return (mode == HFmode
26737 ? true
26738 : default_libgcc_floating_mode_supported_p (mode));
26739}
26740
2e5f8203
JG
26741/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26742 if MODE is HFmode, and punt to the generic implementation otherwise. */
26743
26744static bool
18e2a8b8 26745aarch64_scalar_mode_supported_p (scalar_mode mode)
2e5f8203 26746{
0dc8e1e7
CL
26747 if (DECIMAL_FLOAT_MODE_P (mode))
26748 return default_decimal_float_supported_p ();
26749
2e5f8203
JG
26750 return (mode == HFmode
26751 ? true
26752 : default_scalar_mode_supported_p (mode));
26753}
26754
11e554b3
JG
26755/* Set the value of FLT_EVAL_METHOD.
26756 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26757
26758 0: evaluate all operations and constants, whose semantic type has at
26759 most the range and precision of type float, to the range and
26760 precision of float; evaluate all other operations and constants to
26761 the range and precision of the semantic type;
26762
26763 N, where _FloatN is a supported interchange floating type
26764 evaluate all operations and constants, whose semantic type has at
26765 most the range and precision of _FloatN type, to the range and
26766 precision of the _FloatN type; evaluate all other operations and
26767 constants to the range and precision of the semantic type;
26768
26769 If we have the ARMv8.2-A extensions then we support _Float16 in native
26770 precision, so we should set this to 16. Otherwise, we support the type,
26771 but want to evaluate expressions in float precision, so set this to
26772 0. */
26773
26774static enum flt_eval_method
26775aarch64_excess_precision (enum excess_precision_type type)
26776{
26777 switch (type)
26778 {
26779 case EXCESS_PRECISION_TYPE_FAST:
26780 case EXCESS_PRECISION_TYPE_STANDARD:
26781 /* We can calculate either in 16-bit range and precision or
26782 32-bit range and precision. Make that decision based on whether
26783 we have native support for the ARMv8.2-A 16-bit floating-point
26784 instructions or not. */
26785 return (TARGET_FP_F16INST
26786 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26787 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26788 case EXCESS_PRECISION_TYPE_IMPLICIT:
f19a3270 26789 case EXCESS_PRECISION_TYPE_FLOAT16:
11e554b3
JG
26790 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26791 default:
26792 gcc_unreachable ();
26793 }
26794 return FLT_EVAL_METHOD_UNPREDICTABLE;
26795}
26796
b48d6421
KT
26797/* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26798 scheduled for speculative execution. Reject the long-running division
26799 and square-root instructions. */
26800
26801static bool
26802aarch64_sched_can_speculate_insn (rtx_insn *insn)
26803{
26804 switch (get_attr_type (insn))
26805 {
26806 case TYPE_SDIV:
26807 case TYPE_UDIV:
26808 case TYPE_FDIVS:
26809 case TYPE_FDIVD:
26810 case TYPE_FSQRTS:
26811 case TYPE_FSQRTD:
26812 case TYPE_NEON_FP_SQRT_S:
26813 case TYPE_NEON_FP_SQRT_D:
26814 case TYPE_NEON_FP_SQRT_S_Q:
26815 case TYPE_NEON_FP_SQRT_D_Q:
26816 case TYPE_NEON_FP_DIV_S:
26817 case TYPE_NEON_FP_DIV_D:
26818 case TYPE_NEON_FP_DIV_S_Q:
26819 case TYPE_NEON_FP_DIV_D_Q:
26820 return false;
26821 default:
26822 return true;
26823 }
26824}
26825
43cacb12
RS
26826/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26827
26828static int
26829aarch64_compute_pressure_classes (reg_class *classes)
26830{
26831 int i = 0;
26832 classes[i++] = GENERAL_REGS;
26833 classes[i++] = FP_REGS;
26834 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26835 registers need to go in PR_LO_REGS at some point during their
26836 lifetime. Splitting it into two halves has the effect of making
26837 all predicates count against PR_LO_REGS, so that we try whenever
26838 possible to restrict the number of live predicates to 8. This
26839 greatly reduces the amount of spilling in certain loops. */
26840 classes[i++] = PR_LO_REGS;
26841 classes[i++] = PR_HI_REGS;
26842 return i;
26843}
26844
26845/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26846
26847static bool
26848aarch64_can_change_mode_class (machine_mode from,
26849 machine_mode to, reg_class_t)
26850{
76607e7e
RS
26851 unsigned int from_flags = aarch64_classify_vector_mode (from);
26852 unsigned int to_flags = aarch64_classify_vector_mode (to);
26853
26854 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26855 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26856
26857 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26858 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26859
38e62001
RS
26860 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26861 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26862
66f206b8
JW
26863 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26864 | VEC_PARTIAL));
594264e9
TC
26865 bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
26866 | VEC_PARTIAL));
66f206b8 26867
38e62001
RS
26868 /* Don't allow changes between predicate modes and other modes.
26869 Only predicate registers can hold predicate modes and only
26870 non-predicate registers can hold non-predicate modes, so any
26871 attempt to mix them would require a round trip through memory. */
26872 if (from_pred_p != to_pred_p)
26873 return false;
26874
76607e7e
RS
26875 /* Don't allow changes between partial SVE modes and other modes.
26876 The contents of partial SVE modes are distributed evenly across
26877 the register, whereas GCC expects them to be clustered together. */
26878 if (from_partial_sve_p != to_partial_sve_p)
26879 return false;
26880
26881 /* Similarly reject changes between partial SVE modes that have
26882 different patterns of significant and insignificant bits. */
26883 if (from_partial_sve_p
26884 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26885 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26886 return false;
26887
594264e9
TC
26888  /* Don't allow changes where exactly one of the modes is a partial
26889     Advanced SIMD structure mode, unless both modes are at most 64 bits.  */
26890 if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
26891      && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (from), 8)))
66f206b8
JW
26892 return false;
26893
38e62001
RS
26894 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26895 {
26896 /* Don't allow changes between SVE modes and other modes that might
26897 be bigger than 128 bits. In particular, OImode, CImode and XImode
26898 divide into 128-bit quantities while SVE modes divide into
26899 BITS_PER_SVE_VECTOR quantities. */
26900 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26901 return false;
26902 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26903 return false;
26904 }
26905
002092be
RS
26906 if (BYTES_BIG_ENDIAN)
26907 {
002092be
RS
26908 /* Don't allow changes between SVE data modes and non-SVE modes.
26909 See the comment at the head of aarch64-sve.md for details. */
26910 if (from_sve_p != to_sve_p)
26911 return false;
26912
26913 /* Don't allow changes in element size: lane 0 of the new vector
26914 would not then be lane 0 of the old vector. See the comment
26915 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26916 description.
26917
26918 In the worst case, this forces a register to be spilled in
26919 one mode and reloaded in the other, which handles the
26920 endianness correctly. */
26921 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26922 return false;
26923 }
43cacb12
RS
26924 return true;
26925}
26926
5cce8171
RS
26927/* Implement TARGET_EARLY_REMAT_MODES. */
26928
26929static void
26930aarch64_select_early_remat_modes (sbitmap modes)
26931{
26932 /* SVE values are not normally live across a call, so it should be
26933 worth doing early rematerialization even in VL-specific mode. */
26934 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
5c38705d
RS
26935 if (aarch64_sve_mode_p ((machine_mode) i))
26936 bitmap_set_bit (modes, i);
5cce8171
RS
26937}
26938
c0111dc4
RE
26939/* Override the default target speculation_safe_value. */
26940static rtx
26941aarch64_speculation_safe_value (machine_mode mode,
26942 rtx result, rtx val, rtx failval)
26943{
26944 /* Maybe we should warn if falling back to hard barriers. They are
26945     likely to be noticeably more expensive than the alternative below.  */
26946 if (!aarch64_track_speculation)
26947 return default_speculation_safe_value (mode, result, val, failval);
26948
26949 if (!REG_P (val))
26950 val = copy_to_mode_reg (mode, val);
26951
26952 if (!aarch64_reg_or_zero (failval, mode))
26953 failval = copy_to_mode_reg (mode, failval);
26954
21cebf90 26955 emit_insn (gen_despeculate_copy (mode, result, val, failval));
c0111dc4
RE
26956 return result;
26957}
26958
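/* Editorial usage sketch (user-level code, not part of this file): the
   hook above implements __builtin_speculation_safe_value.  A bounds-checked
   load such as the one below is the intended use; with -mtrack-speculation
   the tracked-speculation sequence is emitted, otherwise GCC falls back to
   default_speculation_safe_value and its barriers.  */
#if 0
int
load_element (const int *array, unsigned long idx, unsigned long bound)
{
  if (idx < bound)
    /* On a mis-speculated path, IDX is replaced by the failval (0)
       before it is used for the access.  */
    return array[__builtin_speculation_safe_value (idx, 0ul)];
  return 0;
}
#endif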
2d56d6ba
KT
26959/* Implement TARGET_ESTIMATED_POLY_VALUE.
26960 Look into the tuning structure for an estimate.
64432b68
KT
26961 KIND specifies the type of requested estimate: min, max or likely.
26962 For cores with a known SVE width all three estimates are the same.
26963 For generic SVE tuning we want to distinguish the maximum estimate from
26964 the minimum and likely ones.
26965   The likely estimate equals the minimum in that case, so that the
26966   vectorizer is conservative and only picks SVE when it would be a win
26967   even for 128-bit SVE.
26968 When SVE width information is available VAL.coeffs[1] is multiplied by
26969 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
2d56d6ba
KT
26970
26971static HOST_WIDE_INT
64432b68
KT
26972aarch64_estimated_poly_value (poly_int64 val,
26973 poly_value_estimate_kind kind
26974 = POLY_VALUE_LIKELY)
2d56d6ba 26975{
fa3ca615 26976 unsigned int width_source = aarch64_tune_params.sve_width;
2d56d6ba 26977
64432b68
KT
26978 /* If there is no core-specific information then the minimum and likely
26979 values are based on 128-bit vectors and the maximum is based on
26980 the architectural maximum of 2048 bits. */
2d56d6ba 26981 if (width_source == SVE_SCALABLE)
64432b68
KT
26982 switch (kind)
26983 {
26984 case POLY_VALUE_MIN:
26985 case POLY_VALUE_LIKELY:
26986 return val.coeffs[0];
26987 case POLY_VALUE_MAX:
26988 return val.coeffs[0] + val.coeffs[1] * 15;
26989 }
2d56d6ba 26990
fa3ca615
RS
26991 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26992 as likely. This could be made more general if future -mtune options
26993 need it to be. */
26994 if (kind == POLY_VALUE_MAX)
26995 width_source = 1 << floor_log2 (width_source);
26996 else
26997 width_source = least_bit_hwi (width_source);
26998
64432b68 26999 /* If the core provides width information, use that. */
2d56d6ba
KT
27000 HOST_WIDE_INT over_128 = width_source - 128;
27001 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
27002}
27003
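/* Editorial worked example of the arithmetic above (an unused sketch added
   for exposition).  It uses the poly_int64 {4, 4}, i.e. 4 + 4 * x where x
   counts the 128-bit quanta beyond the first -- for instance the number of
   32-bit elements in an SVE vector.  */
static void ATTRIBUTE_UNUSED
aarch64_estimated_poly_value_example ()
{
  const HOST_WIDE_INT c0 = 4, c1 = 4;

  /* SVE_SCALABLE tuning: min and likely assume 128-bit vectors, max
     assumes the architectural limit of 2048 bits (15 extra quanta).  */
  gcc_checking_assert (c0 == 4);			/* min / likely  */
  gcc_checking_assert (c0 + c1 * 15 == 64);		/* max  */

  /* A core with sve_width == 256 has one extra 128-bit quantum, so all
     three estimates become 4 + 4 * (256 - 128) / 128 == 8.  */
  gcc_checking_assert (c0 + c1 * (256 - 128) / 128 == 8);
}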
d9186814
SE
27004
27005/* Return true for types that could be supported as SIMD return or
27006 argument types. */
27007
27008static bool
27009supported_simd_type (tree t)
27010{
27011 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
27012 {
27013 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
27014 return s == 1 || s == 2 || s == 4 || s == 8;
27015 }
27016 return false;
27017}
27018
27019/* Return true for types that currently are supported as SIMD return
27020 or argument types. */
27021
27022static bool
27023currently_supported_simd_type (tree t, tree b)
27024{
27025 if (COMPLEX_FLOAT_TYPE_P (t))
27026 return false;
27027
27028 if (TYPE_SIZE (t) != TYPE_SIZE (b))
27029 return false;
27030
27031 return supported_simd_type (t);
27032}
27033
27034/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
27035
27036static int
27037aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
27038 struct cgraph_simd_clone *clonei,
309e2d95
SL
27039 tree base_type, int num,
27040 bool explicit_p)
d9186814 27041{
39916cea 27042 tree t, ret_type;
abe93733
YY
27043 unsigned int elt_bits, count;
27044 unsigned HOST_WIDE_INT const_simdlen;
27045 poly_uint64 vec_bits;
d9186814
SE
27046
27047 if (!TARGET_SIMD)
27048 return 0;
27049
abe93733
YY
27050   /* For now, SVE simd clones won't produce an illegal simdlen, so only
27051      check constant simdlens here. */
27052 if (maybe_ne (clonei->simdlen, 0U)
27053 && clonei->simdlen.is_constant (&const_simdlen)
27054 && (const_simdlen < 2
27055 || const_simdlen > 1024
27056 || (const_simdlen & (const_simdlen - 1)) != 0))
d9186814 27057 {
309e2d95
SL
27058 if (explicit_p)
27059 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27060 "unsupported simdlen %wd", const_simdlen);
d9186814
SE
27061 return 0;
27062 }
27063
27064 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
27065 if (TREE_CODE (ret_type) != VOID_TYPE
27066 && !currently_supported_simd_type (ret_type, base_type))
27067 {
309e2d95
SL
27068 if (!explicit_p)
27069 ;
27070 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
d9186814
SE
27071 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27072 "GCC does not currently support mixed size types "
27073 "for %<simd%> functions");
27074 else if (supported_simd_type (ret_type))
27075 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27076 "GCC does not currently support return type %qT "
27077 "for %<simd%> functions", ret_type);
27078 else
27079 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27080 "unsupported return type %qT for %<simd%> functions",
27081 ret_type);
27082 return 0;
27083 }
27084
fcefc59b
JJ
27085 int i;
27086 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
27087 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
27088
27089 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
27090 t && t != void_list_node; t = TREE_CHAIN (t), i++)
d9186814 27091 {
fcefc59b 27092 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
d9186814 27093
fcefc59b
JJ
27094 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
27095 && !currently_supported_simd_type (arg_type, base_type))
d9186814 27096 {
309e2d95
SL
27097 if (!explicit_p)
27098 ;
27099 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
d9186814
SE
27100 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27101 "GCC does not currently support mixed size types "
27102 "for %<simd%> functions");
27103 else
27104 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27105 "GCC does not currently support argument type %qT "
27106 "for %<simd%> functions", arg_type);
27107 return 0;
27108 }
27109 }
27110
27111 clonei->vecsize_mangle = 'n';
27112 clonei->mask_mode = VOIDmode;
27113 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
abe93733 27114 if (known_eq (clonei->simdlen, 0U))
d9186814
SE
27115 {
27116 count = 2;
27117 vec_bits = (num == 0 ? 64 : 128);
abe93733 27118 clonei->simdlen = exact_div (vec_bits, elt_bits);
d9186814
SE
27119 }
27120 else
27121 {
27122 count = 1;
27123 vec_bits = clonei->simdlen * elt_bits;
abe93733
YY
27124       /* For now, SVE simd clones won't produce an illegal simdlen, so only
27125          check constant simdlens here. */
27126 if (clonei->simdlen.is_constant (&const_simdlen)
27127 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
d9186814 27128 {
309e2d95
SL
27129 if (explicit_p)
27130 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27131 "GCC does not currently support simdlen %wd for "
27132 "type %qT",
27133 const_simdlen, base_type);
d9186814
SE
27134 return 0;
27135 }
27136 }
27137 clonei->vecsize_int = vec_bits;
27138 clonei->vecsize_float = vec_bits;
27139 return count;
27140}
27141
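/* Editorial usage sketch (user-level code, not part of this file): for a
   "simd"-attributed function with a 32-bit base type and no explicit
   simdlen, the hook above requests two Advanced SIMD clones ('n' mangling):
   one for 64-bit vectors (simdlen 2) and one for 128-bit vectors
   (simdlen 4).  For the example below that would give clone symbols of the
   form _ZGVnN2v_scale and _ZGVnN4v_scale (shown for illustration; the
   exact names follow the AArch64 vector function ABI).  */
#if 0
__attribute__ ((simd ("notinbranch"))) float
scale (float x)
{
  return x * 2.0f;
}
#endif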
27142/* Implement TARGET_SIMD_CLONE_ADJUST. */
27143
27144static void
27145aarch64_simd_clone_adjust (struct cgraph_node *node)
27146{
27147 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27148 use the correct ABI. */
27149
27150 tree t = TREE_TYPE (node->decl);
27151 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27152 TYPE_ATTRIBUTES (t));
27153}
27154
27155/* Implement TARGET_SIMD_CLONE_USABLE. */
27156
27157static int
27158aarch64_simd_clone_usable (struct cgraph_node *node)
27159{
27160 switch (node->simdclone->vecsize_mangle)
27161 {
27162 case 'n':
27163 if (!TARGET_SIMD)
27164 return -1;
27165 return 0;
27166 default:
27167 gcc_unreachable ();
27168 }
27169}
27170
497f281c
SE
27171/* Implement TARGET_COMP_TYPE_ATTRIBUTES.  */
27172
27173static int
27174aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27175{
31427b97
RS
27176 auto check_attr = [&](const char *name) {
27177 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27178 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27179 if (!attr1 && !attr2)
27180 return true;
27181
27182 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27183 };
27184
27185 if (!check_attr ("aarch64_vector_pcs"))
27186 return 0;
27187 if (!check_attr ("Advanced SIMD type"))
497f281c 27188 return 0;
4cea5b8c
RS
27189 if (!check_attr ("SVE type"))
27190 return 0;
27191 if (!check_attr ("SVE sizeless type"))
27192 return 0;
497f281c
SE
27193 return 1;
27194}
27195
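/* Editorial sketch (user-level code, not part of this file) of what the
   hook above affects: function types that differ only in the
   "aarch64_vector_pcs" attribute compare as distinct, so mixing their
   pointers is diagnosed.  */
#if 0
typedef void plain_fn (void);
typedef void vpcs_fn (void) __attribute__ ((aarch64_vector_pcs));

void
example (plain_fn *p, vpcs_fn *q)
{
  /* p = q;   diagnosed: the attribute makes the types incompatible.  */
  (void) p;
  (void) q;
}
#endif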
3bac1e20
SE
27196/* Implement TARGET_GET_MULTILIB_ABI_NAME.  */
27197
27198static const char *
27199aarch64_get_multilib_abi_name (void)
27200{
27201 if (TARGET_BIG_END)
27202 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27203 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27204}
27205
e76c8e56
JJ
27206/* Implement TARGET_STACK_PROTECT_GUARD.  Use the default for a
27207   guard based on a global variable; for any other guard kind,
27208   return a null tree.  */
27209static tree
27210aarch64_stack_protect_guard (void)
27211{
27212 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27213 return default_stack_protect_guard ();
27214
27215 return NULL_TREE;
27216}
27217
98698967
SMW
27218/* Return the diagnostic message string if conversion from FROMTYPE to
27219 TOTYPE is not allowed, NULL otherwise. */
27220
27221static const char *
27222aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27223{
27224 if (element_mode (fromtype) != element_mode (totype))
27225 {
27226      /* Do not allow conversions to/from BFmode scalar types.  */
27227 if (TYPE_MODE (fromtype) == BFmode)
27228 return N_("invalid conversion from type %<bfloat16_t%>");
27229 if (TYPE_MODE (totype) == BFmode)
27230 return N_("invalid conversion to type %<bfloat16_t%>");
27231 }
27232
27233 /* Conversion allowed. */
27234 return NULL;
27235}
27236
27237/* Return the diagnostic message string if the unary operation OP is
27238 not permitted on TYPE, NULL otherwise. */
27239
27240static const char *
27241aarch64_invalid_unary_op (int op, const_tree type)
27242{
27243 /* Reject all single-operand operations on BFmode except for &. */
27244 if (element_mode (type) == BFmode && op != ADDR_EXPR)
27245 return N_("operation not permitted on type %<bfloat16_t%>");
27246
27247 /* Operation allowed. */
27248 return NULL;
27249}
27250
27251/* Return the diagnostic message string if the binary operation OP is
27252 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27253
27254static const char *
27255aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27256 const_tree type2)
27257{
27258 /* Reject all 2-operand operations on BFmode. */
27259 if (element_mode (type1) == BFmode
27260 || element_mode (type2) == BFmode)
27261 return N_("operation not permitted on type %<bfloat16_t%>");
27262
38e62001
RS
27263 if (VECTOR_TYPE_P (type1)
27264 && VECTOR_TYPE_P (type2)
27265 && !TYPE_INDIVISIBLE_P (type1)
27266 && !TYPE_INDIVISIBLE_P (type2)
27267 && (aarch64_sve::builtin_type_p (type1)
27268 != aarch64_sve::builtin_type_p (type2)))
27269 return N_("cannot combine GNU and SVE vectors in a binary operation");
27270
98698967
SMW
27271 /* Operation allowed. */
27272 return NULL;
27273}
27274
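/* Editorial sketch (user-level code, not part of this file) of what the
   three hooks above diagnose: scalar bfloat16_t values can be copied and
   have their address taken, but conversions and arithmetic on them are
   rejected with the messages above.  */
#if 0
#include <arm_bf16.h>

bfloat16_t bf_global;
bfloat16_t copy_ok (bfloat16_t x) { return x; }	    /* Plain move: OK.  */
bfloat16_t *addr_ok (void) { return &bf_global; }   /* & is permitted.  */
/* float widen_bad (bfloat16_t x) { return x; }
     error: invalid conversion from type bfloat16_t.  */
/* bfloat16_t add_bad (bfloat16_t a, bfloat16_t b) { return a + b; }
     error: operation not permitted on type bfloat16_t.  */
#endif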
3bd87832
MM
27275/* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27276 compiler that we automatically ignore the top byte of our pointers, which
27277 allows using -fsanitize=hwaddress. */
27278bool
27279aarch64_can_tag_addresses ()
27280{
27281 return !TARGET_ILP32;
27282}
27283
32efff9f
SD
27284/* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27285 section at the end if needed. */
27286#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27287#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27288#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
27289void
27290aarch64_file_end_indicate_exec_stack ()
27291{
27292 file_end_indicate_exec_stack ();
27293
27294 unsigned feature_1_and = 0;
27295 if (aarch64_bti_enabled ())
27296 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27297
27298 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27299 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27300
27301 if (feature_1_and)
27302 {
27303 /* Generate .note.gnu.property section. */
27304 switch_to_section (get_section (".note.gnu.property",
27305 SECTION_NOTYPE, NULL));
27306
27307 /* PT_NOTE header: namesz, descsz, type.
27308 namesz = 4 ("GNU\0")
27309 descsz = 16 (Size of the program property array)
27310 [(12 + padding) * Number of array elements]
27311 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27312 assemble_align (POINTER_SIZE);
27313 assemble_integer (GEN_INT (4), 4, 32, 1);
27314 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27315 assemble_integer (GEN_INT (5), 4, 32, 1);
27316
27317 /* PT_NOTE name. */
27318 assemble_string ("GNU", 4);
27319
27320 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27321 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27322 datasz = 4
27323 data = feature_1_and. */
27324 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27325 assemble_integer (GEN_INT (4), 4, 32, 1);
27326 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27327
27328 /* Pad the size of the note to the required alignment. */
27329 assemble_align (POINTER_SIZE);
27330 }
27331}
27332#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27333#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27334#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
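/* Editorial model of the note emitted above (illustrative only; the
   compiler streams the words directly rather than using a struct).  The
   values assume an LP64 target with both BTI and PAC enabled, i.e.
   feature_1_and == 3 and POINTER_BYTES == 8.  */
struct aarch64_gnu_property_note_example
{
  unsigned int namesz;		/* 4: sizeof "GNU".  */
  unsigned int descsz;		/* 16: ROUND_UP (12, POINTER_BYTES).  */
  unsigned int type;		/* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];			/* "GNU".  */
  unsigned int pr_type;		/* 0xc0000000: the FEATURE_1_AND property.  */
  unsigned int pr_datasz;	/* 4.  */
  unsigned int pr_data;		/* 3: BTI | PAC.  */
  unsigned int pr_padding;	/* Pads the desc to 8-byte alignment.  */
};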
e76c8e56 27335
be178ecd
MM
27336/* Helper function for straight line speculation.
27337 Return what barrier should be emitted for straight line speculation
27338 mitigation.
27339 When not mitigating against straight line speculation this function returns
27340 an empty string.
27341 When mitigating against straight line speculation, use:
27342 * SB when the v8.5-A SB extension is enabled.
27343 * DSB+ISB otherwise. */
27344const char *
27345aarch64_sls_barrier (int mitigation_required)
27346{
27347 return mitigation_required
27348 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27349 : "";
27350}
27351
96b7f495
MM
27352static GTY (()) tree aarch64_sls_shared_thunks[30];
27353static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27354const char *indirect_symbol_names[30] = {
27355 "__call_indirect_x0",
27356 "__call_indirect_x1",
27357 "__call_indirect_x2",
27358 "__call_indirect_x3",
27359 "__call_indirect_x4",
27360 "__call_indirect_x5",
27361 "__call_indirect_x6",
27362 "__call_indirect_x7",
27363 "__call_indirect_x8",
27364 "__call_indirect_x9",
27365 "__call_indirect_x10",
27366 "__call_indirect_x11",
27367 "__call_indirect_x12",
27368 "__call_indirect_x13",
27369 "__call_indirect_x14",
27370 "__call_indirect_x15",
27371 "", /* "__call_indirect_x16", */
27372 "", /* "__call_indirect_x17", */
27373 "__call_indirect_x18",
27374 "__call_indirect_x19",
27375 "__call_indirect_x20",
27376 "__call_indirect_x21",
27377 "__call_indirect_x22",
27378 "__call_indirect_x23",
27379 "__call_indirect_x24",
27380 "__call_indirect_x25",
27381 "__call_indirect_x26",
27382 "__call_indirect_x27",
27383 "__call_indirect_x28",
27384 "__call_indirect_x29",
27385};
27386
27387/* Function to create a BLR thunk. This thunk is used to mitigate straight
27388 line speculation. Instead of a simple BLR that can be speculated past,
27389 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27390   register.  These thunks have the relevant speculation barriers put after
27391 their indirect branch so that speculation is blocked.
27392
27393 We use such a thunk so the speculation barriers are kept off the
27394 architecturally executed path in order to reduce the performance overhead.
27395
27396 When optimizing for size we use stubs shared by the linked object.
27397 When optimizing for performance we emit stubs for each function in the hope
27398 that the branch predictor can better train on jumps specific for a given
27399 function. */
27400rtx
27401aarch64_sls_create_blr_label (int regnum)
27402{
27403 gcc_assert (STUB_REGNUM_P (regnum));
27404 if (optimize_function_for_size_p (cfun))
27405 {
27406 /* For the thunks shared between different functions in this compilation
27407 unit we use a named symbol -- this is just for users to more easily
27408 understand the generated assembly. */
27409 aarch64_sls_shared_thunks_needed = true;
27410 const char *thunk_name = indirect_symbol_names[regnum];
27411 if (aarch64_sls_shared_thunks[regnum] == NULL)
27412 {
27413 /* Build a decl representing this function stub and record it for
27414 later. We build a decl here so we can use the GCC machinery for
27415 handling sections automatically (through `get_named_section` and
27416 `make_decl_one_only`). That saves us a lot of trouble handling
27417 the specifics of different output file formats. */
27418 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27419 get_identifier (thunk_name),
27420 build_function_type_list (void_type_node,
27421 NULL_TREE));
27422 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27423 NULL_TREE, void_type_node);
27424 TREE_PUBLIC (decl) = 1;
27425 TREE_STATIC (decl) = 1;
27426 DECL_IGNORED_P (decl) = 1;
27427 DECL_ARTIFICIAL (decl) = 1;
27428 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27429 resolve_unique_section (decl, 0, false);
27430 aarch64_sls_shared_thunks[regnum] = decl;
27431 }
27432
27433 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27434 }
27435
27436 if (cfun->machine->call_via[regnum] == NULL)
27437 cfun->machine->call_via[regnum]
27438 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27439 return cfun->machine->call_via[regnum];
27440}
27441
27442/* Helper function for aarch64_sls_emit_blr_function_thunks and
27443 aarch64_sls_emit_shared_blr_thunks below. */
27444static void
27445aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27446{
27447 /* Save in x16 and branch to that function so this transformation does
27448 not prevent jumping to `BTI c` instructions. */
27449 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27450 asm_fprintf (out_file, "\tbr\tx16\n");
27451}
27452
27453/* Emit all BLR stubs for this particular function.
27454 Here we emit all the BLR stubs needed for the current function. Since we
27455 emit these stubs in a consecutive block we know there will be no speculation
27456 gadgets between each stub, and hence we only emit a speculation barrier at
27457 the end of the stub sequences.
27458
27459 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27460void
27461aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27462{
27463 if (! aarch64_harden_sls_blr_p ())
27464 return;
27465
27466 bool any_functions_emitted = false;
27467 /* We must save and restore the current function section since this assembly
27468 is emitted at the end of the function. This means it can be emitted *just
27469 after* the cold section of a function. That cold part would be emitted in
27470 a different section. That switch would trigger a `.cfi_endproc` directive
27471 to be emitted in the original section and a `.cfi_startproc` directive to
27472 be emitted in the new section. Switching to the original section without
27473 restoring would mean that the `.cfi_endproc` emitted as a function ends
27474 would happen in a different section -- leaving an unmatched
27475 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27476 in the standard text section. */
27477 section *save_text_section = in_section;
27478 switch_to_section (function_section (current_function_decl));
27479 for (int regnum = 0; regnum < 30; ++regnum)
27480 {
27481 rtx specu_label = cfun->machine->call_via[regnum];
27482 if (specu_label == NULL)
27483 continue;
27484
27485 targetm.asm_out.print_operand (out_file, specu_label, 0);
27486 asm_fprintf (out_file, ":\n");
27487 aarch64_sls_emit_function_stub (out_file, regnum);
27488 any_functions_emitted = true;
27489 }
27490 if (any_functions_emitted)
27491 /* Can use the SB if needs be here, since this stub will only be used
27492 by the current function, and hence for the current target. */
27493 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27494 switch_to_section (save_text_section);
27495}
27496
27497/* Emit shared BLR stubs for the current compilation unit.
27498 Over the course of compiling this unit we may have converted some BLR
27499 instructions to a BL to a shared stub function. This is where we emit those
27500 stub functions.
27501 This function is for the stubs shared between different functions in this
27502 compilation unit. We share when optimizing for size instead of speed.
27503
27504 This function is called through the TARGET_ASM_FILE_END hook. */
27505void
27506aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27507{
27508 if (! aarch64_sls_shared_thunks_needed)
27509 return;
27510
27511 for (int regnum = 0; regnum < 30; ++regnum)
27512 {
27513 tree decl = aarch64_sls_shared_thunks[regnum];
27514 if (!decl)
27515 continue;
27516
27517 const char *name = indirect_symbol_names[regnum];
27518 switch_to_section (get_named_section (decl, NULL, 0));
27519 ASM_OUTPUT_ALIGN (out_file, 2);
27520 targetm.asm_out.globalize_label (out_file, name);
27521 /* Only emits if the compiler is configured for an assembler that can
27522 handle visibility directives. */
27523 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27524 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27525 ASM_OUTPUT_LABEL (out_file, name);
27526 aarch64_sls_emit_function_stub (out_file, regnum);
27527 /* Use the most conservative target to ensure it can always be used by any
27528 function in the translation unit. */
27529 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27530 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27531 }
27532}
27533
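/* Editorial illustration, pieced together from the emitters above (not
   verbatim assembler output): with -mharden-sls=blr, an indirect call
   through x1 in a size-optimized function becomes

	bl	__call_indirect_x1

   and the shared stub emitted by aarch64_sls_emit_shared_blr_thunks is

   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb

   so the barrier is only ever reached by straight-line speculation past
   the BR, never on the architecturally executed path.  */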
27534/* Implement TARGET_ASM_FILE_END. */
27535void
27536aarch64_asm_file_end ()
27537{
27538 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27539 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27540 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27541 for FreeBSD) still gets called. */
27542#ifdef TARGET_ASM_FILE_END
27543 TARGET_ASM_FILE_END ();
27544#endif
27545}
27546
27547const char *
27548aarch64_indirect_call_asm (rtx addr)
27549{
27550 gcc_assert (REG_P (addr));
27551 if (aarch64_harden_sls_blr_p ())
27552 {
27553 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27554 output_asm_insn ("bl\t%0", &stub_label);
27555 }
27556 else
27557 output_asm_insn ("blr\t%0", &addr);
27558 return "";
27559}
27560
51b86113
DM
27561/* Target-specific selftests. */
27562
27563#if CHECKING_P
27564
27565namespace selftest {
27566
27567/* Selftest for the RTL loader.
27568 Verify that the RTL loader copes with a dump from
27569 print_rtx_function. This is essentially just a test that class
27570 function_reader can handle a real dump, but it also verifies
27571 that lookup_reg_by_dump_name correctly handles hard regs.
27572 The presence of hard reg names in the dump means that the test is
27573 target-specific, hence it is in this file. */
27574
27575static void
27576aarch64_test_loading_full_dump ()
27577{
27578 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27579
27580 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27581
27582 rtx_insn *insn_1 = get_insn_by_uid (1);
27583 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27584
27585 rtx_insn *insn_15 = get_insn_by_uid (15);
27586 ASSERT_EQ (INSN, GET_CODE (insn_15));
27587 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27588
27589 /* Verify crtl->return_rtx. */
27590 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27591 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27592 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27593}
27594
83d796d3
RS
27595/* Test the fractional_cost class. */
27596
27597static void
27598aarch64_test_fractional_cost ()
27599{
27600 using cf = fractional_cost;
27601
27602 ASSERT_EQ (cf (0, 20), 0);
27603
27604 ASSERT_EQ (cf (4, 2), 2);
27605 ASSERT_EQ (3, cf (9, 3));
27606
27607 ASSERT_NE (cf (5, 2), 2);
27608 ASSERT_NE (3, cf (8, 3));
27609
27610 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27611 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27612 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27613
27614 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27615 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27616 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27617 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27618 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27619 ASSERT_EQ (3 - cf (10, 3), 0);
27620
27621 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27622 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27623
27624 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27625 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27626 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27627 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27628 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27629 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27630 ASSERT_TRUE (cf (239, 240) < 1);
27631 ASSERT_FALSE (cf (240, 240) < 1);
27632 ASSERT_FALSE (cf (241, 240) < 1);
27633 ASSERT_FALSE (2 < cf (207, 104));
27634 ASSERT_FALSE (2 < cf (208, 104));
27635 ASSERT_TRUE (2 < cf (209, 104));
27636
27637  ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27638  ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27639  ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27640  ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27641  ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27642  ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27643  ASSERT_TRUE (cf (239, 240) <= 1);
27644  ASSERT_TRUE (cf (240, 240) <= 1);
27645  ASSERT_FALSE (cf (241, 240) <= 1);
27646  ASSERT_FALSE (2 <= cf (207, 104));
27647  ASSERT_TRUE (2 <= cf (208, 104));
27648  ASSERT_TRUE (2 <= cf (209, 104));
27649
27650 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27651 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27652 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27653 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27654 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27655 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27656 ASSERT_FALSE (cf (239, 240) >= 1);
27657 ASSERT_TRUE (cf (240, 240) >= 1);
27658 ASSERT_TRUE (cf (241, 240) >= 1);
27659 ASSERT_TRUE (2 >= cf (207, 104));
27660 ASSERT_TRUE (2 >= cf (208, 104));
27661 ASSERT_FALSE (2 >= cf (209, 104));
27662
27663 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27664 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27665 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27666 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27667 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27668 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27669 ASSERT_FALSE (cf (239, 240) > 1);
27670 ASSERT_FALSE (cf (240, 240) > 1);
27671 ASSERT_TRUE (cf (241, 240) > 1);
27672 ASSERT_TRUE (2 > cf (207, 104));
27673 ASSERT_FALSE (2 > cf (208, 104));
27674 ASSERT_FALSE (2 > cf (209, 104));
27675
27676 ASSERT_EQ (cf (1, 2).ceil (), 1);
27677 ASSERT_EQ (cf (11, 7).ceil (), 2);
27678 ASSERT_EQ (cf (20, 1).ceil (), 20);
27679 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27680 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27681 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27682 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27683 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27684
27685 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27686}
27687
51b86113
DM
27688/* Run all target-specific selftests. */
27689
27690static void
27691aarch64_run_selftests (void)
27692{
27693 aarch64_test_loading_full_dump ();
83d796d3 27694 aarch64_test_fractional_cost ();
51b86113
DM
27695}
27696
27697} // namespace selftest
27698
27699#endif /* #if CHECKING_P */
27700
cd0b2d36
RR
27701#undef TARGET_STACK_PROTECT_GUARD
27702#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27703
43e9d192
IB
27704#undef TARGET_ADDRESS_COST
27705#define TARGET_ADDRESS_COST aarch64_address_cost
27706
27707/* This hook determines whether unnamed bitfields affect the alignment
27708 of the containing structure. The hook returns true if the structure
27709 should inherit the alignment requirements of an unnamed bitfield's
27710 type. */
27711#undef TARGET_ALIGN_ANON_BITFIELD
27712#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27713
27714#undef TARGET_ASM_ALIGNED_DI_OP
27715#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27716
27717#undef TARGET_ASM_ALIGNED_HI_OP
27718#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27719
27720#undef TARGET_ASM_ALIGNED_SI_OP
27721#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27722
27723#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27724#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27725 hook_bool_const_tree_hwi_hwi_const_tree_true
27726
e1c1ecb0
KT
27727#undef TARGET_ASM_FILE_START
27728#define TARGET_ASM_FILE_START aarch64_start_file
27729
43e9d192
IB
27730#undef TARGET_ASM_OUTPUT_MI_THUNK
27731#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27732
27733#undef TARGET_ASM_SELECT_RTX_SECTION
27734#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27735
27736#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27737#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27738
c292cfe5
SN
27739#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27740#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27741
43e9d192
IB
27742#undef TARGET_BUILD_BUILTIN_VA_LIST
27743#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27744
27745#undef TARGET_CALLEE_COPIES
7256c719 27746#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
43e9d192
IB
27747
27748#undef TARGET_CAN_ELIMINATE
27749#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27750
1fd8d40c
KT
27751#undef TARGET_CAN_INLINE_P
27752#define TARGET_CAN_INLINE_P aarch64_can_inline_p
27753
43e9d192
IB
27754#undef TARGET_CANNOT_FORCE_CONST_MEM
27755#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27756
50487d79
EM
27757#undef TARGET_CASE_VALUES_THRESHOLD
27758#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27759
43e9d192
IB
27760#undef TARGET_CONDITIONAL_REGISTER_USAGE
27761#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27762
38e62001
RS
27763#undef TARGET_MEMBER_TYPE_FORCES_BLK
27764#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27765
43e9d192
IB
27766/* Only the least significant bit is used for initialization guard
27767 variables. */
27768#undef TARGET_CXX_GUARD_MASK_BIT
27769#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27770
27771#undef TARGET_C_MODE_FOR_SUFFIX
27772#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27773
27774#ifdef TARGET_BIG_ENDIAN_DEFAULT
27775#undef TARGET_DEFAULT_TARGET_FLAGS
27776#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27777#endif
27778
27779#undef TARGET_CLASS_MAX_NREGS
27780#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27781
119103ca
JG
27782#undef TARGET_BUILTIN_DECL
27783#define TARGET_BUILTIN_DECL aarch64_builtin_decl
27784
a6fc00da
BH
27785#undef TARGET_BUILTIN_RECIPROCAL
27786#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27787
11e554b3
JG
27788#undef TARGET_C_EXCESS_PRECISION
27789#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27790
43e9d192
IB
27791#undef TARGET_EXPAND_BUILTIN
27792#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27793
27794#undef TARGET_EXPAND_BUILTIN_VA_START
27795#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27796
9697e620
JG
27797#undef TARGET_FOLD_BUILTIN
27798#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27799
43e9d192
IB
27800#undef TARGET_FUNCTION_ARG
27801#define TARGET_FUNCTION_ARG aarch64_function_arg
27802
27803#undef TARGET_FUNCTION_ARG_ADVANCE
27804#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27805
27806#undef TARGET_FUNCTION_ARG_BOUNDARY
27807#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27808
76b0cbf8
RS
27809#undef TARGET_FUNCTION_ARG_PADDING
27810#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27811
43cacb12
RS
27812#undef TARGET_GET_RAW_RESULT_MODE
27813#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27814#undef TARGET_GET_RAW_ARG_MODE
27815#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27816
43e9d192
IB
27817#undef TARGET_FUNCTION_OK_FOR_SIBCALL
27818#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27819
27820#undef TARGET_FUNCTION_VALUE
27821#define TARGET_FUNCTION_VALUE aarch64_function_value
27822
27823#undef TARGET_FUNCTION_VALUE_REGNO_P
27824#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27825
fc72cba7
AL
27826#undef TARGET_GIMPLE_FOLD_BUILTIN
27827#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
0ac198d3 27828
43e9d192
IB
27829#undef TARGET_GIMPLIFY_VA_ARG_EXPR
27830#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27831
27832#undef TARGET_INIT_BUILTINS
27833#define TARGET_INIT_BUILTINS aarch64_init_builtins
27834
c64f7d37
WD
27835#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27836#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27837 aarch64_ira_change_pseudo_allocno_class
27838
43e9d192
IB
27839#undef TARGET_LEGITIMATE_ADDRESS_P
27840#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27841
27842#undef TARGET_LEGITIMATE_CONSTANT_P
27843#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27844
491ec060
WD
27845#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27846#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27847 aarch64_legitimize_address_displacement
27848
43e9d192
IB
27849#undef TARGET_LIBGCC_CMP_RETURN_MODE
27850#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27851
11e554b3
JG
27852#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27853#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27854aarch64_libgcc_floating_mode_supported_p
27855
ac2b960f
YZ
27856#undef TARGET_MANGLE_TYPE
27857#define TARGET_MANGLE_TYPE aarch64_mangle_type
27858
98698967
SMW
27859#undef TARGET_INVALID_CONVERSION
27860#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27861
27862#undef TARGET_INVALID_UNARY_OP
27863#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27864
27865#undef TARGET_INVALID_BINARY_OP
27866#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27867
65ef05d0
RS
27868#undef TARGET_VERIFY_TYPE_CONTEXT
27869#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27870
43e9d192
IB
27871#undef TARGET_MEMORY_MOVE_COST
27872#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27873
26e0ff94
WD
27874#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27875#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27876
43e9d192
IB
27877#undef TARGET_MUST_PASS_IN_STACK
27878#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27879
27880/* This target hook should return true if accesses to volatile bitfields
27881 should use the narrowest mode possible. It should return false if these
27882 accesses should use the bitfield container type. */
27883#undef TARGET_NARROW_VOLATILE_BITFIELD
27884#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27885
27886#undef TARGET_OPTION_OVERRIDE
27887#define TARGET_OPTION_OVERRIDE aarch64_override_options
27888
27889#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27890#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27891 aarch64_override_options_after_change
27892
29a14a1a
MK
27893#undef TARGET_OFFLOAD_OPTIONS
27894#define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27895
361fb3ee
KT
27896#undef TARGET_OPTION_RESTORE
27897#define TARGET_OPTION_RESTORE aarch64_option_restore
27898
27899#undef TARGET_OPTION_PRINT
27900#define TARGET_OPTION_PRINT aarch64_option_print
27901
5a2c8331
KT
27902#undef TARGET_OPTION_VALID_ATTRIBUTE_P
27903#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27904
d78006d9
KT
27905#undef TARGET_SET_CURRENT_FUNCTION
27906#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27907
43e9d192
IB
27908#undef TARGET_PASS_BY_REFERENCE
27909#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27910
27911#undef TARGET_PREFERRED_RELOAD_CLASS
27912#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27913
cee66c68
WD
27914#undef TARGET_SCHED_REASSOCIATION_WIDTH
27915#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27916
eb499454
RS
27917#undef TARGET_DWARF_FRAME_REG_MODE
27918#define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
27919
c2ec330c
AL
27920#undef TARGET_PROMOTED_TYPE
27921#define TARGET_PROMOTED_TYPE aarch64_promoted_type
27922
43e9d192
IB
27923#undef TARGET_SECONDARY_RELOAD
27924#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27925
721c0fb3
RS
27926#undef TARGET_SECONDARY_MEMORY_NEEDED
27927#define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27928
43e9d192
IB
27929#undef TARGET_SHIFT_TRUNCATION_MASK
27930#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27931
27932#undef TARGET_SETUP_INCOMING_VARARGS
27933#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27934
27935#undef TARGET_STRUCT_VALUE_RTX
27936#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27937
27938#undef TARGET_REGISTER_MOVE_COST
27939#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27940
27941#undef TARGET_RETURN_IN_MEMORY
27942#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27943
27944#undef TARGET_RETURN_IN_MSB
27945#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27946
27947#undef TARGET_RTX_COSTS
7cc2145f 27948#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
43e9d192 27949
2e5f8203
JG
27950#undef TARGET_SCALAR_MODE_SUPPORTED_P
27951#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27952
d126a4ae
AP
27953#undef TARGET_SCHED_ISSUE_RATE
27954#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27955
d0bc0cb6
RS
27956#undef TARGET_SCHED_VARIABLE_ISSUE
27957#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27958
d03f7e44
MK
27959#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27960#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27961 aarch64_sched_first_cycle_multipass_dfa_lookahead
27962
2d6bc7fa
KT
27963#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27964#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27965 aarch64_first_cycle_multipass_dfa_lookahead_guard
27966
827ab47a
KT
27967#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27968#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27969 aarch64_get_separate_components
27970
27971#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27972#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27973 aarch64_components_for_bb
27974
27975#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27976#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27977 aarch64_disqualify_components
27978
27979#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27980#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27981 aarch64_emit_prologue_components
27982
27983#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27984#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27985 aarch64_emit_epilogue_components
27986
27987#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27988#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27989 aarch64_set_handled_components
27990
43e9d192
IB
27991#undef TARGET_TRAMPOLINE_INIT
27992#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27993
27994#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27995#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27996
27997#undef TARGET_VECTOR_MODE_SUPPORTED_P
27998#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27999
482b2b43
RS
28000#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
28001#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
28002
7df76747
N
28003#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
28004#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
28005 aarch64_builtin_support_vector_misalignment
28006
9f4cbab8
RS
28007#undef TARGET_ARRAY_MODE
28008#define TARGET_ARRAY_MODE aarch64_array_mode
28009
43e9d192
IB
28010#undef TARGET_ARRAY_MODE_SUPPORTED_P
28011#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
28012
6239dd05
RS
28013#undef TARGET_VECTORIZE_CREATE_COSTS
28014#define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
50a525b5 28015
8990e73a
TB
28016#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
28017#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
28018 aarch64_builtin_vectorization_cost
28019
43e9d192
IB
28020#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
28021#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
28022
42fc9a7f
JG
28023#undef TARGET_VECTORIZE_BUILTINS
28024#define TARGET_VECTORIZE_BUILTINS
28025
e021fb86
RS
28026#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
28027#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
28028 aarch64_autovectorize_vector_modes
3b357264 28029
aa87aced
KV
28030#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
28031#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
28032 aarch64_atomic_assign_expand_fenv
28033
43e9d192
IB
28034/* Section anchor support. */
28035
28036#undef TARGET_MIN_ANCHOR_OFFSET
28037#define TARGET_MIN_ANCHOR_OFFSET -256
28038
28039/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
28040 byte offset; we can do much more for larger data types, but have no way
28041 to determine the size of the access. We assume accesses are aligned. */
28042#undef TARGET_MAX_ANCHOR_OFFSET
28043#define TARGET_MAX_ANCHOR_OFFSET 4095
28044
db0253a4
TB
28045#undef TARGET_VECTOR_ALIGNMENT
28046#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
28047
c98aabc1
TC
28048#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
28049#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
28050 aarch64_vectorize_can_special_div_by_constant
28051
43cacb12
RS
28052#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
28053#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
28054 aarch64_vectorize_preferred_vector_alignment
db0253a4
TB
28055#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
28056#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
28057 aarch64_simd_vector_alignment_reachable
28058
88b08073
JG
28059/* vec_perm support. */
28060
f151c9e1
RS
28061#undef TARGET_VECTORIZE_VEC_PERM_CONST
28062#define TARGET_VECTORIZE_VEC_PERM_CONST \
28063 aarch64_vectorize_vec_perm_const
88b08073 28064
74166aab
RS
28065#undef TARGET_VECTORIZE_RELATED_MODE
28066#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
43cacb12
RS
28067#undef TARGET_VECTORIZE_GET_MASK_MODE
28068#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
76a34e3f
RS
28069#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
28070#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
28071 aarch64_empty_mask_is_expensive
6a86928d
RS
28072#undef TARGET_PREFERRED_ELSE_VALUE
28073#define TARGET_PREFERRED_ELSE_VALUE \
28074 aarch64_preferred_else_value
43cacb12 28075
c2ec330c
AL
28076#undef TARGET_INIT_LIBFUNCS
28077#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
70f09188 28078
706b2314 28079#undef TARGET_FIXED_CONDITION_CODE_REGS
70f09188
AP
28080#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
28081
5cb74e90
RR
28082#undef TARGET_FLAGS_REGNUM
28083#define TARGET_FLAGS_REGNUM CC_REGNUM
28084
78607708
TV
28085#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
28086#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
28087
a3125fc2
CL
28088#undef TARGET_ASAN_SHADOW_OFFSET
28089#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
28090
0c4ec427
RE
28091#undef TARGET_LEGITIMIZE_ADDRESS
28092#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
28093
b48d6421
KT
28094#undef TARGET_SCHED_CAN_SPECULATE_INSN
28095#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
28096
594bdd53
FY
28097#undef TARGET_CAN_USE_DOLOOP_P
28098#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
28099
9bca63d4
WD
28100#undef TARGET_SCHED_ADJUST_PRIORITY
28101#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
28102
6a569cdd
KT
28103#undef TARGET_SCHED_MACRO_FUSION_P
28104#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
28105
28106#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
28107#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
28108
350013bc
BC
28109#undef TARGET_SCHED_FUSION_PRIORITY
28110#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
28111
7b841a12
JW
28112#undef TARGET_UNSPEC_MAY_TRAP_P
28113#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
28114
1b1e81f8
JW
28115#undef TARGET_USE_PSEUDO_PIC_REG
28116#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
28117
cc8ca59e
JB
28118#undef TARGET_PRINT_OPERAND
28119#define TARGET_PRINT_OPERAND aarch64_print_operand
28120
28121#undef TARGET_PRINT_OPERAND_ADDRESS
28122#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
28123
74b27d8e
RS
28124#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
28125#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
28126
ee62a5a6
RS
28127#undef TARGET_OPTAB_SUPPORTED_P
28128#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
28129
43203dea
RR
28130#undef TARGET_OMIT_STRUCT_RETURN_REG
28131#define TARGET_OMIT_STRUCT_RETURN_REG true
28132
43cacb12
RS
28133#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
28134#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
28135 aarch64_dwarf_poly_indeterminate_value
28136
f46fe37e
EB
28137/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
28138#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
28139#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
28140
c43f4279
RS
28141#undef TARGET_HARD_REGNO_NREGS
28142#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
f939c3e6
RS
28143#undef TARGET_HARD_REGNO_MODE_OK
28144#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
28145
99e1629f
RS
28146#undef TARGET_MODES_TIEABLE_P
28147#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28148
80ec73f4
RS
28149#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28150#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28151 aarch64_hard_regno_call_part_clobbered
28152
5a5a3bc5
RS
28153#undef TARGET_INSN_CALLEE_ABI
28154#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
b3650d40 28155
58e17cf8
RS
28156#undef TARGET_CONSTANT_ALIGNMENT
28157#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28158
8c6e3b23
TC
28159#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28160#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28161 aarch64_stack_clash_protection_alloca_probe_range
28162
43cacb12
RS
28163#undef TARGET_COMPUTE_PRESSURE_CLASSES
28164#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28165
28166#undef TARGET_CAN_CHANGE_MODE_CLASS
28167#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28168
5cce8171
RS
28169#undef TARGET_SELECT_EARLY_REMAT_MODES
28170#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28171
c0111dc4
RE
28172#undef TARGET_SPECULATION_SAFE_VALUE
28173#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28174
2d56d6ba
KT
28175#undef TARGET_ESTIMATED_POLY_VALUE
28176#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28177
a0d0b980
SE
28178#undef TARGET_ATTRIBUTE_TABLE
28179#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28180
d9186814
SE
28181#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28182#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28183 aarch64_simd_clone_compute_vecsize_and_simdlen
28184
28185#undef TARGET_SIMD_CLONE_ADJUST
28186#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28187
28188#undef TARGET_SIMD_CLONE_USABLE
28189#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28190
497f281c
SE
28191#undef TARGET_COMP_TYPE_ATTRIBUTES
28192#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28193
3bac1e20
SE
28194#undef TARGET_GET_MULTILIB_ABI_NAME
28195#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28196
002ffd3c
RS
28197#undef TARGET_FNTYPE_ABI
28198#define TARGET_FNTYPE_ABI aarch64_fntype_abi
28199
3bd87832
MM
28200#undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28201#define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28202
51b86113
DM
28203#if CHECKING_P
28204#undef TARGET_RUN_TARGET_SELFTESTS
28205#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28206#endif /* #if CHECKING_P */
28207
8fc16d72
ST
28208#undef TARGET_ASM_POST_CFI_STARTPROC
28209#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28210
c600df9a
RS
28211#undef TARGET_STRICT_ARGUMENT_NAMING
28212#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28213
1a7a35c7
RH
28214#undef TARGET_MD_ASM_ADJUST
28215#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28216
96b7f495
MM
28217#undef TARGET_ASM_FILE_END
28218#define TARGET_ASM_FILE_END aarch64_asm_file_end
28219
28220#undef TARGET_ASM_FUNCTION_EPILOGUE
28221#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28222
ce09ab17
DL
28223#undef TARGET_HAVE_SHADOW_CALL_STACK
28224#define TARGET_HAVE_SHADOW_CALL_STACK true
28225
2d7c73ee
WD
28226#undef TARGET_CONST_ANCHOR
28227#define TARGET_CONST_ANCHOR 0x1000000
28228
43e9d192
IB
28229struct gcc_target targetm = TARGET_INITIALIZER;
28230
28231#include "gt-aarch64.h"