/* Machine description for AArch64 architecture.
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
   Contributed by ARM Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#define INCLUDE_STRING
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "gimple.h"
#include "cfghooks.h"
#include "cfgloop.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "attribs.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "cgraph.h"
#include "diagnostic.h"
#include "insn-attr.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "calls.h"
#include "varasm.h"
#include "output.h"
#include "flags.h"
#include "explow.h"
#include "expr.h"
#include "reload.h"
#include "langhooks.h"
#include "opts.h"
#include "gimplify.h"
#include "dwarf2.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "aarch64-cost-tables.h"
#include "dumpfile.h"
#include "builtins.h"
#include "rtl-iter.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "target-globals.h"
#include "common/common-target.h"
#include "cfgrtl.h"
#include "selftest.h"
#include "selftest-rtl.h"
#include "rtx-vector-builder.h"
#include "intl.h"
#include "expmed.h"
#include "function-abi.h"

/* This file should be included last.  */
#include "target-def.h"

/* Defined for convenience.  */
#define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)

/* Information about a legitimate vector immediate operand.  */
struct simd_immediate_info
{
  enum insn_type { MOV, MVN, INDEX, PTRUE };
  enum modifier_type { LSL, MSL };

  simd_immediate_info () {}
  simd_immediate_info (scalar_float_mode, rtx);
  simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
		       insn_type = MOV, modifier_type = LSL,
		       unsigned int = 0);
  simd_immediate_info (scalar_mode, rtx, rtx);
  simd_immediate_info (scalar_int_mode, aarch64_svpattern);

  /* The mode of the elements.  */
  scalar_mode elt_mode;

  /* The instruction to use to move the immediate into a vector.  */
  insn_type insn;

  union
  {
    /* For MOV and MVN.  */
    struct
    {
      /* The value of each element.  */
      rtx value;

      /* The kind of shift modifier to use, and the number of bits to shift.
	 This is (LSL, 0) if no shift is needed.  */
      modifier_type modifier;
      unsigned int shift;
    } mov;

    /* For INDEX.  */
    struct
    {
      /* The value of the first element and the step to be added for each
	 subsequent element.  */
      rtx base, step;
    } index;

    /* For PTRUE.  */
    aarch64_svpattern pattern;
  } u;
};

/* Construct a floating-point immediate in which each element has mode
   ELT_MODE_IN and value VALUE_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
  : elt_mode (elt_mode_in), insn (MOV)
{
  u.mov.value = value_in;
  u.mov.modifier = LSL;
  u.mov.shift = 0;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and value VALUE_IN.  The other parameters are as for the structure
   fields.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       unsigned HOST_WIDE_INT value_in,
		       insn_type insn_in, modifier_type modifier_in,
		       unsigned int shift_in)
  : elt_mode (elt_mode_in), insn (insn_in)
{
  u.mov.value = gen_int_mode (value_in, elt_mode_in);
  u.mov.modifier = modifier_in;
  u.mov.shift = shift_in;
}

/* Construct an integer immediate in which each element has mode ELT_MODE_IN
   and where element I is equal to BASE_IN + I * STEP_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
  : elt_mode (elt_mode_in), insn (INDEX)
{
  u.index.base = base_in;
  u.index.step = step_in;
}

/* Construct a predicate that controls elements of mode ELT_MODE_IN
   and has PTRUE pattern PATTERN_IN.  */
inline simd_immediate_info
::simd_immediate_info (scalar_int_mode elt_mode_in,
		       aarch64_svpattern pattern_in)
  : elt_mode (elt_mode_in), insn (PTRUE)
{
  u.pattern = pattern_in;
}

namespace {

/* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
class pure_scalable_type_info
{
public:
  /* Represents the result of analyzing a type.  All values are nonzero,
     in the possibly forlorn hope that accidental conversions to bool
     trigger a warning.  */
  enum analysis_result
  {
    /* The type does not have an ABI identity; i.e. it doesn't contain
       at least one object whose type is a Fundamental Data Type.  */
    NO_ABI_IDENTITY = 1,

    /* The type is definitely a Pure Scalable Type.  */
    IS_PST,

    /* The type is definitely not a Pure Scalable Type.  */
    ISNT_PST,

    /* It doesn't matter for PCS purposes whether the type is a Pure
       Scalable Type or not, since the type will be handled the same
       way regardless.

       Specifically, this means that if the type is a Pure Scalable Type,
       there aren't enough argument registers to hold it, and so it will
       need to be passed or returned in memory.  If the type isn't a
       Pure Scalable Type, it's too big to be passed or returned in core
       or SIMD&FP registers, and so again will need to go in memory.  */
    DOESNT_MATTER
  };

  /* Aggregates of 17 bytes or more are normally passed and returned
     in memory, so aggregates of that size can safely be analyzed as
     DOESNT_MATTER.  We need to be able to collect enough pieces to
     represent a PST that is smaller than that.  Since predicates are
     2 bytes in size for -msve-vector-bits=128, that means we need to be
     able to store at least 8 pieces.

     We also need to be able to store enough pieces to represent
     a single vector in each vector argument register and a single
     predicate in each predicate argument register.  This means that
     we need at least 12 pieces.  */
  static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
#if __cplusplus >= 201103L
  static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
#endif

  /* Describes one piece of a PST.  Each piece is one of:

     - a single Scalable Vector Type (SVT)
     - a single Scalable Predicate Type (SPT)
     - a PST containing 2, 3 or 4 SVTs, with no padding

     It either represents a single built-in type or a PST formed from
     multiple homogeneous built-in types.  */
  struct piece
  {
    rtx get_rtx (unsigned int, unsigned int) const;

    /* The number of vector and predicate registers that the piece
       occupies.  One of the two is always zero.  */
    unsigned int num_zr;
    unsigned int num_pr;

    /* The mode of the registers described above.  */
    machine_mode mode;

    /* If this piece is formed from multiple homogeneous built-in types,
       this is the mode of the built-in types, otherwise it is MODE.  */
    machine_mode orig_mode;

    /* The offset in bytes of the piece from the start of the type.  */
    poly_uint64_pod offset;
  };

  /* Divides types analyzed as IS_PST into individual pieces.  The pieces
     are in memory order.  */
  auto_vec<piece, MAX_PIECES> pieces;

  unsigned int num_zr () const;
  unsigned int num_pr () const;

  rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;

  analysis_result analyze (const_tree);
  bool analyze_registers (const_tree);

private:
  analysis_result analyze_array (const_tree);
  analysis_result analyze_record (const_tree);
  void add_piece (const piece &);
};
}

/* The current code model.  */
enum aarch64_code_model aarch64_cmodel;

/* The number of 64-bit elements in an SVE vector.  */
poly_uint16 aarch64_sve_vg;

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS 1
#endif

static bool aarch64_composite_type_p (const_tree, machine_mode);
static bool aarch64_return_in_memory_1 (const_tree);
static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
						     const_tree,
						     machine_mode *, int *,
						     bool *, bool);
static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (machine_mode);
static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
							 const_tree type,
							 int misalignment,
							 bool is_packed);
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
					    aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);

/* Major revision number of the ARM Architecture implemented by the target.  */
unsigned aarch64_architecture_version;

/* The processor for which instructions should be scheduled.  */
enum aarch64_processor aarch64_tune = cortexa53;

/* Mask to specify which instruction scheduling options should be used.  */
uint64_t aarch64_tune_flags = 0;

/* Global flag for PC relative loads.  */
bool aarch64_pcrelative_literal_loads;

/* Global flag for whether frame pointer is enabled.  */
bool aarch64_use_frame_pointer;

#define BRANCH_PROTECT_STR_MAX 255
char *accepted_branch_protection_string = NULL;

static enum aarch64_parse_opt_result
aarch64_parse_branch_protection (const char*, char**);

/* Support for command line parsing of boolean flags in the tuning
   structures.  */
struct aarch64_flag_desc
{
  const char* name;
  unsigned int flag;
};

#define AARCH64_FUSION_PAIR(name, internal_name) \
  { name, AARCH64_FUSE_##internal_name },
static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
{
  { "none", AARCH64_FUSE_NOTHING },
#include "aarch64-fusion-pairs.def"
  { "all", AARCH64_FUSE_ALL },
  { NULL, AARCH64_FUSE_NOTHING }
};

#define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
  { name, AARCH64_EXTRA_TUNE_##internal_name },
static const struct aarch64_flag_desc aarch64_tuning_flags[] =
{
  { "none", AARCH64_EXTRA_TUNE_NONE },
#include "aarch64-tuning-flags.def"
  { "all", AARCH64_EXTRA_TUNE_ALL },
  { NULL, AARCH64_EXTRA_TUNE_NONE }
};

/* Tuning parameters.  */

static const struct cpu_addrcost_table generic_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  0, /* register_sextend  */
  0, /* register_zextend  */
  0 /* imm_offset  */
};

static const struct cpu_addrcost_table exynosm1_addrcost_table =
{
  {
    0, /* hi  */
    0, /* si  */
    0, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  1, /* register_offset  */
  1, /* register_sextend  */
  2, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table xgene1_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  1, /* pre_modify  */
  1, /* post_modify  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  2, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table tsv110_addrcost_table =
{
  {
    1, /* hi  */
    0, /* si  */
    0, /* di  */
    1, /* ti  */
  },
  0, /* pre_modify  */
  0, /* post_modify  */
  0, /* register_offset  */
  1, /* register_sextend  */
  1, /* register_zextend  */
  0, /* imm_offset  */
};

static const struct cpu_addrcost_table qdf24xx_addrcost_table =
{
  {
    1, /* hi  */
    1, /* si  */
    1, /* di  */
    2, /* ti  */
  },
  1, /* pre_modify  */
  1, /* post_modify  */
  3, /* register_offset  */
  3, /* register_sextend  */
  3, /* register_zextend  */
  2, /* imm_offset  */
};

static const struct cpu_regmove_cost generic_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost cortexa53_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  5, /* GP2FP  */
  5, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost exynosm1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost (actual, 4 and 9).  */
  9, /* GP2FP  */
  9, /* FP2GP  */
  1 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx_regmove_cost =
{
  2, /* GP2GP  */
  2, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost xgene1_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  8, /* GP2FP  */
  8, /* FP2GP  */
  2 /* FP2FP  */
};

static const struct cpu_regmove_cost qdf24xx_regmove_cost =
{
  2, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  6, /* GP2FP  */
  6, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  5, /* GP2FP  */
  6, /* FP2GP  */
  3, /* FP2FP  */
};

static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of int<->fp moves for spilling.  */
  4, /* GP2FP  */
  5, /* FP2GP  */
  4 /* FP2FP  */
};

static const struct cpu_regmove_cost tsv110_regmove_cost =
{
  1, /* GP2GP  */
  /* Avoid the use of slow int<->fp moves for spilling by setting
     their cost higher than memmov_cost.  */
  2, /* GP2FP  */
  3, /* FP2GP  */
  2 /* FP2FP  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost generic_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* QDF24XX costs for vector insn classes.  */
static const struct cpu_vector_cost qdf24xx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  1, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  1, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  1, /* vec_to_scalar_cost  */
  1, /* scalar_to_vec_cost  */
  1, /* vec_align_load_cost  */
  1, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* ThunderX costs for vector insn classes.  */
static const struct cpu_vector_cost thunderx_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  3, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  1, /* vec_fp_stmt_cost  */
  4, /* vec_permute_cost  */
  2, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  3, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  5, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  3, /* cond_taken_branch_cost  */
  3 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost tsv110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  2, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost cortexa57_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  8, /* vec_to_scalar_cost  */
  8, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost exynosm1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  3, /* vec_int_stmt_cost  */
  3, /* vec_fp_stmt_cost  */
  3, /* vec_permute_cost  */
  3, /* vec_to_scalar_cost  */
  3, /* scalar_to_vec_cost  */
  5, /* vec_align_load_cost  */
  5, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  1, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Generic costs for vector insn classes.  */
static const struct cpu_vector_cost xgene1_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  1, /* scalar_fp_stmt_cost  */
  5, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  2, /* vec_int_stmt_cost  */
  2, /* vec_fp_stmt_cost  */
  2, /* vec_permute_cost  */
  4, /* vec_to_scalar_cost  */
  4, /* scalar_to_vec_cost  */
  10, /* vec_align_load_cost  */
  10, /* vec_unalign_load_cost  */
  2, /* vec_unalign_store_cost  */
  2, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

/* Costs for vector insn classes for Vulcan.  */
static const struct cpu_vector_cost thunderx2t99_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  6, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  4, /* vec_int_stmt_cost  */
  5, /* vec_fp_stmt_cost  */
  10, /* vec_permute_cost  */
  6, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  1, /* vec_unalign_store_cost  */
  1, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};

static const struct cpu_vector_cost thunderx3t110_vector_cost =
{
  1, /* scalar_int_stmt_cost  */
  5, /* scalar_fp_stmt_cost  */
  4, /* scalar_load_cost  */
  1, /* scalar_store_cost  */
  5, /* vec_int_stmt_cost  */
  5, /* vec_fp_stmt_cost  */
  10, /* vec_permute_cost  */
  5, /* vec_to_scalar_cost  */
  5, /* scalar_to_vec_cost  */
  4, /* vec_align_load_cost  */
  4, /* vec_unalign_load_cost  */
  4, /* vec_unalign_store_cost  */
  4, /* vec_store_cost  */
  2, /* cond_taken_branch_cost  */
  1 /* cond_not_taken_branch_cost  */
};


/* Generic costs for branch instructions.  */
static const struct cpu_branch_cost generic_branch_cost =
{
  1,  /* Predictable.  */
  3   /* Unpredictable.  */
};

/* Generic approximation modes.  */
static const cpu_approx_modes generic_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_NONE	/* recip_sqrt  */
};

/* Approximation modes for Exynos M1.  */
static const cpu_approx_modes exynosm1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_ALL,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Approximation modes for X-Gene 1.  */
static const cpu_approx_modes xgene1_approx_modes =
{
  AARCH64_APPROX_NONE,	/* division  */
  AARCH64_APPROX_NONE,	/* sqrt  */
  AARCH64_APPROX_ALL	/* recip_sqrt  */
};

/* Generic prefetch settings (which disable prefetch).  */
static const cpu_prefetch_tune generic_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  -1,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune exynosm1_prefetch_tune =
{
  0,			/* num_slots  */
  -1,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune qdf24xx_prefetch_tune =
{
  4,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  512,			/* l2_cache_size  */
  false,		/* prefetch_dynamic_strides */
  2048,			/* minimum_stride */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderxt88_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  16*1024,		/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  3			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  128,			/* l1_cache_line_size  */
  -1,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune tsv110_prefetch_tune =
{
  0,			/* num_slots  */
  64,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  512,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const cpu_prefetch_tune xgene1_prefetch_tune =
{
  8,			/* num_slots  */
  32,			/* l1_cache_size  */
  64,			/* l1_cache_line_size  */
  256,			/* l2_cache_size  */
  true,			/* prefetch_dynamic_strides */
  -1,			/* minimum_stride */
  -1			/* default_opt_level  */
};

static const struct tune_params generic_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  2, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "16:12",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa35_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  1, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa53_tunings =
{
  &cortexa53_extra_costs,
  &generic_addrcost_table,
  &cortexa53_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  2, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa57_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa72_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params cortexa73_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &cortexa57_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost.  */
  2, /* issue_rate.  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
  "16",	/* function_align.  */
  "4",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};



static const struct tune_params exynosm1_tunings =
{
  &exynosm1_extra_costs,
  &exynosm1_addrcost_table,
  &exynosm1_regmove_cost,
  &exynosm1_vector_cost,
  &generic_branch_cost,
  &exynosm1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4,	/* memmov_cost  */
  3,	/* issue_rate  */
  (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
  "4",	/* function_align.  */
  "4",	/* jump_align.  */
  "4",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  48,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
  &exynosm1_prefetch_tune
};

static const struct tune_params thunderxt88_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  6, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
  &thunderxt88_prefetch_tune
};

static const struct tune_params thunderx_tunings =
{
  &thunderx_extra_costs,
  &generic_addrcost_table,
  &thunderx_regmove_cost,
  &thunderx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  6, /* memmov_cost  */
  2, /* issue_rate  */
  AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
  "8",	/* function_align.  */
  "8",	/* jump_align.  */
  "8",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
  &thunderx_prefetch_tune
};

static const struct tune_params tsv110_tunings =
{
  &tsv110_extra_costs,
  &tsv110_addrcost_table,
  &tsv110_regmove_cost,
  &tsv110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4,    /* memmov_cost  */
  4,    /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16", /* function_align.  */
  "4",  /* jump_align.  */
  "8",  /* loop_align.  */
  2,    /* int_reassoc_width.  */
  4,    /* fp_reassoc_width.  */
  1,    /* vec_reassoc_width.  */
  2,    /* min_div_recip_mul_sf.  */
  2,    /* min_div_recip_mul_df.  */
  0,    /* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
  &tsv110_prefetch_tune
};

static const struct tune_params xgene1_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  6, /* memmov_cost  */
  4, /* issue_rate  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params emag_tunings =
{
  &xgene1_extra_costs,
  &xgene1_addrcost_table,
  &xgene1_regmove_cost,
  &xgene1_vector_cost,
  &generic_branch_cost,
  &xgene1_approx_modes,
  SVE_NOT_IMPLEMENTED,
  6, /* memmov_cost  */
  4, /* issue_rate  */
  AARCH64_FUSE_NOTHING, /* fusible_ops  */
  "16",	/* function_align.  */
  "16",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  17,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
  &xgene1_prefetch_tune
};

static const struct tune_params qdf24xx_tunings =
{
  &qdf24xx_extra_costs,
  &qdf24xx_addrcost_table,
  &qdf24xx_regmove_cost,
  &qdf24xx_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  4, /* issue_rate  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
  &qdf24xx_prefetch_tune
};

/* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
   for now.  */
static const struct tune_params saphira_tunings =
{
  &generic_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &generic_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  4, /* issue_rate  */
  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  1,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
  &generic_prefetch_tune
};

static const struct tune_params thunderx2t99_tunings =
{
  &thunderx2t99_extra_costs,
  &thunderx2t99_addrcost_table,
  &thunderx2t99_regmove_cost,
  &thunderx2t99_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost.  */
  4, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx2t99_prefetch_tune
};

static const struct tune_params thunderx3t110_tunings =
{
  &thunderx3t110_extra_costs,
  &thunderx3t110_addrcost_table,
  &thunderx3t110_regmove_cost,
  &thunderx3t110_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost.  */
  6, /* issue_rate.  */
  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
  "16",	/* function_align.  */
  "8",	/* jump_align.  */
  "16",	/* loop_align.  */
  3,	/* int_reassoc_width.  */
  2,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &thunderx3t110_prefetch_tune
};

static const struct tune_params neoversen1_tunings =
{
  &cortexa57_extra_costs,
  &generic_addrcost_table,
  &generic_regmove_cost,
  &cortexa57_vector_cost,
  &generic_branch_cost,
  &generic_approx_modes,
  SVE_NOT_IMPLEMENTED, /* sve_width  */
  4, /* memmov_cost  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
  "32:16",	/* function_align.  */
  "4",		/* jump_align.  */
  "32:16",	/* loop_align.  */
  2,	/* int_reassoc_width.  */
  4,	/* fp_reassoc_width.  */
  2,	/* vec_reassoc_width.  */
  2,	/* min_div_recip_mul_sf.  */
  2,	/* min_div_recip_mul_df.  */
  0,	/* max_case_values.  */
  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
  &generic_prefetch_tune
};

/* Support for fine-grained override of the tuning structures.  */
struct aarch64_tuning_override_function
{
  const char* name;
  void (*parse_override)(const char*, struct tune_params*);
};

static void aarch64_parse_fuse_string (const char*, struct tune_params*);
static void aarch64_parse_tune_string (const char*, struct tune_params*);
static void aarch64_parse_sve_width_string (const char*, struct tune_params*);

static const struct aarch64_tuning_override_function
aarch64_tuning_override_functions[] =
{
  { "fuse", aarch64_parse_fuse_string },
  { "tune", aarch64_parse_tune_string },
  { "sve_width", aarch64_parse_sve_width_string },
  { NULL, NULL }
};

/* A processor implementing AArch64.  */
struct processor
{
  const char *const name;
  enum aarch64_processor ident;
  enum aarch64_processor sched_core;
  enum aarch64_arch arch;
  unsigned architecture_version;
  const uint64_t flags;
  const struct tune_params *const tune;
};

/* Architectures implementing AArch64.  */
static const struct processor all_architectures[] =
{
#define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
  {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
#include "aarch64-arches.def"
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};

/* Processor cores implementing AArch64.  */
static const struct processor all_cores[] =
{
#define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
  {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
  all_architectures[AARCH64_ARCH_##ARCH].architecture_version,		\
  FLAGS, &COSTS##_tunings},
#include "aarch64-cores.def"
  {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
   AARCH64_FL_FOR_ARCH8, &generic_tunings},
  {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
};


/* Target specification.  These are populated by the -march, -mtune, -mcpu
   handling code or by target attributes.  */
static const struct processor *selected_arch;
static const struct processor *selected_cpu;
static const struct processor *selected_tune;

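/* The key used for return address signing.  Defaults to the A key;
   the "b-key" subtype of "pac-ret" selects the B key instead.  */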
enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;

/* The current tuning set.  */
struct tune_params aarch64_tune_params = generic_tunings;

/* Check whether an 'aarch64_vector_pcs' attribute is valid.  */

static tree
handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
				     int, bool *no_add_attrs)
{
  /* Since we set fn_type_req to true, the caller should have checked
     this for us.  */
  gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
  switch ((arm_pcs) fntype_abi (*node).id ())
    {
    case ARM_PCS_AAPCS64:
    case ARM_PCS_SIMD:
      return NULL_TREE;

    case ARM_PCS_SVE:
      error ("the %qE attribute cannot be applied to an SVE function type",
	     name);
      *no_add_attrs = true;
      return NULL_TREE;

    case ARM_PCS_TLSDESC:
    case ARM_PCS_UNKNOWN:
      break;
    }
  gcc_unreachable ();
}

/* Table of machine attributes.  */
static const struct attribute_spec aarch64_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
    handle_aarch64_vector_pcs_attribute, NULL },
  { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
    aarch64_sve::handle_arm_sve_vector_bits_attribute,
    NULL },
  { "Advanced SIMD type", 1, 1, false, true,  false, true,  NULL, NULL },
  { "SVE type", 3, 3, false, true,  false, true,  NULL, NULL },
  { "SVE sizeless type", 0, 0, false, true,  false, true,  NULL, NULL },
  { NULL,                 0, 0, false, false, false, false, NULL, NULL }
};

#define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)

/* An ISA extension in the co-processor and main instruction set space.  */
struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

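/* The AArch64 condition codes, in their architectural encoding order.  */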
typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
       should print an error.
     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
       own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};

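/* Handle the "none" branch protection type: turn off return address
   signing and BTI.  */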
static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

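/* Handle the "standard" branch protection type: sign the return address
   of non-leaf functions with the A key and enable BTI.  */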
static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

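/* Handle the "pac-ret" branch protection type: sign the return address
   of non-leaf functions with the A key.  */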
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
				   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_ra_sign_key = AARCH64_KEY_A;
  return AARCH64_PARSE_OK;
}

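/* Handle the "leaf" subtype of "pac-ret": also sign the return address
   of leaf functions.  */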
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
			     char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

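/* Handle the "b-key" subtype of "pac-ret": sign return addresses with
   the B key instead of the A key.  */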
static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
			      char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_key = AARCH64_KEY_B;
  return AARCH64_PARSE_OK;
}

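/* Handle the "bti" branch protection type: enable Branch Target
   Identification.  */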
static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
			       char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}

static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

/* The condition codes of the processor, and the inverse function.  */
static const char * const aarch64_condition_codes[] =
{
  "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
  "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};

/* The preferred condition codes for SVE conditions.  */
static const char *const aarch64_sve_condition_codes[] =
{
  "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
  "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};

/* Return the assembly token for svpattern value VALUE.  */

static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}

/* Return the location of a piece that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
					 unsigned int first_pr) const
{
  gcc_assert (VECTOR_MODE_P (mode)
	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);

  if (num_zr > 0 && num_pr == 0)
    return gen_rtx_REG (mode, first_zr);

  if (num_zr == 0 && num_pr == 1)
    return gen_rtx_REG (mode, first_pr);

  gcc_unreachable ();
}

/* Return the total number of vector registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_zr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_zr;
  return res;
}

/* Return the total number of predicate registers required by the PST.  */

unsigned int
pure_scalable_type_info::num_pr () const
{
  unsigned int res = 0;
  for (unsigned int i = 0; i < pieces.length (); ++i)
    res += pieces[i].num_pr;
  return res;
}

/* Return the location of a PST that is known to be passed or returned
   in registers.  FIRST_ZR is the first unused vector argument register
   and FIRST_PR is the first unused predicate argument register.  */

rtx
pure_scalable_type_info::get_rtx (machine_mode mode,
				  unsigned int first_zr,
				  unsigned int first_pr) const
{
  /* Try to return a single REG if possible.  This leads to better
     code generation; it isn't required for correctness.  */
  if (mode == pieces[0].mode)
    {
      gcc_assert (pieces.length () == 1);
      return pieces[0].get_rtx (first_zr, first_pr);
    }

  /* Build up a PARALLEL that contains the individual pieces.  */
  rtvec rtxes = rtvec_alloc (pieces.length ());
  for (unsigned int i = 0; i < pieces.length (); ++i)
    {
      rtx reg = pieces[i].get_rtx (first_zr, first_pr);
      rtx offset = gen_int_mode (pieces[i].offset, Pmode);
      RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
      first_zr += pieces[i].num_zr;
      first_pr += pieces[i].num_pr;
    }
  return gen_rtx_PARALLEL (mode, rtxes);
}

/* Analyze whether TYPE is a Pure Scalable Type according to the rules
   in the AAPCS64.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze (const_tree type)
{
  /* Prevent accidental reuse.  */
  gcc_assert (pieces.is_empty ());

  /* No code will be generated for erroneous types, so we won't establish
     an ABI mapping.  */
  if (type == error_mark_node)
    return NO_ABI_IDENTITY;

  /* Zero-sized types disappear in the language->ABI mapping.  */
  if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
    return NO_ABI_IDENTITY;

  /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
  piece p = {};
  if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
    {
      machine_mode mode = TYPE_MODE_RAW (type);
      gcc_assert (VECTOR_MODE_P (mode)
		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));

      p.mode = p.orig_mode = mode;
      add_piece (p);
      return IS_PST;
    }

  /* Check for user-defined PSTs.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    return analyze_array (type);
  if (TREE_CODE (type) == RECORD_TYPE)
    return analyze_record (type);

  return ISNT_PST;
}

/* Analyze a type that is known not to be passed or returned in memory.
   Return true if it has an ABI identity and is a Pure Scalable Type.  */

bool
pure_scalable_type_info::analyze_registers (const_tree type)
{
  analysis_result result = analyze (type);
  gcc_assert (result != DOESNT_MATTER);
  return result == IS_PST;
}

/* Subroutine of analyze for handling ARRAY_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_array (const_tree type)
{
  /* Analyze the element type.  */
  pure_scalable_type_info element_info;
  analysis_result result = element_info.analyze (TREE_TYPE (type));
  if (result != IS_PST)
    return result;

  /* An array of unknown, flexible or variable length will be passed and
     returned by reference whatever we do.  */
  tree nelts_minus_one = array_type_nelts (type);
  if (!tree_fits_uhwi_p (nelts_minus_one))
    return DOESNT_MATTER;

  /* Likewise if the array is constant-sized but too big to be interesting.
     The double checks against MAX_PIECES are to protect against overflow.  */
  unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
  if (count > MAX_PIECES)
    return DOESNT_MATTER;
  count += 1;
  if (count * element_info.pieces.length () > MAX_PIECES)
    return DOESNT_MATTER;

  /* The above checks should have weeded out elements of unknown size.  */
  poly_uint64 element_bytes;
  if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
    gcc_unreachable ();

  /* Build up the list of individual vectors and predicates.  */
  gcc_assert (!element_info.pieces.is_empty ());
  for (unsigned int i = 0; i < count; ++i)
    for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
      {
	piece p = element_info.pieces[j];
	p.offset += i * element_bytes;
	add_piece (p);
      }
  return IS_PST;
}

/* Subroutine of analyze for handling RECORD_TYPEs.  */

pure_scalable_type_info::analysis_result
pure_scalable_type_info::analyze_record (const_tree type)
{
  for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
    {
      if (TREE_CODE (field) != FIELD_DECL)
	continue;

      /* Zero-sized fields disappear in the language->ABI mapping.  */
      if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1762 continue;
1763
1764 /* All fields with an ABI identity must be PSTs for the record as
1765 a whole to be a PST. If any individual field is too big to be
1766 interesting then the record is too. */
1767 pure_scalable_type_info field_info;
1768 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1769 if (subresult == NO_ABI_IDENTITY)
1770 continue;
1771 if (subresult != IS_PST)
1772 return subresult;
1773
1774 /* Since all previous fields are PSTs, we ought to be able to track
1775 the field offset using poly_ints. */
1776 tree bitpos = bit_position (field);
1777 gcc_assert (poly_int_tree_p (bitpos));
1778
1779 /* For the same reason, it shouldn't be possible to create a PST field
1780 whose offset isn't byte-aligned. */
1781 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1782 BITS_PER_UNIT);
1783
1784 /* Punt if the record is too big to be interesting. */
1785 poly_uint64 bytepos;
1786 if (!wide_bytepos.to_uhwi (&bytepos)
1787 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1788 return DOESNT_MATTER;
1789
1790 /* Add the individual vectors and predicates in the field to the
1791 record's list. */
1792 gcc_assert (!field_info.pieces.is_empty ());
1793 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1794 {
1795 piece p = field_info.pieces[i];
1796 p.offset += bytepos;
1797 add_piece (p);
1798 }
1799 }
1800 /* Empty structures disappear in the language->ABI mapping. */
1801 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1802}
1803
1804/* Add P to the list of pieces in the type. */
1805
1806void
1807pure_scalable_type_info::add_piece (const piece &p)
1808{
1809 /* Try to fold the new piece into the previous one to form a
1810 single-mode PST. For example, if we see three consecutive vectors
1811 of the same mode, we can represent them using the corresponding
1812 3-tuple mode.
1813
1814 This is purely an optimization. */
1815 if (!pieces.is_empty ())
1816 {
1817 piece &prev = pieces.last ();
1818 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1819 unsigned int nelems1, nelems2;
1820 if (prev.orig_mode == p.orig_mode
1821 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1822 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1823 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1824 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1825 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1826 && targetm.array_mode (p.orig_mode,
1827 nelems1 + nelems2).exists (&prev.mode))
1828 {
1829 prev.num_zr += p.num_zr;
1830 prev.num_pr += p.num_pr;
1831 return;
1832 }
1833 }
1834 pieces.quick_push (p);
1835}
1836
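/* Editorial example: three consecutive VNx4SI pieces at offsets 0, VL and
   2*VL (e.g. produced by analyze_array) fold pairwise through the code above
   into a single piece of the 3-tuple mode VNx12SI, provided
   targetm.array_mode (aarch64_array_mode) supplies that mode.  */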
1837/* Return true if at least one possible value of type TYPE includes at
1838 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1839
1840 This is a relatively expensive test for some types, so it should
1841 generally be made as late as possible. */
1842
1843static bool
1844aarch64_some_values_include_pst_objects_p (const_tree type)
1845{
1846 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1847 return false;
1848
1849 if (aarch64_sve::builtin_type_p (type))
1850 return true;
1851
1852 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1853 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1854
1855 if (RECORD_OR_UNION_TYPE_P (type))
1856 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1857 if (TREE_CODE (field) == FIELD_DECL
1858 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1859 return true;
1860
1861 return false;
1862}
1863
002ffd3c
RS
1864/* Return the descriptor of the SIMD ABI. */
1865
1866static const predefined_function_abi &
1867aarch64_simd_abi (void)
1868{
1869 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1870 if (!simd_abi.initialized_p ())
1871 {
1872 HARD_REG_SET full_reg_clobbers
1873 = default_function_abi.full_reg_clobbers ();
1874 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1875 if (FP_SIMD_SAVED_REGNUM_P (regno))
1876 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1877 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1878 }
1879 return simd_abi;
1880}
1881
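/* Editorial usage sketch (hypothetical declaration): for a callee such as

     __attribute__ ((aarch64_vector_pcs)) void f (void);

   the descriptor above removes the FP_SIMD_SAVED registers (assumed to be
   v8-v23) from full_reg_clobbers, so callers may keep 128-bit values live
   in q8-q23 across the call.  */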
c600df9a
RS
1882/* Return the descriptor of the SVE PCS. */
1883
1884static const predefined_function_abi &
1885aarch64_sve_abi (void)
1886{
1887 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1888 if (!sve_abi.initialized_p ())
1889 {
1890 HARD_REG_SET full_reg_clobbers
1891 = default_function_abi.full_reg_clobbers ();
1892 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1893 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
cb26919c 1894 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
c600df9a
RS
1895 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1896 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1897 }
1898 return sve_abi;
1899}
1900
973d2e01
TP
 1901/* Generate code to enable conditional branches in functions over 1 MiB, by branching around an unconditional branch to the real destination. */
1902const char *
1903aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1904 const char * branch_format)
1905{
1906 rtx_code_label * tmp_label = gen_label_rtx ();
1907 char label_buf[256];
1908 char buffer[128];
1909 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1910 CODE_LABEL_NUMBER (tmp_label));
1911 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1912 rtx dest_label = operands[pos_label];
1913 operands[pos_label] = tmp_label;
1914
1915 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1916 output_asm_insn (buffer, operands);
1917
1918 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1919 operands[pos_label] = dest_label;
1920 output_asm_insn (buffer, operands);
1921 return "";
1922}
1923
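/* Editorial sketch of the sequence emitted above, assuming the caller passes
   the inverted condition in BRANCH_FORMAT (label names are illustrative):

     cbz   x0, .Lskip37      // short-range inverted branch over the far branch
     b     .Lfar_target      // unconditional branch, range +/-128 MiB
   .Lskip37:
*/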
261fb553 1924void
fc29dfc9 1925aarch64_err_no_fpadvsimd (machine_mode mode)
261fb553 1926{
261fb553 1927 if (TARGET_GENERAL_REGS_ONLY)
fc29dfc9
SE
1928 if (FLOAT_MODE_P (mode))
1929 error ("%qs is incompatible with the use of floating-point types",
1930 "-mgeneral-regs-only");
1931 else
1932 error ("%qs is incompatible with the use of vector types",
1933 "-mgeneral-regs-only");
261fb553 1934 else
fc29dfc9
SE
1935 if (FLOAT_MODE_P (mode))
1936 error ("%qs feature modifier is incompatible with the use of"
1937 " floating-point types", "+nofp");
1938 else
1939 error ("%qs feature modifier is incompatible with the use of"
1940 " vector types", "+nofp");
261fb553
AL
1941}
1942
c0e0174b
RS
1943/* Report when we try to do something that requires SVE when SVE is disabled.
1944 This is an error of last resort and isn't very high-quality. It usually
1945 involves attempts to measure the vector length in some way. */
1946static void
1947aarch64_report_sve_required (void)
1948{
1949 static bool reported_p = false;
1950
1951 /* Avoid reporting a slew of messages for a single oversight. */
1952 if (reported_p)
1953 return;
1954
1955 error ("this operation requires the SVE ISA extension");
1956 inform (input_location, "you can enable SVE using the command-line"
1957 " option %<-march%>, or by using the %<target%>"
1958 " attribute or pragma");
1959 reported_p = true;
1960}
1961
183bfdaf
RS
1962/* Return true if REGNO is P0-P15 or one of the special FFR-related
1963 registers. */
1964inline bool
1965pr_or_ffr_regnum_p (unsigned int regno)
1966{
1967 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1968}
1969
c64f7d37 1970/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2eb2847e
WD
1971 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1972 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1973 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1974 and GENERAL_REGS is lower than the memory cost (in this case the best class
 1975 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1976 cost results in bad allocations with many redundant int<->FP moves which
1977 are expensive on various cores.
1978 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1979 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1980 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1981 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
31e2b5a3
WD
1982 The result of this is that it is no longer inefficient to have a higher
1983 memory move cost than the register move cost.
1984*/
c64f7d37
WD
1985
1986static reg_class_t
31e2b5a3
WD
1987aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1988 reg_class_t best_class)
c64f7d37 1989{
b8506a8a 1990 machine_mode mode;
c64f7d37 1991
67e5c59a
RS
1992 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1993 || !reg_class_subset_p (FP_REGS, allocno_class))
c64f7d37
WD
1994 return allocno_class;
1995
67e5c59a
RS
1996 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1997 || !reg_class_subset_p (FP_REGS, best_class))
31e2b5a3
WD
1998 return best_class;
1999
c64f7d37
WD
2000 mode = PSEUDO_REGNO_MODE (regno);
2001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2002}
2003
26e0ff94 2004static unsigned int
b8506a8a 2005aarch64_min_divisions_for_recip_mul (machine_mode mode)
26e0ff94 2006{
50093a33 2007 if (GET_MODE_UNIT_SIZE (mode) == 4)
b175b679
JG
2008 return aarch64_tune_params.min_div_recip_mul_sf;
2009 return aarch64_tune_params.min_div_recip_mul_df;
26e0ff94
WD
2010}
2011
b5b33e11 2012/* Return the reassociation width of treeop OPC with mode MODE. */
cee66c68 2013static int
b5b33e11 2014aarch64_reassociation_width (unsigned opc, machine_mode mode)
cee66c68
WD
2015{
2016 if (VECTOR_MODE_P (mode))
b175b679 2017 return aarch64_tune_params.vec_reassoc_width;
cee66c68 2018 if (INTEGRAL_MODE_P (mode))
b175b679 2019 return aarch64_tune_params.int_reassoc_width;
b5b33e11
WD
2020 /* Avoid reassociating floating point addition so we emit more FMAs. */
2021 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
b175b679 2022 return aarch64_tune_params.fp_reassoc_width;
cee66c68
WD
2023 return 1;
2024}
2025
43e9d192
IB
2026/* Provide a mapping from gcc register numbers to dwarf register numbers. */
2027unsigned
2028aarch64_dbx_register_number (unsigned regno)
2029{
2030 if (GP_REGNUM_P (regno))
2031 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2032 else if (regno == SP_REGNUM)
2033 return AARCH64_DWARF_SP;
2034 else if (FP_REGNUM_P (regno))
2035 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
43cacb12
RS
2036 else if (PR_REGNUM_P (regno))
2037 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2038 else if (regno == VG_REGNUM)
2039 return AARCH64_DWARF_VG;
43e9d192
IB
2040
2041 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2042 equivalent DWARF register. */
2043 return DWARF_FRAME_REGISTERS;
2044}
2045
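/* Editorial examples of the mapping above (values as defined by the
   AARCH64_DWARF_* macros in aarch64.h):

     aarch64_dbx_register_number (R0_REGNUM) == 0
     aarch64_dbx_register_number (SP_REGNUM) == 31
     aarch64_dbx_register_number (V0_REGNUM) == 64
     aarch64_dbx_register_number (P0_REGNUM) == 48
     aarch64_dbx_register_number (VG_REGNUM) == 46  */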
d29f7dd5
RS
2046/* If X is a CONST_DOUBLE, return its bit representation as a constant
2047 integer, otherwise return X unmodified. */
2048static rtx
2049aarch64_bit_representation (rtx x)
2050{
2051 if (CONST_DOUBLE_P (x))
2052 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2053 return x;
2054}
2055
43cacb12
RS
2056/* Return true if MODE is any of the Advanced SIMD structure modes. */
2057static bool
2058aarch64_advsimd_struct_mode_p (machine_mode mode)
2059{
2060 return (TARGET_SIMD
2061 && (mode == OImode || mode == CImode || mode == XImode));
2062}
2063
2064/* Return true if MODE is an SVE predicate mode. */
2065static bool
2066aarch64_sve_pred_mode_p (machine_mode mode)
2067{
2068 return (TARGET_SVE
2069 && (mode == VNx16BImode
2070 || mode == VNx8BImode
2071 || mode == VNx4BImode
2072 || mode == VNx2BImode));
2073}
2074
2075/* Three mutually-exclusive flags describing a vector or predicate type. */
2076const unsigned int VEC_ADVSIMD = 1;
2077const unsigned int VEC_SVE_DATA = 2;
2078const unsigned int VEC_SVE_PRED = 4;
2079/* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2080 a structure of 2, 3 or 4 vectors. */
2081const unsigned int VEC_STRUCT = 8;
550a3380
RS
2082/* Can be used in combination with VEC_SVE_DATA to indicate that the
2083 vector has fewer significant bytes than a full SVE vector. */
2084const unsigned int VEC_PARTIAL = 16;
43cacb12
RS
2085/* Useful combinations of the above. */
2086const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2087const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2088
2089/* Return a set of flags describing the vector properties of mode MODE.
2090 Ignore modes that are not supported by the current target. */
2091static unsigned int
2092aarch64_classify_vector_mode (machine_mode mode)
2093{
2094 if (aarch64_advsimd_struct_mode_p (mode))
2095 return VEC_ADVSIMD | VEC_STRUCT;
2096
2097 if (aarch64_sve_pred_mode_p (mode))
2098 return VEC_SVE_PRED;
2099
806f69cd
RS
2100 /* Make the decision based on the mode's enum value rather than its
2101 properties, so that we keep the correct classification regardless
2102 of -msve-vector-bits. */
2103 switch (mode)
43cacb12 2104 {
550a3380
RS
2105 /* Partial SVE QI vectors. */
2106 case E_VNx2QImode:
2107 case E_VNx4QImode:
2108 case E_VNx8QImode:
2109 /* Partial SVE HI vectors. */
2110 case E_VNx2HImode:
2111 case E_VNx4HImode:
2112 /* Partial SVE SI vector. */
2113 case E_VNx2SImode:
cc68f7c2
RS
2114 /* Partial SVE HF vectors. */
2115 case E_VNx2HFmode:
2116 case E_VNx4HFmode:
2117 /* Partial SVE SF vector. */
2118 case E_VNx2SFmode:
550a3380
RS
2119 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2120
806f69cd
RS
2121 case E_VNx16QImode:
2122 case E_VNx8HImode:
2123 case E_VNx4SImode:
2124 case E_VNx2DImode:
02fcd8ac 2125 case E_VNx8BFmode:
806f69cd
RS
2126 case E_VNx8HFmode:
2127 case E_VNx4SFmode:
2128 case E_VNx2DFmode:
2129 return TARGET_SVE ? VEC_SVE_DATA : 0;
2130
2131 /* x2 SVE vectors. */
2132 case E_VNx32QImode:
2133 case E_VNx16HImode:
2134 case E_VNx8SImode:
2135 case E_VNx4DImode:
02fcd8ac 2136 case E_VNx16BFmode:
806f69cd
RS
2137 case E_VNx16HFmode:
2138 case E_VNx8SFmode:
2139 case E_VNx4DFmode:
2140 /* x3 SVE vectors. */
2141 case E_VNx48QImode:
2142 case E_VNx24HImode:
2143 case E_VNx12SImode:
2144 case E_VNx6DImode:
02fcd8ac 2145 case E_VNx24BFmode:
806f69cd
RS
2146 case E_VNx24HFmode:
2147 case E_VNx12SFmode:
2148 case E_VNx6DFmode:
2149 /* x4 SVE vectors. */
2150 case E_VNx64QImode:
2151 case E_VNx32HImode:
2152 case E_VNx16SImode:
2153 case E_VNx8DImode:
02fcd8ac 2154 case E_VNx32BFmode:
806f69cd
RS
2155 case E_VNx32HFmode:
2156 case E_VNx16SFmode:
2157 case E_VNx8DFmode:
2158 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2159
2160 /* 64-bit Advanced SIMD vectors. */
2161 case E_V8QImode:
2162 case E_V4HImode:
2163 case E_V2SImode:
2164 /* ...E_V1DImode doesn't exist. */
2165 case E_V4HFmode:
abbe1ed2 2166 case E_V4BFmode:
806f69cd
RS
2167 case E_V2SFmode:
2168 case E_V1DFmode:
2169 /* 128-bit Advanced SIMD vectors. */
2170 case E_V16QImode:
2171 case E_V8HImode:
2172 case E_V4SImode:
2173 case E_V2DImode:
2174 case E_V8HFmode:
abbe1ed2 2175 case E_V8BFmode:
806f69cd
RS
2176 case E_V4SFmode:
2177 case E_V2DFmode:
2178 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2179
2180 default:
2181 return 0;
43cacb12 2182 }
43cacb12
RS
2183}
2184
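/* Editorial examples of the classification above, assuming TARGET_SIMD and
   TARGET_SVE are both enabled:

     V4SImode    -> VEC_ADVSIMD
     OImode      -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode  -> VEC_SVE_DATA
     VNx2SImode  -> VEC_SVE_DATA | VEC_PARTIAL
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT
     VNx4BImode  -> VEC_SVE_PRED  */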
2185/* Return true if MODE is any of the data vector modes, including
2186 structure modes. */
43e9d192 2187static bool
43cacb12 2188aarch64_vector_data_mode_p (machine_mode mode)
43e9d192 2189{
43cacb12 2190 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
43e9d192
IB
2191}
2192
5c38705d
RS
2193/* Return true if MODE is any form of SVE mode, including predicates,
2194 vectors and structures. */
2195bool
2196aarch64_sve_mode_p (machine_mode mode)
2197{
2198 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2199}
2200
43cacb12
RS
2201/* Return true if MODE is an SVE data vector mode; either a single vector
2202 or a structure of vectors. */
43e9d192 2203static bool
43cacb12 2204aarch64_sve_data_mode_p (machine_mode mode)
43e9d192 2205{
43cacb12 2206 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
43e9d192
IB
2207}
2208
550a3380
RS
2209/* Return the number of defined bytes in one constituent vector of
2210 SVE mode MODE, which has vector flags VEC_FLAGS. */
2211static poly_int64
2212aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2213{
2214 if (vec_flags & VEC_PARTIAL)
2215 /* A single partial vector. */
2216 return GET_MODE_SIZE (mode);
2217
2218 if (vec_flags & VEC_SVE_DATA)
2219 /* A single vector or a tuple. */
2220 return BYTES_PER_SVE_VECTOR;
2221
2222 /* A single predicate. */
2223 gcc_assert (vec_flags & VEC_SVE_PRED);
2224 return BYTES_PER_SVE_PRED;
2225}
2226
9f4cbab8
RS
2227/* Implement target hook TARGET_ARRAY_MODE. */
2228static opt_machine_mode
2229aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2230{
2231 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2232 && IN_RANGE (nelems, 2, 4))
2233 return mode_for_vector (GET_MODE_INNER (mode),
2234 GET_MODE_NUNITS (mode) * nelems);
2235
2236 return opt_machine_mode ();
2237}
2238
43e9d192
IB
2239/* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2240static bool
ef4bddc2 2241aarch64_array_mode_supported_p (machine_mode mode,
43e9d192
IB
2242 unsigned HOST_WIDE_INT nelems)
2243{
2244 if (TARGET_SIMD
635e66fe
AL
2245 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2246 || AARCH64_VALID_SIMD_DREG_MODE (mode))
43e9d192
IB
2247 && (nelems >= 2 && nelems <= 4))
2248 return true;
2249
2250 return false;
2251}
2252
cc68f7c2
RS
2253/* MODE is some form of SVE vector mode. For data modes, return the number
2254 of vector register bits that each element of MODE occupies, such as 64
2255 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2256 in a 64-bit container). For predicate modes, return the number of
2257 data bits controlled by each significant predicate bit. */
2258
2259static unsigned int
2260aarch64_sve_container_bits (machine_mode mode)
2261{
2262 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2263 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2264 ? BITS_PER_SVE_VECTOR
2265 : GET_MODE_BITSIZE (mode));
2266 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2267}
2268
43cacb12
RS
2269/* Return the SVE predicate mode to use for elements that have
2270 ELEM_NBYTES bytes, if such a mode exists. */
2271
2272opt_machine_mode
2273aarch64_sve_pred_mode (unsigned int elem_nbytes)
2274{
2275 if (TARGET_SVE)
2276 {
2277 if (elem_nbytes == 1)
2278 return VNx16BImode;
2279 if (elem_nbytes == 2)
2280 return VNx8BImode;
2281 if (elem_nbytes == 4)
2282 return VNx4BImode;
2283 if (elem_nbytes == 8)
2284 return VNx2BImode;
2285 }
2286 return opt_machine_mode ();
2287}
2288
cc68f7c2
RS
2289/* Return the SVE predicate mode that should be used to control
2290 SVE mode MODE. */
2291
2292machine_mode
2293aarch64_sve_pred_mode (machine_mode mode)
2294{
2295 unsigned int bits = aarch64_sve_container_bits (mode);
2296 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2297}
2298
43cacb12
RS
2299/* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2300
2301static opt_machine_mode
10116ec1 2302aarch64_get_mask_mode (machine_mode mode)
43cacb12 2303{
10116ec1
RS
2304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2305 if (vec_flags & VEC_SVE_DATA)
cc68f7c2 2306 return aarch64_sve_pred_mode (mode);
43cacb12 2307
10116ec1 2308 return default_get_mask_mode (mode);
43cacb12
RS
2309}
2310
d7a09c44
RS
2311/* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2312
624d0f07 2313opt_machine_mode
d7a09c44
RS
2314aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2315{
2316 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2317 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2318 machine_mode mode;
2319 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2320 if (inner_mode == GET_MODE_INNER (mode)
2321 && known_eq (nunits, GET_MODE_NUNITS (mode))
2322 && aarch64_sve_data_mode_p (mode))
2323 return mode;
2324 return opt_machine_mode ();
2325}
2326
1044fa32
RS
2327/* Return the integer element mode associated with SVE mode MODE. */
2328
2329static scalar_int_mode
2330aarch64_sve_element_int_mode (machine_mode mode)
2331{
cc68f7c2
RS
2332 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2333 ? BITS_PER_SVE_VECTOR
2334 : GET_MODE_BITSIZE (mode));
2335 unsigned int elt_bits = vector_element_size (vector_bits,
1044fa32
RS
2336 GET_MODE_NUNITS (mode));
2337 return int_mode_for_size (elt_bits, 0).require ();
2338}
2339
cc68f7c2
RS
2340/* Return an integer element mode that contains exactly
2341 aarch64_sve_container_bits (MODE) bits. This is wider than
2342 aarch64_sve_element_int_mode if MODE is a partial vector,
2343 otherwise it's the same. */
2344
2345static scalar_int_mode
2346aarch64_sve_container_int_mode (machine_mode mode)
2347{
2348 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2349}
2350
d7a09c44 2351/* Return the integer vector mode associated with SVE mode MODE.
d083ee47 2352 Unlike related_int_vector_mode, this can handle the case in which
d7a09c44
RS
2353 MODE is a predicate (and thus has a different total size). */
2354
624d0f07 2355machine_mode
d7a09c44
RS
2356aarch64_sve_int_mode (machine_mode mode)
2357{
2358 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2359 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2360}
2361
74166aab
RS
2362/* Implement TARGET_VECTORIZE_RELATED_MODE. */
2363
2364static opt_machine_mode
2365aarch64_vectorize_related_mode (machine_mode vector_mode,
2366 scalar_mode element_mode,
2367 poly_uint64 nunits)
2368{
2369 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2370
cc68f7c2
RS
2371 /* If we're operating on SVE vectors, try to return an SVE mode. */
2372 poly_uint64 sve_nunits;
2373 if ((vec_flags & VEC_SVE_DATA)
2374 && multiple_p (BYTES_PER_SVE_VECTOR,
2375 GET_MODE_SIZE (element_mode), &sve_nunits))
2376 {
2377 machine_mode sve_mode;
2378 if (maybe_ne (nunits, 0U))
2379 {
2380 /* Try to find a full or partial SVE mode with exactly
2381 NUNITS units. */
2382 if (multiple_p (sve_nunits, nunits)
2383 && aarch64_sve_data_mode (element_mode,
2384 nunits).exists (&sve_mode))
2385 return sve_mode;
2386 }
2387 else
2388 {
2389 /* Take the preferred number of units from the number of bytes
2390 that fit in VECTOR_MODE. We always start by "autodetecting"
2391 a full vector mode with preferred_simd_mode, so vectors
2392 chosen here will also be full vector modes. Then
2393 autovectorize_vector_modes tries smaller starting modes
2394 and thus smaller preferred numbers of units. */
2395 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2396 if (aarch64_sve_data_mode (element_mode,
2397 sve_nunits).exists (&sve_mode))
2398 return sve_mode;
2399 }
2400 }
2401
74166aab
RS
2402 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2403 if ((vec_flags & VEC_ADVSIMD)
2404 && known_eq (nunits, 0U)
2405 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2406 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2407 * GET_MODE_NUNITS (vector_mode), 128U))
2408 {
2409 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2410 if (VECTOR_MODE_P (res))
2411 return res;
2412 }
2413
2414 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2415}
2416
b41d1f6e
RS
2417/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2418 prefer to use the first arithmetic operand as the else value if
2419 the else value doesn't matter, since that exactly matches the SVE
2420 destructive merging form. For ternary operations we could either
2421 pick the first operand and use FMAD-like instructions or the last
2422 operand and use FMLA-like instructions; the latter seems more
2423 natural. */
6a86928d
RS
2424
2425static tree
b41d1f6e 2426aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
6a86928d 2427{
b41d1f6e 2428 return nops == 3 ? ops[2] : ops[0];
6a86928d
RS
2429}
2430
c43f4279 2431/* Implement TARGET_HARD_REGNO_NREGS. */
43e9d192 2432
c43f4279 2433static unsigned int
ef4bddc2 2434aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
43e9d192 2435{
6a70badb
RS
2436 /* ??? Logically we should only need to provide a value when
2437 HARD_REGNO_MODE_OK says that the combination is valid,
2438 but at the moment we need to handle all modes. Just ignore
2439 any runtime parts for registers that can't store them. */
2440 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
43e9d192
IB
2441 switch (aarch64_regno_regclass (regno))
2442 {
2443 case FP_REGS:
2444 case FP_LO_REGS:
163b1f6a 2445 case FP_LO8_REGS:
550a3380
RS
2446 {
2447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 if (vec_flags & VEC_SVE_DATA)
2449 return exact_div (GET_MODE_SIZE (mode),
2450 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2451 return CEIL (lowest_size, UNITS_PER_VREG);
2452 }
43cacb12
RS
2453 case PR_REGS:
2454 case PR_LO_REGS:
2455 case PR_HI_REGS:
183bfdaf
RS
2456 case FFR_REGS:
2457 case PR_AND_FFR_REGS:
43cacb12 2458 return 1;
43e9d192 2459 default:
6a70badb 2460 return CEIL (lowest_size, UNITS_PER_WORD);
43e9d192
IB
2461 }
2462 gcc_unreachable ();
2463}
2464
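/* Editorial examples for the function above: V4SImode in an FP register
   needs 1 register, VNx8SImode (an SVE x2 tuple) needs 2 FP registers,
   and TImode in a general register needs 2 registers (CEIL (16, 8)).  */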
f939c3e6 2465/* Implement TARGET_HARD_REGNO_MODE_OK. */
43e9d192 2466
f939c3e6 2467static bool
ef4bddc2 2468aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
43e9d192
IB
2469{
2470 if (GET_MODE_CLASS (mode) == MODE_CC)
2471 return regno == CC_REGNUM;
2472
43cacb12
RS
2473 if (regno == VG_REGNUM)
2474 /* This must have the same size as _Unwind_Word. */
2475 return mode == DImode;
2476
2477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2478 if (vec_flags & VEC_SVE_PRED)
183bfdaf 2479 return pr_or_ffr_regnum_p (regno);
43cacb12 2480
183bfdaf
RS
2481 if (pr_or_ffr_regnum_p (regno))
2482 return false;
43cacb12 2483
9259db42
YZ
2484 if (regno == SP_REGNUM)
2485 /* The purpose of comparing with ptr_mode is to support the
2486 global register variable associated with the stack pointer
2487 register via the syntax of asm ("wsp") in ILP32. */
2488 return mode == Pmode || mode == ptr_mode;
2489
2490 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
43e9d192
IB
2491 return mode == Pmode;
2492
563cc649
RH
2493 if (GP_REGNUM_P (regno))
2494 {
aa1a2795
RS
2495 if (vec_flags & VEC_ANY_SVE)
2496 return false;
563cc649
RH
2497 if (known_le (GET_MODE_SIZE (mode), 8))
2498 return true;
aa1a2795 2499 if (known_le (GET_MODE_SIZE (mode), 16))
563cc649
RH
2500 return (regno & 1) == 0;
2501 }
2502 else if (FP_REGNUM_P (regno))
43e9d192 2503 {
43cacb12 2504 if (vec_flags & VEC_STRUCT)
4edd6298 2505 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
43e9d192 2506 else
43cacb12 2507 return !VECTOR_MODE_P (mode) || vec_flags != 0;
43e9d192
IB
2508 }
2509
f939c3e6 2510 return false;
43e9d192
IB
2511}
2512
c600df9a
RS
2513/* Return true if a function with type FNTYPE returns its value in
2514 SVE vector or predicate registers. */
2515
2516static bool
2517aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2518{
c600df9a 2519 tree return_type = TREE_TYPE (fntype);
38e62001
RS
2520
2521 pure_scalable_type_info pst_info;
2522 switch (pst_info.analyze (return_type))
2523 {
2524 case pure_scalable_type_info::IS_PST:
2525 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2526 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2527
2528 case pure_scalable_type_info::DOESNT_MATTER:
2529 gcc_assert (aarch64_return_in_memory_1 (return_type));
2530 return false;
2531
2532 case pure_scalable_type_info::NO_ABI_IDENTITY:
2533 case pure_scalable_type_info::ISNT_PST:
2534 return false;
2535 }
2536 gcc_unreachable ();
c600df9a
RS
2537}
2538
2539/* Return true if a function with type FNTYPE takes arguments in
2540 SVE vector or predicate registers. */
2541
2542static bool
2543aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2544{
2545 CUMULATIVE_ARGS args_so_far_v;
2546 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2547 NULL_TREE, 0, true);
2548 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2549
2550 for (tree chain = TYPE_ARG_TYPES (fntype);
2551 chain && chain != void_list_node;
2552 chain = TREE_CHAIN (chain))
2553 {
2554 tree arg_type = TREE_VALUE (chain);
2555 if (arg_type == error_mark_node)
2556 return false;
2557
2558 function_arg_info arg (arg_type, /*named=*/true);
2559 apply_pass_by_reference_rules (&args_so_far_v, arg);
38e62001
RS
2560 pure_scalable_type_info pst_info;
2561 if (pst_info.analyze_registers (arg.type))
2562 {
2563 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2564 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2565 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2566 return true;
2567 }
c600df9a
RS
2568
2569 targetm.calls.function_arg_advance (args_so_far, arg);
2570 }
2571 return false;
2572}
2573
002ffd3c
RS
2574/* Implement TARGET_FNTYPE_ABI. */
2575
2576static const predefined_function_abi &
2577aarch64_fntype_abi (const_tree fntype)
2578{
2579 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2580 return aarch64_simd_abi ();
c600df9a
RS
2581
2582 if (aarch64_returns_value_in_sve_regs_p (fntype)
2583 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2584 return aarch64_sve_abi ();
2585
002ffd3c
RS
2586 return default_function_abi;
2587}
2588
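/* Editorial example (hypothetical prototypes): a function declared as

     svint32_t f (svint32_t, svbool_t);

   passes and returns values in SVE registers and therefore selects
   aarch64_sve_abi above, whereas uint32x4_t g (uint32x4_t) stays on the
   default ABI unless it carries the aarch64_vector_pcs attribute.  */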
482b2b43
RS
2589/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2590
2591static bool
2592aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2593{
2594 return (aarch64_sve::builtin_type_p (type1)
2595 == aarch64_sve::builtin_type_p (type2));
2596}
2597
c600df9a 2598/* Return true if we should emit CFI for register REGNO. */
a0d0b980
SE
2599
2600static bool
c600df9a 2601aarch64_emit_cfi_for_reg_p (unsigned int regno)
a0d0b980 2602{
c600df9a
RS
2603 return (GP_REGNUM_P (regno)
2604 || !default_function_abi.clobbers_full_reg_p (regno));
a0d0b980
SE
2605}
2606
c600df9a 2607/* Return the mode we should use to save and restore register REGNO. */
a0d0b980
SE
2608
2609static machine_mode
c600df9a 2610aarch64_reg_save_mode (unsigned int regno)
a0d0b980 2611{
c600df9a
RS
2612 if (GP_REGNUM_P (regno))
2613 return DImode;
2614
2615 if (FP_REGNUM_P (regno))
2616 switch (crtl->abi->id ())
2617 {
2618 case ARM_PCS_AAPCS64:
2619 /* Only the low 64 bits are saved by the base PCS. */
2620 return DFmode;
2621
2622 case ARM_PCS_SIMD:
2623 /* The vector PCS saves the low 128 bits (which is the full
2624 register on non-SVE targets). */
2625 return TFmode;
2626
2627 case ARM_PCS_SVE:
2628 /* Use vectors of DImode for registers that need frame
 2629 information, so that the first 64 bits of the save slot
2630 are always the equivalent of what storing D<n> would give. */
2631 if (aarch64_emit_cfi_for_reg_p (regno))
2632 return VNx2DImode;
2633
2634 /* Use vectors of bytes otherwise, so that the layout is
2635 endian-agnostic, and so that we can use LDR and STR for
2636 big-endian targets. */
2637 return VNx16QImode;
2638
2639 case ARM_PCS_TLSDESC:
2640 case ARM_PCS_UNKNOWN:
2641 break;
2642 }
2643
2644 if (PR_REGNUM_P (regno))
2645 /* Save the full predicate register. */
2646 return VNx16BImode;
2647
2648 gcc_unreachable ();
a0d0b980
SE
2649}
2650
5a5a3bc5 2651/* Implement TARGET_INSN_CALLEE_ABI. */
b3650d40 2652
5a5a3bc5
RS
2653const predefined_function_abi &
2654aarch64_insn_callee_abi (const rtx_insn *insn)
b3650d40 2655{
08cc4d92
RS
2656 rtx pat = PATTERN (insn);
2657 gcc_assert (GET_CODE (pat) == PARALLEL);
2658 rtx unspec = XVECEXP (pat, 0, 1);
2659 gcc_assert (GET_CODE (unspec) == UNSPEC
2660 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2661 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
b3650d40
SE
2662}
2663
80ec73f4
RS
2664/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2665 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2666 clobbers the top 64 bits when restoring the bottom 64 bits. */
2667
2668static bool
6ee2cc70
RS
2669aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2670 unsigned int regno,
473574ee 2671 machine_mode mode)
80ec73f4 2672{
c600df9a 2673 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
51051f47 2674 {
51051f47
RS
2675 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2676 unsigned int nregs = hard_regno_nregs (regno, mode);
2677 if (nregs > 1)
2678 per_register_size = exact_div (per_register_size, nregs);
bb6ce448
RS
2679 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2680 return maybe_gt (per_register_size, 16);
2681 return maybe_gt (per_register_size, 8);
51051f47
RS
2682 }
2683 return false;
473574ee
SE
2684}
2685
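/* Editorial examples for the function above: for a base-PCS call
   (ARM_PCS_AAPCS64), a TFmode value in v8 is partially clobbered
   (16 bytes > the 8 preserved bytes); for an ARM_PCS_SIMD callee the same
   value survives (16 is not > 16); and for ARM_PCS_SVE the function
   returns false for all FP registers.  */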
43cacb12
RS
2686/* Implement REGMODE_NATURAL_SIZE. */
2687poly_uint64
2688aarch64_regmode_natural_size (machine_mode mode)
2689{
2690 /* The natural size for SVE data modes is one SVE data vector,
2691 and similarly for predicates. We can't independently modify
2692 anything smaller than that. */
2693 /* ??? For now, only do this for variable-width SVE registers.
2694 Doing it for constant-sized registers breaks lower-subreg.c. */
2695 /* ??? And once that's fixed, we should probably have similar
2696 code for Advanced SIMD. */
2697 if (!aarch64_sve_vg.is_constant ())
2698 {
2699 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2700 if (vec_flags & VEC_SVE_PRED)
2701 return BYTES_PER_SVE_PRED;
2702 if (vec_flags & VEC_SVE_DATA)
2703 return BYTES_PER_SVE_VECTOR;
2704 }
2705 return UNITS_PER_WORD;
2706}
2707
73d9ac6a 2708/* Implement HARD_REGNO_CALLER_SAVE_MODE. */
ef4bddc2 2709machine_mode
43cacb12
RS
2710aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2711 machine_mode mode)
2712{
2713 /* The predicate mode determines which bits are significant and
2714 which are "don't care". Decreasing the number of lanes would
2715 lose data while increasing the number of lanes would make bits
2716 unnecessarily significant. */
2717 if (PR_REGNUM_P (regno))
2718 return mode;
6a70badb
RS
2719 if (known_ge (GET_MODE_SIZE (mode), 4))
2720 return mode;
73d9ac6a 2721 else
6a70badb 2722 return SImode;
73d9ac6a
IB
2723}
2724
231c52ae
ST
2725/* Return true if I's bits are consecutive ones from the MSB. */
2726bool
2727aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2728{
2729 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2730}
2731
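/* Editorial examples: for I = 0xFFFFFFFFFFFFF000, -I == 0x1000 and
   exact_log2 (-I) == 12, so the test above returns true; for
   I = 0xFF00000000000001 the negation is not a power of two and the
   result is false.  I == 0 also yields false, since exact_log2 (0) == -1.  */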
58e17cf8
RS
2732/* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2733 that strcpy from constants will be faster. */
2734
2735static HOST_WIDE_INT
2736aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2737{
2738 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2739 return MAX (align, BITS_PER_WORD);
2740 return align;
2741}
2742
43e9d192
IB
2743/* Return true if calls to DECL should be treated as
2744 long-calls (ie called via a register). */
2745static bool
2746aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2747{
2748 return false;
2749}
2750
2751/* Return true if calls to symbol-ref SYM should be treated as
2752 long-calls (ie called via a register). */
2753bool
2754aarch64_is_long_call_p (rtx sym)
2755{
2756 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2757}
2758
b60d63cb
JW
2759/* Return true if calls to symbol-ref SYM should not go through
2760 plt stubs. */
2761
2762bool
2763aarch64_is_noplt_call_p (rtx sym)
2764{
2765 const_tree decl = SYMBOL_REF_DECL (sym);
2766
2767 if (flag_pic
2768 && decl
2769 && (!flag_plt
2770 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2771 && !targetm.binds_local_p (decl))
2772 return true;
2773
2774 return false;
2775}
2776
43e9d192
IB
2777/* Return true if the offsets to a zero/sign-extract operation
2778 represent an expression that matches an extend operation. The
71837f64 2779 operands represent the parameters from
43e9d192 2780
4745e701 2781 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
43e9d192 2782bool
77e994c9 2783aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
43e9d192
IB
2784 rtx extract_imm)
2785{
2786 HOST_WIDE_INT mult_val, extract_val;
2787
2788 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2789 return false;
2790
2791 mult_val = INTVAL (mult_imm);
2792 extract_val = INTVAL (extract_imm);
2793
2794 if (extract_val > 8
2795 && extract_val < GET_MODE_BITSIZE (mode)
2796 && exact_log2 (extract_val & ~7) > 0
2797 && (extract_val & 7) <= 4
2798 && mult_val == (1 << (extract_val & 7)))
2799 return true;
2800
2801 return false;
2802}
2803
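/* Editorial example (MODE == DImode): MULT_IMM == 4 and EXTRACT_IMM == 34
   pass the checks above (34 & ~7 == 32 is a power of two, 34 & 7 == 2 <= 4,
   and 4 == 1 << 2), i.e. the extract describes a 32-bit value extended and
   shifted left by 2.  MULT_IMM == 2 with EXTRACT_IMM == 32 fails, because
   the required multiplier there is 1 << 0 == 1.  */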
2804/* Emit an insn that's a simple single-set. Both the operands must be
2805 known to be valid. */
827ab47a 2806inline static rtx_insn *
43e9d192
IB
2807emit_set_insn (rtx x, rtx y)
2808{
f7df4a84 2809 return emit_insn (gen_rtx_SET (x, y));
43e9d192
IB
2810}
2811
2812/* X and Y are two things to compare using CODE. Emit the compare insn and
2813 return the rtx for register 0 in the proper mode. */
2814rtx
2815aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2816{
4a2095eb
RH
2817 machine_mode cmp_mode = GET_MODE (x);
2818 machine_mode cc_mode;
2819 rtx cc_reg;
43e9d192 2820
4a2095eb
RH
2821 if (cmp_mode == TImode)
2822 {
2823 gcc_assert (code == NE);
2824
2825 cc_mode = CCmode;
2826 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2827
2828 rtx x_lo = operand_subword (x, 0, 0, TImode);
2829 rtx y_lo = operand_subword (y, 0, 0, TImode);
2830 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2831
2832 rtx x_hi = operand_subword (x, 1, 0, TImode);
2833 rtx y_hi = operand_subword (y, 1, 0, TImode);
865257c4
RS
2834 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2835 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2836 GEN_INT (AARCH64_EQ)));
4a2095eb
RH
2837 }
2838 else
2839 {
2840 cc_mode = SELECT_CC_MODE (code, x, y);
2841 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2842 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2843 }
43e9d192
IB
2844 return cc_reg;
2845}
2846
d400fda3
RH
2847/* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2848
2849static rtx
2850aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2851 machine_mode y_mode)
2852{
2853 if (y_mode == E_QImode || y_mode == E_HImode)
2854 {
2855 if (CONST_INT_P (y))
df562b12
JJ
2856 {
2857 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2858 y_mode = SImode;
2859 }
d400fda3
RH
2860 else
2861 {
2862 rtx t, cc_reg;
2863 machine_mode cc_mode;
2864
2865 t = gen_rtx_ZERO_EXTEND (SImode, y);
2866 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2867 cc_mode = CC_SWPmode;
2868 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2869 emit_set_insn (cc_reg, t);
2870 return cc_reg;
2871 }
2872 }
2873
846f78d4
PK
2874 if (!aarch64_plus_operand (y, y_mode))
2875 y = force_reg (y_mode, y);
2876
d400fda3
RH
2877 return aarch64_gen_compare_reg (code, x, y);
2878}
2879
43e9d192
IB
2880/* Build the SYMBOL_REF for __tls_get_addr. */
2881
2882static GTY(()) rtx tls_get_addr_libfunc;
2883
2884rtx
2885aarch64_tls_get_addr (void)
2886{
2887 if (!tls_get_addr_libfunc)
2888 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2889 return tls_get_addr_libfunc;
2890}
2891
2892/* Return the TLS model to use for ADDR. */
2893
2894static enum tls_model
2895tls_symbolic_operand_type (rtx addr)
2896{
2897 enum tls_model tls_kind = TLS_MODEL_NONE;
43e9d192
IB
2898 if (GET_CODE (addr) == CONST)
2899 {
6a70badb
RS
2900 poly_int64 addend;
2901 rtx sym = strip_offset (addr, &addend);
43e9d192
IB
2902 if (GET_CODE (sym) == SYMBOL_REF)
2903 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2904 }
2905 else if (GET_CODE (addr) == SYMBOL_REF)
2906 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2907
2908 return tls_kind;
2909}
2910
 2911/* We allow lo_sum expressions in our legitimate addresses so that
 2912 combine can take care of combining addresses where necessary, but
 2913 for generation purposes, we generate the address
 2914 as:
2915 RTL Absolute
2916 tmp = hi (symbol_ref); adrp x1, foo
2917 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2918 nop
2919
2920 PIC TLS
2921 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2922 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2923 bl __tls_get_addr
2924 nop
2925
2926 Load TLS symbol, depending on TLS mechanism and TLS access model.
2927
2928 Global Dynamic - Traditional TLS:
2929 adrp tmp, :tlsgd:imm
2930 add dest, tmp, #:tlsgd_lo12:imm
2931 bl __tls_get_addr
2932
2933 Global Dynamic - TLS Descriptors:
2934 adrp dest, :tlsdesc:imm
2935 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2936 add dest, dest, #:tlsdesc_lo12:imm
2937 blr tmp
2938 mrs tp, tpidr_el0
2939 add dest, dest, tp
2940
2941 Initial Exec:
2942 mrs tp, tpidr_el0
2943 adrp tmp, :gottprel:imm
2944 ldr dest, [tmp, #:gottprel_lo12:imm]
2945 add dest, dest, tp
2946
2947 Local Exec:
2948 mrs tp, tpidr_el0
0699caae
RL
2949 add t0, tp, #:tprel_hi12:imm, lsl #12
2950 add t0, t0, #:tprel_lo12_nc:imm
43e9d192
IB
2951*/
2952
2953static void
2954aarch64_load_symref_appropriately (rtx dest, rtx imm,
2955 enum aarch64_symbol_type type)
2956{
2957 switch (type)
2958 {
2959 case SYMBOL_SMALL_ABSOLUTE:
2960 {
28514dda 2961 /* In ILP32, the mode of dest can be either SImode or DImode. */
43e9d192 2962 rtx tmp_reg = dest;
ef4bddc2 2963 machine_mode mode = GET_MODE (dest);
28514dda
YZ
2964
2965 gcc_assert (mode == Pmode || mode == ptr_mode);
2966
43e9d192 2967 if (can_create_pseudo_p ())
28514dda 2968 tmp_reg = gen_reg_rtx (mode);
43e9d192 2969
28514dda 2970 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
43e9d192
IB
2971 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2972 return;
2973 }
2974
a5350ddc 2975 case SYMBOL_TINY_ABSOLUTE:
f7df4a84 2976 emit_insn (gen_rtx_SET (dest, imm));
a5350ddc
CSS
2977 return;
2978
1b1e81f8
JW
2979 case SYMBOL_SMALL_GOT_28K:
2980 {
2981 machine_mode mode = GET_MODE (dest);
2982 rtx gp_rtx = pic_offset_table_rtx;
53021678
JW
2983 rtx insn;
2984 rtx mem;
1b1e81f8
JW
2985
2986 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
 2987 here before rtl expansion. Tree IVOPT will generate an rtl pattern to
 2988 decide rtx costs, in which case pic_offset_table_rtx is not
 2989 initialized. In that case there is no need to generate the first adrp
026c3cfd 2990 instruction as the final cost for global variable access is
1b1e81f8
JW
2991 one instruction. */
2992 if (gp_rtx != NULL)
2993 {
 2994 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 2995 use the page base as the GOT base, the first page may be wasted;
 2996 in the worst scenario there is only 28K of space for the GOT).
2997
 2998 The generated instruction sequence for accessing a global variable
2999 is:
3000
a3957742 3001 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1b1e81f8
JW
3002
 3003 Only one instruction is needed. But we must initialize
 3004 pic_offset_table_rtx properly. We generate an initialization insn for
 3005 every global access, and allow CSE to remove all redundant ones.
3006
3007 The final instruction sequences will look like the following
 3008 for multiple global variable accesses.
3009
a3957742 3010 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1b1e81f8 3011
a3957742
JW
3012 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3013 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3014 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3015 ... */
1b1e81f8
JW
3016
3017 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3018 crtl->uses_pic_offset_table = 1;
3019 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3020
3021 if (mode != GET_MODE (gp_rtx))
4ba8f0a3
AP
3022 gp_rtx = gen_lowpart (mode, gp_rtx);
3023
1b1e81f8
JW
3024 }
3025
3026 if (mode == ptr_mode)
3027 {
3028 if (mode == DImode)
53021678 3029 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1b1e81f8 3030 else
53021678
JW
3031 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3032
3033 mem = XVECEXP (SET_SRC (insn), 0, 0);
1b1e81f8
JW
3034 }
3035 else
3036 {
3037 gcc_assert (mode == Pmode);
53021678
JW
3038
3039 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3040 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1b1e81f8
JW
3041 }
3042
53021678
JW
3043 /* The operand is expected to be MEM. Whenever the related insn
 3044 pattern is changed, the code above that calculates MEM should be
3045 updated. */
3046 gcc_assert (GET_CODE (mem) == MEM);
3047 MEM_READONLY_P (mem) = 1;
3048 MEM_NOTRAP_P (mem) = 1;
3049 emit_insn (insn);
1b1e81f8
JW
3050 return;
3051 }
3052
6642bdb4 3053 case SYMBOL_SMALL_GOT_4G:
43e9d192 3054 {
28514dda
YZ
3055 /* In ILP32, the mode of dest can be either SImode or DImode,
3056 while the got entry is always of SImode size. The mode of
3057 dest depends on how dest is used: if dest is assigned to a
3058 pointer (e.g. in the memory), it has SImode; it may have
 3059 DImode if dest is dereferenced to access the memory.
3060 This is why we have to handle three different ldr_got_small
3061 patterns here (two patterns for ILP32). */
53021678
JW
3062
3063 rtx insn;
3064 rtx mem;
43e9d192 3065 rtx tmp_reg = dest;
ef4bddc2 3066 machine_mode mode = GET_MODE (dest);
28514dda 3067
43e9d192 3068 if (can_create_pseudo_p ())
28514dda
YZ
3069 tmp_reg = gen_reg_rtx (mode);
3070
3071 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3072 if (mode == ptr_mode)
3073 {
3074 if (mode == DImode)
53021678 3075 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
28514dda 3076 else
53021678
JW
3077 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3078
3079 mem = XVECEXP (SET_SRC (insn), 0, 0);
28514dda
YZ
3080 }
3081 else
3082 {
3083 gcc_assert (mode == Pmode);
53021678
JW
3084
3085 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3086 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
28514dda
YZ
3087 }
3088
53021678
JW
3089 gcc_assert (GET_CODE (mem) == MEM);
3090 MEM_READONLY_P (mem) = 1;
3091 MEM_NOTRAP_P (mem) = 1;
3092 emit_insn (insn);
43e9d192
IB
3093 return;
3094 }
3095
3096 case SYMBOL_SMALL_TLSGD:
3097 {
5d8a22a5 3098 rtx_insn *insns;
87ca615a
AP
3099 /* The return type of __tls_get_addr is the C pointer type
3100 so use ptr_mode. */
3101 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3102 rtx tmp_reg = dest;
3103
3104 if (GET_MODE (dest) != ptr_mode)
3105 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
43e9d192
IB
3106
3107 start_sequence ();
87ca615a 3108 if (ptr_mode == SImode)
23b88fda
N
3109 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3110 else
3111 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
43e9d192
IB
3112 insns = get_insns ();
3113 end_sequence ();
3114
3115 RTL_CONST_CALL_P (insns) = 1;
87ca615a
AP
3116 emit_libcall_block (insns, tmp_reg, result, imm);
3117 /* Convert back to the mode of the dest adding a zero_extend
3118 from SImode (ptr_mode) to DImode (Pmode). */
3119 if (dest != tmp_reg)
3120 convert_move (dest, tmp_reg, true);
43e9d192
IB
3121 return;
3122 }
3123
3124 case SYMBOL_SMALL_TLSDESC:
3125 {
ef4bddc2 3126 machine_mode mode = GET_MODE (dest);
621ad2de 3127 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
43e9d192
IB
3128 rtx tp;
3129
621ad2de
AP
3130 gcc_assert (mode == Pmode || mode == ptr_mode);
3131
2876a13f
JW
3132 /* In ILP32, the got entry is always of SImode size. Unlike
3133 small GOT, the dest is fixed at reg 0. */
3134 if (TARGET_ILP32)
3135 emit_insn (gen_tlsdesc_small_si (imm));
621ad2de 3136 else
2876a13f 3137 emit_insn (gen_tlsdesc_small_di (imm));
43e9d192 3138 tp = aarch64_load_tp (NULL);
621ad2de
AP
3139
3140 if (mode != Pmode)
3141 tp = gen_lowpart (mode, tp);
3142
2876a13f 3143 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
241dbd9d
QZ
3144 if (REG_P (dest))
3145 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3146 return;
3147 }
3148
79496620 3149 case SYMBOL_SMALL_TLSIE:
43e9d192 3150 {
621ad2de
AP
3151 /* In ILP32, the mode of dest can be either SImode or DImode,
3152 while the got entry is always of SImode size. The mode of
3153 dest depends on how dest is used: if dest is assigned to a
3154 pointer (e.g. in the memory), it has SImode; it may have
 3155 DImode if dest is dereferenced to access the memory.
3156 This is why we have to handle three different tlsie_small
3157 patterns here (two patterns for ILP32). */
ef4bddc2 3158 machine_mode mode = GET_MODE (dest);
621ad2de 3159 rtx tmp_reg = gen_reg_rtx (mode);
43e9d192 3160 rtx tp = aarch64_load_tp (NULL);
621ad2de
AP
3161
3162 if (mode == ptr_mode)
3163 {
3164 if (mode == DImode)
3165 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3166 else
3167 {
3168 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3169 tp = gen_lowpart (mode, tp);
3170 }
3171 }
3172 else
3173 {
3174 gcc_assert (mode == Pmode);
3175 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3176 }
3177
f7df4a84 3178 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
241dbd9d
QZ
3179 if (REG_P (dest))
3180 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3181 return;
3182 }
3183
cbf5629e 3184 case SYMBOL_TLSLE12:
d18ba284 3185 case SYMBOL_TLSLE24:
cbf5629e
JW
3186 case SYMBOL_TLSLE32:
3187 case SYMBOL_TLSLE48:
43e9d192 3188 {
cbf5629e 3189 machine_mode mode = GET_MODE (dest);
43e9d192 3190 rtx tp = aarch64_load_tp (NULL);
e6f7f0e9 3191
cbf5629e
JW
3192 if (mode != Pmode)
3193 tp = gen_lowpart (mode, tp);
3194
3195 switch (type)
3196 {
3197 case SYMBOL_TLSLE12:
3198 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3199 (dest, tp, imm));
3200 break;
3201 case SYMBOL_TLSLE24:
3202 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3203 (dest, tp, imm));
3204 break;
3205 case SYMBOL_TLSLE32:
3206 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3207 (dest, imm));
3208 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3209 (dest, dest, tp));
3210 break;
3211 case SYMBOL_TLSLE48:
3212 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3213 (dest, imm));
3214 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3215 (dest, dest, tp));
3216 break;
3217 default:
3218 gcc_unreachable ();
3219 }
e6f7f0e9 3220
241dbd9d
QZ
3221 if (REG_P (dest))
3222 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
43e9d192
IB
3223 return;
3224 }
3225
87dd8ab0 3226 case SYMBOL_TINY_GOT:
d91480de
D
3227 {
3228 rtx insn;
3229 machine_mode mode = GET_MODE (dest);
3230
3231 if (mode == ptr_mode)
3232 insn = gen_ldr_got_tiny (mode, dest, imm);
3233 else
3234 {
3235 gcc_assert (mode == Pmode);
3236 insn = gen_ldr_got_tiny_sidi (dest, imm);
3237 }
3238
3239 emit_insn (insn);
3240 return;
3241 }
87dd8ab0 3242
5ae7caad
JW
3243 case SYMBOL_TINY_TLSIE:
3244 {
3245 machine_mode mode = GET_MODE (dest);
3246 rtx tp = aarch64_load_tp (NULL);
3247
3248 if (mode == ptr_mode)
3249 {
3250 if (mode == DImode)
3251 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3252 else
3253 {
3254 tp = gen_lowpart (mode, tp);
3255 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3256 }
3257 }
3258 else
3259 {
3260 gcc_assert (mode == Pmode);
3261 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3262 }
3263
241dbd9d
QZ
3264 if (REG_P (dest))
3265 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
5ae7caad
JW
3266 return;
3267 }
3268
43e9d192
IB
3269 default:
3270 gcc_unreachable ();
3271 }
3272}
3273
3274/* Emit a move from SRC to DEST. Assume that the move expanders can
3275 handle all moves if !can_create_pseudo_p (). The distinction is
3276 important because, unlike emit_move_insn, the move expanders know
3277 how to force Pmode objects into the constant pool even when the
3278 constant pool address is not itself legitimate. */
3279static rtx
3280aarch64_emit_move (rtx dest, rtx src)
3281{
3282 return (can_create_pseudo_p ()
3283 ? emit_move_insn (dest, src)
3284 : emit_move_insn_1 (dest, src));
3285}
3286
f22d7973
RS
3287/* Apply UNOPTAB to OP and store the result in DEST. */
3288
3289static void
3290aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3291{
3292 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3293 if (dest != tmp)
3294 emit_move_insn (dest, tmp);
3295}
3296
3297/* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3298
3299static void
3300aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3301{
3302 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3303 OPTAB_DIRECT);
3304 if (dest != tmp)
3305 emit_move_insn (dest, tmp);
3306}
3307
030d03b8
RE
3308/* Split a 128-bit move operation into two 64-bit move operations,
3309 taking care to handle partial overlap of register to register
3310 copies. Special cases are needed when moving between GP regs and
3311 FP regs. SRC can be a register, constant or memory; DST a register
3312 or memory. If either operand is memory it must not have any side
3313 effects. */
43e9d192
IB
3314void
3315aarch64_split_128bit_move (rtx dst, rtx src)
3316{
030d03b8
RE
3317 rtx dst_lo, dst_hi;
3318 rtx src_lo, src_hi;
43e9d192 3319
ef4bddc2 3320 machine_mode mode = GET_MODE (dst);
12dc6974 3321
030d03b8
RE
3322 gcc_assert (mode == TImode || mode == TFmode);
3323 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3324 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
43e9d192
IB
3325
3326 if (REG_P (dst) && REG_P (src))
3327 {
030d03b8
RE
3328 int src_regno = REGNO (src);
3329 int dst_regno = REGNO (dst);
43e9d192 3330
030d03b8 3331 /* Handle FP <-> GP regs. */
43e9d192
IB
3332 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3333 {
030d03b8
RE
3334 src_lo = gen_lowpart (word_mode, src);
3335 src_hi = gen_highpart (word_mode, src);
3336
0016d8d9
RS
3337 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3338 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
030d03b8 3339 return;
43e9d192
IB
3340 }
3341 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3342 {
030d03b8
RE
3343 dst_lo = gen_lowpart (word_mode, dst);
3344 dst_hi = gen_highpart (word_mode, dst);
3345
0016d8d9
RS
3346 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3347 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
030d03b8 3348 return;
43e9d192 3349 }
43e9d192
IB
3350 }
3351
030d03b8
RE
3352 dst_lo = gen_lowpart (word_mode, dst);
3353 dst_hi = gen_highpart (word_mode, dst);
3354 src_lo = gen_lowpart (word_mode, src);
3355 src_hi = gen_highpart_mode (word_mode, mode, src);
3356
3357 /* At most one pairing may overlap. */
3358 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3359 {
3360 aarch64_emit_move (dst_hi, src_hi);
3361 aarch64_emit_move (dst_lo, src_lo);
3362 }
3363 else
3364 {
3365 aarch64_emit_move (dst_lo, src_lo);
3366 aarch64_emit_move (dst_hi, src_hi);
3367 }
43e9d192
IB
3368}
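/* A minimal standalone sketch of the overlap rule used above, assuming
   plain C storage rather than rtl registers; all names here are
   hypothetical and only illustrate why the copy order matters.  */
static void
copy_doubleword (long *dst_lo, long *dst_hi,
                 const long *src_lo, const long *src_hi)
{
  if (dst_lo == src_hi)
    {
      /* Writing the low half first would clobber the high source,
         so move the high half first.  */
      *dst_hi = *src_hi;
      *dst_lo = *src_lo;
    }
  else
    {
      /* Either there is no overlap or only the high destination aliases
         the low source, so the low half can safely go first.  */
      *dst_lo = *src_lo;
      *dst_hi = *src_hi;
    }
}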
3369
3370bool
3371aarch64_split_128bit_move_p (rtx dst, rtx src)
3372{
3373 return (! REG_P (src)
3374 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3375}
3376
8b033a8a
SN
3377/* Split a complex SIMD combine. */
3378
3379void
3380aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3381{
ef4bddc2
RS
3382 machine_mode src_mode = GET_MODE (src1);
3383 machine_mode dst_mode = GET_MODE (dst);
8b033a8a
SN
3384
3385 gcc_assert (VECTOR_MODE_P (dst_mode));
a977dc0c
MC
3386 gcc_assert (register_operand (dst, dst_mode)
3387 && register_operand (src1, src_mode)
3388 && register_operand (src2, src_mode));
8b033a8a 3389
0016d8d9 3390 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
a977dc0c 3391 return;
8b033a8a
SN
3392}
3393
fd4842cd
SN
3394/* Split a complex SIMD move. */
3395
3396void
3397aarch64_split_simd_move (rtx dst, rtx src)
3398{
ef4bddc2
RS
3399 machine_mode src_mode = GET_MODE (src);
3400 machine_mode dst_mode = GET_MODE (dst);
fd4842cd
SN
3401
3402 gcc_assert (VECTOR_MODE_P (dst_mode));
3403
3404 if (REG_P (dst) && REG_P (src))
3405 {
3406 gcc_assert (VECTOR_MODE_P (src_mode));
0016d8d9 3407 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
fd4842cd
SN
3408 }
3409}
3410
ef22810a
RH
3411bool
3412aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3413 machine_mode ymode, rtx y)
3414{
3415 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3416 gcc_assert (r != NULL);
3417 return rtx_equal_p (x, r);
3418}
ef22810a 3419
678faefc
RS
3420/* Return TARGET if it is nonnull and a register of mode MODE.
3421 Otherwise, return a fresh register of mode MODE if we can,
3422 or TARGET reinterpreted as MODE if we can't. */
3423
3424static rtx
3425aarch64_target_reg (rtx target, machine_mode mode)
3426{
3427 if (target && REG_P (target) && GET_MODE (target) == mode)
3428 return target;
3429 if (!can_create_pseudo_p ())
3430 {
3431 gcc_assert (target);
3432 return gen_lowpart (mode, target);
3433 }
3434 return gen_reg_rtx (mode);
3435}
3436
3437/* Return a register that contains the constant in BUILDER, given that
3438 the constant is a legitimate move operand. Use TARGET as the register
3439 if it is nonnull and convenient. */
3440
3441static rtx
3442aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3443{
3444 rtx src = builder.build ();
3445 target = aarch64_target_reg (target, GET_MODE (src));
3446 emit_insn (gen_rtx_SET (target, src));
3447 return target;
3448}
3449
43e9d192 3450static rtx
ef4bddc2 3451aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
43e9d192
IB
3452{
3453 if (can_create_pseudo_p ())
e18b4a81 3454 return force_reg (mode, value);
43e9d192
IB
3455 else
3456 {
f5470a77
RS
3457 gcc_assert (x);
3458 aarch64_emit_move (x, value);
43e9d192
IB
3459 return x;
3460 }
3461}
3462
0b1fe8cf
RS
3463/* Return true if predicate value X is a constant in which every element
3464 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3465 value, i.e. as a predicate in which all bits are significant. */
3466
3467static bool
3468aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3469{
3470 if (GET_CODE (x) != CONST_VECTOR)
3471 return false;
3472
3473 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3474 GET_MODE_NUNITS (GET_MODE (x)));
3475 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3476 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3477 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3478
3479 unsigned int nelts = const_vector_encoded_nelts (x);
3480 for (unsigned int i = 0; i < nelts; ++i)
3481 {
3482 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3483 if (!CONST_INT_P (elt))
3484 return false;
3485
3486 builder.quick_push (elt);
3487 for (unsigned int j = 1; j < factor; ++j)
3488 builder.quick_push (const0_rtx);
3489 }
3490 builder.finalize ();
3491 return true;
3492}
3493
3494/* BUILDER contains a predicate constant of mode VNx16BI. Return the
3495 widest predicate element size it can have (that is, the largest size
3496 for which each element would still be 0 or 1). */
3497
3498unsigned int
3499aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3500{
3501 /* Start with the most optimistic assumption: that we only need
3502 one bit per pattern. This is what we will use if only the first
3503 bit in each pattern is ever set. */
3504 unsigned int mask = GET_MODE_SIZE (DImode);
3505 mask |= builder.npatterns ();
3506
3507 /* Look for set bits. */
3508 unsigned int nelts = builder.encoded_nelts ();
3509 for (unsigned int i = 1; i < nelts; ++i)
3510 if (INTVAL (builder.elt (i)) != 0)
3511 {
3512 if (i & 1)
3513 return 1;
3514 mask |= i;
3515 }
3516 return mask & -mask;
3517}
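/* The closing "mask & -mask" step above relies on the standard trick that,
   for an unsigned value, x & -x isolates the lowest set bit.  A tiny
   standalone illustration (hypothetical helper, not used elsewhere):  */
static unsigned int
lowest_set_bit (unsigned int x)
{
  /* e.g. lowest_set_bit (12) == 4, lowest_set_bit (8) == 8.  */
  return x & -x;
}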
3518
624d0f07
RS
3519/* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3520 return that predicate mode, otherwise return opt_machine_mode (). */
3521
3522opt_machine_mode
3523aarch64_ptrue_all_mode (rtx x)
3524{
3525 gcc_assert (GET_MODE (x) == VNx16BImode);
3526 if (GET_CODE (x) != CONST_VECTOR
3527 || !CONST_VECTOR_DUPLICATE_P (x)
3528 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3529 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3530 return opt_machine_mode ();
3531
3532 unsigned int nelts = const_vector_encoded_nelts (x);
3533 for (unsigned int i = 1; i < nelts; ++i)
3534 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3535 return opt_machine_mode ();
3536
3537 return aarch64_sve_pred_mode (nelts);
3538}
3539
0b1fe8cf
RS
3540/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3541 that the constant would have with predicate element size ELT_SIZE
3542 (ignoring the upper bits in each element) and return:
3543
3544 * -1 if all bits are set
3545 * N if the predicate has N leading set bits followed by all clear bits
3546 * 0 if the predicate does not have any of these forms. */
3547
3548int
3549aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3550 unsigned int elt_size)
3551{
3552 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3553 followed by set bits. */
3554 if (builder.nelts_per_pattern () == 3)
3555 return 0;
3556
3557 /* Skip over leading set bits. */
3558 unsigned int nelts = builder.encoded_nelts ();
3559 unsigned int i = 0;
3560 for (; i < nelts; i += elt_size)
3561 if (INTVAL (builder.elt (i)) == 0)
3562 break;
3563 unsigned int vl = i / elt_size;
3564
3565 /* Check for the all-true case. */
3566 if (i == nelts)
3567 return -1;
3568
3569 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3570 repeating pattern of set bits followed by clear bits. */
3571 if (builder.nelts_per_pattern () != 2)
3572 return 0;
3573
3574 /* We have a "foreground" value and a duplicated "background" value.
3575 If the background might repeat and the last set bit belongs to it,
3576 we might have set bits followed by clear bits followed by set bits. */
3577 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3578 return 0;
3579
3580 /* Make sure that the rest are all clear. */
3581 for (; i < nelts; i += elt_size)
3582 if (INTVAL (builder.elt (i)) != 0)
3583 return 0;
3584
3585 return vl;
3586}
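/* A standalone sketch of the scan above, assuming the predicate has been
   expanded into a plain byte-per-bit array rather than the compressed rtx
   encoding that the real routine works on; names are illustrative only.  */
static int
leading_set_elements (const unsigned char *bits, unsigned int nbits,
                      unsigned int elt_size)
{
  unsigned int i = 0;
  for (; i < nbits; i += elt_size)
    if (bits[i] == 0)
      break;
  int vl = i / elt_size;
  if (i >= nbits)
    return -1;                  /* All true.  */
  for (; i < nbits; i += elt_size)
    if (bits[i] != 0)
      return 0;                 /* Not a leading-PTRUE form.  */
  return vl;                    /* VL leading set elements.  */
}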
3587
3588/* See if there is an svpattern that encodes an SVE predicate of mode
3589 PRED_MODE in which the first VL bits are set and the rest are clear.
3590 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3591 A VL of -1 indicates an all-true vector. */
3592
3593aarch64_svpattern
3594aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3595{
3596 if (vl < 0)
3597 return AARCH64_SV_ALL;
3598
3599 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3600 return AARCH64_NUM_SVPATTERNS;
3601
3602 if (vl >= 1 && vl <= 8)
3603 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3604
3605 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3606 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3607
3608 int max_vl;
3609 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3610 {
3611 if (vl == (max_vl / 3) * 3)
3612 return AARCH64_SV_MUL3;
3613 /* These would only trigger for non-power-of-2 lengths. */
3614 if (vl == (max_vl & -4))
3615 return AARCH64_SV_MUL4;
3616 if (vl == (1 << floor_log2 (max_vl)))
3617 return AARCH64_SV_POW2;
3618 if (vl == max_vl)
3619 return AARCH64_SV_ALL;
3620 }
3621 return AARCH64_NUM_SVPATTERNS;
3622}
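/* Worked examples of the mapping above (illustrative only):
   - vl == 5 gives AARCH64_SV_VL5;
   - vl == 64 gives AARCH64_SV_VL64 for a predicate known to have at least
     64 elements, via AARCH64_SV_VL16 + (log2 (64) - 4);
   - with a fixed 16-element predicate, vl == 15 gives AARCH64_SV_MUL3
     ((16 / 3) * 3 == 15), while vl == 12 matches nothing and the function
     returns AARCH64_NUM_SVPATTERNS.  */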
3623
34467289
RS
3624/* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3625 bits has the lowest bit set and the upper bits clear. This is the
3626 VNx16BImode equivalent of a PTRUE for controlling elements of
3627 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3628 all bits are significant, even the upper zeros. */
3629
3630rtx
3631aarch64_ptrue_all (unsigned int elt_size)
3632{
3633 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3634 builder.quick_push (const1_rtx);
3635 for (unsigned int i = 1; i < elt_size; ++i)
3636 builder.quick_push (const0_rtx);
3637 return builder.build ();
3638}
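/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
   { 1, 0, 0, 0, 1, 0, 0, 0, ... }, the byte-level image of a PTRUE with
   one significant bit per 4-byte element.  */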
3639
16de3637
RS
3640/* Return an all-true predicate register of mode MODE. */
3641
3642rtx
3643aarch64_ptrue_reg (machine_mode mode)
3644{
3645 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
3646 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3647 return gen_lowpart (mode, reg);
16de3637
RS
3648}
3649
e7053b0c
RS
3650/* Return an all-false predicate register of mode MODE. */
3651
3652rtx
3653aarch64_pfalse_reg (machine_mode mode)
3654{
3655 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
678faefc
RS
3656 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3657 return gen_lowpart (mode, reg);
3658}
3659
c9c5a809
RS
3660/* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3661 true, or alternatively if we know that the operation predicated by
3662 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
3663 aarch64_sve_gp_strictness operand that describes the operation
3664 predicated by PRED1[0]. */
3665
3666bool
3667aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3668{
3669 machine_mode mode = GET_MODE (pred2);
3670 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3671 && mode == GET_MODE (pred1[0])
3672 && aarch64_sve_gp_strictness (pred1[1], SImode));
3673 return (pred1[0] == CONSTM1_RTX (mode)
3674 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3675 || rtx_equal_p (pred1[0], pred2));
3676}
3677
00fa90d9
RS
3678/* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3679 for it. PRED2[0] is the predicate for the instruction whose result
3680 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3681 for it. Return true if we can prove that the two predicates are
3682 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3683 with PRED1[0] without changing behavior. */
3684
3685bool
3686aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3687{
3688 machine_mode mode = GET_MODE (pred1[0]);
3689 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3690 && mode == GET_MODE (pred2[0])
3691 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3692 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3693
3694 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3695 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3696 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3697 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3698 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3699}
3700
3701/* Emit a comparison CMP between OP1 and OP2, both of which have mode
3702 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3703 Use TARGET as the target register if nonnull and convenient. */
3704
3705static rtx
3706aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3707 machine_mode data_mode, rtx op1, rtx op2)
3708{
3709 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3710 expand_operand ops[5];
3711 create_output_operand (&ops[0], target, pred_mode);
3712 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3713 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3714 create_input_operand (&ops[3], op1, data_mode);
3715 create_input_operand (&ops[4], op2, data_mode);
3716 expand_insn (icode, 5, ops);
3717 return ops[0].value;
3718}
3719
678faefc
RS
3720/* Use a comparison to convert integer vector SRC into MODE, which is
3721 the corresponding SVE predicate mode. Use TARGET for the result
3722 if it's nonnull and convenient. */
3723
624d0f07 3724rtx
678faefc
RS
3725aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3726{
3727 machine_mode src_mode = GET_MODE (src);
00fa90d9
RS
3728 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3729 src, CONST0_RTX (src_mode));
e7053b0c
RS
3730}
3731
624d0f07
RS
3732/* Return the assembly token for svprfop value PRFOP. */
3733
3734static const char *
3735svprfop_token (enum aarch64_svprfop prfop)
3736{
3737 switch (prfop)
3738 {
3739#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3740 AARCH64_FOR_SVPRFOP (CASE)
3741#undef CASE
3742 case AARCH64_NUM_SVPRFOPS:
3743 break;
3744 }
3745 gcc_unreachable ();
3746}
3747
3748/* Return the assembly string for an SVE prefetch operation with
3749 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3750 and that SUFFIX is the format for the remaining operands. */
3751
3752char *
3753aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3754 const char *suffix)
3755{
3756 static char buffer[128];
3757 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3758 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3759 mnemonic, svprfop_token (prfop), suffix);
3760 gcc_assert (written < sizeof (buffer));
3761 return buffer;
3762}
3763
3764/* Check whether we can calculate the number of elements in PATTERN
3765 at compile time, given that there are NELTS_PER_VQ elements per
3766 128-bit block. Return the value if so, otherwise return -1. */
3767
3768HOST_WIDE_INT
3769aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3770{
3771 unsigned int vl, const_vg;
3772 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3773 vl = 1 + (pattern - AARCH64_SV_VL1);
3774 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3775 vl = 16 << (pattern - AARCH64_SV_VL16);
3776 else if (aarch64_sve_vg.is_constant (&const_vg))
3777 {
3778 /* There are two vector granules per quadword. */
3779 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3780 switch (pattern)
3781 {
3782 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3783 case AARCH64_SV_MUL4: return nelts & -4;
3784 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3785 case AARCH64_SV_ALL: return nelts;
3786 default: gcc_unreachable ();
3787 }
3788 }
3789 else
3790 return -1;
3791
3792 /* There are two vector granules per quadword. */
3793 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3794 if (known_le (vl, nelts_all))
3795 return vl;
3796
3797 /* Requesting more elements than are available results in a PFALSE. */
3798 if (known_gt (vl, nelts_all))
3799 return 0;
3800
3801 return -1;
3802}
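/* Worked example, assuming -msve-vector-bits=256 (so aarch64_sve_vg is the
   constant 4) and NELTS_PER_VQ == 4: nelts == (4 / 2) * 4 == 8, so
   AARCH64_SV_POW2 and AARCH64_SV_MUL4 fold to 8, AARCH64_SV_MUL3 folds
   to 6, AARCH64_SV_VL8 folds to 8, and AARCH64_SV_VL16 folds to 0 because
   it asks for more elements than are available.  */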
3803
43cacb12
RS
3804/* Return true if we can move VALUE into a register using a single
3805 CNT[BHWD] instruction. */
3806
3807static bool
3808aarch64_sve_cnt_immediate_p (poly_int64 value)
3809{
3810 HOST_WIDE_INT factor = value.coeffs[0];
3811 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3812 return (value.coeffs[1] == factor
3813 && IN_RANGE (factor, 2, 16 * 16)
3814 && (factor & 1) == 0
3815 && factor <= 16 * (factor & -factor));
3816}
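/* The same test on a plain scalar, as a hypothetical standalone helper:
   the accepted factors are E * N with E in {2, 4, 8, 16} (the per-quadword
   element counts for CNTD/CNTW/CNTH/CNTB) and N in [1, 16].  */
static int
cnt_factor_ok (long factor)
{
  return (factor >= 2 && factor <= 16 * 16
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
/* e.g. cnt_factor_ok (6) is true (a CNTD ..., MUL #3), whereas
   cnt_factor_ok (34) is false because it would need MUL #17.  */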
3817
3818/* Likewise for rtx X. */
3819
3820bool
3821aarch64_sve_cnt_immediate_p (rtx x)
3822{
3823 poly_int64 value;
3824 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3825}
3826
3827/* Return the asm string for an instruction with a CNT-like vector size
3828 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3829 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3830 first part of the operands template (the part that comes before the
139df05a
RS
3831 vector size itself). PATTERN is the pattern to use. FACTOR is the
3832 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3833 in each quadword. If it is zero, we can use any element size. */
43cacb12
RS
3834
3835static char *
3836aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
139df05a 3837 aarch64_svpattern pattern,
43cacb12
RS
3838 unsigned int factor,
3839 unsigned int nelts_per_vq)
3840{
139df05a 3841 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
43cacb12
RS
3842
3843 if (nelts_per_vq == 0)
3844 /* There is some overlap in the ranges of the four CNT instructions.
3845 Here we always use the smallest possible element size, so that the
3846 multiplier is 1 wherever possible. */
3847 nelts_per_vq = factor & -factor;
3848 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3849 gcc_assert (IN_RANGE (shift, 1, 4));
3850 char suffix = "dwhb"[shift - 1];
3851
3852 factor >>= shift;
3853 unsigned int written;
139df05a 3854 if (pattern == AARCH64_SV_ALL && factor == 1)
43cacb12
RS
3855 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3856 prefix, suffix, operands);
139df05a
RS
3857 else if (factor == 1)
3858 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3859 prefix, suffix, operands, svpattern_token (pattern));
43cacb12 3860 else
139df05a
RS
3861 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3862 prefix, suffix, operands, svpattern_token (pattern),
3863 factor);
43cacb12
RS
3864 gcc_assert (written < sizeof (buffer));
3865 return buffer;
3866}
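/* Example expansions (illustrative, assuming PREFIX "cnt" and OPERANDS
   "%x0"):
   - pattern ALL, factor 2, nelts_per_vq 2 -> "cntd\t%x0"
   - pattern ALL, factor 16, nelts_per_vq 2 -> "cntd\t%x0, all, mul #8"
   - pattern VL32, factor 2, nelts_per_vq 2 -> "cntd\t%x0, vl32"  */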
3867
3868/* Return the asm string for an instruction with a CNT-like vector size
3869 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3870 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3871 first part of the operands template (the part that comes before the
3872 vector size itself). X is the value of the vector size operand,
139df05a
RS
3873 as a polynomial integer rtx; we need to convert this into an "all"
3874 pattern with a multiplier. */
43cacb12
RS
3875
3876char *
3877aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3878 rtx x)
3879{
3880 poly_int64 value = rtx_to_poly_int64 (x);
3881 gcc_assert (aarch64_sve_cnt_immediate_p (value));
139df05a 3882 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
43cacb12
RS
3883 value.coeffs[1], 0);
3884}
3885
624d0f07
RS
3886/* Return the asm string for an instruction with a CNT-like vector size
3887 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3888 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3889 first part of the operands template (the part that comes before the
3890 vector size itself). CNT_PAT[0..2] are the operands of the
3891 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3892
3893char *
3894aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3895 const char *operands, rtx *cnt_pat)
3896{
3897 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3898 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3899 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3900 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3901 factor, nelts_per_vq);
3902}
3903
0fdc30bc
RS
3904/* Return true if we can add X using a single SVE INC or DEC instruction. */
3905
3906bool
3907aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3908{
3909 poly_int64 value;
3910 return (poly_int_rtx_p (x, &value)
3911 && (aarch64_sve_cnt_immediate_p (value)
3912 || aarch64_sve_cnt_immediate_p (-value)));
3913}
3914
3915/* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3916 operand 0. */
3917
3918char *
3919aarch64_output_sve_scalar_inc_dec (rtx offset)
3920{
3921 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3922 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3923 if (offset_value.coeffs[1] > 0)
139df05a 3924 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
3925 offset_value.coeffs[1], 0);
3926 else
139df05a 3927 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
0fdc30bc
RS
3928 -offset_value.coeffs[1], 0);
3929}
3930
43cacb12
RS
3931/* Return true if we can add VALUE to a register using a single ADDVL
3932 or ADDPL instruction. */
3933
3934static bool
3935aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3936{
3937 HOST_WIDE_INT factor = value.coeffs[0];
3938 if (factor == 0 || value.coeffs[1] != factor)
3939 return false;
3940 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3941 and a value of 16 is one vector width. */
3942 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3943 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3944}
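/* In concrete terms: ADDVL accepts factors that are multiples of 16 in
   [-32 * 16, 31 * 16], i.e. immediates #-32..#31 measured in whole vector
   lengths, while ADDPL accepts multiples of 2 in [-32 * 2, 31 * 2], i.e.
   immediates #-32..#31 measured in predicate lengths (an eighth of a
   vector).  */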
3945
3946/* Likewise for rtx X. */
3947
3948bool
3949aarch64_sve_addvl_addpl_immediate_p (rtx x)
3950{
3951 poly_int64 value;
3952 return (poly_int_rtx_p (x, &value)
3953 && aarch64_sve_addvl_addpl_immediate_p (value));
3954}
3955
0fdc30bc
RS
3956/* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3957 to operand 1 and storing the result in operand 0. */
43cacb12
RS
3958
3959char *
0fdc30bc 3960aarch64_output_sve_addvl_addpl (rtx offset)
43cacb12
RS
3961{
3962 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3963 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3964 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3965
43cacb12
RS
3966 int factor = offset_value.coeffs[1];
3967 if ((factor & 15) == 0)
3968 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3969 else
3970 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3971 return buffer;
3972}
3973
3974/* Return true if X is a valid immediate for an SVE vector INC or DEC
3975 instruction. If it is, store the number of elements in each vector
3976 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3977 factor in *FACTOR_OUT (if nonnull). */
3978
3979bool
0fdc30bc
RS
3980aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3981 unsigned int *nelts_per_vq_out)
43cacb12
RS
3982{
3983 rtx elt;
3984 poly_int64 value;
3985
3986 if (!const_vec_duplicate_p (x, &elt)
3987 || !poly_int_rtx_p (elt, &value))
3988 return false;
3989
3990 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3991 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3992 /* There's no vector INCB. */
3993 return false;
3994
3995 HOST_WIDE_INT factor = value.coeffs[0];
3996 if (value.coeffs[1] != factor)
3997 return false;
3998
3999 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4000 if ((factor % nelts_per_vq) != 0
4001 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4002 return false;
4003
4004 if (factor_out)
4005 *factor_out = factor;
4006 if (nelts_per_vq_out)
4007 *nelts_per_vq_out = nelts_per_vq;
4008 return true;
4009}
4010
4011/* Return true if X is a valid immediate for an SVE vector INC or DEC
4012 instruction. */
4013
4014bool
0fdc30bc 4015aarch64_sve_vector_inc_dec_immediate_p (rtx x)
43cacb12 4016{
0fdc30bc 4017 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
43cacb12
RS
4018}
4019
4020/* Return the asm template for an SVE vector INC or DEC instruction.
4021 OPERANDS gives the operands before the vector count and X is the
4022 value of the vector count operand itself. */
4023
4024char *
0fdc30bc 4025aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
43cacb12
RS
4026{
4027 int factor;
4028 unsigned int nelts_per_vq;
0fdc30bc 4029 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
43cacb12
RS
4030 gcc_unreachable ();
4031 if (factor < 0)
139df05a
RS
4032 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4033 -factor, nelts_per_vq);
43cacb12 4034 else
139df05a
RS
4035 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4036 factor, nelts_per_vq);
43cacb12 4037}
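/* Example (illustrative): a VNx4SI count that duplicates the poly_int
   12 + 12x has factor == 12 and nelts_per_vq == 4, so the template expands
   to "incw\t<operands>, all, mul #3"; the negated constant would give
   "decw\t<operands>, all, mul #3".  */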
43e9d192 4038
82614948
RR
4039static int
4040aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
77e994c9 4041 scalar_int_mode mode)
43e9d192 4042{
43e9d192 4043 int i;
9a4865db
WD
4044 unsigned HOST_WIDE_INT val, val2, mask;
4045 int one_match, zero_match;
4046 int num_insns;
43e9d192 4047
9a4865db
WD
4048 val = INTVAL (imm);
4049
4050 if (aarch64_move_imm (val, mode))
43e9d192 4051 {
82614948 4052 if (generate)
f7df4a84 4053 emit_insn (gen_rtx_SET (dest, imm));
9a4865db 4054 return 1;
43e9d192
IB
4055 }
4056
9de00935
TC
4057 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4058 (with XXXX non-zero). In that case check to see if the move can be done in
4059 a smaller mode. */
4060 val2 = val & 0xffffffff;
4061 if (mode == DImode
4062 && aarch64_move_imm (val2, SImode)
4063 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4064 {
4065 if (generate)
4066 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4067
4068 /* Check if we have to emit a second instruction by checking to see
4069 if any of the upper 32 bits of the original DI mode value is set. */
4070 if (val == val2)
4071 return 1;
4072
4073 i = (val >> 48) ? 48 : 32;
4074
4075 if (generate)
4076 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4077 GEN_INT ((val >> i) & 0xffff)));
4078
4079 return 2;
4080 }
4081
9a4865db 4082 if ((val >> 32) == 0 || mode == SImode)
43e9d192 4083 {
82614948
RR
4084 if (generate)
4085 {
9a4865db
WD
4086 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4087 if (mode == SImode)
4088 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4089 GEN_INT ((val >> 16) & 0xffff)));
4090 else
4091 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4092 GEN_INT ((val >> 16) & 0xffff)));
82614948 4093 }
9a4865db 4094 return 2;
43e9d192
IB
4095 }
4096
4097 /* Remaining cases are all for DImode. */
4098
43e9d192 4099 mask = 0xffff;
9a4865db
WD
4100 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4101 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4102 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4103 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
43e9d192 4104
62c8d76c 4105 if (zero_match != 2 && one_match != 2)
43e9d192 4106 {
62c8d76c
WD
4107 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4108 For a 64-bit bitmask try whether changing 16 bits to all ones or
4109 zeroes creates a valid bitmask. To check any repeated bitmask,
4110 try using 16 bits from the other 32-bit half of val. */
43e9d192 4111
62c8d76c 4112 for (i = 0; i < 64; i += 16, mask <<= 16)
43e9d192 4113 {
62c8d76c
WD
4114 val2 = val & ~mask;
4115 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4116 break;
4117 val2 = val | mask;
4118 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4119 break;
4120 val2 = val2 & ~mask;
4121 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4122 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4123 break;
43e9d192 4124 }
62c8d76c 4125 if (i != 64)
43e9d192 4126 {
62c8d76c 4127 if (generate)
43e9d192 4128 {
62c8d76c
WD
4129 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4130 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
9a4865db 4131 GEN_INT ((val >> i) & 0xffff)));
43e9d192 4132 }
1312b1ba 4133 return 2;
43e9d192
IB
4134 }
4135 }
4136
9a4865db
WD
4137 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4138 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4139 otherwise skip zero bits. */
2c274197 4140
9a4865db 4141 num_insns = 1;
43e9d192 4142 mask = 0xffff;
9a4865db
WD
4143 val2 = one_match > zero_match ? ~val : val;
4144 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4145
4146 if (generate)
4147 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4148 ? (val | ~(mask << i))
4149 : (val & (mask << i)))));
4150 for (i += 16; i < 64; i += 16)
43e9d192 4151 {
9a4865db
WD
4152 if ((val2 & (mask << i)) == 0)
4153 continue;
4154 if (generate)
4155 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4156 GEN_INT ((val >> i) & 0xffff)));
4157 num_insns ++;
82614948
RR
4158 }
4159
4160 return num_insns;
4161}
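/* Worked example (illustrative): for the DImode constant
   0xffff12345678ffff, one_match == 2, so the code above first materializes
   val | ~(mask << 16) == 0xffffffff5678ffff (a single MOVN) and then
   patches bits [32, 47] with one MOVK, returning 2.  The emitted sequence
   is something like:

        mov     x0, 0xffffffff5678ffff  // movn x0, #0xa987, lsl 16
        movk    x0, #0x1234, lsl 32  */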
4162
c0bb5bc5
WD
4163/* Return whether imm is a 128-bit immediate which is simple enough to
4164 expand inline. */
4165bool
4166aarch64_mov128_immediate (rtx imm)
4167{
4168 if (GET_CODE (imm) == CONST_INT)
4169 return true;
4170
4171 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4172
4173 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4174 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4175
4176 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4177 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4178}
4179
4180
43cacb12
RS
4181/* Return the number of temporary registers that aarch64_add_offset_1
4182 would need to add OFFSET to a register. */
4183
4184static unsigned int
4185aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4186{
4187 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4188}
4189
f5470a77
RS
4190/* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4191 a non-polynomial OFFSET. MODE is the mode of the addition.
4192 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4193 be set and CFA adjustments added to the generated instructions.
4194
4195 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4196 temporary if register allocation is already complete. This temporary
4197 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4198 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4199 the immediate again.
0100c5f9
RS
4200
4201 Since this function may be used to adjust the stack pointer, we must
4202 ensure that it cannot cause transient stack deallocation (for example
4203 by first incrementing SP and then decrementing when adjusting by a
4204 large immediate). */
4205
4206static void
f5470a77
RS
4207aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4208 rtx src, HOST_WIDE_INT offset, rtx temp1,
4209 bool frame_related_p, bool emit_move_imm)
0100c5f9 4210{
f5470a77
RS
4211 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4212 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4213
42bc589e 4214 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
0100c5f9
RS
4215 rtx_insn *insn;
4216
f5470a77
RS
4217 if (!moffset)
4218 {
4219 if (!rtx_equal_p (dest, src))
4220 {
4221 insn = emit_insn (gen_rtx_SET (dest, src));
4222 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4223 }
4224 return;
4225 }
0100c5f9
RS
4226
4227 /* Single instruction adjustment. */
f5470a77 4228 if (aarch64_uimm12_shift (moffset))
0100c5f9 4229 {
f5470a77 4230 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
0100c5f9
RS
4231 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4232 return;
4233 }
4234
f5470a77
RS
4235 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4236 and either:
4237
4238 a) the offset cannot be loaded by a 16-bit move or
4239 b) there is no spare register into which we can move it. */
4240 if (moffset < 0x1000000
4241 && ((!temp1 && !can_create_pseudo_p ())
4242 || !aarch64_move_imm (moffset, mode)))
0100c5f9 4243 {
f5470a77 4244 HOST_WIDE_INT low_off = moffset & 0xfff;
0100c5f9 4245
f5470a77
RS
4246 low_off = offset < 0 ? -low_off : low_off;
4247 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
0100c5f9 4248 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77 4249 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
0100c5f9
RS
4250 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4251 return;
4252 }
4253
4254 /* Emit a move immediate if required and an addition/subtraction. */
0100c5f9 4255 if (emit_move_imm)
f5470a77
RS
4256 {
4257 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
7aa605c9
JJ
4258 temp1 = aarch64_force_temporary (mode, temp1,
4259 gen_int_mode (moffset, mode));
f5470a77
RS
4260 }
4261 insn = emit_insn (offset < 0
4262 ? gen_sub3_insn (dest, src, temp1)
4263 : gen_add3_insn (dest, src, temp1));
0100c5f9
RS
4264 if (frame_related_p)
4265 {
4266 RTX_FRAME_RELATED_P (insn) = frame_related_p;
f5470a77
RS
4267 rtx adj = plus_constant (mode, src, offset);
4268 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
0100c5f9
RS
4269 }
4270}
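/* Worked example of the double-addition path above (illustrative): adding
   OFFSET == 0x123456, which is not a single MOV immediate, splits into two
   immediates that each fit the 12-bit (optionally shifted) ADD range:

        add     dest, src, #0x456
        add     dest, dest, #0x123000

   Both steps adjust in the same direction, so when DEST is the stack
   pointer there is no transient deallocation.  */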
4271
43cacb12
RS
4272/* Return the number of temporary registers that aarch64_add_offset
4273 would need to move OFFSET into a register or add OFFSET to a register;
4274 ADD_P is true if we want the latter rather than the former. */
4275
4276static unsigned int
4277aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4278{
4279 /* This follows the same structure as aarch64_add_offset. */
4280 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4281 return 0;
4282
4283 unsigned int count = 0;
4284 HOST_WIDE_INT factor = offset.coeffs[1];
4285 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4286 poly_int64 poly_offset (factor, factor);
4287 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4288 /* Need one register for the ADDVL/ADDPL result. */
4289 count += 1;
4290 else if (factor != 0)
4291 {
4292 factor = abs (factor);
4293 if (factor > 16 * (factor & -factor))
4294 /* Need one register for the CNT result and one for the multiplication
4295 factor. If necessary, the second temporary can be reused for the
4296 constant part of the offset. */
4297 return 2;
4298 /* Need one register for the CNT result (which might then
4299 be shifted). */
4300 count += 1;
4301 }
4302 return count + aarch64_add_offset_1_temporaries (constant);
4303}
4304
4305/* If X can be represented as a poly_int64, return the number
4306 of temporaries that are required to add it to a register.
4307 Return -1 otherwise. */
4308
4309int
4310aarch64_add_offset_temporaries (rtx x)
4311{
4312 poly_int64 offset;
4313 if (!poly_int_rtx_p (x, &offset))
4314 return -1;
4315 return aarch64_offset_temporaries (true, offset);
4316}
4317
f5470a77
RS
4318/* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4319 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4320 be set and CFA adjustments added to the generated instructions.
4321
4322 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4323 temporary if register allocation is already complete. This temporary
43cacb12
RS
4324 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4325 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4326 false to avoid emitting the immediate again.
4327
4328 TEMP2, if nonnull, is a second temporary register that doesn't
4329 overlap either DEST or REG.
f5470a77
RS
4330
4331 Since this function may be used to adjust the stack pointer, we must
4332 ensure that it cannot cause transient stack deallocation (for example
4333 by first incrementing SP and then decrementing when adjusting by a
4334 large immediate). */
4335
4336static void
4337aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
43cacb12
RS
4338 poly_int64 offset, rtx temp1, rtx temp2,
4339 bool frame_related_p, bool emit_move_imm = true)
0100c5f9 4340{
f5470a77
RS
4341 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4342 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
43cacb12
RS
4343 gcc_assert (temp1 == NULL_RTX
4344 || !frame_related_p
4345 || !reg_overlap_mentioned_p (temp1, dest));
4346 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4347
4348 /* Try using ADDVL or ADDPL to add the whole value. */
4349 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4350 {
4351 rtx offset_rtx = gen_int_mode (offset, mode);
4352 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4353 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4354 return;
4355 }
4356
4357 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4358 SVE vector register, over and above the minimum size of 128 bits.
4359 This is equivalent to half the value returned by CNTD with a
4360 vector shape of ALL. */
4361 HOST_WIDE_INT factor = offset.coeffs[1];
4362 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4363
4364 /* Try using ADDVL or ADDPL to add the VG-based part. */
4365 poly_int64 poly_offset (factor, factor);
4366 if (src != const0_rtx
4367 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4368 {
4369 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4370 if (frame_related_p)
4371 {
4372 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4373 RTX_FRAME_RELATED_P (insn) = true;
4374 src = dest;
4375 }
4376 else
4377 {
4378 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4379 src = aarch64_force_temporary (mode, temp1, addr);
4380 temp1 = temp2;
4381 temp2 = NULL_RTX;
4382 }
4383 }
4384 /* Otherwise use a CNT-based sequence. */
4385 else if (factor != 0)
4386 {
4387 /* Use a subtraction if we have a negative factor. */
4388 rtx_code code = PLUS;
4389 if (factor < 0)
4390 {
4391 factor = -factor;
4392 code = MINUS;
4393 }
4394
4395 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4396 into the multiplication. */
4397 rtx val;
4398 int shift = 0;
4399 if (factor & 1)
4400 /* Use a right shift by 1. */
4401 shift = -1;
4402 else
4403 factor /= 2;
4404 HOST_WIDE_INT low_bit = factor & -factor;
4405 if (factor <= 16 * low_bit)
4406 {
4407 if (factor > 16 * 8)
4408 {
4409 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4410 the value with the minimum multiplier and shift it into
4411 position. */
4412 int extra_shift = exact_log2 (low_bit);
4413 shift += extra_shift;
4414 factor >>= extra_shift;
4415 }
4416 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4417 }
4418 else
4419 {
7d8bdfa7
RS
4420 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4421 directly, since that should increase the chances of being
4422 able to use a shift and add sequence. If LOW_BIT itself
4423 is out of range, just use CNTD. */
4424 if (low_bit <= 16 * 8)
4425 factor /= low_bit;
4426 else
4427 low_bit = 1;
4428
4429 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
43cacb12
RS
4430 val = aarch64_force_temporary (mode, temp1, val);
4431
7d8bdfa7
RS
4432 if (can_create_pseudo_p ())
4433 {
4434 rtx coeff1 = gen_int_mode (factor, mode);
4435 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4436 }
4437 else
43cacb12 4438 {
7d8bdfa7
RS
4439 /* Go back to using a negative multiplication factor if we have
4440 no register from which to subtract. */
4441 if (code == MINUS && src == const0_rtx)
4442 {
4443 factor = -factor;
4444 code = PLUS;
4445 }
4446 rtx coeff1 = gen_int_mode (factor, mode);
4447 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4448 val = gen_rtx_MULT (mode, val, coeff1);
43cacb12 4449 }
43cacb12
RS
4450 }
4451
4452 if (shift > 0)
4453 {
4454 /* Multiply by 1 << SHIFT. */
4455 val = aarch64_force_temporary (mode, temp1, val);
4456 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4457 }
4458 else if (shift == -1)
4459 {
4460 /* Divide by 2. */
4461 val = aarch64_force_temporary (mode, temp1, val);
4462 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4463 }
4464
4465 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4466 if (src != const0_rtx)
4467 {
4468 val = aarch64_force_temporary (mode, temp1, val);
4469 val = gen_rtx_fmt_ee (code, mode, src, val);
4470 }
4471 else if (code == MINUS)
4472 {
4473 val = aarch64_force_temporary (mode, temp1, val);
4474 val = gen_rtx_NEG (mode, val);
4475 }
4476
4477 if (constant == 0 || frame_related_p)
4478 {
4479 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4480 if (frame_related_p)
4481 {
4482 RTX_FRAME_RELATED_P (insn) = true;
4483 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4484 gen_rtx_SET (dest, plus_constant (Pmode, src,
4485 poly_offset)));
4486 }
4487 src = dest;
4488 if (constant == 0)
4489 return;
4490 }
4491 else
4492 {
4493 src = aarch64_force_temporary (mode, temp1, val);
4494 temp1 = temp2;
4495 temp2 = NULL_RTX;
4496 }
4497
4498 emit_move_imm = true;
4499 }
f5470a77 4500
f5470a77
RS
4501 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4502 frame_related_p, emit_move_imm);
0100c5f9
RS
4503}
4504
43cacb12
RS
4505/* Like aarch64_add_offset, but the offset is given as an rtx rather
4506 than a poly_int64. */
4507
4508void
4509aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4510 rtx offset_rtx, rtx temp1, rtx temp2)
4511{
4512 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4513 temp1, temp2, false);
4514}
4515
f5470a77
RS
4516/* Add DELTA to the stack pointer, marking the instructions frame-related.
4517 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4518 if TEMP1 already contains abs (DELTA). */
4519
0100c5f9 4520static inline void
43cacb12 4521aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
0100c5f9 4522{
f5470a77 4523 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
43cacb12 4524 temp1, temp2, true, emit_move_imm);
0100c5f9
RS
4525}
4526
f5470a77
RS
4527/* Subtract DELTA from the stack pointer, marking the instructions
4528 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4529 if nonnull. */
4530
0100c5f9 4531static inline void
cd1bef27
JL
4532aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4533 bool emit_move_imm = true)
0100c5f9 4534{
f5470a77 4535 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
cd1bef27 4536 temp1, temp2, frame_related_p, emit_move_imm);
0100c5f9 4537}
82614948 4538
43cacb12
RS
4539/* Set DEST to (vec_series BASE STEP). */
4540
4541static void
4542aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
82614948
RR
4543{
4544 machine_mode mode = GET_MODE (dest);
43cacb12
RS
4545 scalar_mode inner = GET_MODE_INNER (mode);
4546
4547 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4548 if (!aarch64_sve_index_immediate_p (base))
4549 base = force_reg (inner, base);
4550 if (!aarch64_sve_index_immediate_p (step))
4551 step = force_reg (inner, step);
4552
4553 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4554}
82614948 4555
4aeb1ba7
RS
4556/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4557 register of mode MODE. Use TARGET for the result if it's nonnull
4558 and convenient.
4559
4560 The two vector modes must have the same element mode. The behavior
4561 is to duplicate architectural lane N of SRC into architectural lanes
4562 N + I * STEP of the result. On big-endian targets, architectural
4563 lane 0 of an Advanced SIMD vector is the last element of the vector
4564 in memory layout, so for big-endian targets this operation has the
4565 effect of reversing SRC before duplicating it. Callers need to
4566 account for this. */
43cacb12 4567
4aeb1ba7
RS
4568rtx
4569aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4570{
4571 machine_mode src_mode = GET_MODE (src);
4572 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4573 insn_code icode = (BYTES_BIG_ENDIAN
4574 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4575 : code_for_aarch64_vec_duplicate_vq_le (mode));
4576
4577 unsigned int i = 0;
4578 expand_operand ops[3];
4579 create_output_operand (&ops[i++], target, mode);
4580 create_output_operand (&ops[i++], src, src_mode);
4581 if (BYTES_BIG_ENDIAN)
4582 {
4583 /* Create a PARALLEL describing the reversal of SRC. */
4584 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4585 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4586 nelts_per_vq - 1, -1);
4587 create_fixed_operand (&ops[i++], sel);
43cacb12 4588 }
4aeb1ba7
RS
4589 expand_insn (icode, i, ops);
4590 return ops[0].value;
4591}
4592
4593/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4594 the memory image into DEST. Return true on success. */
43cacb12 4595
4aeb1ba7
RS
4596static bool
4597aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4598{
4599 src = force_const_mem (GET_MODE (src), src);
43cacb12
RS
4600 if (!src)
4601 return false;
4602
4603 /* Make sure that the address is legitimate. */
4aeb1ba7 4604 if (!aarch64_sve_ld1rq_operand_p (src))
43cacb12
RS
4605 {
4606 rtx addr = force_reg (Pmode, XEXP (src, 0));
4607 src = replace_equiv_address (src, addr);
4608 }
4609
947b1372 4610 machine_mode mode = GET_MODE (dest);
cc68f7c2 4611 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
16de3637 4612 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4aeb1ba7 4613 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
43cacb12
RS
4614 return true;
4615}
4616
4aeb1ba7
RS
4617/* Return a register containing CONST_VECTOR SRC, given that SRC has an
4618 SVE data mode and isn't a legitimate constant. Use TARGET for the
4619 result if convenient.
43cacb12 4620
4aeb1ba7
RS
4621 The returned register can have whatever mode seems most natural
4622 given the contents of SRC. */
4623
4624static rtx
4625aarch64_expand_sve_const_vector (rtx target, rtx src)
43cacb12
RS
4626{
4627 machine_mode mode = GET_MODE (src);
4628 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4629 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4aeb1ba7
RS
4630 scalar_mode elt_mode = GET_MODE_INNER (mode);
4631 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
cc68f7c2
RS
4632 unsigned int container_bits = aarch64_sve_container_bits (mode);
4633 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4634
4635 if (nelts_per_pattern == 1
4636 && encoded_bits <= 128
4637 && container_bits != elt_bits)
4638 {
4639 /* We have a partial vector mode and a constant whose full-vector
4640 equivalent would occupy a repeating 128-bit sequence. Build that
4641 full-vector equivalent instead, so that we have the option of
4642 using LD1RQ and Advanced SIMD operations. */
4643 unsigned int repeat = container_bits / elt_bits;
4644 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4645 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4646 for (unsigned int i = 0; i < npatterns; ++i)
4647 for (unsigned int j = 0; j < repeat; ++j)
4648 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4649 target = aarch64_target_reg (target, full_mode);
4650 return aarch64_expand_sve_const_vector (target, builder.build ());
4651 }
4aeb1ba7
RS
4652
4653 if (nelts_per_pattern == 1 && encoded_bits == 128)
4654 {
4655 /* The constant is a duplicated quadword but can't be narrowed
4656 beyond a quadword. Get the memory image of the first quadword
4657 as a 128-bit vector and try using LD1RQ to load it from memory.
4658
4659 The effect for both endiannesses is to load memory lane N into
4660 architectural lanes N + I * STEP of the result. On big-endian
4661 targets, the layout of the 128-bit vector in an Advanced SIMD
4662 register would be different from its layout in an SVE register,
4663 but this 128-bit vector is a memory value only. */
4664 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4665 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4666 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4667 return target;
4668 }
4669
4670 if (nelts_per_pattern == 1 && encoded_bits < 128)
4671 {
4672 /* The vector is a repeating sequence of 64 bits or fewer.
4673 See if we can load them using an Advanced SIMD move and then
4674 duplicate it to fill a vector. This is better than using a GPR
4675 move because it keeps everything in the same register file. */
4676 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4677 rtx_vector_builder builder (vq_mode, npatterns, 1);
4678 for (unsigned int i = 0; i < npatterns; ++i)
4679 {
4680 /* We want memory lane N to go into architectural lane N,
4681 so reverse for big-endian targets. The DUP .Q pattern
4682 has a compensating reverse built-in. */
4683 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4684 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4685 }
4686 rtx vq_src = builder.build ();
4687 if (aarch64_simd_valid_immediate (vq_src, NULL))
4688 {
4689 vq_src = force_reg (vq_mode, vq_src);
4690 return aarch64_expand_sve_dupq (target, mode, vq_src);
4691 }
4692
4693 /* Get an integer representation of the repeating part of Advanced
4694 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4695 which for big-endian targets is lane-swapped wrt a normal
4696 Advanced SIMD vector. This means that for both endiannesses,
4697 memory lane N of SVE vector SRC corresponds to architectural
4698 lane N of a register holding VQ_SRC. This in turn means that
4699 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4700 as a single 128-bit value) and thus that memory lane 0 of SRC is
4701 in the lsb of the integer. Duplicating the integer therefore
4702 ensures that memory lane N of SRC goes into architectural lane
4703 N + I * INDEX of the SVE register. */
4704 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4705 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4706 if (elt_value)
4707 {
4708 /* Pretend that we had a vector of INT_MODE to start with. */
4709 elt_mode = int_mode;
4710 mode = aarch64_full_sve_mode (int_mode).require ();
4711
4712 /* If the integer can be moved into a general register by a
4713 single instruction, do that and duplicate the result. */
4714 if (CONST_INT_P (elt_value)
4715 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4716 {
4717 elt_value = force_reg (elt_mode, elt_value);
4718 return expand_vector_broadcast (mode, elt_value);
4719 }
4720 }
4721 else if (npatterns == 1)
4722 /* We're duplicating a single value, but can't do better than
4723 force it to memory and load from there. This handles things
4724 like symbolic constants. */
4725 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
43cacb12 4726
4aeb1ba7 4727 if (elt_value)
8179efe0 4728 {
4aeb1ba7
RS
4729 /* Load the element from memory if we can, otherwise move it into
4730 a register and use a DUP. */
4731 rtx op = force_const_mem (elt_mode, elt_value);
4732 if (!op)
4733 op = force_reg (elt_mode, elt_value);
4734 return expand_vector_broadcast (mode, op);
8179efe0 4735 }
43cacb12
RS
4736 }
4737
4aeb1ba7
RS
4738 /* Try using INDEX. */
4739 rtx base, step;
4740 if (const_vec_series_p (src, &base, &step))
4741 {
4742 aarch64_expand_vec_series (target, base, step);
4743 return target;
4744 }
4745
4746 /* From here on, it's better to force the whole constant to memory
4747 if we can. */
4748 if (GET_MODE_NUNITS (mode).is_constant ())
4749 return NULL_RTX;
4750
43cacb12 4751 /* Expand each pattern individually. */
4aeb1ba7 4752 gcc_assert (npatterns > 1);
43cacb12
RS
4753 rtx_vector_builder builder;
4754 auto_vec<rtx, 16> vectors (npatterns);
4755 for (unsigned int i = 0; i < npatterns; ++i)
4756 {
4757 builder.new_vector (mode, 1, nelts_per_pattern);
4758 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4759 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4760 vectors.quick_push (force_reg (mode, builder.build ()));
4761 }
4762
4763 /* Use permutes to interleave the separate vectors. */
4764 while (npatterns > 1)
4765 {
4766 npatterns /= 2;
4767 for (unsigned int i = 0; i < npatterns; ++i)
4768 {
4aeb1ba7 4769 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
43cacb12
RS
4770 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4771 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4772 vectors[i] = tmp;
4773 }
4774 }
4aeb1ba7
RS
4775 gcc_assert (vectors[0] == target);
4776 return target;
43cacb12
RS
4777}
4778
678faefc
RS
4779/* Use WHILE to set a predicate register of mode MODE in which the first
4780 VL bits are set and the rest are clear. Use TARGET for the register
4781 if it's nonnull and convenient. */
0b1fe8cf 4782
678faefc
RS
4783static rtx
4784aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4785 unsigned int vl)
0b1fe8cf
RS
4786{
4787 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
678faefc 4788 target = aarch64_target_reg (target, mode);
6ad9571b 4789 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
624d0f07 4790 target, const0_rtx, limit));
678faefc
RS
4791 return target;
4792}
4793
2803bc3b
RS
4794static rtx
4795aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4796
4797/* BUILDER is a constant predicate in which the index of every set bit
4798 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4799 by inverting every element at a multiple of ELT_SIZE and EORing the
4800 result with an ELT_SIZE PTRUE.
4801
4802 Return a register that contains the constant on success, otherwise
4803 return null. Use TARGET as the register if it is nonnull and
4804 convenient. */
4805
4806static rtx
4807aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4808 unsigned int elt_size)
4809{
4810 /* Invert every element at a multiple of ELT_SIZE, keeping the
4811 other bits zero. */
4812 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4813 builder.nelts_per_pattern ());
4814 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4815 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4816 inv_builder.quick_push (const1_rtx);
4817 else
4818 inv_builder.quick_push (const0_rtx);
4819 inv_builder.finalize ();
4820
4821 /* See if we can load the constant cheaply. */
4822 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4823 if (!inv)
4824 return NULL_RTX;
4825
4826 /* EOR the result with an ELT_SIZE PTRUE. */
4827 rtx mask = aarch64_ptrue_all (elt_size);
4828 mask = force_reg (VNx16BImode, mask);
26bebf57 4829 inv = gen_lowpart (VNx16BImode, inv);
2803bc3b
RS
4830 target = aarch64_target_reg (target, VNx16BImode);
4831 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4832 return target;
4833}
4834
4835/* BUILDER is a constant predicate in which the index of every set bit
4836 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4837 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4838 register on success, otherwise return null. Use TARGET as the register
4839 if nonnull and convenient. */
4840
4841static rtx
4842aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4843 unsigned int elt_size,
4844 unsigned int permute_size)
4845{
4846 /* We're going to split the constant into two new constants A and B,
4847 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4848 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4849
4850 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4851 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4852
4853 where _ indicates elements that will be discarded by the permute.
4854
4855 First calculate the ELT_SIZEs for A and B. */
4856 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4857 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4858 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4859 if (INTVAL (builder.elt (i)) != 0)
4860 {
4861 if (i & permute_size)
4862 b_elt_size |= i - permute_size;
4863 else
4864 a_elt_size |= i;
4865 }
4866 a_elt_size &= -a_elt_size;
4867 b_elt_size &= -b_elt_size;
4868
4869 /* Now construct the vectors themselves. */
4870 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4871 builder.nelts_per_pattern ());
4872 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4873 builder.nelts_per_pattern ());
4874 unsigned int nelts = builder.encoded_nelts ();
4875 for (unsigned int i = 0; i < nelts; ++i)
4876 if (i & (elt_size - 1))
4877 {
4878 a_builder.quick_push (const0_rtx);
4879 b_builder.quick_push (const0_rtx);
4880 }
4881 else if ((i & permute_size) == 0)
4882 {
4883 /* The A and B elements are significant. */
4884 a_builder.quick_push (builder.elt (i));
4885 b_builder.quick_push (builder.elt (i + permute_size));
4886 }
4887 else
4888 {
4889 /* The A and B elements are going to be discarded, so pick whatever
4890 is likely to give a nice constant. We are targeting element
4891 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4892 with the aim of each being a sequence of ones followed by
4893 a sequence of zeros. So:
4894
4895 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4896 duplicate the last X_ELT_SIZE element, to extend the
4897 current sequence of ones or zeros.
4898
4899 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4900 zero, so that the constant really does have X_ELT_SIZE and
4901 not a smaller size. */
4902 if (a_elt_size > permute_size)
4903 a_builder.quick_push (const0_rtx);
4904 else
4905 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4906 if (b_elt_size > permute_size)
4907 b_builder.quick_push (const0_rtx);
4908 else
4909 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4910 }
4911 a_builder.finalize ();
4912 b_builder.finalize ();
4913
4914 /* Try loading A into a register. */
4915 rtx_insn *last = get_last_insn ();
4916 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4917 if (!a)
4918 return NULL_RTX;
4919
4920 /* Try loading B into a register. */
4921 rtx b = a;
4922 if (a_builder != b_builder)
4923 {
4924 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4925 if (!b)
4926 {
4927 delete_insns_since (last);
4928 return NULL_RTX;
4929 }
4930 }
4931
4932 /* Emit the TRN1 itself. */
4933 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4934 target = aarch64_target_reg (target, mode);
4935 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4936 gen_lowpart (mode, a),
4937 gen_lowpart (mode, b)));
4938 return target;
4939}
4940
678faefc
RS
4941/* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4942 constant in BUILDER into an SVE predicate register. Return the register
4943 on success, otherwise return null. Use TARGET for the register if
2803bc3b
RS
4944 nonnull and convenient.
4945
4946 ALLOW_RECURSE_P is true if we can use methods that would call this
4947 function recursively. */
678faefc
RS
4948
4949static rtx
2803bc3b
RS
4950aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4951 bool allow_recurse_p)
678faefc
RS
4952{
4953 if (builder.encoded_nelts () == 1)
4954 /* A PFALSE or a PTRUE .B ALL. */
4955 return aarch64_emit_set_immediate (target, builder);
4956
4957 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4958 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4959 {
4960 /* If we can load the constant using PTRUE, use it as-is. */
4961 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4962 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4963 return aarch64_emit_set_immediate (target, builder);
4964
4965 /* Otherwise use WHILE to set the first VL bits. */
4966 return aarch64_sve_move_pred_via_while (target, mode, vl);
4967 }
4968
2803bc3b
RS
4969 if (!allow_recurse_p)
4970 return NULL_RTX;
4971
4972 /* Try inverting the vector in element size ELT_SIZE and then EORing
4973 the result with an ELT_SIZE PTRUE. */
4974 if (INTVAL (builder.elt (0)) == 0)
4975 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4976 elt_size))
4977 return res;
4978
4979 /* Try using TRN1 to permute two simpler constants. */
4980 for (unsigned int i = elt_size; i <= 8; i *= 2)
4981 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4982 elt_size, i))
4983 return res;
4984
678faefc
RS
4985 return NULL_RTX;
4986}
4987
4988/* Return an SVE predicate register that contains the VNx16BImode
4989 constant in BUILDER, without going through the move expanders.
4990
4991 The returned register can have whatever mode seems most natural
4992 given the contents of BUILDER. Use TARGET for the result if
4993 convenient. */
4994
4995static rtx
4996aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4997{
4998 /* Try loading the constant using pure predicate operations. */
2803bc3b 4999 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
678faefc
RS
5000 return res;
5001
5002 /* Try forcing the constant to memory. */
5003 if (builder.full_nelts ().is_constant ())
5004 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5005 {
5006 target = aarch64_target_reg (target, VNx16BImode);
5007 emit_move_insn (target, mem);
5008 return target;
5009 }
5010
5011 /* The last resort is to load the constant as an integer and then
5012 compare it against zero. Use -1 for set bits in order to increase
5013 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5014 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5015 builder.nelts_per_pattern ());
5016 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5017 int_builder.quick_push (INTVAL (builder.elt (i))
5018 ? constm1_rtx : const0_rtx);
5019 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5020 int_builder.build ());
0b1fe8cf
RS
5021}
5022
4aeb1ba7 5023/* Set DEST to immediate IMM. */
43cacb12
RS
5024
5025void
4aeb1ba7 5026aarch64_expand_mov_immediate (rtx dest, rtx imm)
43cacb12
RS
5027{
5028 machine_mode mode = GET_MODE (dest);
82614948
RR
5029
5030 /* Check on what type of symbol it is. */
77e994c9
RS
5031 scalar_int_mode int_mode;
5032 if ((GET_CODE (imm) == SYMBOL_REF
5033 || GET_CODE (imm) == LABEL_REF
43cacb12
RS
5034 || GET_CODE (imm) == CONST
5035 || GET_CODE (imm) == CONST_POLY_INT)
77e994c9 5036 && is_a <scalar_int_mode> (mode, &int_mode))
82614948 5037 {
43cacb12
RS
5038 rtx mem;
5039 poly_int64 offset;
5040 HOST_WIDE_INT const_offset;
82614948
RR
5041 enum aarch64_symbol_type sty;
5042
5043 /* If we have (const (plus symbol offset)), separate out the offset
5044 before we start classifying the symbol. */
43cacb12 5045 rtx base = strip_offset (imm, &offset);
82614948 5046
43cacb12
RS
5047 /* We must always add an offset involving VL separately, rather than
5048 folding it into the relocation. */
5049 if (!offset.is_constant (&const_offset))
5050 {
c0e0174b
RS
5051 if (!TARGET_SVE)
5052 {
5053 aarch64_report_sve_required ();
5054 return;
5055 }
43cacb12
RS
5056 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5057 emit_insn (gen_rtx_SET (dest, imm));
5058 else
5059 {
5060 /* Do arithmetic on 32-bit values if the result is smaller
5061 than that. */
5062 if (partial_subreg_p (int_mode, SImode))
5063 {
5064 /* It is invalid to do symbol calculations in modes
5065 narrower than SImode. */
5066 gcc_assert (base == const0_rtx);
5067 dest = gen_lowpart (SImode, dest);
5068 int_mode = SImode;
5069 }
5070 if (base != const0_rtx)
5071 {
5072 base = aarch64_force_temporary (int_mode, dest, base);
5073 aarch64_add_offset (int_mode, dest, base, offset,
5074 NULL_RTX, NULL_RTX, false);
5075 }
5076 else
5077 aarch64_add_offset (int_mode, dest, base, offset,
5078 dest, NULL_RTX, false);
5079 }
5080 return;
5081 }
5082
5083 sty = aarch64_classify_symbol (base, const_offset);
82614948
RR
5084 switch (sty)
5085 {
5086 case SYMBOL_FORCE_TO_MEM:
43cacb12 5087 if (const_offset != 0
77e994c9 5088 && targetm.cannot_force_const_mem (int_mode, imm))
82614948
RR
5089 {
5090 gcc_assert (can_create_pseudo_p ());
77e994c9 5091 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
5092 aarch64_add_offset (int_mode, dest, base, const_offset,
5093 NULL_RTX, NULL_RTX, false);
82614948
RR
5094 return;
5095 }
b4f50fd4 5096
82614948
RR
5097 mem = force_const_mem (ptr_mode, imm);
5098 gcc_assert (mem);
b4f50fd4
RR
5099
5100 /* If we aren't generating PC relative literals, then
5101 we need to expand the literal pool access carefully.
5102 This is something that needs to be done in a number
5103 of places, so could well live as a separate function. */
9ee6540a 5104 if (!aarch64_pcrelative_literal_loads)
b4f50fd4
RR
5105 {
5106 gcc_assert (can_create_pseudo_p ());
5107 base = gen_reg_rtx (ptr_mode);
5108 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
00eee3fa
WD
5109 if (ptr_mode != Pmode)
5110 base = convert_memory_address (Pmode, base);
b4f50fd4
RR
5111 mem = gen_rtx_MEM (ptr_mode, base);
5112 }
5113
77e994c9
RS
5114 if (int_mode != ptr_mode)
5115 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
b4f50fd4 5116
f7df4a84 5117 emit_insn (gen_rtx_SET (dest, mem));
b4f50fd4 5118
82614948
RR
5119 return;
5120
5121 case SYMBOL_SMALL_TLSGD:
5122 case SYMBOL_SMALL_TLSDESC:
79496620 5123 case SYMBOL_SMALL_TLSIE:
1b1e81f8 5124 case SYMBOL_SMALL_GOT_28K:
6642bdb4 5125 case SYMBOL_SMALL_GOT_4G:
82614948 5126 case SYMBOL_TINY_GOT:
5ae7caad 5127 case SYMBOL_TINY_TLSIE:
43cacb12 5128 if (const_offset != 0)
82614948
RR
5129 {
5130 gcc_assert (can_create_pseudo_p ());
77e994c9 5131 base = aarch64_force_temporary (int_mode, dest, base);
43cacb12
RS
5132 aarch64_add_offset (int_mode, dest, base, const_offset,
5133 NULL_RTX, NULL_RTX, false);
82614948
RR
5134 return;
5135 }
5136 /* FALLTHRU */
5137
82614948
RR
5138 case SYMBOL_SMALL_ABSOLUTE:
5139 case SYMBOL_TINY_ABSOLUTE:
cbf5629e 5140 case SYMBOL_TLSLE12:
d18ba284 5141 case SYMBOL_TLSLE24:
cbf5629e
JW
5142 case SYMBOL_TLSLE32:
5143 case SYMBOL_TLSLE48:
82614948
RR
5144 aarch64_load_symref_appropriately (dest, imm, sty);
5145 return;
5146
5147 default:
5148 gcc_unreachable ();
5149 }
5150 }
5151
5152 if (!CONST_INT_P (imm))
5153 {
678faefc
RS
5154 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5155 {
5156 /* Only the low bit of each .H, .S and .D element is defined,
5157 so we can set the upper bits to whatever we like. If the
5158 predicate is all-true in MODE, prefer to set all the undefined
5159 bits as well, so that we can share a single .B predicate for
5160 all modes. */
5161 if (imm == CONSTM1_RTX (mode))
5162 imm = CONSTM1_RTX (VNx16BImode);
5163
5164 /* All methods for constructing predicate modes wider than VNx16BI
5165 will set the upper bits of each element to zero. Expose this
5166 by moving such constants as a VNx16BI, so that all bits are
5167 significant and so that constants for different modes can be
5168 shared. The wider constant will still be available as a
5169 REG_EQUAL note. */
5170 rtx_vector_builder builder;
5171 if (aarch64_get_sve_pred_bits (builder, imm))
5172 {
5173 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5174 if (dest != res)
5175 emit_move_insn (dest, gen_lowpart (mode, res));
5176 return;
5177 }
5178 }
5179
43cacb12
RS
5180 if (GET_CODE (imm) == HIGH
5181 || aarch64_simd_valid_immediate (imm, NULL))
43cacb12 5182 {
4aeb1ba7
RS
5183 emit_insn (gen_rtx_SET (dest, imm));
5184 return;
43e9d192 5185 }
82614948 5186
4aeb1ba7
RS
5187 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5188 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5189 {
5190 if (dest != res)
5191 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5192 return;
5193 }
5194
5195 rtx mem = force_const_mem (mode, imm);
5196 gcc_assert (mem);
5197 emit_move_insn (dest, mem);
82614948 5198 return;
43e9d192 5199 }
82614948 5200
77e994c9
RS
5201 aarch64_internal_mov_immediate (dest, imm, true,
5202 as_a <scalar_int_mode> (mode));
43e9d192
IB
5203}
5204
43cacb12
RS
5205/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5206 that is known to contain PTRUE. */
5207
5208void
5209aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5210{
0c63a8ee
TC
5211 expand_operand ops[3];
5212 machine_mode mode = GET_MODE (dest);
5213 create_output_operand (&ops[0], dest, mode);
5214 create_input_operand (&ops[1], pred, GET_MODE (pred));
5215 create_input_operand (&ops[2], src, mode);
f2b29269 5216 temporary_volatile_ok v (true);
0c63a8ee 5217 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
43cacb12
RS
5218}
5219
5220/* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5221 operand is in memory. In this case we need to use the predicated LD1
5222 and ST1 instead of LDR and STR, both for correctness on big-endian
5223 targets and because LD1 and ST1 support a wider range of addressing modes.
5224 PRED_MODE is the mode of the predicate.
5225
5226 See the comment at the head of aarch64-sve.md for details about the
5227 big-endian handling. */
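/* As an illustrative sketch (assumed output, not taken from the
   aarch64-sve.md patterns themselves), a memory-to-memory VNx16QI copy
   is expanded as a predicated load into a temporary followed by a
   predicated store, along the lines of:

       ptrue   p0.b
       ld1b    z0.b, p0/z, [x0]
       st1b    z0.b, p0, [x1]

   instead of an LDR/STR pair, for the correctness and addressing-mode
   reasons given above.  */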
5228
5229void
5230aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5231{
5232 machine_mode mode = GET_MODE (dest);
16de3637 5233 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
5234 if (!register_operand (src, mode)
5235 && !register_operand (dest, mode))
5236 {
5237 rtx tmp = gen_reg_rtx (mode);
5238 if (MEM_P (src))
5239 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5240 else
5241 emit_move_insn (tmp, src);
5242 src = tmp;
5243 }
5244 aarch64_emit_sve_pred_move (dest, ptrue, src);
5245}
5246
002092be
RS
5247/* Called only on big-endian targets. See whether an SVE vector move
5248 from SRC to DEST is effectively a REV[BHW] instruction, because at
5249 least one operand is a subreg of an SVE vector that has wider or
5250 narrower elements. Return true and emit the instruction if so.
5251
5252 For example:
5253
5254 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5255
5256 represents a VIEW_CONVERT between the following vectors, viewed
5257 in memory order:
5258
5259 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5260 R1: { [0], [1], [2], [3], ... }
5261
5262 The high part of lane X in R2 should therefore correspond to lane X*2
5263 of R1, but the register representations are:
5264
5265 msb lsb
5266 R2: ...... [1].high [1].low [0].high [0].low
5267 R1: ...... [3] [2] [1] [0]
5268
5269 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5270 We therefore need a reverse operation to swap the high and low values
5271 around.
5272
5273 This is purely an optimization. Without it we would spill the
5274 subreg operand to the stack in one mode and reload it in the
5275 other mode, which has the same effect as the REV. */
5276
5277bool
5278aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5279{
5280 gcc_assert (BYTES_BIG_ENDIAN);
5281 if (GET_CODE (dest) == SUBREG)
5282 dest = SUBREG_REG (dest);
5283 if (GET_CODE (src) == SUBREG)
5284 src = SUBREG_REG (src);
5285
5286 /* The optimization handles two single SVE REGs with different element
5287 sizes. */
5288 if (!REG_P (dest)
5289 || !REG_P (src)
5290 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5291 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5292 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5293 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5294 return false;
5295
5296 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
16de3637 5297 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
002092be
RS
5298 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5299 UNSPEC_REV_SUBREG);
5300 emit_insn (gen_rtx_SET (dest, unspec));
5301 return true;
5302}
5303
5304/* Return a copy of X with mode MODE, without changing its other
5305 attributes. Unlike gen_lowpart, this doesn't care whether the
5306 mode change is valid. */
5307
624d0f07 5308rtx
002092be
RS
5309aarch64_replace_reg_mode (rtx x, machine_mode mode)
5310{
5311 if (GET_MODE (x) == mode)
5312 return x;
5313
5314 x = shallow_copy_rtx (x);
5315 set_mode_and_regno (x, mode, REGNO (x));
5316 return x;
5317}
5318
d7a09c44
RS
5319/* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5320 stored in wider integer containers. */
5321
5322static unsigned int
5323aarch64_sve_rev_unspec (machine_mode mode)
5324{
5325 switch (GET_MODE_UNIT_SIZE (mode))
5326 {
5327 case 1: return UNSPEC_REVB;
5328 case 2: return UNSPEC_REVH;
5329 case 4: return UNSPEC_REVW;
5330 }
5331 gcc_unreachable ();
5332}
5333
002092be
RS
5334/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5335 operands. */
5336
5337void
5338aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5339{
d7a09c44
RS
5340 /* Decide which REV operation we need. The mode with wider elements
5341 determines the mode of the operands and the mode with the narrower
002092be 5342 elements determines the reverse width. */
5c06093c
RS
5343 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5344 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
002092be
RS
5345 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5346 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5347 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5348
d7a09c44 5349 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
cc68f7c2 5350 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
002092be 5351
d7a09c44 5352 /* Get the operands in the appropriate modes and emit the instruction. */
002092be 5353 ptrue = gen_lowpart (pred_mode, ptrue);
d7a09c44
RS
5354 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5355 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5356 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5357 dest, ptrue, src));
002092be
RS
5358}
5359
43e9d192 5360static bool
c600df9a 5361aarch64_function_ok_for_sibcall (tree, tree exp)
43e9d192 5362{
c600df9a 5363 if (crtl->abi->id () != expr_callee_abi (exp).id ())
a0d0b980
SE
5364 return false;
5365
43e9d192
IB
5366 return true;
5367}
5368
38e62001
RS
5369/* Subroutine of aarch64_pass_by_reference for arguments that are not
5370 passed in SVE registers. */
43e9d192
IB
5371
5372static bool
56fe3ca3
RS
5373aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5374 const function_arg_info &arg)
43e9d192
IB
5375{
5376 HOST_WIDE_INT size;
ef4bddc2 5377 machine_mode dummymode;
43e9d192
IB
5378 int nregs;
5379
5380 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
52090e4d
RS
5381 if (arg.mode == BLKmode && arg.type)
5382 size = int_size_in_bytes (arg.type);
6a70badb
RS
5383 else
5384 /* No frontends can create types with variable-sized modes, so we
5385 shouldn't be asked to pass or return them. */
52090e4d 5386 size = GET_MODE_SIZE (arg.mode).to_constant ();
43e9d192 5387
aadc1c43 5388 /* Aggregates are passed by reference based on their size. */
52090e4d
RS
5389 if (arg.aggregate_type_p ())
5390 size = int_size_in_bytes (arg.type);
43e9d192
IB
5391
5392 /* Variable sized arguments are always returned by reference. */
5393 if (size < 0)
5394 return true;
5395
5396 /* Can this be a candidate to be passed in fp/simd register(s)? */
52090e4d 5397 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
56fe3ca3
RS
5398 &dummymode, &nregs, NULL,
5399 !pcum || pcum->silent_p))
43e9d192
IB
5400 return false;
5401
5402 /* Arguments which are variable sized or larger than 2 registers are
5403 passed by reference unless they are a homogenous floating point
5404 aggregate. */
5405 return size > 2 * UNITS_PER_WORD;
5406}
5407
38e62001
RS
5408/* Implement TARGET_PASS_BY_REFERENCE. */
5409
5410static bool
5411aarch64_pass_by_reference (cumulative_args_t pcum_v,
5412 const function_arg_info &arg)
5413{
5414 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5415
5416 if (!arg.type)
56fe3ca3 5417 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
5418
5419 pure_scalable_type_info pst_info;
5420 switch (pst_info.analyze (arg.type))
5421 {
5422 case pure_scalable_type_info::IS_PST:
5423 if (pcum && !pcum->silent_p && !TARGET_SVE)
5424 /* We can't gracefully recover at this point, so make this a
5425 fatal error. */
5426 fatal_error (input_location, "arguments of type %qT require"
5427 " the SVE ISA extension", arg.type);
5428
5429 /* Variadic SVE types are passed by reference. Normal non-variadic
5430 arguments are too if we've run out of registers. */
5431 return (!arg.named
5432 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5433 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5434
5435 case pure_scalable_type_info::DOESNT_MATTER:
56fe3ca3 5436 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
38e62001
RS
5437 return true;
5438
5439 case pure_scalable_type_info::NO_ABI_IDENTITY:
5440 case pure_scalable_type_info::ISNT_PST:
56fe3ca3 5441 return aarch64_pass_by_reference_1 (pcum, arg);
38e62001
RS
5442 }
5443 gcc_unreachable ();
5444}
5445
43e9d192
IB
5446/* Return TRUE if VALTYPE is padded to its least significant bits. */
5447static bool
5448aarch64_return_in_msb (const_tree valtype)
5449{
ef4bddc2 5450 machine_mode dummy_mode;
43e9d192
IB
5451 int dummy_int;
5452
5453 /* Never happens in little-endian mode. */
5454 if (!BYTES_BIG_ENDIAN)
5455 return false;
5456
5457 /* Only composite types smaller than or equal to 16 bytes can
5458 be potentially returned in registers. */
5459 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5460 || int_size_in_bytes (valtype) <= 0
5461 || int_size_in_bytes (valtype) > 16)
5462 return false;
5463
5464 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5465 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5466 is always passed/returned in the least significant bits of fp/simd
5467 register(s). */
5468 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
56fe3ca3
RS
5469 &dummy_mode, &dummy_int, NULL,
5470 false))
43e9d192
IB
5471 return false;
5472
38e62001
RS
5473 /* Likewise pure scalable types for SVE vector and predicate registers. */
5474 pure_scalable_type_info pst_info;
5475 if (pst_info.analyze_registers (valtype))
5476 return false;
5477
43e9d192
IB
5478 return true;
5479}
5480
38e62001
RS
5481/* Implement TARGET_FUNCTION_VALUE.
5482 Define how to find the value returned by a function. */
5483
43e9d192 5484static rtx
38e62001
RS
5485aarch64_function_value (const_tree type, const_tree func,
5486 bool outgoing ATTRIBUTE_UNUSED)
43e9d192 5487{
38e62001
RS
5488 machine_mode mode;
5489 int unsignedp;
c600df9a 5490
38e62001
RS
5491 mode = TYPE_MODE (type);
5492 if (INTEGRAL_TYPE_P (type))
5493 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
c600df9a 5494
38e62001
RS
5495 pure_scalable_type_info pst_info;
5496 if (type && pst_info.analyze_registers (type))
5497 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
c600df9a 5498
38e62001
RS
5499 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5500 are returned in memory, not by value. */
5501 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5502 bool sve_p = (vec_flags & VEC_ANY_SVE);
c600df9a 5503
43e9d192
IB
5504 if (aarch64_return_in_msb (type))
5505 {
5506 HOST_WIDE_INT size = int_size_in_bytes (type);
5507
5508 if (size % UNITS_PER_WORD != 0)
5509 {
5510 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
f4b31647 5511 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
43e9d192
IB
5512 }
5513 }
5514
6aa5370c
RS
5515 int count;
5516 machine_mode ag_mode;
56fe3ca3
RS
5517 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5518 NULL, false))
43e9d192 5519 {
38e62001 5520 gcc_assert (!sve_p);
43e9d192
IB
5521 if (!aarch64_composite_type_p (type, mode))
5522 {
5523 gcc_assert (count == 1 && mode == ag_mode);
5524 return gen_rtx_REG (mode, V0_REGNUM);
5525 }
5526 else
5527 {
5528 int i;
5529 rtx par;
5530
5531 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5532 for (i = 0; i < count; i++)
5533 {
5534 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6a70badb
RS
5535 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5536 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
5537 XVECEXP (par, 0, i) = tmp;
5538 }
5539 return par;
5540 }
5541 }
5542 else
6aa5370c 5543 {
38e62001
RS
5544 if (sve_p)
5545 {
5546 /* Vector types can acquire a partial SVE mode using things like
5547 __attribute__((vector_size(N))), and this is potentially useful.
5548 However, the choice of mode doesn't affect the type's ABI
5549 identity, so we should treat the types as though they had
5550 the associated integer mode, just like they did before SVE
5551 was introduced.
5552
5553 We know that the vector must be 128 bits or smaller,
5554 otherwise we'd have returned it in memory instead. */
5555 gcc_assert (type
5556 && (aarch64_some_values_include_pst_objects_p (type)
5557 || (vec_flags & VEC_PARTIAL)));
5558
5559 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5560 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5561 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5562 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5563 }
5564 return gen_rtx_REG (mode, R0_REGNUM);
6aa5370c 5565 }
6aa5370c
RS
5566}
5567
43e9d192
IB
5568/* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5569 Return true if REGNO is the number of a hard register in which the values
5570 of called function may come back. */
5571
5572static bool
5573aarch64_function_value_regno_p (const unsigned int regno)
5574{
5575 /* Maximum of 16 bytes can be returned in the general registers. Examples
5576 of 16-byte return values are: 128-bit integers and 16-byte small
5577 structures (excluding homogeneous floating-point aggregates). */
5578 if (regno == R0_REGNUM || regno == R1_REGNUM)
5579 return true;
5580
5581 /* Up to four fp/simd registers can return a function value, e.g. a
5582 homogeneous floating-point aggregate having four members. */
5583 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
d5726973 5584 return TARGET_FLOAT;
43e9d192
IB
5585
5586 return false;
5587}
5588
38e62001
RS
5589/* Subroutine for aarch64_return_in_memory for types that are not returned
5590 in SVE registers. */
43e9d192
IB
5591
5592static bool
38e62001 5593aarch64_return_in_memory_1 (const_tree type)
43e9d192
IB
5594{
5595 HOST_WIDE_INT size;
ef4bddc2 5596 machine_mode ag_mode;
43e9d192
IB
5597 int count;
5598
5599 if (!AGGREGATE_TYPE_P (type)
5600 && TREE_CODE (type) != COMPLEX_TYPE
5601 && TREE_CODE (type) != VECTOR_TYPE)
5602 /* Simple scalar types are always returned in registers. */
5603 return false;
5604
56fe3ca3
RS
5605 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5606 &ag_mode, &count, NULL, false))
43e9d192
IB
5607 return false;
5608
5609 /* Types larger than 2 registers are returned in memory. */
5610 size = int_size_in_bytes (type);
5611 return (size < 0 || size > 2 * UNITS_PER_WORD);
5612}
5613
38e62001
RS
5614/* Implement TARGET_RETURN_IN_MEMORY.
5615
5616 If the type T of the result of a function is such that
5617 void func (T arg)
5618 would require that arg be passed as a value in a register (or set of
5619 registers) according to the parameter passing rules, then the result
5620 is returned in the same registers as would be used for such an
5621 argument. */
5622
5623static bool
5624aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5625{
5626 pure_scalable_type_info pst_info;
5627 switch (pst_info.analyze (type))
5628 {
5629 case pure_scalable_type_info::IS_PST:
5630 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5631 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5632
5633 case pure_scalable_type_info::DOESNT_MATTER:
5634 gcc_assert (aarch64_return_in_memory_1 (type));
5635 return true;
5636
5637 case pure_scalable_type_info::NO_ABI_IDENTITY:
5638 case pure_scalable_type_info::ISNT_PST:
5639 return aarch64_return_in_memory_1 (type);
5640 }
5641 gcc_unreachable ();
5642}
5643
43e9d192 5644static bool
ef4bddc2 5645aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
43e9d192
IB
5646 const_tree type, int *nregs)
5647{
5648 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
56fe3ca3 5649 return aarch64_vfp_is_call_or_return_candidate (mode, type,
43e9d192 5650 &pcum->aapcs_vfp_rmode,
56fe3ca3 5651 nregs, NULL, pcum->silent_p);
43e9d192
IB
5652}
5653
985b8393 5654/* Given MODE and TYPE of a function argument, return the alignment in
43e9d192 5655 bits. The idea is to suppress any stronger alignment requested by
c590597c
RE
5656 the user and opt for the natural alignment (specified in AAPCS64 \S
5657 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5658 calculated in versions of GCC prior to GCC-9. This is a helper
5659 function for local use only. */
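/* As a hypothetical illustration of the ABI_BREAK case (assumed, not
   taken from the surrounding sources): for something like

     struct S { unsigned __int128 x : 1; };

   the alignment implied by the bit-field's declared type (16 bytes)
   can exceed the maximum DECL_ALIGN seen on the fields themselves, in
   which case this function returns the larger value and sets
   *ABI_BREAK so that callers can emit the -Wpsabi note about the
   GCC 9.1 change.  */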
43e9d192 5660
985b8393 5661static unsigned int
c590597c
RE
5662aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5663 bool *abi_break)
43e9d192 5664{
c590597c 5665 *abi_break = false;
75d6cc81 5666 if (!type)
985b8393 5667 return GET_MODE_ALIGNMENT (mode);
2ec07fa6 5668
75d6cc81 5669 if (integer_zerop (TYPE_SIZE (type)))
985b8393 5670 return 0;
43e9d192 5671
75d6cc81
AL
5672 gcc_assert (TYPE_MODE (type) == mode);
5673
5674 if (!AGGREGATE_TYPE_P (type))
985b8393 5675 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
75d6cc81
AL
5676
5677 if (TREE_CODE (type) == ARRAY_TYPE)
985b8393 5678 return TYPE_ALIGN (TREE_TYPE (type));
75d6cc81 5679
985b8393 5680 unsigned int alignment = 0;
c590597c 5681 unsigned int bitfield_alignment = 0;
75d6cc81 5682 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
985b8393 5683 if (TREE_CODE (field) == FIELD_DECL)
c590597c 5684 {
56fe3ca3
RS
5685 /* Note that we explicitly consider zero-sized fields here,
5686 even though they don't map to AAPCS64 machine types.
5687 For example, in:
5688
5689 struct __attribute__((aligned(8))) empty {};
5690
5691 struct s {
5692 [[no_unique_address]] empty e;
5693 int x;
5694 };
5695
5696 "s" contains only one Fundamental Data Type (the int field)
5697 but gains 8-byte alignment and size thanks to "e". */
c590597c
RE
5698 alignment = std::max (alignment, DECL_ALIGN (field));
5699 if (DECL_BIT_FIELD_TYPE (field))
5700 bitfield_alignment
5701 = std::max (bitfield_alignment,
5702 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5703 }
5704
5705 if (bitfield_alignment > alignment)
5706 {
5707 *abi_break = true;
5708 return bitfield_alignment;
5709 }
43e9d192 5710
985b8393 5711 return alignment;
43e9d192
IB
5712}
5713
5714/* Layout a function argument according to the AAPCS64 rules. The rule
6aa5370c
RS
5715 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5716 mode that was originally given to us by the target hook, whereas the
5717 mode in ARG might be the result of replacing partial SVE modes with
5718 the equivalent integer mode. */
43e9d192
IB
5719
5720static void
38e62001 5721aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
5722{
5723 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
c600df9a
RS
5724 tree type = arg.type;
5725 machine_mode mode = arg.mode;
43e9d192
IB
5726 int ncrn, nvrn, nregs;
5727 bool allocate_ncrn, allocate_nvrn;
3abf17cf 5728 HOST_WIDE_INT size;
c590597c 5729 bool abi_break;
43e9d192
IB
5730
5731 /* We need to do this once per argument. */
5732 if (pcum->aapcs_arg_processed)
5733 return;
5734
5735 pcum->aapcs_arg_processed = true;
5736
38e62001
RS
5737 pure_scalable_type_info pst_info;
5738 if (type && pst_info.analyze_registers (type))
c600df9a
RS
5739 {
5740 /* The PCS says that it is invalid to pass an SVE value to an
5741 unprototyped function. There is no ABI-defined location we
5742 can return in this case, so we have no real choice but to raise
5743 an error immediately, even though this is only a query function. */
5744 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5745 {
5746 gcc_assert (!pcum->silent_p);
5747 error ("SVE type %qT cannot be passed to an unprototyped function",
5748 arg.type);
5749 /* Avoid repeating the message, and avoid tripping the assert
5750 below. */
5751 pcum->pcs_variant = ARM_PCS_SVE;
5752 }
5753
5754 /* We would have converted the argument into pass-by-reference
5755 form if it didn't fit in registers. */
38e62001
RS
5756 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5757 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
c600df9a
RS
5758 gcc_assert (arg.named
5759 && pcum->pcs_variant == ARM_PCS_SVE
c600df9a
RS
5760 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5761 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
38e62001
RS
5762 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5763 P0_REGNUM + pcum->aapcs_nprn);
c600df9a
RS
5764 return;
5765 }
5766
38e62001
RS
5767 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5768 are passed by reference, not by value. */
5769 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5770 bool sve_p = (vec_flags & VEC_ANY_SVE);
5771 if (sve_p)
5772 /* Vector types can acquire a partial SVE mode using things like
5773 __attribute__((vector_size(N))), and this is potentially useful.
5774 However, the choice of mode doesn't affect the type's ABI
5775 identity, so we should treat the types as though they had
5776 the associated integer mode, just like they did before SVE
5777 was introduced.
5778
5779 We know that the vector must be 128 bits or smaller,
5780 otherwise we'd have passed it in memory instead. */
5781 gcc_assert (type
5782 && (aarch64_some_values_include_pst_objects_p (type)
5783 || (vec_flags & VEC_PARTIAL)));
c600df9a 5784
3abf17cf 5785 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
6a70badb
RS
5786 if (type)
5787 size = int_size_in_bytes (type);
5788 else
5789 /* No frontends can create types with variable-sized modes, so we
5790 shouldn't be asked to pass or return them. */
5791 size = GET_MODE_SIZE (mode).to_constant ();
5792 size = ROUND_UP (size, UNITS_PER_WORD);
3abf17cf 5793
43e9d192
IB
5794 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5795 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5796 mode,
5797 type,
5798 &nregs);
38e62001 5799 gcc_assert (!sve_p || !allocate_nvrn);
43e9d192
IB
5800
5801 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5802 The following code thus handles passing by SIMD/FP registers first. */
5803
5804 nvrn = pcum->aapcs_nvrn;
5805
5806 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
5807 and homogenous short-vector aggregates (HVA). */
5808 if (allocate_nvrn)
5809 {
c600df9a 5810 if (!pcum->silent_p && !TARGET_FLOAT)
fc29dfc9 5811 aarch64_err_no_fpadvsimd (mode);
261fb553 5812
43e9d192
IB
5813 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5814 {
5815 pcum->aapcs_nextnvrn = nvrn + nregs;
5816 if (!aarch64_composite_type_p (type, mode))
5817 {
5818 gcc_assert (nregs == 1);
5819 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5820 }
5821 else
5822 {
5823 rtx par;
5824 int i;
5825 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5826 for (i = 0; i < nregs; i++)
5827 {
5828 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5829 V0_REGNUM + nvrn + i);
6a70badb
RS
5830 rtx offset = gen_int_mode
5831 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5832 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
43e9d192
IB
5833 XVECEXP (par, 0, i) = tmp;
5834 }
5835 pcum->aapcs_reg = par;
5836 }
5837 return;
5838 }
5839 else
5840 {
5841 /* C.3 NSRN is set to 8. */
5842 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5843 goto on_stack;
5844 }
5845 }
5846
5847 ncrn = pcum->aapcs_ncrn;
3abf17cf 5848 nregs = size / UNITS_PER_WORD;
43e9d192
IB
5849
5850 /* C6 - C9, though the sign and zero extension semantics are
5851 handled elsewhere. This is the case where the argument fits
5852 entirely in general registers. */
5853 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5854 {
43e9d192
IB
5855 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5856
5857 /* C.8 if the argument has an alignment of 16 then the NGRN is
c590597c 5858 rounded up to the next even number. */
985b8393
JJ
5859 if (nregs == 2
5860 && ncrn % 2
2ec07fa6 5861 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
985b8393 5862 comparison is there because for > 16 * BITS_PER_UNIT
2ec07fa6
RR
5863 alignment nregs should be > 2 and therefore it should be
5864 passed by reference rather than value. */
38e62001 5865 && (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c 5866 == 16 * BITS_PER_UNIT))
985b8393 5867 {
c590597c
RE
5868 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5869 inform (input_location, "parameter passing for argument of type "
5870 "%qT changed in GCC 9.1", type);
985b8393
JJ
5871 ++ncrn;
5872 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
43e9d192 5873 }
2ec07fa6 5874
38e62001
RS
5875 /* If an argument with an SVE mode needs to be shifted up to the
5876 high part of the register, treat it as though it had an integer mode.
5877 Using the normal (parallel [...]) would suppress the shifting. */
5878 if (sve_p
5879 && BYTES_BIG_ENDIAN
5880 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5881 && aarch64_pad_reg_upward (mode, type, false))
5882 {
5883 mode = int_mode_for_mode (mode).require ();
5884 sve_p = false;
5885 }
5886
43e9d192 5887 /* NREGS can be 0 when e.g. an empty structure is to be passed.
c590597c 5888 A reg is still generated for it, but the caller should be smart
43e9d192 5889 enough not to use it. */
38e62001
RS
5890 if (nregs == 0
5891 || (nregs == 1 && !sve_p)
5892 || GET_MODE_CLASS (mode) == MODE_INT)
2ec07fa6 5893 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
43e9d192
IB
5894 else
5895 {
5896 rtx par;
5897 int i;
5898
5899 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5900 for (i = 0; i < nregs; i++)
5901 {
38e62001
RS
5902 scalar_int_mode reg_mode = word_mode;
5903 if (nregs == 1)
5904 reg_mode = int_mode_for_mode (mode).require ();
5905 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
43e9d192
IB
5906 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5907 GEN_INT (i * UNITS_PER_WORD));
5908 XVECEXP (par, 0, i) = tmp;
5909 }
5910 pcum->aapcs_reg = par;
5911 }
5912
5913 pcum->aapcs_nextncrn = ncrn + nregs;
5914 return;
5915 }
5916
5917 /* C.11 */
5918 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5919
5920 /* The argument is passed on stack; record the needed number of words for
3abf17cf 5921 this argument and align the total size if necessary. */
43e9d192 5922on_stack:
3abf17cf 5923 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2ec07fa6 5924
38e62001 5925 if (aarch64_function_arg_alignment (mode, type, &abi_break)
c590597c
RE
5926 == 16 * BITS_PER_UNIT)
5927 {
5928 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5929 if (pcum->aapcs_stack_size != new_size)
5930 {
5931 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5932 inform (input_location, "parameter passing for argument of type "
5933 "%qT changed in GCC 9.1", type);
5934 pcum->aapcs_stack_size = new_size;
5935 }
5936 }
43e9d192
IB
5937 return;
5938}
5939
5940/* Implement TARGET_FUNCTION_ARG. */
5941
5942static rtx
6783fdb7 5943aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
43e9d192
IB
5944{
5945 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 5946 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
5947 || pcum->pcs_variant == ARM_PCS_SIMD
5948 || pcum->pcs_variant == ARM_PCS_SVE);
43e9d192 5949
6783fdb7 5950 if (arg.end_marker_p ())
08cc4d92 5951 return gen_int_mode (pcum->pcs_variant, DImode);
43e9d192 5952
38e62001 5953 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
5954 return pcum->aapcs_reg;
5955}
5956
5957void
5958aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
08cc4d92
RS
5959 const_tree fntype,
5960 rtx libname ATTRIBUTE_UNUSED,
5961 const_tree fndecl ATTRIBUTE_UNUSED,
c600df9a
RS
5962 unsigned n_named ATTRIBUTE_UNUSED,
5963 bool silent_p)
43e9d192
IB
5964{
5965 pcum->aapcs_ncrn = 0;
5966 pcum->aapcs_nvrn = 0;
c600df9a 5967 pcum->aapcs_nprn = 0;
43e9d192
IB
5968 pcum->aapcs_nextncrn = 0;
5969 pcum->aapcs_nextnvrn = 0;
c600df9a 5970 pcum->aapcs_nextnprn = 0;
08cc4d92
RS
5971 if (fntype)
5972 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5973 else
5974 pcum->pcs_variant = ARM_PCS_AAPCS64;
43e9d192
IB
5975 pcum->aapcs_reg = NULL_RTX;
5976 pcum->aapcs_arg_processed = false;
5977 pcum->aapcs_stack_words = 0;
5978 pcum->aapcs_stack_size = 0;
c600df9a 5979 pcum->silent_p = silent_p;
43e9d192 5980
c600df9a
RS
5981 if (!silent_p
5982 && !TARGET_FLOAT
261fb553
AL
5983 && fndecl && TREE_PUBLIC (fndecl)
5984 && fntype && fntype != error_mark_node)
5985 {
5986 const_tree type = TREE_TYPE (fntype);
5987 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5988 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5989 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
56fe3ca3 5990 &mode, &nregs, NULL, false))
fc29dfc9 5991 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
261fb553 5992 }
c600df9a
RS
5993
5994 if (!silent_p
5995 && !TARGET_SVE
5996 && pcum->pcs_variant == ARM_PCS_SVE)
5997 {
5998 /* We can't gracefully recover at this point, so make this a
5999 fatal error. */
6000 if (fndecl)
6001 fatal_error (input_location, "%qE requires the SVE ISA extension",
6002 fndecl);
6003 else
6004 fatal_error (input_location, "calls to functions of type %qT require"
6005 " the SVE ISA extension", fntype);
6006 }
43e9d192
IB
6007}
6008
6009static void
6010aarch64_function_arg_advance (cumulative_args_t pcum_v,
6930c98c 6011 const function_arg_info &arg)
43e9d192
IB
6012{
6013 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
08cc4d92 6014 if (pcum->pcs_variant == ARM_PCS_AAPCS64
c600df9a
RS
6015 || pcum->pcs_variant == ARM_PCS_SIMD
6016 || pcum->pcs_variant == ARM_PCS_SVE)
43e9d192 6017 {
38e62001 6018 aarch64_layout_arg (pcum_v, arg);
43e9d192
IB
6019 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6020 != (pcum->aapcs_stack_words != 0));
6021 pcum->aapcs_arg_processed = false;
6022 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6023 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
c600df9a 6024 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
43e9d192
IB
6025 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6026 pcum->aapcs_stack_words = 0;
6027 pcum->aapcs_reg = NULL_RTX;
6028 }
6029}
6030
6031bool
6032aarch64_function_arg_regno_p (unsigned regno)
6033{
6034 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6035 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6036}
6037
6038/* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6039 PARM_BOUNDARY bits of alignment, but will be given anything up
6040 to STACK_BOUNDARY bits if the type requires it. This makes sure
6041 that both before and after the layout of each argument, the Next
6042 Stacked Argument Address (NSAA) will have a minimum alignment of
6043 8 bytes. */
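/* For instance (illustrative only): a type with 4-byte natural
   alignment is still given PARM_BOUNDARY (64-bit) alignment here,
   while an aggregate requesting 32-byte alignment is clamped down to
   STACK_BOUNDARY (128 bits), so every stack argument keeps at least
   the 8-byte minimum described above without exceeding the stack
   alignment. */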
6044
6045static unsigned int
ef4bddc2 6046aarch64_function_arg_boundary (machine_mode mode, const_tree type)
43e9d192 6047{
c590597c
RE
6048 bool abi_break;
6049 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6050 &abi_break);
6051 if (abi_break && warn_psabi)
6052 inform (input_location, "parameter passing for argument of type "
6053 "%qT changed in GCC 9.1", type);
6054
985b8393 6055 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
43e9d192
IB
6056}
6057
43cacb12
RS
6058/* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6059
6060static fixed_size_mode
6061aarch64_get_reg_raw_mode (int regno)
6062{
6063 if (TARGET_SVE && FP_REGNUM_P (regno))
6064 /* Don't use the SVE part of the register for __builtin_apply and
6065 __builtin_return. The SVE registers aren't used by the normal PCS,
6066 so using them there would be a waste of time. The PCS extensions
6067 for SVE types are fundamentally incompatible with the
6068 __builtin_return/__builtin_apply interface. */
6069 return as_a <fixed_size_mode> (V16QImode);
6070 return default_get_reg_raw_mode (regno);
6071}
6072
76b0cbf8 6073/* Implement TARGET_FUNCTION_ARG_PADDING.
43e9d192
IB
6074
6075 Small aggregate types are placed in the lowest memory address.
6076
6077 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6078
76b0cbf8
RS
6079static pad_direction
6080aarch64_function_arg_padding (machine_mode mode, const_tree type)
43e9d192
IB
6081{
6082 /* On little-endian targets, the least significant byte of every stack
6083 argument is passed at the lowest byte address of the stack slot. */
6084 if (!BYTES_BIG_ENDIAN)
76b0cbf8 6085 return PAD_UPWARD;
43e9d192 6086
00edcfbe 6087 /* Otherwise, integral, floating-point and pointer types are padded downward:
43e9d192
IB
6088 the least significant byte of a stack argument is passed at the highest
6089 byte address of the stack slot. */
6090 if (type
00edcfbe
YZ
6091 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6092 || POINTER_TYPE_P (type))
43e9d192 6093 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
76b0cbf8 6094 return PAD_DOWNWARD;
43e9d192
IB
6095
6096 /* Everything else padded upward, i.e. data in first byte of stack slot. */
76b0cbf8 6097 return PAD_UPWARD;
43e9d192
IB
6098}
6099
6100/* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6101
6102 It specifies padding for the last (may also be the only)
6103 element of a block move between registers and memory.
6104 Assuming the block is in memory, padding upward means that
6105 the last element is padded after its most significant byte,
6106 while in downward padding, the last element is padded at
6107 its least significant byte side.
6108
6109 Small aggregates and small complex types are always padded
6110 upwards.
6111
6112 We don't need to worry about homogeneous floating-point or
6113 short-vector aggregates; their move is not affected by the
6114 padding direction determined here. Regardless of endianness,
6115 each element of such an aggregate is put in the least
6116 significant bits of a fp/simd register.
6117
6118 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6119 register has useful data, and return the opposite if the most
6120 significant byte does. */
6121
6122bool
ef4bddc2 6123aarch64_pad_reg_upward (machine_mode mode, const_tree type,
43e9d192
IB
6124 bool first ATTRIBUTE_UNUSED)
6125{
6126
38e62001
RS
6127 /* Aside from pure scalable types, small composite types are always
6128 padded upward. */
43e9d192
IB
6129 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6130 {
6a70badb
RS
6131 HOST_WIDE_INT size;
6132 if (type)
6133 size = int_size_in_bytes (type);
6134 else
6135 /* No frontends can create types with variable-sized modes, so we
6136 shouldn't be asked to pass or return them. */
6137 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192 6138 if (size < 2 * UNITS_PER_WORD)
38e62001
RS
6139 {
6140 pure_scalable_type_info pst_info;
6141 if (pst_info.analyze_registers (type))
6142 return false;
6143 return true;
6144 }
43e9d192
IB
6145 }
6146
6147 /* Otherwise, use the default padding. */
6148 return !BYTES_BIG_ENDIAN;
6149}
6150
095a2d76 6151static scalar_int_mode
43e9d192
IB
6152aarch64_libgcc_cmp_return_mode (void)
6153{
6154 return SImode;
6155}
6156
a3eb8a52
EB
6157#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6158
6159/* We use the 12-bit shifted immediate arithmetic instructions so values
6160 must be a multiple of (1 << 12), i.e. 4096. */
6161#define ARITH_FACTOR 4096
6162
6163#if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6164#error Cannot use simple address calculation for stack probing
6165#endif
6166
6167/* The pair of scratch registers used for stack probing. */
8921ccbb
OH
6168#define PROBE_STACK_FIRST_REG R9_REGNUM
6169#define PROBE_STACK_SECOND_REG R10_REGNUM
a3eb8a52 6170
6a70badb 6171/* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
a3eb8a52
EB
6172 inclusive. These are offsets from the current stack pointer. */
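/* As an illustrative sketch (assumed, not actual compiler output):
   for a constant POLY_SIZE of 4096 bytes and FIRST of 0, the
   small-size path below boils down to something like

       sub  x9, sp, #4096
       str  xzr, [x9]

   i.e. a single probe at the far end of the region, using the first
   scratch register defined above. */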
6173
6174static void
6a70badb 6175aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
a3eb8a52 6176{
6a70badb
RS
6177 HOST_WIDE_INT size;
6178 if (!poly_size.is_constant (&size))
6179 {
6180 sorry ("stack probes for SVE frames");
6181 return;
6182 }
6183
5f5c5e0f 6184 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
a3eb8a52
EB
6185
6186 /* See the same assertion on PROBE_INTERVAL above. */
6187 gcc_assert ((first % ARITH_FACTOR) == 0);
6188
6189 /* See if we have a constant small number of probes to generate. If so,
6190 that's the easy case. */
6191 if (size <= PROBE_INTERVAL)
6192 {
6193 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6194
6195 emit_set_insn (reg1,
5f5c5e0f 6196 plus_constant (Pmode,
a3eb8a52 6197 stack_pointer_rtx, -(first + base)));
5f5c5e0f 6198 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
a3eb8a52
EB
6199 }
6200
6201 /* The run-time loop is made up of 8 insns in the generic case while the
6202 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
6203 else if (size <= 4 * PROBE_INTERVAL)
6204 {
6205 HOST_WIDE_INT i, rem;
6206
6207 emit_set_insn (reg1,
5f5c5e0f 6208 plus_constant (Pmode,
a3eb8a52
EB
6209 stack_pointer_rtx,
6210 -(first + PROBE_INTERVAL)));
6211 emit_stack_probe (reg1);
6212
6213 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6214 it exceeds SIZE. If only two probes are needed, this will not
6215 generate any code. Then probe at FIRST + SIZE. */
6216 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6217 {
6218 emit_set_insn (reg1,
5f5c5e0f 6219 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
a3eb8a52
EB
6220 emit_stack_probe (reg1);
6221 }
6222
6223 rem = size - (i - PROBE_INTERVAL);
6224 if (rem > 256)
6225 {
6226 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6227
5f5c5e0f
EB
6228 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6229 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
a3eb8a52
EB
6230 }
6231 else
5f5c5e0f 6232 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
a3eb8a52
EB
6233 }
6234
6235 /* Otherwise, do the same as above, but in a loop. Note that we must be
6236 extra careful with variables wrapping around because we might be at
6237 the very top (or the very bottom) of the address space and we have
6238 to be able to handle this case properly; in particular, we use an
6239 equality test for the loop condition. */
6240 else
6241 {
5f5c5e0f 6242 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
a3eb8a52
EB
6243
6244 /* Step 1: round SIZE to the previous multiple of the interval. */
6245
6246 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6247
6248
6249 /* Step 2: compute initial and final value of the loop counter. */
6250
6251 /* TEST_ADDR = SP + FIRST. */
6252 emit_set_insn (reg1,
5f5c5e0f 6253 plus_constant (Pmode, stack_pointer_rtx, -first));
a3eb8a52
EB
6254
6255 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
13f752b2
JL
6256 HOST_WIDE_INT adjustment = - (first + rounded_size);
6257 if (! aarch64_uimm12_shift (adjustment))
6258 {
6259 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6260 true, Pmode);
6261 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6262 }
6263 else
8dd64cdf
EB
6264 emit_set_insn (reg2,
6265 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6266
a3eb8a52
EB
6267 /* Step 3: the loop
6268
6269 do
6270 {
6271 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6272 probe at TEST_ADDR
6273 }
6274 while (TEST_ADDR != LAST_ADDR)
6275
6276 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6277 until it is equal to ROUNDED_SIZE. */
6278
5f5c5e0f 6279 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
a3eb8a52
EB
6280
6281
6282 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6283 that SIZE is equal to ROUNDED_SIZE. */
6284
6285 if (size != rounded_size)
6286 {
6287 HOST_WIDE_INT rem = size - rounded_size;
6288
6289 if (rem > 256)
6290 {
6291 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6292
5f5c5e0f
EB
6293 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6294 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
a3eb8a52
EB
6295 }
6296 else
5f5c5e0f 6297 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
a3eb8a52
EB
6298 }
6299 }
6300
6301 /* Make sure nothing is scheduled before we are done. */
6302 emit_insn (gen_blockage ());
6303}
6304
6305/* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6306 absolute addresses. */
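/* Roughly speaking (an illustrative sketch, not verbatim output), the
   loop emitted below has the shape

     .LPSRL0:
       sub  reg1, reg1, #interval
       str  xzr, [reg1, #offset]
       cmp  reg1, reg2
       b.ne .LPSRL0

   where the probe offset is 0 for ordinary -fstack-check probing and
   STACK_CLASH_CALLER_GUARD when stack clash protection is enabled. */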
6307
6308const char *
6309aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6310{
6311 static int labelno = 0;
6312 char loop_lab[32];
6313 rtx xops[2];
6314
6315 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6316
6317 /* Loop. */
6318 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6319
cd1bef27 6320 HOST_WIDE_INT stack_clash_probe_interval
028d4092 6321 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 6322
a3eb8a52
EB
6323 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6324 xops[0] = reg1;
cd1bef27
JL
6325 HOST_WIDE_INT interval;
6326 if (flag_stack_clash_protection)
6327 interval = stack_clash_probe_interval;
6328 else
6329 interval = PROBE_INTERVAL;
6330
6331 gcc_assert (aarch64_uimm12_shift (interval));
6332 xops[1] = GEN_INT (interval);
6333
a3eb8a52
EB
6334 output_asm_insn ("sub\t%0, %0, %1", xops);
6335
cd1bef27
JL
6336 /* If doing stack clash protection then we probe up by the ABI specified
6337 amount. We do this because we're dropping full pages at a time in the
6338 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6339 if (flag_stack_clash_protection)
6340 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6341 else
6342 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6343
6344 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6345 by this amount for each iteration. */
6346 output_asm_insn ("str\txzr, [%0, %1]", xops);
a3eb8a52
EB
6347
6348 /* Test if TEST_ADDR == LAST_ADDR. */
6349 xops[1] = reg2;
6350 output_asm_insn ("cmp\t%0, %1", xops);
6351
6352 /* Branch. */
6353 fputs ("\tb.ne\t", asm_out_file);
6354 assemble_name_raw (asm_out_file, loop_lab);
6355 fputc ('\n', asm_out_file);
6356
6357 return "";
6358}
6359
eb471ba3
TC
6360/* Emit the probe loop for doing stack clash probes and stack adjustments for
6361 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6362 of GUARD_SIZE. When a probe is emitted it is done at most
6363 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6364 at most MIN_PROBE_THRESHOLD. By the end of this function
6365 BASE = BASE - ADJUSTMENT. */
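/* Roughly speaking (an illustrative sketch, not verbatim output), the
   sequence emitted below has the shape

     .SVLPSPL0:
       cmp  adjustment, #residual_probe_guard
       b.lt .SVLPEND0
       sub  base, base, #residual_probe_guard
       str  xzr, [base, 0]
       sub  adjustment, adjustment, #residual_probe_guard
       b    .SVLPSPL0
     .SVLPEND0:
       sub  base, base, adjustment  */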
6366
6367const char *
6368aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6369 rtx min_probe_threshold, rtx guard_size)
6370{
6371 /* This function is not allowed to use any instruction generation function
6372 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6373 so instead emit the code you want using output_asm_insn. */
6374 gcc_assert (flag_stack_clash_protection);
6375 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6376 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6377
6378 /* The minimum required allocation before the residual requires probing. */
6379 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6380
6381 /* Clamp the value down to the nearest value that can be used with a cmp. */
6382 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6383 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6384
6385 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6386 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6387
6388 static int labelno = 0;
6389 char loop_start_lab[32];
6390 char loop_end_lab[32];
6391 rtx xops[2];
6392
6393 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6394 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6395
6396 /* Emit loop start label. */
6397 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6398
6399 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6400 xops[0] = adjustment;
6401 xops[1] = probe_offset_value_rtx;
6402 output_asm_insn ("cmp\t%0, %1", xops);
6403
6404 /* Branch to end if not enough adjustment to probe. */
6405 fputs ("\tb.lt\t", asm_out_file);
6406 assemble_name_raw (asm_out_file, loop_end_lab);
6407 fputc ('\n', asm_out_file);
6408
6409 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6410 xops[0] = base;
6411 xops[1] = probe_offset_value_rtx;
6412 output_asm_insn ("sub\t%0, %0, %1", xops);
6413
6414 /* Probe at BASE. */
6415 xops[1] = const0_rtx;
6416 output_asm_insn ("str\txzr, [%0, %1]", xops);
6417
6418 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6419 xops[0] = adjustment;
6420 xops[1] = probe_offset_value_rtx;
6421 output_asm_insn ("sub\t%0, %0, %1", xops);
6422
6423 /* Branch to start if still more bytes to allocate. */
6424 fputs ("\tb\t", asm_out_file);
6425 assemble_name_raw (asm_out_file, loop_start_lab);
6426 fputc ('\n', asm_out_file);
6427
6429 /* Leave the loop without emitting a final probe. */
6429 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6430
6431 /* BASE = BASE - ADJUSTMENT. */
6432 xops[0] = base;
6433 xops[1] = adjustment;
6434 output_asm_insn ("sub\t%0, %0, %1", xops);
6435 return "";
6436}
6437
d6cb6d6a
WD
6438/* Determine whether a frame chain needs to be generated. */
6439static bool
6440aarch64_needs_frame_chain (void)
6441{
6442 /* Force a frame chain for EH returns so the return address is at FP+8. */
6443 if (frame_pointer_needed || crtl->calls_eh_return)
6444 return true;
6445
6446 /* A leaf function cannot have calls or write LR. */
6447 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6448
6449 /* Don't use a frame chain in leaf functions if leaf frame pointers
6450 are disabled. */
6451 if (flag_omit_leaf_frame_pointer && is_leaf)
6452 return false;
6453
6454 return aarch64_use_frame_pointer;
6455}
6456
43e9d192
IB
6457/* Mark the registers that need to be saved by the callee and calculate
6458 the size of the callee-saved registers area and frame record (both FP
33a2e348 6459 and LR may be omitted). */
43e9d192
IB
6460static void
6461aarch64_layout_frame (void)
6462{
c600df9a 6463 poly_int64 offset = 0;
4b0685d9 6464 int regno, last_fp_reg = INVALID_REGNUM;
c600df9a
RS
6465 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6466 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6467 bool frame_related_fp_reg_p = false;
ab43763e 6468 aarch64_frame &frame = cfun->machine->frame;
43e9d192 6469
ab43763e 6470 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7040939b 6471
8c6e3b23
TC
6472 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6473 the mid-end is doing. */
6474 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6475
97826595
MS
6476#define SLOT_NOT_REQUIRED (-2)
6477#define SLOT_REQUIRED (-1)
6478
ab43763e
RS
6479 frame.wb_candidate1 = INVALID_REGNUM;
6480 frame.wb_candidate2 = INVALID_REGNUM;
c600df9a 6481 frame.spare_pred_reg = INVALID_REGNUM;
363ffa50 6482
43e9d192 6483 /* First mark all the registers that really need to be saved... */
c600df9a 6484 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
ab43763e 6485 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
43e9d192
IB
6486
6487 /* ... that includes the eh data registers (if needed)... */
6488 if (crtl->calls_eh_return)
6489 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
ab43763e 6490 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
43e9d192
IB
6491
6492 /* ... and any callee saved register that dataflow says is live. */
6493 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6494 if (df_regs_ever_live_p (regno)
dcdd0f05 6495 && !fixed_regs[regno]
1c923b60 6496 && (regno == R30_REGNUM
dcdd0f05 6497 || !crtl->abi->clobbers_full_reg_p (regno)))
ab43763e 6498 frame.reg_offset[regno] = SLOT_REQUIRED;
43e9d192
IB
6499
6500 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6501 if (df_regs_ever_live_p (regno)
dcdd0f05
RS
6502 && !fixed_regs[regno]
6503 && !crtl->abi->clobbers_full_reg_p (regno))
4b0685d9 6504 {
ab43763e 6505 frame.reg_offset[regno] = SLOT_REQUIRED;
4b0685d9 6506 last_fp_reg = regno;
c600df9a
RS
6507 if (aarch64_emit_cfi_for_reg_p (regno))
6508 frame_related_fp_reg_p = true;
4b0685d9 6509 }
43e9d192 6510
c600df9a
RS
6511 /* Big-endian SVE frames need a spare predicate register in order
6512 to save Z8-Z15. Decide which register they should use. Prefer
6513 an unused argument register if possible, so that we don't force P4
6514 to be saved unnecessarily. */
6515 if (frame_related_fp_reg_p
6516 && crtl->abi->id () == ARM_PCS_SVE
6517 && BYTES_BIG_ENDIAN)
6518 {
6519 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6520 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6521 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6522 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6523 break;
6524 gcc_assert (regno <= P7_REGNUM);
6525 frame.spare_pred_reg = regno;
6526 df_set_regs_ever_live (regno, true);
6527 }
6528
6529 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6530 if (df_regs_ever_live_p (regno)
6531 && !fixed_regs[regno]
6532 && !crtl->abi->clobbers_full_reg_p (regno))
6533 frame.reg_offset[regno] = SLOT_REQUIRED;
6534
d6430e3c
TC
6535 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
6536 LR counts as an implicit probe which allows us to maintain the invariant
6537 described in the comment at expand_prologue. */
c600df9a
RS
6538 gcc_assert (crtl->is_leaf
6539 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6540
6541 /* Now assign stack slots for the registers. Start with the predicate
6542 registers, since predicate LDR and STR have a relatively small
6543 offset range. These saves happen below the hard frame pointer. */
6544 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6545 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6546 {
6547 frame.reg_offset[regno] = offset;
6548 offset += BYTES_PER_SVE_PRED;
6549 }
6550
c600df9a
RS
6551 if (maybe_ne (offset, 0))
6552 {
cb26919c
RS
6553 /* If we have any vector registers to save above the predicate registers,
 6554 the offset of the vector register save slots needs to be a multiple
6555 of the vector size. This lets us use the immediate forms of LDR/STR
6556 (or LD1/ST1 for big-endian).
6557
6558 A vector register is 8 times the size of a predicate register,
6559 and we need to save a maximum of 12 predicate registers, so the
6560 first vector register will be at either #1, MUL VL or #2, MUL VL.
6561
6562 If we don't have any vector registers to save, and we know how
6563 big the predicate save area is, we can just round it up to the
6564 next 16-byte boundary. */
6565 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6566 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6567 else
6568 {
6569 if (known_le (offset, vector_save_size))
6570 offset = vector_save_size;
6571 else if (known_le (offset, vector_save_size * 2))
6572 offset = vector_save_size * 2;
6573 else
6574 gcc_unreachable ();
6575 }
c600df9a
RS
6576 }
6577
6578 /* If we need to save any SVE vector registers, add them next. */
6579 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6580 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6581 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6582 {
6583 frame.reg_offset[regno] = offset;
6584 offset += vector_save_size;
6585 }
6586
6587 /* OFFSET is now the offset of the hard frame pointer from the bottom
6588 of the callee save area. */
6589 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6590 frame.below_hard_fp_saved_regs_size = offset;
ab43763e 6591 if (frame.emit_frame_chain)
43e9d192 6592 {
2e1cdae5 6593 /* FP and LR are placed in the linkage record. */
c600df9a 6594 frame.reg_offset[R29_REGNUM] = offset;
ab43763e 6595 frame.wb_candidate1 = R29_REGNUM;
c600df9a 6596 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
ab43763e 6597 frame.wb_candidate2 = R30_REGNUM;
c600df9a 6598 offset += 2 * UNITS_PER_WORD;
1f7bffd0 6599 }
43e9d192 6600
2e1cdae5 6601 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
c600df9a 6602 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 6603 {
ab43763e
RS
6604 frame.reg_offset[regno] = offset;
6605 if (frame.wb_candidate1 == INVALID_REGNUM)
6606 frame.wb_candidate1 = regno;
6607 else if (frame.wb_candidate2 == INVALID_REGNUM)
6608 frame.wb_candidate2 = regno;
43e9d192
IB
6609 offset += UNITS_PER_WORD;
6610 }
6611
c600df9a
RS
6612 poly_int64 max_int_offset = offset;
6613 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6614 bool has_align_gap = maybe_ne (offset, max_int_offset);
4b0685d9 6615
43e9d192 6616 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
c600df9a 6617 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
43e9d192 6618 {
4b0685d9
WD
6619 /* If there is an alignment gap between integer and fp callee-saves,
6620 allocate the last fp register to it if possible. */
a0d0b980
SE
6621 if (regno == last_fp_reg
6622 && has_align_gap
c600df9a
RS
6623 && known_eq (vector_save_size, 8)
6624 && multiple_p (offset, 16))
4b0685d9 6625 {
ab43763e 6626 frame.reg_offset[regno] = max_int_offset;
4b0685d9
WD
6627 break;
6628 }
6629
ab43763e
RS
6630 frame.reg_offset[regno] = offset;
6631 if (frame.wb_candidate1 == INVALID_REGNUM)
6632 frame.wb_candidate1 = regno;
6633 else if (frame.wb_candidate2 == INVALID_REGNUM
6634 && frame.wb_candidate1 >= V0_REGNUM)
6635 frame.wb_candidate2 = regno;
c600df9a 6636 offset += vector_save_size;
43e9d192
IB
6637 }
6638
c600df9a 6639 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192 6640
ab43763e 6641 frame.saved_regs_size = offset;
1c960e02 6642
c600df9a 6643 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
71bfb77a 6644
c600df9a 6645 poly_int64 above_outgoing_args
6a70badb
RS
6646 = aligned_upper_bound (varargs_and_saved_regs_size
6647 + get_frame_size (),
6648 STACK_BOUNDARY / BITS_PER_UNIT);
1c960e02 6649
c600df9a
RS
6650 frame.hard_fp_offset
6651 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6652
6a70badb
RS
6653 /* Both these values are already aligned. */
6654 gcc_assert (multiple_p (crtl->outgoing_args_size,
6655 STACK_BOUNDARY / BITS_PER_UNIT));
c600df9a 6656 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
1c960e02 6657
ab43763e 6658 frame.locals_offset = frame.saved_varargs_size;
71bfb77a 6659
ab43763e
RS
6660 frame.initial_adjust = 0;
6661 frame.final_adjust = 0;
6662 frame.callee_adjust = 0;
c600df9a 6663 frame.sve_callee_adjust = 0;
ab43763e 6664 frame.callee_offset = 0;
71bfb77a
WD
6665
6666 HOST_WIDE_INT max_push_offset = 0;
ab43763e 6667 if (frame.wb_candidate2 != INVALID_REGNUM)
71bfb77a 6668 max_push_offset = 512;
ab43763e 6669 else if (frame.wb_candidate1 != INVALID_REGNUM)
71bfb77a
WD
6670 max_push_offset = 256;
6671
9b17a646 6672 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
c600df9a 6673 HOST_WIDE_INT const_saved_regs_size;
ab43763e 6674 if (frame.frame_size.is_constant (&const_size)
6a70badb 6675 && const_size < max_push_offset
c600df9a 6676 && known_eq (frame.hard_fp_offset, const_size))
71bfb77a
WD
6677 {
6678 /* Simple, small frame with no outgoing arguments:
c600df9a 6679
71bfb77a
WD
6680 stp reg1, reg2, [sp, -frame_size]!
6681 stp reg3, reg4, [sp, 16] */
ab43763e 6682 frame.callee_adjust = const_size;
71bfb77a 6683 }
9b17a646 6684 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
c600df9a
RS
6685 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6686 && const_outgoing_args_size + const_saved_regs_size < 512
6687 /* We could handle this case even with outgoing args, provided
6688 that the number of args left us with valid offsets for all
6689 predicate and vector save slots. It's such a rare case that
6690 it hardly seems worth the effort though. */
6691 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
71bfb77a 6692 && !(cfun->calls_alloca
9b17a646
RS
6693 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6694 && const_fp_offset < max_push_offset))
71bfb77a
WD
6695 {
6696 /* Frame with small outgoing arguments:
c600df9a 6697
71bfb77a
WD
6698 sub sp, sp, frame_size
6699 stp reg1, reg2, [sp, outgoing_args_size]
6700 stp reg3, reg4, [sp, outgoing_args_size + 16] */
ab43763e 6701 frame.initial_adjust = frame.frame_size;
9b17a646 6702 frame.callee_offset = const_outgoing_args_size;
71bfb77a 6703 }
c600df9a
RS
6704 else if (saves_below_hard_fp_p
6705 && known_eq (frame.saved_regs_size,
6706 frame.below_hard_fp_saved_regs_size))
6707 {
6708 /* Frame in which all saves are SVE saves:
6709
6710 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6711 save SVE registers relative to SP
6712 sub sp, sp, outgoing_args_size */
6713 frame.initial_adjust = (frame.hard_fp_offset
6714 + frame.below_hard_fp_saved_regs_size);
6715 frame.final_adjust = crtl->outgoing_args_size;
6716 }
ab43763e 6717 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6a70badb 6718 && const_fp_offset < max_push_offset)
71bfb77a 6719 {
c600df9a
RS
6720 /* Frame with large outgoing arguments or SVE saves, but with
6721 a small local area:
6722
71bfb77a
WD
6723 stp reg1, reg2, [sp, -hard_fp_offset]!
6724 stp reg3, reg4, [sp, 16]
c600df9a
RS
6725 [sub sp, sp, below_hard_fp_saved_regs_size]
6726 [save SVE registers relative to SP]
71bfb77a 6727 sub sp, sp, outgoing_args_size */
ab43763e 6728 frame.callee_adjust = const_fp_offset;
c600df9a 6729 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 6730 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a 6731 }
71bfb77a
WD
6732 else
6733 {
c600df9a
RS
6734 /* Frame with large local area and outgoing arguments or SVE saves,
6735 using frame pointer:
6736
71bfb77a
WD
6737 sub sp, sp, hard_fp_offset
6738 stp x29, x30, [sp, 0]
6739 add x29, sp, 0
6740 stp reg3, reg4, [sp, 16]
c600df9a
RS
6741 [sub sp, sp, below_hard_fp_saved_regs_size]
6742 [save SVE registers relative to SP]
71bfb77a 6743 sub sp, sp, outgoing_args_size */
ab43763e 6744 frame.initial_adjust = frame.hard_fp_offset;
c600df9a 6745 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8e66b377 6746 frame.final_adjust = crtl->outgoing_args_size;
71bfb77a
WD
6747 }
6748
8e66b377
RS
6749 /* Make sure the individual adjustments add up to the full frame size. */
6750 gcc_assert (known_eq (frame.initial_adjust
6751 + frame.callee_adjust
c600df9a 6752 + frame.sve_callee_adjust
8e66b377
RS
6753 + frame.final_adjust, frame.frame_size));
6754
59a3d73d
RS
6755 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6756 {
6757 /* We've decided not to associate any register saves with the initial
6758 stack allocation. */
6759 frame.wb_candidate1 = INVALID_REGNUM;
6760 frame.wb_candidate2 = INVALID_REGNUM;
6761 }
6762
ab43763e 6763 frame.laid_out = true;
43e9d192
IB
6764}
6765
04ddfe06
KT
6766/* Return true if the register REGNO is saved on entry to
6767 the current function. */
6768
43e9d192
IB
6769static bool
6770aarch64_register_saved_on_entry (int regno)
6771{
c600df9a 6772 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
43e9d192
IB
6773}
6774
04ddfe06
KT
6775/* Return the next register up from REGNO up to LIMIT for the callee
6776 to save. */
6777
64dedd72
JW
6778static unsigned
6779aarch64_next_callee_save (unsigned regno, unsigned limit)
6780{
6781 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6782 regno ++;
6783 return regno;
6784}
43e9d192 6785
04ddfe06
KT
6786/* Push the register number REGNO of mode MODE to the stack with write-back
6787 adjusting the stack by ADJUSTMENT. */
6788
c5e1f66e 6789static void
ef4bddc2 6790aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
c5e1f66e
JW
6791 HOST_WIDE_INT adjustment)
6792 {
6793 rtx base_rtx = stack_pointer_rtx;
6794 rtx insn, reg, mem;
6795
6796 reg = gen_rtx_REG (mode, regno);
6797 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6798 plus_constant (Pmode, base_rtx, -adjustment));
30079dde 6799 mem = gen_frame_mem (mode, mem);
c5e1f66e
JW
6800
6801 insn = emit_move_insn (mem, reg);
6802 RTX_FRAME_RELATED_P (insn) = 1;
6803}
6804
04ddfe06
KT
6805/* Generate and return an instruction to store the pair of registers
6806 REG and REG2 of mode MODE to location BASE with write-back adjusting
6807 the stack location BASE by ADJUSTMENT. */
6808
80c11907 6809static rtx
ef4bddc2 6810aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
80c11907
JW
6811 HOST_WIDE_INT adjustment)
6812{
6813 switch (mode)
6814 {
4e10a5a7 6815 case E_DImode:
80c11907
JW
6816 return gen_storewb_pairdi_di (base, base, reg, reg2,
6817 GEN_INT (-adjustment),
6818 GEN_INT (UNITS_PER_WORD - adjustment));
4e10a5a7 6819 case E_DFmode:
80c11907
JW
6820 return gen_storewb_pairdf_di (base, base, reg, reg2,
6821 GEN_INT (-adjustment),
6822 GEN_INT (UNITS_PER_WORD - adjustment));
a0d0b980
SE
6823 case E_TFmode:
6824 return gen_storewb_pairtf_di (base, base, reg, reg2,
6825 GEN_INT (-adjustment),
6826 GEN_INT (UNITS_PER_VREG - adjustment));
80c11907
JW
6827 default:
6828 gcc_unreachable ();
6829 }
6830}
6831
04ddfe06
KT
6832/* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6833 stack pointer by ADJUSTMENT. */
6834
80c11907 6835static void
89ac681e 6836aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
80c11907 6837{
5d8a22a5 6838 rtx_insn *insn;
c600df9a 6839 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e 6840
71bfb77a 6841 if (regno2 == INVALID_REGNUM)
89ac681e
WD
6842 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6843
80c11907
JW
6844 rtx reg1 = gen_rtx_REG (mode, regno1);
6845 rtx reg2 = gen_rtx_REG (mode, regno2);
6846
6847 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6848 reg2, adjustment));
6849 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
80c11907
JW
6850 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6851 RTX_FRAME_RELATED_P (insn) = 1;
6852}
6853
04ddfe06
KT
 6854/* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6855 adjusting it by ADJUSTMENT afterwards. */
6856
159313d9 6857static rtx
ef4bddc2 6858aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
159313d9
JW
6859 HOST_WIDE_INT adjustment)
6860{
6861 switch (mode)
6862 {
4e10a5a7 6863 case E_DImode:
159313d9 6864 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 6865 GEN_INT (UNITS_PER_WORD));
4e10a5a7 6866 case E_DFmode:
159313d9 6867 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3e322b3f 6868 GEN_INT (UNITS_PER_WORD));
a0d0b980
SE
6869 case E_TFmode:
6870 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6871 GEN_INT (UNITS_PER_VREG));
159313d9
JW
6872 default:
6873 gcc_unreachable ();
6874 }
6875}
6876
04ddfe06
KT
6877/* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6878 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6879 into CFI_OPS. */
6880
89ac681e
WD
6881static void
6882aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6883 rtx *cfi_ops)
6884{
c600df9a 6885 machine_mode mode = aarch64_reg_save_mode (regno1);
89ac681e
WD
6886 rtx reg1 = gen_rtx_REG (mode, regno1);
6887
6888 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6889
71bfb77a 6890 if (regno2 == INVALID_REGNUM)
89ac681e
WD
6891 {
6892 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6893 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
30079dde 6894 emit_move_insn (reg1, gen_frame_mem (mode, mem));
89ac681e
WD
6895 }
6896 else
6897 {
6898 rtx reg2 = gen_rtx_REG (mode, regno2);
6899 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6900 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6901 reg2, adjustment));
6902 }
6903}
6904
04ddfe06
KT
6905/* Generate and return a store pair instruction of mode MODE to store
6906 register REG1 to MEM1 and register REG2 to MEM2. */
6907
72df5c1f 6908static rtx
ef4bddc2 6909aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
72df5c1f
JW
6910 rtx reg2)
6911{
6912 switch (mode)
6913 {
4e10a5a7 6914 case E_DImode:
dfe1da23 6915 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
72df5c1f 6916
4e10a5a7 6917 case E_DFmode:
dfe1da23 6918 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
72df5c1f 6919
a0d0b980
SE
6920 case E_TFmode:
6921 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6922
72df5c1f
JW
6923 default:
6924 gcc_unreachable ();
6925 }
6926}
6927
04ddfe06
KT
 6928/* Generate and return a load pair instruction of mode MODE to load register
6929 REG1 from MEM1 and register REG2 from MEM2. */
6930
72df5c1f 6931static rtx
ef4bddc2 6932aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
72df5c1f
JW
6933 rtx mem2)
6934{
6935 switch (mode)
6936 {
4e10a5a7 6937 case E_DImode:
dfe1da23 6938 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
72df5c1f 6939
4e10a5a7 6940 case E_DFmode:
dfe1da23 6941 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
72df5c1f 6942
a0d0b980
SE
6943 case E_TFmode:
6944 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6945
72df5c1f
JW
6946 default:
6947 gcc_unreachable ();
6948 }
6949}
6950
db58fd89
JW
6951/* Return TRUE if return address signing should be enabled for the current
6952 function, otherwise return FALSE. */
6953
6954bool
6955aarch64_return_address_signing_enabled (void)
6956{
 6957 /* This function should only be called after the frame is laid out. */
6958 gcc_assert (cfun->machine->frame.laid_out);
6959
2bc95be3
SN
6960 /* Turn return address signing off in any function that uses
6961 __builtin_eh_return. The address passed to __builtin_eh_return
6962 is not signed so either it has to be signed (with original sp)
6963 or the code path that uses it has to avoid authenticating it.
 6964 Currently eh return introduces a return-to-anywhere gadget, no
 6965 matter what we do here, since it uses ret with a user-provided
 6966 address. An ideal fix for that is to use an indirect branch which
6967 can be protected with BTI j (to some extent). */
6968 if (crtl->calls_eh_return)
6969 return false;
6970
db58fd89 6971 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8fc16d72 6972 if its LR is pushed onto the stack. */
db58fd89
JW
6973 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6974 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
c600df9a 6975 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
db58fd89
JW
6976}
6977
30afdf34
SD
6978/* Return TRUE if Branch Target Identification Mechanism is enabled. */
6979bool
6980aarch64_bti_enabled (void)
6981{
6982 return (aarch64_enable_bti == 1);
6983}
6984
c600df9a
RS
6985/* The caller is going to use ST1D or LD1D to save or restore an SVE
6986 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6987 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6988
6989 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6990 or LD1D address
6991
 6992 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
6993 if the variable isn't already nonnull
6994
6995 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6996 Handle this case using a temporary base register that is suitable for
6997 all offsets in that range. Use ANCHOR_REG as this base register if it
6998 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
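/* Worked example (illustrative figures; the addressing mode is assumed to
   accept immediates that are multiples of the transfer size in roughly the
   range [-8, 7], matching offset_4bit_signed_scaled_p below): if OFFSET is
   12 * GET_MODE_SIZE (MODE), it is out of direct reach, so we set
   ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE) and rewrite the access
   as ANCHOR_REG - 4 * GET_MODE_SIZE (MODE), which is representable.  */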
6999
7000static inline void
7001aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7002 rtx &anchor_reg, poly_int64 &offset,
7003 rtx &ptrue)
7004{
7005 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7006 {
7007 /* This is the maximum valid offset of the anchor from the base.
7008 Lower values would be valid too. */
7009 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7010 if (!anchor_reg)
7011 {
7012 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7013 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7014 gen_int_mode (anchor_offset, Pmode)));
7015 }
7016 base_rtx = anchor_reg;
7017 offset -= anchor_offset;
7018 }
7019 if (!ptrue)
7020 {
7021 int pred_reg = cfun->machine->frame.spare_pred_reg;
7022 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7023 CONSTM1_RTX (VNx16BImode));
7024 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7025 }
7026}
7027
7028/* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7029 is saved at BASE + OFFSET. */
7030
7031static void
7032aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7033 rtx base, poly_int64 offset)
7034{
7035 rtx mem = gen_frame_mem (GET_MODE (reg),
7036 plus_constant (Pmode, base, offset));
7037 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7038}
7039
04ddfe06
KT
7040/* Emit code to save the callee-saved registers from register number START
7041 to LIMIT to the stack at the location starting at offset START_OFFSET,
c600df9a
RS
7042 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7043 is true if the hard frame pointer has been set up. */
43e9d192 7044
43e9d192 7045static void
c600df9a
RS
7046aarch64_save_callee_saves (poly_int64 start_offset,
7047 unsigned start, unsigned limit, bool skip_wb,
7048 bool hard_fp_valid_p)
43e9d192 7049{
5d8a22a5 7050 rtx_insn *insn;
43e9d192
IB
7051 unsigned regno;
7052 unsigned regno2;
c600df9a 7053 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
43e9d192 7054
0ec74a1e 7055 for (regno = aarch64_next_callee_save (start, limit);
64dedd72
JW
7056 regno <= limit;
7057 regno = aarch64_next_callee_save (regno + 1, limit))
43e9d192 7058 {
ae13fce3 7059 rtx reg, mem;
6a70badb 7060 poly_int64 offset;
c600df9a 7061 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
64dedd72 7062
ae13fce3
JW
7063 if (skip_wb
7064 && (regno == cfun->machine->frame.wb_candidate1
7065 || regno == cfun->machine->frame.wb_candidate2))
7066 continue;
7067
827ab47a 7068 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 7069 continue;
827ab47a 7070
c600df9a 7071 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3
JW
7072 reg = gen_rtx_REG (mode, regno);
7073 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7074 rtx base_rtx = stack_pointer_rtx;
7075 poly_int64 sp_offset = offset;
64dedd72 7076
c600df9a
RS
7077 HOST_WIDE_INT const_offset;
7078 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7079 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7080 offset, ptrue);
7081 else if (GP_REGNUM_P (regno)
7082 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7083 {
7084 gcc_assert (known_eq (start_offset, 0));
7085 poly_int64 fp_offset
7086 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7087 if (hard_fp_valid_p)
7088 base_rtx = hard_frame_pointer_rtx;
7089 else
7090 {
7091 if (!anchor_reg)
7092 {
7093 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7094 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7095 gen_int_mode (fp_offset, Pmode)));
7096 }
7097 base_rtx = anchor_reg;
7098 }
7099 offset -= fp_offset;
7100 }
7101 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7102 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
64dedd72 7103
c600df9a
RS
7104 if (!aarch64_sve_mode_p (mode)
7105 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 7106 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
7107 && known_eq (GET_MODE_SIZE (mode),
7108 cfun->machine->frame.reg_offset[regno2]
7109 - cfun->machine->frame.reg_offset[regno]))
43e9d192 7110 {
0ec74a1e 7111 rtx reg2 = gen_rtx_REG (mode, regno2);
64dedd72
JW
7112 rtx mem2;
7113
c600df9a
RS
7114 offset += GET_MODE_SIZE (mode);
7115 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62
JW
7116 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7117 reg2));
0b4a9743 7118
64dedd72
JW
7119 /* The first part of a frame-related parallel insn is
7120 always assumed to be relevant to the frame
7121 calculations; subsequent parts, are only
7122 frame-related if explicitly marked. */
c600df9a
RS
7123 if (aarch64_emit_cfi_for_reg_p (regno2))
7124 {
7125 if (need_cfa_note_p)
7126 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7127 sp_offset + GET_MODE_SIZE (mode));
7128 else
7129 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7130 }
7131
64dedd72
JW
7132 regno = regno2;
7133 }
c600df9a
RS
7134 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7135 {
7136 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7137 need_cfa_note_p = true;
7138 }
7139 else if (aarch64_sve_mode_p (mode))
7140 insn = emit_insn (gen_rtx_SET (mem, reg));
64dedd72 7141 else
8ed2fc62
JW
7142 insn = emit_move_insn (mem, reg);
7143
c600df9a
RS
7144 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7145 if (frame_related_p && need_cfa_note_p)
7146 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8ed2fc62
JW
7147 }
7148}
7149
c600df9a
RS
7150/* Emit code to restore the callee registers from register number START
7151 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7152 skipping any write-back candidates if SKIP_WB is true. Write the
7153 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
04ddfe06 7154
8ed2fc62 7155static void
c600df9a 7156aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
dd991abb 7157 unsigned limit, bool skip_wb, rtx *cfi_ops)
8ed2fc62 7158{
8ed2fc62
JW
7159 unsigned regno;
7160 unsigned regno2;
6a70badb 7161 poly_int64 offset;
c600df9a 7162 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8ed2fc62
JW
7163
7164 for (regno = aarch64_next_callee_save (start, limit);
7165 regno <= limit;
7166 regno = aarch64_next_callee_save (regno + 1, limit))
7167 {
c600df9a 7168 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
827ab47a 7169 if (cfun->machine->reg_is_wrapped_separately[regno])
c600df9a 7170 continue;
827ab47a 7171
ae13fce3 7172 rtx reg, mem;
8ed2fc62 7173
ae13fce3
JW
7174 if (skip_wb
7175 && (regno == cfun->machine->frame.wb_candidate1
7176 || regno == cfun->machine->frame.wb_candidate2))
7177 continue;
7178
c600df9a 7179 machine_mode mode = aarch64_reg_save_mode (regno);
ae13fce3 7180 reg = gen_rtx_REG (mode, regno);
8ed2fc62 7181 offset = start_offset + cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7182 rtx base_rtx = stack_pointer_rtx;
7183 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7184 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7185 offset, ptrue);
30079dde 7186 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8ed2fc62 7187
c600df9a
RS
7188 if (!aarch64_sve_mode_p (mode)
7189 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
827ab47a 7190 && !cfun->machine->reg_is_wrapped_separately[regno2]
c600df9a
RS
7191 && known_eq (GET_MODE_SIZE (mode),
7192 cfun->machine->frame.reg_offset[regno2]
7193 - cfun->machine->frame.reg_offset[regno]))
64dedd72 7194 {
8ed2fc62
JW
7195 rtx reg2 = gen_rtx_REG (mode, regno2);
7196 rtx mem2;
7197
c600df9a 7198 offset += GET_MODE_SIZE (mode);
30079dde 7199 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
dd991abb 7200 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8ed2fc62 7201
dd991abb 7202 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8ed2fc62 7203 regno = regno2;
43e9d192 7204 }
c600df9a
RS
7205 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7206 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7207 else if (aarch64_sve_mode_p (mode))
7208 emit_insn (gen_rtx_SET (reg, mem));
8ed2fc62 7209 else
dd991abb 7210 emit_move_insn (reg, mem);
c600df9a
RS
7211 if (frame_related_p)
7212 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
43e9d192 7213 }
43e9d192
IB
7214}
7215
43cacb12
RS
7216/* Return true if OFFSET is a signed 4-bit value multiplied by the size
7217 of MODE. */
7218
7219static inline bool
7220offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7221{
7222 HOST_WIDE_INT multiple;
7223 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7224 && IN_RANGE (multiple, -8, 7));
7225}
7226
 7227/* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7228 of MODE. */
7229
7230static inline bool
7231offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7232{
7233 HOST_WIDE_INT multiple;
7234 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7235 && IN_RANGE (multiple, 0, 63));
7236}
7237
7238/* Return true if OFFSET is a signed 7-bit value multiplied by the size
7239 of MODE. */
7240
7241bool
7242aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7243{
7244 HOST_WIDE_INT multiple;
7245 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7246 && IN_RANGE (multiple, -64, 63));
7247}
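/* A worked example for aarch64_offset_7bit_signed_scaled_p above
   (illustrative): for DImode, GET_MODE_SIZE is 8, so the accepted byte
   offsets run from -64 * 8 = -512 up to 63 * 8 = 504 in multiples of 8;
   an offset such as 260 is rejected because it is not a multiple of 8.  */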
7248
7249/* Return true if OFFSET is a signed 9-bit value. */
7250
3c5af608
MM
7251bool
7252aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7253 poly_int64 offset)
827ab47a 7254{
6a70badb
RS
7255 HOST_WIDE_INT const_offset;
7256 return (offset.is_constant (&const_offset)
7257 && IN_RANGE (const_offset, -256, 255));
827ab47a
KT
7258}
7259
43cacb12
RS
7260/* Return true if OFFSET is a signed 9-bit value multiplied by the size
7261 of MODE. */
7262
827ab47a 7263static inline bool
43cacb12 7264offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 7265{
6a70badb
RS
7266 HOST_WIDE_INT multiple;
7267 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 7268 && IN_RANGE (multiple, -256, 255));
827ab47a
KT
7269}
7270
43cacb12
RS
7271/* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7272 of MODE. */
7273
7274static inline bool
7275offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
827ab47a 7276{
6a70badb
RS
7277 HOST_WIDE_INT multiple;
7278 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
43cacb12 7279 && IN_RANGE (multiple, 0, 4095));
827ab47a
KT
7280}
7281
7282/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7283
7284static sbitmap
7285aarch64_get_separate_components (void)
7286{
827ab47a
KT
7287 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7288 bitmap_clear (components);
7289
7290 /* The registers we need saved to the frame. */
7291 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7292 if (aarch64_register_saved_on_entry (regno))
7293 {
c600df9a
RS
7294 /* Punt on saves and restores that use ST1D and LD1D. We could
7295 try to be smarter, but it would involve making sure that the
7296 spare predicate register itself is safe to use at the save
7297 and restore points. Also, when a frame pointer is being used,
7298 the slots are often out of reach of ST1D and LD1D anyway. */
7299 machine_mode mode = aarch64_reg_save_mode (regno);
7300 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7301 continue;
7302
6a70badb 7303 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7304
7305 /* If the register is saved in the first SVE save slot, we use
7306 it as a stack probe for -fstack-clash-protection. */
7307 if (flag_stack_clash_protection
7308 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7309 && known_eq (offset, 0))
7310 continue;
7311
7312 /* Get the offset relative to the register we'll use. */
7313 if (frame_pointer_needed)
7314 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7315 else
7316 offset += crtl->outgoing_args_size;
7317
827ab47a
KT
7318 /* Check that we can access the stack slot of the register with one
7319 direct load with no adjustments needed. */
c600df9a
RS
7320 if (aarch64_sve_mode_p (mode)
7321 ? offset_9bit_signed_scaled_p (mode, offset)
7322 : offset_12bit_unsigned_scaled_p (mode, offset))
827ab47a
KT
7323 bitmap_set_bit (components, regno);
7324 }
7325
7326 /* Don't mess with the hard frame pointer. */
7327 if (frame_pointer_needed)
7328 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7329
c600df9a
RS
7330 /* If the spare predicate register used by big-endian SVE code
7331 is call-preserved, it must be saved in the main prologue
7332 before any saves that use it. */
7333 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7334 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7335
827ab47a
KT
7336 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7337 unsigned reg2 = cfun->machine->frame.wb_candidate2;
0795f659 7338 /* If registers have been chosen to be stored/restored with
827ab47a
KT
7339 writeback don't interfere with them to avoid having to output explicit
7340 stack adjustment instructions. */
7341 if (reg2 != INVALID_REGNUM)
7342 bitmap_clear_bit (components, reg2);
7343 if (reg1 != INVALID_REGNUM)
7344 bitmap_clear_bit (components, reg1);
7345
7346 bitmap_clear_bit (components, LR_REGNUM);
7347 bitmap_clear_bit (components, SP_REGNUM);
7348
7349 return components;
7350}
7351
7352/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7353
7354static sbitmap
7355aarch64_components_for_bb (basic_block bb)
7356{
7357 bitmap in = DF_LIVE_IN (bb);
7358 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7359 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7360
7361 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7362 bitmap_clear (components);
7363
ce9d2a37
RS
7364 /* Clobbered registers don't generate values in any meaningful sense,
7365 since nothing after the clobber can rely on their value. And we can't
7366 say that partially-clobbered registers are unconditionally killed,
7367 because whether they're killed or not depends on the mode of the
7368 value they're holding. Thus partially call-clobbered registers
7369 appear in neither the kill set nor the gen set.
7370
7371 Check manually for any calls that clobber more of a register than the
7372 current function can. */
7373 function_abi_aggregator callee_abis;
7374 rtx_insn *insn;
7375 FOR_BB_INSNS (bb, insn)
7376 if (CALL_P (insn))
7377 callee_abis.note_callee_abi (insn_callee_abi (insn));
7378 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7379
827ab47a
KT
7380 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7381 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
dcdd0f05
RS
7382 if (!fixed_regs[regno]
7383 && !crtl->abi->clobbers_full_reg_p (regno)
ce9d2a37
RS
7384 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7385 || bitmap_bit_p (in, regno)
7386 || bitmap_bit_p (gen, regno)
7387 || bitmap_bit_p (kill, regno)))
3f26f054 7388 {
3f26f054
WD
7389 bitmap_set_bit (components, regno);
7390
7391 /* If there is a callee-save at an adjacent offset, add it too
7392 to increase the use of LDP/STP. */
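	 /* For instance (an illustrative example, not taken from real data):
	    if x24 is saved at reg_offset 32 and x25 at reg_offset 40, then
	    whenever x24 is added here we also add x25, since 32 is a multiple
	    of 16 and 40 == 32 + 8, so the two saves can later share one
	    STP/LDP.  */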
c600df9a
RS
7393 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7394 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
3f26f054
WD
7395
7396 if (regno2 <= LAST_SAVED_REGNUM)
7397 {
c600df9a
RS
7398 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7399 if (regno < regno2
7400 ? known_eq (offset + 8, offset2)
7401 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
3f26f054
WD
7402 bitmap_set_bit (components, regno2);
7403 }
7404 }
827ab47a
KT
7405
7406 return components;
7407}
7408
7409/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7410 Nothing to do for aarch64. */
7411
7412static void
7413aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7414{
7415}
7416
7417/* Return the next set bit in BMP from START onwards. Return the total number
7418 of bits in BMP if no set bit is found at or after START. */
7419
7420static unsigned int
7421aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7422{
7423 unsigned int nbits = SBITMAP_SIZE (bmp);
7424 if (start == nbits)
7425 return start;
7426
7427 gcc_assert (start < nbits);
7428 for (unsigned int i = start; i < nbits; i++)
7429 if (bitmap_bit_p (bmp, i))
7430 return i;
7431
7432 return nbits;
7433}
7434
7435/* Do the work for aarch64_emit_prologue_components and
7436 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7437 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7438 for these components or the epilogue sequence. That is, it determines
7439 whether we should emit stores or loads and what kind of CFA notes to attach
7440 to the insns. Otherwise the logic for the two sequences is very
7441 similar. */
7442
7443static void
7444aarch64_process_components (sbitmap components, bool prologue_p)
7445{
7446 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7447 ? HARD_FRAME_POINTER_REGNUM
7448 : STACK_POINTER_REGNUM);
7449
7450 unsigned last_regno = SBITMAP_SIZE (components);
7451 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7452 rtx_insn *insn = NULL;
7453
7454 while (regno != last_regno)
7455 {
c600df9a
RS
7456 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7457 machine_mode mode = aarch64_reg_save_mode (regno);
a0d0b980 7458
827ab47a 7459 rtx reg = gen_rtx_REG (mode, regno);
6a70badb 7460 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
c600df9a
RS
7461 if (frame_pointer_needed)
7462 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7463 else
7464 offset += crtl->outgoing_args_size;
7465
827ab47a
KT
7466 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7467 rtx mem = gen_frame_mem (mode, addr);
7468
7469 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7470 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7471 /* No more registers to handle after REGNO.
7472 Emit a single save/restore and exit. */
7473 if (regno2 == last_regno)
7474 {
7475 insn = emit_insn (set);
c600df9a
RS
7476 if (frame_related_p)
7477 {
7478 RTX_FRAME_RELATED_P (insn) = 1;
7479 if (prologue_p)
7480 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7481 else
7482 add_reg_note (insn, REG_CFA_RESTORE, reg);
7483 }
827ab47a
KT
7484 break;
7485 }
7486
6a70badb 7487 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
827ab47a
KT
7488 /* The next register is not of the same class or its offset is not
7489 mergeable with the current one into a pair. */
c600df9a
RS
7490 if (aarch64_sve_mode_p (mode)
7491 || !satisfies_constraint_Ump (mem)
827ab47a 7492 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
dcdd0f05 7493 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6a70badb
RS
7494 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7495 GET_MODE_SIZE (mode)))
827ab47a
KT
7496 {
7497 insn = emit_insn (set);
c600df9a
RS
7498 if (frame_related_p)
7499 {
7500 RTX_FRAME_RELATED_P (insn) = 1;
7501 if (prologue_p)
7502 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7503 else
7504 add_reg_note (insn, REG_CFA_RESTORE, reg);
7505 }
827ab47a
KT
7506
7507 regno = regno2;
7508 continue;
7509 }
7510
c600df9a
RS
7511 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7512
827ab47a
KT
7513 /* REGNO2 can be saved/restored in a pair with REGNO. */
7514 rtx reg2 = gen_rtx_REG (mode, regno2);
c600df9a
RS
7515 if (frame_pointer_needed)
7516 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7517 else
7518 offset2 += crtl->outgoing_args_size;
827ab47a
KT
7519 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7520 rtx mem2 = gen_frame_mem (mode, addr2);
7521 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7522 : gen_rtx_SET (reg2, mem2);
7523
7524 if (prologue_p)
7525 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7526 else
7527 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7528
c600df9a 7529 if (frame_related_p || frame_related2_p)
827ab47a 7530 {
c600df9a
RS
7531 RTX_FRAME_RELATED_P (insn) = 1;
7532 if (prologue_p)
7533 {
7534 if (frame_related_p)
7535 add_reg_note (insn, REG_CFA_OFFSET, set);
7536 if (frame_related2_p)
7537 add_reg_note (insn, REG_CFA_OFFSET, set2);
7538 }
7539 else
7540 {
7541 if (frame_related_p)
7542 add_reg_note (insn, REG_CFA_RESTORE, reg);
7543 if (frame_related2_p)
7544 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7545 }
827ab47a
KT
7546 }
7547
7548 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7549 }
7550}
7551
7552/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7553
7554static void
7555aarch64_emit_prologue_components (sbitmap components)
7556{
7557 aarch64_process_components (components, true);
7558}
7559
7560/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7561
7562static void
7563aarch64_emit_epilogue_components (sbitmap components)
7564{
7565 aarch64_process_components (components, false);
7566}
7567
7568/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7569
7570static void
7571aarch64_set_handled_components (sbitmap components)
7572{
7573 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7574 if (bitmap_bit_p (components, regno))
7575 cfun->machine->reg_is_wrapped_separately[regno] = true;
7576}
7577
8c6e3b23
TC
7578/* On AArch64 we have an ABI defined safe buffer. This constant is used to
 7579 determine the probe offset for alloca. */
7580
7581static HOST_WIDE_INT
7582aarch64_stack_clash_protection_alloca_probe_range (void)
7583{
7584 return STACK_CLASH_CALLER_GUARD;
7585}
7586
7587
cd1bef27
JL
7588/* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
 7589 registers. If POLY_SIZE is not large enough to require a probe, this function
 7590 will only adjust the stack. When allocating the stack space,
 7591 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
 7592 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
 7593 arguments. If we are, then we ensure that any allocation larger than the
 7594 ABI-defined buffer gets a probe, so that the invariant of having a 1KB
 7595 buffer is maintained.
7596
7597 We emit barriers after each stack adjustment to prevent optimizations from
7598 breaking the invariant that we never drop the stack more than a page. This
7599 invariant is needed to make it easier to correctly handle asynchronous
7600 events, e.g. if we were to allow the stack to be dropped by more than a page
7601 and then have multiple probes up and we take a signal somewhere in between
7602 then the signal handler doesn't know the state of the stack and can make no
7603 assumptions about which pages have been probed. */
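/* A worked example of the thresholds below (illustrative figures, assuming
   the default 64KB guard size and the 1KB STACK_CLASH_CALLER_GUARD): for the
   main allocation (!FINAL_ADJUSTMENT_P), MIN_PROBE_THRESHOLD is
   64KB - 1KB = 63KB, so a 60KB allocation is emitted as a plain stack
   adjustment, while a 70KB allocation is split into a probed 64KB page plus
   a 6KB residual adjustment.  For the outgoing-argument allocation
   (FINAL_ADJUSTMENT_P), the threshold starts at the 1KB caller guard and may
   be reduced further by the position of the LR save.  */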
7604
7605static void
7606aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7607 poly_int64 poly_size,
7608 bool frame_related_p,
7609 bool final_adjustment_p)
7610{
7611 HOST_WIDE_INT guard_size
028d4092 7612 = 1 << param_stack_clash_protection_guard_size;
cd1bef27 7613 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
cd1bef27 7614 HOST_WIDE_INT min_probe_threshold
c600df9a
RS
7615 = (final_adjustment_p
7616 ? guard_used_by_caller
7617 : guard_size - guard_used_by_caller);
7618 /* When doing the final adjustment for the outgoing arguments, take into
7619 account any unprobed space there is above the current SP. There are
7620 two cases:
7621
7622 - When saving SVE registers below the hard frame pointer, we force
7623 the lowest save to take place in the prologue before doing the final
7624 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7625 This acts as a probe at SP, so there is no unprobed space.
7626
7627 - When there are no SVE register saves, we use the store of the link
7628 register as a probe. We can't assume that LR was saved at position 0
7629 though, so treat any space below it as unprobed. */
7630 if (final_adjustment_p
7631 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7632 {
7633 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7634 if (known_ge (lr_offset, 0))
7635 min_probe_threshold -= lr_offset.to_constant ();
7636 else
7637 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7638 }
cd1bef27
JL
7639
7640 poly_int64 frame_size = cfun->machine->frame.frame_size;
7641
7642 /* We should always have a positive probe threshold. */
7643 gcc_assert (min_probe_threshold > 0);
7644
7645 if (flag_stack_clash_protection && !final_adjustment_p)
7646 {
7647 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
c600df9a 7648 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
cd1bef27
JL
7649 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7650
7651 if (known_eq (frame_size, 0))
7652 {
7653 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7654 }
c600df9a
RS
7655 else if (known_lt (initial_adjust + sve_callee_adjust,
7656 guard_size - guard_used_by_caller)
cd1bef27
JL
7657 && known_lt (final_adjust, guard_used_by_caller))
7658 {
7659 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7660 }
7661 }
7662
cd1bef27
JL
7663 /* If SIZE is not large enough to require probing, just adjust the stack and
7664 exit. */
eb471ba3 7665 if (known_lt (poly_size, min_probe_threshold)
cd1bef27
JL
7666 || !flag_stack_clash_protection)
7667 {
7668 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7669 return;
7670 }
7671
eb471ba3
TC
7672 HOST_WIDE_INT size;
7673 /* Handle the SVE non-constant case first. */
7674 if (!poly_size.is_constant (&size))
7675 {
7676 if (dump_file)
7677 {
7678 fprintf (dump_file, "Stack clash SVE prologue: ");
7679 print_dec (poly_size, dump_file);
7680 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7681 }
7682
7683 /* First calculate the amount of bytes we're actually spilling. */
7684 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7685 poly_size, temp1, temp2, false, true);
7686
7687 rtx_insn *insn = get_last_insn ();
7688
7689 if (frame_related_p)
7690 {
7691 /* This is done to provide unwinding information for the stack
 7692 adjustments we're about to do; however, to prevent the optimizers
143d3b15 7693 from removing the R11 move and leaving the CFA note (which would be
eb471ba3
TC
7694 very wrong) we tie the old and new stack pointer together.
7695 The tie will expand to nothing but the optimizers will not touch
7696 the instruction. */
143d3b15 7697 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
eb471ba3
TC
7698 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7699 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7700
7701 /* We want the CFA independent of the stack pointer for the
7702 duration of the loop. */
7703 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7704 RTX_FRAME_RELATED_P (insn) = 1;
7705 }
7706
7707 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7708 rtx guard_const = gen_int_mode (guard_size, Pmode);
7709
7710 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7711 stack_pointer_rtx, temp1,
7712 probe_const, guard_const));
7713
7714 /* Now reset the CFA register if needed. */
7715 if (frame_related_p)
7716 {
7717 add_reg_note (insn, REG_CFA_DEF_CFA,
7718 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7719 gen_int_mode (poly_size, Pmode)));
7720 RTX_FRAME_RELATED_P (insn) = 1;
7721 }
7722
7723 return;
7724 }
7725
cd1bef27
JL
7726 if (dump_file)
7727 fprintf (dump_file,
eb471ba3
TC
7728 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7729 " bytes, probing will be required.\n", size);
cd1bef27
JL
7730
7731 /* Round size to the nearest multiple of guard_size, and calculate the
7732 residual as the difference between the original size and the rounded
7733 size. */
7734 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7735 HOST_WIDE_INT residual = size - rounded_size;
7736
7737 /* We can handle a small number of allocations/probes inline. Otherwise
7738 punt to a loop. */
7739 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7740 {
7741 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7742 {
7743 aarch64_sub_sp (NULL, temp2, guard_size, true);
7744 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7745 guard_used_by_caller));
7746 emit_insn (gen_blockage ());
7747 }
7748 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7749 }
7750 else
7751 {
7752 /* Compute the ending address. */
7753 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7754 temp1, NULL, false, true);
7755 rtx_insn *insn = get_last_insn ();
7756
7757 /* For the initial allocation, we don't have a frame pointer
7758 set up, so we always need CFI notes. If we're doing the
7759 final allocation, then we may have a frame pointer, in which
7760 case it is the CFA, otherwise we need CFI notes.
7761
7762 We can determine which allocation we are doing by looking at
7763 the value of FRAME_RELATED_P since the final allocations are not
7764 frame related. */
7765 if (frame_related_p)
7766 {
7767 /* We want the CFA independent of the stack pointer for the
7768 duration of the loop. */
7769 add_reg_note (insn, REG_CFA_DEF_CFA,
7770 plus_constant (Pmode, temp1, rounded_size));
7771 RTX_FRAME_RELATED_P (insn) = 1;
7772 }
7773
7774 /* This allocates and probes the stack. Note that this re-uses some of
 7775 the existing Ada stack protection code. However, we are guaranteed not
 7776 to enter the non-loop or residual branches of that code.
7777
7778 The non-loop part won't be entered because if our allocation amount
7779 doesn't require a loop, the case above would handle it.
7780
 7781 The residual amount won't be entered because TEMP1 is a multiple of
7782 the allocation size. The residual will always be 0. As such, the only
7783 part we are actually using from that code is the loop setup. The
7784 actual probing is done in aarch64_output_probe_stack_range. */
7785 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7786 stack_pointer_rtx, temp1));
7787
7788 /* Now reset the CFA register if needed. */
7789 if (frame_related_p)
7790 {
7791 add_reg_note (insn, REG_CFA_DEF_CFA,
7792 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7793 RTX_FRAME_RELATED_P (insn) = 1;
7794 }
7795
7796 emit_insn (gen_blockage ());
7797 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7798 }
7799
7800 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7801 be probed. This maintains the requirement that each page is probed at
7802 least once. For initial probing we probe only if the allocation is
7803 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7804 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
 7805 GUARD_SIZE. This works because any allocation that is large enough to
 7806 trigger a probe here gets at least one, and for any allocation not large
 7807 enough for this code to emit anything, the page would have been
7808 probed by the saving of FP/LR either by this function or any callees. If
7809 we don't have any callees then we won't have more stack adjustments and so
7810 are still safe. */
7811 if (residual)
7812 {
7813 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7814 /* If we're doing final adjustments, and we've done any full page
7815 allocations then any residual needs to be probed. */
7816 if (final_adjustment_p && rounded_size != 0)
7817 min_probe_threshold = 0;
7818 /* If doing a small final adjustment, we always probe at offset 0.
7819 This is done to avoid issues when LR is not at position 0 or when
7820 the final adjustment is smaller than the probing offset. */
7821 else if (final_adjustment_p && rounded_size == 0)
7822 residual_probe_offset = 0;
7823
7824 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7825 if (residual >= min_probe_threshold)
7826 {
7827 if (dump_file)
7828 fprintf (dump_file,
7829 "Stack clash AArch64 prologue residuals: "
7830 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7831 "\n", residual);
7832
7833 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7834 residual_probe_offset));
7835 emit_insn (gen_blockage ());
7836 }
7837 }
7838}
7839
a0d0b980
SE
7840/* Return 1 if the register is used by the epilogue. We need to say the
7841 return register is used, but only after epilogue generation is complete.
7842 Note that in the case of sibcalls, the values "used by the epilogue" are
7843 considered live at the start of the called function.
7844
7845 For SIMD functions we need to return 1 for FP registers that are saved and
7846 restored by a function but are not zero in call_used_regs. If we do not do
7847 this optimizations may remove the restore of the register. */
7848
7849int
7850aarch64_epilogue_uses (int regno)
7851{
7852 if (epilogue_completed)
7853 {
7854 if (regno == LR_REGNUM)
7855 return 1;
a0d0b980
SE
7856 }
7857 return 0;
7858}
7859
43e9d192
IB
7860/* AArch64 stack frames generated by this compiler look like:
7861
7862 +-------------------------------+
7863 | |
7864 | incoming stack arguments |
7865 | |
34834420
MS
7866 +-------------------------------+
7867 | | <-- incoming stack pointer (aligned)
43e9d192
IB
7868 | callee-allocated save area |
7869 | for register varargs |
7870 | |
34834420
MS
7871 +-------------------------------+
7872 | local variables | <-- frame_pointer_rtx
43e9d192
IB
7873 | |
7874 +-------------------------------+
cd1bef27 7875 | padding | \
454fdba9 7876 +-------------------------------+ |
454fdba9 7877 | callee-saved registers | | frame.saved_regs_size
454fdba9
RL
7878 +-------------------------------+ |
7879 | LR' | |
7880 +-------------------------------+ |
c600df9a
RS
7881 | FP' | |
7882 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7883 | SVE vector registers | | \
7884 +-------------------------------+ | | below_hard_fp_saved_regs_size
7885 | SVE predicate registers | / /
7886 +-------------------------------+
43e9d192
IB
7887 | dynamic allocation |
7888 +-------------------------------+
34834420
MS
7889 | padding |
7890 +-------------------------------+
7891 | outgoing stack arguments | <-- arg_pointer
7892 | |
7893 +-------------------------------+
7894 | | <-- stack_pointer_rtx (aligned)
43e9d192 7895
34834420
MS
7896 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7897 but leave frame_pointer_rtx and hard_frame_pointer_rtx
cd1bef27
JL
7898 unchanged.
7899
7900 By default for stack-clash we assume the guard is at least 64KB, but this
7901 value is configurable to either 4KB or 64KB. We also force the guard size to
7902 be the same as the probing interval, so the two values are kept in sync.
7903
7904 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7905 on the guard size) of stack space without probing (see the sketch below).
7906
7907 When probing is needed, we emit a probe at the start of the prologue
7908 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7909
7910 We have to track how much space has been allocated and the only stores
7911 to the stack we track as implicit probes are the FP/LR stores.
7912
7913 For outgoing arguments we probe if the size is larger than 1KB, such that
143d3b15
TC
7914 the ABI specified buffer is maintained for the next callee.
7915
7916 The following registers are reserved during frame layout and should not be
7917 used for any other purpose:
7918
c600df9a
RS
7919 - r11: Used by stack clash protection when SVE is enabled, and also
7920 as an anchor register when saving and restoring registers
143d3b15
TC
7921 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7922 - r14 and r15: Used for speculation tracking.
7923 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7924 - r30(LR), r29(FP): Used by standard frame layout.
7925
7926 These registers must be avoided in frame layout related code unless the
7927 explicit intention is to interact with one of the features listed above. */
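/* A minimal standalone sketch of the probe-free allowance described in the
   comment above, assuming the default 64KB guard and the 1KB
   STACK_CLASH_CALLER_GUARD; the helper name and constants below are
   illustrative and not part of the backend.  */
static inline int
stack_clash_allowance_example (void)
{
  const long guard_size = 64 * 1024;	/* Configurable: 4KB or 64KB.  */
  const long caller_guard = 1024;	/* Reserved for outgoing arguments.  */

  /* A callee may allocate guard_size - caller_guard bytes (63KB here,
     or 3KB with a 4KB guard) before an explicit probe is required.  */
  return guard_size - caller_guard == 63 * 1024;
}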
43e9d192
IB
7928
7929/* Generate the prologue instructions for entry into a function.
7930 Establish the stack frame by decreasing the stack pointer with a
7931 properly calculated size and, if necessary, create a frame record
7932 filled with the values of LR and previous frame pointer. The
6991c977 7933 current FP is also set up if it is in use. */
43e9d192
IB
7934
7935void
7936aarch64_expand_prologue (void)
7937{
6a70badb
RS
7938 poly_int64 frame_size = cfun->machine->frame.frame_size;
7939 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 7940 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
7941 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7942 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
7943 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7944 poly_int64 below_hard_fp_saved_regs_size
7945 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
7946 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7947 unsigned reg2 = cfun->machine->frame.wb_candidate2;
204d2c03 7948 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
71bfb77a 7949 rtx_insn *insn;
43e9d192 7950
c600df9a
RS
7951 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7952 {
7953 /* Fold the SVE allocation into the initial allocation.
7954 We don't do this in aarch64_layout_frame to avoid pessimizing
7955 the epilogue code. */
7956 initial_adjust += sve_callee_adjust;
7957 sve_callee_adjust = 0;
7958 }
7959
db58fd89
JW
7960 /* Sign return address for functions. */
7961 if (aarch64_return_address_signing_enabled ())
27169e45 7962 {
8fc16d72
ST
7963 switch (aarch64_ra_sign_key)
7964 {
7965 case AARCH64_KEY_A:
7966 insn = emit_insn (gen_paciasp ());
7967 break;
7968 case AARCH64_KEY_B:
7969 insn = emit_insn (gen_pacibsp ());
7970 break;
7971 default:
7972 gcc_unreachable ();
7973 }
27169e45
JW
7974 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7975 RTX_FRAME_RELATED_P (insn) = 1;
7976 }
db58fd89 7977
dd991abb 7978 if (flag_stack_usage_info)
6a70badb 7979 current_function_static_stack_size = constant_lower_bound (frame_size);
43e9d192 7980
a3eb8a52
EB
7981 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7982 {
7983 if (crtl->is_leaf && !cfun->calls_alloca)
7984 {
6a70badb
RS
7985 if (maybe_gt (frame_size, PROBE_INTERVAL)
7986 && maybe_gt (frame_size, get_stack_check_protect ()))
8c1dd970
JL
7987 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7988 (frame_size
7989 - get_stack_check_protect ()));
a3eb8a52 7990 }
6a70badb 7991 else if (maybe_gt (frame_size, 0))
8c1dd970 7992 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
a3eb8a52
EB
7993 }
7994
901e66e0
SD
7995 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7996 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 7997
cd1bef27
JL
7998 /* In theory we should never have both an initial adjustment
7999 and a callee save adjustment. Verify that is the case since the
8000 code below does not handle it for -fstack-clash-protection. */
8001 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8002
8003 /* Will only probe if the initial adjustment is larger than the guard
8004 less the amount of the guard reserved for use by the caller's
8005 outgoing args. */
901e66e0 8006 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
cd1bef27 8007 true, false);
43e9d192 8008
71bfb77a
WD
8009 if (callee_adjust != 0)
8010 aarch64_push_regs (reg1, reg2, callee_adjust);
43e9d192 8011
c600df9a
RS
8012 /* The offset of the frame chain record (if any) from the current SP. */
8013 poly_int64 chain_offset = (initial_adjust + callee_adjust
8014 - cfun->machine->frame.hard_fp_offset);
8015 gcc_assert (known_ge (chain_offset, 0));
8016
8017 /* The offset of the bottom of the save area from the current SP. */
8018 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8019
204d2c03 8020 if (emit_frame_chain)
43e9d192 8021 {
71bfb77a 8022 if (callee_adjust == 0)
43cacb12
RS
8023 {
8024 reg1 = R29_REGNUM;
8025 reg2 = R30_REGNUM;
c600df9a
RS
8026 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8027 false, false);
43cacb12 8028 }
c600df9a
RS
8029 else
8030 gcc_assert (known_eq (chain_offset, 0));
f5470a77 8031 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
c600df9a 8032 stack_pointer_rtx, chain_offset,
901e66e0 8033 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
43cacb12
RS
8034 if (frame_pointer_needed && !frame_size.is_constant ())
8035 {
8036 /* Variable-sized frames need to describe the save slot
8037 address using DW_CFA_expression rather than DW_CFA_offset.
8038 This means that, without taking further action, the
8039 locations of the registers that we've already saved would
8040 remain based on the stack pointer even after we redefine
8041 the CFA based on the frame pointer. We therefore need new
8042 DW_CFA_expressions to re-express the save slots with addresses
8043 based on the frame pointer. */
8044 rtx_insn *insn = get_last_insn ();
8045 gcc_assert (RTX_FRAME_RELATED_P (insn));
8046
8047 /* Add an explicit CFA definition if this was previously
8048 implicit. */
8049 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8050 {
8051 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8052 callee_offset);
8053 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8054 gen_rtx_SET (hard_frame_pointer_rtx, src));
8055 }
8056
8057 /* Change the save slot expressions for the registers that
8058 we've already saved. */
c600df9a
RS
8059 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8060 hard_frame_pointer_rtx, UNITS_PER_WORD);
8061 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8062 hard_frame_pointer_rtx, 0);
43cacb12 8063 }
71bfb77a 8064 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
43e9d192 8065 }
71bfb77a 8066
c600df9a
RS
8067 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8068 callee_adjust != 0 || emit_frame_chain,
8069 emit_frame_chain);
8070 if (maybe_ne (sve_callee_adjust, 0))
8071 {
8072 gcc_assert (!flag_stack_clash_protection
8073 || known_eq (initial_adjust, 0));
8074 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8075 sve_callee_adjust,
8076 !frame_pointer_needed, false);
8077 saved_regs_offset += sve_callee_adjust;
8078 }
8079 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8080 false, emit_frame_chain);
8081 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8082 callee_adjust != 0 || emit_frame_chain,
8083 emit_frame_chain);
cd1bef27
JL
8084
8085 /* We may need to probe the final adjustment if it is larger than the guard
8086 that is assumed by the callee. */
901e66e0 8087 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
cd1bef27 8088 !frame_pointer_needed, true);
43e9d192
IB
8089}
8090
4f942779
RL
8091/* Return TRUE if we can use a simple_return insn.
8092
8093 This function checks whether the callee-saved stack is empty, which
8094 means no restore actions are needed. The pro_and_epilogue pass will use
8095 this to check whether the shrink-wrapping optimization is feasible. */
8096
8097bool
8098aarch64_use_return_insn_p (void)
8099{
8100 if (!reload_completed)
8101 return false;
8102
8103 if (crtl->profile)
8104 return false;
8105
6a70badb 8106 return known_eq (cfun->machine->frame.frame_size, 0);
4f942779
RL
8107}
8108
71bfb77a
WD
8109/* Generate the epilogue instructions for returning from a function.
8110 This is almost exactly the reverse of the prolog sequence, except
8111 that we need to insert barriers to avoid scheduling loads that read
8112 from a deallocated stack, and we optimize the unwind records by
8113 emitting them all together if possible. */
43e9d192
IB
8114void
8115aarch64_expand_epilogue (bool for_sibcall)
8116{
6a70badb 8117 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
71bfb77a 8118 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6a70badb
RS
8119 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8120 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
c600df9a
RS
8121 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8122 poly_int64 below_hard_fp_saved_regs_size
8123 = cfun->machine->frame.below_hard_fp_saved_regs_size;
71bfb77a
WD
8124 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8125 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8126 rtx cfi_ops = NULL;
8127 rtx_insn *insn;
901e66e0
SD
8128 /* A stack clash protection prologue may not have left EP0_REGNUM or
8129 EP1_REGNUM in a usable state. The same is true for allocations
43cacb12 8130 with an SVE component, since we then need both temporary registers
cd1bef27
JL
8131 for each allocation. For stack clash we are in a usable state if
8132 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8133 HOST_WIDE_INT guard_size
028d4092 8134 = 1 << param_stack_clash_protection_guard_size;
cd1bef27
JL
8135 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8136
c600df9a
RS
8137 /* We can re-use the registers when:
8138
8139 (a) the deallocation amount is the same as the corresponding
8140 allocation amount (which is false if we combine the initial
8141 and SVE callee save allocations in the prologue); and
8142
8143 (b) the allocation amount doesn't need a probe (which is false
8144 if the amount is guard_size - guard_used_by_caller or greater).
8145
8146 In such situations the register should remain live with the correct
cd1bef27 8147 value. */
43cacb12 8148 bool can_inherit_p = (initial_adjust.is_constant ()
c600df9a 8149 && final_adjust.is_constant ()
cd1bef27 8150 && (!flag_stack_clash_protection
c600df9a
RS
8151 || (known_lt (initial_adjust,
8152 guard_size - guard_used_by_caller)
8153 && known_eq (sve_callee_adjust, 0))));
44c0e7b9 8154
71bfb77a 8155 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6a70badb
RS
8156 bool need_barrier_p
8157 = maybe_ne (get_frame_size ()
8158 + cfun->machine->frame.saved_varargs_size, 0);
43e9d192 8159
71bfb77a 8160 /* Emit a barrier to prevent loads from a deallocated stack. */
6a70badb
RS
8161 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8162 || cfun->calls_alloca
8144a493 8163 || crtl->calls_eh_return)
43e9d192 8164 {
71bfb77a
WD
8165 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8166 need_barrier_p = false;
8167 }
7e8c2bd5 8168
71bfb77a
WD
8169 /* Restore the stack pointer from the frame pointer if it may not
8170 be the same as the stack pointer. */
901e66e0
SD
8171 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8172 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6a70badb
RS
8173 if (frame_pointer_needed
8174 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
f5470a77
RS
8175 /* If writeback is used when restoring callee-saves, the CFA
8176 is restored on the instruction doing the writeback. */
8177 aarch64_add_offset (Pmode, stack_pointer_rtx,
c600df9a
RS
8178 hard_frame_pointer_rtx,
8179 -callee_offset - below_hard_fp_saved_regs_size,
901e66e0 8180 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
71bfb77a 8181 else
cd1bef27
JL
8182 /* The case where we need to re-use the register here is very rare, so
8183 avoid the complicated condition and just always emit a move if the
8184 immediate doesn't fit. */
901e66e0 8185 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
43e9d192 8186
c600df9a
RS
8187 /* Restore the vector registers before the predicate registers,
8188 so that we can use P4 as a temporary for big-endian SVE frames. */
8189 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8190 callee_adjust != 0, &cfi_ops);
8191 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8192 false, &cfi_ops);
8193 if (maybe_ne (sve_callee_adjust, 0))
8194 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8195 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8196 R0_REGNUM, R30_REGNUM,
71bfb77a 8197 callee_adjust != 0, &cfi_ops);
43e9d192 8198
71bfb77a
WD
8199 if (need_barrier_p)
8200 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8201
8202 if (callee_adjust != 0)
8203 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8204
1ccbfffb
RS
8205 /* If we have no register restore information, the CFA must have been
8206 defined in terms of the stack pointer since the end of the prologue. */
8207 gcc_assert (cfi_ops || !frame_pointer_needed);
8208
8209 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
71bfb77a
WD
8210 {
8211 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
89ac681e 8212 insn = get_last_insn ();
71bfb77a
WD
8213 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8214 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
43e9d192 8215 RTX_FRAME_RELATED_P (insn) = 1;
71bfb77a 8216 cfi_ops = NULL;
43e9d192
IB
8217 }
8218
901e66e0
SD
8219 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8220 restrict the emit_move optimization to leaf functions. */
8221 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8222 (!can_inherit_p || !crtl->is_leaf
8223 || df_regs_ever_live_p (EP0_REGNUM)));
7e8c2bd5 8224
71bfb77a
WD
8225 if (cfi_ops)
8226 {
8227 /* Emit delayed restores and reset the CFA to be SP. */
8228 insn = get_last_insn ();
8229 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8230 REG_NOTES (insn) = cfi_ops;
8231 RTX_FRAME_RELATED_P (insn) = 1;
dd991abb
RH
8232 }
8233
db58fd89
JW
8234 /* We prefer to emit the combined return/authenticate instruction RETAA,
8235 however there are three cases in which we must instead emit an explicit
8236 authentication instruction.
8237
8238 1) Sibcalls don't return in a normal way, so if we're about to call one
8239 we must authenticate.
8240
8241 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8242 generating code for !TARGET_ARMV8_3 we can't use it and must
8243 explicitly authenticate.
8244
8245 3) On an eh_return path we make extra stack adjustments to update the
8246 canonical frame address to be the exception handler's CFA. We want
8247 to authenticate using the CFA of the function which calls eh_return.
8248 */
8249 if (aarch64_return_address_signing_enabled ()
8250 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
27169e45 8251 {
8fc16d72
ST
8252 switch (aarch64_ra_sign_key)
8253 {
8254 case AARCH64_KEY_A:
8255 insn = emit_insn (gen_autiasp ());
8256 break;
8257 case AARCH64_KEY_B:
8258 insn = emit_insn (gen_autibsp ());
8259 break;
8260 default:
8261 gcc_unreachable ();
8262 }
27169e45
JW
8263 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8264 RTX_FRAME_RELATED_P (insn) = 1;
8265 }
db58fd89 8266
dd991abb 8267 /* Stack adjustment for exception handler. */
b5b9147d 8268 if (crtl->calls_eh_return && !for_sibcall)
dd991abb
RH
8269 {
8270 /* We need to unwind the stack by the offset computed by
8271 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8272 to be SP; letting the CFA move during this adjustment
8273 is just as correct as retaining the CFA from the body
8274 of the function. Therefore, do nothing special. */
8275 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
43e9d192
IB
8276 }
8277
8278 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8279 if (!for_sibcall)
8280 emit_jump_insn (ret_rtx);
8281}
8282
8144a493
WD
8283/* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8284 normally or return to a previous frame after unwinding.
1c960e02 8285
8144a493
WD
8286 An EH return uses a single shared return sequence. The epilogue is
8287 exactly like a normal epilogue except that it has an extra input
8288 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8289 that must be applied after the frame has been destroyed. An extra label
8290 is inserted before the epilogue which initializes this register to zero,
8291 and this is the entry point for a normal return.
43e9d192 8292
8144a493
WD
8293 An actual EH return updates the return address, initializes the stack
8294 adjustment and jumps directly into the epilogue (bypassing the zeroing
8295 of the adjustment). Since the return address is typically saved on the
8296 stack when a function makes a call, the saved LR must be updated outside
8297 the epilogue.
43e9d192 8298
8144a493
WD
8299 This poses problems as the store is generated well before the epilogue,
8300 so the offset of LR is not known yet. Also optimizations will remove the
8301 store as it appears dead, even after the epilogue is generated (as the
8302 base or offset for loading LR is different in many cases).
43e9d192 8303
8144a493
WD
8304 To avoid these problems this implementation forces the frame pointer
8305 in eh_return functions so that the location of LR is fixed and known early.
8306 It also marks the store volatile, so no optimization is permitted to
8307 remove the store. */
8308rtx
8309aarch64_eh_return_handler_rtx (void)
8310{
8311 rtx tmp = gen_frame_mem (Pmode,
8312 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
43e9d192 8313
8144a493
WD
8314 /* Mark the store volatile, so no optimization is permitted to remove it. */
8315 MEM_VOLATILE_P (tmp) = true;
8316 return tmp;
43e9d192
IB
8317}
8318
43e9d192
IB
8319/* Output code to add DELTA to the first argument, and then jump
8320 to FUNCTION. Used for C++ multiple inheritance. */
8321static void
8322aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8323 HOST_WIDE_INT delta,
8324 HOST_WIDE_INT vcall_offset,
8325 tree function)
8326{
8327 /* The this pointer is always in x0. Note that this differs from
8328 Arm where the this pointer may be bumped to r1 if r0 is required
8329 to return a pointer to an aggregate. On AArch64 a result value
8330 pointer will be in x8. */
8331 int this_regno = R0_REGNUM;
5d8a22a5
DM
8332 rtx this_rtx, temp0, temp1, addr, funexp;
8333 rtx_insn *insn;
6b5777c6 8334 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
43e9d192 8335
c904388d
SD
8336 if (aarch64_bti_enabled ())
8337 emit_insn (gen_bti_c ());
8338
75f1d6fc
SN
8339 reload_completed = 1;
8340 emit_note (NOTE_INSN_PROLOGUE_END);
43e9d192 8341
f5470a77 8342 this_rtx = gen_rtx_REG (Pmode, this_regno);
901e66e0
SD
8343 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8344 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
f5470a77 8345
43e9d192 8346 if (vcall_offset == 0)
43cacb12 8347 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
43e9d192
IB
8348 else
8349 {
28514dda 8350 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
43e9d192 8351
75f1d6fc
SN
8352 addr = this_rtx;
8353 if (delta != 0)
8354 {
8355 if (delta >= -256 && delta < 256)
8356 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8357 plus_constant (Pmode, this_rtx, delta));
8358 else
43cacb12
RS
8359 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8360 temp1, temp0, false);
43e9d192
IB
8361 }
8362
28514dda
YZ
8363 if (Pmode == ptr_mode)
8364 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8365 else
8366 aarch64_emit_move (temp0,
8367 gen_rtx_ZERO_EXTEND (Pmode,
8368 gen_rtx_MEM (ptr_mode, addr)));
75f1d6fc 8369
28514dda 8370 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
75f1d6fc 8371 addr = plus_constant (Pmode, temp0, vcall_offset);
43e9d192
IB
8372 else
8373 {
f43657b4
JW
8374 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8375 Pmode);
75f1d6fc 8376 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
43e9d192
IB
8377 }
8378
28514dda
YZ
8379 if (Pmode == ptr_mode)
8380 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
8381 else
8382 aarch64_emit_move (temp1,
8383 gen_rtx_SIGN_EXTEND (Pmode,
8384 gen_rtx_MEM (ptr_mode, addr)));
8385
75f1d6fc 8386 emit_insn (gen_add2_insn (this_rtx, temp1));
43e9d192
IB
8387 }
8388
75f1d6fc
SN
8389 /* Generate a tail call to the target function. */
8390 if (!TREE_USED (function))
8391 {
8392 assemble_external (function);
8393 TREE_USED (function) = 1;
8394 }
8395 funexp = XEXP (DECL_RTL (function), 0);
8396 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
08cc4d92
RS
8397 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8398 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
75f1d6fc
SN
8399 SIBLING_CALL_P (insn) = 1;
8400
8401 insn = get_insns ();
8402 shorten_branches (insn);
6b5777c6
MF
8403
8404 assemble_start_function (thunk, fnname);
75f1d6fc
SN
8405 final_start_function (insn, file, 1);
8406 final (insn, file, 1);
43e9d192 8407 final_end_function ();
6b5777c6 8408 assemble_end_function (thunk, fnname);
75f1d6fc
SN
8409
8410 /* Stop pretending to be a post-reload pass. */
8411 reload_completed = 0;
43e9d192
IB
8412}
8413
43e9d192
IB
8414static bool
8415aarch64_tls_referenced_p (rtx x)
8416{
8417 if (!TARGET_HAVE_TLS)
8418 return false;
e7de8563
RS
8419 subrtx_iterator::array_type array;
8420 FOR_EACH_SUBRTX (iter, array, x, ALL)
8421 {
8422 const_rtx x = *iter;
8423 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8424 return true;
8425 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8426 TLS offsets, not real symbol references. */
8427 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8428 iter.skip_subrtxes ();
8429 }
8430 return false;
43e9d192
IB
8431}
8432
8433
43e9d192
IB
8434/* Return true if val can be encoded as a 12-bit unsigned immediate with
8435 a left shift of 0 or 12 bits. */
8436bool
8437aarch64_uimm12_shift (HOST_WIDE_INT val)
8438{
8439 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8440 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8441 );
8442}
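/* A minimal standalone restatement of the check above, with worked examples;
   the helper name is hypothetical and the values are illustrative.  */
static inline int
uimm12_shift_example (unsigned long long val)
{
  /* Encodable iff every set bit lies in bits [0,11], or every set bit lies
     in bits [12,23].  */
  return (val & 0xfffULL) == val || (val & (0xfffULL << 12)) == val;
}

/* uimm12_shift_example (0xabc)    -> 1 (ADD ..., #0xabc)
   uimm12_shift_example (0xabc000) -> 1 (ADD ..., #0xabc, LSL #12)
   uimm12_shift_example (0x1001)   -> 0 (set bits span both fields).  */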
8443
eb471ba3
TC
8444/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
8445 that can be created with a left shift of 0 or 12. */
8446static HOST_WIDE_INT
8447aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8448{
8449 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8450 handle correctly. */
8451 gcc_assert ((val & 0xffffff) == val);
8452
8453 if (((val & 0xfff) << 0) == val)
8454 return val;
8455
8456 return val & (0xfff << 12);
8457}
43e9d192
IB
8458
8459/* Return true if val is an immediate that can be loaded into a
8460 register by a MOVZ instruction. */
8461static bool
77e994c9 8462aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
43e9d192
IB
8463{
8464 if (GET_MODE_SIZE (mode) > 4)
8465 {
8466 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8467 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8468 return 1;
8469 }
8470 else
8471 {
43cacb12
RS
8472 /* Ignore sign extension. */
8473 val &= (HOST_WIDE_INT) 0xffffffff;
8474 }
8475 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8476 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8477}
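/* A minimal standalone sketch of the MOVZ half of the test above, assuming a
   64-bit value; the helper name and the example values are illustrative (the
   MOVN case is handled by the caller passing ~val).  */
static inline int
movz_imm_example (unsigned long long val)
{
  /* MOVZ can materialise one 16-bit chunk at bit position 0, 16, 32 or 48
     with every other bit zero.  */
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return 1;
  return 0;
}

/* movz_imm_example (0x0000000000abc000) -> 0 (chunk straddles bit 16)
   movz_imm_example (0x00000000abcd0000) -> 1 (MOVZ Xd, #0xabcd, LSL #16).  */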
8478
bba0c624
RS
8479/* Test whether:
8480
8481 X = (X & AND_VAL) | IOR_VAL;
8482
8483 can be implemented using:
8484
8485 MOVK X, #(IOR_VAL >> shift), LSL #shift
8486
8487 Return the shift if so, otherwise return -1. */
8488int
8489aarch64_movk_shift (const wide_int_ref &and_val,
8490 const wide_int_ref &ior_val)
8491{
8492 unsigned int precision = and_val.get_precision ();
8493 unsigned HOST_WIDE_INT mask = 0xffff;
8494 for (unsigned int shift = 0; shift < precision; shift += 16)
8495 {
8496 if (and_val == ~mask && (ior_val & mask) == ior_val)
8497 return shift;
8498 mask <<= 16;
8499 }
8500 return -1;
8501}
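/* A standalone restatement of the MOVK test above using plain 64-bit masks
   (the real function works on wide_int_refs); the helper name and the
   example values are illustrative.  */
static inline int
movk_shift_example (unsigned long long and_val, unsigned long long ior_val)
{
  unsigned long long mask = 0xffff;
  for (int shift = 0; shift < 64; shift += 16)
    {
      if (and_val == ~mask && (ior_val & mask) == ior_val)
	return shift;
      mask <<= 16;
    }
  return -1;
}

/* X = (X & 0xffffffff0000ffff) | 0x12340000 can be implemented as
   MOVK X, #0x1234, LSL #16, so the sketch returns 16 for those inputs.  */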
8502
43cacb12
RS
8503/* VAL is a value with the inner mode of MODE. Replicate it to fill a
8504 64-bit (DImode) integer. */
8505
8506static unsigned HOST_WIDE_INT
8507aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8508{
8509 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8510 while (size < 64)
8511 {
8512 val &= (HOST_WIDE_INT_1U << size) - 1;
8513 val |= val << size;
8514 size *= 2;
43e9d192 8515 }
43cacb12 8516 return val;
43e9d192
IB
8517}
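/* A worked example of the replication above, written as a standalone sketch
   with a hypothetical name: an 8-bit pattern is doubled until it fills
   64 bits.  */
static inline unsigned long long
replicate_example (unsigned long long val, unsigned size)
{
  /* e.g. val = 0xa5, size = 8:
     0xa5 -> 0xa5a5 -> 0xa5a5a5a5 -> 0xa5a5a5a5a5a5a5a5.  */
  while (size < 64)
    {
      val &= (1ULL << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}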
8518
a64c73a2
WD
8519/* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8520
8521static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8522 {
8523 0x0000000100000001ull,
8524 0x0001000100010001ull,
8525 0x0101010101010101ull,
8526 0x1111111111111111ull,
8527 0x5555555555555555ull,
8528 };
8529
43e9d192
IB
8530
8531/* Return true if val is a valid bitmask immediate. */
a64c73a2 8532
43e9d192 8533bool
a64c73a2 8534aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
43e9d192 8535{
a64c73a2
WD
8536 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8537 int bits;
8538
8539 /* Check for a single sequence of one bits and return quickly if so.
8540 The special cases of all ones and all zeroes return false. */
43cacb12 8541 val = aarch64_replicate_bitmask_imm (val_in, mode);
a64c73a2
WD
8542 tmp = val + (val & -val);
8543
8544 if (tmp == (tmp & -tmp))
8545 return (val + 1) > 1;
8546
8547 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8548 if (mode == SImode)
8549 val = (val << 32) | (val & 0xffffffff);
8550
8551 /* Invert if the immediate doesn't start with a zero bit - this means we
8552 only need to search for sequences of one bits. */
8553 if (val & 1)
8554 val = ~val;
8555
8556 /* Find the first set bit and set tmp to val with the first sequence of one
8557 bits removed. Return success if there is a single sequence of ones. */
8558 first_one = val & -val;
8559 tmp = val & (val + first_one);
8560
8561 if (tmp == 0)
8562 return true;
8563
8564 /* Find the next set bit and compute the difference in bit position. */
8565 next_one = tmp & -tmp;
8566 bits = clz_hwi (first_one) - clz_hwi (next_one);
8567 mask = val ^ tmp;
8568
8569 /* Check the bit position difference is a power of 2, and that the first
8570 sequence of one bits fits within 'bits' bits. */
8571 if ((mask >> bits) != 0 || bits != (bits & -bits))
8572 return false;
8573
8574 /* Check the sequence of one bits is repeated 64/bits times. */
8575 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
43e9d192
IB
8576}
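/* The set recognised above is the AArch64 "bitmask immediate" encoding:
   a contiguous run of ones rotated within an element of 2, 4, 8, 16, 32 or
   64 bits and replicated across the register.  A minimal standalone
   enumerator of that set (hypothetical names, not part of GCC); it yields
   the well-known total of 5334 encodable values, with all-zeros and
   all-ones excluded.  */
static inline unsigned long long
rotr_elt_example (unsigned long long val, unsigned amount, unsigned esize)
{
  unsigned long long mask = (esize == 64) ? ~0ULL : (1ULL << esize) - 1;
  val &= mask;
  if (amount == 0)
    return val;
  return ((val >> amount) | (val << (esize - amount))) & mask;
}

static inline int
count_bitmask_immediates_example (void)
{
  int count = 0;
  for (unsigned esize = 2; esize <= 64; esize *= 2)
    for (unsigned len = 1; len < esize; len++)
      for (unsigned rot = 0; rot < esize; rot++)
	{
	  unsigned long long elt
	    = rotr_elt_example ((1ULL << len) - 1, rot, esize);
	  unsigned long long imm = 0;
	  for (unsigned i = 0; i < 64; i += esize)
	    imm |= elt << i;
	  (void) imm;	/* e.g. 0x00ff00ff00ff00ff (esize 16, len 8).  */
	  count++;
	}
  return count;	/* 5334.  */
}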
8577
43fd192f
MC
8578/* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
8579 Assumed precondition: VAL_IN is not zero. */
8580
8581unsigned HOST_WIDE_INT
8582aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8583{
8584 int lowest_bit_set = ctz_hwi (val_in);
8585 int highest_bit_set = floor_log2 (val_in);
8586 gcc_assert (val_in != 0);
8587
8588 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8589 (HOST_WIDE_INT_1U << lowest_bit_set));
8590}
8591
8592/* Create a constant in which all bits outside the range from the lowest set
8593 bit to the highest set bit of VAL_IN are set to 1. */
8594
8595unsigned HOST_WIDE_INT
8596aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8597{
8598 return val_in | ~aarch64_and_split_imm1 (val_in);
8599}
8600
8601/* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8602
8603bool
8604aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8605{
77e994c9
RS
8606 scalar_int_mode int_mode;
8607 if (!is_a <scalar_int_mode> (mode, &int_mode))
8608 return false;
8609
8610 if (aarch64_bitmask_imm (val_in, int_mode))
43fd192f
MC
8611 return false;
8612
77e994c9 8613 if (aarch64_move_imm (val_in, int_mode))
43fd192f
MC
8614 return false;
8615
8616 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8617
77e994c9 8618 return aarch64_bitmask_imm (imm2, int_mode);
43fd192f 8619}
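/* A worked example of the two-instruction AND split above, written as a
   standalone sketch; the value and the helper name are illustrative.  */
static inline int
and_split_example (void)
{
  unsigned long long val = 0x6fe00;	/* Not a single bitmask immediate.  */

  /* imm1: ones covering the lowest to the highest set bit (bits 9..18).  */
  unsigned long long imm1 = (2ULL << 18) - (1ULL << 9);	/* 0x7fe00 */
  /* imm2: VAL with every bit outside that range forced to one.  */
  unsigned long long imm2 = val | ~imm1;	/* 0xfffffffffffeffff */

  /* Both masks are valid bitmask immediates, and (x & imm2) & imm1 == x & val
     for any x, so the AND can be implemented as two AND-immediate
     instructions.  */
  return (imm2 & imm1) == val;
}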
43e9d192
IB
8620
8621/* Return true if val is an immediate that can be loaded into a
8622 register in a single instruction. */
8623bool
ef4bddc2 8624aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
43e9d192 8625{
77e994c9
RS
8626 scalar_int_mode int_mode;
8627 if (!is_a <scalar_int_mode> (mode, &int_mode))
8628 return false;
8629
8630 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
43e9d192 8631 return 1;
77e994c9 8632 return aarch64_bitmask_imm (val, int_mode);
43e9d192
IB
8633}
8634
8635static bool
ef4bddc2 8636aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
43e9d192
IB
8637{
8638 rtx base, offset;
7eda14e1 8639
43e9d192
IB
8640 if (GET_CODE (x) == HIGH)
8641 return true;
8642
43cacb12
RS
8643 /* There's no way to calculate VL-based values using relocations. */
8644 subrtx_iterator::array_type array;
8645 FOR_EACH_SUBRTX (iter, array, x, ALL)
8646 if (GET_CODE (*iter) == CONST_POLY_INT)
8647 return true;
8648
43e9d192
IB
8649 split_const (x, &base, &offset);
8650 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
28514dda 8651 {
43cacb12 8652 if (aarch64_classify_symbol (base, INTVAL (offset))
28514dda
YZ
8653 != SYMBOL_FORCE_TO_MEM)
8654 return true;
8655 else
8656 /* Avoid generating a 64-bit relocation in ILP32; leave
8657 to aarch64_expand_mov_immediate to handle it properly. */
8658 return mode != ptr_mode;
8659 }
43e9d192
IB
8660
8661 return aarch64_tls_referenced_p (x);
8662}
8663
e79136e4
WD
8664/* Implement TARGET_CASE_VALUES_THRESHOLD.
8665 The expansion for a table switch is quite expensive due to the number
8666 of instructions, the table lookup and the hard-to-predict indirect jump.
8667 When optimizing for speed with -O3 enabled, use the per-core tuning if
8668 set, otherwise use tables for > 16 cases as a tradeoff between size and
8669 performance. When optimizing for size, use the default setting. */
50487d79
EM
8670
8671static unsigned int
8672aarch64_case_values_threshold (void)
8673{
8674 /* Use the specified limit for the number of cases before using jump
8675 tables at higher optimization levels. */
8676 if (optimize > 2
8677 && selected_cpu->tune->max_case_values != 0)
8678 return selected_cpu->tune->max_case_values;
8679 else
e79136e4 8680 return optimize_size ? default_case_values_threshold () : 17;
50487d79
EM
8681}
8682
43e9d192
IB
8683/* Return true if register REGNO is a valid index register.
8684 STRICT_P is true if REG_OK_STRICT is in effect. */
8685
8686bool
8687aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8688{
8689 if (!HARD_REGISTER_NUM_P (regno))
8690 {
8691 if (!strict_p)
8692 return true;
8693
8694 if (!reg_renumber)
8695 return false;
8696
8697 regno = reg_renumber[regno];
8698 }
8699 return GP_REGNUM_P (regno);
8700}
8701
8702/* Return true if register REGNO is a valid base register.
8703 STRICT_P is true if REG_OK_STRICT is in effect. */
8704
8705bool
8706aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8707{
8708 if (!HARD_REGISTER_NUM_P (regno))
8709 {
8710 if (!strict_p)
8711 return true;
8712
8713 if (!reg_renumber)
8714 return false;
8715
8716 regno = reg_renumber[regno];
8717 }
8718
8719 /* The fake registers will be eliminated to either the stack or
8720 hard frame pointer, both of which are usually valid base registers.
8721 Reload deals with the cases where the eliminated form isn't valid. */
8722 return (GP_REGNUM_P (regno)
8723 || regno == SP_REGNUM
8724 || regno == FRAME_POINTER_REGNUM
8725 || regno == ARG_POINTER_REGNUM);
8726}
8727
8728/* Return true if X is a valid base register.
8729 STRICT_P is true if REG_OK_STRICT is in effect. */
8730
8731static bool
8732aarch64_base_register_rtx_p (rtx x, bool strict_p)
8733{
76160199
RS
8734 if (!strict_p
8735 && GET_CODE (x) == SUBREG
8736 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
43e9d192
IB
8737 x = SUBREG_REG (x);
8738
8739 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8740}
8741
8742/* Return true if address offset is a valid index. If it is, fill in INFO
8743 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8744
8745static bool
8746aarch64_classify_index (struct aarch64_address_info *info, rtx x,
ef4bddc2 8747 machine_mode mode, bool strict_p)
43e9d192
IB
8748{
8749 enum aarch64_address_type type;
8750 rtx index;
8751 int shift;
8752
8753 /* (reg:P) */
8754 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8755 && GET_MODE (x) == Pmode)
8756 {
8757 type = ADDRESS_REG_REG;
8758 index = x;
8759 shift = 0;
8760 }
8761 /* (sign_extend:DI (reg:SI)) */
8762 else if ((GET_CODE (x) == SIGN_EXTEND
8763 || GET_CODE (x) == ZERO_EXTEND)
8764 && GET_MODE (x) == DImode
8765 && GET_MODE (XEXP (x, 0)) == SImode)
8766 {
8767 type = (GET_CODE (x) == SIGN_EXTEND)
8768 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8769 index = XEXP (x, 0);
8770 shift = 0;
8771 }
8772 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8773 else if (GET_CODE (x) == MULT
8774 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8775 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8776 && GET_MODE (XEXP (x, 0)) == DImode
8777 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8778 && CONST_INT_P (XEXP (x, 1)))
8779 {
8780 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8781 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8782 index = XEXP (XEXP (x, 0), 0);
8783 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8784 }
8785 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8786 else if (GET_CODE (x) == ASHIFT
8787 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8788 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8789 && GET_MODE (XEXP (x, 0)) == DImode
8790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8791 && CONST_INT_P (XEXP (x, 1)))
8792 {
8793 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8794 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8795 index = XEXP (XEXP (x, 0), 0);
8796 shift = INTVAL (XEXP (x, 1));
8797 }
8798 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8799 else if ((GET_CODE (x) == SIGN_EXTRACT
8800 || GET_CODE (x) == ZERO_EXTRACT)
8801 && GET_MODE (x) == DImode
8802 && GET_CODE (XEXP (x, 0)) == MULT
8803 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8804 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8805 {
8806 type = (GET_CODE (x) == SIGN_EXTRACT)
8807 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8808 index = XEXP (XEXP (x, 0), 0);
8809 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8810 if (INTVAL (XEXP (x, 1)) != 32 + shift
8811 || INTVAL (XEXP (x, 2)) != 0)
8812 shift = -1;
8813 }
8814 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8815 (const_int 0xffffffff<<shift)) */
8816 else if (GET_CODE (x) == AND
8817 && GET_MODE (x) == DImode
8818 && GET_CODE (XEXP (x, 0)) == MULT
8819 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8820 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8821 && CONST_INT_P (XEXP (x, 1)))
8822 {
8823 type = ADDRESS_REG_UXTW;
8824 index = XEXP (XEXP (x, 0), 0);
8825 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8826 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8827 shift = -1;
8828 }
8829 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8830 else if ((GET_CODE (x) == SIGN_EXTRACT
8831 || GET_CODE (x) == ZERO_EXTRACT)
8832 && GET_MODE (x) == DImode
8833 && GET_CODE (XEXP (x, 0)) == ASHIFT
8834 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8835 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8836 {
8837 type = (GET_CODE (x) == SIGN_EXTRACT)
8838 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8839 index = XEXP (XEXP (x, 0), 0);
8840 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8841 if (INTVAL (XEXP (x, 1)) != 32 + shift
8842 || INTVAL (XEXP (x, 2)) != 0)
8843 shift = -1;
8844 }
8845 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8846 (const_int 0xffffffff<<shift)) */
8847 else if (GET_CODE (x) == AND
8848 && GET_MODE (x) == DImode
8849 && GET_CODE (XEXP (x, 0)) == ASHIFT
8850 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8851 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8852 && CONST_INT_P (XEXP (x, 1)))
8853 {
8854 type = ADDRESS_REG_UXTW;
8855 index = XEXP (XEXP (x, 0), 0);
8856 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8857 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8858 shift = -1;
8859 }
8860 /* (mult:P (reg:P) (const_int scale)) */
8861 else if (GET_CODE (x) == MULT
8862 && GET_MODE (x) == Pmode
8863 && GET_MODE (XEXP (x, 0)) == Pmode
8864 && CONST_INT_P (XEXP (x, 1)))
8865 {
8866 type = ADDRESS_REG_REG;
8867 index = XEXP (x, 0);
8868 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8869 }
8870 /* (ashift:P (reg:P) (const_int shift)) */
8871 else if (GET_CODE (x) == ASHIFT
8872 && GET_MODE (x) == Pmode
8873 && GET_MODE (XEXP (x, 0)) == Pmode
8874 && CONST_INT_P (XEXP (x, 1)))
8875 {
8876 type = ADDRESS_REG_REG;
8877 index = XEXP (x, 0);
8878 shift = INTVAL (XEXP (x, 1));
8879 }
8880 else
8881 return false;
8882
76160199
RS
8883 if (!strict_p
8884 && GET_CODE (index) == SUBREG
8885 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
43e9d192
IB
8886 index = SUBREG_REG (index);
8887
43cacb12
RS
8888 if (aarch64_sve_data_mode_p (mode))
8889 {
8890 if (type != ADDRESS_REG_REG
8891 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8892 return false;
8893 }
8894 else
8895 {
8896 if (shift != 0
8897 && !(IN_RANGE (shift, 1, 3)
8898 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8899 return false;
8900 }
8901
8902 if (REG_P (index)
43e9d192
IB
8903 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8904 {
8905 info->type = type;
8906 info->offset = index;
8907 info->shift = shift;
8908 return true;
8909 }
8910
8911 return false;
8912}
8913
abc52318
KT
8914/* Return true if MODE is one of the modes for which we
8915 support LDP/STP operations. */
8916
8917static bool
8918aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8919{
8920 return mode == SImode || mode == DImode
8921 || mode == SFmode || mode == DFmode
8922 || (aarch64_vector_mode_supported_p (mode)
9f5361c8
KT
8923 && (known_eq (GET_MODE_SIZE (mode), 8)
8924 || (known_eq (GET_MODE_SIZE (mode), 16)
8925 && (aarch64_tune_params.extra_tuning_flags
8926 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
abc52318
KT
8927}
8928
9e0218fc
RH
8929/* Return true if REGNO is a virtual pointer register, or an eliminable
8930 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8931 include stack_pointer or hard_frame_pointer. */
8932static bool
8933virt_or_elim_regno_p (unsigned regno)
8934{
8935 return ((regno >= FIRST_VIRTUAL_REGISTER
8936 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8937 || regno == FRAME_POINTER_REGNUM
8938 || regno == ARG_POINTER_REGNUM);
8939}
8940
a97d8b98
RS
8941/* Return true if X is a valid address of type TYPE for machine mode MODE.
8942 If it is, fill in INFO appropriately. STRICT_P is true if
8943 REG_OK_STRICT is in effect. */
43e9d192 8944
a98824ac 8945bool
43e9d192 8946aarch64_classify_address (struct aarch64_address_info *info,
a97d8b98 8947 rtx x, machine_mode mode, bool strict_p,
a98824ac 8948 aarch64_addr_query_type type)
43e9d192
IB
8949{
8950 enum rtx_code code = GET_CODE (x);
8951 rtx op0, op1;
dc640181
RS
8952 poly_int64 offset;
8953
6a70badb 8954 HOST_WIDE_INT const_size;
2d8c6dc1 8955
550a3380
RS
8956 /* Whether a vector mode is partial doesn't affect address legitimacy.
8957 Partial vectors like VNx8QImode allow the same indexed addressing
8958 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8959 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8960 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8961 vec_flags &= ~VEC_PARTIAL;
8962
80d43579
WD
8963 /* On BE, we use load/store pair for all large int mode load/stores.
8964 TI/TFmode may also use a load/store pair. */
43cacb12 8965 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
a97d8b98 8966 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
a25831ac 8967 || type == ADDR_QUERY_LDP_STP_N
80d43579
WD
8968 || mode == TImode
8969 || mode == TFmode
43cacb12 8970 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
2d8c6dc1 8971
a25831ac
AV
8972 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8973 corresponds to the actual size of the memory being loaded/stored and the
8974 mode used for the addressing-mode check is half of that. */
8975 if (type == ADDR_QUERY_LDP_STP_N
8976 && known_eq (GET_MODE_SIZE (mode), 16))
8977 mode = DFmode;
8978
6a70badb 8979 bool allow_reg_index_p = (!load_store_pair_p
43cacb12
RS
8980 && (known_lt (GET_MODE_SIZE (mode), 16)
8981 || vec_flags == VEC_ADVSIMD
fa9863e7 8982 || vec_flags & VEC_SVE_DATA));
43cacb12
RS
8983
8984 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8985 [Rn, #offset, MUL VL]. */
8986 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8987 && (code != REG && code != PLUS))
8988 return false;
2d8c6dc1
AH
8989
8990 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8991 REG addressing. */
43cacb12
RS
8992 if (advsimd_struct_p
8993 && !BYTES_BIG_ENDIAN
43e9d192
IB
8994 && (code != POST_INC && code != REG))
8995 return false;
8996
43cacb12
RS
8997 gcc_checking_assert (GET_MODE (x) == VOIDmode
8998 || SCALAR_INT_MODE_P (GET_MODE (x)));
8999
43e9d192
IB
9000 switch (code)
9001 {
9002 case REG:
9003 case SUBREG:
9004 info->type = ADDRESS_REG_IMM;
9005 info->base = x;
9006 info->offset = const0_rtx;
dc640181 9007 info->const_offset = 0;
43e9d192
IB
9008 return aarch64_base_register_rtx_p (x, strict_p);
9009
9010 case PLUS:
9011 op0 = XEXP (x, 0);
9012 op1 = XEXP (x, 1);
15c0c5c9
JW
9013
9014 if (! strict_p
4aa81c2e 9015 && REG_P (op0)
9e0218fc 9016 && virt_or_elim_regno_p (REGNO (op0))
dc640181 9017 && poly_int_rtx_p (op1, &offset))
15c0c5c9
JW
9018 {
9019 info->type = ADDRESS_REG_IMM;
9020 info->base = op0;
9021 info->offset = op1;
dc640181 9022 info->const_offset = offset;
15c0c5c9
JW
9023
9024 return true;
9025 }
9026
6a70badb 9027 if (maybe_ne (GET_MODE_SIZE (mode), 0)
dc640181
RS
9028 && aarch64_base_register_rtx_p (op0, strict_p)
9029 && poly_int_rtx_p (op1, &offset))
43e9d192 9030 {
43e9d192
IB
9031 info->type = ADDRESS_REG_IMM;
9032 info->base = op0;
9033 info->offset = op1;
dc640181 9034 info->const_offset = offset;
43e9d192
IB
9035
9036 /* TImode and TFmode values are allowed in both pairs of X
9037 registers and individual Q registers. The available
9038 address modes are:
9039 X,X: 7-bit signed scaled offset
9040 Q: 9-bit signed offset
9041 We conservatively require an offset representable in either mode.
8ed49fab
KT
9042 When performing the check for pairs of X registers i.e. LDP/STP
9043 pass down DImode since that is the natural size of the LDP/STP
9044 instruction memory accesses. */
43e9d192 9045 if (mode == TImode || mode == TFmode)
8ed49fab 9046 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3c5af608 9047 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8734dfac 9048 || offset_12bit_unsigned_scaled_p (mode, offset)));
43e9d192 9049
2d8c6dc1
AH
9050 /* A 7-bit offset check because OImode will emit an ldp/stp
9051 instruction (only big endian will get here).
9052 For ldp/stp instructions, the offset is scaled for the size of a
9053 single element of the pair. */
9054 if (mode == OImode)
9055 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9056
9057 /* Three 9/12-bit offset checks because CImode will emit three
9058 ldr/str instructions (only big endian will get here). */
9059 if (mode == CImode)
9060 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3c5af608
MM
9061 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9062 offset + 32)
2d8c6dc1
AH
9063 || offset_12bit_unsigned_scaled_p (V16QImode,
9064 offset + 32)));
9065
9066 /* Two 7-bit offset checks because XImode will emit two ldp/stp
9067 instructions (only big endian will get here). */
9068 if (mode == XImode)
9069 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9070 && aarch64_offset_7bit_signed_scaled_p (TImode,
9071 offset + 32));
9072
43cacb12
RS
9073 /* Make "m" use the LD1 offset range for SVE data modes, so
9074 that pre-RTL optimizers like ivopts will target that range
9075 instead of the wider LDR/STR range. */
9076 if (vec_flags == VEC_SVE_DATA)
9077 return (type == ADDR_QUERY_M
9078 ? offset_4bit_signed_scaled_p (mode, offset)
9079 : offset_9bit_signed_scaled_p (mode, offset));
9080
9f4cbab8
RS
9081 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9082 {
9083 poly_int64 end_offset = (offset
9084 + GET_MODE_SIZE (mode)
9085 - BYTES_PER_SVE_VECTOR);
9086 return (type == ADDR_QUERY_M
9087 ? offset_4bit_signed_scaled_p (mode, offset)
9088 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9089 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9090 end_offset)));
9091 }
9092
43cacb12
RS
9093 if (vec_flags == VEC_SVE_PRED)
9094 return offset_9bit_signed_scaled_p (mode, offset);
9095
2d8c6dc1 9096 if (load_store_pair_p)
6a70badb 9097 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
9098 || known_eq (GET_MODE_SIZE (mode), 8)
9099 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 9100 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 9101 else
3c5af608 9102 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
43e9d192
IB
9103 || offset_12bit_unsigned_scaled_p (mode, offset));
9104 }
9105
9106 if (allow_reg_index_p)
9107 {
9108 /* Look for base + (scaled/extended) index register. */
9109 if (aarch64_base_register_rtx_p (op0, strict_p)
9110 && aarch64_classify_index (info, op1, mode, strict_p))
9111 {
9112 info->base = op0;
9113 return true;
9114 }
9115 if (aarch64_base_register_rtx_p (op1, strict_p)
9116 && aarch64_classify_index (info, op0, mode, strict_p))
9117 {
9118 info->base = op1;
9119 return true;
9120 }
9121 }
9122
9123 return false;
9124
9125 case POST_INC:
9126 case POST_DEC:
9127 case PRE_INC:
9128 case PRE_DEC:
9129 info->type = ADDRESS_REG_WB;
9130 info->base = XEXP (x, 0);
9131 info->offset = NULL_RTX;
9132 return aarch64_base_register_rtx_p (info->base, strict_p);
9133
9134 case POST_MODIFY:
9135 case PRE_MODIFY:
9136 info->type = ADDRESS_REG_WB;
9137 info->base = XEXP (x, 0);
9138 if (GET_CODE (XEXP (x, 1)) == PLUS
dc640181 9139 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
43e9d192
IB
9140 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9141 && aarch64_base_register_rtx_p (info->base, strict_p))
9142 {
43e9d192 9143 info->offset = XEXP (XEXP (x, 1), 1);
dc640181 9144 info->const_offset = offset;
43e9d192
IB
9145
9146 /* TImode and TFmode values are allowed in both pairs of X
9147 registers and individual Q registers. The available
9148 address modes are:
9149 X,X: 7-bit signed scaled offset
9150 Q: 9-bit signed offset
9151 We conservatively require an offset representable in either mode.
9152 */
9153 if (mode == TImode || mode == TFmode)
44707478 9154 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3c5af608 9155 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
43e9d192 9156
2d8c6dc1 9157 if (load_store_pair_p)
6a70badb 9158 return ((known_eq (GET_MODE_SIZE (mode), 4)
9f5361c8
KT
9159 || known_eq (GET_MODE_SIZE (mode), 8)
9160 || known_eq (GET_MODE_SIZE (mode), 16))
44707478 9161 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
43e9d192 9162 else
3c5af608 9163 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
43e9d192
IB
9164 }
9165 return false;
9166
9167 case CONST:
9168 case SYMBOL_REF:
9169 case LABEL_REF:
79517551
SN
9170 /* Load literal: PC-relative constant pool entry. Only supported
9171 for SImode or larger. */
43e9d192 9172 info->type = ADDRESS_SYMBOLIC;
2d8c6dc1 9173
6a70badb
RS
9174 if (!load_store_pair_p
9175 && GET_MODE_SIZE (mode).is_constant (&const_size)
9176 && const_size >= 4)
43e9d192
IB
9177 {
9178 rtx sym, addend;
9179
9180 split_const (x, &sym, &addend);
b4f50fd4
RR
9181 return ((GET_CODE (sym) == LABEL_REF
9182 || (GET_CODE (sym) == SYMBOL_REF
9183 && CONSTANT_POOL_ADDRESS_P (sym)
9ee6540a 9184 && aarch64_pcrelative_literal_loads)));
43e9d192
IB
9185 }
9186 return false;
9187
9188 case LO_SUM:
9189 info->type = ADDRESS_LO_SUM;
9190 info->base = XEXP (x, 0);
9191 info->offset = XEXP (x, 1);
9192 if (allow_reg_index_p
9193 && aarch64_base_register_rtx_p (info->base, strict_p))
9194 {
9195 rtx sym, offs;
9196 split_const (info->offset, &sym, &offs);
9197 if (GET_CODE (sym) == SYMBOL_REF
43cacb12
RS
9198 && (aarch64_classify_symbol (sym, INTVAL (offs))
9199 == SYMBOL_SMALL_ABSOLUTE))
43e9d192
IB
9200 {
9201 /* The symbol and offset must be aligned to the access size. */
9202 unsigned int align;
43e9d192
IB
9203
9204 if (CONSTANT_POOL_ADDRESS_P (sym))
9205 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9206 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9207 {
9208 tree exp = SYMBOL_REF_DECL (sym);
9209 align = TYPE_ALIGN (TREE_TYPE (exp));
58e17cf8 9210 align = aarch64_constant_alignment (exp, align);
43e9d192
IB
9211 }
9212 else if (SYMBOL_REF_DECL (sym))
9213 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6c031d8d
KV
9214 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9215 && SYMBOL_REF_BLOCK (sym) != NULL)
9216 align = SYMBOL_REF_BLOCK (sym)->alignment;
43e9d192
IB
9217 else
9218 align = BITS_PER_UNIT;
9219
6a70badb
RS
9220 poly_int64 ref_size = GET_MODE_SIZE (mode);
9221 if (known_eq (ref_size, 0))
43e9d192
IB
9222 ref_size = GET_MODE_SIZE (DImode);
9223
6a70badb
RS
9224 return (multiple_p (INTVAL (offs), ref_size)
9225 && multiple_p (align / BITS_PER_UNIT, ref_size));
43e9d192
IB
9226 }
9227 }
9228 return false;
9229
9230 default:
9231 return false;
9232 }
9233}
9234
9bf2f779
KT
9235/* Return true if the address X is valid for a PRFM instruction.
9236 STRICT_P is true if we should do strict checking with
9237 aarch64_classify_address. */
9238
9239bool
9240aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9241{
9242 struct aarch64_address_info addr;
9243
9244 /* PRFM accepts the same addresses as DImode... */
a97d8b98 9245 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9bf2f779
KT
9246 if (!res)
9247 return false;
9248
9249 /* ... except writeback forms. */
9250 return addr.type != ADDRESS_REG_WB;
9251}
9252
43e9d192
IB
9253bool
9254aarch64_symbolic_address_p (rtx x)
9255{
9256 rtx offset;
9257
9258 split_const (x, &x, &offset);
9259 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9260}
9261
a6e0bfa7 9262/* Classify the base of symbolic expression X. */
da4f13a4
MS
9263
9264enum aarch64_symbol_type
a6e0bfa7 9265aarch64_classify_symbolic_expression (rtx x)
43e9d192
IB
9266{
9267 rtx offset;
da4f13a4 9268
43e9d192 9269 split_const (x, &x, &offset);
43cacb12 9270 return aarch64_classify_symbol (x, INTVAL (offset));
43e9d192
IB
9271}
9272
9273
9274/* Return TRUE if X is a legitimate address for accessing memory in
9275 mode MODE. */
9276static bool
ef4bddc2 9277aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
43e9d192
IB
9278{
9279 struct aarch64_address_info addr;
9280
a97d8b98 9281 return aarch64_classify_address (&addr, x, mode, strict_p);
43e9d192
IB
9282}
9283
a97d8b98
RS
9284/* Return TRUE if X is a legitimate address of type TYPE for accessing
9285 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
43e9d192 9286bool
a97d8b98
RS
9287aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9288 aarch64_addr_query_type type)
43e9d192
IB
9289{
9290 struct aarch64_address_info addr;
9291
a97d8b98 9292 return aarch64_classify_address (&addr, x, mode, strict_p, type);
43e9d192
IB
9293}
9294
9005477f
RS
9295/* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9296
491ec060 9297static bool
9005477f
RS
9298aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9299 poly_int64 orig_offset,
9300 machine_mode mode)
491ec060 9301{
6a70badb
RS
9302 HOST_WIDE_INT size;
9303 if (GET_MODE_SIZE (mode).is_constant (&size))
9304 {
9005477f
RS
9305 HOST_WIDE_INT const_offset, second_offset;
9306
9307 /* A general SVE offset is A * VQ + B. Remove the A component from
9308 coefficient 0 in order to get the constant B. */
9309 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9310
9311 /* Split an out-of-range address displacement into a base and
9312 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
9313 range otherwise to increase opportunities for sharing the base
9314 address of different sizes. Unaligned accesses use the signed
9315 9-bit range, TImode/TFmode use the intersection of signed
9316 scaled 7-bit and signed 9-bit offset. */
6a70badb 9317 if (mode == TImode || mode == TFmode)
9005477f
RS
9318 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9319 else if ((const_offset & (size - 1)) != 0)
9320 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6a70badb 9321 else
9005477f 9322 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
491ec060 9323
9005477f
RS
9324 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9325 return false;
9326
9327 /* Split the offset into second_offset and the rest. */
9328 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9329 *offset2 = gen_int_mode (second_offset, Pmode);
9330 return true;
9331 }
9332 else
9333 {
9334 /* Get the mode we should use as the basis of the range. For structure
9335 modes this is the mode of one vector. */
9336 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9337 machine_mode step_mode
9338 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9339
9340 /* Get the "mul vl" multiplier we'd like to use. */
9341 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9342 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9343 if (vec_flags & VEC_SVE_DATA)
9344 /* LDR supports a 9-bit range, but the move patterns for
9345 structure modes require all vectors to be in range of the
9346 same base. The simplest way of accommodating that while still
9347 promoting reuse of anchor points between different modes is
9348 to use an 8-bit range unconditionally. */
9349 vnum = ((vnum + 128) & 255) - 128;
9350 else
9351 /* Predicates are only handled singly, so we might as well use
9352 the full range. */
9353 vnum = ((vnum + 256) & 511) - 256;
9354 if (vnum == 0)
9355 return false;
9356
9357 /* Convert the "mul vl" multiplier into a byte offset. */
9358 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9359 if (known_eq (second_offset, orig_offset))
9360 return false;
9361
9362 /* Split the offset into second_offset and the rest. */
9363 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9364 *offset2 = gen_int_mode (second_offset, Pmode);
6a70badb
RS
9365 return true;
9366 }
491ec060
WD
9367}
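/* A worked example of the constant-offset split above, written as a
   standalone sketch; the offset and access size are illustrative.  */
static inline int
offset_split_example (void)
{
  /* A 4-byte access at offset 0x12344 is out of LDR/STR immediate range,
     so the code above splits it using the 16KB (0x3ffc) scaled range.  */
  long const_offset = 0x12344;
  long second_offset = const_offset & 0x3ffc;	/* 0x2344: kept in the access.  */
  long anchor = const_offset - second_offset;	/* 0x10000: added to the base.  */

  /* second_offset / 4 fits the scaled 12-bit immediate field, and the anchor
     can be shared between neighbouring accesses.  */
  return anchor + second_offset == const_offset;
}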
9368
a2170965
TC
9369/* Return the binary representation of floating point constant VALUE in INTVAL.
9370 If the value cannot be converted, return false without setting INTVAL.
9371 The conversion is done in the given MODE. */
9372bool
9373aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9374{
9375
9376 /* We make a general exception for 0. */
9377 if (aarch64_float_const_zero_rtx_p (value))
9378 {
9379 *intval = 0;
9380 return true;
9381 }
9382
0d0e0188 9383 scalar_float_mode mode;
a2170965 9384 if (GET_CODE (value) != CONST_DOUBLE
0d0e0188 9385 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
a2170965
TC
9386 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9387 /* Only support up to DF mode. */
9388 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9389 return false;
9390
9391 unsigned HOST_WIDE_INT ival = 0;
9392
9393 long res[2];
9394 real_to_target (res,
9395 CONST_DOUBLE_REAL_VALUE (value),
9396 REAL_MODE_FORMAT (mode));
9397
5c22bb48
TC
9398 if (mode == DFmode)
9399 {
9400 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9401 ival = zext_hwi (res[order], 32);
9402 ival |= (zext_hwi (res[1 - order], 32) << 32);
9403 }
9404 else
9405 ival = zext_hwi (res[0], 32);
a2170965
TC
9406
9407 *intval = ival;
9408 return true;
9409}
9410
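/* Illustrative values for the conversion above: a DFmode constant 1.0
   has the IEEE754 bit pattern 0x3ff0000000000000, so *INTVAL is set to
   that value and true is returned; an SFmode 1.0 yields 0x3f800000.  */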
9411/* Return TRUE if rtx X is an immediate constant that can be moved using a
9412 single MOV(+MOVK) followed by an FMOV. */
9413bool
9414aarch64_float_const_rtx_p (rtx x)
9415{
9416 machine_mode mode = GET_MODE (x);
9417 if (mode == VOIDmode)
9418 return false;
9419
9420 /* Determine whether it's cheaper to write float constants as
 9421	 mov/movk pairs rather than ldr/adrp pairs. */
9422 unsigned HOST_WIDE_INT ival;
9423
9424 if (GET_CODE (x) == CONST_DOUBLE
9425 && SCALAR_FLOAT_MODE_P (mode)
9426 && aarch64_reinterpret_float_as_int (x, &ival))
9427 {
77e994c9
RS
9428 scalar_int_mode imode = (mode == HFmode
9429 ? SImode
9430 : int_mode_for_mode (mode).require ());
a2170965
TC
9431 int num_instr = aarch64_internal_mov_immediate
9432 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9433 return num_instr < 3;
9434 }
9435
9436 return false;
9437}
9438
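/* Example of the cost test above (illustrative): DFmode 1.0
   reinterprets to 0x3ff0000000000000, whose only nonzero bits lie in
   the top 16-bit chunk, so it can typically be built with a single
   MOVZ; one MOV plus the final FMOV beats an ADRP/LDR pair and the
   function returns true.  A bit pattern needing three or more MOV/MOVK
   instructions would be rejected and loaded from the literal pool
   instead.  */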
43e9d192
IB
9439/* Return TRUE if rtx X is the immediate constant 0.0. */
9440bool
3520f7cc 9441aarch64_float_const_zero_rtx_p (rtx x)
43e9d192 9442{
43e9d192
IB
9443 if (GET_MODE (x) == VOIDmode)
9444 return false;
9445
34a72c33 9446 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
43e9d192 9447 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
34a72c33 9448 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
43e9d192
IB
9449}
9450
a2170965
TC
9451/* Return TRUE if rtx X is an immediate constant that fits in a single
9452 MOVI immediate operation. */
9453bool
9454aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9455{
9456 if (!TARGET_SIMD)
9457 return false;
9458
77e994c9
RS
9459 machine_mode vmode;
9460 scalar_int_mode imode;
a2170965
TC
9461 unsigned HOST_WIDE_INT ival;
9462
9463 if (GET_CODE (x) == CONST_DOUBLE
9464 && SCALAR_FLOAT_MODE_P (mode))
9465 {
9466 if (!aarch64_reinterpret_float_as_int (x, &ival))
9467 return false;
9468
35c38fa6
TC
9469 /* We make a general exception for 0. */
9470 if (aarch64_float_const_zero_rtx_p (x))
9471 return true;
9472
304b9962 9473 imode = int_mode_for_mode (mode).require ();
a2170965
TC
9474 }
9475 else if (GET_CODE (x) == CONST_INT
77e994c9
RS
9476 && is_a <scalar_int_mode> (mode, &imode))
9477 ival = INTVAL (x);
a2170965
TC
9478 else
9479 return false;
9480
 9481  /* Use a 64-bit mode for everything except DI/DF mode, where we use
 9482     a 128-bit vector mode.  */
77e994c9 9483 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
a2170965
TC
9484
9485 vmode = aarch64_simd_container_mode (imode, width);
9486 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9487
b187677b 9488 return aarch64_simd_valid_immediate (v_op, NULL);
a2170965
TC
9489}
9490
9491
70f09188
AP
9492/* Return the fixed registers used for condition codes. */
9493
9494static bool
9495aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9496{
9497 *p1 = CC_REGNUM;
9498 *p2 = INVALID_REGNUM;
9499 return true;
9500}
9501
47210a04
RL
9502/* This function is used by the call expanders of the machine description.
9503 RESULT is the register in which the result is returned. It's NULL for
9504 "call" and "sibcall".
9505 MEM is the location of the function call.
08cc4d92 9506 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
47210a04
RL
 9507	 SIBCALL indicates whether this function call is a normal call or a sibling call.
 9508	 It will generate a different pattern accordingly. */
9509
9510void
08cc4d92 9511aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
47210a04
RL
9512{
9513 rtx call, callee, tmp;
9514 rtvec vec;
9515 machine_mode mode;
9516
9517 gcc_assert (MEM_P (mem));
9518 callee = XEXP (mem, 0);
9519 mode = GET_MODE (callee);
9520 gcc_assert (mode == Pmode);
9521
9522 /* Decide if we should generate indirect calls by loading the
9523 address of the callee into a register before performing
9524 the branch-and-link. */
9525 if (SYMBOL_REF_P (callee)
9526 ? (aarch64_is_long_call_p (callee)
9527 || aarch64_is_noplt_call_p (callee))
9528 : !REG_P (callee))
9529 XEXP (mem, 0) = force_reg (mode, callee);
9530
9531 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9532
9533 if (result != NULL_RTX)
9534 call = gen_rtx_SET (result, call);
9535
9536 if (sibcall)
9537 tmp = ret_rtx;
9538 else
9539 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9540
08cc4d92
RS
9541 gcc_assert (CONST_INT_P (callee_abi));
9542 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9543 UNSPEC_CALLEE_ABI);
9544
9545 vec = gen_rtvec (3, call, callee_abi, tmp);
47210a04
RL
9546 call = gen_rtx_PARALLEL (VOIDmode, vec);
9547
9548 aarch64_emit_call_insn (call);
9549}
9550
78607708
TV
9551/* Emit call insn with PAT and do aarch64-specific handling. */
9552
d07a3fed 9553void
78607708
TV
9554aarch64_emit_call_insn (rtx pat)
9555{
9556 rtx insn = emit_call_insn (pat);
9557
9558 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9559 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9560 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9561}
9562
ef4bddc2 9563machine_mode
43e9d192
IB
9564aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9565{
f7343f20
RE
9566 machine_mode mode_x = GET_MODE (x);
9567 rtx_code code_x = GET_CODE (x);
9568
43e9d192
IB
9569 /* All floating point compares return CCFP if it is an equality
9570 comparison, and CCFPE otherwise. */
f7343f20 9571 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
43e9d192
IB
9572 {
9573 switch (code)
9574 {
9575 case EQ:
9576 case NE:
9577 case UNORDERED:
9578 case ORDERED:
9579 case UNLT:
9580 case UNLE:
9581 case UNGT:
9582 case UNGE:
9583 case UNEQ:
43e9d192
IB
9584 return CCFPmode;
9585
9586 case LT:
9587 case LE:
9588 case GT:
9589 case GE:
8332c5ee 9590 case LTGT:
43e9d192
IB
9591 return CCFPEmode;
9592
9593 default:
9594 gcc_unreachable ();
9595 }
9596 }
9597
2b8568fe
KT
9598 /* Equality comparisons of short modes against zero can be performed
9599 using the TST instruction with the appropriate bitmask. */
f73dc006 9600 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
2b8568fe 9601 && (code == EQ || code == NE)
f7343f20 9602 && (mode_x == HImode || mode_x == QImode))
2b8568fe
KT
9603 return CC_NZmode;
9604
b06335f9
KT
9605 /* Similarly, comparisons of zero_extends from shorter modes can
9606 be performed using an ANDS with an immediate mask. */
f7343f20
RE
9607 if (y == const0_rtx && code_x == ZERO_EXTEND
9608 && (mode_x == SImode || mode_x == DImode)
b06335f9
KT
9609 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9610 && (code == EQ || code == NE))
9611 return CC_NZmode;
9612
f7343f20 9613 if ((mode_x == SImode || mode_x == DImode)
43e9d192
IB
9614 && y == const0_rtx
9615 && (code == EQ || code == NE || code == LT || code == GE)
f7343f20
RE
9616 && (code_x == PLUS || code_x == MINUS || code_x == AND
9617 || code_x == NEG
9618 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7325d85a 9619 && CONST_INT_P (XEXP (x, 2)))))
43e9d192
IB
9620 return CC_NZmode;
9621
1c992d1e 9622 /* A compare with a shifted operand. Because of canonicalization,
43e9d192
IB
9623 the comparison will have to be swapped when we emit the assembly
9624 code. */
f7343f20 9625 if ((mode_x == SImode || mode_x == DImode)
ffa8a921 9626 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
f7343f20
RE
9627 && (code_x == ASHIFT || code_x == ASHIFTRT
9628 || code_x == LSHIFTRT
9629 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
43e9d192
IB
9630 return CC_SWPmode;
9631
1c992d1e
RE
9632 /* Similarly for a negated operand, but we can only do this for
9633 equalities. */
f7343f20 9634 if ((mode_x == SImode || mode_x == DImode)
4aa81c2e 9635 && (REG_P (y) || GET_CODE (y) == SUBREG)
1c992d1e 9636 && (code == EQ || code == NE)
f7343f20 9637 && code_x == NEG)
1c992d1e
RE
9638 return CC_Zmode;
9639
f7343f20
RE
9640 /* A test for unsigned overflow from an addition. */
9641 if ((mode_x == DImode || mode_x == TImode)
9642 && (code == LTU || code == GEU)
9643 && code_x == PLUS
9644 && rtx_equal_p (XEXP (x, 0), y))
ef22810a
RH
9645 return CC_Cmode;
9646
f7343f20
RE
9647 /* A test for unsigned overflow from an add with carry. */
9648 if ((mode_x == DImode || mode_x == TImode)
9649 && (code == LTU || code == GEU)
9650 && code_x == PLUS
9651 && CONST_SCALAR_INT_P (y)
9652 && (rtx_mode_t (y, mode_x)
9653 == (wi::shwi (1, mode_x)
9654 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9655 return CC_ADCmode;
9656
30c46053 9657 /* A test for signed overflow. */
f7343f20 9658 if ((mode_x == DImode || mode_x == TImode)
30c46053 9659 && code == NE
f7343f20 9660 && code_x == PLUS
30c46053
MC
9661 && GET_CODE (y) == SIGN_EXTEND)
9662 return CC_Vmode;
9663
43e9d192
IB
9664 /* For everything else, return CCmode. */
9665 return CCmode;
9666}
9667
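/* Illustrative selections made above: floating-point EQ/NE return
   CCFPmode while LT/GT return CCFPEmode; an EQ/NE test of
   (and:SI x 7) against zero returns CC_NZmode so the flags can be set
   with TST/ANDS; and comparing (ashift:SI x 2) against a register
   returns CC_SWPmode, recording that the operands must be swapped when
   the compare instruction is output.  */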
3dfa7055 9668static int
b8506a8a 9669aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
3dfa7055 9670
cd5660ab 9671int
43e9d192
IB
9672aarch64_get_condition_code (rtx x)
9673{
ef4bddc2 9674 machine_mode mode = GET_MODE (XEXP (x, 0));
43e9d192
IB
9675 enum rtx_code comp_code = GET_CODE (x);
9676
9677 if (GET_MODE_CLASS (mode) != MODE_CC)
9678 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3dfa7055
ZC
9679 return aarch64_get_condition_code_1 (mode, comp_code);
9680}
43e9d192 9681
3dfa7055 9682static int
b8506a8a 9683aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
3dfa7055 9684{
43e9d192
IB
9685 switch (mode)
9686 {
4e10a5a7
RS
9687 case E_CCFPmode:
9688 case E_CCFPEmode:
43e9d192
IB
9689 switch (comp_code)
9690 {
9691 case GE: return AARCH64_GE;
9692 case GT: return AARCH64_GT;
9693 case LE: return AARCH64_LS;
9694 case LT: return AARCH64_MI;
9695 case NE: return AARCH64_NE;
9696 case EQ: return AARCH64_EQ;
9697 case ORDERED: return AARCH64_VC;
9698 case UNORDERED: return AARCH64_VS;
9699 case UNLT: return AARCH64_LT;
9700 case UNLE: return AARCH64_LE;
9701 case UNGT: return AARCH64_HI;
9702 case UNGE: return AARCH64_PL;
cd5660ab 9703 default: return -1;
43e9d192
IB
9704 }
9705 break;
9706
4e10a5a7 9707 case E_CCmode:
43e9d192
IB
9708 switch (comp_code)
9709 {
9710 case NE: return AARCH64_NE;
9711 case EQ: return AARCH64_EQ;
9712 case GE: return AARCH64_GE;
9713 case GT: return AARCH64_GT;
9714 case LE: return AARCH64_LE;
9715 case LT: return AARCH64_LT;
9716 case GEU: return AARCH64_CS;
9717 case GTU: return AARCH64_HI;
9718 case LEU: return AARCH64_LS;
9719 case LTU: return AARCH64_CC;
cd5660ab 9720 default: return -1;
43e9d192
IB
9721 }
9722 break;
9723
4e10a5a7 9724 case E_CC_SWPmode:
43e9d192
IB
9725 switch (comp_code)
9726 {
9727 case NE: return AARCH64_NE;
9728 case EQ: return AARCH64_EQ;
9729 case GE: return AARCH64_LE;
9730 case GT: return AARCH64_LT;
9731 case LE: return AARCH64_GE;
9732 case LT: return AARCH64_GT;
9733 case GEU: return AARCH64_LS;
9734 case GTU: return AARCH64_CC;
9735 case LEU: return AARCH64_CS;
9736 case LTU: return AARCH64_HI;
cd5660ab 9737 default: return -1;
43e9d192
IB
9738 }
9739 break;
9740
57d6f4d0
RS
9741 case E_CC_NZCmode:
9742 switch (comp_code)
9743 {
9744 case NE: return AARCH64_NE; /* = any */
9745 case EQ: return AARCH64_EQ; /* = none */
9746 case GE: return AARCH64_PL; /* = nfrst */
9747 case LT: return AARCH64_MI; /* = first */
9748 case GEU: return AARCH64_CS; /* = nlast */
9749 case GTU: return AARCH64_HI; /* = pmore */
9750 case LEU: return AARCH64_LS; /* = plast */
9751 case LTU: return AARCH64_CC; /* = last */
9752 default: return -1;
9753 }
9754 break;
9755
4e10a5a7 9756 case E_CC_NZmode:
43e9d192
IB
9757 switch (comp_code)
9758 {
9759 case NE: return AARCH64_NE;
9760 case EQ: return AARCH64_EQ;
9761 case GE: return AARCH64_PL;
9762 case LT: return AARCH64_MI;
cd5660ab 9763 default: return -1;
43e9d192
IB
9764 }
9765 break;
9766
4e10a5a7 9767 case E_CC_Zmode:
1c992d1e
RE
9768 switch (comp_code)
9769 {
9770 case NE: return AARCH64_NE;
9771 case EQ: return AARCH64_EQ;
cd5660ab 9772 default: return -1;
1c992d1e
RE
9773 }
9774 break;
9775
4e10a5a7 9776 case E_CC_Cmode:
ef22810a
RH
9777 switch (comp_code)
9778 {
f7343f20
RE
9779 case LTU: return AARCH64_CS;
9780 case GEU: return AARCH64_CC;
9781 default: return -1;
9782 }
9783 break;
9784
9785 case E_CC_ADCmode:
9786 switch (comp_code)
9787 {
9788 case GEU: return AARCH64_CS;
9789 case LTU: return AARCH64_CC;
ef22810a
RH
9790 default: return -1;
9791 }
9792 break;
9793
30c46053
MC
9794 case E_CC_Vmode:
9795 switch (comp_code)
9796 {
9797 case NE: return AARCH64_VS;
9798 case EQ: return AARCH64_VC;
9799 default: return -1;
9800 }
9801 break;
9802
43e9d192 9803 default:
cd5660ab 9804 return -1;
43e9d192 9805 }
3dfa7055 9806
3dfa7055 9807 return -1;
43e9d192
IB
9808}
9809
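/* Example of the swapped mapping above: under CC_SWPmode a GT
   comparison is emitted as the AArch64 condition LT, because the
   compare instruction was generated with its operands reversed.  */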
ddeabd3e
AL
9810bool
9811aarch64_const_vec_all_same_in_range_p (rtx x,
6a70badb
RS
9812 HOST_WIDE_INT minval,
9813 HOST_WIDE_INT maxval)
ddeabd3e 9814{
6a70badb
RS
9815 rtx elt;
9816 return (const_vec_duplicate_p (x, &elt)
9817 && CONST_INT_P (elt)
9818 && IN_RANGE (INTVAL (elt), minval, maxval));
ddeabd3e
AL
9819}
9820
9821bool
9822aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9823{
9824 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9825}
9826
43cacb12
RS
9827/* Return true if VEC is a constant in which every element is in the range
9828 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9829
9830static bool
9831aarch64_const_vec_all_in_range_p (rtx vec,
9832 HOST_WIDE_INT minval,
9833 HOST_WIDE_INT maxval)
9834{
9835 if (GET_CODE (vec) != CONST_VECTOR
9836 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9837 return false;
9838
9839 int nunits;
9840 if (!CONST_VECTOR_STEPPED_P (vec))
9841 nunits = const_vector_encoded_nelts (vec);
9842 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9843 return false;
9844
9845 for (int i = 0; i < nunits; i++)
9846 {
9847 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9848 if (!CONST_INT_P (vec_elem)
9849 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9850 return false;
9851 }
9852 return true;
9853}
43e9d192 9854
cf670503
ZC
9855/* N Z C V. */
9856#define AARCH64_CC_V 1
9857#define AARCH64_CC_C (1 << 1)
9858#define AARCH64_CC_Z (1 << 2)
9859#define AARCH64_CC_N (1 << 3)
9860
c8012fbc
WD
9861/* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9862static const int aarch64_nzcv_codes[] =
9863{
9864 0, /* EQ, Z == 1. */
9865 AARCH64_CC_Z, /* NE, Z == 0. */
9866 0, /* CS, C == 1. */
9867 AARCH64_CC_C, /* CC, C == 0. */
9868 0, /* MI, N == 1. */
9869 AARCH64_CC_N, /* PL, N == 0. */
9870 0, /* VS, V == 1. */
9871 AARCH64_CC_V, /* VC, V == 0. */
 9872 0, /* HI, C == 1 && Z == 0. */
9873 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9874 AARCH64_CC_V, /* GE, N == V. */
9875 0, /* LT, N != V. */
9876 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9877 0, /* LE, !(Z == 0 && N == V). */
9878 0, /* AL, Any. */
9879 0 /* NV, Any. */
cf670503
ZC
9880};
9881
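/* Illustrative reading of the table above: aarch64_nzcv_codes[AARCH64_GT]
   is AARCH64_CC_Z, i.e. NZCV == 0b0100.  A CCMP whose own condition is
   false writes this immediate into the flags, and with Z set (and
   N == V == 0) the subsequent GT test (Z == 0 && N == V) fails, which is
   what a short-circuited conjunction of comparisons needs.  */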
43cacb12
RS
9882/* Print floating-point vector immediate operand X to F, negating it
9883 first if NEGATE is true. Return true on success, false if it isn't
9884 a constant we can handle. */
9885
9886static bool
9887aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9888{
9889 rtx elt;
9890
9891 if (!const_vec_duplicate_p (x, &elt))
9892 return false;
9893
9894 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9895 if (negate)
9896 r = real_value_negate (&r);
9897
d29f7dd5
RS
9898 /* Handle the SVE single-bit immediates specially, since they have a
9899 fixed form in the assembly syntax. */
43cacb12
RS
9900 if (real_equal (&r, &dconst0))
9901 asm_fprintf (f, "0.0");
a19ba9e1
RS
9902 else if (real_equal (&r, &dconst2))
9903 asm_fprintf (f, "2.0");
43cacb12
RS
9904 else if (real_equal (&r, &dconst1))
9905 asm_fprintf (f, "1.0");
9906 else if (real_equal (&r, &dconsthalf))
9907 asm_fprintf (f, "0.5");
9908 else
d29f7dd5
RS
9909 {
9910 const int buf_size = 20;
9911 char float_buf[buf_size] = {'\0'};
9912 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9913 1, GET_MODE (elt));
9914 asm_fprintf (f, "%s", float_buf);
9915 }
43cacb12
RS
9916
9917 return true;
9918}
9919
9f4cbab8
RS
9920/* Return the equivalent letter for size. */
9921static char
9922sizetochar (int size)
9923{
9924 switch (size)
9925 {
9926 case 64: return 'd';
9927 case 32: return 's';
9928 case 16: return 'h';
9929 case 8 : return 'b';
9930 default: gcc_unreachable ();
9931 }
9932}
9933
bcf19844
JW
9934/* Print operand X to file F in a target specific manner according to CODE.
9935 The acceptable formatting commands given by CODE are:
9936 'c': An integer or symbol address without a preceding #
9937 sign.
43cacb12
RS
9938 'C': Take the duplicated element in a vector constant
9939 and print it in hex.
9940 'D': Take the duplicated element in a vector constant
9941 and print it as an unsigned integer, in decimal.
bcf19844 9942 'e': Print the sign/zero-extend size as a character 8->b,
d113ece6
RS
9943 16->h, 32->w. Can also be used for masks:
9944 0xff->b, 0xffff->h, 0xffffffff->w.
d29f7dd5
RS
9945 'I': If the operand is a duplicated vector constant,
9946 replace it with the duplicated scalar. If the
9947 operand is then a floating-point constant, replace
9948 it with the integer bit representation. Print the
9949 transformed constant as a signed decimal number.
bcf19844
JW
 9950 'p': Prints N such that 2^N == X (X must be a power of 2 and
 9951 a const int).
9952 'P': Print the number of non-zero bits in X (a const_int).
9953 'H': Print the higher numbered register of a pair (TImode)
9954 of regs.
9955 'm': Print a condition (eq, ne, etc).
9956 'M': Same as 'm', but invert condition.
43cacb12
RS
9957 'N': Take the duplicated element in a vector constant
9958 and print the negative of it in decimal.
bcf19844
JW
9959 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9960 'S/T/U/V': Print a FP/SIMD register name for a register list.
9961 The register printed is the FP/SIMD register name
9962 of X + 0/1/2/3 for S/T/U/V.
e3f15286 9963 'R': Print a scalar Integer/FP/SIMD register name + 1.
bcf19844
JW
9964 'X': Print bottom 16 bits of integer constant in hex.
9965 'w/x': Print a general register name or the zero register
9966 (32-bit or 64-bit).
9967 '0': Print a normal operand, if it's a general register,
9968 then we assume DImode.
9969 'k': Print NZCV for conditional compare instructions.
9970 'A': Output address constant representing the first
9971 argument of X, specifying a relocation offset
9972 if appropriate.
9973 'L': Output constant address specified by X
9974 with a relocation offset if appropriate.
9975 'G': Prints address of X, specifying a PC relative
e69a816d
WD
9976 relocation mode if appropriate.
9977 'y': Output address of LDP or STP - this is used for
9978 some LDP/STPs which don't use a PARALLEL in their
9979 pattern (so the mode needs to be adjusted).
9980 'z': Output address of a typical LDP or STP. */
bcf19844 9981
cc8ca59e
JB
9982static void
9983aarch64_print_operand (FILE *f, rtx x, int code)
43e9d192 9984{
43cacb12 9985 rtx elt;
43e9d192
IB
9986 switch (code)
9987 {
f541a481
KT
9988 case 'c':
9989 switch (GET_CODE (x))
9990 {
9991 case CONST_INT:
9992 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9993 break;
9994
9995 case SYMBOL_REF:
9996 output_addr_const (f, x);
9997 break;
9998
9999 case CONST:
10000 if (GET_CODE (XEXP (x, 0)) == PLUS
10001 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
10002 {
10003 output_addr_const (f, x);
10004 break;
10005 }
10006 /* Fall through. */
10007
10008 default:
ee61f880 10009 output_operand_lossage ("unsupported operand for code '%c'", code);
f541a481
KT
10010 }
10011 break;
10012
43e9d192 10013 case 'e':
43e9d192 10014 {
d113ece6
RS
10015 x = unwrap_const_vec_duplicate (x);
10016 if (!CONST_INT_P (x))
43e9d192
IB
10017 {
10018 output_operand_lossage ("invalid operand for '%%%c'", code);
10019 return;
10020 }
10021
d113ece6
RS
10022 HOST_WIDE_INT val = INTVAL (x);
10023 if ((val & ~7) == 8 || val == 0xff)
10024 fputc ('b', f);
10025 else if ((val & ~7) == 16 || val == 0xffff)
10026 fputc ('h', f);
10027 else if ((val & ~7) == 32 || val == 0xffffffff)
10028 fputc ('w', f);
10029 else
43e9d192 10030 {
43e9d192
IB
10031 output_operand_lossage ("invalid operand for '%%%c'", code);
10032 return;
10033 }
10034 }
10035 break;
10036
10037 case 'p':
10038 {
10039 int n;
10040
4aa81c2e 10041 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
43e9d192
IB
10042 {
10043 output_operand_lossage ("invalid operand for '%%%c'", code);
10044 return;
10045 }
10046
10047 asm_fprintf (f, "%d", n);
10048 }
10049 break;
10050
10051 case 'P':
4aa81c2e 10052 if (!CONST_INT_P (x))
43e9d192
IB
10053 {
10054 output_operand_lossage ("invalid operand for '%%%c'", code);
10055 return;
10056 }
10057
8d55c61b 10058 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
43e9d192
IB
10059 break;
10060
10061 case 'H':
c0111dc4
RE
10062 if (x == const0_rtx)
10063 {
10064 asm_fprintf (f, "xzr");
10065 break;
10066 }
10067
4aa81c2e 10068 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
43e9d192
IB
10069 {
10070 output_operand_lossage ("invalid operand for '%%%c'", code);
10071 return;
10072 }
10073
01a3a324 10074 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
43e9d192
IB
10075 break;
10076
d29f7dd5
RS
10077 case 'I':
10078 {
10079 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10080 if (CONST_INT_P (x))
10081 asm_fprintf (f, "%wd", INTVAL (x));
10082 else
10083 {
10084 output_operand_lossage ("invalid operand for '%%%c'", code);
10085 return;
10086 }
10087 break;
10088 }
10089
43e9d192 10090 case 'M':
c8012fbc 10091 case 'm':
cd5660ab
KT
10092 {
10093 int cond_code;
c8012fbc
WD
10094 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10095 if (x == const_true_rtx)
cd5660ab 10096 {
c8012fbc
WD
10097 if (code == 'M')
10098 fputs ("nv", f);
cd5660ab
KT
10099 return;
10100 }
43e9d192 10101
cd5660ab
KT
10102 if (!COMPARISON_P (x))
10103 {
10104 output_operand_lossage ("invalid operand for '%%%c'", code);
10105 return;
10106 }
c8012fbc 10107
cd5660ab
KT
10108 cond_code = aarch64_get_condition_code (x);
10109 gcc_assert (cond_code >= 0);
c8012fbc
WD
10110 if (code == 'M')
10111 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
57d6f4d0
RS
10112 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10113 fputs (aarch64_sve_condition_codes[cond_code], f);
10114 else
10115 fputs (aarch64_condition_codes[cond_code], f);
cd5660ab 10116 }
43e9d192
IB
10117 break;
10118
43cacb12
RS
10119 case 'N':
10120 if (!const_vec_duplicate_p (x, &elt))
10121 {
10122 output_operand_lossage ("invalid vector constant");
10123 return;
10124 }
10125
10126 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10127 asm_fprintf (f, "%wd", -INTVAL (elt));
10128 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10129 && aarch64_print_vector_float_operand (f, x, true))
10130 ;
10131 else
10132 {
10133 output_operand_lossage ("invalid vector constant");
10134 return;
10135 }
10136 break;
10137
43e9d192
IB
10138 case 'b':
10139 case 'h':
10140 case 's':
10141 case 'd':
10142 case 'q':
43e9d192
IB
10143 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10144 {
10145 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10146 return;
10147 }
50ce6f88 10148 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
43e9d192
IB
10149 break;
10150
10151 case 'S':
10152 case 'T':
10153 case 'U':
10154 case 'V':
43e9d192
IB
10155 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10156 {
10157 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10158 return;
10159 }
43cacb12
RS
10160 asm_fprintf (f, "%c%d",
10161 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10162 REGNO (x) - V0_REGNUM + (code - 'S'));
43e9d192
IB
10163 break;
10164
2d8c6dc1 10165 case 'R':
e3f15286
RH
10166 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10167 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10168 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10169 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10170 else
10171 output_operand_lossage ("incompatible register operand for '%%%c'",
10172 code);
2d8c6dc1
AH
10173 break;
10174
a05c0ddf 10175 case 'X':
4aa81c2e 10176 if (!CONST_INT_P (x))
a05c0ddf
IB
10177 {
10178 output_operand_lossage ("invalid operand for '%%%c'", code);
10179 return;
10180 }
50d38551 10181 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
a05c0ddf
IB
10182 break;
10183
43cacb12
RS
10184 case 'C':
10185 {
10186 /* Print a replicated constant in hex. */
10187 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10188 {
10189 output_operand_lossage ("invalid operand for '%%%c'", code);
10190 return;
10191 }
10192 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10193 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10194 }
10195 break;
10196
10197 case 'D':
10198 {
10199 /* Print a replicated constant in decimal, treating it as
10200 unsigned. */
10201 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10202 {
10203 output_operand_lossage ("invalid operand for '%%%c'", code);
10204 return;
10205 }
10206 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10207 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10208 }
10209 break;
10210
43e9d192
IB
10211 case 'w':
10212 case 'x':
3520f7cc
JG
10213 if (x == const0_rtx
10214 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
43e9d192 10215 {
50ce6f88 10216 asm_fprintf (f, "%czr", code);
43e9d192
IB
10217 break;
10218 }
10219
10220 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10221 {
50ce6f88 10222 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
43e9d192
IB
10223 break;
10224 }
10225
10226 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10227 {
50ce6f88 10228 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
43e9d192
IB
10229 break;
10230 }
10231
10232 /* Fall through */
10233
10234 case 0:
43e9d192
IB
10235 if (x == NULL)
10236 {
10237 output_operand_lossage ("missing operand");
10238 return;
10239 }
10240
10241 switch (GET_CODE (x))
10242 {
10243 case REG:
43cacb12 10244 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9f4cbab8
RS
10245 {
10246 if (REG_NREGS (x) == 1)
10247 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10248 else
10249 {
10250 char suffix
10251 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10252 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10253 REGNO (x) - V0_REGNUM, suffix,
10254 END_REGNO (x) - V0_REGNUM - 1, suffix);
10255 }
10256 }
43cacb12
RS
10257 else
10258 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
43e9d192
IB
10259 break;
10260
10261 case MEM:
cc8ca59e 10262 output_address (GET_MODE (x), XEXP (x, 0));
43e9d192
IB
10263 break;
10264
10265 case LABEL_REF:
10266 case SYMBOL_REF:
10267 output_addr_const (asm_out_file, x);
10268 break;
10269
10270 case CONST_INT:
10271 asm_fprintf (f, "%wd", INTVAL (x));
10272 break;
10273
43cacb12
RS
10274 case CONST:
10275 if (!VECTOR_MODE_P (GET_MODE (x)))
3520f7cc 10276 {
43cacb12
RS
10277 output_addr_const (asm_out_file, x);
10278 break;
3520f7cc 10279 }
43cacb12
RS
10280 /* fall through */
10281
10282 case CONST_VECTOR:
10283 if (!const_vec_duplicate_p (x, &elt))
3520f7cc 10284 {
43cacb12
RS
10285 output_operand_lossage ("invalid vector constant");
10286 return;
3520f7cc 10287 }
43cacb12
RS
10288
10289 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10290 asm_fprintf (f, "%wd", INTVAL (elt));
10291 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10292 && aarch64_print_vector_float_operand (f, x, false))
10293 ;
3520f7cc 10294 else
43cacb12
RS
10295 {
10296 output_operand_lossage ("invalid vector constant");
10297 return;
10298 }
43e9d192
IB
10299 break;
10300
3520f7cc 10301 case CONST_DOUBLE:
2ca5b430
KT
10302 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10303 be getting CONST_DOUBLEs holding integers. */
10304 gcc_assert (GET_MODE (x) != VOIDmode);
10305 if (aarch64_float_const_zero_rtx_p (x))
3520f7cc
JG
10306 {
10307 fputc ('0', f);
10308 break;
10309 }
10310 else if (aarch64_float_const_representable_p (x))
10311 {
10312#define buf_size 20
10313 char float_buf[buf_size] = {'\0'};
34a72c33
RS
10314 real_to_decimal_for_mode (float_buf,
10315 CONST_DOUBLE_REAL_VALUE (x),
3520f7cc
JG
10316 buf_size, buf_size,
10317 1, GET_MODE (x));
10318 asm_fprintf (asm_out_file, "%s", float_buf);
10319 break;
10320#undef buf_size
10321 }
10322 output_operand_lossage ("invalid constant");
10323 return;
43e9d192
IB
10324 default:
10325 output_operand_lossage ("invalid operand");
10326 return;
10327 }
10328 break;
10329
10330 case 'A':
10331 if (GET_CODE (x) == HIGH)
10332 x = XEXP (x, 0);
10333
a6e0bfa7 10334 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10335 {
6642bdb4 10336 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
10337 asm_fprintf (asm_out_file, ":got:");
10338 break;
10339
10340 case SYMBOL_SMALL_TLSGD:
10341 asm_fprintf (asm_out_file, ":tlsgd:");
10342 break;
10343
10344 case SYMBOL_SMALL_TLSDESC:
10345 asm_fprintf (asm_out_file, ":tlsdesc:");
10346 break;
10347
79496620 10348 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
10349 asm_fprintf (asm_out_file, ":gottprel:");
10350 break;
10351
d18ba284 10352 case SYMBOL_TLSLE24:
43e9d192
IB
10353 asm_fprintf (asm_out_file, ":tprel:");
10354 break;
10355
87dd8ab0
MS
10356 case SYMBOL_TINY_GOT:
10357 gcc_unreachable ();
10358 break;
10359
43e9d192
IB
10360 default:
10361 break;
10362 }
10363 output_addr_const (asm_out_file, x);
10364 break;
10365
10366 case 'L':
a6e0bfa7 10367 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10368 {
6642bdb4 10369 case SYMBOL_SMALL_GOT_4G:
43e9d192
IB
10370 asm_fprintf (asm_out_file, ":lo12:");
10371 break;
10372
10373 case SYMBOL_SMALL_TLSGD:
10374 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10375 break;
10376
10377 case SYMBOL_SMALL_TLSDESC:
10378 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10379 break;
10380
79496620 10381 case SYMBOL_SMALL_TLSIE:
43e9d192
IB
10382 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10383 break;
10384
cbf5629e
JW
10385 case SYMBOL_TLSLE12:
10386 asm_fprintf (asm_out_file, ":tprel_lo12:");
10387 break;
10388
d18ba284 10389 case SYMBOL_TLSLE24:
43e9d192
IB
10390 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10391 break;
10392
87dd8ab0
MS
10393 case SYMBOL_TINY_GOT:
10394 asm_fprintf (asm_out_file, ":got:");
10395 break;
10396
5ae7caad
JW
10397 case SYMBOL_TINY_TLSIE:
10398 asm_fprintf (asm_out_file, ":gottprel:");
10399 break;
10400
43e9d192
IB
10401 default:
10402 break;
10403 }
10404 output_addr_const (asm_out_file, x);
10405 break;
10406
10407 case 'G':
a6e0bfa7 10408 switch (aarch64_classify_symbolic_expression (x))
43e9d192 10409 {
d18ba284 10410 case SYMBOL_TLSLE24:
43e9d192
IB
10411 asm_fprintf (asm_out_file, ":tprel_hi12:");
10412 break;
10413 default:
10414 break;
10415 }
10416 output_addr_const (asm_out_file, x);
10417 break;
10418
cf670503
ZC
10419 case 'k':
10420 {
c8012fbc 10421 HOST_WIDE_INT cond_code;
cf670503 10422
c8012fbc 10423 if (!CONST_INT_P (x))
cf670503
ZC
10424 {
10425 output_operand_lossage ("invalid operand for '%%%c'", code);
10426 return;
10427 }
10428
c8012fbc
WD
10429 cond_code = INTVAL (x);
10430 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10431 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
cf670503
ZC
10432 }
10433 break;
10434
e69a816d
WD
10435 case 'y':
10436 case 'z':
10437 {
10438 machine_mode mode = GET_MODE (x);
10439
c348cab0 10440 if (GET_CODE (x) != MEM
6a70badb 10441 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
e69a816d
WD
10442 {
10443 output_operand_lossage ("invalid operand for '%%%c'", code);
10444 return;
10445 }
10446
a25831ac
AV
10447 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10448 code == 'y'
10449 ? ADDR_QUERY_LDP_STP_N
10450 : ADDR_QUERY_LDP_STP))
c348cab0 10451 output_operand_lossage ("invalid operand prefix '%%%c'", code);
e69a816d
WD
10452 }
10453 break;
10454
43e9d192
IB
10455 default:
10456 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10457 return;
10458 }
10459}
10460
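/* A few illustrative expansions of the codes documented above, assuming
   operand 0 is (reg:SI x3): "%w0" prints "w3" and "%x0" prints "x3";
   "%e" applied to the constant 0xffff prints "h"; "%p" applied to the
   constant 8 prints "3"; and "%X" applied to 0x12345 prints "0x2345".  */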
e69a816d
WD
10461/* Print address 'x' of a memory access with mode 'mode'.
10462 'type' gives the context required by aarch64_classify_address; for
10463 example, ADDR_QUERY_LDP_STP is used for an LDP/STP operand. */
c348cab0 10464static bool
a97d8b98
RS
10465aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10466 aarch64_addr_query_type type)
43e9d192
IB
10467{
10468 struct aarch64_address_info addr;
550a3380 10469 unsigned int size, vec_flags;
43e9d192 10470
e69a816d 10471 /* Check all addresses are Pmode - including ILP32. */
31460ed2
JJ
10472 if (GET_MODE (x) != Pmode
10473 && (!CONST_INT_P (x)
10474 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10475 {
10476 output_operand_lossage ("invalid address mode");
10477 return false;
10478 }
e69a816d 10479
a97d8b98 10480 if (aarch64_classify_address (&addr, x, mode, true, type))
43e9d192
IB
10481 switch (addr.type)
10482 {
10483 case ADDRESS_REG_IMM:
dc640181 10484 if (known_eq (addr.const_offset, 0))
43cacb12 10485 {
550a3380
RS
10486 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10487 return true;
43cacb12 10488 }
550a3380
RS
10489
10490 vec_flags = aarch64_classify_vector_mode (mode);
10491 if (vec_flags & VEC_ANY_SVE)
43cacb12
RS
10492 {
10493 HOST_WIDE_INT vnum
10494 = exact_div (addr.const_offset,
550a3380 10495 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
43cacb12
RS
10496 asm_fprintf (f, "[%s, #%wd, mul vl]",
10497 reg_names[REGNO (addr.base)], vnum);
550a3380 10498 return true;
43cacb12 10499 }
550a3380
RS
10500
10501 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10502 INTVAL (addr.offset));
c348cab0 10503 return true;
43e9d192
IB
10504
10505 case ADDRESS_REG_REG:
10506 if (addr.shift == 0)
16a3246f 10507 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
01a3a324 10508 reg_names [REGNO (addr.offset)]);
43e9d192 10509 else
16a3246f 10510 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
01a3a324 10511 reg_names [REGNO (addr.offset)], addr.shift);
c348cab0 10512 return true;
43e9d192
IB
10513
10514 case ADDRESS_REG_UXTW:
10515 if (addr.shift == 0)
16a3246f 10516 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
10517 REGNO (addr.offset) - R0_REGNUM);
10518 else
16a3246f 10519 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 10520 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 10521 return true;
43e9d192
IB
10522
10523 case ADDRESS_REG_SXTW:
10524 if (addr.shift == 0)
16a3246f 10525 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
43e9d192
IB
10526 REGNO (addr.offset) - R0_REGNUM);
10527 else
16a3246f 10528 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
43e9d192 10529 REGNO (addr.offset) - R0_REGNUM, addr.shift);
c348cab0 10530 return true;
43e9d192
IB
10531
10532 case ADDRESS_REG_WB:
6a70badb
RS
10533 /* Writeback is only supported for fixed-width modes. */
10534 size = GET_MODE_SIZE (mode).to_constant ();
43e9d192
IB
10535 switch (GET_CODE (x))
10536 {
10537 case PRE_INC:
6a70badb 10538 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
c348cab0 10539 return true;
43e9d192 10540 case POST_INC:
6a70badb 10541 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
c348cab0 10542 return true;
43e9d192 10543 case PRE_DEC:
6a70badb 10544 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
c348cab0 10545 return true;
43e9d192 10546 case POST_DEC:
6a70badb 10547 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
c348cab0 10548 return true;
43e9d192 10549 case PRE_MODIFY:
6a70badb 10550 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
43e9d192 10551 INTVAL (addr.offset));
c348cab0 10552 return true;
43e9d192 10553 case POST_MODIFY:
6a70badb 10554 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
43e9d192 10555 INTVAL (addr.offset));
c348cab0 10556 return true;
43e9d192
IB
10557 default:
10558 break;
10559 }
10560 break;
10561
10562 case ADDRESS_LO_SUM:
16a3246f 10563 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
43e9d192
IB
10564 output_addr_const (f, addr.offset);
10565 asm_fprintf (f, "]");
c348cab0 10566 return true;
43e9d192
IB
10567
10568 case ADDRESS_SYMBOLIC:
d6591257 10569 output_addr_const (f, x);
c348cab0 10570 return true;
43e9d192
IB
10571 }
10572
c348cab0 10573 return false;
43e9d192
IB
10574}
10575
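/* Illustrative outputs of the address printer above: a bare base
   register prints as "[x0]", base plus immediate as "[x0, 16]", a
   pre-increment writeback access of an 8-byte mode as "[x0, 8]!", and
   an SVE base plus one vector-length as "[x0, #1, mul vl]".  */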
e69a816d
WD
10576/* Print address 'x' of a memory access with mode 'mode'. */
10577static void
10578aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10579{
43cacb12 10580 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
c348cab0 10581 output_addr_const (f, x);
e69a816d
WD
10582}
10583
43e9d192
IB
10584bool
10585aarch64_label_mentioned_p (rtx x)
10586{
10587 const char *fmt;
10588 int i;
10589
10590 if (GET_CODE (x) == LABEL_REF)
10591 return true;
10592
10593 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10594 referencing instruction, but they are constant offsets, not
10595 symbols. */
10596 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10597 return false;
10598
10599 fmt = GET_RTX_FORMAT (GET_CODE (x));
10600 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10601 {
10602 if (fmt[i] == 'E')
10603 {
10604 int j;
10605
10606 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10607 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10608 return 1;
10609 }
10610 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10611 return 1;
10612 }
10613
10614 return 0;
10615}
10616
10617/* Implement REGNO_REG_CLASS. */
10618
10619enum reg_class
10620aarch64_regno_regclass (unsigned regno)
10621{
96b7f495
MM
10622 if (STUB_REGNUM_P (regno))
10623 return STUB_REGS;
10624
43e9d192 10625 if (GP_REGNUM_P (regno))
a4a182c6 10626 return GENERAL_REGS;
43e9d192
IB
10627
10628 if (regno == SP_REGNUM)
10629 return STACK_REG;
10630
10631 if (regno == FRAME_POINTER_REGNUM
10632 || regno == ARG_POINTER_REGNUM)
f24bb080 10633 return POINTER_REGS;
43e9d192
IB
10634
10635 if (FP_REGNUM_P (regno))
163b1f6a
RS
10636 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10637 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
43e9d192 10638
43cacb12
RS
10639 if (PR_REGNUM_P (regno))
10640 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10641
183bfdaf
RS
10642 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10643 return FFR_REGS;
10644
43e9d192
IB
10645 return NO_REGS;
10646}
10647
6a70badb
RS
10648/* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10649 If OFFSET is out of range, return an offset of an anchor point
10650 that is in range. Return 0 otherwise. */
10651
10652static HOST_WIDE_INT
10653aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10654 machine_mode mode)
10655{
10656 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10657 if (size > 16)
10658 return (offset + 0x400) & ~0x7f0;
10659
10660 /* For offsets that aren't a multiple of the access size, the limit is
10661 -256...255. */
10662 if (offset & (size - 1))
10663 {
10664 /* BLKmode typically uses LDP of X-registers. */
10665 if (mode == BLKmode)
10666 return (offset + 512) & ~0x3ff;
10667 return (offset + 0x100) & ~0x1ff;
10668 }
10669
10670 /* Small negative offsets are supported. */
10671 if (IN_RANGE (offset, -256, 0))
10672 return 0;
10673
10674 if (mode == TImode || mode == TFmode)
10675 return (offset + 0x100) & ~0x1ff;
10676
10677 /* Use a 12-bit offset scaled by the access size. */
10678 return offset & (~0xfff * size);
10679}
10680
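/* Worked example for the anchoring above (illustrative values): an
   aligned 8-byte access at offset 0x10008 returns
     0x10008 & (~0xfff * 8) == 0x10000,
   so the anchor is 0x10000 and the remaining displacement of 8 fits the
   scaled unsigned 12-bit LDR/STR range; an unaligned SImode access at
   offset 0x231 returns (0x231 + 0x100) & ~0x1ff == 0x200, leaving a
   displacement in the signed 9-bit LDUR/STUR range.  */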
0c4ec427 10681static rtx
ef4bddc2 10682aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
0c4ec427
RE
10683{
10684 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10685 where mask is selected by alignment and size of the offset.
10686 We try to pick as large a range for the offset as possible to
10687 maximize the chance of a CSE. However, for aligned addresses
10688 we limit the range to 4k so that structures with different sized
e8426e0a
BC
10689 elements are likely to use the same base. We need to be careful
10690 not to split a CONST for some forms of address expression, otherwise
10691 it will generate sub-optimal code. */
0c4ec427
RE
10692
10693 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10694 {
9e0218fc 10695 rtx base = XEXP (x, 0);
17d7bdd8 10696 rtx offset_rtx = XEXP (x, 1);
9e0218fc 10697 HOST_WIDE_INT offset = INTVAL (offset_rtx);
0c4ec427 10698
9e0218fc 10699 if (GET_CODE (base) == PLUS)
e8426e0a 10700 {
9e0218fc
RH
10701 rtx op0 = XEXP (base, 0);
10702 rtx op1 = XEXP (base, 1);
10703
10704 /* Force any scaling into a temp for CSE. */
10705 op0 = force_reg (Pmode, op0);
10706 op1 = force_reg (Pmode, op1);
10707
10708 /* Let the pointer register be in op0. */
10709 if (REG_POINTER (op1))
10710 std::swap (op0, op1);
10711
10712 /* If the pointer is virtual or frame related, then we know that
10713 virtual register instantiation or register elimination is going
10714 to apply a second constant. We want the two constants folded
10715 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10716 if (virt_or_elim_regno_p (REGNO (op0)))
e8426e0a 10717 {
9e0218fc
RH
10718 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10719 NULL_RTX, true, OPTAB_DIRECT);
10720 return gen_rtx_PLUS (Pmode, base, op1);
e8426e0a 10721 }
e8426e0a 10722
9e0218fc
RH
10723 /* Otherwise, in order to encourage CSE (and thence loop strength
10724 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
10725 base = expand_binop (Pmode, add_optab, op0, op1,
10726 NULL_RTX, true, OPTAB_DIRECT);
10727 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
e8426e0a
BC
10728 }
10729
6a70badb
RS
10730 HOST_WIDE_INT size;
10731 if (GET_MODE_SIZE (mode).is_constant (&size))
ff0f3f1c 10732 {
6a70badb
RS
10733 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10734 mode);
10735 if (base_offset != 0)
10736 {
10737 base = plus_constant (Pmode, base, base_offset);
10738 base = force_operand (base, NULL_RTX);
10739 return plus_constant (Pmode, base, offset - base_offset);
10740 }
9e0218fc 10741 }
0c4ec427
RE
10742 }
10743
10744 return x;
10745}
10746
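/* Illustrative example of the splitting above: legitimizing
   (plus:DI (reg:DI x1) (const_int 0x13004)) for an SImode access gives
   base_offset == (0x13004 & (~0xfff * 4)) == 0x10000, so the address is
   rewritten as a temporary holding x1 + 0x10000 plus the residual
   0x3004, which fits the scaled 12-bit immediate range for 4-byte
   accesses.  */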
43e9d192
IB
10747static reg_class_t
10748aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10749 reg_class_t rclass,
ef4bddc2 10750 machine_mode mode,
43e9d192
IB
10751 secondary_reload_info *sri)
10752{
cc68f7c2
RS
10753 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10754 LDR and STR. See the comment at the head of aarch64-sve.md for
10755 more details about the big-endian handling. */
10756 if (reg_class_subset_p (rclass, FP_REGS)
9a1b9cb4
RS
10757 && !((REG_P (x) && HARD_REGISTER_P (x))
10758 || aarch64_simd_valid_immediate (x, NULL))
cc68f7c2 10759 && mode != VNx16QImode)
43cacb12 10760 {
cc68f7c2
RS
10761 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10762 if ((vec_flags & VEC_SVE_DATA)
10763 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10764 {
10765 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10766 return NO_REGS;
10767 }
43cacb12 10768 }
b4f50fd4
RR
10769
10770 /* If we have to disable direct literal pool loads and stores because the
10771 function is too big, then we need a scratch register. */
10772 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10773 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10774 || targetm.vector_mode_supported_p (GET_MODE (x)))
9ee6540a 10775 && !aarch64_pcrelative_literal_loads)
b4f50fd4 10776 {
0016d8d9 10777 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
b4f50fd4
RR
10778 return NO_REGS;
10779 }
10780
43e9d192
IB
10781 /* Without the TARGET_SIMD instructions we cannot move a Q register
10782 to a Q register directly. We need a scratch. */
10783 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10784 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10785 && reg_class_subset_p (rclass, FP_REGS))
10786 {
0016d8d9 10787 sri->icode = code_for_aarch64_reload_mov (mode);
43e9d192
IB
10788 return NO_REGS;
10789 }
10790
10791 /* A TFmode or TImode memory access should be handled via FP_REGS
10792 because AArch64 has richer addressing modes for LDR/STR instructions
10793 than LDP/STP instructions. */
d5726973 10794 if (TARGET_FLOAT && rclass == GENERAL_REGS
6a70badb 10795 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
43e9d192
IB
10796 return FP_REGS;
10797
10798 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
a4a182c6 10799 return GENERAL_REGS;
43e9d192
IB
10800
10801 return NO_REGS;
10802}
10803
10804static bool
6216fd90 10805aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
43e9d192 10806{
6216fd90 10807 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
43e9d192 10808
6216fd90
WD
10809 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10810 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
43e9d192 10811 if (frame_pointer_needed)
6216fd90 10812 return to == HARD_FRAME_POINTER_REGNUM;
43e9d192
IB
10813 return true;
10814}
10815
6a70badb 10816poly_int64
43e9d192
IB
10817aarch64_initial_elimination_offset (unsigned from, unsigned to)
10818{
78c29983
MS
10819 if (to == HARD_FRAME_POINTER_REGNUM)
10820 {
10821 if (from == ARG_POINTER_REGNUM)
71bfb77a 10822 return cfun->machine->frame.hard_fp_offset;
78c29983
MS
10823
10824 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
10825 return cfun->machine->frame.hard_fp_offset
10826 - cfun->machine->frame.locals_offset;
78c29983
MS
10827 }
10828
10829 if (to == STACK_POINTER_REGNUM)
10830 {
10831 if (from == FRAME_POINTER_REGNUM)
71bfb77a
WD
10832 return cfun->machine->frame.frame_size
10833 - cfun->machine->frame.locals_offset;
78c29983
MS
10834 }
10835
1c960e02 10836 return cfun->machine->frame.frame_size;
43e9d192
IB
10837}
10838
463a54e5
SN
10839
10840/* Get return address without mangling. */
10841
10842rtx
10843aarch64_return_addr_rtx (void)
10844{
10845 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
10846 /* Note: aarch64_return_address_signing_enabled only
10847 works after cfun->machine->frame.laid_out is set,
10848 so here we don't know if the return address will
10849 be signed or not. */
10850 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
10851 emit_move_insn (lr, val);
10852 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
10853 return lr;
10854}
10855
10856
43e9d192
IB
10857/* Implement RETURN_ADDR_RTX. We do not support moving back to a
10858 previous frame. */
10859
10860rtx
10861aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10862{
10863 if (count != 0)
10864 return const0_rtx;
463a54e5 10865 return aarch64_return_addr_rtx ();
43e9d192
IB
10866}
10867
43e9d192
IB
10868static void
10869aarch64_asm_trampoline_template (FILE *f)
10870{
be7c41a5
OT
10871 /* Even if the current function doesn't have branch protection, some
10872 later function might, so since this template is only generated once
10873 we have to add a BTI just in case. */
10874 asm_fprintf (f, "\thint\t34 // bti c\n");
b5f794b4 10875
28514dda
YZ
10876 if (TARGET_ILP32)
10877 {
be178ecd
MM
10878 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
10879 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
28514dda
YZ
10880 }
10881 else
10882 {
be178ecd
MM
10883 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
10884 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
28514dda 10885 }
01a3a324 10886 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
b5f794b4 10887
be178ecd
MM
10888 /* We always emit a speculation barrier.
10889 This is because the same trampoline template is used for every nested
10890 function. Since nested functions are not particularly common or
10891 performant we don't worry too much about the extra instructions to copy
10892 around.
10893 This is not yet a problem, since we have not yet implemented function
10894 specific attributes to choose between hardening against straight line
10895 speculation or not, but such function specific attributes are likely to
10896 happen in the future. */
10897 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
10898
28514dda
YZ
10899 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10900 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
43e9d192
IB
10901}
10902
10903static void
10904aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10905{
10906 rtx fnaddr, mem, a_tramp;
be178ecd 10907 const int tramp_code_sz = 24;
43e9d192
IB
10908
10909 /* Don't need to copy the trailing D-words, we fill those in below. */
be178ecd
MM
10910 /* We create our own memory address in Pmode so that `emit_block_move` can
10911 use parts of the backend which expect Pmode addresses. */
10912 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
10913 emit_block_move (gen_rtx_MEM (BLKmode, temp),
10914 assemble_trampoline_template (),
28514dda
YZ
10915 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10916 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
43e9d192 10917 fnaddr = XEXP (DECL_RTL (fndecl), 0);
28514dda
YZ
10918 if (GET_MODE (fnaddr) != ptr_mode)
10919 fnaddr = convert_memory_address (ptr_mode, fnaddr);
43e9d192
IB
10920 emit_move_insn (mem, fnaddr);
10921
28514dda 10922 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
43e9d192
IB
10923 emit_move_insn (mem, chain_value);
10924
10925 /* XXX We should really define a "clear_cache" pattern and use
10926 gen_clear_cache(). */
10927 a_tramp = XEXP (m_tramp, 0);
10928 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
db69559b 10929 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
28514dda
YZ
10930 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10931 ptr_mode);
43e9d192
IB
10932}
10933
10934static unsigned char
ef4bddc2 10935aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
43e9d192 10936{
6a70badb
RS
10937 /* ??? Logically we should only need to provide a value when
10938 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10939 can hold MODE, but at the moment we need to handle all modes.
10940 Just ignore any runtime parts for registers that can't store them. */
10941 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
550a3380 10942 unsigned int nregs, vec_flags;
43e9d192
IB
10943 switch (regclass)
10944 {
96b7f495 10945 case STUB_REGS:
d677263e 10946 case TAILCALL_ADDR_REGS:
43e9d192
IB
10947 case POINTER_REGS:
10948 case GENERAL_REGS:
10949 case ALL_REGS:
f25a140b 10950 case POINTER_AND_FP_REGS:
43e9d192
IB
10951 case FP_REGS:
10952 case FP_LO_REGS:
163b1f6a 10953 case FP_LO8_REGS:
550a3380
RS
10954 vec_flags = aarch64_classify_vector_mode (mode);
10955 if ((vec_flags & VEC_SVE_DATA)
43cacb12 10956 && constant_multiple_p (GET_MODE_SIZE (mode),
550a3380 10957 aarch64_vl_bytes (mode, vec_flags), &nregs))
43cacb12 10958 return nregs;
550a3380 10959 return (vec_flags & VEC_ADVSIMD
6a70badb
RS
10960 ? CEIL (lowest_size, UNITS_PER_VREG)
10961 : CEIL (lowest_size, UNITS_PER_WORD));
43e9d192 10962 case STACK_REG:
43cacb12
RS
10963 case PR_REGS:
10964 case PR_LO_REGS:
10965 case PR_HI_REGS:
183bfdaf
RS
10966 case FFR_REGS:
10967 case PR_AND_FFR_REGS:
43e9d192
IB
10968 return 1;
10969
10970 case NO_REGS:
10971 return 0;
10972
10973 default:
10974 break;
10975 }
10976 gcc_unreachable ();
10977}
10978
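/* Illustrative values for the register-count calculation above: a
   16-byte Advanced SIMD mode such as V4SImode occupies one FP_REGS
   register (CEIL (16, UNITS_PER_VREG)), TImode in GENERAL_REGS occupies
   two X-registers (CEIL (16, UNITS_PER_WORD)), and the stack and
   predicate classes always report a single register.  */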
10979static reg_class_t
78d8b9f0 10980aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
43e9d192 10981{
51bb310d 10982 if (regclass == POINTER_REGS)
78d8b9f0
IB
10983 return GENERAL_REGS;
10984
51bb310d
MS
10985 if (regclass == STACK_REG)
10986 {
10987 if (REG_P(x)
10988 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10989 return regclass;
10990
10991 return NO_REGS;
10992 }
10993
27bd251b
IB
10994 /* Register elimination can result in a request for
10995 SP+constant->FP_REGS. We cannot support such operations, which
10996 use SP as source and an FP_REG as destination, so reject them
10997 outright now. */
10998 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10999 {
11000 rtx lhs = XEXP (x, 0);
11001
11002 /* Look through a possible SUBREG introduced by ILP32. */
11003 if (GET_CODE (lhs) == SUBREG)
11004 lhs = SUBREG_REG (lhs);
11005
11006 gcc_assert (REG_P (lhs));
11007 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11008 POINTER_REGS));
11009 return NO_REGS;
11010 }
11011
78d8b9f0 11012 return regclass;
43e9d192
IB
11013}
11014
11015void
11016aarch64_asm_output_labelref (FILE* f, const char *name)
11017{
11018 asm_fprintf (f, "%U%s", name);
11019}
11020
11021static void
11022aarch64_elf_asm_constructor (rtx symbol, int priority)
11023{
11024 if (priority == DEFAULT_INIT_PRIORITY)
11025 default_ctor_section_asm_out_constructor (symbol, priority);
11026 else
11027 {
11028 section *s;
53d190c1
AT
11029 /* Priority is known to be in the range [0, 65535], so 18 bytes
11030 would be enough, but the compiler might not know that. To avoid a
11031 -Wformat-truncation false positive, use a larger size. */
11032 char buf[23];
43e9d192 11033 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
fcef3abd 11034 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
11035 switch_to_section (s);
11036 assemble_align (POINTER_SIZE);
28514dda 11037 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
11038 }
11039}
11040
11041static void
11042aarch64_elf_asm_destructor (rtx symbol, int priority)
11043{
11044 if (priority == DEFAULT_INIT_PRIORITY)
11045 default_dtor_section_asm_out_destructor (symbol, priority);
11046 else
11047 {
11048 section *s;
53d190c1
AT
11049 /* Priority is known to be in the range [0, 65535], so 18 bytes
11050 would be enough, but the compiler might not know that. To avoid a
11051 -Wformat-truncation false positive, use a larger size. */
11052 char buf[23];
43e9d192 11053 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
fcef3abd 11054 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
43e9d192
IB
11055 switch_to_section (s);
11056 assemble_align (POINTER_SIZE);
28514dda 11057 assemble_aligned_integer (POINTER_BYTES, symbol);
43e9d192
IB
11058 }
11059}
11060
11061const char*
11062aarch64_output_casesi (rtx *operands)
11063{
11064 char buf[100];
11065 char label[100];
b32d5189 11066 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
43e9d192
IB
11067 int index;
11068 static const char *const patterns[4][2] =
11069 {
11070 {
11071 "ldrb\t%w3, [%0,%w1,uxtw]",
11072 "add\t%3, %4, %w3, sxtb #2"
11073 },
11074 {
11075 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11076 "add\t%3, %4, %w3, sxth #2"
11077 },
11078 {
11079 "ldr\t%w3, [%0,%w1,uxtw #2]",
11080 "add\t%3, %4, %w3, sxtw #2"
11081 },
11082 /* We assume that DImode is only generated when not optimizing and
11083 that we don't really need 64-bit address offsets. That would
11084 imply an object file with 8GB of code in a single function! */
11085 {
11086 "ldr\t%w3, [%0,%w1,uxtw #2]",
11087 "add\t%3, %4, %w3, sxtw #2"
11088 }
11089 };
11090
11091 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11092
77e994c9
RS
11093 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11094 index = exact_log2 (GET_MODE_SIZE (mode));
43e9d192
IB
11095
11096 gcc_assert (index >= 0 && index <= 3);
11097
11098 /* Need to implement table size reduction, by changing the code below. */
11099 output_asm_insn (patterns[index][0], operands);
11100 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11101 snprintf (buf, sizeof (buf),
11102 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11103 output_asm_insn (buf, operands);
11104 output_asm_insn (patterns[index][1], operands);
11105 output_asm_insn ("br\t%3", operands);
be178ecd
MM
11106 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11107 operands);
43e9d192
IB
11108 assemble_label (asm_out_file, label);
11109 return "";
11110}
11111
11112
11113/* Return size in bits of an arithmetic operand which is shifted/scaled and
11114 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11115 operator. */
11116
11117int
11118aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11119{
11120 if (shift >= 0 && shift <= 3)
11121 {
11122 int size;
11123 for (size = 8; size <= 32; size *= 2)
11124 {
11125 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11126 if (mask == bits << shift)
11127 return size;
11128 }
11129 }
11130 return 0;
11131}
11132
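/* Worked example for aarch64_uxt_size (illustrative): shift == 1 with
   mask == 0x1fe matches 0xff << 1 and returns 8, i.e. a UXTB fits;
   shift == 2 with mask == 0x3fffc matches 0xffff << 2 and returns 16
   (UXTH); anything else returns 0.  */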
e78d485e
RR
11133/* Constant pools are per-function only when PC-relative
11134 literal loads are enabled or we are using the large memory
11135 model. */
11136
11137static inline bool
11138aarch64_can_use_per_function_literal_pools_p (void)
11139{
9ee6540a 11140 return (aarch64_pcrelative_literal_loads
e78d485e
RR
11141 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11142}
11143
43e9d192 11144static bool
e78d485e 11145aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
43e9d192 11146{
74a9301d
VM
11147 /* We can't use blocks for constants when we're using a per-function
11148 constant pool. */
11149 return !aarch64_can_use_per_function_literal_pools_p ();
43e9d192
IB
11150}
11151
e78d485e
RR
11152/* Select appropriate section for constants depending
11153 on where we place literal pools. */
11154
43e9d192 11155static section *
e78d485e
RR
11156aarch64_select_rtx_section (machine_mode mode,
11157 rtx x,
11158 unsigned HOST_WIDE_INT align)
43e9d192 11159{
e78d485e
RR
11160 if (aarch64_can_use_per_function_literal_pools_p ())
11161 return function_section (current_function_decl);
43e9d192 11162
e78d485e
RR
11163 return default_elf_select_rtx_section (mode, x, align);
11164}
43e9d192 11165
5fca7b66
RH
11166/* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11167void
11168aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11169 HOST_WIDE_INT offset)
11170{
11171 /* When using per-function literal pools, we must ensure that any code
11172 section is aligned to the minimal instruction length, lest we get
11173 errors from the assembler re "unaligned instructions". */
11174 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11175 ASM_OUTPUT_ALIGN (f, 2);
11176}
11177
43e9d192
IB
11178/* Costs. */
11179
11180/* Helper function for rtx cost calculation. Strip a shift expression
11181 from X. Returns the inner operand if successful, or the original
11182 expression on failure. */
11183static rtx
11184aarch64_strip_shift (rtx x)
11185{
11186 rtx op = x;
11187
57b77d46
RE
11188 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11189 we can convert both to ROR during final output. */
43e9d192
IB
11190 if ((GET_CODE (op) == ASHIFT
11191 || GET_CODE (op) == ASHIFTRT
57b77d46
RE
11192 || GET_CODE (op) == LSHIFTRT
11193 || GET_CODE (op) == ROTATERT
11194 || GET_CODE (op) == ROTATE)
43e9d192
IB
11195 && CONST_INT_P (XEXP (op, 1)))
11196 return XEXP (op, 0);
11197
11198 if (GET_CODE (op) == MULT
11199 && CONST_INT_P (XEXP (op, 1))
11200 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11201 return XEXP (op, 0);
11202
11203 return x;
11204}
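/* For instance, (ashift (reg X) (const_int 3)) and the equivalent
   (mult (reg X) (const_int 8)) both strip down to (reg X); a shift by a
   non-constant amount is returned unchanged.  */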
11205
4745e701 11206/* Helper function for rtx cost calculation. Strip an extend
43e9d192
IB
11207 expression from X. Returns the inner operand if successful, or the
11208 original expression on failure. We deal with a number of possible
b10f1009
AP
11209 canonicalization variations here. If STRIP_SHIFT is true, then
11210 we can strip off a shift also. */
43e9d192 11211static rtx
b10f1009 11212aarch64_strip_extend (rtx x, bool strip_shift)
43e9d192 11213{
77e994c9 11214 scalar_int_mode mode;
43e9d192
IB
11215 rtx op = x;
11216
77e994c9
RS
11217 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11218 return op;
11219
43e9d192
IB
11220 /* Zero and sign extraction of a widened value. */
11221 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11222 && XEXP (op, 2) == const0_rtx
4745e701 11223 && GET_CODE (XEXP (op, 0)) == MULT
77e994c9 11224 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
43e9d192
IB
11225 XEXP (op, 1)))
11226 return XEXP (XEXP (op, 0), 0);
11227
11228 /* It can also be represented (for zero-extend) as an AND with an
11229 immediate. */
11230 if (GET_CODE (op) == AND
11231 && GET_CODE (XEXP (op, 0)) == MULT
11232 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11233 && CONST_INT_P (XEXP (op, 1))
11234 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11235 INTVAL (XEXP (op, 1))) != 0)
11236 return XEXP (XEXP (op, 0), 0);
11237
11238 /* Now handle extended register, as this may also have an optional
11239 left shift by 1..4. */
b10f1009
AP
11240 if (strip_shift
11241 && GET_CODE (op) == ASHIFT
43e9d192
IB
11242 && CONST_INT_P (XEXP (op, 1))
11243 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11244 op = XEXP (op, 0);
11245
11246 if (GET_CODE (op) == ZERO_EXTEND
11247 || GET_CODE (op) == SIGN_EXTEND)
11248 op = XEXP (op, 0);
11249
11250 if (op != x)
11251 return op;
11252
4745e701
JG
11253 return x;
11254}
11255
0a78ebe4
KT
11256/* Return true iff CODE is a shift supported in combination
11257 with arithmetic instructions. */
4d1919ed 11258
0a78ebe4
KT
11259static bool
11260aarch64_shift_p (enum rtx_code code)
11261{
11262 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11263}
11264
b10f1009
AP
11265
11266/* Return true iff X is a cheap shift without a sign extend. */
11267
11268static bool
11269aarch64_cheap_mult_shift_p (rtx x)
11270{
11271 rtx op0, op1;
11272
11273 op0 = XEXP (x, 0);
11274 op1 = XEXP (x, 1);
11275
11276 if (!(aarch64_tune_params.extra_tuning_flags
11277 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11278 return false;
11279
11280 if (GET_CODE (op0) == SIGN_EXTEND)
11281 return false;
11282
11283 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11284 && UINTVAL (op1) <= 4)
11285 return true;
11286
11287 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11288 return false;
11289
11290 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11291
11292 if (l2 > 0 && l2 <= 4)
11293 return true;
11294
11295 return false;
11296}
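/* For example, when AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND is set, a left
   shift by 3 or a multiply by 8 is considered cheap, whereas a shift of a
   SIGN_EXTEND, or a multiply by 32 (log2 == 5 > 4), is not.  */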
11297
4745e701 11298/* Helper function for rtx cost calculation. Calculate the cost of
0a78ebe4
KT
11299 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11300 Return the calculated cost of the expression, recursing manually in to
4745e701
JG
11301 operands where needed. */
11302
11303static int
e548c9df 11304aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
4745e701
JG
11305{
11306 rtx op0, op1;
11307 const struct cpu_cost_table *extra_cost
b175b679 11308 = aarch64_tune_params.insn_extra_cost;
4745e701 11309 int cost = 0;
0a78ebe4 11310 bool compound_p = (outer == PLUS || outer == MINUS);
ef4bddc2 11311 machine_mode mode = GET_MODE (x);
4745e701
JG
11312
11313 gcc_checking_assert (code == MULT);
11314
11315 op0 = XEXP (x, 0);
11316 op1 = XEXP (x, 1);
11317
11318 if (VECTOR_MODE_P (mode))
df81764b
TC
11319 {
11320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11321 mode = GET_MODE_INNER (mode);
11322 if (vec_flags & VEC_ADVSIMD)
11323 {
11324 /* The by-element versions of the instruction have the same costs as
11325 the normal 3-vector version. So don't add the costs of the
11326 duplicate into the costs of the multiply. We make an assumption
11327 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11328 side. This means costing of a MUL by element pre RA is a bit
11329 optimistic. */
11330 if (GET_CODE (op0) == VEC_DUPLICATE)
11331 op0 = XEXP (op0, 0);
11332 else if (GET_CODE (op1) == VEC_DUPLICATE)
11333 op1 = XEXP (op1, 0);
11334 }
11335 }
4745e701
JG
11336
11337 /* Integer multiply/fma. */
11338 if (GET_MODE_CLASS (mode) == MODE_INT)
11339 {
11340 /* The multiply will be canonicalized as a shift, cost it as such. */
0a78ebe4
KT
11341 if (aarch64_shift_p (GET_CODE (x))
11342 || (CONST_INT_P (op1)
11343 && exact_log2 (INTVAL (op1)) > 0))
4745e701 11344 {
0a78ebe4
KT
11345 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11346 || GET_CODE (op0) == SIGN_EXTEND;
4745e701
JG
11347 if (speed)
11348 {
0a78ebe4
KT
11349 if (compound_p)
11350 {
b10f1009
AP
11351 /* If the shift is considered cheap,
11352 then don't add any cost. */
11353 if (aarch64_cheap_mult_shift_p (x))
11354 ;
11355 else if (REG_P (op1))
0a78ebe4
KT
11356 /* ARITH + shift-by-register. */
11357 cost += extra_cost->alu.arith_shift_reg;
11358 else if (is_extend)
11359 /* ARITH + extended register. We don't have a cost field
11360 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11361 cost += extra_cost->alu.extend_arith;
11362 else
11363 /* ARITH + shift-by-immediate. */
11364 cost += extra_cost->alu.arith_shift;
11365 }
4745e701
JG
11366 else
11367 /* LSL (immediate). */
0a78ebe4
KT
11368 cost += extra_cost->alu.shift;
11369
4745e701 11370 }
0a78ebe4
KT
11371 /* Strip extends as we will have costed them in the case above. */
11372 if (is_extend)
b10f1009 11373 op0 = aarch64_strip_extend (op0, true);
4745e701 11374
e548c9df 11375 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
4745e701
JG
11376
11377 return cost;
11378 }
11379
d2ac256b
KT
11380 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11381 compound and let the below cases handle it. After all, MNEG is a
11382 special-case alias of MSUB. */
11383 if (GET_CODE (op0) == NEG)
11384 {
11385 op0 = XEXP (op0, 0);
11386 compound_p = true;
11387 }
11388
4745e701
JG
11389 /* Integer multiplies or FMAs have zero/sign extending variants. */
11390 if ((GET_CODE (op0) == ZERO_EXTEND
11391 && GET_CODE (op1) == ZERO_EXTEND)
11392 || (GET_CODE (op0) == SIGN_EXTEND
11393 && GET_CODE (op1) == SIGN_EXTEND))
11394 {
e548c9df
AM
11395 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11396 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
4745e701
JG
11397
11398 if (speed)
11399 {
0a78ebe4 11400 if (compound_p)
d2ac256b 11401 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
4745e701
JG
11402 cost += extra_cost->mult[0].extend_add;
11403 else
11404 /* MUL/SMULL/UMULL. */
11405 cost += extra_cost->mult[0].extend;
11406 }
11407
11408 return cost;
11409 }
11410
d2ac256b 11411 /* This is either an integer multiply or a MADD. In both cases
4745e701 11412 we want to recurse and cost the operands. */
e548c9df
AM
11413 cost += rtx_cost (op0, mode, MULT, 0, speed);
11414 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
11415
11416 if (speed)
11417 {
0a78ebe4 11418 if (compound_p)
d2ac256b 11419 /* MADD/MSUB. */
4745e701
JG
11420 cost += extra_cost->mult[mode == DImode].add;
11421 else
11422 /* MUL. */
11423 cost += extra_cost->mult[mode == DImode].simple;
11424 }
11425
11426 return cost;
11427 }
11428 else
11429 {
11430 if (speed)
11431 {
3d840f7d 11432 /* Floating-point FMA/FMUL can also support negations of the
d318517d
SN
11433 operands, unless the rounding mode is upward or downward in
11434 which case FNMUL is different than FMUL with operand negation. */
11435 bool neg0 = GET_CODE (op0) == NEG;
11436 bool neg1 = GET_CODE (op1) == NEG;
11437 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11438 {
11439 if (neg0)
11440 op0 = XEXP (op0, 0);
11441 if (neg1)
11442 op1 = XEXP (op1, 0);
11443 }
4745e701 11444
0a78ebe4 11445 if (compound_p)
4745e701
JG
11446 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11447 cost += extra_cost->fp[mode == DFmode].fma;
11448 else
3d840f7d 11449 /* FMUL/FNMUL. */
4745e701
JG
11450 cost += extra_cost->fp[mode == DFmode].mult;
11451 }
11452
e548c9df
AM
11453 cost += rtx_cost (op0, mode, MULT, 0, speed);
11454 cost += rtx_cost (op1, mode, MULT, 1, speed);
4745e701
JG
11455 return cost;
11456 }
43e9d192
IB
11457}
11458
67747367
JG
11459static int
11460aarch64_address_cost (rtx x,
ef4bddc2 11461 machine_mode mode,
67747367
JG
11462 addr_space_t as ATTRIBUTE_UNUSED,
11463 bool speed)
11464{
11465 enum rtx_code c = GET_CODE (x);
b175b679 11466 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
67747367
JG
11467 struct aarch64_address_info info;
11468 int cost = 0;
11469 info.shift = 0;
11470
a97d8b98 11471 if (!aarch64_classify_address (&info, x, mode, false))
67747367
JG
11472 {
11473 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11474 {
11475 /* This is a CONST or SYMBOL ref which will be split
11476 in a different way depending on the code model in use.
11477 Cost it through the generic infrastructure. */
e548c9df 11478 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
67747367
JG
11479 /* Divide through by the cost of one instruction to
11480 bring it to the same units as the address costs. */
11481 cost_symbol_ref /= COSTS_N_INSNS (1);
11482 /* The cost is then the cost of preparing the address,
11483 followed by an immediate (possibly 0) offset. */
11484 return cost_symbol_ref + addr_cost->imm_offset;
11485 }
11486 else
11487 {
11488 /* This is most likely a jump table from a case
11489 statement. */
11490 return addr_cost->register_offset;
11491 }
11492 }
11493
11494 switch (info.type)
11495 {
11496 case ADDRESS_LO_SUM:
11497 case ADDRESS_SYMBOLIC:
11498 case ADDRESS_REG_IMM:
11499 cost += addr_cost->imm_offset;
11500 break;
11501
11502 case ADDRESS_REG_WB:
11503 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11504 cost += addr_cost->pre_modify;
11505 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11506 cost += addr_cost->post_modify;
11507 else
11508 gcc_unreachable ();
11509
11510 break;
11511
11512 case ADDRESS_REG_REG:
11513 cost += addr_cost->register_offset;
11514 break;
11515
67747367 11516 case ADDRESS_REG_SXTW:
783879e6
EM
11517 cost += addr_cost->register_sextend;
11518 break;
11519
11520 case ADDRESS_REG_UXTW:
11521 cost += addr_cost->register_zextend;
67747367
JG
11522 break;
11523
11524 default:
11525 gcc_unreachable ();
11526 }
11527
11528
11529 if (info.shift > 0)
11530 {
11531 /* For the sake of calculating the cost of the shifted register
11532 component, we can treat same sized modes in the same way. */
6a70badb
RS
11533 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11534 cost += addr_cost->addr_scale_costs.hi;
11535 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11536 cost += addr_cost->addr_scale_costs.si;
11537 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11538 cost += addr_cost->addr_scale_costs.di;
11539 else
11540 /* We can't tell, or this is a 128-bit vector. */
11541 cost += addr_cost->addr_scale_costs.ti;
67747367
JG
11542 }
11543
11544 return cost;
11545}
11546
b9066f5a
MW
11547/* Return the cost of a branch. If SPEED_P is true then the compiler is
11548 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11549 to be taken. */
11550
11551int
11552aarch64_branch_cost (bool speed_p, bool predictable_p)
11553{
11554 /* When optimizing for speed, use the cost of unpredictable branches. */
11555 const struct cpu_branch_cost *branch_costs =
b175b679 11556 aarch64_tune_params.branch_costs;
b9066f5a
MW
11557
11558 if (!speed_p || predictable_p)
11559 return branch_costs->predictable;
11560 else
11561 return branch_costs->unpredictable;
11562}
11563
7cc2145f
JG
11564/* Return true if the RTX X in mode MODE is a zero or sign extract
11565 usable in an ADD or SUB (extended register) instruction. */
11566static bool
77e994c9 11567aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
7cc2145f
JG
11568{
11569 /* Catch add with a sign extract.
11570 This is add_<optab><mode>_multp2. */
11571 if (GET_CODE (x) == SIGN_EXTRACT
11572 || GET_CODE (x) == ZERO_EXTRACT)
11573 {
11574 rtx op0 = XEXP (x, 0);
11575 rtx op1 = XEXP (x, 1);
11576 rtx op2 = XEXP (x, 2);
11577
11578 if (GET_CODE (op0) == MULT
11579 && CONST_INT_P (op1)
11580 && op2 == const0_rtx
11581 && CONST_INT_P (XEXP (op0, 1))
11582 && aarch64_is_extend_from_extract (mode,
11583 XEXP (op0, 1),
11584 op1))
11585 {
11586 return true;
11587 }
11588 }
e47c4031
KT
11589 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11590 No shift. */
11591 else if (GET_CODE (x) == SIGN_EXTEND
11592 || GET_CODE (x) == ZERO_EXTEND)
11593 return REG_P (XEXP (x, 0));
7cc2145f
JG
11594
11595 return false;
11596}
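/* For example, (zero_extend (reg)) is accepted here and corresponds to
   the UXTB/UXTH/UXTW forms of ADD/SUB (extended register); an extend of
   anything more complex than a bare register is rejected by this simple
   case.  */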
11597
61263118
KT
11598static bool
11599aarch64_frint_unspec_p (unsigned int u)
11600{
11601 switch (u)
11602 {
11603 case UNSPEC_FRINTZ:
11604 case UNSPEC_FRINTP:
11605 case UNSPEC_FRINTM:
11606 case UNSPEC_FRINTA:
11607 case UNSPEC_FRINTN:
11608 case UNSPEC_FRINTX:
11609 case UNSPEC_FRINTI:
11610 return true;
11611
11612 default:
11613 return false;
11614 }
11615}
11616
fb0cb7fa
KT
11617/* Return true iff X is an rtx that will match an extr instruction
11618 i.e. as described in the *extr<mode>5_insn family of patterns.
11619 OP0 and OP1 will be set to the operands of the shifts involved
11620 on success and will be NULL_RTX otherwise. */
11621
11622static bool
11623aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11624{
11625 rtx op0, op1;
77e994c9
RS
11626 scalar_int_mode mode;
11627 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11628 return false;
fb0cb7fa
KT
11629
11630 *res_op0 = NULL_RTX;
11631 *res_op1 = NULL_RTX;
11632
11633 if (GET_CODE (x) != IOR)
11634 return false;
11635
11636 op0 = XEXP (x, 0);
11637 op1 = XEXP (x, 1);
11638
11639 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11640 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11641 {
11642 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11643 if (GET_CODE (op1) == ASHIFT)
11644 std::swap (op0, op1);
11645
11646 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11647 return false;
11648
11649 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11650 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11651
11652 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11653 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11654 {
11655 *res_op0 = XEXP (op0, 0);
11656 *res_op1 = XEXP (op1, 0);
11657 return true;
11658 }
11659 }
11660
11661 return false;
11662}
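/* As an example, in SImode (ior (ashift X (const_int 24))
   (lshiftrt Y (const_int 8))) satisfies this test since 24 + 8 == 32;
   *RES_OP0 is set to X and *RES_OP1 to Y.  Shift amounts that do not sum
   to the mode width are rejected.  */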
11663
2d5ffe46
AP
11664/* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11665 storing it in *COST. Result is true if the total cost of the operation
11666 has now been calculated. */
11667static bool
11668aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11669{
b9e3afe9
AP
11670 rtx inner;
11671 rtx comparator;
11672 enum rtx_code cmpcode;
e2a14bec
RS
11673 const struct cpu_cost_table *extra_cost
11674 = aarch64_tune_params.insn_extra_cost;
b9e3afe9
AP
11675
11676 if (COMPARISON_P (op0))
11677 {
11678 inner = XEXP (op0, 0);
11679 comparator = XEXP (op0, 1);
11680 cmpcode = GET_CODE (op0);
11681 }
11682 else
11683 {
11684 inner = op0;
11685 comparator = const0_rtx;
11686 cmpcode = NE;
11687 }
11688
2d5ffe46
AP
11689 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11690 {
11691 /* Conditional branch. */
b9e3afe9 11692 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46
AP
11693 return true;
11694 else
11695 {
b9e3afe9 11696 if (cmpcode == NE || cmpcode == EQ)
2d5ffe46 11697 {
2d5ffe46
AP
11698 if (comparator == const0_rtx)
11699 {
11700 /* TBZ/TBNZ/CBZ/CBNZ. */
11701 if (GET_CODE (inner) == ZERO_EXTRACT)
11702 /* TBZ/TBNZ. */
e548c9df
AM
11703 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11704 ZERO_EXTRACT, 0, speed);
11705 else
11706 /* CBZ/CBNZ. */
11707 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
2d5ffe46 11708
e2a14bec
RS
11709 return true;
11710 }
11711 if (register_operand (inner, VOIDmode)
11712 && aarch64_imm24 (comparator, VOIDmode))
11713 {
11714 /* SUB and SUBS. */
11715 *cost += COSTS_N_INSNS (2);
11716 if (speed)
11717 *cost += extra_cost->alu.arith * 2;
11718 return true;
11719 }
2d5ffe46 11720 }
b9e3afe9 11721 else if (cmpcode == LT || cmpcode == GE)
2d5ffe46 11722 {
2d5ffe46
AP
11723 /* TBZ/TBNZ. */
11724 if (comparator == const0_rtx)
11725 return true;
11726 }
11727 }
11728 }
b9e3afe9 11729 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
2d5ffe46 11730 {
786298dc 11731 /* CCMP. */
6dfeb7ce 11732 if (GET_CODE (op1) == COMPARE)
786298dc
WD
11733 {
11734 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11735 if (XEXP (op1, 1) == const0_rtx)
11736 *cost += 1;
11737 if (speed)
11738 {
11739 machine_mode mode = GET_MODE (XEXP (op1, 0));
11740 const struct cpu_cost_table *extra_cost
11741 = aarch64_tune_params.insn_extra_cost;
11742
11743 if (GET_MODE_CLASS (mode) == MODE_INT)
11744 *cost += extra_cost->alu.arith;
11745 else
11746 *cost += extra_cost->fp[mode == DFmode].compare;
11747 }
11748 return true;
11749 }
11750
2d5ffe46
AP
11751 /* It's a conditional operation based on the status flags,
11752 so it must be some flavor of CSEL. */
11753
11754 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11755 if (GET_CODE (op1) == NEG
11756 || GET_CODE (op1) == NOT
11757 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11758 op1 = XEXP (op1, 0);
bad00732
KT
11759 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11760 {
11761 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11762 op1 = XEXP (op1, 0);
11763 op2 = XEXP (op2, 0);
11764 }
d572ad49
AC
11765 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11766 {
11767 inner = XEXP (op1, 0);
11768 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11769 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
11770 op1 = XEXP (inner, 0);
11771 }
2d5ffe46 11772
e548c9df
AM
11773 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11774 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
2d5ffe46
AP
11775 return true;
11776 }
11777
11778 /* We don't know what this is, cost all operands. */
11779 return false;
11780}
11781
283b6c85
KT
11782/* Check whether X is a bitfield operation of the form shift + extend that
11783 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11784 operand to which the bitfield operation is applied. Otherwise return
11785 NULL_RTX. */
11786
11787static rtx
11788aarch64_extend_bitfield_pattern_p (rtx x)
11789{
11790 rtx_code outer_code = GET_CODE (x);
11791 machine_mode outer_mode = GET_MODE (x);
11792
11793 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11794 && outer_mode != SImode && outer_mode != DImode)
11795 return NULL_RTX;
11796
11797 rtx inner = XEXP (x, 0);
11798 rtx_code inner_code = GET_CODE (inner);
11799 machine_mode inner_mode = GET_MODE (inner);
11800 rtx op = NULL_RTX;
11801
11802 switch (inner_code)
11803 {
11804 case ASHIFT:
11805 if (CONST_INT_P (XEXP (inner, 1))
11806 && (inner_mode == QImode || inner_mode == HImode))
11807 op = XEXP (inner, 0);
11808 break;
11809 case LSHIFTRT:
11810 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11811 && (inner_mode == QImode || inner_mode == HImode))
11812 op = XEXP (inner, 0);
11813 break;
11814 case ASHIFTRT:
11815 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11816 && (inner_mode == QImode || inner_mode == HImode))
11817 op = XEXP (inner, 0);
11818 break;
11819 default:
11820 break;
11821 }
11822
11823 return op;
11824}
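/* For example, (zero_extend:SI (lshiftrt:HI X (const_int 3))) returns X,
   matching a UBFX, whereas a SIGN_EXTEND wrapped around an LSHIFTRT
   returns NULL_RTX, since only the ASHIFTRT form maps to SBFX.  */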
11825
8c83f71d
KT
11826/* Return true if the mask and a shift amount from an RTX of the form
11827 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11828 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11829
11830bool
77e994c9
RS
11831aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11832 rtx shft_amnt)
8c83f71d
KT
11833{
11834 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11835 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11836 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
1b6acf23
WD
11837 && (INTVAL (mask)
11838 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
8c83f71d
KT
11839}
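/* Concretely, in SImode a mask of 0xff0 with a shift amount of 4 passes:
   0xff0 >> 4 == 0xff, 0xff + 1 is a power of two, and the low four bits
   of the mask are clear, so (X << 4) & 0xff0 can become a UBFIZ with
   lsb 4 and width 8.  */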
11840
6a0d3939
SE
11841/* Return true if the masks and a shift amount from an RTX of the form
11842 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11843 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
11844
11845bool
11846aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11847 unsigned HOST_WIDE_INT mask1,
11848 unsigned HOST_WIDE_INT shft_amnt,
11849 unsigned HOST_WIDE_INT mask2)
11850{
11851 unsigned HOST_WIDE_INT t;
11852
11853 /* Verify that there is no overlap in what bits are set in the two masks. */
11854 if (mask1 != ~mask2)
11855 return false;
11856
11857  /* Verify that mask2 is neither all zeros nor all ones.  */
11858 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11859 return false;
11860
11861 /* The shift amount should always be less than the mode size. */
11862 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11863
11864 /* Verify that the mask being shifted is contiguous and would be in the
11865 least significant bits after shifting by shft_amnt. */
11866 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11867 return (t == (t & -t));
11868}
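/* For instance, inserting an 8-bit field at bit 8 of an SImode value uses
   shft_amnt == 8, mask2 == 0xff00 and mask1 == ~mask2: the masks do not
   overlap, mask2 is neither 0 nor all ones, and 0xff00 + (1 << 8) ==
   0x10000 is a power of two, so the combination maps to a BFI.  */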
11869
43e9d192
IB
11870/* Calculate the cost of calculating X, storing it in *COST. Result
11871 is true if the total cost of the operation has now been calculated. */
11872static bool
e548c9df 11873aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
43e9d192
IB
11874 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11875{
a8eecd00 11876 rtx op0, op1, op2;
73250c4c 11877 const struct cpu_cost_table *extra_cost
b175b679 11878 = aarch64_tune_params.insn_extra_cost;
e548c9df 11879 int code = GET_CODE (x);
b4206259 11880 scalar_int_mode int_mode;
43e9d192 11881
7fc5ef02
JG
11882 /* By default, assume that everything has equivalent cost to the
11883 cheapest instruction. Any additional costs are applied as a delta
11884 above this default. */
11885 *cost = COSTS_N_INSNS (1);
11886
43e9d192
IB
11887 switch (code)
11888 {
11889 case SET:
ba123b0d
JG
11890 /* The cost depends entirely on the operands to SET. */
11891 *cost = 0;
43e9d192
IB
11892 op0 = SET_DEST (x);
11893 op1 = SET_SRC (x);
11894
11895 switch (GET_CODE (op0))
11896 {
11897 case MEM:
11898 if (speed)
2961177e
JG
11899 {
11900 rtx address = XEXP (op0, 0);
b6875aac
KV
11901 if (VECTOR_MODE_P (mode))
11902 *cost += extra_cost->ldst.storev;
11903 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
11904 *cost += extra_cost->ldst.store;
11905 else if (mode == SFmode)
11906 *cost += extra_cost->ldst.storef;
11907 else if (mode == DFmode)
11908 *cost += extra_cost->ldst.stored;
11909
11910 *cost +=
11911 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11912 0, speed));
11913 }
43e9d192 11914
e548c9df 11915 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
11916 return true;
11917
11918 case SUBREG:
11919 if (! REG_P (SUBREG_REG (op0)))
e548c9df 11920 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
ba123b0d 11921
43e9d192
IB
11922 /* Fall through. */
11923 case REG:
b6875aac
KV
11924 /* The cost is one per vector-register copied. */
11925 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11926 {
fe1447a1
RS
11927 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11928 *cost = COSTS_N_INSNS (nregs);
b6875aac 11929 }
ba123b0d
JG
11930 /* const0_rtx is in general free, but we will use an
11931 instruction to set a register to 0. */
b6875aac
KV
11932 else if (REG_P (op1) || op1 == const0_rtx)
11933 {
11934 /* The cost is 1 per register copied. */
fe1447a1
RS
11935 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11936 *cost = COSTS_N_INSNS (nregs);
b6875aac 11937 }
ba123b0d
JG
11938 else
11939 /* Cost is just the cost of the RHS of the set. */
e548c9df 11940 *cost += rtx_cost (op1, mode, SET, 1, speed);
43e9d192
IB
11941 return true;
11942
ba123b0d 11943 case ZERO_EXTRACT:
43e9d192 11944 case SIGN_EXTRACT:
ba123b0d
JG
11945 /* Bit-field insertion. Strip any redundant widening of
11946 the RHS to meet the width of the target. */
43e9d192
IB
11947 if (GET_CODE (op1) == SUBREG)
11948 op1 = SUBREG_REG (op1);
11949 if ((GET_CODE (op1) == ZERO_EXTEND
11950 || GET_CODE (op1) == SIGN_EXTEND)
4aa81c2e 11951 && CONST_INT_P (XEXP (op0, 1))
77e994c9
RS
11952 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11953 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
43e9d192 11954 op1 = XEXP (op1, 0);
ba123b0d
JG
11955
11956 if (CONST_INT_P (op1))
11957 {
11958 /* MOV immediate is assumed to always be cheap. */
11959 *cost = COSTS_N_INSNS (1);
11960 }
11961 else
11962 {
11963 /* BFM. */
11964 if (speed)
11965 *cost += extra_cost->alu.bfi;
e548c9df 11966 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
ba123b0d
JG
11967 }
11968
43e9d192
IB
11969 return true;
11970
11971 default:
ba123b0d
JG
11972 /* We can't make sense of this, assume default cost. */
11973 *cost = COSTS_N_INSNS (1);
61263118 11974 return false;
43e9d192
IB
11975 }
11976 return false;
11977
9dfc162c
JG
11978 case CONST_INT:
11979 /* If an instruction can incorporate a constant within the
11980 instruction, the instruction's expression avoids calling
11981 rtx_cost() on the constant. If rtx_cost() is called on a
11982 constant, then it is usually because the constant must be
11983 moved into a register by one or more instructions.
11984
11985 The exception is constant 0, which can be expressed
11986 as XZR/WZR and is therefore free. The exception to this is
11987 if we have (set (reg) (const0_rtx)) in which case we must cost
11988 the move. However, we can catch that when we cost the SET, so
11989 we don't need to consider that here. */
11990 if (x == const0_rtx)
11991 *cost = 0;
11992 else
11993 {
11994	  /* To an approximation, the cost of building any other constant
11995	     is proportional to the number of instructions required to
11996	     build that constant.  This is true whether we are compiling
11997	     for SPEED or otherwise.  */
77e994c9
RS
11998 if (!is_a <scalar_int_mode> (mode, &int_mode))
11999 int_mode = word_mode;
82614948 12000 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
77e994c9 12001 (NULL_RTX, x, false, int_mode));
9dfc162c
JG
12002 }
12003 return true;
12004
12005 case CONST_DOUBLE:
a2170965
TC
12006
12007 /* First determine number of instructions to do the move
12008 as an integer constant. */
12009 if (!aarch64_float_const_representable_p (x)
12010 && !aarch64_can_const_movi_rtx_p (x, mode)
12011 && aarch64_float_const_rtx_p (x))
12012 {
12013 unsigned HOST_WIDE_INT ival;
12014 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12015 gcc_assert (succeed);
12016
77e994c9
RS
12017 scalar_int_mode imode = (mode == HFmode
12018 ? SImode
12019 : int_mode_for_mode (mode).require ());
a2170965
TC
12020 int ncost = aarch64_internal_mov_immediate
12021 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12022 *cost += COSTS_N_INSNS (ncost);
12023 return true;
12024 }
12025
9dfc162c
JG
12026 if (speed)
12027 {
12028 /* mov[df,sf]_aarch64. */
12029 if (aarch64_float_const_representable_p (x))
12030 /* FMOV (scalar immediate). */
12031 *cost += extra_cost->fp[mode == DFmode].fpconst;
12032 else if (!aarch64_float_const_zero_rtx_p (x))
12033 {
12034 /* This will be a load from memory. */
12035 if (mode == DFmode)
12036 *cost += extra_cost->ldst.loadd;
12037 else
12038 *cost += extra_cost->ldst.loadf;
12039 }
12040 else
12041 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12042 or MOV v0.s[0], wzr - neither of which are modeled by the
12043 cost tables. Just use the default cost. */
12044 {
12045 }
12046 }
12047
12048 return true;
12049
43e9d192
IB
12050 case MEM:
12051 if (speed)
2961177e
JG
12052 {
12053 /* For loads we want the base cost of a load, plus an
12054 approximation for the additional cost of the addressing
12055 mode. */
12056 rtx address = XEXP (x, 0);
b6875aac
KV
12057 if (VECTOR_MODE_P (mode))
12058 *cost += extra_cost->ldst.loadv;
12059 else if (GET_MODE_CLASS (mode) == MODE_INT)
2961177e
JG
12060 *cost += extra_cost->ldst.load;
12061 else if (mode == SFmode)
12062 *cost += extra_cost->ldst.loadf;
12063 else if (mode == DFmode)
12064 *cost += extra_cost->ldst.loadd;
12065
12066 *cost +=
12067 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12068 0, speed));
12069 }
43e9d192
IB
12070
12071 return true;
12072
12073 case NEG:
4745e701
JG
12074 op0 = XEXP (x, 0);
12075
b6875aac
KV
12076 if (VECTOR_MODE_P (mode))
12077 {
12078 if (speed)
12079 {
12080 /* FNEG. */
12081 *cost += extra_cost->vect.alu;
12082 }
12083 return false;
12084 }
12085
e548c9df
AM
12086 if (GET_MODE_CLASS (mode) == MODE_INT)
12087 {
4745e701
JG
12088 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12089 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12090 {
12091 /* CSETM. */
e548c9df 12092 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
4745e701
JG
12093 return true;
12094 }
12095
12096 /* Cost this as SUB wzr, X. */
e548c9df 12097 op0 = CONST0_RTX (mode);
4745e701
JG
12098 op1 = XEXP (x, 0);
12099 goto cost_minus;
12100 }
12101
e548c9df 12102 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
4745e701
JG
12103 {
12104 /* Support (neg(fma...)) as a single instruction only if
12105 sign of zeros is unimportant. This matches the decision
12106 making in aarch64.md. */
12107 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12108 {
12109 /* FNMADD. */
e548c9df 12110 *cost = rtx_cost (op0, mode, NEG, 0, speed);
4745e701
JG
12111 return true;
12112 }
d318517d
SN
12113 if (GET_CODE (op0) == MULT)
12114 {
12115 /* FNMUL. */
12116 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12117 return true;
12118 }
4745e701
JG
12119 if (speed)
12120 /* FNEG. */
12121 *cost += extra_cost->fp[mode == DFmode].neg;
12122 return false;
12123 }
12124
12125 return false;
43e9d192 12126
781aeb73
KT
12127 case CLRSB:
12128 case CLZ:
12129 if (speed)
b6875aac
KV
12130 {
12131 if (VECTOR_MODE_P (mode))
12132 *cost += extra_cost->vect.alu;
12133 else
12134 *cost += extra_cost->alu.clz;
12135 }
781aeb73
KT
12136
12137 return false;
12138
5bfc8303
WD
12139 case CTZ:
12140 *cost = COSTS_N_INSNS (2);
12141
12142 if (speed)
12143 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12144 return false;
12145
43e9d192
IB
12146 case COMPARE:
12147 op0 = XEXP (x, 0);
12148 op1 = XEXP (x, 1);
12149
12150 if (op1 == const0_rtx
12151 && GET_CODE (op0) == AND)
12152 {
12153 x = op0;
e548c9df 12154 mode = GET_MODE (op0);
43e9d192
IB
12155 goto cost_logic;
12156 }
12157
a8eecd00
JG
12158 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12159 {
12160 /* TODO: A write to the CC flags possibly costs extra, this
12161 needs encoding in the cost tables. */
12162
e548c9df 12163 mode = GET_MODE (op0);
a8eecd00
JG
12164 /* ANDS. */
12165 if (GET_CODE (op0) == AND)
12166 {
12167 x = op0;
12168 goto cost_logic;
12169 }
12170
12171 if (GET_CODE (op0) == PLUS)
12172 {
12173 /* ADDS (and CMN alias). */
12174 x = op0;
12175 goto cost_plus;
12176 }
12177
12178 if (GET_CODE (op0) == MINUS)
12179 {
12180 /* SUBS. */
12181 x = op0;
12182 goto cost_minus;
12183 }
12184
345854d8
KT
12185 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12186 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12187 && CONST_INT_P (XEXP (op0, 2)))
12188 {
12189 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12190 Handle it here directly rather than going to cost_logic
12191 since we know the immediate generated for the TST is valid
12192 so we can avoid creating an intermediate rtx for it only
12193 for costing purposes. */
12194 if (speed)
12195 *cost += extra_cost->alu.logical;
12196
12197 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12198 ZERO_EXTRACT, 0, speed);
12199 return true;
12200 }
12201
a8eecd00
JG
12202 if (GET_CODE (op1) == NEG)
12203 {
12204 /* CMN. */
12205 if (speed)
12206 *cost += extra_cost->alu.arith;
12207
e548c9df
AM
12208 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12209 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
a8eecd00
JG
12210 return true;
12211 }
12212
12213 /* CMP.
12214
12215 Compare can freely swap the order of operands, and
12216 canonicalization puts the more complex operation first.
12217 But the integer MINUS logic expects the shift/extend
12218 operation in op1. */
12219 if (! (REG_P (op0)
12220 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12221 {
12222 op0 = XEXP (x, 1);
12223 op1 = XEXP (x, 0);
12224 }
12225 goto cost_minus;
12226 }
12227
12228 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12229 {
12230 /* FCMP. */
12231 if (speed)
12232 *cost += extra_cost->fp[mode == DFmode].compare;
12233
12234 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12235 {
e548c9df 12236 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
a8eecd00
JG
12237 /* FCMP supports constant 0.0 for no extra cost. */
12238 return true;
12239 }
12240 return false;
12241 }
12242
b6875aac
KV
12243 if (VECTOR_MODE_P (mode))
12244 {
12245 /* Vector compare. */
12246 if (speed)
12247 *cost += extra_cost->vect.alu;
12248
12249 if (aarch64_float_const_zero_rtx_p (op1))
12250 {
12251 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12252 cost. */
12253 return true;
12254 }
12255 return false;
12256 }
a8eecd00 12257 return false;
43e9d192
IB
12258
12259 case MINUS:
4745e701
JG
12260 {
12261 op0 = XEXP (x, 0);
12262 op1 = XEXP (x, 1);
12263
12264cost_minus:
e548c9df 12265 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
23cb6618 12266
4745e701
JG
12267 /* Detect valid immediates. */
12268 if ((GET_MODE_CLASS (mode) == MODE_INT
12269 || (GET_MODE_CLASS (mode) == MODE_CC
12270 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12271 && CONST_INT_P (op1)
12272 && aarch64_uimm12_shift (INTVAL (op1)))
12273 {
4745e701
JG
12274 if (speed)
12275 /* SUB(S) (immediate). */
12276 *cost += extra_cost->alu.arith;
12277 return true;
4745e701
JG
12278 }
12279
7cc2145f 12280 /* Look for SUB (extended register). */
77e994c9
RS
12281 if (is_a <scalar_int_mode> (mode, &int_mode)
12282 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
7cc2145f
JG
12283 {
12284 if (speed)
2533c820 12285 *cost += extra_cost->alu.extend_arith;
7cc2145f 12286
b10f1009 12287 op1 = aarch64_strip_extend (op1, true);
e47c4031 12288 *cost += rtx_cost (op1, VOIDmode,
e548c9df 12289 (enum rtx_code) GET_CODE (op1), 0, speed);
7cc2145f
JG
12290 return true;
12291 }
12292
b10f1009 12293 rtx new_op1 = aarch64_strip_extend (op1, false);
4745e701
JG
12294
12295 /* Cost this as an FMA-alike operation. */
12296 if ((GET_CODE (new_op1) == MULT
0a78ebe4 12297 || aarch64_shift_p (GET_CODE (new_op1)))
4745e701
JG
12298 && code != COMPARE)
12299 {
12300 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12301 (enum rtx_code) code,
12302 speed);
4745e701
JG
12303 return true;
12304 }
43e9d192 12305
e548c9df 12306 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
43e9d192 12307
4745e701
JG
12308 if (speed)
12309 {
b6875aac
KV
12310 if (VECTOR_MODE_P (mode))
12311 {
12312 /* Vector SUB. */
12313 *cost += extra_cost->vect.alu;
12314 }
12315 else if (GET_MODE_CLASS (mode) == MODE_INT)
12316 {
12317 /* SUB(S). */
12318 *cost += extra_cost->alu.arith;
12319 }
4745e701 12320 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
12321 {
12322 /* FSUB. */
12323 *cost += extra_cost->fp[mode == DFmode].addsub;
12324 }
4745e701
JG
12325 }
12326 return true;
12327 }
43e9d192
IB
12328
12329 case PLUS:
4745e701
JG
12330 {
12331 rtx new_op0;
43e9d192 12332
4745e701
JG
12333 op0 = XEXP (x, 0);
12334 op1 = XEXP (x, 1);
43e9d192 12335
a8eecd00 12336cost_plus:
4745e701
JG
12337 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12338 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12339 {
12340 /* CSINC. */
e548c9df
AM
12341 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12342 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
4745e701
JG
12343 return true;
12344 }
43e9d192 12345
4745e701 12346 if (GET_MODE_CLASS (mode) == MODE_INT
835d50c6 12347 && (aarch64_plus_immediate (op1, mode)
43cacb12 12348 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
4745e701 12349 {
e548c9df 12350 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
43e9d192 12351
4745e701
JG
12352 if (speed)
12353 /* ADD (immediate). */
12354 *cost += extra_cost->alu.arith;
12355 return true;
12356 }
12357
e548c9df 12358 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
23cb6618 12359
7cc2145f 12360 /* Look for ADD (extended register). */
77e994c9
RS
12361 if (is_a <scalar_int_mode> (mode, &int_mode)
12362 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
7cc2145f
JG
12363 {
12364 if (speed)
2533c820 12365 *cost += extra_cost->alu.extend_arith;
7cc2145f 12366
b10f1009 12367 op0 = aarch64_strip_extend (op0, true);
e47c4031 12368 *cost += rtx_cost (op0, VOIDmode,
e548c9df 12369 (enum rtx_code) GET_CODE (op0), 0, speed);
7cc2145f
JG
12370 return true;
12371 }
12372
4745e701
JG
12373 /* Strip any extend, leave shifts behind as we will
12374 cost them through mult_cost. */
b10f1009 12375 new_op0 = aarch64_strip_extend (op0, false);
4745e701
JG
12376
12377 if (GET_CODE (new_op0) == MULT
0a78ebe4 12378 || aarch64_shift_p (GET_CODE (new_op0)))
4745e701
JG
12379 {
12380 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12381 speed);
4745e701
JG
12382 return true;
12383 }
12384
e548c9df 12385 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
4745e701
JG
12386
12387 if (speed)
12388 {
b6875aac
KV
12389 if (VECTOR_MODE_P (mode))
12390 {
12391 /* Vector ADD. */
12392 *cost += extra_cost->vect.alu;
12393 }
12394 else if (GET_MODE_CLASS (mode) == MODE_INT)
12395 {
12396 /* ADD. */
12397 *cost += extra_cost->alu.arith;
12398 }
4745e701 12399 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b6875aac
KV
12400 {
12401 /* FADD. */
12402 *cost += extra_cost->fp[mode == DFmode].addsub;
12403 }
4745e701
JG
12404 }
12405 return true;
12406 }
43e9d192 12407
18b42b2a
KT
12408 case BSWAP:
12409 *cost = COSTS_N_INSNS (1);
12410
12411 if (speed)
b6875aac
KV
12412 {
12413 if (VECTOR_MODE_P (mode))
12414 *cost += extra_cost->vect.alu;
12415 else
12416 *cost += extra_cost->alu.rev;
12417 }
18b42b2a
KT
12418 return false;
12419
43e9d192 12420 case IOR:
f7d5cf8d
KT
12421 if (aarch_rev16_p (x))
12422 {
12423 *cost = COSTS_N_INSNS (1);
12424
b6875aac
KV
12425 if (speed)
12426 {
12427 if (VECTOR_MODE_P (mode))
12428 *cost += extra_cost->vect.alu;
12429 else
12430 *cost += extra_cost->alu.rev;
12431 }
12432 return true;
f7d5cf8d 12433 }
fb0cb7fa
KT
12434
12435 if (aarch64_extr_rtx_p (x, &op0, &op1))
12436 {
e548c9df
AM
12437 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12438 *cost += rtx_cost (op1, mode, IOR, 1, speed);
fb0cb7fa
KT
12439 if (speed)
12440 *cost += extra_cost->alu.shift;
12441
12442 return true;
12443 }
f7d5cf8d 12444 /* Fall through. */
43e9d192
IB
12445 case XOR:
12446 case AND:
12447 cost_logic:
12448 op0 = XEXP (x, 0);
12449 op1 = XEXP (x, 1);
12450
b6875aac
KV
12451 if (VECTOR_MODE_P (mode))
12452 {
12453 if (speed)
12454 *cost += extra_cost->vect.alu;
12455 return true;
12456 }
12457
268c3b47
JG
12458 if (code == AND
12459 && GET_CODE (op0) == MULT
12460 && CONST_INT_P (XEXP (op0, 1))
12461 && CONST_INT_P (op1)
12462 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12463 INTVAL (op1)) != 0)
12464 {
12465 /* This is a UBFM/SBFM. */
e548c9df 12466 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
268c3b47
JG
12467 if (speed)
12468 *cost += extra_cost->alu.bfx;
12469 return true;
12470 }
12471
b4206259 12472 if (is_int_mode (mode, &int_mode))
43e9d192 12473 {
8c83f71d 12474 if (CONST_INT_P (op1))
43e9d192 12475 {
8c83f71d
KT
12476 /* We have a mask + shift version of a UBFIZ
12477 i.e. the *andim_ashift<mode>_bfiz pattern. */
12478 if (GET_CODE (op0) == ASHIFT
b4206259
RS
12479 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12480 XEXP (op0, 1)))
8c83f71d 12481 {
b4206259 12482 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8c83f71d
KT
12483 (enum rtx_code) code, 0, speed);
12484 if (speed)
12485 *cost += extra_cost->alu.bfx;
268c3b47 12486
8c83f71d
KT
12487 return true;
12488 }
b4206259 12489 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8c83f71d
KT
12490 {
12491 /* We possibly get the immediate for free; this is not
12492    modelled.  */
b4206259
RS
12493 *cost += rtx_cost (op0, int_mode,
12494 (enum rtx_code) code, 0, speed);
8c83f71d
KT
12495 if (speed)
12496 *cost += extra_cost->alu.logical;
268c3b47 12497
8c83f71d
KT
12498 return true;
12499 }
43e9d192
IB
12500 }
12501 else
12502 {
268c3b47
JG
12503 rtx new_op0 = op0;
12504
12505 /* Handle ORN, EON, or BIC. */
43e9d192
IB
12506 if (GET_CODE (op0) == NOT)
12507 op0 = XEXP (op0, 0);
268c3b47
JG
12508
12509 new_op0 = aarch64_strip_shift (op0);
12510
12511 /* If we had a shift on op0 then this is a logical-shift-
12512 by-register/immediate operation. Otherwise, this is just
12513 a logical operation. */
12514 if (speed)
12515 {
12516 if (new_op0 != op0)
12517 {
12518 /* Shift by immediate. */
12519 if (CONST_INT_P (XEXP (op0, 1)))
12520 *cost += extra_cost->alu.log_shift;
12521 else
12522 *cost += extra_cost->alu.log_shift_reg;
12523 }
12524 else
12525 *cost += extra_cost->alu.logical;
12526 }
12527
12528 /* In both cases we want to cost both operands. */
b4206259
RS
12529 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12530 0, speed);
12531 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12532 1, speed);
268c3b47
JG
12533
12534 return true;
43e9d192 12535 }
43e9d192
IB
12536 }
12537 return false;
12538
268c3b47 12539 case NOT:
6365da9e
KT
12540 x = XEXP (x, 0);
12541 op0 = aarch64_strip_shift (x);
12542
b6875aac
KV
12543 if (VECTOR_MODE_P (mode))
12544 {
12545 /* Vector NOT. */
12546 *cost += extra_cost->vect.alu;
12547 return false;
12548 }
12549
6365da9e
KT
12550 /* MVN-shifted-reg. */
12551 if (op0 != x)
12552 {
e548c9df 12553 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6365da9e
KT
12554
12555 if (speed)
12556 *cost += extra_cost->alu.log_shift;
12557
12558 return true;
12559 }
12560 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12561 Handle the second form here taking care that 'a' in the above can
12562 be a shift. */
12563 else if (GET_CODE (op0) == XOR)
12564 {
12565 rtx newop0 = XEXP (op0, 0);
12566 rtx newop1 = XEXP (op0, 1);
12567 rtx op0_stripped = aarch64_strip_shift (newop0);
12568
e548c9df
AM
12569 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12570 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6365da9e
KT
12571
12572 if (speed)
12573 {
12574 if (op0_stripped != newop0)
12575 *cost += extra_cost->alu.log_shift;
12576 else
12577 *cost += extra_cost->alu.logical;
12578 }
12579
12580 return true;
12581 }
268c3b47
JG
12582 /* MVN. */
12583 if (speed)
12584 *cost += extra_cost->alu.logical;
12585
268c3b47
JG
12586 return false;
12587
43e9d192 12588 case ZERO_EXTEND:
b1685e62
JG
12589
12590 op0 = XEXP (x, 0);
12591 /* If a value is written in SI mode, then zero extended to DI
12592 mode, the operation will in general be free as a write to
12593 a 'w' register implicitly zeroes the upper bits of an 'x'
12594 register. However, if this is
12595
12596 (set (reg) (zero_extend (reg)))
12597
12598 we must cost the explicit register move. */
12599 if (mode == DImode
12600 && GET_MODE (op0) == SImode
12601 && outer == SET)
12602 {
e548c9df 12603 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
b1685e62 12604
dde23f43
KM
12605 /* If OP_COST is non-zero, then the cost of the zero extend
12606 is effectively the cost of the inner operation. Otherwise
12607 we have a MOV instruction and we take the cost from the MOV
12608 itself. This is true independently of whether we are
12609 optimizing for space or time. */
12610 if (op_cost)
b1685e62
JG
12611 *cost = op_cost;
12612
12613 return true;
12614 }
e548c9df 12615 else if (MEM_P (op0))
43e9d192 12616 {
b1685e62 12617 /* All loads can zero extend to any size for free. */
e548c9df 12618 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
43e9d192
IB
12619 return true;
12620 }
b1685e62 12621
283b6c85
KT
12622 op0 = aarch64_extend_bitfield_pattern_p (x);
12623 if (op0)
12624 {
12625 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12626 if (speed)
12627 *cost += extra_cost->alu.bfx;
12628 return true;
12629 }
12630
b1685e62 12631 if (speed)
b6875aac
KV
12632 {
12633 if (VECTOR_MODE_P (mode))
12634 {
12635 /* UMOV. */
12636 *cost += extra_cost->vect.alu;
12637 }
12638 else
12639 {
63715e5e
WD
12640 /* We generate an AND instead of UXTB/UXTH. */
12641 *cost += extra_cost->alu.logical;
b6875aac
KV
12642 }
12643 }
43e9d192
IB
12644 return false;
12645
12646 case SIGN_EXTEND:
b1685e62 12647 if (MEM_P (XEXP (x, 0)))
43e9d192 12648 {
b1685e62
JG
12649 /* LDRSH. */
12650 if (speed)
12651 {
12652 rtx address = XEXP (XEXP (x, 0), 0);
12653 *cost += extra_cost->ldst.load_sign_extend;
12654
12655 *cost +=
12656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12657 0, speed));
12658 }
43e9d192
IB
12659 return true;
12660 }
b1685e62 12661
283b6c85
KT
12662 op0 = aarch64_extend_bitfield_pattern_p (x);
12663 if (op0)
12664 {
12665 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12666 if (speed)
12667 *cost += extra_cost->alu.bfx;
12668 return true;
12669 }
12670
b1685e62 12671 if (speed)
b6875aac
KV
12672 {
12673 if (VECTOR_MODE_P (mode))
12674 *cost += extra_cost->vect.alu;
12675 else
12676 *cost += extra_cost->alu.extend;
12677 }
43e9d192
IB
12678 return false;
12679
ba0cfa17
JG
12680 case ASHIFT:
12681 op0 = XEXP (x, 0);
12682 op1 = XEXP (x, 1);
12683
12684 if (CONST_INT_P (op1))
12685 {
ba0cfa17 12686 if (speed)
b6875aac
KV
12687 {
12688 if (VECTOR_MODE_P (mode))
12689 {
12690 /* Vector shift (immediate). */
12691 *cost += extra_cost->vect.alu;
12692 }
12693 else
12694 {
12695 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
12696    aliases. */
12697 *cost += extra_cost->alu.shift;
12698 }
12699 }
ba0cfa17
JG
12700
12701 /* We can incorporate zero/sign extend for free. */
12702 if (GET_CODE (op0) == ZERO_EXTEND
12703 || GET_CODE (op0) == SIGN_EXTEND)
12704 op0 = XEXP (op0, 0);
12705
e548c9df 12706 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
ba0cfa17
JG
12707 return true;
12708 }
12709 else
12710 {
7813b280 12711 if (VECTOR_MODE_P (mode))
b6875aac 12712 {
7813b280
KT
12713 if (speed)
12714 /* Vector shift (register). */
12715 *cost += extra_cost->vect.alu;
12716 }
12717 else
12718 {
12719 if (speed)
12720 /* LSLV. */
12721 *cost += extra_cost->alu.shift_reg;
12722
12723 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12724 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
12725 && known_eq (INTVAL (XEXP (op1, 1)),
12726 GET_MODE_BITSIZE (mode) - 1))
b6875aac 12727 {
7813b280
KT
12728 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12729 /* We already demanded XEXP (op1, 0) to be REG_P, so
12730 don't recurse into it. */
12731 return true;
b6875aac
KV
12732 }
12733 }
ba0cfa17
JG
12734 return false; /* All arguments need to be in registers. */
12735 }
12736
43e9d192 12737 case ROTATE:
43e9d192
IB
12738 case ROTATERT:
12739 case LSHIFTRT:
43e9d192 12740 case ASHIFTRT:
ba0cfa17
JG
12741 op0 = XEXP (x, 0);
12742 op1 = XEXP (x, 1);
43e9d192 12743
ba0cfa17
JG
12744 if (CONST_INT_P (op1))
12745 {
12746 /* ASR (immediate) and friends. */
12747 if (speed)
b6875aac
KV
12748 {
12749 if (VECTOR_MODE_P (mode))
12750 *cost += extra_cost->vect.alu;
12751 else
12752 *cost += extra_cost->alu.shift;
12753 }
43e9d192 12754
e548c9df 12755 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
ba0cfa17
JG
12756 return true;
12757 }
12758 else
12759 {
7813b280 12760 if (VECTOR_MODE_P (mode))
b6875aac 12761 {
7813b280
KT
12762 if (speed)
12763 /* Vector shift (register). */
b6875aac 12764 *cost += extra_cost->vect.alu;
7813b280
KT
12765 }
12766 else
12767 {
12768 if (speed)
12769 /* ASR (register) and friends. */
b6875aac 12770 *cost += extra_cost->alu.shift_reg;
7813b280
KT
12771
12772 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12773 && CONST_INT_P (XEXP (op1, 1))
6a70badb
RS
12774 && known_eq (INTVAL (XEXP (op1, 1)),
12775 GET_MODE_BITSIZE (mode) - 1))
7813b280
KT
12776 {
12777 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12778 /* We already demanded XEXP (op1, 0) to be REG_P, so
12779 don't recurse into it. */
12780 return true;
12781 }
b6875aac 12782 }
ba0cfa17
JG
12783 return false; /* All arguments need to be in registers. */
12784 }
43e9d192 12785
909734be
JG
12786 case SYMBOL_REF:
12787
1b1e81f8
JW
12788 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12789 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
909734be
JG
12790 {
12791 /* LDR. */
12792 if (speed)
12793 *cost += extra_cost->ldst.load;
12794 }
12795 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12796 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12797 {
12798 /* ADRP, followed by ADD. */
12799 *cost += COSTS_N_INSNS (1);
12800 if (speed)
12801 *cost += 2 * extra_cost->alu.arith;
12802 }
12803 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12804 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12805 {
12806 /* ADR. */
12807 if (speed)
12808 *cost += extra_cost->alu.arith;
12809 }
12810
12811 if (flag_pic)
12812 {
12813 /* One extra load instruction, after accessing the GOT. */
12814 *cost += COSTS_N_INSNS (1);
12815 if (speed)
12816 *cost += extra_cost->ldst.load;
12817 }
43e9d192
IB
12818 return true;
12819
909734be 12820 case HIGH:
43e9d192 12821 case LO_SUM:
909734be
JG
12822 /* ADRP/ADD (immediate). */
12823 if (speed)
12824 *cost += extra_cost->alu.arith;
43e9d192
IB
12825 return true;
12826
12827 case ZERO_EXTRACT:
12828 case SIGN_EXTRACT:
7cc2145f
JG
12829 /* UBFX/SBFX. */
12830 if (speed)
b6875aac
KV
12831 {
12832 if (VECTOR_MODE_P (mode))
12833 *cost += extra_cost->vect.alu;
12834 else
12835 *cost += extra_cost->alu.bfx;
12836 }
7cc2145f
JG
12837
12838 /* We can trust that the immediates used will be correct (there
12839 are no by-register forms), so we need only cost op0. */
e548c9df 12840 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
43e9d192
IB
12841 return true;
12842
12843 case MULT:
4745e701
JG
12844 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12845 /* aarch64_rtx_mult_cost always handles recursion to its
12846 operands. */
12847 return true;
43e9d192
IB
12848
12849 case MOD:
4f58fe36
KT
12850 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12851 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12852 an unconditional negate. This case should only ever be reached through
12853 the set_smod_pow2_cheap check in expmed.c. */
12854 if (CONST_INT_P (XEXP (x, 1))
12855 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12856 && (mode == SImode || mode == DImode))
12857 {
12858 /* We expand to 4 instructions. Reset the baseline. */
12859 *cost = COSTS_N_INSNS (4);
12860
12861 if (speed)
12862 *cost += 2 * extra_cost->alu.logical
12863 + 2 * extra_cost->alu.arith;
12864
12865 return true;
12866 }
12867
12868 /* Fall-through. */
43e9d192 12869 case UMOD:
43e9d192
IB
12870 if (speed)
12871 {
cb9ac430 12872 /* Slightly prefer UMOD over SMOD. */
b6875aac
KV
12873 if (VECTOR_MODE_P (mode))
12874 *cost += extra_cost->vect.alu;
e548c9df
AM
12875 else if (GET_MODE_CLASS (mode) == MODE_INT)
12876 *cost += (extra_cost->mult[mode == DImode].add
cb9ac430
TC
12877 + extra_cost->mult[mode == DImode].idiv
12878 + (code == MOD ? 1 : 0));
43e9d192
IB
12879 }
12880 return false; /* All arguments need to be in registers. */
12881
12882 case DIV:
12883 case UDIV:
4105fe38 12884 case SQRT:
43e9d192
IB
12885 if (speed)
12886 {
b6875aac
KV
12887 if (VECTOR_MODE_P (mode))
12888 *cost += extra_cost->vect.alu;
12889 else if (GET_MODE_CLASS (mode) == MODE_INT)
4105fe38
JG
12890 /* There is no integer SQRT, so only DIV and UDIV can get
12891 here. */
cb9ac430
TC
12892 *cost += (extra_cost->mult[mode == DImode].idiv
12893 /* Slightly prefer UDIV over SDIV. */
12894 + (code == DIV ? 1 : 0));
4105fe38
JG
12895 else
12896 *cost += extra_cost->fp[mode == DFmode].div;
43e9d192
IB
12897 }
12898 return false; /* All arguments need to be in registers. */
12899
a8eecd00 12900 case IF_THEN_ELSE:
2d5ffe46
AP
12901 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12902 XEXP (x, 2), cost, speed);
a8eecd00
JG
12903
12904 case EQ:
12905 case NE:
12906 case GT:
12907 case GTU:
12908 case LT:
12909 case LTU:
12910 case GE:
12911 case GEU:
12912 case LE:
12913 case LEU:
12914
12915 return false; /* All arguments must be in registers. */
12916
b292109f
JG
12917 case FMA:
12918 op0 = XEXP (x, 0);
12919 op1 = XEXP (x, 1);
12920 op2 = XEXP (x, 2);
12921
12922 if (speed)
b6875aac
KV
12923 {
12924 if (VECTOR_MODE_P (mode))
12925 *cost += extra_cost->vect.alu;
12926 else
12927 *cost += extra_cost->fp[mode == DFmode].fma;
12928 }
b292109f
JG
12929
12930 /* FMSUB, FNMADD, and FNMSUB are free. */
12931 if (GET_CODE (op0) == NEG)
12932 op0 = XEXP (op0, 0);
12933
12934 if (GET_CODE (op2) == NEG)
12935 op2 = XEXP (op2, 0);
12936
12937 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12938 and the by-element operand as operand 0. */
12939 if (GET_CODE (op1) == NEG)
12940 op1 = XEXP (op1, 0);
12941
12942 /* Catch vector-by-element operations. The by-element operand can
12943 either be (vec_duplicate (vec_select (x))) or just
12944 (vec_select (x)), depending on whether we are multiplying by
12945 a vector or a scalar.
12946
12947 Canonicalization is not very good in these cases, FMA4 will put the
12948 by-element operand as operand 0, FNMA4 will have it as operand 1. */
12949 if (GET_CODE (op0) == VEC_DUPLICATE)
12950 op0 = XEXP (op0, 0);
12951 else if (GET_CODE (op1) == VEC_DUPLICATE)
12952 op1 = XEXP (op1, 0);
12953
12954 if (GET_CODE (op0) == VEC_SELECT)
12955 op0 = XEXP (op0, 0);
12956 else if (GET_CODE (op1) == VEC_SELECT)
12957 op1 = XEXP (op1, 0);
12958
12959 /* If the remaining parameters are not registers,
12960 get the cost to put them into registers. */
e548c9df
AM
12961 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12962 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12963 *cost += rtx_cost (op2, mode, FMA, 2, speed);
b292109f
JG
12964 return true;
12965
5e2a765b
KT
12966 case FLOAT:
12967 case UNSIGNED_FLOAT:
12968 if (speed)
12969 *cost += extra_cost->fp[mode == DFmode].fromint;
12970 return false;
12971
b292109f
JG
12972 case FLOAT_EXTEND:
12973 if (speed)
b6875aac
KV
12974 {
12975 if (VECTOR_MODE_P (mode))
12976 {
12977 /* Vector widening conversion. */
12978 *cost += extra_cost->vect.alu;
12979 }
12980 else
12981 *cost += extra_cost->fp[mode == DFmode].widen;
12982 }
b292109f
JG
12983 return false;
12984
12985 case FLOAT_TRUNCATE:
12986 if (speed)
b6875aac
KV
12987 {
12988 if (VECTOR_MODE_P (mode))
12989 {
12990 /* Vector narrowing conversion. */
12991 *cost += extra_cost->vect.alu;
12992 }
12993 else
12994 *cost += extra_cost->fp[mode == DFmode].narrow;
12995 }
b292109f
JG
12996 return false;
12997
61263118
KT
12998 case FIX:
12999 case UNSIGNED_FIX:
13000 x = XEXP (x, 0);
13001 /* Strip the rounding part. They will all be implemented
13002 by the fcvt* family of instructions anyway. */
13003 if (GET_CODE (x) == UNSPEC)
13004 {
13005 unsigned int uns_code = XINT (x, 1);
13006
13007 if (uns_code == UNSPEC_FRINTA
13008 || uns_code == UNSPEC_FRINTM
13009 || uns_code == UNSPEC_FRINTN
13010 || uns_code == UNSPEC_FRINTP
13011 || uns_code == UNSPEC_FRINTZ)
13012 x = XVECEXP (x, 0, 0);
13013 }
13014
13015 if (speed)
b6875aac
KV
13016 {
13017 if (VECTOR_MODE_P (mode))
13018 *cost += extra_cost->vect.alu;
13019 else
13020 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13021 }
39252973
KT
13022
13023 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13024 fixed-point fcvt. */
13025 if (GET_CODE (x) == MULT
13026 && ((VECTOR_MODE_P (mode)
13027 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13028 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13029 {
13030 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13031 0, speed);
13032 return true;
13033 }
13034
e548c9df 13035 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
61263118
KT
13036 return true;
13037
b292109f 13038 case ABS:
b6875aac
KV
13039 if (VECTOR_MODE_P (mode))
13040 {
13041 /* ABS (vector). */
13042 if (speed)
13043 *cost += extra_cost->vect.alu;
13044 }
13045 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
b292109f 13046 {
19261b99
KT
13047 op0 = XEXP (x, 0);
13048
13049 /* FABD, which is analogous to FADD. */
13050 if (GET_CODE (op0) == MINUS)
13051 {
e548c9df
AM
13052 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13053 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
19261b99
KT
13054 if (speed)
13055 *cost += extra_cost->fp[mode == DFmode].addsub;
13056
13057 return true;
13058 }
13059 /* Simple FABS is analogous to FNEG. */
b292109f
JG
13060 if (speed)
13061 *cost += extra_cost->fp[mode == DFmode].neg;
13062 }
13063 else
13064 {
13065 /* Integer ABS will either be split to
13066 two arithmetic instructions, or will be an ABS
13067 (scalar), which we don't model. */
13068 *cost = COSTS_N_INSNS (2);
13069 if (speed)
13070 *cost += 2 * extra_cost->alu.arith;
13071 }
13072 return false;
13073
13074 case SMAX:
13075 case SMIN:
13076 if (speed)
13077 {
b6875aac
KV
13078 if (VECTOR_MODE_P (mode))
13079 *cost += extra_cost->vect.alu;
13080 else
13081 {
13082 /* FMAXNM/FMINNM/FMAX/FMIN.
13083 TODO: This may not be accurate for all implementations, but
13084 we do not model this in the cost tables. */
13085 *cost += extra_cost->fp[mode == DFmode].addsub;
13086 }
b292109f
JG
13087 }
13088 return false;
13089
61263118
KT
13090 case UNSPEC:
13091 /* The floating point round to integer frint* instructions. */
13092 if (aarch64_frint_unspec_p (XINT (x, 1)))
13093 {
13094 if (speed)
13095 *cost += extra_cost->fp[mode == DFmode].roundint;
13096
13097 return false;
13098 }
781aeb73
KT
13099
13100 if (XINT (x, 1) == UNSPEC_RBIT)
13101 {
13102 if (speed)
13103 *cost += extra_cost->alu.rev;
13104
13105 return false;
13106 }
61263118
KT
13107 break;
13108
fb620c4a
JG
13109 case TRUNCATE:
13110
13111 /* Decompose <su>muldi3_highpart. */
13112 if (/* (truncate:DI */
13113 mode == DImode
13114 /* (lshiftrt:TI */
13115 && GET_MODE (XEXP (x, 0)) == TImode
13116 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13117 /* (mult:TI */
13118 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13119 /* (ANY_EXTEND:TI (reg:DI))
13120 (ANY_EXTEND:TI (reg:DI))) */
13121 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13122 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13123 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13124 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13125 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13126 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13127 /* (const_int 64) */
13128 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13129 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13130 {
13131 /* UMULH/SMULH. */
13132 if (speed)
13133 *cost += extra_cost->mult[mode == DImode].extend;
e548c9df
AM
13134 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13135 mode, MULT, 0, speed);
13136 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13137 mode, MULT, 1, speed);
fb620c4a
JG
13138 return true;
13139 }
13140
13141 /* Fall through. */
43e9d192 13142 default:
61263118 13143 break;
43e9d192 13144 }
61263118 13145
c10e3d7f
AP
13146 if (dump_file
13147 && flag_aarch64_verbose_cost)
61263118
KT
13148 fprintf (dump_file,
13149 "\nFailed to cost RTX. Assuming default cost.\n");
13150
13151 return true;
43e9d192
IB
13152}
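
/* Editorial sketch (not part of GCC): a plain-C illustration of the
   branchless signed "x % 2^k" sequence whose four-instruction cost
   (NEGS, two ANDs, CSNEG) is modelled in the MOD case above.  The helper
   name is hypothetical.  */

static inline int
smod_pow2_sketch (int x, unsigned int k)
{
  unsigned int mask = (1u << k) - 1;
  unsigned int pos = (unsigned int) x & mask;	     /* remainder if x >= 0 */
  unsigned int neg = (0u - (unsigned int) x) & mask; /* |remainder| if x < 0 */
  /* The final select-and-negate is what a single CSNEG performs.  */
  return x >= 0 ? (int) pos : -(int) neg;
}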
13153
0ee859b5
JG
13154/* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
13155 calculated for X. This cost is stored in *COST. Returns true
13156 if the total cost of X was calculated. */
13157static bool
e548c9df 13158aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
0ee859b5
JG
13159 int param, int *cost, bool speed)
13160{
e548c9df 13161 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
0ee859b5 13162
c10e3d7f
AP
13163 if (dump_file
13164 && flag_aarch64_verbose_cost)
0ee859b5
JG
13165 {
13166 print_rtl_single (dump_file, x);
13167 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13168 speed ? "Hot" : "Cold",
13169 *cost, result ? "final" : "partial");
13170 }
13171
13172 return result;
13173}
13174
43e9d192 13175static int
ef4bddc2 13176aarch64_register_move_cost (machine_mode mode,
8a3a7e67 13177 reg_class_t from_i, reg_class_t to_i)
43e9d192 13178{
8a3a7e67
RH
13179 enum reg_class from = (enum reg_class) from_i;
13180 enum reg_class to = (enum reg_class) to_i;
43e9d192 13181 const struct cpu_regmove_cost *regmove_cost
b175b679 13182 = aarch64_tune_params.regmove_cost;
43e9d192 13183
3be07662 13184 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
96b7f495
MM
13185 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13186 || to == STUB_REGS)
3be07662
WD
13187 to = GENERAL_REGS;
13188
96b7f495
MM
13189 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13190 || from == STUB_REGS)
3be07662
WD
13191 from = GENERAL_REGS;
13192
183bfdaf
RS
13193 /* Make RDFFR very expensive. In particular, if we know that the FFR
13194 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13195 as a way of obtaining a PTRUE. */
13196 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13197 && hard_reg_set_subset_p (reg_class_contents[from_i],
13198 reg_class_contents[FFR_REGS]))
13199 return 80;
13200
6ee70f81
AP
13201 /* Moving between GPR and stack cost is the same as GP2GP. */
13202 if ((from == GENERAL_REGS && to == STACK_REG)
13203 || (to == GENERAL_REGS && from == STACK_REG))
13204 return regmove_cost->GP2GP;
13205
13206 /* To/From the stack register, we move via the gprs. */
13207 if (to == STACK_REG || from == STACK_REG)
13208 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13209 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13210
6a70badb 13211 if (known_eq (GET_MODE_SIZE (mode), 16))
8919453c
WD
13212 {
13213 /* 128-bit operations on general registers require 2 instructions. */
13214 if (from == GENERAL_REGS && to == GENERAL_REGS)
13215 return regmove_cost->GP2GP * 2;
13216 else if (from == GENERAL_REGS)
13217 return regmove_cost->GP2FP * 2;
13218 else if (to == GENERAL_REGS)
13219 return regmove_cost->FP2GP * 2;
13220
13221 /* When AdvSIMD instructions are disabled it is not possible to move
13222 a 128-bit value directly between Q registers. This is handled in
13223 secondary reload. A general register is used as a scratch to move
13224 the upper DI value and the lower DI value is moved directly,
13225 hence the cost is the sum of three moves. */
13226 if (! TARGET_SIMD)
13227 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13228
13229 return regmove_cost->FP2FP;
13230 }
13231
43e9d192
IB
13232 if (from == GENERAL_REGS && to == GENERAL_REGS)
13233 return regmove_cost->GP2GP;
13234 else if (from == GENERAL_REGS)
13235 return regmove_cost->GP2FP;
13236 else if (to == GENERAL_REGS)
13237 return regmove_cost->FP2GP;
13238
43e9d192
IB
13239 return regmove_cost->FP2FP;
13240}
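
/* Editorial worked example (illustrative only, with hypothetical tuning
   numbers): if GP2GP = 1, GP2FP = 2, FP2GP = 3 and FP2FP = 2, then a
   16-byte move between general registers costs 2 * GP2GP = 2, a 16-byte
   GP->FP move costs 2 * GP2FP = 4, and with !TARGET_SIMD a 16-byte
   FP->FP move costs GP2FP + FP2GP + FP2FP = 7, matching the three-move
   secondary-reload sequence described above.  */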
13241
13242static int
ef4bddc2 13243aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
43e9d192
IB
13244 reg_class_t rclass ATTRIBUTE_UNUSED,
13245 bool in ATTRIBUTE_UNUSED)
13246{
b175b679 13247 return aarch64_tune_params.memmov_cost;
43e9d192
IB
13248}
13249
6d4d616a
RS
13250/* Implement TARGET_INIT_BUILTINS. */
13251static void
13252aarch64_init_builtins ()
13253{
13254 aarch64_general_init_builtins ();
624d0f07 13255 aarch64_sve::init_builtins ();
6d4d616a
RS
13256}
13257
13258/* Implement TARGET_FOLD_BUILTIN. */
13259static tree
13260aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13261{
13262 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13263 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13264 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13265 switch (code & AARCH64_BUILTIN_CLASS)
13266 {
13267 case AARCH64_BUILTIN_GENERAL:
13268 return aarch64_general_fold_builtin (subcode, type, nargs, args);
624d0f07
RS
13269
13270 case AARCH64_BUILTIN_SVE:
13271 return NULL_TREE;
6d4d616a
RS
13272 }
13273 gcc_unreachable ();
13274}
13275
13276/* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13277static bool
13278aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13279{
13280 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13281 tree fndecl = gimple_call_fndecl (stmt);
13282 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13283 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13284 gimple *new_stmt = NULL;
13285 switch (code & AARCH64_BUILTIN_CLASS)
13286 {
13287 case AARCH64_BUILTIN_GENERAL:
13288 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13289 break;
624d0f07
RS
13290
13291 case AARCH64_BUILTIN_SVE:
13292 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13293 break;
6d4d616a
RS
13294 }
13295
13296 if (!new_stmt)
13297 return false;
13298
13299 gsi_replace (gsi, new_stmt, true);
13300 return true;
13301}
13302
13303/* Implement TARGET_EXPAND_BUILTIN. */
13304static rtx
c5dc215d 13305aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
6d4d616a
RS
13306{
13307 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13308 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13309 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13310 switch (code & AARCH64_BUILTIN_CLASS)
13311 {
13312 case AARCH64_BUILTIN_GENERAL:
c5dc215d 13313 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
624d0f07
RS
13314
13315 case AARCH64_BUILTIN_SVE:
13316 return aarch64_sve::expand_builtin (subcode, exp, target);
6d4d616a
RS
13317 }
13318 gcc_unreachable ();
13319}
13320
13321/* Implement TARGET_BUILTIN_DECL. */
13322static tree
13323aarch64_builtin_decl (unsigned int code, bool initialize_p)
13324{
13325 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13326 switch (code & AARCH64_BUILTIN_CLASS)
13327 {
13328 case AARCH64_BUILTIN_GENERAL:
13329 return aarch64_general_builtin_decl (subcode, initialize_p);
624d0f07
RS
13330
13331 case AARCH64_BUILTIN_SVE:
13332 return aarch64_sve::builtin_decl (subcode, initialize_p);
6d4d616a
RS
13333 }
13334 gcc_unreachable ();
13335}
13336
0c30e0f3
EM
13337/* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13338 to optimize 1.0/sqrt. */
ee62a5a6
RS
13339
13340static bool
9acc9cbe 13341use_rsqrt_p (machine_mode mode)
ee62a5a6
RS
13342{
13343 return (!flag_trapping_math
13344 && flag_unsafe_math_optimizations
9acc9cbe
EM
13345 && ((aarch64_tune_params.approx_modes->recip_sqrt
13346 & AARCH64_APPROX_MODE (mode))
1a33079e 13347 || flag_mrecip_low_precision_sqrt));
ee62a5a6
RS
13348}
13349
0c30e0f3
EM
13350/* Function to decide when to use the approximate reciprocal square root
13351 builtin. */
a6fc00da
BH
13352
13353static tree
ee62a5a6 13354aarch64_builtin_reciprocal (tree fndecl)
a6fc00da 13355{
9acc9cbe
EM
13356 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13357
13358 if (!use_rsqrt_p (mode))
a6fc00da 13359 return NULL_TREE;
6d4d616a
RS
13360 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13361 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13362 switch (code & AARCH64_BUILTIN_CLASS)
13363 {
13364 case AARCH64_BUILTIN_GENERAL:
13365 return aarch64_general_builtin_rsqrt (subcode);
624d0f07
RS
13366
13367 case AARCH64_BUILTIN_SVE:
13368 return NULL_TREE;
6d4d616a
RS
13369 }
13370 gcc_unreachable ();
a6fc00da
BH
13371}
13372
04f307cb
RS
13373/* Emit code to perform the floating-point operation:
13374
13375 DST = SRC1 * SRC2
13376
13377 where all three operands are already known to be registers.
13378 If the operation is an SVE one, PTRUE is a suitable all-true
13379 predicate. */
13380
13381static void
13382aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13383{
13384 if (ptrue)
13385 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13386 dst, ptrue, src1, src2,
13387 gen_int_mode (SVE_RELAXED_GP, SImode)));
13388 else
13389 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13390}
13391
98daafa0
EM
13392/* Emit instruction sequence to compute either the approximate square root
13393 or its approximate reciprocal, depending on the flag RECP, and return
13394 whether the sequence was emitted or not. */
a6fc00da 13395
98daafa0
EM
13396bool
13397aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
a6fc00da 13398{
98daafa0 13399 machine_mode mode = GET_MODE (dst);
daef0a8c
JW
13400
13401 if (GET_MODE_INNER (mode) == HFmode)
2e19adc8
RE
13402 {
13403 gcc_assert (!recp);
13404 return false;
13405 }
13406
2e19adc8
RE
13407 if (!recp)
13408 {
13409 if (!(flag_mlow_precision_sqrt
13410 || (aarch64_tune_params.approx_modes->sqrt
13411 & AARCH64_APPROX_MODE (mode))))
13412 return false;
13413
902d28bd 13414 if (!flag_finite_math_only
2e19adc8
RE
13415 || flag_trapping_math
13416 || !flag_unsafe_math_optimizations
13417 || optimize_function_for_size_p (cfun))
13418 return false;
13419 }
13420 else
13421 /* Caller assumes we cannot fail. */
13422 gcc_assert (use_rsqrt_p (mode));
daef0a8c 13423
a0ee8352
RS
13424 rtx pg = NULL_RTX;
13425 if (aarch64_sve_mode_p (mode))
13426 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
d7814449 13427 machine_mode mmsk = (VECTOR_MODE_P (mode)
d083ee47 13428 ? related_int_vector_mode (mode).require ()
d7814449 13429 : int_mode_for_mode (mode).require ());
0df28e68 13430 rtx xmsk = NULL_RTX;
98daafa0 13431 if (!recp)
0df28e68
RS
13432 {
13433 /* When calculating the approximate square root, compare the
13434 argument with 0.0 and create a mask. */
a0ee8352
RS
13435 rtx zero = CONST0_RTX (mode);
13436 if (pg)
13437 {
13438 xmsk = gen_reg_rtx (GET_MODE (pg));
13439 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13440 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13441 xmsk, pg, hint, src, zero));
13442 }
13443 else
13444 {
13445 xmsk = gen_reg_rtx (mmsk);
13446 emit_insn (gen_rtx_SET (xmsk,
13447 gen_rtx_NEG (mmsk,
13448 gen_rtx_EQ (mmsk, src, zero))));
13449 }
0df28e68 13450 }
a6fc00da 13451
98daafa0
EM
13452 /* Estimate the approximate reciprocal square root. */
13453 rtx xdst = gen_reg_rtx (mode);
0016d8d9 13454 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
a6fc00da 13455
98daafa0
EM
13456 /* Iterate over the series twice for SF and thrice for DF. */
13457 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
a6fc00da 13458
98daafa0
EM
13459 /* Optionally iterate over the series once less for faster performance,
13460 at the cost of some accuracy. */
13461 if ((recp && flag_mrecip_low_precision_sqrt)
13462 || (!recp && flag_mlow_precision_sqrt))
a6fc00da
BH
13463 iterations--;
13464
98daafa0
EM
13465 /* Iterate over the series to calculate the approximate reciprocal square
13466 root. */
13467 rtx x1 = gen_reg_rtx (mode);
13468 while (iterations--)
a6fc00da 13469 {
a6fc00da 13470 rtx x2 = gen_reg_rtx (mode);
a0ee8352 13471 aarch64_emit_mult (x2, pg, xdst, xdst);
98daafa0 13472
0016d8d9 13473 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
a6fc00da 13474
98daafa0 13475 if (iterations > 0)
a0ee8352 13476 aarch64_emit_mult (xdst, pg, xdst, x1);
98daafa0
EM
13477 }
13478
13479 if (!recp)
13480 {
a0ee8352
RS
13481 if (pg)
13482 /* Multiply nonzero source values by the corresponding intermediate
13483 result elements, so that the final calculation is the approximate
13484 square root rather than its reciprocal. Select a zero result for
13485 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13486 otherwise. */
13487 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13488 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13489 else
13490 {
13491 /* Qualify the approximate reciprocal square root when the
13492 argument is 0.0 by squashing the intermediate result to 0.0. */
13493 rtx xtmp = gen_reg_rtx (mmsk);
13494 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13495 gen_rtx_SUBREG (mmsk, xdst, 0)));
13496 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
a6fc00da 13497
a0ee8352
RS
13498 /* Calculate the approximate square root. */
13499 aarch64_emit_mult (xdst, pg, xdst, src);
13500 }
a6fc00da
BH
13501 }
13502
98daafa0 13503 /* Finalize the approximation. */
a0ee8352 13504 aarch64_emit_mult (dst, pg, xdst, x1);
98daafa0
EM
13505
13506 return true;
a6fc00da
BH
13507}
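
/* Editorial sketch (not GCC code): a scalar C model of the series emitted
   above.  The bit-twiddled seed merely stands in for the low-precision
   FRSQRTE estimate; each loop iteration mirrors one FRSQRTS step, which
   computes (3 - d * x * x) / 2 and is then multiplied back into the
   estimate.  Two iterations correspond to SFmode, three to DFmode.  */

static float
approx_rsqrt_sketch (float d, int iterations)
{
  union { float f; unsigned int u; } v = { d };
  v.u = 0x5f3759dfu - (v.u >> 1);	/* crude seed, stand-in for FRSQRTE */
  float x = v.f;

  while (iterations-- > 0)
    x = x * ((3.0f - d * x * x) * 0.5f); /* FRSQRTS step followed by FMUL */

  return x;				/* approximates 1 / sqrt (d) */
}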
13508
79a2bc2d
EM
13509/* Emit the instruction sequence to compute the approximation for the division
13510 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
13511
13512bool
13513aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13514{
13515 machine_mode mode = GET_MODE (quo);
33d72b63
JW
13516
13517 if (GET_MODE_INNER (mode) == HFmode)
13518 return false;
13519
79a2bc2d
EM
13520 bool use_approx_division_p = (flag_mlow_precision_div
13521 || (aarch64_tune_params.approx_modes->division
13522 & AARCH64_APPROX_MODE (mode)));
13523
13524 if (!flag_finite_math_only
13525 || flag_trapping_math
13526 || !flag_unsafe_math_optimizations
13527 || optimize_function_for_size_p (cfun)
13528 || !use_approx_division_p)
13529 return false;
13530
1be49a38
RR
13531 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13532 return false;
13533
04f307cb
RS
13534 rtx pg = NULL_RTX;
13535 if (aarch64_sve_mode_p (mode))
13536 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13537
79a2bc2d
EM
13538 /* Estimate the approximate reciprocal. */
13539 rtx xrcp = gen_reg_rtx (mode);
0016d8d9 13540 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
79a2bc2d
EM
13541
13542 /* Iterate over the series twice for SF and thrice for DF. */
13543 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
13544
dbf3dc75
BL
13545 /* Optionally iterate over the series less for faster performance,
13546 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
79a2bc2d 13547 if (flag_mlow_precision_div)
dbf3dc75
BL
13548 iterations = (GET_MODE_INNER (mode) == DFmode
13549 ? aarch64_double_recp_precision
13550 : aarch64_float_recp_precision);
79a2bc2d
EM
13551
13552 /* Iterate over the series to calculate the approximate reciprocal. */
13553 rtx xtmp = gen_reg_rtx (mode);
13554 while (iterations--)
13555 {
0016d8d9 13556 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
79a2bc2d
EM
13557
13558 if (iterations > 0)
04f307cb 13559 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
79a2bc2d
EM
13560 }
13561
13562 if (num != CONST1_RTX (mode))
13563 {
13564 /* As the approximate reciprocal of DEN is already calculated, only
13565 calculate the approximate division when NUM is not 1.0. */
13566 rtx xnum = force_reg (mode, num);
04f307cb 13567 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
79a2bc2d
EM
13568 }
13569
13570 /* Finalize the approximation. */
04f307cb 13571 aarch64_emit_mult (quo, pg, xrcp, xtmp);
79a2bc2d
EM
13572 return true;
13573}
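
/* Editorial sketch (not GCC code): a scalar C model of the series emitted
   above.  The perturbed seed stands in for the low-precision FRECPE
   estimate; each iteration mirrors one FRECPS step, which computes
   (2 - den * x) and is multiplied back into the estimate; the final
   multiply by NUM turns the reciprocal into the quotient.  */

static double
approx_div_sketch (double num, double den, int iterations)
{
  double x = (1.0 / den) * (1.0 + 0x1p-10); /* stand-in for FRECPE */

  while (iterations-- > 0)
    x = x * (2.0 - den * x);		    /* FRECPS step followed by FMUL */

  return num * x;			    /* approximates num / den */
}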
13574
d126a4ae
AP
13575/* Return the number of instructions that can be issued per cycle. */
13576static int
13577aarch64_sched_issue_rate (void)
13578{
b175b679 13579 return aarch64_tune_params.issue_rate;
d126a4ae
AP
13580}
13581
d0bc0cb6
RS
13582/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13583static int
13584aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13585{
13586 if (DEBUG_INSN_P (insn))
13587 return more;
13588
13589 rtx_code code = GET_CODE (PATTERN (insn));
13590 if (code == USE || code == CLOBBER)
13591 return more;
13592
13593 if (get_attr_type (insn) == TYPE_NO_INSN)
13594 return more;
13595
13596 return more - 1;
13597}
13598
d03f7e44
MK
13599static int
13600aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13601{
13602 int issue_rate = aarch64_sched_issue_rate ();
13603
13604 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13605}
13606
2d6bc7fa
KT
13607
13608/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13609 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13610 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13611
13612static int
13613aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13614 int ready_index)
13615{
13616 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13617}
13618
13619
8990e73a
TB
13620/* Vectorizer cost model target hooks. */
13621
13622/* Implement targetm.vectorize.builtin_vectorization_cost. */
13623static int
13624aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13625 tree vectype,
13626 int misalign ATTRIBUTE_UNUSED)
13627{
13628 unsigned elements;
cd8ae5ed
AP
13629 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13630 bool fp = false;
13631
13632 if (vectype != NULL)
13633 fp = FLOAT_TYPE_P (vectype);
8990e73a
TB
13634
13635 switch (type_of_cost)
13636 {
13637 case scalar_stmt:
cd8ae5ed 13638 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8990e73a
TB
13639
13640 case scalar_load:
cd8ae5ed 13641 return costs->scalar_load_cost;
8990e73a
TB
13642
13643 case scalar_store:
cd8ae5ed 13644 return costs->scalar_store_cost;
8990e73a
TB
13645
13646 case vector_stmt:
cd8ae5ed 13647 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8990e73a
TB
13648
13649 case vector_load:
cd8ae5ed 13650 return costs->vec_align_load_cost;
8990e73a
TB
13651
13652 case vector_store:
cd8ae5ed 13653 return costs->vec_store_cost;
8990e73a
TB
13654
13655 case vec_to_scalar:
cd8ae5ed 13656 return costs->vec_to_scalar_cost;
8990e73a
TB
13657
13658 case scalar_to_vec:
cd8ae5ed 13659 return costs->scalar_to_vec_cost;
8990e73a
TB
13660
13661 case unaligned_load:
cc9fe6bb 13662 case vector_gather_load:
cd8ae5ed 13663 return costs->vec_unalign_load_cost;
8990e73a
TB
13664
13665 case unaligned_store:
cc9fe6bb 13666 case vector_scatter_store:
cd8ae5ed 13667 return costs->vec_unalign_store_cost;
8990e73a
TB
13668
13669 case cond_branch_taken:
cd8ae5ed 13670 return costs->cond_taken_branch_cost;
8990e73a
TB
13671
13672 case cond_branch_not_taken:
cd8ae5ed 13673 return costs->cond_not_taken_branch_cost;
8990e73a
TB
13674
13675 case vec_perm:
cd8ae5ed 13676 return costs->vec_permute_cost;
c428f91c 13677
8990e73a 13678 case vec_promote_demote:
cd8ae5ed 13679 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8990e73a
TB
13680
13681 case vec_construct:
6a70badb 13682 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
8990e73a
TB
13683 return elements / 2 + 1;
13684
13685 default:
13686 gcc_unreachable ();
13687 }
13688}
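
/* Editorial worked example (illustrative only): for vec_construct the
   estimate above is elements / 2 + 1, so building a 4-element vector is
   costed as 4 / 2 + 1 = 3 and an 8-element vector as 8 / 2 + 1 = 5,
   i.e. roughly one operation per pair of elements plus one.  */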
13689
8b50d7a4
RS
13690/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13691 vectors would produce a series of LDP or STP operations. KIND is the
13692 kind of statement that STMT_INFO represents. */
13693static bool
13694aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13695 stmt_vec_info stmt_info)
13696{
13697 switch (kind)
13698 {
13699 case vector_load:
13700 case vector_store:
13701 case unaligned_load:
13702 case unaligned_store:
13703 break;
13704
13705 default:
13706 return false;
13707 }
13708
13709 if (aarch64_tune_params.extra_tuning_flags
13710 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13711 return false;
13712
13713 return is_gimple_assign (stmt_info->stmt);
13714}
13715
217ccab8
RS
13716/* Return true if STMT_INFO extends the result of a load. */
13717static bool
308bc496 13718aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
217ccab8
RS
13719{
13720 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13721 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13722 return false;
13723
13724 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13725 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13726 tree rhs_type = TREE_TYPE (rhs);
13727 if (!INTEGRAL_TYPE_P (lhs_type)
13728 || !INTEGRAL_TYPE_P (rhs_type)
13729 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13730 return false;
13731
308bc496 13732 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
217ccab8
RS
13733 return (def_stmt_info
13734 && STMT_VINFO_DATA_REF (def_stmt_info)
13735 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13736}
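
/* Editorial example (illustrative only): source of the kind that
   aarch64_extending_load_p looks for.  The widening conversion in the
   loop body is expected to fold into a sign-extending vector load, which
   is why the conversion statement itself is given a cost of zero in
   aarch64_sve_adjust_stmt_cost below.  */

void
widening_load_example (int *restrict dst, const short *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[i];	/* conversion expected to fold into an extending load */
}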
13737
2d56600c
RS
13738/* Return true if STMT_INFO is an integer truncation. */
13739static bool
13740aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13741{
13742 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13743 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13744 return false;
13745
13746 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13747 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13748 return (INTEGRAL_TYPE_P (lhs_type)
13749 && INTEGRAL_TYPE_P (rhs_type)
13750 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13751}
13752
217ccab8 13753/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
8b50d7a4
RS
13754 for STMT_INFO, which has cost kind KIND and which when vectorized would
13755 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
13756 targets. */
217ccab8 13757static unsigned int
308bc496 13758aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
8b50d7a4 13759 stmt_vec_info stmt_info, tree vectype,
217ccab8
RS
13760 unsigned int stmt_cost)
13761{
13762 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13763 vector register size or number of units. Integer promotions of this
13764 type therefore map to SXT[BHW] or UXT[BHW].
13765
13766 Most loads have extending forms that can do the sign or zero extension
13767 on the fly. Optimistically assume that a load followed by an extension
13768 will fold to this form during combine, and that the extension therefore
13769 comes for free. */
308bc496 13770 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
217ccab8
RS
13771 stmt_cost = 0;
13772
2d56600c
RS
13773 /* For similar reasons, vector_stmt integer truncations are a no-op,
13774 because we can just ignore the unused upper bits of the source. */
13775 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13776 stmt_cost = 0;
13777
8b50d7a4
RS
13778 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13779 but there are no equivalent instructions for SVE. This means that
13780 (all other things being equal) 128-bit SVE needs twice as many load
13781 and store instructions as Advanced SIMD in order to process vector pairs.
13782
13783 Also, scalar code can often use LDP and STP to access pairs of values,
13784 so it is too simplistic to say that one SVE load or store replaces
13785 VF scalar loads and stores.
13786
13787 Ideally we would account for this in the scalar and Advanced SIMD
13788 costs by making suitable load/store pairs as cheap as a single
13789 load/store. However, that would be a very invasive change and in
13790 practice it tends to stress other parts of the cost model too much.
13791 E.g. stores of scalar constants currently count just a store,
13792 whereas stores of vector constants count a store and a vec_init.
13793 This is an artificial distinction for AArch64, where stores of
13794 nonzero scalar constants need the same kind of register invariant
13795 as vector stores.
13796
13797 An alternative would be to double the cost of any SVE loads and stores
13798 that could be paired in Advanced SIMD (and possibly also paired in
13799 scalar code). But this tends to stress other parts of the cost model
13800 in the same way. It also means that we can fall back to Advanced SIMD
13801 even if full-loop predication would have been useful.
13802
13803 Here we go for a more conservative version: double the costs of SVE
13804 loads and stores if one iteration of the scalar loop processes enough
13805 elements for it to use a whole number of Advanced SIMD LDP or STP
13806 instructions. This makes it very likely that the VF would be 1 for
13807 Advanced SIMD, and so no epilogue should be needed. */
13808 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13809 {
13810 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13811 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13812 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13813 if (multiple_p (count * elt_bits, 256)
13814 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13815 stmt_cost *= 2;
13816 }
13817
217ccab8
RS
13818 return stmt_cost;
13819}
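
/* Editorial sketch (standalone, not vectorizer code) of the doubling rule
   above: an SVE load or store is costed twice when one scalar iteration
   covers a whole number of 256-bit Advanced SIMD LDP/STP transfers.  For
   example, a group of four 64-bit accesses spans 4 * 64 = 256 bits and is
   doubled, while a group of three 32-bit accesses spans 96 bits and is
   left alone.  */

static unsigned int
sve_ldp_stp_penalty_sketch (unsigned int group_count, unsigned int elt_bits,
			    unsigned int stmt_cost)
{
  return (group_count * elt_bits) % 256 == 0 ? stmt_cost * 2 : stmt_cost;
}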
13820
8990e73a
TB
13821/* Implement targetm.vectorize.add_stmt_cost. */
13822static unsigned
308bc496
RB
13823aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13824 enum vect_cost_for_stmt kind,
78db0e09
RB
13825 struct _stmt_vec_info *stmt_info, tree vectype,
13826 int misalign, enum vect_cost_model_location where)
8990e73a
TB
13827{
13828 unsigned *cost = (unsigned *) data;
13829 unsigned retval = 0;
13830
13831 if (flag_vect_cost_model)
13832 {
8990e73a
TB
13833 int stmt_cost =
13834 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13835
217ccab8 13836 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
308bc496
RB
13837 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13838 vectype, stmt_cost);
217ccab8 13839
8990e73a
TB
13840 /* Statements in an inner loop relative to the loop being
13841 vectorized are weighted more heavily. The value here is
058e4c71 13842 arbitrary and could potentially be improved with analysis. */
308bc496
RB
13843 if (where == vect_body && stmt_info
13844 && stmt_in_inner_loop_p (vinfo, stmt_info))
058e4c71 13845 count *= 50; /* FIXME */
8990e73a
TB
13846
13847 retval = (unsigned) (count * stmt_cost);
13848 cost[where] += retval;
13849 }
13850
13851 return retval;
13852}
13853
0cfff2a1 13854static void initialize_aarch64_code_model (struct gcc_options *);
43e9d192 13855
0cfff2a1
KT
13856/* Parse the TO_PARSE string and put the architecture struct that it
13857 selects into RES and the architectural features into ISA_FLAGS.
13858 Return an aarch64_parse_opt_result describing the parse result.
c7887347
ML
13859 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13860 When the TO_PARSE string contains an invalid extension,
13861 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 13862
0cfff2a1
KT
13863static enum aarch64_parse_opt_result
13864aarch64_parse_arch (const char *to_parse, const struct processor **res,
28108a53 13865 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 13866{
ff150bc4 13867 const char *ext;
43e9d192 13868 const struct processor *arch;
43e9d192
IB
13869 size_t len;
13870
ff150bc4 13871 ext = strchr (to_parse, '+');
43e9d192
IB
13872
13873 if (ext != NULL)
ff150bc4 13874 len = ext - to_parse;
43e9d192 13875 else
ff150bc4 13876 len = strlen (to_parse);
43e9d192
IB
13877
13878 if (len == 0)
0cfff2a1
KT
13879 return AARCH64_PARSE_MISSING_ARG;
13880
43e9d192 13881
0cfff2a1 13882 /* Loop through the list of supported ARCHes to find a match. */
43e9d192
IB
13883 for (arch = all_architectures; arch->name != NULL; arch++)
13884 {
ff150bc4
ML
13885 if (strlen (arch->name) == len
13886 && strncmp (arch->name, to_parse, len) == 0)
43e9d192 13887 {
28108a53 13888 uint64_t isa_temp = arch->flags;
43e9d192
IB
13889
13890 if (ext != NULL)
13891 {
0cfff2a1
KT
13892 /* TO_PARSE string contains at least one extension. */
13893 enum aarch64_parse_opt_result ext_res
c7887347 13894 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 13895
0cfff2a1
KT
13896 if (ext_res != AARCH64_PARSE_OK)
13897 return ext_res;
ffee7aa9 13898 }
0cfff2a1
KT
13899 /* Extension parsing was successful. Confirm the result
13900 arch and ISA flags. */
13901 *res = arch;
13902 *isa_flags = isa_temp;
13903 return AARCH64_PARSE_OK;
43e9d192
IB
13904 }
13905 }
13906
13907 /* ARCH name not found in list. */
0cfff2a1 13908 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
13909}
13910
0cfff2a1
KT
13911/* Parse the TO_PARSE string and put the result tuning in RES and the
13912 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13913 describing the parse result. If there is an error parsing, RES and
c7887347
ML
13914 ISA_FLAGS are left unchanged.
13915 When the TO_PARSE string contains an invalid extension,
13916 a copy of the string is created and stored to INVALID_EXTENSION. */
43e9d192 13917
0cfff2a1
KT
13918static enum aarch64_parse_opt_result
13919aarch64_parse_cpu (const char *to_parse, const struct processor **res,
28108a53 13920 uint64_t *isa_flags, std::string *invalid_extension)
43e9d192 13921{
ff150bc4 13922 const char *ext;
43e9d192 13923 const struct processor *cpu;
43e9d192
IB
13924 size_t len;
13925
ff150bc4 13926 ext = strchr (to_parse, '+');
43e9d192
IB
13927
13928 if (ext != NULL)
ff150bc4 13929 len = ext - to_parse;
43e9d192 13930 else
ff150bc4 13931 len = strlen (to_parse);
43e9d192
IB
13932
13933 if (len == 0)
0cfff2a1
KT
13934 return AARCH64_PARSE_MISSING_ARG;
13935
43e9d192
IB
13936
13937 /* Loop through the list of supported CPUs to find a match. */
13938 for (cpu = all_cores; cpu->name != NULL; cpu++)
13939 {
ff150bc4 13940 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
43e9d192 13941 {
28108a53 13942 uint64_t isa_temp = cpu->flags;
0cfff2a1 13943
43e9d192
IB
13944
13945 if (ext != NULL)
13946 {
0cfff2a1
KT
13947 /* TO_PARSE string contains at least one extension. */
13948 enum aarch64_parse_opt_result ext_res
c7887347 13949 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
43e9d192 13950
0cfff2a1
KT
13951 if (ext_res != AARCH64_PARSE_OK)
13952 return ext_res;
13953 }
13954 /* Extension parsing was successful. Confirm the result
13955 cpu and ISA flags. */
13956 *res = cpu;
13957 *isa_flags = isa_temp;
13958 return AARCH64_PARSE_OK;
43e9d192
IB
13959 }
13960 }
13961
13962 /* CPU name not found in list. */
0cfff2a1 13963 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
13964}
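
/* Editorial sketch (self-contained, not GCC code) of the splitting done by
   aarch64_parse_cpu and aarch64_parse_arch above: the text before the
   first '+' names the CPU or architecture and the remainder, if any, is
   handed to the extension parser.  Assumes <stdio.h> and <string.h>.  */

static void
split_mcpu_sketch (const char *to_parse)
{
  const char *ext = strchr (to_parse, '+');
  size_t len = ext ? (size_t) (ext - to_parse) : strlen (to_parse);
  printf ("name: %.*s  extensions: %s\n", (int) len, to_parse,
	  ext ? ext : "(none)");
}

/* For example, split_mcpu_sketch ("cortex-a57+crypto+nofp") prints
   "name: cortex-a57  extensions: +crypto+nofp".  */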
13965
0cfff2a1
KT
13966/* Parse the TO_PARSE string and put the cpu it selects into RES.
13967 Return an aarch64_parse_opt_result describing the parse result.
13968 If the parsing fails the RES does not change. */
43e9d192 13969
0cfff2a1
KT
13970static enum aarch64_parse_opt_result
13971aarch64_parse_tune (const char *to_parse, const struct processor **res)
43e9d192
IB
13972{
13973 const struct processor *cpu;
43e9d192
IB
13974
13975 /* Loop through the list of supported CPUs to find a match. */
13976 for (cpu = all_cores; cpu->name != NULL; cpu++)
13977 {
ff150bc4 13978 if (strcmp (cpu->name, to_parse) == 0)
43e9d192 13979 {
0cfff2a1
KT
13980 *res = cpu;
13981 return AARCH64_PARSE_OK;
43e9d192
IB
13982 }
13983 }
13984
13985 /* CPU name not found in list. */
0cfff2a1 13986 return AARCH64_PARSE_INVALID_ARG;
43e9d192
IB
13987}
13988
8dec06f2
JG
13989/* Parse TOKEN, which has length LENGTH, to see if it is an option
13990 described in FLAG. If it is, return the index bit for that fusion type.
13991 If not, error (printing OPTION_NAME) and return zero. */
13992
13993static unsigned int
13994aarch64_parse_one_option_token (const char *token,
13995 size_t length,
13996 const struct aarch64_flag_desc *flag,
13997 const char *option_name)
13998{
13999 for (; flag->name != NULL; flag++)
14000 {
14001 if (length == strlen (flag->name)
14002 && !strncmp (flag->name, token, length))
14003 return flag->flag;
14004 }
14005
a3f9f006 14006 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
8dec06f2
JG
14007 return 0;
14008}
14009
14010/* Parse OPTION which is a comma-separated list of flags to enable.
14011 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14012 default state we inherit from the CPU tuning structures. OPTION_NAME
14013 gives the top-level option we are parsing in the -moverride string,
14014 for use in error messages. */
14015
14016static unsigned int
14017aarch64_parse_boolean_options (const char *option,
14018 const struct aarch64_flag_desc *flags,
14019 unsigned int initial_state,
14020 const char *option_name)
14021{
14022 const char separator = '.';
14023 const char* specs = option;
14024 const char* ntoken = option;
14025 unsigned int found_flags = initial_state;
14026
14027 while ((ntoken = strchr (specs, separator)))
14028 {
14029 size_t token_length = ntoken - specs;
14030 unsigned token_ops = aarch64_parse_one_option_token (specs,
14031 token_length,
14032 flags,
14033 option_name);
14034 /* If we find "none" (or, for simplicity's sake, an error) anywhere
14035 in the token stream, reset the supported operations. So:
14036
14037 adrp+add.cmp+branch.none.adrp+add
14038
14039 would have the result of turning on only adrp+add fusion. */
14040 if (!token_ops)
14041 found_flags = 0;
14042
14043 found_flags |= token_ops;
14044 specs = ++ntoken;
14045 }
14046
14047 /* If the string ended with the separator there is no final token;
 report the ill-formed string. */
14048 if (!(*specs))
14049 {
14050 error ("%s string ill-formed", option_name);
14051 return 0;
14052 }
14053
14054 /* We still have one more token to parse. */
14055 size_t token_length = strlen (specs);
14056 unsigned token_ops = aarch64_parse_one_option_token (specs,
14057 token_length,
14058 flags,
14059 option_name);
14060 if (!token_ops)
14061 found_flags = 0;
14062
14063 found_flags |= token_ops;
14064 return found_flags;
14065}
14066
14067/* Support for overriding instruction fusion. */
14068
14069static void
14070aarch64_parse_fuse_string (const char *fuse_string,
14071 struct tune_params *tune)
14072{
14073 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14074 aarch64_fusible_pairs,
14075 tune->fusible_ops,
14076 "fuse=");
14077}
14078
14079/* Support for overriding other tuning flags. */
14080
14081static void
14082aarch64_parse_tune_string (const char *tune_string,
14083 struct tune_params *tune)
14084{
14085 tune->extra_tuning_flags
14086 = aarch64_parse_boolean_options (tune_string,
14087 aarch64_tuning_flags,
14088 tune->extra_tuning_flags,
14089 "tune=");
14090}
14091
886f092f
KT
14092/* Parse the sve_width tuning moverride string in TUNE_STRING.
14093 Accept the valid SVE vector widths allowed by
14094 aarch64_sve_vector_bits_enum and use it to override sve_width
14095 in TUNE. */
14096
14097static void
14098aarch64_parse_sve_width_string (const char *tune_string,
14099 struct tune_params *tune)
14100{
14101 int width = -1;
14102
14103 int n = sscanf (tune_string, "%d", &width);
14104 if (n == EOF)
14105 {
14106 error ("invalid format for sve_width");
14107 return;
14108 }
14109 switch (width)
14110 {
14111 case SVE_128:
14112 case SVE_256:
14113 case SVE_512:
14114 case SVE_1024:
14115 case SVE_2048:
14116 break;
14117 default:
14118 error ("invalid sve_width value: %d", width);
14119 }
14120 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14121}
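
/* Editorial usage note (illustrative): with the parser above,
   -moverride=sve_width=256 makes the tuning code assume 256-bit SVE
   vectors, while a width outside {128, 256, 512, 1024, 2048}, such as
   384, is rejected with the "invalid sve_width value" error.  */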
14122
8dec06f2
JG
14123/* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
14124 we understand. If it is, extract the option string and handoff to
14125 the appropriate function. */
14126
14127void
14128aarch64_parse_one_override_token (const char* token,
14129 size_t length,
14130 struct tune_params *tune)
14131{
14132 const struct aarch64_tuning_override_function *fn
14133 = aarch64_tuning_override_functions;
14134
14135 const char *option_part = strchr (token, '=');
14136 if (!option_part)
14137 {
14138 error ("tuning string missing in option (%s)", token);
14139 return;
14140 }
14141
14142 /* Get the length of the option name. */
14143 length = option_part - token;
14144 /* Skip the '=' to get to the option string. */
14145 option_part++;
14146
14147 for (; fn->name != NULL; fn++)
14148 {
14149 if (!strncmp (fn->name, token, length))
14150 {
14151 fn->parse_override (option_part, tune);
14152 return;
14153 }
14154 }
14155
14156 error ("unknown tuning option (%s)", token);
14157 return;
14158}
14159
5eee3c34
JW
14160 /* Set the default TLS size and clamp it to what the code model allows. */
14161
14162static void
14163initialize_aarch64_tls_size (struct gcc_options *opts)
14164{
14165 if (aarch64_tls_size == 0)
14166 aarch64_tls_size = 24;
14167
14168 switch (opts->x_aarch64_cmodel_var)
14169 {
14170 case AARCH64_CMODEL_TINY:
14171 /* Both the default and maximum TLS size allowed under tiny are 1M, which
14172 needs two instructions to address, so we clamp the size to 24. */
14173 if (aarch64_tls_size > 24)
14174 aarch64_tls_size = 24;
14175 break;
14176 case AARCH64_CMODEL_SMALL:
14177 /* The maximum TLS size allowed under small is 4G. */
14178 if (aarch64_tls_size > 32)
14179 aarch64_tls_size = 32;
14180 break;
14181 case AARCH64_CMODEL_LARGE:
14182 /* The maximum TLS size allowed under large is 16E.
14183 FIXME: 16E requires a 64-bit offset; we only support a 48-bit offset now. */
14184 if (aarch64_tls_size > 48)
14185 aarch64_tls_size = 48;
14186 break;
14187 default:
14188 gcc_unreachable ();
14189 }
14190
14191 return;
14192}
14193
8dec06f2
JG
14194/* Parse STRING looking for options in the format:
14195 string :: option:string
14196 option :: name=substring
14197 name :: {a-z}
14198 substring :: defined by option. */
14199
14200static void
14201aarch64_parse_override_string (const char* input_string,
14202 struct tune_params* tune)
14203{
14204 const char separator = ':';
14205 size_t string_length = strlen (input_string) + 1;
14206 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14207 char *string = string_root;
14208 strncpy (string, input_string, string_length);
14209 string[string_length - 1] = '\0';
14210
14211 char* ntoken = string;
14212
14213 while ((ntoken = strchr (string, separator)))
14214 {
14215 size_t token_length = ntoken - string;
14216 /* Make this substring look like a string. */
14217 *ntoken = '\0';
14218 aarch64_parse_one_override_token (string, token_length, tune);
14219 string = ++ntoken;
14220 }
14221
14222 /* One last option to parse. */
14223 aarch64_parse_one_override_token (string, strlen (string), tune);
14224 free (string_root);
14225}
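
/* Editorial worked example (illustrative only): an option such as
     -moverride=fuse=adrp+add.cmp+branch:sve_width=256
   is split at each ':' by aarch64_parse_override_string, and every
   name=value token is then dispatched by aarch64_parse_one_override_token
   to the matching handler, here the fuse= and sve_width parsers above.  */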
43e9d192 14226
43e9d192
IB
14227
14228static void
0cfff2a1 14229aarch64_override_options_after_change_1 (struct gcc_options *opts)
43e9d192 14230{
efac62a3
ST
14231 if (accepted_branch_protection_string)
14232 {
14233 opts->x_aarch64_branch_protection_string
14234 = xstrdup (accepted_branch_protection_string);
14235 }
14236
acea40ac
WD
14237 /* PR 70044: We have to be careful about being called multiple times for the
14238 same function. This means all changes should be repeatable. */
14239
d6cb6d6a
WD
14240 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14241 Disable the frame pointer flag so the mid-end will not use a frame
14242 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14243 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14244 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14245 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
acea40ac 14246 if (opts->x_flag_omit_frame_pointer == 0)
a3dc8760 14247 opts->x_flag_omit_frame_pointer = 2;
43e9d192 14248
1be34295 14249 /* If not optimizing for size, set the default
0cfff2a1
KT
14250 alignment to what the target wants. */
14251 if (!opts->x_optimize_size)
43e9d192 14252 {
c518c102
ML
14253 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14254 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14255 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14256 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14257 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14258 opts->x_str_align_functions = aarch64_tune_params.function_align;
43e9d192 14259 }
b4f50fd4 14260
9ee6540a
WD
14261 /* We default to no pc-relative literal loads. */
14262
14263 aarch64_pcrelative_literal_loads = false;
14264
14265 /* If -mpc-relative-literal-loads is set on the command line, this
b4f50fd4 14266 implies that the user asked for PC relative literal loads. */
9ee6540a
WD
14267 if (opts->x_pcrelative_literal_loads == 1)
14268 aarch64_pcrelative_literal_loads = true;
b4f50fd4 14269
9ee6540a
WD
14270 /* In the tiny memory model it makes no sense to disallow PC relative
14271 literal pool loads. */
14272 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14273 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14274 aarch64_pcrelative_literal_loads = true;
98daafa0
EM
14275
14276 /* When enabling the lower precision Newton series for the square root, also
14277 enable it for the reciprocal square root, since the latter is an
14278 intermediate step for the former. */
14279 if (flag_mlow_precision_sqrt)
14280 flag_mrecip_low_precision_sqrt = true;
0cfff2a1 14281}
43e9d192 14282
0cfff2a1
KT
14283/* 'Unpack' up the internal tuning structs and update the options
14284 in OPTS. The caller must have set up selected_tune and selected_arch
14285 as all the other target-specific codegen decisions are
14286 derived from them. */
14287
e4ea20c8 14288void
0cfff2a1
KT
14289aarch64_override_options_internal (struct gcc_options *opts)
14290{
14291 aarch64_tune_flags = selected_tune->flags;
14292 aarch64_tune = selected_tune->sched_core;
14293 /* Make a copy of the tuning parameters attached to the core, which
14294 we may later overwrite. */
14295 aarch64_tune_params = *(selected_tune->tune);
14296 aarch64_architecture_version = selected_arch->architecture_version;
14297
14298 if (opts->x_aarch64_override_tune_string)
14299 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14300 &aarch64_tune_params);
14301
14302 /* This target defaults to strict volatile bitfields. */
14303 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14304 opts->x_flag_strict_volatile_bitfields = 1;
14305
cd0b2d36
RR
14306 if (aarch64_stack_protector_guard == SSP_GLOBAL
14307 && opts->x_aarch64_stack_protector_guard_offset_str)
14308 {
41804907 14309 error ("incompatible options %<-mstack-protector-guard=global%> and "
63d42e89 14310 "%<-mstack-protector-guard-offset=%s%>",
cd0b2d36
RR
14311 aarch64_stack_protector_guard_offset_str);
14312 }
14313
14314 if (aarch64_stack_protector_guard == SSP_SYSREG
14315 && !(opts->x_aarch64_stack_protector_guard_offset_str
14316 && opts->x_aarch64_stack_protector_guard_reg_str))
14317 {
a3f9f006
ML
14318 error ("both %<-mstack-protector-guard-offset%> and "
14319 "%<-mstack-protector-guard-reg%> must be used "
14320 "with %<-mstack-protector-guard=sysreg%>");
cd0b2d36
RR
14321 }
14322
14323 if (opts->x_aarch64_stack_protector_guard_reg_str)
14324 {
14325 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14326 error ("specify a system register with a small string length");
14327 }
14328
14329 if (opts->x_aarch64_stack_protector_guard_offset_str)
14330 {
14331 char *end;
14332 const char *str = aarch64_stack_protector_guard_offset_str;
14333 errno = 0;
14334 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14335 if (!*str || *end || errno)
14336 error ("%qs is not a valid offset in %qs", str,
63d42e89 14337 "-mstack-protector-guard-offset=");
cd0b2d36
RR
14338 aarch64_stack_protector_guard_offset = offs;
14339 }
14340
0cfff2a1 14341 initialize_aarch64_code_model (opts);
5eee3c34 14342 initialize_aarch64_tls_size (opts);
63892fa2 14343
2d6bc7fa
KT
14344 int queue_depth = 0;
14345 switch (aarch64_tune_params.autoprefetcher_model)
14346 {
14347 case tune_params::AUTOPREFETCHER_OFF:
14348 queue_depth = -1;
14349 break;
14350 case tune_params::AUTOPREFETCHER_WEAK:
14351 queue_depth = 0;
14352 break;
14353 case tune_params::AUTOPREFETCHER_STRONG:
14354 queue_depth = max_insn_queue_index + 1;
14355 break;
14356 default:
14357 gcc_unreachable ();
14358 }
14359
14360 /* We don't mind passing in global_options_set here as we don't use
14361 the *options_set structs anyway. */
028d4092
ML
14362 SET_OPTION_IF_UNSET (opts, &global_options_set,
14363 param_sched_autopref_queue_depth, queue_depth);
2d6bc7fa 14364
9d2c6e2e
MK
14365 /* Set up parameters to be used in prefetching algorithm. Do not
14366 override the defaults unless we are tuning for a core we have
14367 researched values for. */
14368 if (aarch64_tune_params.prefetch->num_slots > 0)
028d4092
ML
14369 SET_OPTION_IF_UNSET (opts, &global_options_set,
14370 param_simultaneous_prefetches,
14371 aarch64_tune_params.prefetch->num_slots);
9d2c6e2e 14372 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
028d4092
ML
14373 SET_OPTION_IF_UNSET (opts, &global_options_set,
14374 param_l1_cache_size,
14375 aarch64_tune_params.prefetch->l1_cache_size);
9d2c6e2e 14376 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
028d4092
ML
14377 SET_OPTION_IF_UNSET (opts, &global_options_set,
14378 param_l1_cache_line_size,
14379 aarch64_tune_params.prefetch->l1_cache_line_size);
9d2c6e2e 14380 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
028d4092
ML
14381 SET_OPTION_IF_UNSET (opts, &global_options_set,
14382 param_l2_cache_size,
14383 aarch64_tune_params.prefetch->l2_cache_size);
d2ff35c0 14384 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
028d4092
ML
14385 SET_OPTION_IF_UNSET (opts, &global_options_set,
14386 param_prefetch_dynamic_strides, 0);
59100dfc 14387 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
028d4092
ML
14388 SET_OPTION_IF_UNSET (opts, &global_options_set,
14389 param_prefetch_minimum_stride,
14390 aarch64_tune_params.prefetch->minimum_stride);
50487d79 14391
13494fcb 14392 /* Use the alternative scheduling-pressure algorithm by default. */
028d4092
ML
14393 SET_OPTION_IF_UNSET (opts, &global_options_set,
14394 param_sched_pressure_algorithm,
14395 SCHED_PRESSURE_MODEL);
13494fcb 14396
fbe9af50 14397 /* Validate the guard size. */
028d4092 14398 int guard_size = param_stack_clash_protection_guard_size;
fbe9af50 14399
8100e93b
ML
14400 if (guard_size != 12 && guard_size != 16)
14401 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14402 "size. Given value %d (%llu KB) is out of range",
14403 guard_size, (1ULL << guard_size) / 1024ULL);
14404
fbe9af50
TC
14405 /* Enforce that interval is the same size as size so the mid-end does the
14406 right thing. */
028d4092
ML
14407 SET_OPTION_IF_UNSET (opts, &global_options_set,
14408 param_stack_clash_protection_probe_interval,
14409 guard_size);
fbe9af50
TC
14410
14411 /* The maybe_set calls won't update the value if the user has explicitly set
14412 one. This means we need to validate that the probing interval and guard size
14413 are equal. */
14414 int probe_interval
028d4092 14415 = param_stack_clash_protection_probe_interval;
fbe9af50 14416 if (guard_size != probe_interval)
904f3daa
ML
14417 error ("stack clash guard size %<%d%> must be equal to probing interval "
14418 "%<%d%>", guard_size, probe_interval);
fbe9af50 14419
16b2cafd
MK
14420 /* Enable software prefetching at the specified optimization level for
14421 CPUs that have prefetch. Lower the optimization level threshold by 1
14422 when profiling is enabled. */
14423 if (opts->x_flag_prefetch_loop_arrays < 0
14424 && !opts->x_optimize_size
14425 && aarch64_tune_params.prefetch->default_opt_level >= 0
14426 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14427 opts->x_flag_prefetch_loop_arrays = 1;
14428
266c2b54
ML
14429 if (opts->x_aarch64_arch_string == NULL)
14430 opts->x_aarch64_arch_string = selected_arch->name;
14431 if (opts->x_aarch64_cpu_string == NULL)
14432 opts->x_aarch64_cpu_string = selected_cpu->name;
14433 if (opts->x_aarch64_tune_string == NULL)
14434 opts->x_aarch64_tune_string = selected_tune->name;
14435
0cfff2a1
KT
14436 aarch64_override_options_after_change_1 (opts);
14437}
43e9d192 14438
01f44038
KT
14439/* Print a hint with a suggestion for a core or architecture name that
14440 most closely resembles what the user passed in STR. ARCH is true if
14441 the user is asking for an architecture name. ARCH is false if the user
14442 is asking for a core name. */
14443
14444static void
14445aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14446{
14447 auto_vec<const char *> candidates;
14448 const struct processor *entry = arch ? all_architectures : all_cores;
14449 for (; entry->name != NULL; entry++)
14450 candidates.safe_push (entry->name);
a08b5429
ML
14451
14452#ifdef HAVE_LOCAL_CPU_DETECT
14453 /* Add also "native" as possible value. */
14454 if (arch)
14455 candidates.safe_push ("native");
14456#endif
14457
01f44038
KT
14458 char *s;
14459 const char *hint = candidates_list_and_hint (str, s, candidates);
14460 if (hint)
14461 inform (input_location, "valid arguments are: %s;"
14462 " did you mean %qs?", s, hint);
6285e915
ML
14463 else
14464 inform (input_location, "valid arguments are: %s", s);
14465
01f44038
KT
14466 XDELETEVEC (s);
14467}
14468
14469/* Print a hint with a suggestion for a core name that most closely resembles
14470 what the user passed in STR. */
14471
14472inline static void
14473aarch64_print_hint_for_core (const char *str)
14474{
14475 aarch64_print_hint_for_core_or_arch (str, false);
14476}
14477
14478/* Print a hint with a suggestion for an architecture name that most closely
14479 resembles what the user passed in STR. */
14480
14481inline static void
14482aarch64_print_hint_for_arch (const char *str)
14483{
14484 aarch64_print_hint_for_core_or_arch (str, true);
14485}
14486
c7887347
ML
14487
14488/* Print a hint with a suggestion for an extension name
14489 that most closely resembles what the user passed in STR. */
14490
14491void
14492aarch64_print_hint_for_extensions (const std::string &str)
14493{
14494 auto_vec<const char *> candidates;
14495 aarch64_get_all_extension_candidates (&candidates);
14496 char *s;
14497 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14498 if (hint)
14499 inform (input_location, "valid arguments are: %s;"
14500 " did you mean %qs?", s, hint);
14501 else
14502 inform (input_location, "valid arguments are: %s", s);
14503
14504 XDELETEVEC (s);
14505}
14506
0cfff2a1
KT
14507/* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14508 specified in STR and throw errors if appropriate. Put the results if
361fb3ee
KT
14509 they are valid in RES and ISA_FLAGS. Return whether the option is
14510 valid. */
43e9d192 14511
361fb3ee 14512static bool
0cfff2a1 14513aarch64_validate_mcpu (const char *str, const struct processor **res,
28108a53 14514 uint64_t *isa_flags)
0cfff2a1 14515{
c7887347 14516 std::string invalid_extension;
0cfff2a1 14517 enum aarch64_parse_opt_result parse_res
c7887347 14518 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
14519
14520 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14521 return true;
0cfff2a1
KT
14522
14523 switch (parse_res)
14524 {
14525 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14526 error ("missing cpu name in %<-mcpu=%s%>", str);
0cfff2a1
KT
14527 break;
14528 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14529 error ("unknown value %qs for %<-mcpu%>", str);
01f44038 14530 aarch64_print_hint_for_core (str);
0cfff2a1
KT
14531 break;
14532 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
14533 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14534 invalid_extension.c_str (), str);
14535 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
14536 break;
14537 default:
14538 gcc_unreachable ();
14539 }
361fb3ee
KT
14540
14541 return false;
0cfff2a1
KT
14542}
14543
a9ba2a9b
MM
14544/* Straight line speculation indicators. */
14545enum aarch64_sls_hardening_type
14546{
14547 SLS_NONE = 0,
14548 SLS_RETBR = 1,
14549 SLS_BLR = 2,
14550 SLS_ALL = 3,
14551};
14552static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14553
14554/* Return whether we should mitigate Straight Line Speculation for the RET
14555 and BR instructions. */
14556bool
14557aarch64_harden_sls_retbr_p (void)
14558{
14559 return aarch64_sls_hardening & SLS_RETBR;
14560}
14561
14562/* Return whether we should mitigate Straight Line Speculation for the BLR
14563 instruction. */
14564bool
14565aarch64_harden_sls_blr_p (void)
14566{
14567 return aarch64_sls_hardening & SLS_BLR;
14568}
14569
14570/* For now we only allow setting these options globally; in the future we may
14571 allow setting them per function. */
14572static void
14573aarch64_validate_sls_mitigation (const char *const_str)
14574{
14575 char *token_save = NULL;
14576 char *str = NULL;
14577
14578 if (strcmp (const_str, "none") == 0)
14579 {
14580 aarch64_sls_hardening = SLS_NONE;
14581 return;
14582 }
14583 if (strcmp (const_str, "all") == 0)
14584 {
14585 aarch64_sls_hardening = SLS_ALL;
14586 return;
14587 }
14588
14589 char *str_root = xstrdup (const_str);
14590 str = strtok_r (str_root, ",", &token_save);
14591 if (!str)
14592 error ("invalid argument given to %<-mharden-sls=%>");
14593
14594 int temp = SLS_NONE;
14595 while (str)
14596 {
14597 if (strcmp (str, "blr") == 0)
14598 temp |= SLS_BLR;
14599 else if (strcmp (str, "retbr") == 0)
14600 temp |= SLS_RETBR;
14601 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14602 {
14603 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14604 break;
14605 }
14606 else
14607 {
14608 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14609 break;
14610 }
14611 str = strtok_r (NULL, ",", &token_save);
14612 }
14613 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14614 free (str_root);
14615}
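/* Illustrative sketch (guarded out; not part of the compiler): how a
   comma-separated -mharden-sls= value such as "retbr,blr" accumulates
   into the SLS_* bitmask handled above.  Plain standard C; the helper
   name and buffer size are hypothetical.  */
#if 0
#include <string.h>

static int
sls_mask_example (const char *value)
{
  char buf[64];
  char *save = NULL;
  int mask = 0;				/* SLS_NONE */

  strncpy (buf, value, sizeof buf - 1);
  buf[sizeof buf - 1] = '\0';

  for (char *tok = strtok_r (buf, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    {
      if (strcmp (tok, "retbr") == 0)
	mask |= 1;			/* SLS_RETBR */
      else if (strcmp (tok, "blr") == 0)
	mask |= 2;			/* SLS_BLR */
    }

  /* "retbr,blr" yields 3, the same value as SLS_ALL.  */
  return mask;
}
#endif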
14616
efac62a3
ST
14617/* Parses CONST_STR for branch protection features specified in
14618 aarch64_branch_protect_types, and sets any global variables required. Returns
14619 the parsing result and assigns LAST_STR to the last processed token from
14620 CONST_STR so that it can be used for error reporting. */
14621
14622static enum aarch64_parse_opt_result
14623aarch64_parse_branch_protection (const char *const_str,
14624				  char **last_str)
14625{
14626 char *str_root = xstrdup (const_str);
14627 char* token_save = NULL;
14628 char *str = strtok_r (str_root, "+", &token_save);
14629 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14630 if (!str)
14631 res = AARCH64_PARSE_MISSING_ARG;
14632 else
14633 {
14634 char *next_str = strtok_r (NULL, "+", &token_save);
14635 /* Reset the branch protection features to their defaults. */
14636 aarch64_handle_no_branch_protection (NULL, NULL);
14637
14638 while (str && res == AARCH64_PARSE_OK)
14639 {
14640 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14641 bool found = false;
14642 /* Search for this type. */
14643 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14644 {
14645 if (strcmp (str, type->name) == 0)
14646 {
14647 found = true;
14648 res = type->handler (str, next_str);
14649 str = next_str;
14650 next_str = strtok_r (NULL, "+", &token_save);
14651 }
14652 else
14653 type++;
14654 }
14655 if (found && res == AARCH64_PARSE_OK)
14656 {
14657 bool found_subtype = true;
14658 /* Loop through each token until we find one that isn't a
14659 subtype. */
14660 while (found_subtype)
14661 {
14662 found_subtype = false;
14663 const aarch64_branch_protect_type *subtype = type->subtypes;
14664 /* Search for the subtype. */
14665 while (str && subtype && subtype->name && !found_subtype
14666 && res == AARCH64_PARSE_OK)
14667 {
14668 if (strcmp (str, subtype->name) == 0)
14669 {
14670 found_subtype = true;
14671 res = subtype->handler (str, next_str);
14672 str = next_str;
14673 next_str = strtok_r (NULL, "+", &token_save);
14674 }
14675 else
14676 subtype++;
14677 }
14678 }
14679 }
14680 else if (!found)
14681 res = AARCH64_PARSE_INVALID_ARG;
14682 }
14683 }
14684 /* Copy the last processed token into the argument to pass it back.
14685 Used by option and attribute validation to print the offending token. */
14686 if (last_str)
14687 {
14688 if (str) strcpy (*last_str, str);
14689 else *last_str = NULL;
14690 }
14691 if (res == AARCH64_PARSE_OK)
14692 {
14693 /* If needed, alloc the accepted string then copy in const_str.
14694 Used by override_option_after_change_1. */
14695 if (!accepted_branch_protection_string)
14696 accepted_branch_protection_string = (char *) xmalloc (
14697 BRANCH_PROTECT_STR_MAX
14698 + 1);
14699 strncpy (accepted_branch_protection_string, const_str,
14700 BRANCH_PROTECT_STR_MAX + 1);
14701 /* Forcibly null-terminate. */
14702 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14703 }
14704 return res;
14705}
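/* Illustrative sketch (guarded out; not part of the compiler): the '+'
   tokenization performed by aarch64_parse_branch_protection on a value
   such as "pac-ret+leaf".  The first token is matched against the
   top-level aarch64_branch_protect_types entries and the following
   tokens against that entry's subtypes.  Standard C only.  */
#if 0
#include <stdio.h>
#include <string.h>

static void
branch_protection_tokens_example (void)
{
  char buf[] = "pac-ret+leaf";
  char *save = NULL;

  /* Prints "pac-ret" (a type) and then "leaf" (one of its subtypes).  */
  for (char *tok = strtok_r (buf, "+", &save); tok;
       tok = strtok_r (NULL, "+", &save))
    printf ("token: %s\n", tok);
}
#endif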
14706
14707static bool
14708aarch64_validate_mbranch_protection (const char *const_str)
14709{
14710 char *str = (char *) xmalloc (strlen (const_str) + 1);
14711 enum aarch64_parse_opt_result res =
14712 aarch64_parse_branch_protection (const_str, &str);
14713 if (res == AARCH64_PARSE_INVALID_ARG)
a9c697b8 14714 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
efac62a3 14715 else if (res == AARCH64_PARSE_MISSING_ARG)
a9c697b8 14716 error ("missing argument for %<-mbranch-protection=%>");
efac62a3
ST
14717 free (str);
14718 return res == AARCH64_PARSE_OK;
14719}
14720
0cfff2a1
KT
14721/* Validate a command-line -march option. Parse the arch and extensions
14722 (if any) specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
14723 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14724 option is valid. */
0cfff2a1 14725
361fb3ee 14726static bool
0cfff2a1 14727aarch64_validate_march (const char *str, const struct processor **res,
28108a53 14728 uint64_t *isa_flags)
0cfff2a1 14729{
c7887347 14730 std::string invalid_extension;
0cfff2a1 14731 enum aarch64_parse_opt_result parse_res
c7887347 14732 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
0cfff2a1
KT
14733
14734 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14735 return true;
0cfff2a1
KT
14736
14737 switch (parse_res)
14738 {
14739 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14740 error ("missing arch name in %<-march=%s%>", str);
0cfff2a1
KT
14741 break;
14742 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14743 error ("unknown value %qs for %<-march%>", str);
01f44038 14744 aarch64_print_hint_for_arch (str);
0cfff2a1
KT
14745 break;
14746 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
14747 error ("invalid feature modifier %qs in %<-march=%s%>",
14748 invalid_extension.c_str (), str);
14749 aarch64_print_hint_for_extensions (invalid_extension);
0cfff2a1
KT
14750 break;
14751 default:
14752 gcc_unreachable ();
14753 }
361fb3ee
KT
14754
14755 return false;
0cfff2a1
KT
14756}
14757
14758/* Validate a command-line -mtune option. Parse the cpu
14759 specified in STR and throw errors if appropriate. Put the
361fb3ee
KT
14760 result, if it is valid, in RES. Return whether the option is
14761 valid. */
0cfff2a1 14762
361fb3ee 14763static bool
0cfff2a1
KT
14764aarch64_validate_mtune (const char *str, const struct processor **res)
14765{
14766 enum aarch64_parse_opt_result parse_res
14767 = aarch64_parse_tune (str, res);
14768
14769 if (parse_res == AARCH64_PARSE_OK)
361fb3ee 14770 return true;
0cfff2a1
KT
14771
14772 switch (parse_res)
14773 {
14774 case AARCH64_PARSE_MISSING_ARG:
fb241da2 14775 error ("missing cpu name in %<-mtune=%s%>", str);
0cfff2a1
KT
14776 break;
14777 case AARCH64_PARSE_INVALID_ARG:
a3f9f006 14778 error ("unknown value %qs for %<-mtune%>", str);
01f44038 14779 aarch64_print_hint_for_core (str);
0cfff2a1
KT
14780 break;
14781 default:
14782 gcc_unreachable ();
14783 }
361fb3ee
KT
14784 return false;
14785}
14786
14787/* Return the CPU corresponding to the enum CPU.
14788 If it doesn't specify a cpu, return the default. */
14789
14790static const struct processor *
14791aarch64_get_tune_cpu (enum aarch64_processor cpu)
14792{
14793 if (cpu != aarch64_none)
14794 return &all_cores[cpu];
14795
14796 /* The & 0x3f is to extract the bottom 6 bits that encode the
14797 default cpu as selected by the --with-cpu GCC configure option
14798 in config.gcc.
14799 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14800 flags mechanism should be reworked to make it more sane. */
14801 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14802}
14803
14804/* Return the architecture corresponding to the enum ARCH.
14805 If it doesn't specify a valid architecture, return the default. */
14806
14807static const struct processor *
14808aarch64_get_arch (enum aarch64_arch arch)
14809{
14810 if (arch != aarch64_no_arch)
14811 return &all_architectures[arch];
14812
14813 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14814
14815 return &all_architectures[cpu->arch];
0cfff2a1
KT
14816}
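/* Illustrative sketch (guarded out): how the configure-time
   TARGET_CPU_DEFAULT value is decomposed here and in
   aarch64_override_options below.  The bottom 6 bits hold the enum
   aarch64_processor ident used to index all_cores; the remaining bits
   hold the default ISA flags.  The parameter is a stand-in for the
   real macro.  */
#if 0
static void
cpu_default_decompose_example (unsigned long long target_cpu_default)
{
  unsigned int cpu_ident = target_cpu_default & 0x3f;	  /* all_cores index */
  unsigned long long isa_flags = target_cpu_default >> 6; /* default features */
  (void) cpu_ident;
  (void) isa_flags;
}
#endif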
14817
43cacb12
RS
14818/* Return the VG value associated with -msve-vector-bits= value VALUE. */
14819
14820static poly_uint16
14821aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14822{
9b070057
RS
14823 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14824 on big-endian targets, so we would need to forbid subregs that convert
14825 from one to the other. By default a reinterpret sequence would then
14826 involve a store to memory in one mode and a load back in the other.
14827 Even if we optimize that sequence using reverse instructions,
14828 it would still be a significant potential overhead.
14829
14830 For now, it seems better to generate length-agnostic code for that
14831 case instead. */
14832 if (value == SVE_SCALABLE
14833 || (value == SVE_128 && BYTES_BIG_ENDIAN))
43cacb12
RS
14834 return poly_uint16 (2, 2);
14835 else
14836 return (int) value / 64;
14837}
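/* Worked example (illustration only): VG counts 64-bit granules, so
   -msve-vector-bits=256 gives 256 / 64 = 4 and -msve-vector-bits=512
   gives 8, while -msve-vector-bits=scalable (and 128-bit SVE on
   big-endian targets) yields the runtime-sized poly_uint16 (2, 2).  */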
14838
0cfff2a1
KT
14839/* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
14840 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
14841 tuning structs. In particular it must set selected_tune and
14842 aarch64_isa_flags that define the available ISA features and tuning
14843 decisions. It must also set selected_arch as this will be used to
14844 output the .arch asm tags for each function. */
14845
14846static void
14847aarch64_override_options (void)
14848{
28108a53
MM
14849 uint64_t cpu_isa = 0;
14850 uint64_t arch_isa = 0;
0cfff2a1
KT
14851 aarch64_isa_flags = 0;
14852
361fb3ee
KT
14853 bool valid_cpu = true;
14854 bool valid_tune = true;
14855 bool valid_arch = true;
14856
0cfff2a1
KT
14857 selected_cpu = NULL;
14858 selected_arch = NULL;
14859 selected_tune = NULL;
14860
a9ba2a9b
MM
14861 if (aarch64_harden_sls_string)
14862 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
14863
efac62a3
ST
14864 if (aarch64_branch_protection_string)
14865 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14866
0cfff2a1
KT
14867 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14868 If either of -march or -mtune is given, they override their
14869 respective component of -mcpu. */
14870 if (aarch64_cpu_string)
361fb3ee
KT
14871 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14872 &cpu_isa);
0cfff2a1
KT
14873
14874 if (aarch64_arch_string)
361fb3ee
KT
14875 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14876 &arch_isa);
0cfff2a1
KT
14877
14878 if (aarch64_tune_string)
361fb3ee 14879 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
43e9d192 14880
6881e3c1
OH
14881#ifdef SUBTARGET_OVERRIDE_OPTIONS
14882 SUBTARGET_OVERRIDE_OPTIONS;
14883#endif
14884
43e9d192
IB
14885 /* If the user did not specify a processor, choose the default
14886 one for them. This will be the CPU set during configuration using
a3cd0246 14887 --with-cpu, otherwise it is "generic". */
43e9d192
IB
14888 if (!selected_cpu)
14889 {
0cfff2a1
KT
14890 if (selected_arch)
14891 {
14892 selected_cpu = &all_cores[selected_arch->ident];
14893 aarch64_isa_flags = arch_isa;
361fb3ee 14894 explicit_arch = selected_arch->arch;
0cfff2a1
KT
14895 }
14896 else
14897 {
361fb3ee
KT
14898 /* Get default configure-time CPU. */
14899 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
0cfff2a1
KT
14900 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14901 }
361fb3ee
KT
14902
14903 if (selected_tune)
14904 explicit_tune_core = selected_tune->ident;
0cfff2a1
KT
14905 }
14906 /* If both -mcpu and -march are specified check that they are architecturally
14907 compatible, warn if they're not and prefer the -march ISA flags. */
14908 else if (selected_arch)
14909 {
14910 if (selected_arch->arch != selected_cpu->arch)
14911 {
a3f9f006 14912 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
349297b6
JH
14913 aarch64_cpu_string,
14914 aarch64_arch_string);
0cfff2a1
KT
14915 }
14916 aarch64_isa_flags = arch_isa;
361fb3ee
KT
14917 explicit_arch = selected_arch->arch;
14918 explicit_tune_core = selected_tune ? selected_tune->ident
14919 : selected_cpu->ident;
0cfff2a1
KT
14920 }
14921 else
14922 {
14923 /* -mcpu but no -march. */
14924 aarch64_isa_flags = cpu_isa;
361fb3ee
KT
14925 explicit_tune_core = selected_tune ? selected_tune->ident
14926 : selected_cpu->ident;
14927 gcc_assert (selected_cpu);
14928 selected_arch = &all_architectures[selected_cpu->arch];
14929 explicit_arch = selected_arch->arch;
43e9d192
IB
14930 }
14931
0cfff2a1
KT
14932 /* Set the arch as well, as we will need it when outputting
14933 the .arch directive in assembly. */
14934 if (!selected_arch)
14935 {
14936 gcc_assert (selected_cpu);
14937 selected_arch = &all_architectures[selected_cpu->arch];
14938 }
43e9d192 14939
43e9d192 14940 if (!selected_tune)
3edaf26d 14941 selected_tune = selected_cpu;
43e9d192 14942
c7ff4f0f
SD
14943 if (aarch64_enable_bti == 2)
14944 {
14945#ifdef TARGET_ENABLE_BTI
14946 aarch64_enable_bti = 1;
14947#else
14948 aarch64_enable_bti = 0;
14949#endif
14950 }
14951
14952 /* Return address signing is currently not supported for ILP32 targets. For
14953 LP64 targets use the configured option in the absence of a command-line
14954 option for -mbranch-protection. */
14955 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14956 {
14957#ifdef TARGET_ENABLE_PAC_RET
14958 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
c7ff4f0f
SD
14959#else
14960 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14961#endif
14962 }
14963
0cfff2a1
KT
14964#ifndef HAVE_AS_MABI_OPTION
14965 /* The compiler may have been configured with 2.23.* binutils, which does
14966 not have support for ILP32. */
14967 if (TARGET_ILP32)
a3f9f006 14968 error ("assembler does not support %<-mabi=ilp32%>");
0cfff2a1 14969#endif
43e9d192 14970
43cacb12
RS
14971 /* Convert -msve-vector-bits to a VG count. */
14972 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14973
db58fd89 14974 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
a3f9f006 14975 sorry ("return address signing is only supported for %<-mabi=lp64%>");
db58fd89 14976
361fb3ee
KT
14977 /* Make sure we properly set up the explicit options. */
14978 if ((aarch64_cpu_string && valid_cpu)
14979 || (aarch64_tune_string && valid_tune))
14980 gcc_assert (explicit_tune_core != aarch64_none);
14981
14982 if ((aarch64_cpu_string && valid_cpu)
14983 || (aarch64_arch_string && valid_arch))
14984 gcc_assert (explicit_arch != aarch64_no_arch);
14985
5f7dbaa0
RE
14986 /* The pass to insert speculation tracking runs before
14987 shrink-wrapping and the latter does not know how to update the
14988 tracking status. So disable shrink-wrapping in this case. */
14989 if (aarch64_track_speculation)
14990 flag_shrink_wrap = 0;
14991
0cfff2a1
KT
14992 aarch64_override_options_internal (&global_options);
14993
14994 /* Save these options as the default ones in case we push and pop them later
14995 while processing functions with potential target attributes. */
14996 target_option_default_node = target_option_current_node
14997 = build_target_option_node (&global_options);
43e9d192
IB
14998}
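/* Illustrative summary of the precedence implemented above (examples,
   not an exhaustive list):

     -mcpu=cortex-a57			arch, tuning and ISA all from the CPU
     -mcpu=cortex-a57 -march=armv8.2-a	the -march ISA flags win; a warning
					is given if the architectures conflict
     -march=armv8.2-a			ISA from -march, tuning from that
					architecture's default core
     (no option)			configure-time --with-cpu, otherwise
					"generic".  */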
14999
15000/* Implement targetm.override_options_after_change. */
15001
15002static void
15003aarch64_override_options_after_change (void)
15004{
0cfff2a1 15005 aarch64_override_options_after_change_1 (&global_options);
43e9d192
IB
15006}
15007
15008static struct machine_function *
15009aarch64_init_machine_status (void)
15010{
15011 struct machine_function *machine;
766090c2 15012 machine = ggc_cleared_alloc<machine_function> ();
43e9d192
IB
15013 return machine;
15014}
15015
15016void
15017aarch64_init_expanders (void)
15018{
15019 init_machine_status = aarch64_init_machine_status;
15020}
15021
15022/* A checking mechanism for the implementation of the various code models. */
15023static void
0cfff2a1 15024initialize_aarch64_code_model (struct gcc_options *opts)
43e9d192 15025{
6c0ab626
X
15026 aarch64_cmodel = opts->x_aarch64_cmodel_var;
15027 switch (opts->x_aarch64_cmodel_var)
15028 {
15029 case AARCH64_CMODEL_TINY:
15030 if (opts->x_flag_pic)
15031 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15032 break;
15033 case AARCH64_CMODEL_SMALL:
15034 if (opts->x_flag_pic)
15035 {
34ecdb0f 15036#ifdef HAVE_AS_SMALL_PIC_RELOCS
6c0ab626
X
15037 aarch64_cmodel = (flag_pic == 2
15038 ? AARCH64_CMODEL_SMALL_PIC
15039 : AARCH64_CMODEL_SMALL_SPIC);
34ecdb0f 15040#else
6c0ab626 15041 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
34ecdb0f 15042#endif
6c0ab626
X
15043 }
15044 break;
15045 case AARCH64_CMODEL_LARGE:
15046 if (opts->x_flag_pic)
15047 sorry ("code model %qs with %<-f%s%>", "large",
15048 opts->x_flag_pic > 1 ? "PIC" : "pic");
15049 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15050 sorry ("code model %qs not supported in ilp32 mode", "large");
15051 break;
15052 case AARCH64_CMODEL_TINY_PIC:
15053 case AARCH64_CMODEL_SMALL_PIC:
15054 case AARCH64_CMODEL_SMALL_SPIC:
15055 gcc_unreachable ();
15056 }
43e9d192
IB
15057}
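/* For example (illustration only): -mcmodel=small together with -fpic
   selects AARCH64_CMODEL_SMALL_SPIC when the assembler supports the
   small PIC relocations (otherwise AARCH64_CMODEL_SMALL_PIC), -fPIC
   selects AARCH64_CMODEL_SMALL_PIC, and -mcmodel=large is not supported
   in combination with -fpic/-fPIC or ILP32.  */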
15058
361fb3ee
KT
15059/* Implement TARGET_OPTION_SAVE. */
15060
15061static void
15062aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
15063{
15064 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
efac62a3
ST
15065 ptr->x_aarch64_branch_protection_string
15066 = opts->x_aarch64_branch_protection_string;
361fb3ee
KT
15067}
15068
15069/* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
15070 using the information saved in PTR. */
15071
15072static void
15073aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
15074{
15075 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15076 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15077 opts->x_explicit_arch = ptr->x_explicit_arch;
15078 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15079 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
efac62a3
ST
15080 opts->x_aarch64_branch_protection_string
15081 = ptr->x_aarch64_branch_protection_string;
15082 if (opts->x_aarch64_branch_protection_string)
15083 {
15084 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15085 NULL);
15086 }
361fb3ee
KT
15087
15088 aarch64_override_options_internal (opts);
15089}
15090
15091/* Implement TARGET_OPTION_PRINT. */
15092
15093static void
15094aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15095{
15096 const struct processor *cpu
15097 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
28108a53 15098 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
361fb3ee 15099 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
054b4005 15100 std::string extension
04a99ebe 15101 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
361fb3ee
KT
15102
15103 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
054b4005
JG
15104 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15105 arch->name, extension.c_str ());
361fb3ee
KT
15106}
15107
d78006d9
KT
15108static GTY(()) tree aarch64_previous_fndecl;
15109
e4ea20c8
KT
15110void
15111aarch64_reset_previous_fndecl (void)
15112{
15113 aarch64_previous_fndecl = NULL;
15114}
15115
acfc1ac1
KT
15116/* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15117 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15118 make sure optab availability predicates are recomputed when necessary. */
15119
15120void
15121aarch64_save_restore_target_globals (tree new_tree)
15122{
15123 if (TREE_TARGET_GLOBALS (new_tree))
15124 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15125 else if (new_tree == target_option_default_node)
15126 restore_target_globals (&default_target_globals);
15127 else
15128 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15129}
15130
d78006d9
KT
15131/* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
15132 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15133 of the function, if such exists. This function may be called multiple
15134 times on a single function so use aarch64_previous_fndecl to avoid
15135 setting up identical state. */
15136
15137static void
15138aarch64_set_current_function (tree fndecl)
15139{
acfc1ac1
KT
15140 if (!fndecl || fndecl == aarch64_previous_fndecl)
15141 return;
15142
d78006d9
KT
15143 tree old_tree = (aarch64_previous_fndecl
15144 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15145 : NULL_TREE);
15146
acfc1ac1 15147 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
d78006d9 15148
acfc1ac1
KT
15149 /* If current function has no attributes but the previous one did,
15150 use the default node. */
15151 if (!new_tree && old_tree)
15152 new_tree = target_option_default_node;
d78006d9 15153
acfc1ac1
KT
15154 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
15155 the default have been handled by aarch64_save_restore_target_globals from
15156 aarch64_pragma_target_parse. */
15157 if (old_tree == new_tree)
15158 return;
d78006d9 15159
acfc1ac1 15160 aarch64_previous_fndecl = fndecl;
6e17a23b 15161
acfc1ac1
KT
15162 /* First set the target options. */
15163 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
6e17a23b 15164
acfc1ac1 15165 aarch64_save_restore_target_globals (new_tree);
d78006d9 15166}
361fb3ee 15167
5a2c8331
KT
15168/* Enum describing the various ways we can handle attributes.
15169 In many cases we can reuse the generic option handling machinery. */
15170
15171enum aarch64_attr_opt_type
15172{
15173 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
15174 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
15175 aarch64_attr_enum, /* Attribute sets an enum variable. */
15176 aarch64_attr_custom /* Attribute requires a custom handling function. */
15177};
15178
15179/* All the information needed to handle a target attribute.
15180 NAME is the name of the attribute.
9c582551 15181 ATTR_TYPE specifies the type of behavior of the attribute as described
5a2c8331
KT
15182 in the definition of enum aarch64_attr_opt_type.
15183 ALLOW_NEG is true if the attribute supports a "no-" form.
ab93e9b7
SE
15184 HANDLER is the function that takes the attribute string as an argument.
15185 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
5a2c8331 15186 OPT_NUM is the enum specifying the option that the attribute modifies.
9c582551 15187 This is needed for attributes that mirror the behavior of a command-line
5a2c8331
KT
15188 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
15189 aarch64_attr_enum. */
15190
15191struct aarch64_attribute_info
15192{
15193 const char *name;
15194 enum aarch64_attr_opt_type attr_type;
15195 bool allow_neg;
ab93e9b7 15196 bool (*handler) (const char *);
5a2c8331
KT
15197 enum opt_code opt_num;
15198};
15199
ab93e9b7 15200/* Handle the ARCH_STR argument to the arch= target attribute. */
5a2c8331
KT
15201
15202static bool
ab93e9b7 15203aarch64_handle_attr_arch (const char *str)
5a2c8331
KT
15204{
15205 const struct processor *tmp_arch = NULL;
c7887347 15206 std::string invalid_extension;
5a2c8331 15207 enum aarch64_parse_opt_result parse_res
c7887347 15208 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
15209
15210 if (parse_res == AARCH64_PARSE_OK)
15211 {
15212 gcc_assert (tmp_arch);
15213 selected_arch = tmp_arch;
15214 explicit_arch = selected_arch->arch;
15215 return true;
15216 }
15217
15218 switch (parse_res)
15219 {
15220 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15221 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
5a2c8331
KT
15222 break;
15223 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15224 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
01f44038 15225 aarch64_print_hint_for_arch (str);
5a2c8331
KT
15226 break;
15227 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15228 error ("invalid feature modifier %s of value (\"%s\") in "
15229 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15230 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
15231 break;
15232 default:
15233 gcc_unreachable ();
15234 }
15235
15236 return false;
15237}
15238
ab93e9b7 15239/* Handle the argument CPU_STR to the cpu= target attribute. */
5a2c8331
KT
15240
15241static bool
ab93e9b7 15242aarch64_handle_attr_cpu (const char *str)
5a2c8331
KT
15243{
15244 const struct processor *tmp_cpu = NULL;
c7887347 15245 std::string invalid_extension;
5a2c8331 15246 enum aarch64_parse_opt_result parse_res
c7887347 15247 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
5a2c8331
KT
15248
15249 if (parse_res == AARCH64_PARSE_OK)
15250 {
15251 gcc_assert (tmp_cpu);
15252 selected_tune = tmp_cpu;
15253 explicit_tune_core = selected_tune->ident;
15254
15255 selected_arch = &all_architectures[tmp_cpu->arch];
15256 explicit_arch = selected_arch->arch;
15257 return true;
15258 }
15259
15260 switch (parse_res)
15261 {
15262 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15263 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
5a2c8331
KT
15264 break;
15265 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15266 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
01f44038 15267 aarch64_print_hint_for_core (str);
5a2c8331
KT
15268 break;
15269 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15270 error ("invalid feature modifier %s of value (\"%s\") in "
15271 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15272 aarch64_print_hint_for_extensions (invalid_extension);
5a2c8331
KT
15273 break;
15274 default:
15275 gcc_unreachable ();
15276 }
15277
15278 return false;
15279}
15280
efac62a3
ST
15281/* Handle the argument STR to the branch-protection= attribute. */
15282
15283 static bool
15284 aarch64_handle_attr_branch_protection (const char* str)
15285 {
81e40f3a 15286 char *err_str = (char *) xmalloc (strlen (str) + 1);
efac62a3
ST
15287 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15288 &err_str);
15289 bool success = false;
15290 switch (res)
15291 {
15292 case AARCH64_PARSE_MISSING_ARG:
15293 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15294 " attribute");
15295 break;
15296 case AARCH64_PARSE_INVALID_ARG:
15297 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15298 "=\")%> pragma or attribute", err_str);
15299 break;
15300 case AARCH64_PARSE_OK:
15301 success = true;
15302 /* Fall through. */
15303 case AARCH64_PARSE_INVALID_FEATURE:
15304 break;
15305 default:
15306 gcc_unreachable ();
15307 }
15308 free (err_str);
15309 return success;
15310 }
15311
ab93e9b7 15312/* Handle the argument STR to the tune= target attribute. */
5a2c8331
KT
15313
15314static bool
ab93e9b7 15315aarch64_handle_attr_tune (const char *str)
5a2c8331
KT
15316{
15317 const struct processor *tmp_tune = NULL;
15318 enum aarch64_parse_opt_result parse_res
15319 = aarch64_parse_tune (str, &tmp_tune);
15320
15321 if (parse_res == AARCH64_PARSE_OK)
15322 {
15323 gcc_assert (tmp_tune);
15324 selected_tune = tmp_tune;
15325 explicit_tune_core = selected_tune->ident;
15326 return true;
15327 }
15328
15329 switch (parse_res)
15330 {
15331 case AARCH64_PARSE_INVALID_ARG:
ab93e9b7 15332 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
01f44038 15333 aarch64_print_hint_for_core (str);
5a2c8331
KT
15334 break;
15335 default:
15336 gcc_unreachable ();
15337 }
15338
15339 return false;
15340}
15341
15342/* Parse an architecture extensions target attribute string specified in STR.
15343 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15344 if successful. Update aarch64_isa_flags to reflect the ISA features
ab93e9b7 15345 modified. */
5a2c8331
KT
15346
15347static bool
ab93e9b7 15348aarch64_handle_attr_isa_flags (char *str)
5a2c8331
KT
15349{
15350 enum aarch64_parse_opt_result parse_res;
28108a53 15351 uint64_t isa_flags = aarch64_isa_flags;
5a2c8331 15352
e4ea20c8
KT
15353 /* We allow "+nothing" in the beginning to clear out all architectural
15354 features if the user wants to handpick specific features. */
15355 if (strncmp ("+nothing", str, 8) == 0)
15356 {
15357 isa_flags = 0;
15358 str += 8;
15359 }
15360
c7887347
ML
15361 std::string invalid_extension;
15362 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
5a2c8331
KT
15363
15364 if (parse_res == AARCH64_PARSE_OK)
15365 {
15366 aarch64_isa_flags = isa_flags;
15367 return true;
15368 }
15369
15370 switch (parse_res)
15371 {
15372 case AARCH64_PARSE_MISSING_ARG:
ab93e9b7 15373 error ("missing value in %<target()%> pragma or attribute");
5a2c8331
KT
15374 break;
15375
15376 case AARCH64_PARSE_INVALID_FEATURE:
c7887347
ML
15377 error ("invalid feature modifier %s of value (\"%s\") in "
15378 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
5a2c8331
KT
15379 break;
15380
15381 default:
15382 gcc_unreachable ();
15383 }
15384
15385 return false;
15386}
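/* For example (illustration only):
     __attribute__ ((target ("+crc")))		adds CRC on top of the
						current set of ISA flags
     __attribute__ ((target ("+nothing+simd")))	clears all architectural
						features first and then
						enables only what "+simd"
						turns on.  */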
15387
15388/* The target attributes that we support. On top of these we also support just
15389 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15390 handled explicitly in aarch64_process_one_target_attr. */
15391
15392static const struct aarch64_attribute_info aarch64_attributes[] =
15393{
15394 { "general-regs-only", aarch64_attr_mask, false, NULL,
15395 OPT_mgeneral_regs_only },
15396 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15397 OPT_mfix_cortex_a53_835769 },
48bb1a55
CL
15398 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15399 OPT_mfix_cortex_a53_843419 },
5a2c8331 15400 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
675d044c 15401 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
5a2c8331
KT
15402 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15403 OPT_momit_leaf_frame_pointer },
15404 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15405 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15406 OPT_march_ },
15407 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15408 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15409 OPT_mtune_ },
efac62a3
ST
15410 { "branch-protection", aarch64_attr_custom, false,
15411 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
db58fd89
JW
15412 { "sign-return-address", aarch64_attr_enum, false, NULL,
15413 OPT_msign_return_address_ },
9e02b45f
ML
15414 { "outline-atomics", aarch64_attr_bool, true, NULL,
15415 OPT_moutline_atomics},
5a2c8331
KT
15416 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
15417};
15418
15419/* Parse ARG_STR which contains the definition of one target attribute.
ab93e9b7 15420 Show appropriate errors if any or return true if the attribute is valid. */
5a2c8331
KT
15421
15422static bool
ab93e9b7 15423aarch64_process_one_target_attr (char *arg_str)
5a2c8331
KT
15424{
15425 bool invert = false;
15426
15427 size_t len = strlen (arg_str);
15428
15429 if (len == 0)
15430 {
ab93e9b7 15431 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
15432 return false;
15433 }
15434
15435 char *str_to_check = (char *) alloca (len + 1);
15436 strcpy (str_to_check, arg_str);
15437
5a2c8331
KT
15438 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15439 It is easier to detect and handle it explicitly here rather than going
15440 through the machinery for the rest of the target attributes in this
15441 function. */
15442 if (*str_to_check == '+')
ab93e9b7 15443 return aarch64_handle_attr_isa_flags (str_to_check);
5a2c8331
KT
15444
15445 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15446 {
15447 invert = true;
15448 str_to_check += 3;
15449 }
15450 char *arg = strchr (str_to_check, '=');
15451
15452 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15453 and point ARG to "foo". */
15454 if (arg)
15455 {
15456 *arg = '\0';
15457 arg++;
15458 }
15459 const struct aarch64_attribute_info *p_attr;
16d12992 15460 bool found = false;
5a2c8331
KT
15461 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15462 {
15463 /* If the names don't match up, or the user has given an argument
15464 to an attribute that doesn't accept one, or didn't give an argument
15465 to an attribute that expects one, fail to match. */
15466 if (strcmp (str_to_check, p_attr->name) != 0)
15467 continue;
15468
16d12992 15469 found = true;
5a2c8331
KT
15470 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15471 || p_attr->attr_type == aarch64_attr_enum;
15472
15473 if (attr_need_arg_p ^ (arg != NULL))
15474 {
ab93e9b7 15475 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
5a2c8331
KT
15476 return false;
15477 }
15478
15479 /* If the name matches but the attribute does not allow "no-" versions
15480 then we can't match. */
15481 if (invert && !p_attr->allow_neg)
15482 {
ab93e9b7 15483 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
5a2c8331
KT
15484 return false;
15485 }
15486
15487 switch (p_attr->attr_type)
15488 {
15489 /* Has a custom handler registered.
15490 For example, cpu=, arch=, tune=. */
15491 case aarch64_attr_custom:
15492 gcc_assert (p_attr->handler);
ab93e9b7 15493 if (!p_attr->handler (arg))
5a2c8331
KT
15494 return false;
15495 break;
15496
15497 /* Either set or unset a boolean option. */
15498 case aarch64_attr_bool:
15499 {
15500 struct cl_decoded_option decoded;
15501
15502 generate_option (p_attr->opt_num, NULL, !invert,
15503 CL_TARGET, &decoded);
15504 aarch64_handle_option (&global_options, &global_options_set,
15505 &decoded, input_location);
15506 break;
15507 }
15508 /* Set or unset a bit in the target_flags. aarch64_handle_option
15509 should know what mask to apply given the option number. */
15510 case aarch64_attr_mask:
15511 {
15512 struct cl_decoded_option decoded;
15513 /* We only need to specify the option number.
15514 aarch64_handle_option will know which mask to apply. */
15515 decoded.opt_index = p_attr->opt_num;
15516 decoded.value = !invert;
15517 aarch64_handle_option (&global_options, &global_options_set,
15518 &decoded, input_location);
15519 break;
15520 }
15521 /* Use the option setting machinery to set an option to an enum. */
15522 case aarch64_attr_enum:
15523 {
15524 gcc_assert (arg);
15525 bool valid;
15526 int value;
15527 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15528 &value, CL_TARGET);
15529 if (valid)
15530 {
15531 set_option (&global_options, NULL, p_attr->opt_num, value,
15532 NULL, DK_UNSPECIFIED, input_location,
15533 global_dc);
15534 }
15535 else
15536 {
ab93e9b7 15537 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
5a2c8331
KT
15538 }
15539 break;
15540 }
15541 default:
15542 gcc_unreachable ();
15543 }
15544 }
15545
16d12992
KT
15546 /* If we reached here we either have found an attribute and validated
15547 it or didn't match any. If we matched an attribute but its arguments
15548 were malformed we will have returned false already. */
15549 return found;
5a2c8331
KT
15550}
15551
15552/* Count how many times the character C appears in
15553 NULL-terminated string STR. */
15554
15555static unsigned int
15556num_occurences_in_str (char c, char *str)
15557{
15558 unsigned int res = 0;
15559 while (*str != '\0')
15560 {
15561 if (*str == c)
15562 res++;
15563
15564 str++;
15565 }
15566
15567 return res;
15568}
15569
15570/* Parse the tree in ARGS that contains the target attribute information
ab93e9b7 15571 and update the global target options space. */
5a2c8331
KT
15572
15573bool
ab93e9b7 15574aarch64_process_target_attr (tree args)
5a2c8331
KT
15575{
15576 if (TREE_CODE (args) == TREE_LIST)
15577 {
15578 do
15579 {
15580 tree head = TREE_VALUE (args);
15581 if (head)
15582 {
ab93e9b7 15583 if (!aarch64_process_target_attr (head))
5a2c8331
KT
15584 return false;
15585 }
15586 args = TREE_CHAIN (args);
15587 } while (args);
15588
15589 return true;
15590 }
3b6cb9e3
ML
15591
15592 if (TREE_CODE (args) != STRING_CST)
15593 {
15594 error ("attribute %<target%> argument not a string");
15595 return false;
15596 }
5a2c8331
KT
15597
15598 size_t len = strlen (TREE_STRING_POINTER (args));
15599 char *str_to_check = (char *) alloca (len + 1);
15600 strcpy (str_to_check, TREE_STRING_POINTER (args));
15601
15602 if (len == 0)
15603 {
ab93e9b7 15604 error ("malformed %<target()%> pragma or attribute");
5a2c8331
KT
15605 return false;
15606 }
15607
15608 /* Used to catch empty spaces between commas i.e.
15609 attribute ((target ("attr1,,attr2"))). */
15610 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15611
15612 /* Handle multiple target attributes separated by ','. */
7185a4eb 15613 char *token = strtok_r (str_to_check, ",", &str_to_check);
5a2c8331
KT
15614
15615 unsigned int num_attrs = 0;
15616 while (token)
15617 {
15618 num_attrs++;
ab93e9b7 15619 if (!aarch64_process_one_target_attr (token))
5a2c8331 15620 {
ab93e9b7 15621 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
5a2c8331
KT
15622 return false;
15623 }
15624
7185a4eb 15625 token = strtok_r (NULL, ",", &str_to_check);
5a2c8331
KT
15626 }
15627
15628 if (num_attrs != num_commas + 1)
15629 {
ab93e9b7 15630 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
5a2c8331
KT
15631 return false;
15632 }
15633
15634 return true;
15635}
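/* For example (illustration only), target ("arch=armv8.2-a,no-strict-align")
   is split into two attributes, while target ("arch=armv8.2-a,,nosimd")
   is rejected by the num_commas check above: two commas would require
   three tokens, but strtok_r only produces two.  */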
15636
15637/* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15638 process attribute ((target ("..."))). */
15639
15640static bool
15641aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15642{
15643 struct cl_target_option cur_target;
15644 bool ret;
15645 tree old_optimize;
15646 tree new_target, new_optimize;
15647 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
91d0e8de
KT
15648
15649 /* If what we're processing is the current pragma string then the
15650 target option node is already stored in target_option_current_node
15651 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15652 having to re-parse the string. This is especially useful to keep
15653 arm_neon.h compile times down since that header contains a lot
15654 of intrinsics enclosed in pragmas. */
15655 if (!existing_target && args == current_target_pragma)
15656 {
15657 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15658 return true;
15659 }
5a2c8331
KT
15660 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15661
15662 old_optimize = build_optimization_node (&global_options);
15663 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15664
15665 /* If the function changed the optimization levels as well as setting
15666 target options, start with the optimizations specified. */
15667 if (func_optimize && func_optimize != old_optimize)
15668 cl_optimization_restore (&global_options,
15669 TREE_OPTIMIZATION (func_optimize));
15670
15671 /* Save the current target options to restore at the end. */
15672 cl_target_option_save (&cur_target, &global_options);
15673
15674 /* If fndecl already has some target attributes applied to it, unpack
15675 them so that we add this attribute on top of them, rather than
15676 overwriting them. */
15677 if (existing_target)
15678 {
15679 struct cl_target_option *existing_options
15680 = TREE_TARGET_OPTION (existing_target);
15681
15682 if (existing_options)
15683 cl_target_option_restore (&global_options, existing_options);
15684 }
15685 else
15686 cl_target_option_restore (&global_options,
15687 TREE_TARGET_OPTION (target_option_current_node));
15688
ab93e9b7 15689 ret = aarch64_process_target_attr (args);
5a2c8331
KT
15690
15691 /* Set up any additional state. */
15692 if (ret)
15693 {
15694 aarch64_override_options_internal (&global_options);
e95a988a
KT
15695 /* Initialize SIMD builtins if we haven't already.
15696 Set current_target_pragma to NULL for the duration so that
15697 the builtin initialization code doesn't try to tag the functions
15698 being built with the attributes specified by any current pragma, thus
15699 going into an infinite recursion. */
15700 if (TARGET_SIMD)
15701 {
15702 tree saved_current_target_pragma = current_target_pragma;
15703 current_target_pragma = NULL;
15704 aarch64_init_simd_builtins ();
15705 current_target_pragma = saved_current_target_pragma;
15706 }
5a2c8331
KT
15707 new_target = build_target_option_node (&global_options);
15708 }
15709 else
15710 new_target = NULL;
15711
15712 new_optimize = build_optimization_node (&global_options);
15713
15714 if (fndecl && ret)
15715 {
15716 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15717
15718 if (old_optimize != new_optimize)
15719 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15720 }
15721
15722 cl_target_option_restore (&global_options, &cur_target);
15723
15724 if (old_optimize != new_optimize)
15725 cl_optimization_restore (&global_options,
15726 TREE_OPTIMIZATION (old_optimize));
15727 return ret;
15728}
15729
1fd8d40c
KT
15730/* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15731 tri-bool options (yes, no, don't care) and the default value is
15732 DEF, determine whether inlining is allowed. */
15733
15734static bool
15735aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15736 int dont_care, int def)
15737{
15738 /* If the callee doesn't care, always allow inlining. */
15739 if (callee == dont_care)
15740 return true;
15741
15742 /* If the caller doesn't care, always allow inlining. */
15743 if (caller == dont_care)
15744 return true;
15745
15746 /* Otherwise, allow inlining if either the callee and caller values
15747 agree, or if the callee is using the default value. */
15748 return (callee == caller || callee == def);
15749}
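/* For example, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer case below):
     caller 1, callee 2  -> inline (callee doesn't care)
     caller 2, callee 0  -> inline (caller doesn't care)
     caller 0, callee 1  -> inline (callee uses the default value)
     caller 1, callee 0  -> reject (explicit, non-default mismatch).  */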
15750
15751/* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15752 to inline CALLEE into CALLER based on target-specific info.
15753 Make sure that the caller and callee have compatible architectural
15754 features. Then go through the other possible target attributes
15755 and see if they can block inlining. Try not to reject always_inline
15756 callees unless they are incompatible architecturally. */
15757
15758static bool
15759aarch64_can_inline_p (tree caller, tree callee)
15760{
15761 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15762 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15763
1fd8d40c
KT
15764 struct cl_target_option *caller_opts
15765 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15766 : target_option_default_node);
15767
675d044c
SD
15768 struct cl_target_option *callee_opts
15769 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15770 : target_option_default_node);
1fd8d40c
KT
15771
15772 /* Callee's ISA flags should be a subset of the caller's. */
15773 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15774 != callee_opts->x_aarch64_isa_flags)
15775 return false;
15776
15777 /* Allow non-strict aligned functions inlining into strict
15778 aligned ones. */
15779 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15780 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15781 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15782 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15783 return false;
15784
15785 bool always_inline = lookup_attribute ("always_inline",
15786 DECL_ATTRIBUTES (callee));
15787
15788 /* If the architectural features match up and the callee is always_inline
15789 then the other attributes don't matter. */
15790 if (always_inline)
15791 return true;
15792
15793 if (caller_opts->x_aarch64_cmodel_var
15794 != callee_opts->x_aarch64_cmodel_var)
15795 return false;
15796
15797 if (caller_opts->x_aarch64_tls_dialect
15798 != callee_opts->x_aarch64_tls_dialect)
15799 return false;
15800
15801 /* Honour explicit requests to work around errata. */
15802 if (!aarch64_tribools_ok_for_inlining_p (
15803 caller_opts->x_aarch64_fix_a53_err835769,
15804 callee_opts->x_aarch64_fix_a53_err835769,
15805 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15806 return false;
15807
48bb1a55
CL
15808 if (!aarch64_tribools_ok_for_inlining_p (
15809 caller_opts->x_aarch64_fix_a53_err843419,
15810 callee_opts->x_aarch64_fix_a53_err843419,
15811 2, TARGET_FIX_ERR_A53_843419))
15812 return false;
15813
1fd8d40c
KT
15814 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15815 caller and callee and they don't match up, reject inlining. */
15816 if (!aarch64_tribools_ok_for_inlining_p (
15817 caller_opts->x_flag_omit_leaf_frame_pointer,
15818 callee_opts->x_flag_omit_leaf_frame_pointer,
15819 2, 1))
15820 return false;
15821
15822 /* If the callee has specific tuning overrides, respect them. */
15823 if (callee_opts->x_aarch64_override_tune_string != NULL
15824 && caller_opts->x_aarch64_override_tune_string == NULL)
15825 return false;
15826
15827 /* If the user specified tuning override strings for the
15828 caller and callee and they don't match up, reject inlining.
15829 We just do a string compare here, we don't analyze the meaning
15830 of the string, as it would be too costly for little gain. */
15831 if (callee_opts->x_aarch64_override_tune_string
15832 && caller_opts->x_aarch64_override_tune_string
15833 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15834 caller_opts->x_aarch64_override_tune_string) != 0))
15835 return false;
15836
15837 return true;
15838}
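/* For example (illustration only): a callee declared with
   __attribute__ ((target ("+sve"))) cannot be inlined into a caller
   compiled without SVE, because the callee's ISA flags are not a
   subset of the caller's; inlining in the opposite direction is
   allowed, subject to the other checks above.  */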
15839
bb6ce448
RS
15840/* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
15841 been already. */
15842
15843unsigned int
15844aarch64_tlsdesc_abi_id ()
15845{
15846 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15847 if (!tlsdesc_abi.initialized_p ())
15848 {
15849 HARD_REG_SET full_reg_clobbers;
15850 CLEAR_HARD_REG_SET (full_reg_clobbers);
15851 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15852 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15853 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15854 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15855 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15856 }
15857 return tlsdesc_abi.id ();
15858}
15859
43e9d192
IB
15860/* Return true if SYMBOL_REF X binds locally. */
15861
15862static bool
15863aarch64_symbol_binds_local_p (const_rtx x)
15864{
15865 return (SYMBOL_REF_DECL (x)
15866 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15867 : SYMBOL_REF_LOCAL_P (x));
15868}
15869
15870/* Return true if SYMBOL_REF X is thread local */
15871static bool
15872aarch64_tls_symbol_p (rtx x)
15873{
15874 if (! TARGET_HAVE_TLS)
15875 return false;
15876
15877 if (GET_CODE (x) != SYMBOL_REF)
15878 return false;
15879
15880 return SYMBOL_REF_TLS_MODEL (x) != 0;
15881}
15882
15883/* Classify a TLS symbol into one of the TLS kinds. */
15884enum aarch64_symbol_type
15885aarch64_classify_tls_symbol (rtx x)
15886{
15887 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15888
15889 switch (tls_kind)
15890 {
15891 case TLS_MODEL_GLOBAL_DYNAMIC:
15892 case TLS_MODEL_LOCAL_DYNAMIC:
15893 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15894
15895 case TLS_MODEL_INITIAL_EXEC:
5ae7caad
JW
15896 switch (aarch64_cmodel)
15897 {
15898 case AARCH64_CMODEL_TINY:
15899 case AARCH64_CMODEL_TINY_PIC:
15900 return SYMBOL_TINY_TLSIE;
15901 default:
79496620 15902 return SYMBOL_SMALL_TLSIE;
5ae7caad 15903 }
43e9d192
IB
15904
15905 case TLS_MODEL_LOCAL_EXEC:
cbf5629e
JW
15906 if (aarch64_tls_size == 12)
15907 return SYMBOL_TLSLE12;
15908 else if (aarch64_tls_size == 24)
15909 return SYMBOL_TLSLE24;
15910 else if (aarch64_tls_size == 32)
15911 return SYMBOL_TLSLE32;
15912 else if (aarch64_tls_size == 48)
15913 return SYMBOL_TLSLE48;
15914 else
15915 gcc_unreachable ();
43e9d192
IB
15916
15917 case TLS_MODEL_EMULATED:
15918 case TLS_MODEL_NONE:
15919 return SYMBOL_FORCE_TO_MEM;
15920
15921 default:
15922 gcc_unreachable ();
15923 }
15924}
15925
43cacb12
RS
15926/* Return the correct method for accessing X + OFFSET, where X is either
15927 a SYMBOL_REF or LABEL_REF. */
17f4d4bf 15928
43e9d192 15929enum aarch64_symbol_type
43cacb12 15930aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
43e9d192
IB
15931{
15932 if (GET_CODE (x) == LABEL_REF)
15933 {
15934 switch (aarch64_cmodel)
15935 {
15936 case AARCH64_CMODEL_LARGE:
15937 return SYMBOL_FORCE_TO_MEM;
15938
15939 case AARCH64_CMODEL_TINY_PIC:
15940 case AARCH64_CMODEL_TINY:
a5350ddc
CSS
15941 return SYMBOL_TINY_ABSOLUTE;
15942
1b1e81f8 15943 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
15944 case AARCH64_CMODEL_SMALL_PIC:
15945 case AARCH64_CMODEL_SMALL:
15946 return SYMBOL_SMALL_ABSOLUTE;
15947
15948 default:
15949 gcc_unreachable ();
15950 }
15951 }
15952
17f4d4bf 15953 if (GET_CODE (x) == SYMBOL_REF)
43e9d192 15954 {
43e9d192
IB
15955 if (aarch64_tls_symbol_p (x))
15956 return aarch64_classify_tls_symbol (x);
15957
17f4d4bf
CSS
15958 switch (aarch64_cmodel)
15959 {
15960 case AARCH64_CMODEL_TINY:
15f6e0da 15961 /* When we retrieve symbol + offset address, we have to make sure
f8b756b7
TB
15962 the offset does not cause overflow of the final address. But
15963 we have no way of knowing the address of symbol at compile time
15964 so we can't accurately say if the distance between the PC and
7d3b27ff
WD
15965 symbol + offset is outside the addressable range of +/-1MB in the
15966 TINY code model. So we limit the maximum offset to +/-64KB and
15967 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15968 If offset_within_block_p is true we allow larger offsets.
15969 Furthermore force to memory if the symbol is a weak reference to
15970 something that doesn't resolve to a symbol in this module. */
15971
15972 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
a5350ddc 15973 return SYMBOL_FORCE_TO_MEM;
7d3b27ff
WD
15974 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15975 || offset_within_block_p (x, offset)))
15976 return SYMBOL_FORCE_TO_MEM;
15977
a5350ddc
CSS
15978 return SYMBOL_TINY_ABSOLUTE;
15979
17f4d4bf 15980 case AARCH64_CMODEL_SMALL:
f8b756b7 15981 /* Same reasoning as the tiny code model, but the offset cap here is
7d3b27ff
WD
15982 1MB, allowing +/-3.9GB for the offset to the symbol. */
15983
15984 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
17f4d4bf 15985 return SYMBOL_FORCE_TO_MEM;
7d3b27ff
WD
15986 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15987 || offset_within_block_p (x, offset)))
15988 return SYMBOL_FORCE_TO_MEM;
15989
17f4d4bf 15990 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 15991
17f4d4bf 15992 case AARCH64_CMODEL_TINY_PIC:
38e6c9a6 15993 if (!aarch64_symbol_binds_local_p (x))
87dd8ab0 15994 return SYMBOL_TINY_GOT;
38e6c9a6
MS
15995 return SYMBOL_TINY_ABSOLUTE;
15996
1b1e81f8 15997 case AARCH64_CMODEL_SMALL_SPIC:
17f4d4bf
CSS
15998 case AARCH64_CMODEL_SMALL_PIC:
15999 if (!aarch64_symbol_binds_local_p (x))
1b1e81f8
JW
16000 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16001 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
17f4d4bf 16002 return SYMBOL_SMALL_ABSOLUTE;
43e9d192 16003
9ee6540a
WD
16004 case AARCH64_CMODEL_LARGE:
16005 /* This is alright even in PIC code as the constant
16006 pool reference is always PC relative and within
16007 the same translation unit. */
d47d34bb 16008 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
9ee6540a
WD
16009 return SYMBOL_SMALL_ABSOLUTE;
16010 else
16011 return SYMBOL_FORCE_TO_MEM;
16012
17f4d4bf
CSS
16013 default:
16014 gcc_unreachable ();
16015 }
43e9d192 16016 }
17f4d4bf 16017
43e9d192
IB
16018 /* By default push everything into the constant pool. */
16019 return SYMBOL_FORCE_TO_MEM;
16020}
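/* For example, in the small code model a reference such as
   symbol + 0x80000 is still classified as SYMBOL_SMALL_ABSOLUTE (the
   offset is within the +/-1MB cap), whereas symbol + 0x200000 outside
   a known-size block is forced to memory via SYMBOL_FORCE_TO_MEM.  */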
16021
43e9d192
IB
16022bool
16023aarch64_constant_address_p (rtx x)
16024{
16025 return (CONSTANT_P (x) && memory_address_p (DImode, x));
16026}
16027
16028bool
16029aarch64_legitimate_pic_operand_p (rtx x)
16030{
16031 if (GET_CODE (x) == SYMBOL_REF
16032 || (GET_CODE (x) == CONST
16033 && GET_CODE (XEXP (x, 0)) == PLUS
16034 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
16035 return false;
16036
16037 return true;
16038}
16039
26895c21
WD
16040/* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
16041 that should be rematerialized rather than spilled. */
3520f7cc 16042
43e9d192 16043static bool
ef4bddc2 16044aarch64_legitimate_constant_p (machine_mode mode, rtx x)
43e9d192 16045{
26895c21 16046 /* Support CSE and rematerialization of common constants. */
c0bb5bc5 16047 if (CONST_INT_P (x)
9f7b87ca 16048 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
c0bb5bc5 16049 || GET_CODE (x) == CONST_VECTOR)
26895c21
WD
16050 return true;
16051
43cacb12
RS
16052 /* Do not allow vector struct mode constants for Advanced SIMD.
16053 We could support 0 and -1 easily, but they need support in
16054 aarch64-simd.md. */
16055 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16056 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
43e9d192
IB
16057 return false;
16058
43cacb12
RS
16059 /* Only accept variable-length vector constants if they can be
16060 handled directly.
16061
16062 ??? It would be possible to handle rematerialization of other
16063 constants via secondary reloads. */
16064 if (vec_flags & VEC_ANY_SVE)
16065 return aarch64_simd_valid_immediate (x, NULL);
16066
509bb9b6
RS
16067 if (GET_CODE (x) == HIGH)
16068 x = XEXP (x, 0);
16069
43cacb12
RS
16070 /* Accept polynomial constants that can be calculated by using the
16071 destination of a move as the sole temporary. Constants that
16072 require a second temporary cannot be rematerialized (they can't be
16073 forced to memory and also aren't legitimate constants). */
16074 poly_int64 offset;
16075 if (poly_int_rtx_p (x, &offset))
16076 return aarch64_offset_temporaries (false, offset) <= 1;
16077
16078 /* If an offset is being added to something else, we need to allow the
16079 base to be moved into the destination register, meaning that there
16080 are no free temporaries for the offset. */
16081 x = strip_offset (x, &offset);
16082 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16083 return false;
26895c21 16084
43cacb12
RS
16085 /* Do not allow const (plus (anchor_symbol, const_int)). */
16086 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16087 return false;
26895c21 16088
f28e54bd
WD
16089 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
16090 so spilling them is better than rematerialization. */
16091 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16092 return true;
16093
26895c21
WD
16094 /* Label references are always constant. */
16095 if (GET_CODE (x) == LABEL_REF)
16096 return true;
16097
16098 return false;
43e9d192
IB
16099}
16100
a5bc806c 16101rtx
43e9d192
IB
16102aarch64_load_tp (rtx target)
16103{
16104 if (!target
16105 || GET_MODE (target) != Pmode
16106 || !register_operand (target, Pmode))
16107 target = gen_reg_rtx (Pmode);
16108
16109 /* Can return in any reg. */
16110 emit_insn (gen_aarch64_load_tp_hard (target));
16111 return target;
16112}
16113
43e9d192
IB
16114/* On AAPCS systems, this is the "struct __va_list". */
16115static GTY(()) tree va_list_type;
16116
16117/* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16118 Return the type to use as __builtin_va_list.
16119
16120 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16121
16122 struct __va_list
16123 {
16124 void *__stack;
16125 void *__gr_top;
16126 void *__vr_top;
16127 int __gr_offs;
16128 int __vr_offs;
16129 }; */
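/* An illustrative sketch of how these fields are used by the code below:
   __stack points at the next variadic argument passed on the stack;
   __gr_top and __vr_top point just past the general-register and FP/SIMD
   register save areas; __gr_offs and __vr_offs hold negative byte offsets
   from those tops to the next unused saved register and become
   non-negative once the corresponding save area is exhausted.  For
   example, with __gr_offs == -16 a va_arg (ap, long) would load from
   __gr_top - 16 and advance __gr_offs to -8; once __gr_offs >= 0,
   further integer arguments are taken from __stack instead.  */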
16130
16131static tree
16132aarch64_build_builtin_va_list (void)
16133{
16134 tree va_list_name;
16135 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16136
16137 /* Create the type. */
16138 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16139 /* Give it the required name. */
16140 va_list_name = build_decl (BUILTINS_LOCATION,
16141 TYPE_DECL,
16142 get_identifier ("__va_list"),
16143 va_list_type);
16144 DECL_ARTIFICIAL (va_list_name) = 1;
16145 TYPE_NAME (va_list_type) = va_list_name;
665c56c6 16146 TYPE_STUB_DECL (va_list_type) = va_list_name;
43e9d192
IB
16147
16148 /* Create the fields. */
16149 f_stack = build_decl (BUILTINS_LOCATION,
16150 FIELD_DECL, get_identifier ("__stack"),
16151 ptr_type_node);
16152 f_grtop = build_decl (BUILTINS_LOCATION,
16153 FIELD_DECL, get_identifier ("__gr_top"),
16154 ptr_type_node);
16155 f_vrtop = build_decl (BUILTINS_LOCATION,
16156 FIELD_DECL, get_identifier ("__vr_top"),
16157 ptr_type_node);
16158 f_groff = build_decl (BUILTINS_LOCATION,
16159 FIELD_DECL, get_identifier ("__gr_offs"),
16160 integer_type_node);
16161 f_vroff = build_decl (BUILTINS_LOCATION,
16162 FIELD_DECL, get_identifier ("__vr_offs"),
16163 integer_type_node);
16164
88e3bdd1 16165 /* Tell tree-stdarg pass about our internal offset fields.
3fd6b9cc
JW
16166 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16167 purposes, to identify whether the code is updating the va_list internal
16168 offset fields in an irregular way. */
16169 va_list_gpr_counter_field = f_groff;
16170 va_list_fpr_counter_field = f_vroff;
16171
43e9d192
IB
16172 DECL_ARTIFICIAL (f_stack) = 1;
16173 DECL_ARTIFICIAL (f_grtop) = 1;
16174 DECL_ARTIFICIAL (f_vrtop) = 1;
16175 DECL_ARTIFICIAL (f_groff) = 1;
16176 DECL_ARTIFICIAL (f_vroff) = 1;
16177
16178 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16179 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16180 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16181 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16182 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16183
16184 TYPE_FIELDS (va_list_type) = f_stack;
16185 DECL_CHAIN (f_stack) = f_grtop;
16186 DECL_CHAIN (f_grtop) = f_vrtop;
16187 DECL_CHAIN (f_vrtop) = f_groff;
16188 DECL_CHAIN (f_groff) = f_vroff;
16189
16190 /* Compute its layout. */
16191 layout_type (va_list_type);
16192
16193 return va_list_type;
16194}
16195
16196/* Implement TARGET_EXPAND_BUILTIN_VA_START. */
16197static void
16198aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16199{
16200 const CUMULATIVE_ARGS *cum;
16201 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16202 tree stack, grtop, vrtop, groff, vroff;
16203 tree t;
88e3bdd1
JW
16204 int gr_save_area_size = cfun->va_list_gpr_size;
16205 int vr_save_area_size = cfun->va_list_fpr_size;
43e9d192
IB
16206 int vr_offset;
16207
16208 cum = &crtl->args.info;
88e3bdd1
JW
16209 if (cfun->va_list_gpr_size)
16210 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16211 cfun->va_list_gpr_size);
16212 if (cfun->va_list_fpr_size)
16213 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16214 * UNITS_PER_VREG, cfun->va_list_fpr_size);
43e9d192 16215
d5726973 16216 if (!TARGET_FLOAT)
43e9d192 16217 {
261fb553 16218 gcc_assert (cum->aapcs_nvrn == 0);
43e9d192
IB
16219 vr_save_area_size = 0;
16220 }
16221
16222 f_stack = TYPE_FIELDS (va_list_type_node);
16223 f_grtop = DECL_CHAIN (f_stack);
16224 f_vrtop = DECL_CHAIN (f_grtop);
16225 f_groff = DECL_CHAIN (f_vrtop);
16226 f_vroff = DECL_CHAIN (f_groff);
16227
16228 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16229 NULL_TREE);
16230 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16231 NULL_TREE);
16232 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16233 NULL_TREE);
16234 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16235 NULL_TREE);
16236 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16237 NULL_TREE);
16238
16239 /* Emit code to initialize STACK, which points to the next varargs stack
16240 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
16241 by named arguments. STACK is 8-byte aligned. */
16242 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16243 if (cum->aapcs_stack_size > 0)
16244 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16245 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16246 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16247
16248 /* Emit code to initialize GRTOP, the top of the GR save area.
16249 virtual_incoming_args_rtx should have been 16 byte aligned. */
16250 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16251 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16252 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16253
16254 /* Emit code to initialize VRTOP, the top of the VR save area.
16255 This address is gr_save_area_bytes below GRTOP, rounded
16256 down to the next 16-byte boundary. */
16257 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
4f59f9f2
UB
16258 vr_offset = ROUND_UP (gr_save_area_size,
16259 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
16260
16261 if (vr_offset)
16262 t = fold_build_pointer_plus_hwi (t, -vr_offset);
16263 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16264 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16265
16266 /* Emit code to initialize GROFF, the offset from GRTOP of the
16267 next GPR argument. */
16268 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16269 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16270 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16271
16272 /* Likewise emit code to initialize VROFF, the offset from FTOP
16273 of the next VR argument. */
16274 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16275 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16276 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16277}
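/* A worked example of the initialization above (illustrative; it assumes
   TARGET_FLOAT and that cfun->va_list_gpr_size and cfun->va_list_fpr_size
   do not limit the save areas).  For

     void f (int first, ...);

   only x0 is used by the named argument, so aapcs_ncrn == 1 and
   aapcs_nvrn == 0, and va_start sets:

     __stack   = virtual incoming args (no named stack arguments)
     __gr_top  = virtual incoming args
     __vr_top  = __gr_top - ROUND_UP (7 * 8, 16)  == __gr_top - 64
     __gr_offs = -(7 * UNITS_PER_WORD)            == -56
     __vr_offs = -(8 * UNITS_PER_VREG)            == -128  */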
16278
16279/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
16280
16281static tree
16282aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16283 gimple_seq *post_p ATTRIBUTE_UNUSED)
16284{
16285 tree addr;
16286 bool indirect_p;
16287 bool is_ha; /* is HFA or HVA. */
16288 bool dw_align; /* double-word align. */
ef4bddc2 16289 machine_mode ag_mode = VOIDmode;
43e9d192 16290 int nregs;
ef4bddc2 16291 machine_mode mode;
43e9d192
IB
16292
16293 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16294 tree stack, f_top, f_off, off, arg, roundup, on_stack;
16295 HOST_WIDE_INT size, rsize, adjust, align;
16296 tree t, u, cond1, cond2;
16297
fde65a89 16298 indirect_p = pass_va_arg_by_reference (type);
43e9d192
IB
16299 if (indirect_p)
16300 type = build_pointer_type (type);
16301
16302 mode = TYPE_MODE (type);
16303
16304 f_stack = TYPE_FIELDS (va_list_type_node);
16305 f_grtop = DECL_CHAIN (f_stack);
16306 f_vrtop = DECL_CHAIN (f_grtop);
16307 f_groff = DECL_CHAIN (f_vrtop);
16308 f_vroff = DECL_CHAIN (f_groff);
16309
16310 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16311 f_stack, NULL_TREE);
16312 size = int_size_in_bytes (type);
c590597c
RE
16313
16314 bool abi_break;
16315 align
16316 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
43e9d192
IB
16317
16318 dw_align = false;
16319 adjust = 0;
56fe3ca3
RS
16320 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16321 &is_ha, false))
43e9d192 16322 {
6a70badb
RS
16323 /* No frontends can create types with variable-sized modes, so we
16324 shouldn't be asked to pass or return them. */
16325 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16326
43e9d192 16327 /* TYPE passed in fp/simd registers. */
d5726973 16328 if (!TARGET_FLOAT)
fc29dfc9 16329 aarch64_err_no_fpadvsimd (mode);
43e9d192
IB
16330
16331 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16332 unshare_expr (valist), f_vrtop, NULL_TREE);
16333 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16334 unshare_expr (valist), f_vroff, NULL_TREE);
16335
16336 rsize = nregs * UNITS_PER_VREG;
16337
16338 if (is_ha)
16339 {
6a70badb
RS
16340 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16341 adjust = UNITS_PER_VREG - ag_size;
43e9d192 16342 }
76b0cbf8 16343 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16344 && size < UNITS_PER_VREG)
16345 {
16346 adjust = UNITS_PER_VREG - size;
16347 }
16348 }
16349 else
16350 {
16351 /* TYPE passed in general registers. */
16352 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16353 unshare_expr (valist), f_grtop, NULL_TREE);
16354 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16355 unshare_expr (valist), f_groff, NULL_TREE);
4f59f9f2 16356 rsize = ROUND_UP (size, UNITS_PER_WORD);
43e9d192
IB
16357 nregs = rsize / UNITS_PER_WORD;
16358
16359 if (align > 8)
c590597c
RE
16360 {
16361 if (abi_break && warn_psabi)
16362 inform (input_location, "parameter passing for argument of type "
16363 "%qT changed in GCC 9.1", type);
16364 dw_align = true;
16365 }
43e9d192 16366
76b0cbf8 16367 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16368 && size < UNITS_PER_WORD)
16369 {
16370 adjust = UNITS_PER_WORD - size;
16371 }
16372 }
16373
16374 /* Get a local temporary for the field value. */
16375 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16376
16377 /* Emit code to branch if off >= 0. */
16378 t = build2 (GE_EXPR, boolean_type_node, off,
16379 build_int_cst (TREE_TYPE (off), 0));
16380 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16381
16382 if (dw_align)
16383 {
16384 /* Emit: offs = (offs + 15) & -16. */
16385 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16386 build_int_cst (TREE_TYPE (off), 15));
16387 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16388 build_int_cst (TREE_TYPE (off), -16));
16389 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16390 }
16391 else
16392 roundup = NULL;
16393
16394 /* Update ap.__[g|v]r_offs */
16395 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16396 build_int_cst (TREE_TYPE (off), rsize));
16397 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16398
16399 /* String up. */
16400 if (roundup)
16401 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16402
16403 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16404 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16405 build_int_cst (TREE_TYPE (f_off), 0));
16406 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16407
16408 /* String up: make sure the assignment happens before the use. */
16409 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16410 COND_EXPR_ELSE (cond1) = t;
16411
16412 /* Prepare the trees handling the argument that is passed on the stack;
16413 the top level node will store in ON_STACK. */
16414 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16415 if (align > 8)
16416 {
16417 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
4bdc2738 16418 t = fold_build_pointer_plus_hwi (arg, 15);
43e9d192
IB
16419 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16420 build_int_cst (TREE_TYPE (t), -16));
43e9d192
IB
16421 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16422 }
16423 else
16424 roundup = NULL;
16425 /* Advance ap.__stack */
4bdc2738 16426 t = fold_build_pointer_plus_hwi (arg, size + 7);
43e9d192
IB
16427 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16428 build_int_cst (TREE_TYPE (t), -8));
43e9d192
IB
16429 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16430 /* String up roundup and advance. */
16431 if (roundup)
16432 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16433 /* String up with arg */
16434 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16435 /* Big-endianness related address adjustment. */
76b0cbf8 16436 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
43e9d192
IB
16437 && size < UNITS_PER_WORD)
16438 {
16439 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16440 size_int (UNITS_PER_WORD - size));
16441 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16442 }
16443
16444 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16445 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16446
16447 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16448 t = off;
16449 if (adjust)
16450 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16451 build_int_cst (TREE_TYPE (off), adjust));
16452
16453 t = fold_convert (sizetype, t);
16454 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16455
16456 if (is_ha)
16457 {
16458 /* type ha; // treat as "struct {ftype field[n];}"
16459 ... [computing offs]
16460 for (i = 0; i < nregs; ++i, offs += 16)
16461 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16462 return ha; */
16463 int i;
16464 tree tmp_ha, field_t, field_ptr_t;
16465
16466 /* Declare a local variable. */
16467 tmp_ha = create_tmp_var_raw (type, "ha");
16468 gimple_add_tmp_var (tmp_ha);
16469
16470 /* Establish the base type. */
16471 switch (ag_mode)
16472 {
4e10a5a7 16473 case E_SFmode:
43e9d192
IB
16474 field_t = float_type_node;
16475 field_ptr_t = float_ptr_type_node;
16476 break;
4e10a5a7 16477 case E_DFmode:
43e9d192
IB
16478 field_t = double_type_node;
16479 field_ptr_t = double_ptr_type_node;
16480 break;
4e10a5a7 16481 case E_TFmode:
43e9d192
IB
16482 field_t = long_double_type_node;
16483 field_ptr_t = long_double_ptr_type_node;
16484 break;
4e10a5a7 16485 case E_HFmode:
1b62ed4f
JG
16486 field_t = aarch64_fp16_type_node;
16487 field_ptr_t = aarch64_fp16_ptr_type_node;
43e9d192 16488 break;
abbe1ed2
SMW
16489 case E_BFmode:
16490 field_t = aarch64_bf16_type_node;
16491 field_ptr_t = aarch64_bf16_ptr_type_node;
16492 break;
4e10a5a7
RS
16493 case E_V2SImode:
16494 case E_V4SImode:
43e9d192
IB
16495 {
16496 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16497 field_t = build_vector_type_for_mode (innertype, ag_mode);
16498 field_ptr_t = build_pointer_type (field_t);
16499 }
16500 break;
16501 default:
16502 gcc_assert (0);
16503 }
16504
16505 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
ab563903 16506 TREE_ADDRESSABLE (tmp_ha) = 1;
43e9d192
IB
16507 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16508 addr = t;
16509 t = fold_convert (field_ptr_t, addr);
16510 t = build2 (MODIFY_EXPR, field_t,
16511 build1 (INDIRECT_REF, field_t, tmp_ha),
16512 build1 (INDIRECT_REF, field_t, t));
16513
16514 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16515 for (i = 1; i < nregs; ++i)
16516 {
16517 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16518 u = fold_convert (field_ptr_t, addr);
16519 u = build2 (MODIFY_EXPR, field_t,
16520 build2 (MEM_REF, field_t, tmp_ha,
16521 build_int_cst (field_ptr_t,
16522 (i *
16523 int_size_in_bytes (field_t)))),
16524 build1 (INDIRECT_REF, field_t, u));
16525 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16526 }
16527
16528 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16529 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16530 }
16531
16532 COND_EXPR_ELSE (cond2) = t;
16533 addr = fold_convert (build_pointer_type (type), cond1);
16534 addr = build_va_arg_indirect_ref (addr);
16535
16536 if (indirect_p)
16537 addr = build_va_arg_indirect_ref (addr);
16538
16539 return addr;
16540}
16541
16542/* Implement TARGET_SETUP_INCOMING_VARARGS. */
16543
16544static void
e7056ca4
RS
16545aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16546 const function_arg_info &arg,
16547 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
43e9d192
IB
16548{
16549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16550 CUMULATIVE_ARGS local_cum;
88e3bdd1
JW
16551 int gr_saved = cfun->va_list_gpr_size;
16552 int vr_saved = cfun->va_list_fpr_size;
43e9d192
IB
16553
16554 /* The caller has advanced CUM up to, but not beyond, the last named
16555 argument. Advance a local copy of CUM past the last "real" named
16556 argument, to find out how many registers are left over. */
16557 local_cum = *cum;
6930c98c 16558 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
43e9d192 16559
88e3bdd1
JW
16560 /* Find out how many registers we need to save.
16561 Honor the tree-stdarg analysis results. */
16562 if (cfun->va_list_gpr_size)
16563 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16564 cfun->va_list_gpr_size / UNITS_PER_WORD);
16565 if (cfun->va_list_fpr_size)
16566 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16567 cfun->va_list_fpr_size / UNITS_PER_VREG);
43e9d192 16568
d5726973 16569 if (!TARGET_FLOAT)
43e9d192 16570 {
261fb553 16571 gcc_assert (local_cum.aapcs_nvrn == 0);
43e9d192
IB
16572 vr_saved = 0;
16573 }
16574
16575 if (!no_rtl)
16576 {
16577 if (gr_saved > 0)
16578 {
16579 rtx ptr, mem;
16580
16581 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16582 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16583 - gr_saved * UNITS_PER_WORD);
16584 mem = gen_frame_mem (BLKmode, ptr);
16585 set_mem_alias_set (mem, get_varargs_alias_set ());
16586
16587 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16588 mem, gr_saved);
16589 }
16590 if (vr_saved > 0)
16591 {
16592 /* We can't use move_block_from_reg, because it will use
16593 the wrong mode, storing D regs only. */
ef4bddc2 16594 machine_mode mode = TImode;
88e3bdd1 16595 int off, i, vr_start;
43e9d192
IB
16596
16597 /* Set OFF to the offset from virtual_incoming_args_rtx of
16598 the first vector register. The VR save area lies below
16599 the GR one, and is aligned to 16 bytes. */
4f59f9f2
UB
16600 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16601 STACK_BOUNDARY / BITS_PER_UNIT);
43e9d192
IB
16602 off -= vr_saved * UNITS_PER_VREG;
16603
88e3bdd1
JW
16604 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16605 for (i = 0; i < vr_saved; ++i)
43e9d192
IB
16606 {
16607 rtx ptr, mem;
16608
16609 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16610 mem = gen_frame_mem (mode, ptr);
16611 set_mem_alias_set (mem, get_varargs_alias_set ());
88e3bdd1 16612 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
43e9d192
IB
16613 off += UNITS_PER_VREG;
16614 }
16615 }
16616 }
16617
16618 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16619 any complication of having crtl->args.pretend_args_size changed. */
8799637a 16620 cfun->machine->frame.saved_varargs_size
4f59f9f2
UB
16621 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16622 STACK_BOUNDARY / BITS_PER_UNIT)
43e9d192
IB
16623 + vr_saved * UNITS_PER_VREG);
16624}
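/* Illustrative layout of the register save areas created above, for the
   gr_saved == 7, vr_saved == 8 case (addresses decrease downwards):

     virtual_incoming_args_rtx -> +--------------------------+ == __gr_top
                                  | x1 .. x7  (7 * 8 bytes)  |
                                  | 8 bytes of padding       |
     __gr_top - 64             -> +--------------------------+ == __vr_top
                                  | q0 .. q7  (8 * 16 bytes) |
     __gr_top - 192            -> +--------------------------+

   so cfun->machine->frame.saved_varargs_size is
   ROUND_UP (7 * 8, 16) + 8 * 16 == 192 bytes.  */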
16625
16626static void
16627aarch64_conditional_register_usage (void)
16628{
16629 int i;
16630 if (!TARGET_FLOAT)
16631 {
16632 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16633 {
16634 fixed_regs[i] = 1;
16635 call_used_regs[i] = 1;
16636 }
16637 }
43cacb12
RS
16638 if (!TARGET_SVE)
16639 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16640 {
16641 fixed_regs[i] = 1;
16642 call_used_regs[i] = 1;
16643 }
3751345d 16644
183bfdaf
RS
16645 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16646 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16647 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16648
3751345d
RE
16649 /* When tracking speculation, we need a couple of call-clobbered registers
16650 to track the speculation state. It would be nice to just use
16651 IP0 and IP1, but currently there are numerous places that just
16652 assume these registers are free for other uses (e.g. pointer
16653 authentication). */
16654 if (aarch64_track_speculation)
16655 {
16656 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16657 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16658 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16659 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16660 }
43e9d192
IB
16661}
16662
38e62001
RS
16663/* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16664
16665bool
16666aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16667{
16668 /* For records we're passed a FIELD_DECL, for arrays we're passed
16669 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16670 const_tree type = TREE_TYPE (field_or_array);
16671
16672 /* Assign BLKmode to anything that contains multiple SVE predicates.
16673 For structures, the "multiple" case is indicated by MODE being
16674 VOIDmode. */
16675 unsigned int num_zr, num_pr;
16676 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16677 {
16678 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16679 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16680 TYPE_SIZE (type));
16681 return mode == VOIDmode;
16682 }
16683
16684 return default_member_type_forces_blk (field_or_array, mode);
16685}
16686
56fe3ca3
RS
16687/* Bitmasks that indicate whether earlier versions of GCC would have
16688 taken a different path through the ABI logic. This should result in
16689 a -Wpsabi warning if the earlier path led to a different ABI decision.
16690
16691 WARN_PSABI_EMPTY_CXX17_BASE
16692 Indicates that the type includes an artificial empty C++17 base field
16693 that, prior to GCC 10.1, would prevent the type from being treated as
16694 a HFA or HVA. See PR94383 for details.
16695
16696 WARN_PSABI_NO_UNIQUE_ADDRESS
16697 Indicates that the type includes an empty [[no_unique_address]] field
16698 that, prior to GCC 10.1, would prevent the type from being treated as
16699 a HFA or HVA. */
16700const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16701const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
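/* An example of the kind of type these flags describe (illustrative):

     struct empty {};
     struct S { [[no_unique_address]] empty e; float x, y; };

   Before GCC 10.1 the empty member stopped S from being treated as a
   homogeneous floating-point aggregate; from GCC 10.1 the member is
   ignored, S is passed in FP/SIMD registers, and with -Wpsabi the change
   is diagnosed.  The EMPTY_CXX17_BASE case is analogous but involves the
   artificial field that represents an empty C++17 base class.  */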
16702
43e9d192
IB
16703/* Walk down the type tree of TYPE counting consecutive base elements.
16704 If *MODEP is VOIDmode, then set it to the first valid floating point
16705 type. If a non-floating point type is found, or if a floating point
16706 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
e73a32d6
MM
16707 otherwise return the count in the sub-tree.
16708
56fe3ca3
RS
16709 The WARN_PSABI_FLAGS argument allows the caller to check whether this
16710 function has changed its behavior relative to earlier versions of GCC.
16711 Normally the argument should be nonnull and point to a zero-initialized
16712 variable. The function then records whether the ABI decision might
16713 be affected by a known fix to the ABI logic, setting the associated
16714 WARN_PSABI_* bits if so.
16715
16716 When the argument is instead a null pointer, the function tries to
16717 simulate the behavior of GCC before all such ABI fixes were made.
16718 This is useful to check whether the function returns something
16719 different after the ABI fixes. */
43e9d192 16720static int
e73a32d6 16721aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
56fe3ca3 16722 unsigned int *warn_psabi_flags)
43e9d192 16723{
ef4bddc2 16724 machine_mode mode;
43e9d192
IB
16725 HOST_WIDE_INT size;
16726
38e62001
RS
16727 if (aarch64_sve::builtin_type_p (type))
16728 return -1;
c600df9a 16729
43e9d192
IB
16730 switch (TREE_CODE (type))
16731 {
16732 case REAL_TYPE:
16733 mode = TYPE_MODE (type);
1b62ed4f
JG
16734 if (mode != DFmode && mode != SFmode
16735 && mode != TFmode && mode != HFmode)
43e9d192
IB
16736 return -1;
16737
16738 if (*modep == VOIDmode)
16739 *modep = mode;
16740
16741 if (*modep == mode)
16742 return 1;
16743
16744 break;
16745
16746 case COMPLEX_TYPE:
16747 mode = TYPE_MODE (TREE_TYPE (type));
1b62ed4f
JG
16748 if (mode != DFmode && mode != SFmode
16749 && mode != TFmode && mode != HFmode)
43e9d192
IB
16750 return -1;
16751
16752 if (*modep == VOIDmode)
16753 *modep = mode;
16754
16755 if (*modep == mode)
16756 return 2;
16757
16758 break;
16759
16760 case VECTOR_TYPE:
16761 /* Use V2SImode and V4SImode as representatives of all 64-bit
16762 and 128-bit vector types. */
16763 size = int_size_in_bytes (type);
16764 switch (size)
16765 {
16766 case 8:
16767 mode = V2SImode;
16768 break;
16769 case 16:
16770 mode = V4SImode;
16771 break;
16772 default:
16773 return -1;
16774 }
16775
16776 if (*modep == VOIDmode)
16777 *modep = mode;
16778
16779 /* Vector modes are considered to be opaque: two vectors are
16780 equivalent for the purposes of being homogeneous aggregates
16781 if they are the same size. */
16782 if (*modep == mode)
16783 return 1;
16784
16785 break;
16786
16787 case ARRAY_TYPE:
16788 {
16789 int count;
16790 tree index = TYPE_DOMAIN (type);
16791
807e902e
KZ
16792 /* Can't handle incomplete types nor sizes that are not
16793 fixed. */
16794 if (!COMPLETE_TYPE_P (type)
16795 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
16796 return -1;
16797
e73a32d6 16798 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
56fe3ca3 16799 warn_psabi_flags);
43e9d192
IB
16800 if (count == -1
16801 || !index
16802 || !TYPE_MAX_VALUE (index)
cc269bb6 16803 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
43e9d192 16804 || !TYPE_MIN_VALUE (index)
cc269bb6 16805 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
43e9d192
IB
16806 || count < 0)
16807 return -1;
16808
ae7e9ddd
RS
16809 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16810 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
43e9d192
IB
16811
16812 /* There must be no padding. */
6a70badb
RS
16813 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16814 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
16815 return -1;
16816
16817 return count;
16818 }
16819
16820 case RECORD_TYPE:
16821 {
16822 int count = 0;
16823 int sub_count;
16824 tree field;
16825
807e902e
KZ
16826 /* Can't handle incomplete types nor sizes that are not
16827 fixed. */
16828 if (!COMPLETE_TYPE_P (type)
16829 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
16830 return -1;
16831
16832 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16833 {
16834 if (TREE_CODE (field) != FIELD_DECL)
16835 continue;
16836
56fe3ca3 16837 if (DECL_FIELD_ABI_IGNORED (field))
e73a32d6 16838 {
56fe3ca3
RS
16839 /* See whether this is something that earlier versions of
16840 GCC failed to ignore. */
16841 unsigned int flag;
16842 if (lookup_attribute ("no_unique_address",
16843 DECL_ATTRIBUTES (field)))
16844 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
16845 else if (cxx17_empty_base_field_p (field))
16846 flag = WARN_PSABI_EMPTY_CXX17_BASE;
16847 else
16848 /* No compatibility problem. */
16849 continue;
16850
16851 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
16852 if (warn_psabi_flags)
16853 {
16854 *warn_psabi_flags |= flag;
16855 continue;
16856 }
e73a32d6
MM
16857 }
16858
16859 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 16860 warn_psabi_flags);
43e9d192
IB
16861 if (sub_count < 0)
16862 return -1;
16863 count += sub_count;
16864 }
16865
16866 /* There must be no padding. */
6a70badb
RS
16867 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16868 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
16869 return -1;
16870
16871 return count;
16872 }
16873
16874 case UNION_TYPE:
16875 case QUAL_UNION_TYPE:
16876 {
16877 /* These aren't very interesting except in a degenerate case. */
16878 int count = 0;
16879 int sub_count;
16880 tree field;
16881
807e902e
KZ
16882 /* Can't handle incomplete types nor sizes that are not
16883 fixed. */
16884 if (!COMPLETE_TYPE_P (type)
16885 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
43e9d192
IB
16886 return -1;
16887
16888 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16889 {
16890 if (TREE_CODE (field) != FIELD_DECL)
16891 continue;
16892
e73a32d6 16893 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
56fe3ca3 16894 warn_psabi_flags);
43e9d192
IB
16895 if (sub_count < 0)
16896 return -1;
16897 count = count > sub_count ? count : sub_count;
16898 }
16899
16900 /* There must be no padding. */
6a70badb
RS
16901 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16902 count * GET_MODE_BITSIZE (*modep)))
43e9d192
IB
16903 return -1;
16904
16905 return count;
16906 }
16907
16908 default:
16909 break;
16910 }
16911
16912 return -1;
16913}
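/* Some illustrative results of the walk above:

     struct { double x, y, z; }               -> 3, *modep == DFmode
     struct { float r[2]; _Complex float c; } -> 4, *modep == SFmode
     struct { float f; double d; }            -> -1 (mixed element modes)

   A count in the range 1..4 makes the type a candidate for being passed
   as a homogeneous aggregate; see the caller further below.  */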
16914
b6ec6215
KT
16915/* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16916 type as described in AAPCS64 \S 4.1.2.
16917
16918 See the comment above aarch64_composite_type_p for the notes on MODE. */
16919
16920static bool
16921aarch64_short_vector_p (const_tree type,
16922 machine_mode mode)
16923{
6a70badb 16924 poly_int64 size = -1;
b6ec6215
KT
16925
16926 if (type && TREE_CODE (type) == VECTOR_TYPE)
38e62001
RS
16927 {
16928 if (aarch64_sve::builtin_type_p (type))
16929 return false;
16930 size = int_size_in_bytes (type);
16931 }
b6ec6215 16932 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
38e62001
RS
16933 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16934 {
16935 /* Rely only on the type, not the mode, when processing SVE types. */
16936 if (type && aarch64_some_values_include_pst_objects_p (type))
b2672dd6
FY
16937 /* Leave later code to report an error if SVE is disabled. */
16938 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
38e62001
RS
16939 else
16940 size = GET_MODE_SIZE (mode);
16941 }
16942 if (known_eq (size, 8) || known_eq (size, 16))
16943 {
16944 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16945 they are being treated as scalable AAPCS64 types. */
16946 gcc_assert (!aarch64_sve_mode_p (mode));
16947 return true;
16948 }
16949 return false;
b6ec6215
KT
16950}
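/* For example, int32x2_t (V2SImode, 8 bytes) and float64x2_t (V2DFmode,
   16 bytes) are short vectors in this sense, while an SVE type such as
   svint32_t and an Advanced SIMD tuple type such as int32x4x2_t are
   not.  */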
16951
43e9d192
IB
16952/* Return TRUE if the type, as described by TYPE and MODE, is a composite
16953 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16954 array types. The C99 floating-point complex types are also considered
16955 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16956 types, which are GCC extensions and out of the scope of AAPCS64, are
16957 treated as composite types here as well.
16958
16959 Note that MODE itself is not sufficient in determining whether a type
16960 is such a composite type or not. This is because
16961 stor-layout.c:compute_record_mode may have already changed the MODE
16962 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16963 structure with only one field may have its MODE set to the mode of the
16964 field. Also an integer mode whose size matches the size of the
16965 RECORD_TYPE type may be used to substitute the original mode
16966 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16967 solely relied on. */
16968
16969static bool
16970aarch64_composite_type_p (const_tree type,
ef4bddc2 16971 machine_mode mode)
43e9d192 16972{
b6ec6215
KT
16973 if (aarch64_short_vector_p (type, mode))
16974 return false;
16975
43e9d192
IB
16976 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16977 return true;
16978
16979 if (mode == BLKmode
16980 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16981 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16982 return true;
16983
16984 return false;
16985}
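/* For example, _Complex double, int a[4], and struct { int i; } (even when
   its mode has been changed from BLKmode to SImode by stor-layout) are
   composite types, while a plain double or an Advanced SIMD vector such as
   int32x4_t is not.  */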
16986
43e9d192
IB
16987/* Return TRUE if an argument, whose type is described by TYPE and MODE,
16988 shall be passed or returned in simd/fp register(s) (providing these
16989 parameter passing registers are available).
16990
16991 Upon successful return, *COUNT returns the number of needed registers,
16992 *BASE_MODE returns the mode of the individual register and when IS_HA
16993 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
56fe3ca3
RS
16994 floating-point aggregate or a homogeneous short-vector aggregate.
16995
16996 SILENT_P is true if the function should refrain from reporting any
16997 diagnostics. This should only be used if the caller is certain that
16998 any ABI decisions would eventually come through this function with
16999 SILENT_P set to false. */
43e9d192
IB
17000
17001static bool
ef4bddc2 17002aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
43e9d192 17003 const_tree type,
ef4bddc2 17004 machine_mode *base_mode,
43e9d192 17005 int *count,
56fe3ca3
RS
17006 bool *is_ha,
17007 bool silent_p)
43e9d192 17008{
c600df9a
RS
17009 if (is_ha != NULL) *is_ha = false;
17010
ef4bddc2 17011 machine_mode new_mode = VOIDmode;
43e9d192
IB
17012 bool composite_p = aarch64_composite_type_p (type, mode);
17013
43e9d192
IB
17014 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17015 || aarch64_short_vector_p (type, mode))
17016 {
17017 *count = 1;
17018 new_mode = mode;
17019 }
17020 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17021 {
17022 if (is_ha != NULL) *is_ha = true;
17023 *count = 2;
17024 new_mode = GET_MODE_INNER (mode);
17025 }
17026 else if (type && composite_p)
17027 {
56fe3ca3
RS
17028 unsigned int warn_psabi_flags = 0;
17029 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17030 &warn_psabi_flags);
43e9d192
IB
17031 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17032 {
e73a32d6
MM
17033 static unsigned last_reported_type_uid;
17034 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17035 int alt;
56fe3ca3
RS
17036 if (!silent_p
17037 && warn_psabi
17038 && warn_psabi_flags
e73a32d6
MM
17039 && uid != last_reported_type_uid
17040 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17041 != ag_count))
17042 {
e33a1eae
JJ
17043 const char *url
17044 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
e73a32d6
MM
17045 gcc_assert (alt == -1);
17046 last_reported_type_uid = uid;
56fe3ca3
RS
17047 /* Use TYPE_MAIN_VARIANT to strip any redundant const
17048 qualification. */
17049 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17050 inform (input_location, "parameter passing for argument of "
17051 "type %qT with %<[[no_unique_address]]%> members "
691eeb65
JJ
17052 "changed %{in GCC 10.1%}",
17053 TYPE_MAIN_VARIANT (type), url);
56fe3ca3
RS
17054 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17055 inform (input_location, "parameter passing for argument of "
17056 "type %qT when C++17 is enabled changed to match "
691eeb65
JJ
17057 "C++14 %{in GCC 10.1%}",
17058 TYPE_MAIN_VARIANT (type), url);
e73a32d6
MM
17059 }
17060
43e9d192
IB
17061 if (is_ha != NULL) *is_ha = true;
17062 *count = ag_count;
17063 }
17064 else
17065 return false;
17066 }
17067 else
17068 return false;
17069
38e62001 17070 gcc_assert (!aarch64_sve_mode_p (new_mode));
43e9d192
IB
17071 *base_mode = new_mode;
17072 return true;
17073}
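/* Illustrative results (assuming TARGET_FLOAT):

     double                     -> true,  *count == 1, *base_mode == DFmode
     _Complex float             -> true,  *count == 2, *base_mode == SFmode,
                                          *is_ha set
     struct { double a, b; }    -> true,  *count == 2, *base_mode == DFmode,
                                          *is_ha set
     struct { double a[5]; }    -> false (more than HA_MAX_NUM_FLDS elements)
     struct { float f; int i; } -> false (not homogeneous)  */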
17074
17075/* Implement TARGET_STRUCT_VALUE_RTX. */
17076
17077static rtx
17078aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17079 int incoming ATTRIBUTE_UNUSED)
17080{
17081 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17082}
17083
17084/* Implements target hook vector_mode_supported_p. */
17085static bool
ef4bddc2 17086aarch64_vector_mode_supported_p (machine_mode mode)
43e9d192 17087{
43cacb12 17088 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
cc68f7c2 17089 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
43e9d192
IB
17090}
17091
4aeb1ba7
RS
17092/* Return the full-width SVE vector mode for element mode MODE, if one
17093 exists. */
17094opt_machine_mode
17095aarch64_full_sve_mode (scalar_mode mode)
17096{
17097 switch (mode)
17098 {
17099 case E_DFmode:
17100 return VNx2DFmode;
17101 case E_SFmode:
17102 return VNx4SFmode;
17103 case E_HFmode:
17104 return VNx8HFmode;
02fcd8ac
RS
17105 case E_BFmode:
17106 return VNx8BFmode;
4aeb1ba7 17107 case E_DImode:
02fcd8ac 17108 return VNx2DImode;
4aeb1ba7
RS
17109 case E_SImode:
17110 return VNx4SImode;
17111 case E_HImode:
17112 return VNx8HImode;
17113 case E_QImode:
17114 return VNx16QImode;
17115 default:
17116 return opt_machine_mode ();
17117 }
17118}
17119
17120/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17121 if it exists. */
17122opt_machine_mode
17123aarch64_vq_mode (scalar_mode mode)
17124{
17125 switch (mode)
17126 {
17127 case E_DFmode:
17128 return V2DFmode;
17129 case E_SFmode:
17130 return V4SFmode;
17131 case E_HFmode:
17132 return V8HFmode;
abbe1ed2
SMW
17133 case E_BFmode:
17134 return V8BFmode;
4aeb1ba7
RS
17135 case E_SImode:
17136 return V4SImode;
17137 case E_HImode:
17138 return V8HImode;
17139 case E_QImode:
17140 return V16QImode;
17141 case E_DImode:
17142 return V2DImode;
17143 default:
17144 return opt_machine_mode ();
17145 }
17146}
17147
b7342d25
IB
17148/* Return appropriate SIMD container
17149 for MODE within a vector of WIDTH bits. */
ef4bddc2 17150static machine_mode
43cacb12 17151aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
43e9d192 17152{
9b070057
RS
17153 if (TARGET_SVE
17154 && maybe_ne (width, 128)
17155 && known_eq (width, BITS_PER_SVE_VECTOR))
4aeb1ba7 17156 return aarch64_full_sve_mode (mode).else_mode (word_mode);
43cacb12
RS
17157
17158 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
43e9d192 17159 if (TARGET_SIMD)
b7342d25 17160 {
43cacb12 17161 if (known_eq (width, 128))
4aeb1ba7 17162 return aarch64_vq_mode (mode).else_mode (word_mode);
b7342d25
IB
17163 else
17164 switch (mode)
17165 {
4e10a5a7 17166 case E_SFmode:
b7342d25 17167 return V2SFmode;
4e10a5a7 17168 case E_HFmode:
b719f884 17169 return V4HFmode;
abbe1ed2
SMW
17170 case E_BFmode:
17171 return V4BFmode;
4e10a5a7 17172 case E_SImode:
b7342d25 17173 return V2SImode;
4e10a5a7 17174 case E_HImode:
b7342d25 17175 return V4HImode;
4e10a5a7 17176 case E_QImode:
b7342d25
IB
17177 return V8QImode;
17178 default:
17179 break;
17180 }
17181 }
43e9d192
IB
17182 return word_mode;
17183}
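/* For example (illustrative):

     aarch64_simd_container_mode (SFmode, 64)   -> V2SFmode
     aarch64_simd_container_mode (SFmode, 128)  -> V4SFmode
     aarch64_simd_container_mode (HImode, 64)   -> V4HImode

   and, when SVE is enabled and the requested width is BITS_PER_SVE_VECTOR
   (and that width is not 128),
   aarch64_simd_container_mode (SFmode, BITS_PER_SVE_VECTOR) -> VNx4SFmode.  */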
17184
b7342d25 17185/* Return 128-bit container as the preferred SIMD mode for MODE. */
ef4bddc2 17186static machine_mode
005ba29c 17187aarch64_preferred_simd_mode (scalar_mode mode)
b7342d25 17188{
43cacb12
RS
17189 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
17190 return aarch64_simd_container_mode (mode, bits);
b7342d25
IB
17191}
17192
86e36728 17193/* Return a list of possible vector sizes for the vectorizer
3b357264 17194 to iterate over. */
bcc7e346 17195static unsigned int
e021fb86 17196aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
3b357264 17197{
cc68f7c2
RS
17198 static const machine_mode sve_modes[] = {
17199 /* Try using full vectors for all element types. */
17200 VNx16QImode,
17201
17202 /* Try using 16-bit containers for 8-bit elements and full vectors
17203 for wider elements. */
17204 VNx8QImode,
17205
17206 /* Try using 32-bit containers for 8-bit and 16-bit elements and
17207 full vectors for wider elements. */
17208 VNx4QImode,
74166aab 17209
cc68f7c2
RS
17210 /* Try using 64-bit containers for all element types. */
17211 VNx2QImode
17212 };
17213
17214 static const machine_mode advsimd_modes[] = {
17215 /* Try using 128-bit vectors for all element types. */
17216 V16QImode,
17217
17218 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17219 for wider elements. */
17220 V8QImode,
17221
17222 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17223 for wider elements.
17224
17225 TODO: We could support a limited form of V4QImode too, so that
17226 we use 32-bit vectors for 8-bit elements. */
17227 V4HImode,
17228
17229 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17230 for 64-bit elements.
74166aab 17231
cc68f7c2
RS
17232 TODO: We could similarly support limited forms of V2QImode and V2HImode
17233 for this case. */
17234 V2SImode
17235 };
74166aab 17236
cc68f7c2
RS
17237 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17238 This is because:
74166aab 17239
cc68f7c2
RS
17240 - If we can't use N-byte Advanced SIMD vectors then the placement
17241 doesn't matter; we'll just continue as though the Advanced SIMD
17242 entry didn't exist.
74166aab 17243
cc68f7c2
RS
17244 - If an SVE main loop with N bytes ends up being cheaper than an
17245 Advanced SIMD main loop with N bytes then by default we'll replace
17246 the Advanced SIMD version with the SVE one.
74166aab 17247
cc68f7c2
RS
17248 - If an Advanced SIMD main loop with N bytes ends up being cheaper
17249 than an SVE main loop with N bytes then by default we'll try to
17250 use the SVE loop to vectorize the epilogue instead. */
17251 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
17252 unsigned int advsimd_i = 0;
17253 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
17254 {
17255 if (sve_i < ARRAY_SIZE (sve_modes)
17256 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
17257 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
17258 modes->safe_push (sve_modes[sve_i++]);
17259 else
17260 modes->safe_push (advsimd_modes[advsimd_i++]);
17261 }
17262 while (sve_i < ARRAY_SIZE (sve_modes))
17263 modes->safe_push (sve_modes[sve_i++]);
bcc7e346 17264
eb23241b
RS
17265 unsigned int flags = 0;
17266 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17267 can compare SVE against Advanced SIMD and so that we can compare
17268 multiple SVE vectorization approaches against each other. There's
17269 not really any point doing this for Advanced SIMD only, since the
17270 first mode that works should always be the best. */
17271 if (TARGET_SVE && aarch64_sve_compare_costs)
17272 flags |= VECT_COMPARE_COSTS;
17273 return flags;
3b357264
JG
17274}
17275
ac2b960f
YZ
17276/* Implement TARGET_MANGLE_TYPE. */
17277
6f549691 17278static const char *
ac2b960f
YZ
17279aarch64_mangle_type (const_tree type)
17280{
17281 /* The AArch64 ABI documents say that "__va_list" has to be
17f8ace2 17282 mangled as if it is in the "std" namespace. */
ac2b960f
YZ
17283 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17284 return "St9__va_list";
17285
abbe1ed2 17286 /* Half-precision floating point types. */
c2ec330c 17287 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
abbe1ed2
SMW
17288 {
17289 if (TYPE_MODE (type) == BFmode)
17290 return "u6__bf16";
17291 else
17292 return "Dh";
17293 }
c2ec330c 17294
f9d53c27
TB
17295 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
17296 builtin types. */
17297 if (TYPE_NAME (type) != NULL)
624d0f07
RS
17298 {
17299 const char *res;
17300 if ((res = aarch64_general_mangle_builtin_type (type))
17301 || (res = aarch64_sve::mangle_builtin_type (type)))
17302 return res;
17303 }
c6fc9e43 17304
ac2b960f
YZ
17305 /* Use the default mangling. */
17306 return NULL;
17307}
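/* For example (illustrative manglings):

     void f (__va_list);   mangled as _Z1fSt9__va_list
     void g (__fp16);      mangled as _Z1gDh
     void h (__bf16);      mangled as _Z1hu6__bf16  */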
17308
65ef05d0
RS
17309/* Implement TARGET_VERIFY_TYPE_CONTEXT. */
17310
17311static bool
17312aarch64_verify_type_context (location_t loc, type_context_kind context,
17313 const_tree type, bool silent_p)
17314{
17315 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17316}
17317
75cf1494
KT
17318/* Find the first rtx_insn before insn that will generate an assembly
17319 instruction. */
17320
17321static rtx_insn *
17322aarch64_prev_real_insn (rtx_insn *insn)
17323{
17324 if (!insn)
17325 return NULL;
17326
17327 do
17328 {
17329 insn = prev_real_insn (insn);
17330 }
17331 while (insn && recog_memoized (insn) < 0);
17332
17333 return insn;
17334}
17335
17336static bool
17337is_madd_op (enum attr_type t1)
17338{
17339 unsigned int i;
17340 /* A number of these may be AArch32 only. */
17341 enum attr_type mlatypes[] = {
17342 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17343 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17344 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17345 };
17346
17347 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17348 {
17349 if (t1 == mlatypes[i])
17350 return true;
17351 }
17352
17353 return false;
17354}
17355
17356/* Check if there is a register dependency between a load and the insn
17357 for which we hold recog_data. */
17358
17359static bool
17360dep_between_memop_and_curr (rtx memop)
17361{
17362 rtx load_reg;
17363 int opno;
17364
8baff86e 17365 gcc_assert (GET_CODE (memop) == SET);
75cf1494
KT
17366
17367 if (!REG_P (SET_DEST (memop)))
17368 return false;
17369
17370 load_reg = SET_DEST (memop);
8baff86e 17371 for (opno = 1; opno < recog_data.n_operands; opno++)
75cf1494
KT
17372 {
17373 rtx operand = recog_data.operand[opno];
17374 if (REG_P (operand)
17375 && reg_overlap_mentioned_p (load_reg, operand))
17376 return true;
17377
17378 }
17379 return false;
17380}
17381
8baff86e
KT
17382
17383/* When working around the Cortex-A53 erratum 835769,
17384 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17385 instruction and has a preceding memory instruction such that a NOP
17386 should be inserted between them. */
17387
75cf1494
KT
17388bool
17389aarch64_madd_needs_nop (rtx_insn* insn)
17390{
17391 enum attr_type attr_type;
17392 rtx_insn *prev;
17393 rtx body;
17394
b32c1043 17395 if (!TARGET_FIX_ERR_A53_835769)
75cf1494
KT
17396 return false;
17397
e322d6e3 17398 if (!INSN_P (insn) || recog_memoized (insn) < 0)
75cf1494
KT
17399 return false;
17400
17401 attr_type = get_attr_type (insn);
17402 if (!is_madd_op (attr_type))
17403 return false;
17404
17405 prev = aarch64_prev_real_insn (insn);
3fea1a75
KT
17406 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17407 Restore recog state to INSN to avoid state corruption. */
17408 extract_constrain_insn_cached (insn);
17409
550e2205 17410 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
75cf1494
KT
17411 return false;
17412
17413 body = single_set (prev);
17414
17415 /* If the previous insn is a memory op and there is no dependency between
8baff86e
KT
17416 it and the DImode madd, emit a NOP between them. If body is NULL then we
17417 have a complex memory operation, probably a load/store pair.
17418 Be conservative for now and emit a NOP. */
17419 if (GET_MODE (recog_data.operand[0]) == DImode
17420 && (!body || !dep_between_memop_and_curr (body)))
75cf1494
KT
17421 return true;
17422
17423 return false;
17424
17425}
17426
8baff86e
KT
17427
17428/* Implement FINAL_PRESCAN_INSN. */
17429
75cf1494
KT
17430void
17431aarch64_final_prescan_insn (rtx_insn *insn)
17432{
17433 if (aarch64_madd_needs_nop (insn))
17434 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17435}
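/* When -mfix-cortex-a53-835769 is in effect, the net result is an extra
   NOP between an affected memory operation and a following 64-bit
   multiply-accumulate, e.g. (illustrative assembly):

     ldr  x5, [x4]
     nop    // between mem op and mult-accumulate
     madd x0, x1, x2, x3  */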
17436
17437
43cacb12
RS
17438/* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17439 instruction. */
17440
17441bool
17442aarch64_sve_index_immediate_p (rtx base_or_step)
17443{
17444 return (CONST_INT_P (base_or_step)
17445 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17446}
17447
f3582fda
RS
17448/* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17449 when applied to mode MODE. Negate X first if NEGATE_P is true. */
43cacb12
RS
17450
17451bool
f3582fda 17452aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
43cacb12 17453{
f3582fda
RS
17454 rtx elt = unwrap_const_vec_duplicate (x);
17455 if (!CONST_INT_P (elt))
43cacb12
RS
17456 return false;
17457
17458 HOST_WIDE_INT val = INTVAL (elt);
17459 if (negate_p)
17460 val = -val;
f3582fda 17461 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
43cacb12
RS
17462
17463 if (val & 0xff)
17464 return IN_RANGE (val, 0, 0xff);
17465 return IN_RANGE (val, 0, 0xff00);
17466}
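/* For example (illustrative): replicated immediates 0, 255 and 4096
   (0x1000) are accepted, being either in [0, 255] or a multiple of 256 in
   [256, 65280]; 257 (0x101) is not.  With NEGATE_P the same test is
   applied to the negated value, e.g. an addition of -5 can instead be
   emitted as a SUB of 5.  */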
17467
624d0f07 17468/* Return true if X is a valid immediate for the SVE SQADD and SQSUB
f3582fda
RS
17469 instructions when applied to mode MODE. Negate X first if NEGATE_P
17470 is true. */
624d0f07
RS
17471
17472bool
f3582fda 17473aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
624d0f07 17474{
f3582fda 17475 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
624d0f07
RS
17476 return false;
17477
17478 /* After the optional negation, the immediate must be nonnegative.
17479 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17480 instead of SQADD Zn.B, Zn.B, #129. */
f3582fda 17481 rtx elt = unwrap_const_vec_duplicate (x);
624d0f07
RS
17482 return negate_p == (INTVAL (elt) < 0);
17483}
17484
43cacb12
RS
17485/* Return true if X is a valid immediate operand for an SVE logical
17486 instruction such as AND. */
17487
17488bool
17489aarch64_sve_bitmask_immediate_p (rtx x)
17490{
17491 rtx elt;
17492
17493 return (const_vec_duplicate_p (x, &elt)
17494 && CONST_INT_P (elt)
17495 && aarch64_bitmask_imm (INTVAL (elt),
17496 GET_MODE_INNER (GET_MODE (x))));
17497}
17498
17499/* Return true if X is a valid immediate for the SVE DUP and CPY
17500 instructions. */
17501
17502bool
17503aarch64_sve_dup_immediate_p (rtx x)
17504{
d29f7dd5
RS
17505 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17506 if (!CONST_INT_P (x))
43cacb12
RS
17507 return false;
17508
d29f7dd5 17509 HOST_WIDE_INT val = INTVAL (x);
43cacb12
RS
17510 if (val & 0xff)
17511 return IN_RANGE (val, -0x80, 0x7f);
17512 return IN_RANGE (val, -0x8000, 0x7f00);
17513}
17514
17515/* Return true if X is a valid immediate operand for an SVE CMP instruction.
17516 SIGNED_P says whether the operand is signed rather than unsigned. */
17517
17518bool
17519aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17520{
6bc67182
RS
17521 x = unwrap_const_vec_duplicate (x);
17522 return (CONST_INT_P (x)
43cacb12 17523 && (signed_p
6bc67182
RS
17524 ? IN_RANGE (INTVAL (x), -16, 15)
17525 : IN_RANGE (INTVAL (x), 0, 127)));
43cacb12
RS
17526}
17527
17528/* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17529 instruction. Negate X first if NEGATE_P is true. */
17530
17531bool
17532aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17533{
17534 rtx elt;
17535 REAL_VALUE_TYPE r;
17536
17537 if (!const_vec_duplicate_p (x, &elt)
17538 || GET_CODE (elt) != CONST_DOUBLE)
17539 return false;
17540
17541 r = *CONST_DOUBLE_REAL_VALUE (elt);
17542
17543 if (negate_p)
17544 r = real_value_negate (&r);
17545
17546 if (real_equal (&r, &dconst1))
17547 return true;
17548 if (real_equal (&r, &dconsthalf))
17549 return true;
17550 return false;
17551}
17552
17553/* Return true if X is a valid immediate operand for an SVE FMUL
17554 instruction. */
17555
17556bool
17557aarch64_sve_float_mul_immediate_p (rtx x)
17558{
17559 rtx elt;
17560
43cacb12
RS
17561 return (const_vec_duplicate_p (x, &elt)
17562 && GET_CODE (elt) == CONST_DOUBLE
a19ba9e1
RS
17563 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17564 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
43cacb12
RS
17565}
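/* In other words (illustrative): the only FADD/FSUB vector immediates are
   +0.5 and +1.0 (or their negations via NEGATE_P), and the only FMUL
   vector immediates are 0.5 and 2.0, e.g.

     fadd z0.d, p0/m, z0.d, #1.0
     fmul z0.s, p0/m, z0.s, #2.0  */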
17566
b187677b
RS
17567/* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17568 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17569 is nonnull, use it to describe valid immediates. */
3520f7cc 17570static bool
b187677b
RS
17571aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17572 simd_immediate_info *info,
17573 enum simd_immediate_check which,
17574 simd_immediate_info::insn_type insn)
17575{
17576 /* Try a 4-byte immediate with LSL. */
17577 for (unsigned int shift = 0; shift < 32; shift += 8)
17578 if ((val32 & (0xff << shift)) == val32)
17579 {
17580 if (info)
17581 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17582 simd_immediate_info::LSL, shift);
17583 return true;
17584 }
3520f7cc 17585
b187677b
RS
17586 /* Try a 2-byte immediate with LSL. */
17587 unsigned int imm16 = val32 & 0xffff;
17588 if (imm16 == (val32 >> 16))
17589 for (unsigned int shift = 0; shift < 16; shift += 8)
17590 if ((imm16 & (0xff << shift)) == imm16)
48063b9d 17591 {
b187677b
RS
17592 if (info)
17593 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17594 simd_immediate_info::LSL, shift);
17595 return true;
48063b9d 17596 }
3520f7cc 17597
b187677b
RS
17598 /* Try a 4-byte immediate with MSL, except for cases that MVN
17599 can handle. */
17600 if (which == AARCH64_CHECK_MOV)
17601 for (unsigned int shift = 8; shift < 24; shift += 8)
17602 {
17603 unsigned int low = (1 << shift) - 1;
17604 if (((val32 & (0xff << shift)) | low) == val32)
17605 {
17606 if (info)
17607 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17608 simd_immediate_info::MSL, shift);
17609 return true;
17610 }
17611 }
43e9d192 17612
b187677b
RS
17613 return false;
17614}
17615
17616/* Return true if replicating VAL64 is a valid immediate for the
17617 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17618 use it to describe valid immediates. */
17619static bool
17620aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17621 simd_immediate_info *info,
17622 enum simd_immediate_check which)
17623{
17624 unsigned int val32 = val64 & 0xffffffff;
17625 unsigned int val16 = val64 & 0xffff;
17626 unsigned int val8 = val64 & 0xff;
17627
17628 if (val32 == (val64 >> 32))
43e9d192 17629 {
b187677b
RS
17630 if ((which & AARCH64_CHECK_ORR) != 0
17631 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17632 simd_immediate_info::MOV))
17633 return true;
43e9d192 17634
b187677b
RS
17635 if ((which & AARCH64_CHECK_BIC) != 0
17636 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17637 simd_immediate_info::MVN))
17638 return true;
ee78df47 17639
b187677b
RS
17640 /* Try using a replicated byte. */
17641 if (which == AARCH64_CHECK_MOV
17642 && val16 == (val32 >> 16)
17643 && val8 == (val16 >> 8))
ee78df47 17644 {
b187677b
RS
17645 if (info)
17646 *info = simd_immediate_info (QImode, val8);
17647 return true;
ee78df47 17648 }
43e9d192
IB
17649 }
17650
b187677b
RS
17651 /* Try using a bit-to-bytemask. */
17652 if (which == AARCH64_CHECK_MOV)
43e9d192 17653 {
b187677b
RS
17654 unsigned int i;
17655 for (i = 0; i < 64; i += 8)
ab6501d7 17656 {
b187677b
RS
17657 unsigned char byte = (val64 >> i) & 0xff;
17658 if (byte != 0 && byte != 0xff)
17659 break;
ab6501d7 17660 }
b187677b 17661 if (i == 64)
ab6501d7 17662 {
b187677b
RS
17663 if (info)
17664 *info = simd_immediate_info (DImode, val64);
17665 return true;
ab6501d7 17666 }
43e9d192 17667 }
b187677b
RS
17668 return false;
17669}
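/* Some illustrative 64-bit replicated values and the (roughly) matching
   Advanced SIMD immediates:

     0x000000aa000000aa  -> MOVI Vd.4s, #0xaa
     0x0000430000004300  -> MOVI Vd.4s, #0x43, lsl #8
     0xabababababababab  -> MOVI Vd.16b, #0xab
     0xffffff75ffffff75  -> MVNI Vd.4s, #0x8a
     0xff0000ffff0000ff  -> MOVI Dd, #0xff0000ffff0000ff (byte mask)  */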
43e9d192 17670
43cacb12
RS
17671/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17672 instruction. If INFO is nonnull, use it to describe valid immediates. */
17673
17674static bool
17675aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17676 simd_immediate_info *info)
17677{
17678 scalar_int_mode mode = DImode;
17679 unsigned int val32 = val64 & 0xffffffff;
17680 if (val32 == (val64 >> 32))
17681 {
17682 mode = SImode;
17683 unsigned int val16 = val32 & 0xffff;
17684 if (val16 == (val32 >> 16))
17685 {
17686 mode = HImode;
17687 unsigned int val8 = val16 & 0xff;
17688 if (val8 == (val16 >> 8))
17689 mode = QImode;
17690 }
17691 }
17692 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17693 if (IN_RANGE (val, -0x80, 0x7f))
17694 {
17695 /* DUP with no shift. */
17696 if (info)
17697 *info = simd_immediate_info (mode, val);
17698 return true;
17699 }
17700 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17701 {
17702 /* DUP with LSL #8. */
17703 if (info)
17704 *info = simd_immediate_info (mode, val);
17705 return true;
17706 }
17707 if (aarch64_bitmask_imm (val64, mode))
17708 {
17709 /* DUPM. */
17710 if (info)
17711 *info = simd_immediate_info (mode, val);
17712 return true;
17713 }
17714 return false;
17715}
17716
624d0f07
RS
17717/* Return true if X is an UNSPEC_PTRUE constant of the form:
17718
17719 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17720
17721 where PATTERN is the svpattern as a CONST_INT and where ZERO
17722 is a zero constant of the required PTRUE mode (which can have
17723 fewer elements than X's mode, if zero bits are significant).
17724
17725 If so, and if INFO is nonnull, describe the immediate in INFO. */
17726bool
17727aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17728{
17729 if (GET_CODE (x) != CONST)
17730 return false;
17731
17732 x = XEXP (x, 0);
17733 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17734 return false;
17735
17736 if (info)
17737 {
17738 aarch64_svpattern pattern
17739 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17740 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17741 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17742 *info = simd_immediate_info (int_mode, pattern);
17743 }
17744 return true;
17745}
17746
0b1fe8cf
RS
17747/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17748 it to describe valid immediates. */
17749
17750static bool
17751aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17752{
624d0f07
RS
17753 if (aarch64_sve_ptrue_svpattern_p (x, info))
17754 return true;
17755
0b1fe8cf
RS
17756 if (x == CONST0_RTX (GET_MODE (x)))
17757 {
17758 if (info)
17759 *info = simd_immediate_info (DImode, 0);
17760 return true;
17761 }
17762
17763 /* Analyze the value as a VNx16BImode. This should be relatively
17764 efficient, since rtx_vector_builder has enough built-in capacity
17765 to store all VLA predicate constants without needing the heap. */
17766 rtx_vector_builder builder;
17767 if (!aarch64_get_sve_pred_bits (builder, x))
17768 return false;
17769
17770 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17771 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17772 {
17773 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17774 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17775 if (pattern != AARCH64_NUM_SVPATTERNS)
17776 {
17777 if (info)
17778 {
17779 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17780 *info = simd_immediate_info (int_mode, pattern);
17781 }
17782 return true;
17783 }
17784 }
17785 return false;
17786}
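/* As an illustration: a VNx16BI constant in which only the first four .B
   lanes are true has a partial-PTRUE length of 4, so it is described as a
   PTRUE with svpattern VL4, whereas an all-false predicate is described
   directly as the integer constant 0 by the early-out above.  */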
17787
b187677b
RS
17788/* Return true if OP is a valid SIMD immediate for the operation
17789 described by WHICH. If INFO is nonnull, use it to describe valid
17790 immediates. */
17791bool
17792aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17793 enum simd_immediate_check which)
17794{
43cacb12
RS
17795 machine_mode mode = GET_MODE (op);
17796 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17797 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17798 return false;
17799
0b1fe8cf
RS
17800 if (vec_flags & VEC_SVE_PRED)
17801 return aarch64_sve_pred_valid_immediate (op, info);
17802
43cacb12 17803 scalar_mode elt_mode = GET_MODE_INNER (mode);
f9093f23 17804 rtx base, step;
b187677b 17805 unsigned int n_elts;
f9093f23
RS
17806 if (GET_CODE (op) == CONST_VECTOR
17807 && CONST_VECTOR_DUPLICATE_P (op))
17808 n_elts = CONST_VECTOR_NPATTERNS (op);
43cacb12
RS
17809 else if ((vec_flags & VEC_SVE_DATA)
17810 && const_vec_series_p (op, &base, &step))
17811 {
17812 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17813 if (!aarch64_sve_index_immediate_p (base)
17814 || !aarch64_sve_index_immediate_p (step))
17815 return false;
17816
17817 if (info)
cc68f7c2
RS
17818 {
17819 /* Get the corresponding container mode. E.g. an INDEX on V2SI
17820 should yield two integer values per 128-bit block, meaning
17821 that we need to treat it in the same way as V2DI and then
17822 ignore the upper 32 bits of each element. */
17823 elt_mode = aarch64_sve_container_int_mode (mode);
17824 *info = simd_immediate_info (elt_mode, base, step);
17825 }
43cacb12
RS
17826 return true;
17827 }
6a70badb
RS
17828 else if (GET_CODE (op) == CONST_VECTOR
17829 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17830 /* N_ELTS set above. */;
b187677b 17831 else
d8edd899 17832 return false;
43e9d192 17833
b187677b 17834 scalar_float_mode elt_float_mode;
f9093f23
RS
17835 if (n_elts == 1
17836 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
43e9d192 17837 {
f9093f23
RS
17838 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17839 if (aarch64_float_const_zero_rtx_p (elt)
17840 || aarch64_float_const_representable_p (elt))
17841 {
17842 if (info)
17843 *info = simd_immediate_info (elt_float_mode, elt);
17844 return true;
17845 }
b187677b 17846 }
43e9d192 17847
b23c6a2c
RS
17848 /* If all elements in an SVE vector have the same value, we have a free
17849 choice between using the element mode and using the container mode.
17850 Using the element mode means that unused parts of the vector are
17851 duplicates of the used elements, while using the container mode means
17852 that the unused parts are an extension of the used elements. Using the
17853 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17854 for its container mode VNx4SI while 0x00000101 isn't.
17855
17856 If not all elements in an SVE vector have the same value, we need the
17857 transition from one element to the next to occur at container boundaries.
17858 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17859 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
17860 scalar_int_mode elt_int_mode;
17861 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17862 elt_int_mode = aarch64_sve_container_int_mode (mode);
17863 else
17864 elt_int_mode = int_mode_for_mode (elt_mode).require ();
17865
17866 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
b187677b
RS
17867 if (elt_size > 8)
17868 return false;
e4f0f84d 17869
b187677b
RS
17870 /* Expand the vector constant out into a byte vector, with the least
17871 significant byte of the register first. */
17872 auto_vec<unsigned char, 16> bytes;
17873 bytes.reserve (n_elts * elt_size);
17874 for (unsigned int i = 0; i < n_elts; i++)
17875 {
f9093f23
RS
17876 /* The vector is provided in gcc endian-neutral fashion.
17877 For aarch64_be Advanced SIMD, it must be laid out in the vector
17878 register in reverse order. */
17879 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17880 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
43e9d192 17881
b187677b
RS
17882 if (elt_mode != elt_int_mode)
17883 elt = gen_lowpart (elt_int_mode, elt);
43e9d192 17884
b187677b
RS
17885 if (!CONST_INT_P (elt))
17886 return false;
43e9d192 17887
b187677b
RS
17888 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17889 for (unsigned int byte = 0; byte < elt_size; byte++)
48063b9d 17890 {
b187677b
RS
17891 bytes.quick_push (elt_val & 0xff);
17892 elt_val >>= BITS_PER_UNIT;
48063b9d 17893 }
43e9d192
IB
17894 }
17895
b187677b
RS
17896 /* The immediate must repeat every eight bytes. */
17897 unsigned int nbytes = bytes.length ();
17898 for (unsigned i = 8; i < nbytes; ++i)
17899 if (bytes[i] != bytes[i - 8])
17900 return false;
17901
17902 /* Get the repeating 8-byte value as an integer. No endian correction
17903 is needed here because bytes is already in lsb-first order. */
17904 unsigned HOST_WIDE_INT val64 = 0;
17905 for (unsigned int i = 0; i < 8; i++)
17906 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17907 << (i * BITS_PER_UNIT));
17908
43cacb12
RS
17909 if (vec_flags & VEC_SVE_DATA)
17910 return aarch64_sve_valid_immediate (val64, info);
17911 else
17912 return aarch64_advsimd_valid_immediate (val64, info, which);
17913}
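/* For example, a V4HImode duplicate of 0x1234 is encoded with a single
   pattern, so the loop above pushes just the two bytes 0x34, 0x12 (least
   significant first).  Replicating them yields val64 0x1234123412341234,
   which is then tested by aarch64_advsimd_valid_immediate or, for SVE
   data modes, by aarch64_sve_valid_immediate.  */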
17914
17915/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17916 has a step in the range of INDEX. Return the index expression if so,
17917 otherwise return null. */
17918rtx
17919aarch64_check_zero_based_sve_index_immediate (rtx x)
17920{
17921 rtx base, step;
17922 if (const_vec_series_p (x, &base, &step)
17923 && base == const0_rtx
17924 && aarch64_sve_index_immediate_p (step))
17925 return step;
17926 return NULL_RTX;
43e9d192
IB
17927}
17928
43e9d192
IB
17929/* Check whether immediate shift constants are within range. */
17930bool
ef4bddc2 17931aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
43e9d192 17932{
6bc67182
RS
17933 x = unwrap_const_vec_duplicate (x);
17934 if (!CONST_INT_P (x))
17935 return false;
43e9d192
IB
17936 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17937 if (left)
6bc67182 17938 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
43e9d192 17939 else
6bc67182 17940 return IN_RANGE (INTVAL (x), 1, bit_width);
43e9d192
IB
17941}
17942
7325d85a
KT
17943/* Return the bitmask CONST_INT to select the bits required by a zero extract
17944 operation of width WIDTH at bit position POS. */
17945
17946rtx
17947aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17948{
17949 gcc_assert (CONST_INT_P (width));
17950 gcc_assert (CONST_INT_P (pos));
17951
17952 unsigned HOST_WIDE_INT mask
17953 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17954 return GEN_INT (mask << UINTVAL (pos));
17955}
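/* For example, WIDTH = 8 and POS = 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0x00ff0000.  */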
17956
83f8c414 17957bool
a6e0bfa7 17958aarch64_mov_operand_p (rtx x, machine_mode mode)
83f8c414 17959{
83f8c414
CSS
17960 if (GET_CODE (x) == HIGH
17961 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17962 return true;
17963
82614948 17964 if (CONST_INT_P (x))
83f8c414
CSS
17965 return true;
17966
43cacb12 17967 if (VECTOR_MODE_P (GET_MODE (x)))
678faefc
RS
17968 {
17969 /* Require predicate constants to be VNx16BI before RA, so that we
17970 force everything to have a canonical form. */
17971 if (!lra_in_progress
17972 && !reload_completed
17973 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17974 && GET_MODE (x) != VNx16BImode)
17975 return false;
17976
17977 return aarch64_simd_valid_immediate (x, NULL);
17978 }
43cacb12 17979
83f8c414
CSS
17980 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17981 return true;
17982
c0e0174b 17983 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
43cacb12
RS
17984 return true;
17985
a6e0bfa7 17986 return aarch64_classify_symbolic_expression (x)
a5350ddc 17987 == SYMBOL_TINY_ABSOLUTE;
83f8c414
CSS
17988}
17989
43e9d192
IB
17990/* Return a const_int vector of VAL. */
17991rtx
ab014eb3 17992aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
43e9d192 17993{
59d06c05
RS
17994 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
17995 return gen_const_vec_duplicate (mode, c);
43e9d192
IB
17996}
17997
051d0e2f
SN
17998/* Check OP is a legal scalar immediate for the MOVI instruction. */
17999
18000bool
77e994c9 18001aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
051d0e2f 18002{
ef4bddc2 18003 machine_mode vmode;
051d0e2f 18004
43cacb12 18005 vmode = aarch64_simd_container_mode (mode, 64);
051d0e2f 18006 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
b187677b 18007 return aarch64_simd_valid_immediate (op_v, NULL);
051d0e2f
SN
18008}
18009
988fa693
JG
18010/* Construct and return a PARALLEL RTX vector with elements numbering the
18011 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18012 the vector - from the perspective of the architecture. This does not
18013 line up with GCC's perspective on lane numbers, so we end up with
18014 different masks depending on our target endian-ness. The diagram
18015 below may help. We must draw the distinction when building masks
18016 which select one half of the vector. An instruction selecting
18017 architectural low-lanes for a big-endian target must be described using
18018 a mask selecting GCC high-lanes.
18019
18020 Big-Endian Little-Endian
18021
18022GCC 0 1 2 3 3 2 1 0
18023 | x | x | x | x | | x | x | x | x |
18024Architecture 3 2 1 0 3 2 1 0
18025
18026Low Mask: { 2, 3 } { 0, 1 }
18027High Mask: { 0, 1 } { 2, 3 }
f5cbabc1
RS
18028
18029 MODE Is the mode of the vector and NUNITS is the number of units in it. */
988fa693 18030
43e9d192 18031rtx
f5cbabc1 18032aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
43e9d192 18033{
43e9d192 18034 rtvec v = rtvec_alloc (nunits / 2);
988fa693
JG
18035 int high_base = nunits / 2;
18036 int low_base = 0;
18037 int base;
43e9d192
IB
18038 rtx t1;
18039 int i;
18040
988fa693
JG
18041 if (BYTES_BIG_ENDIAN)
18042 base = high ? low_base : high_base;
18043 else
18044 base = high ? high_base : low_base;
18045
18046 for (i = 0; i < nunits / 2; i++)
43e9d192
IB
18047 RTVEC_ELT (v, i) = GEN_INT (base + i);
18048
18049 t1 = gen_rtx_PARALLEL (mode, v);
18050 return t1;
18051}
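/* For example, with MODE == V4SImode and NUNITS == 4, asking for the
   architectural high half (HIGH == true) yields (parallel [2 3]) on a
   little-endian target but (parallel [0 1]) on a big-endian target,
   matching the diagram above.  */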
18052
988fa693
JG
18053/* Check OP for validity as a PARALLEL RTX vector with elements
18054 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
18055 from the perspective of the architecture. See the diagram above
18056 aarch64_simd_vect_par_cnst_half for more details. */
18057
18058bool
ef4bddc2 18059aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
988fa693
JG
18060 bool high)
18061{
6a70badb
RS
18062 int nelts;
18063 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
f5cbabc1
RS
18064 return false;
18065
6a70badb 18066 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
988fa693
JG
18067 HOST_WIDE_INT count_op = XVECLEN (op, 0);
18068 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18069 int i = 0;
18070
988fa693
JG
18071 if (count_op != count_ideal)
18072 return false;
18073
18074 for (i = 0; i < count_ideal; i++)
18075 {
18076 rtx elt_op = XVECEXP (op, 0, i);
18077 rtx elt_ideal = XVECEXP (ideal, 0, i);
18078
4aa81c2e 18079 if (!CONST_INT_P (elt_op)
988fa693
JG
18080 || INTVAL (elt_ideal) != INTVAL (elt_op))
18081 return false;
18082 }
18083 return true;
18084}
18085
4aeb1ba7
RS
18086/* Return a PARALLEL containing NELTS elements, with element I equal
18087 to BASE + I * STEP. */
18088
18089rtx
18090aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18091{
18092 rtvec vec = rtvec_alloc (nelts);
18093 for (unsigned int i = 0; i < nelts; ++i)
18094 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18095 return gen_rtx_PARALLEL (VOIDmode, vec);
18096}
18097
18098/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18099 series with step STEP. */
18100
18101bool
18102aarch64_stepped_int_parallel_p (rtx op, int step)
18103{
18104 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18105 return false;
18106
18107 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18108 for (int i = 1; i < XVECLEN (op, 0); ++i)
18109 if (!CONST_INT_P (XVECEXP (op, 0, i))
18110 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18111 return false;
18112
18113 return true;
18114}
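/* For example, aarch64_gen_stepped_int_parallel (3, 1, 2) returns
   (parallel [1 3 5]), which aarch64_stepped_int_parallel_p accepts
   for STEP == 2 and rejects for any other step.  */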
18115
43e9d192
IB
18116/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
18117 HIGH (exclusive). */
18118void
46ed6024
CB
18119aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18120 const_tree exp)
43e9d192
IB
18121{
18122 HOST_WIDE_INT lane;
4aa81c2e 18123 gcc_assert (CONST_INT_P (operand));
43e9d192
IB
18124 lane = INTVAL (operand);
18125
18126 if (lane < low || lane >= high)
46ed6024
CB
18127 {
18128 if (exp)
cf0c27ef 18129 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
46ed6024 18130 else
cf0c27ef 18131 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
46ed6024 18132 }
43e9d192
IB
18133}
18134
7ac29c0f
RS
18135/* Perform endian correction on lane number N, which indexes a vector
18136 of mode MODE, and return the result as an SImode rtx. */
18137
18138rtx
18139aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18140{
18141 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18142}
18143
43e9d192 18144/* Return TRUE if OP is a valid vector addressing mode. */
43cacb12 18145
43e9d192
IB
18146bool
18147aarch64_simd_mem_operand_p (rtx op)
18148{
18149 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
4aa81c2e 18150 || REG_P (XEXP (op, 0)));
43e9d192
IB
18151}
18152
43cacb12
RS
18153/* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
18154
18155bool
18156aarch64_sve_ld1r_operand_p (rtx op)
18157{
18158 struct aarch64_address_info addr;
18159 scalar_mode mode;
18160
18161 return (MEM_P (op)
18162 && is_a <scalar_mode> (GET_MODE (op), &mode)
18163 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18164 && addr.type == ADDRESS_REG_IMM
18165 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18166}
18167
9ceec73f
MM
18168/* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18169 where the size of the read data is specified by `mode` and the size of the
18170 vector elements is specified by `elem_mode`. */
4aeb1ba7 18171bool
9ceec73f
MM
18172aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18173 scalar_mode elem_mode)
4aeb1ba7
RS
18174{
18175 struct aarch64_address_info addr;
4aeb1ba7
RS
18176 if (!MEM_P (op)
18177 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18178 return false;
18179
18180 if (addr.type == ADDRESS_REG_IMM)
9ceec73f 18181 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
4aeb1ba7
RS
18182
18183 if (addr.type == ADDRESS_REG_REG)
18184 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18185
18186 return false;
18187}
18188
9ceec73f
MM
18189/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
18190bool
18191aarch64_sve_ld1rq_operand_p (rtx op)
18192{
18193 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18194 GET_MODE_INNER (GET_MODE (op)));
18195}
18196
18197/* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18198 accessing a vector where the element size is specified by `elem_mode`. */
18199bool
18200aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18201{
18202 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18203}
18204
624d0f07
RS
18205/* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
18206bool
18207aarch64_sve_ldff1_operand_p (rtx op)
18208{
18209 if (!MEM_P (op))
18210 return false;
18211
18212 struct aarch64_address_info addr;
18213 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18214 return false;
18215
18216 if (addr.type == ADDRESS_REG_IMM)
18217 return known_eq (addr.const_offset, 0);
18218
18219 return addr.type == ADDRESS_REG_REG;
18220}
18221
18222/* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
18223bool
18224aarch64_sve_ldnf1_operand_p (rtx op)
18225{
18226 struct aarch64_address_info addr;
18227
18228 return (MEM_P (op)
18229 && aarch64_classify_address (&addr, XEXP (op, 0),
18230 GET_MODE (op), false)
18231 && addr.type == ADDRESS_REG_IMM);
18232}
18233
43cacb12
RS
18234/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18235 The conditions for STR are the same. */
18236bool
18237aarch64_sve_ldr_operand_p (rtx op)
18238{
18239 struct aarch64_address_info addr;
18240
18241 return (MEM_P (op)
18242 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18243 false, ADDR_QUERY_ANY)
18244 && addr.type == ADDRESS_REG_IMM);
18245}
18246
624d0f07
RS
18247/* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18248 addressing memory of mode MODE. */
18249bool
18250aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18251{
18252 struct aarch64_address_info addr;
18253 if (!aarch64_classify_address (&addr, op, mode, false))
18254 return false;
18255
18256 if (addr.type == ADDRESS_REG_IMM)
18257 return known_eq (addr.const_offset, 0);
18258
18259 return addr.type == ADDRESS_REG_REG;
18260}
18261
9f4cbab8
RS
18262/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18263 We need to be able to access the individual pieces, so the range
18264 is different from LD[234] and ST[234]. */
18265bool
18266aarch64_sve_struct_memory_operand_p (rtx op)
18267{
18268 if (!MEM_P (op))
18269 return false;
18270
18271 machine_mode mode = GET_MODE (op);
18272 struct aarch64_address_info addr;
18273 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18274 ADDR_QUERY_ANY)
18275 || addr.type != ADDRESS_REG_IMM)
18276 return false;
18277
18278 poly_int64 first = addr.const_offset;
18279 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18280 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18281 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18282}
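/* For example, for a three-vector tuple both the start offset and the
   offset of the last vector (two vectors further on) must lie in the
   4-bit signed range of [-8, 7] vector lengths, so the usable start
   offsets are -8 .. +5 vector lengths.  */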
18283
2d8c6dc1
AH
18284/* Emit a register copy from operand to operand, taking care not to
18285 early-clobber source registers in the process.
43e9d192 18286
2d8c6dc1
AH
18287 COUNT is the number of components into which the copy needs to be
18288 decomposed. */
43e9d192 18289void
b8506a8a 18290aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
2d8c6dc1 18291 unsigned int count)
43e9d192
IB
18292{
18293 unsigned int i;
2d8c6dc1
AH
18294 int rdest = REGNO (operands[0]);
18295 int rsrc = REGNO (operands[1]);
43e9d192
IB
18296
18297 if (!reg_overlap_mentioned_p (operands[0], operands[1])
2d8c6dc1
AH
18298 || rdest < rsrc)
18299 for (i = 0; i < count; i++)
18300 emit_move_insn (gen_rtx_REG (mode, rdest + i),
18301 gen_rtx_REG (mode, rsrc + i));
43e9d192 18302 else
2d8c6dc1
AH
18303 for (i = 0; i < count; i++)
18304 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18305 gen_rtx_REG (mode, rsrc + count - i - 1));
43e9d192
IB
18306}
18307
668046d1 18308/* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
6ec0e5b9 18309 one of the VSTRUCT modes: OI, CI, or XI. */
668046d1 18310int
b8506a8a 18311aarch64_simd_attr_length_rglist (machine_mode mode)
668046d1 18312{
6a70badb
RS
18313 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
18314 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
668046d1
DS
18315}
18316
db0253a4 18317/* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
43cacb12
RS
18318 alignment of a vector to 128 bits. SVE predicates have an alignment of
18319 16 bits. */
db0253a4
TB
18320static HOST_WIDE_INT
18321aarch64_simd_vector_alignment (const_tree type)
18322{
07108a9e
RS
18323 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18324 be set for non-predicate vectors of booleans. Modes are the most
18325 direct way we have of identifying real SVE predicate types. */
18326 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18327 return 16;
cc68f7c2
RS
18328 widest_int min_size
18329 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18330 return wi::umin (min_size, 128).to_uhwi ();
db0253a4
TB
18331}
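/* For example, a 256-bit generic vector type is capped at the AAPCS64
   maximum alignment of 128 bits, while SVE predicate types (modes such
   as VNx16BI) get an alignment of 16 bits.  */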
18332
43cacb12 18333/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
ca31798e 18334static poly_uint64
43cacb12
RS
18335aarch64_vectorize_preferred_vector_alignment (const_tree type)
18336{
18337 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18338 {
18339 /* If the length of the vector is fixed, try to align to that length,
18340 otherwise don't try to align at all. */
18341 HOST_WIDE_INT result;
18342 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18343 result = TYPE_ALIGN (TREE_TYPE (type));
18344 return result;
18345 }
18346 return TYPE_ALIGN (type);
18347}
18348
db0253a4
TB
18349/* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
18350static bool
18351aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18352{
18353 if (is_packed)
18354 return false;
18355
43cacb12
RS
18356 /* For fixed-length vectors, check that the vectorizer will aim for
18357 full-vector alignment. This isn't true for generic GCC vectors
18358 that are wider than the ABI maximum of 128 bits. */
ca31798e
AV
18359 poly_uint64 preferred_alignment =
18360 aarch64_vectorize_preferred_vector_alignment (type);
43cacb12 18361 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
ca31798e
AV
18362 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18363 preferred_alignment))
db0253a4
TB
18364 return false;
18365
18366 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
18367 return true;
18368}
18369
7df76747
N
18370/* Return true if the vector misalignment factor is supported by the
18371 target. */
18372static bool
18373aarch64_builtin_support_vector_misalignment (machine_mode mode,
18374 const_tree type, int misalignment,
18375 bool is_packed)
18376{
18377 if (TARGET_SIMD && STRICT_ALIGNMENT)
18378 {
18379 /* Return false if the movmisalign pattern is not supported for this mode. */
18380 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18381 return false;
18382
a509c571 18383 /* Misalignment factor is unknown at compile time. */
7df76747 18384 if (misalignment == -1)
a509c571 18385 return false;
7df76747
N
18386 }
18387 return default_builtin_support_vector_misalignment (mode, type, misalignment,
18388 is_packed);
18389}
18390
4369c11e
TB
18391/* If VALS is a vector constant that can be loaded into a register
18392 using DUP, generate instructions to do so and return an RTX to
18393 assign to the register. Otherwise return NULL_RTX. */
18394static rtx
18395aarch64_simd_dup_constant (rtx vals)
18396{
ef4bddc2
RS
18397 machine_mode mode = GET_MODE (vals);
18398 machine_mode inner_mode = GET_MODE_INNER (mode);
4369c11e 18399 rtx x;
4369c11e 18400
92695fbb 18401 if (!const_vec_duplicate_p (vals, &x))
4369c11e
TB
18402 return NULL_RTX;
18403
18404 /* We can load this constant by using DUP and a constant in a
18405 single ARM register. This will be cheaper than a vector
18406 load. */
92695fbb 18407 x = copy_to_mode_reg (inner_mode, x);
59d06c05 18408 return gen_vec_duplicate (mode, x);
4369c11e
TB
18409}
18410
18411
18412/* Generate code to load VALS, which is a PARALLEL containing only
18413 constants (for vec_init) or CONST_VECTOR, efficiently into a
18414 register. Returns an RTX to copy into the register, or NULL_RTX
67914693 18415 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
1df3f464 18416static rtx
4369c11e
TB
18417aarch64_simd_make_constant (rtx vals)
18418{
ef4bddc2 18419 machine_mode mode = GET_MODE (vals);
4369c11e
TB
18420 rtx const_dup;
18421 rtx const_vec = NULL_RTX;
4369c11e
TB
18422 int n_const = 0;
18423 int i;
18424
18425 if (GET_CODE (vals) == CONST_VECTOR)
18426 const_vec = vals;
18427 else if (GET_CODE (vals) == PARALLEL)
18428 {
18429 /* A CONST_VECTOR must contain only CONST_INTs and
18430 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18431 Only store valid constants in a CONST_VECTOR. */
6a70badb 18432 int n_elts = XVECLEN (vals, 0);
4369c11e
TB
18433 for (i = 0; i < n_elts; ++i)
18434 {
18435 rtx x = XVECEXP (vals, 0, i);
18436 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18437 n_const++;
18438 }
18439 if (n_const == n_elts)
18440 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18441 }
18442 else
18443 gcc_unreachable ();
18444
18445 if (const_vec != NULL_RTX
b187677b 18446 && aarch64_simd_valid_immediate (const_vec, NULL))
4369c11e
TB
18447 /* Load using MOVI/MVNI. */
18448 return const_vec;
18449 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18450 /* Loaded using DUP. */
18451 return const_dup;
18452 else if (const_vec != NULL_RTX)
67914693 18453 /* Load from constant pool. We cannot take advantage of single-cycle
4369c11e
TB
18454 LD1 because we need a PC-relative addressing mode. */
18455 return const_vec;
18456 else
18457 /* A PARALLEL containing something not valid inside CONST_VECTOR.
67914693 18458 We cannot construct an initializer. */
4369c11e
TB
18459 return NULL_RTX;
18460}
18461
35a093b6
JG
18462/* Expand a vector initialisation sequence, such that TARGET is
18463 initialised to contain VALS. */
18464
4369c11e
TB
18465void
18466aarch64_expand_vector_init (rtx target, rtx vals)
18467{
ef4bddc2 18468 machine_mode mode = GET_MODE (target);
146c2e3a 18469 scalar_mode inner_mode = GET_MODE_INNER (mode);
35a093b6 18470 /* The number of vector elements. */
6a70badb 18471 int n_elts = XVECLEN (vals, 0);
35a093b6 18472 /* The number of vector elements which are not constant. */
8b66a2d4
AL
18473 int n_var = 0;
18474 rtx any_const = NULL_RTX;
35a093b6
JG
18475 /* The first element of vals. */
18476 rtx v0 = XVECEXP (vals, 0, 0);
4369c11e 18477 bool all_same = true;
4369c11e 18478
41dab855
KT
18479 /* This is a special vec_init<M><N> where N is not an element mode but a
18480 vector mode with half the elements of M. We expect to find two entries
18481 of mode N in VALS and we must put their concatenation into TARGET. */
18482 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18483 {
18484 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18485 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18486 rtx lo = XVECEXP (vals, 0, 0);
18487 rtx hi = XVECEXP (vals, 0, 1);
18488 machine_mode narrow_mode = GET_MODE (lo);
18489 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18490 gcc_assert (narrow_mode == GET_MODE (hi));
18491
18492 /* When we want to concatenate a half-width vector with zeroes we can
18493 use the aarch64_combinez[_be] patterns. Just make sure that the
18494 zeroes are in the right half. */
18495 if (BYTES_BIG_ENDIAN
18496 && aarch64_simd_imm_zero (lo, narrow_mode)
18497 && general_operand (hi, narrow_mode))
18498 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18499 else if (!BYTES_BIG_ENDIAN
18500 && aarch64_simd_imm_zero (hi, narrow_mode)
18501 && general_operand (lo, narrow_mode))
18502 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18503 else
18504 {
18505 /* Else create the two half-width registers and combine them. */
18506 if (!REG_P (lo))
18507 lo = force_reg (GET_MODE (lo), lo);
18508 if (!REG_P (hi))
18509 hi = force_reg (GET_MODE (hi), hi);
18510
18511 if (BYTES_BIG_ENDIAN)
18512 std::swap (lo, hi);
18513 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18514 }
18515 return;
18516 }
18517
35a093b6 18518 /* Count the number of variable elements to initialise. */
8b66a2d4 18519 for (int i = 0; i < n_elts; ++i)
4369c11e 18520 {
8b66a2d4 18521 rtx x = XVECEXP (vals, 0, i);
35a093b6 18522 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
8b66a2d4
AL
18523 ++n_var;
18524 else
18525 any_const = x;
4369c11e 18526
35a093b6 18527 all_same &= rtx_equal_p (x, v0);
4369c11e
TB
18528 }
18529
35a093b6
JG
18530 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18531 how best to handle this. */
4369c11e
TB
18532 if (n_var == 0)
18533 {
18534 rtx constant = aarch64_simd_make_constant (vals);
18535 if (constant != NULL_RTX)
18536 {
18537 emit_move_insn (target, constant);
18538 return;
18539 }
18540 }
18541
18542 /* Splat a single non-constant element if we can. */
18543 if (all_same)
18544 {
35a093b6 18545 rtx x = copy_to_mode_reg (inner_mode, v0);
59d06c05 18546 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
4369c11e
TB
18547 return;
18548 }
18549
85c1b6d7
AP
18550 enum insn_code icode = optab_handler (vec_set_optab, mode);
18551 gcc_assert (icode != CODE_FOR_nothing);
18552
18553 /* If there are only variable elements, try to optimize
18554 the insertion using dup for the most common element
18555 followed by insertions. */
18556
18557 /* The algorithm will fill matches[*][0] with the earliest matching element,
18558 and matches[X][1] with the count of duplicate elements (if X is the
18559 earliest element which has duplicates). */
18560
18561 if (n_var == n_elts && n_elts <= 16)
18562 {
18563 int matches[16][2] = {0};
18564 for (int i = 0; i < n_elts; i++)
18565 {
18566 for (int j = 0; j <= i; j++)
18567 {
18568 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18569 {
18570 matches[i][0] = j;
18571 matches[j][1]++;
18572 break;
18573 }
18574 }
18575 }
18576 int maxelement = 0;
18577 int maxv = 0;
18578 for (int i = 0; i < n_elts; i++)
18579 if (matches[i][1] > maxv)
18580 {
18581 maxelement = i;
18582 maxv = matches[i][1];
18583 }
18584
b4e2cd5b
JG
18585 /* Create a duplicate of the most common element, unless all elements
18586 are equally useless to us, in which case just immediately set the
18587 vector register using the first element. */
18588
18589 if (maxv == 1)
18590 {
18591 /* For vectors of two 64-bit elements, we can do even better. */
18592 if (n_elts == 2
18593 && (inner_mode == E_DImode
18594 || inner_mode == E_DFmode))
18595
18596 {
18597 rtx x0 = XVECEXP (vals, 0, 0);
18598 rtx x1 = XVECEXP (vals, 0, 1);
18599 /* Combine can pick up this case, but handling it directly
18600 here leaves clearer RTL.
18601
18602 This is load_pair_lanes<mode>, and also gives us a clean-up
18603 for store_pair_lanes<mode>. */
18604 if (memory_operand (x0, inner_mode)
18605 && memory_operand (x1, inner_mode)
18606 && !STRICT_ALIGNMENT
18607 && rtx_equal_p (XEXP (x1, 0),
18608 plus_constant (Pmode,
18609 XEXP (x0, 0),
18610 GET_MODE_SIZE (inner_mode))))
18611 {
18612 rtx t;
18613 if (inner_mode == DFmode)
18614 t = gen_load_pair_lanesdf (target, x0, x1);
18615 else
18616 t = gen_load_pair_lanesdi (target, x0, x1);
18617 emit_insn (t);
18618 return;
18619 }
18620 }
18621 /* The subreg-move sequence below will move into lane zero of the
18622 vector register. For big-endian we want that position to hold
18623 the last element of VALS. */
18624 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18625 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18626 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18627 }
18628 else
18629 {
18630 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18631 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18632 }
85c1b6d7
AP
18633
18634 /* Insert the rest. */
18635 for (int i = 0; i < n_elts; i++)
18636 {
18637 rtx x = XVECEXP (vals, 0, i);
18638 if (matches[i][0] == maxelement)
18639 continue;
18640 x = copy_to_mode_reg (inner_mode, x);
18641 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18642 }
18643 return;
18644 }
18645
35a093b6
JG
18646 /* Initialise a vector which is part-variable. We want to first try
18647 to build those lanes which are constant in the most efficient way we
18648 can. */
18649 if (n_var != n_elts)
4369c11e
TB
18650 {
18651 rtx copy = copy_rtx (vals);
4369c11e 18652
8b66a2d4
AL
18653 /* Load constant part of vector. We really don't care what goes into the
18654 parts we will overwrite, but we're more likely to be able to load the
18655 constant efficiently if it has fewer, larger, repeating parts
18656 (see aarch64_simd_valid_immediate). */
18657 for (int i = 0; i < n_elts; i++)
18658 {
18659 rtx x = XVECEXP (vals, 0, i);
18660 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18661 continue;
18662 rtx subst = any_const;
18663 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18664 {
18665 /* Look in the copied vector, as more elements are const. */
18666 rtx test = XVECEXP (copy, 0, i ^ bit);
18667 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18668 {
18669 subst = test;
18670 break;
18671 }
18672 }
18673 XVECEXP (copy, 0, i) = subst;
18674 }
4369c11e 18675 aarch64_expand_vector_init (target, copy);
35a093b6 18676 }
4369c11e 18677
35a093b6 18678 /* Insert the variable lanes directly. */
8b66a2d4 18679 for (int i = 0; i < n_elts; i++)
35a093b6
JG
18680 {
18681 rtx x = XVECEXP (vals, 0, i);
18682 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18683 continue;
18684 x = copy_to_mode_reg (inner_mode, x);
18685 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18686 }
4369c11e
TB
18687}
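/* As an example of the all-variable path above: initialising a V4SImode
   vector from registers {a, b, a, a} finds that the value in element 0
   occurs three times, so the expansion becomes a DUP of a into the vector
   register followed by a single vec_set of b into lane 1; only lanes that
   differ from the most common element need individual insertions.  */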
18688
3a0afad0
PK
18689/* Emit RTL corresponding to:
18690 insr TARGET, ELEM. */
18691
18692static void
18693emit_insr (rtx target, rtx elem)
18694{
18695 machine_mode mode = GET_MODE (target);
18696 scalar_mode elem_mode = GET_MODE_INNER (mode);
18697 elem = force_reg (elem_mode, elem);
18698
18699 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18700 gcc_assert (icode != CODE_FOR_nothing);
18701 emit_insn (GEN_FCN (icode) (target, target, elem));
18702}
18703
18704/* Subroutine of aarch64_sve_expand_vector_init for handling
18705 trailing constants.
18706 This function works as follows:
18707 (a) Create a new vector consisting of trailing constants.
18708 (b) Initialize TARGET with the constant vector using emit_move_insn.
18709 (c) Insert remaining elements in TARGET using insr.
18710 NELTS is the total number of elements in the original vector, while
18711 NELTS_REQD is the number of elements that are actually
18712 significant.
18713
18714 ??? The heuristic used is to do the above only if the number of constants
18715 is at least half the total number of elements. May need fine-tuning. */
18716
18717static bool
18718aarch64_sve_expand_vector_init_handle_trailing_constants
18719 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18720{
18721 machine_mode mode = GET_MODE (target);
18722 scalar_mode elem_mode = GET_MODE_INNER (mode);
18723 int n_trailing_constants = 0;
18724
18725 for (int i = nelts_reqd - 1;
5da301cb 18726 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
3a0afad0
PK
18727 i--)
18728 n_trailing_constants++;
18729
18730 if (n_trailing_constants >= nelts_reqd / 2)
18731 {
5da301cb
RS
18732 /* Try to use the natural pattern of BUILDER to extend the trailing
18733 constant elements to a full vector. Replace any variables in the
18734 extra elements with zeros.
18735
18736 ??? It would be better if the builders supported "don't care"
18737 elements, with the builder filling in whichever elements
18738 give the most compact encoding. */
18739 rtx_vector_builder v (mode, nelts, 1);
3a0afad0 18740 for (int i = 0; i < nelts; i++)
5da301cb
RS
18741 {
18742 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18743 if (!valid_for_const_vector_p (elem_mode, x))
18744 x = const0_rtx;
18745 v.quick_push (x);
18746 }
3a0afad0
PK
18747 rtx const_vec = v.build ();
18748 emit_move_insn (target, const_vec);
18749
18750 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18751 emit_insr (target, builder.elt (i));
18752
18753 return true;
18754 }
18755
18756 return false;
18757}
18758
18759/* Subroutine of aarch64_sve_expand_vector_init.
18760 Works as follows:
18761 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18762 (b) Skip trailing elements from BUILDER, which are the same as
18763 element NELTS_REQD - 1.
18764 (c) Insert earlier elements in reverse order in TARGET using insr. */
18765
18766static void
18767aarch64_sve_expand_vector_init_insert_elems (rtx target,
18768 const rtx_vector_builder &builder,
18769 int nelts_reqd)
18770{
18771 machine_mode mode = GET_MODE (target);
18772 scalar_mode elem_mode = GET_MODE_INNER (mode);
18773
18774 struct expand_operand ops[2];
18775 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18776 gcc_assert (icode != CODE_FOR_nothing);
18777
18778 create_output_operand (&ops[0], target, mode);
18779 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18780 expand_insn (icode, 2, ops);
18781
18782 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18783 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18784 emit_insr (target, builder.elt (i));
18785}
18786
18787/* Subroutine of aarch64_sve_expand_vector_init to handle the case
18788 when all trailing elements of BUILDER are the same.
18789 This works as follows:
18790 (a) Use expand_insn interface to broadcast last vector element in TARGET.
18791 (b) Insert remaining elements in TARGET using insr.
18792
18793 ??? The heuristic used is to do the above only if the number of identical
18794 trailing elements is at least 3/4 of the total number of elements,
18795 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
18796
18797static bool
18798aarch64_sve_expand_vector_init_handle_trailing_same_elem
18799 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18800{
18801 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18802 if (ndups >= (3 * nelts_reqd) / 4)
18803 {
18804 aarch64_sve_expand_vector_init_insert_elems (target, builder,
18805 nelts_reqd - ndups + 1);
18806 return true;
18807 }
18808
18809 return false;
18810}
18811
18812/* Initialize register TARGET from BUILDER. NELTS is the constant number
18813 of elements in BUILDER.
18814
18815 The function tries to initialize TARGET from BUILDER if it fits one
18816 of the special cases outlined below.
18817
18818 Failing that, the function divides BUILDER into two sub-vectors:
18819 v_even = even elements of BUILDER;
18820 v_odd = odd elements of BUILDER;
18821
18822 and recursively calls itself with v_even and v_odd.
18823
18824 if (recursive call succeeded for v_even or v_odd)
18825 TARGET = zip (v_even, v_odd)
18826
18827 The function returns true if it managed to build TARGET from BUILDER
18828 with one of the special cases, false otherwise.
18829
18830 Example: {a, 1, b, 2, c, 3, d, 4}
18831
18832 The vector gets divided into:
18833 v_even = {a, b, c, d}
18834 v_odd = {1, 2, 3, 4}
18835
18836 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18837 initializes tmp2 from the constant vector v_odd using emit_move_insn.
18838
18839 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
18840 4 elements, so we construct tmp1 from v_even using insr:
18841 tmp1 = dup(d)
18842 insr tmp1, c
18843 insr tmp1, b
18844 insr tmp1, a
18845
18846 And finally:
18847 TARGET = zip (tmp1, tmp2)
18848 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
18849
18850static bool
18851aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18852 int nelts, int nelts_reqd)
18853{
18854 machine_mode mode = GET_MODE (target);
18855
18856 /* Case 1: Vector contains trailing constants. */
18857
18858 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18859 (target, builder, nelts, nelts_reqd))
18860 return true;
18861
18862 /* Case 2: Vector contains leading constants. */
18863
5da301cb 18864 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
3a0afad0
PK
18865 for (int i = 0; i < nelts_reqd; i++)
18866 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18867 rev_builder.finalize ();
18868
18869 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18870 (target, rev_builder, nelts, nelts_reqd))
18871 {
18872 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18873 return true;
18874 }
18875
18876 /* Case 3: Vector contains trailing same element. */
18877
18878 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18879 (target, builder, nelts_reqd))
18880 return true;
18881
18882 /* Case 4: Vector contains leading same element. */
18883
18884 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18885 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18886 {
18887 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18888 return true;
18889 }
18890
18891 /* Avoid recursing below 4 elements.
18892 ??? The threshold 4 may need fine-tuning. */
18893
18894 if (nelts_reqd <= 4)
18895 return false;
18896
5da301cb
RS
18897 rtx_vector_builder v_even (mode, nelts, 1);
18898 rtx_vector_builder v_odd (mode, nelts, 1);
3a0afad0
PK
18899
18900 for (int i = 0; i < nelts * 2; i += 2)
18901 {
18902 v_even.quick_push (builder.elt (i));
18903 v_odd.quick_push (builder.elt (i + 1));
18904 }
18905
18906 v_even.finalize ();
18907 v_odd.finalize ();
18908
18909 rtx tmp1 = gen_reg_rtx (mode);
18910 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18911 nelts, nelts_reqd / 2);
18912
18913 rtx tmp2 = gen_reg_rtx (mode);
18914 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18915 nelts, nelts_reqd / 2);
18916
18917 if (!did_even_p && !did_odd_p)
18918 return false;
18919
18920 /* Use INSR to initialize whichever of v_even and v_odd didn't match
18921 any of the special cases, then zip v_even and v_odd together. */
18922
18923 if (!did_even_p)
18924 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18925
18926 if (!did_odd_p)
18927 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18928
18929 rtvec v = gen_rtvec (2, tmp1, tmp2);
18930 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18931 return true;
18932}
18933
18934/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18935
18936void
18937aarch64_sve_expand_vector_init (rtx target, rtx vals)
18938{
18939 machine_mode mode = GET_MODE (target);
18940 int nelts = XVECLEN (vals, 0);
18941
5da301cb 18942 rtx_vector_builder v (mode, nelts, 1);
3a0afad0
PK
18943 for (int i = 0; i < nelts; i++)
18944 v.quick_push (XVECEXP (vals, 0, i));
18945 v.finalize ();
18946
18947 /* If neither sub-vector of v could be initialized specially,
18948 then use INSR to insert all elements from v into TARGET.
18949 ??? This might not be optimal for vectors with large
18950 initializers of 16 elements or more.
18951 For nelts < 4, it probably isn't useful to handle specially. */
18952
18953 if (nelts < 4
18954 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18955 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18956}
18957
b6c3aea1
RS
18958/* Check whether VALUE is a vector constant in which every element
18959 is either a power of 2 or a negated power of 2. If so, return
18960 a constant vector of log2s, and flip CODE between PLUS and MINUS
18961 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
18962
18963static rtx
18964aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18965{
18966 if (GET_CODE (value) != CONST_VECTOR)
18967 return NULL_RTX;
18968
18969 rtx_vector_builder builder;
18970 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18971 return NULL_RTX;
18972
18973 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18974 /* 1 if the result of the multiplication must be negated,
18975 0 if it mustn't, or -1 if we don't yet care. */
18976 int negate = -1;
18977 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18978 for (unsigned int i = 0; i < encoded_nelts; ++i)
18979 {
18980 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18981 if (!CONST_SCALAR_INT_P (elt))
18982 return NULL_RTX;
18983 rtx_mode_t val (elt, int_mode);
18984 wide_int pow2 = wi::neg (val);
18985 if (val != pow2)
18986 {
18987 /* It matters whether we negate or not. Make that choice,
18988 and make sure that it's consistent with previous elements. */
18989 if (negate == !wi::neg_p (val))
18990 return NULL_RTX;
18991 negate = wi::neg_p (val);
18992 if (!negate)
18993 pow2 = val;
18994 }
18995 /* POW2 is now the value that we want to be a power of 2. */
18996 int shift = wi::exact_log2 (pow2);
18997 if (shift < 0)
18998 return NULL_RTX;
18999 builder.quick_push (gen_int_mode (shift, int_mode));
19000 }
19001 if (negate == -1)
19002 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
19003 code = PLUS;
19004 else if (negate == 1)
19005 code = code == PLUS ? MINUS : PLUS;
19006 return builder.build ();
19007}
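/* For example, a multiplier vector of { 8, 8, 8, 8 } becomes the shift
   vector { 3, 3, 3, 3 } with CODE left unchanged, while { -4, -4 } becomes
   { 2, 2 } and flips CODE between PLUS and MINUS, so that x * -4 + y can
   later be emitted as y - (x << 2).  */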
19008
19009/* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19010 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
19011 operands array, in the same order as for fma_optab. Return true if
19012 the function emitted all the necessary instructions, false if the caller
19013 should generate the pattern normally with the new OPERANDS array. */
19014
19015bool
19016aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19017{
19018 machine_mode mode = GET_MODE (operands[0]);
19019 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19020 {
19021 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19022 NULL_RTX, true, OPTAB_DIRECT);
19023 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19024 operands[3], product, operands[0], true,
19025 OPTAB_DIRECT);
19026 return true;
19027 }
19028 operands[2] = force_reg (mode, operands[2]);
19029 return false;
19030}
19031
19032/* Likewise, but for a conditional pattern. */
19033
19034bool
19035aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19036{
19037 machine_mode mode = GET_MODE (operands[0]);
19038 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19039 {
19040 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19041 NULL_RTX, true, OPTAB_DIRECT);
19042 emit_insn (gen_cond (code, mode, operands[0], operands[1],
19043 operands[4], product, operands[5]));
19044 return true;
19045 }
19046 operands[3] = force_reg (mode, operands[3]);
19047 return false;
19048}
19049
43e9d192 19050static unsigned HOST_WIDE_INT
ef4bddc2 19051aarch64_shift_truncation_mask (machine_mode mode)
43e9d192 19052{
43cacb12
RS
19053 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19054 return 0;
19055 return GET_MODE_UNIT_BITSIZE (mode) - 1;
43e9d192
IB
19056}
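/* For example, when SHIFT_COUNT_TRUNCATED holds this returns 63 for DImode
   and 31 for SImode, but always 0 for vector data modes, for which no
   truncation is claimed.  */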
19057
43e9d192
IB
19058/* Select a format to encode pointers in exception handling data. */
19059int
19060aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19061{
19062 int type;
19063 switch (aarch64_cmodel)
19064 {
19065 case AARCH64_CMODEL_TINY:
19066 case AARCH64_CMODEL_TINY_PIC:
19067 case AARCH64_CMODEL_SMALL:
19068 case AARCH64_CMODEL_SMALL_PIC:
1b1e81f8 19069 case AARCH64_CMODEL_SMALL_SPIC:
43e9d192
IB
19070 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
19071 for everything. */
19072 type = DW_EH_PE_sdata4;
19073 break;
19074 default:
19075 /* No assumptions here. 8-byte relocs required. */
19076 type = DW_EH_PE_sdata8;
19077 break;
19078 }
19079 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19080}
19081
b07fc91c
SN
19082/* Output .variant_pcs for aarch64_vector_pcs function symbols. */
19083
19084static void
19085aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19086{
c600df9a 19087 if (TREE_CODE (decl) == FUNCTION_DECL)
b07fc91c 19088 {
c600df9a
RS
19089 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19090 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19091 {
19092 fprintf (stream, "\t.variant_pcs\t");
19093 assemble_name (stream, name);
19094 fprintf (stream, "\n");
19095 }
b07fc91c
SN
19096 }
19097}
19098
e1c1ecb0
KT
19099/* The last .arch and .tune assembly strings that we printed. */
19100static std::string aarch64_last_printed_arch_string;
19101static std::string aarch64_last_printed_tune_string;
19102
361fb3ee
KT
19103/* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
19104 by the function fndecl. */
19105
19106void
19107aarch64_declare_function_name (FILE *stream, const char* name,
19108 tree fndecl)
19109{
19110 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19111
19112 struct cl_target_option *targ_options;
19113 if (target_parts)
19114 targ_options = TREE_TARGET_OPTION (target_parts);
19115 else
19116 targ_options = TREE_TARGET_OPTION (target_option_current_node);
19117 gcc_assert (targ_options);
19118
19119 const struct processor *this_arch
19120 = aarch64_get_arch (targ_options->x_explicit_arch);
19121
28108a53 19122 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
054b4005 19123 std::string extension
04a99ebe
JG
19124 = aarch64_get_extension_string_for_isa_flags (isa_flags,
19125 this_arch->flags);
e1c1ecb0
KT
19126 /* Only update the assembler .arch string if it is distinct from the last
19127 such string we printed. */
19128 std::string to_print = this_arch->name + extension;
19129 if (to_print != aarch64_last_printed_arch_string)
19130 {
19131 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19132 aarch64_last_printed_arch_string = to_print;
19133 }
361fb3ee
KT
19134
19135 /* Print the cpu name we're tuning for in the comments; it might be
e1c1ecb0
KT
19136 useful to readers of the generated asm. Do it only when it changes
19137 from function to function and verbose assembly is requested. */
361fb3ee
KT
19138 const struct processor *this_tune
19139 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19140
e1c1ecb0
KT
19141 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19142 {
19143 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19144 this_tune->name);
19145 aarch64_last_printed_tune_string = this_tune->name;
19146 }
361fb3ee 19147
b07fc91c
SN
19148 aarch64_asm_output_variant_pcs (stream, fndecl, name);
19149
361fb3ee
KT
19150 /* Don't forget the type directive for ELF. */
19151 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19152 ASM_OUTPUT_LABEL (stream, name);
c292cfe5
SN
19153
19154 cfun->machine->label_is_assembled = true;
19155}
19156
19157/* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
19158 the function label and emit a BTI if necessary. */
19159
19160void
19161aarch64_print_patchable_function_entry (FILE *file,
19162 unsigned HOST_WIDE_INT patch_area_size,
19163 bool record_p)
19164{
19165 if (cfun->machine->label_is_assembled
19166 && aarch64_bti_enabled ()
19167 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19168 {
19169 /* Remove the BTI that follows the patch area and insert a new BTI
19170 before the patch area right after the function label. */
19171 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19172 if (insn
19173 && INSN_P (insn)
19174 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19175 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19176 delete_insn (insn);
19177 asm_fprintf (file, "\thint\t34 // bti c\n");
19178 }
19179
19180 default_print_patchable_function_entry (file, patch_area_size, record_p);
361fb3ee
KT
19181}
19182
b07fc91c
SN
19183/* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
19184
19185void
19186aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19187{
19188 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19189 const char *value = IDENTIFIER_POINTER (target);
19190 aarch64_asm_output_variant_pcs (stream, decl, name);
19191 ASM_OUTPUT_DEF (stream, name, value);
19192}
19193
19194/* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
19195 function symbol references. */
19196
19197void
e8c47069 19198aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
b07fc91c 19199{
e8c47069 19200 default_elf_asm_output_external (stream, decl, name);
b07fc91c
SN
19201 aarch64_asm_output_variant_pcs (stream, decl, name);
19202}
19203
8fc16d72
ST
19204/* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19205 Used to output the .cfi_b_key_frame directive when signing the current
19206 function with the B key. */
19207
19208void
19209aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19210{
2bdc7dcb 19211 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
8fc16d72
ST
19212 && aarch64_ra_sign_key == AARCH64_KEY_B)
19213 asm_fprintf (f, "\t.cfi_b_key_frame\n");
19214}
19215
e1c1ecb0
KT
19216/* Implements TARGET_ASM_FILE_START. Output the assembly header. */
19217
19218static void
19219aarch64_start_file (void)
19220{
19221 struct cl_target_option *default_options
19222 = TREE_TARGET_OPTION (target_option_default_node);
19223
19224 const struct processor *default_arch
19225 = aarch64_get_arch (default_options->x_explicit_arch);
28108a53 19226 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
e1c1ecb0 19227 std::string extension
04a99ebe
JG
19228 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19229 default_arch->flags);
e1c1ecb0
KT
19230
19231 aarch64_last_printed_arch_string = default_arch->name + extension;
19232 aarch64_last_printed_tune_string = "";
19233 asm_fprintf (asm_out_file, "\t.arch %s\n",
19234 aarch64_last_printed_arch_string.c_str ());
19235
19236 default_file_start ();
19237}
19238
0462169c
SN
19239/* Emit load exclusive. */
19240
19241static void
ef4bddc2 19242aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
0462169c
SN
19243 rtx mem, rtx model_rtx)
19244{
4a2095eb
RH
19245 if (mode == TImode)
19246 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19247 gen_highpart (DImode, rval),
19248 mem, model_rtx));
19249 else
19250 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
0462169c
SN
19251}
19252
19253/* Emit store exclusive. */
19254
19255static void
ef4bddc2 19256aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
4a2095eb 19257 rtx mem, rtx rval, rtx model_rtx)
0462169c 19258{
4a2095eb
RH
19259 if (mode == TImode)
19260 emit_insn (gen_aarch64_store_exclusive_pair
19261 (bval, mem, operand_subword (rval, 0, 0, TImode),
19262 operand_subword (rval, 1, 0, TImode), model_rtx));
19263 else
19264 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
0462169c
SN
19265}
19266
19267/* Mark the previous jump instruction as unlikely. */
19268
19269static void
19270aarch64_emit_unlikely_jump (rtx insn)
19271{
f370536c 19272 rtx_insn *jump = emit_jump_insn (insn);
5fa396ad 19273 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
0462169c
SN
19274}
19275
3950b229
RH
19276/* We store the names of the various atomic helpers in a 5x4 array.
19277 Return the libcall function given MODE, MODEL and NAMES. */
19278
19279rtx
19280aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19281 const atomic_ool_names *names)
19282{
19283 memmodel model = memmodel_base (INTVAL (model_rtx));
19284 int mode_idx, model_idx;
19285
19286 switch (mode)
19287 {
19288 case E_QImode:
19289 mode_idx = 0;
19290 break;
19291 case E_HImode:
19292 mode_idx = 1;
19293 break;
19294 case E_SImode:
19295 mode_idx = 2;
19296 break;
19297 case E_DImode:
19298 mode_idx = 3;
19299 break;
19300 case E_TImode:
19301 mode_idx = 4;
19302 break;
19303 default:
19304 gcc_unreachable ();
19305 }
19306
19307 switch (model)
19308 {
19309 case MEMMODEL_RELAXED:
19310 model_idx = 0;
19311 break;
19312 case MEMMODEL_CONSUME:
19313 case MEMMODEL_ACQUIRE:
19314 model_idx = 1;
19315 break;
19316 case MEMMODEL_RELEASE:
19317 model_idx = 2;
19318 break;
19319 case MEMMODEL_ACQ_REL:
19320 case MEMMODEL_SEQ_CST:
19321 model_idx = 3;
19322 break;
19323 default:
19324 gcc_unreachable ();
19325 }
19326
19327 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19328 VISIBILITY_HIDDEN);
19329}
19330
19331#define DEF0(B, N) \
19332 { "__aarch64_" #B #N "_relax", \
19333 "__aarch64_" #B #N "_acq", \
19334 "__aarch64_" #B #N "_rel", \
19335 "__aarch64_" #B #N "_acq_rel" }
19336
19337#define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19338 { NULL, NULL, NULL, NULL }
19339#define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19340
19341static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19342const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19343const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19344const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19345const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19346const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19347
19348#undef DEF0
19349#undef DEF4
19350#undef DEF5
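/* Illustrative sketch, not part of the GCC sources: DEF0 (cas, 4) above
   expands to one row of the name table, indexed by memory model.  Combined
   with the mode index chosen in aarch64_atomic_ool_func (QI=0, HI=1, SI=2,
   DI=3, TI=4), an SImode release-model CAS therefore resolves to
   "__aarch64_cas4_rel".  The array name below is hypothetical.  */
static const char *const example_cas4_row[4] =
  { "__aarch64_cas4_relax", "__aarch64_cas4_acq",
    "__aarch64_cas4_rel", "__aarch64_cas4_acq_rel" };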
19351
0462169c
SN
19352/* Expand a compare and swap pattern. */
19353
19354void
19355aarch64_expand_compare_and_swap (rtx operands[])
19356{
d400fda3
RH
19357 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19358 machine_mode mode, r_mode;
0462169c
SN
19359
19360 bval = operands[0];
19361 rval = operands[1];
19362 mem = operands[2];
19363 oldval = operands[3];
19364 newval = operands[4];
19365 is_weak = operands[5];
19366 mod_s = operands[6];
19367 mod_f = operands[7];
19368 mode = GET_MODE (mem);
0462169c
SN
19369
19370 /* Normally the succ memory model must be stronger than fail, but in the
19371 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19372 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
46b35980
AM
19373 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19374 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
0462169c
SN
19375 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19376
d400fda3
RH
19377 r_mode = mode;
19378 if (mode == QImode || mode == HImode)
0462169c 19379 {
d400fda3
RH
19380 r_mode = SImode;
19381 rval = gen_reg_rtx (r_mode);
0462169c
SN
19382 }
19383
b0770c0f 19384 if (TARGET_LSE)
77f33f44
RH
19385 {
19386 /* The CAS insn requires oldval and rval overlap, but we need to
19387 have a copy of oldval saved across the operation to tell if
19388 the operation is successful. */
d400fda3
RH
19389 if (reg_overlap_mentioned_p (rval, oldval))
19390 rval = copy_to_mode_reg (r_mode, oldval);
77f33f44 19391 else
d400fda3
RH
19392 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19393
77f33f44
RH
19394 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19395 newval, mod_s));
d400fda3 19396 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
77f33f44 19397 }
3950b229
RH
19398 else if (TARGET_OUTLINE_ATOMICS)
19399 {
19400 /* Oldval must satisfy compare afterward. */
19401 if (!aarch64_plus_operand (oldval, mode))
19402 oldval = force_reg (mode, oldval);
19403 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19404 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19405 oldval, mode, newval, mode,
19406 XEXP (mem, 0), Pmode);
19407 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19408 }
b0770c0f 19409 else
d400fda3
RH
19410 {
19411 /* The oldval predicate varies by mode. Test it and force to reg. */
19412 insn_code code = code_for_aarch64_compare_and_swap (mode);
19413 if (!insn_data[code].operand[2].predicate (oldval, mode))
19414 oldval = force_reg (mode, oldval);
0462169c 19415
d400fda3
RH
19416 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19417 is_weak, mod_s, mod_f));
19418 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19419 }
19420
19421 if (r_mode != mode)
77f33f44
RH
19422 rval = gen_lowpart (mode, rval);
19423 emit_move_insn (operands[1], rval);
0462169c 19424
d400fda3 19425 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
f7df4a84 19426 emit_insn (gen_rtx_SET (bval, x));
0462169c
SN
19427}
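/* Illustrative note, not part of the GCC sources: the expansion above picks
   one of three strategies.  With TARGET_LSE it emits a single CAS-family
   instruction; with TARGET_OUTLINE_ATOMICS it calls an out-of-line helper
   such as "__aarch64_cas4_acq"; otherwise it emits the generic pattern that
   aarch64_split_compare_and_swap below later splits into a load-exclusive /
   store-exclusive retry loop.  */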
19428
f70fb3b6
MW
19429/* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
19430 sequence implementing an atomic operation. */
19431
19432static void
19433aarch64_emit_post_barrier (enum memmodel model)
19434{
19435 const enum memmodel base_model = memmodel_base (model);
19436
19437 if (is_mm_sync (model)
19438 && (base_model == MEMMODEL_ACQUIRE
19439 || base_model == MEMMODEL_ACQ_REL
19440 || base_model == MEMMODEL_SEQ_CST))
19441 {
19442 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19443 }
19444}
19445
0462169c
SN
19446/* Split a compare and swap pattern. */
19447
19448void
19449aarch64_split_compare_and_swap (rtx operands[])
19450{
e5e07b68
WD
19451 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19452 gcc_assert (epilogue_completed);
19453
b7e560de 19454 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
ef4bddc2 19455 machine_mode mode;
0462169c 19456 bool is_weak;
5d8a22a5 19457 rtx_code_label *label1, *label2;
ab876106 19458 enum memmodel model;
0462169c
SN
19459
19460 rval = operands[0];
19461 mem = operands[1];
19462 oldval = operands[2];
19463 newval = operands[3];
19464 is_weak = (operands[4] != const0_rtx);
ab876106 19465 model_rtx = operands[5];
0462169c
SN
19466 scratch = operands[7];
19467 mode = GET_MODE (mem);
ab876106 19468 model = memmodel_from_int (INTVAL (model_rtx));
0462169c 19469
17f47f86
KT
19470 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19471 loop:
19472 .label1:
19473 LD[A]XR rval, [mem]
19474 CBNZ rval, .label2
19475 ST[L]XR scratch, newval, [mem]
19476 CBNZ scratch, .label1
19477 .label2:
19478 CMP rval, 0. */
b7e560de
RH
19479 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
19480 && oldval == const0_rtx && mode != TImode);
17f47f86 19481
5d8a22a5 19482 label1 = NULL;
0462169c
SN
19483 if (!is_weak)
19484 {
19485 label1 = gen_label_rtx ();
19486 emit_label (label1);
19487 }
19488 label2 = gen_label_rtx ();
19489
ab876106
MW
19490 /* The initial load can be relaxed for a __sync operation since a final
19491 barrier will be emitted to stop code hoisting. */
19492 if (is_mm_sync (model))
b7e560de 19493 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
ab876106
MW
19494 else
19495 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
0462169c 19496
17f47f86 19497 if (strong_zero_p)
b7e560de 19498 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17f47f86
KT
19499 else
19500 {
b7e560de
RH
19501 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19502 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17f47f86 19503 }
b7e560de
RH
19504 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19505 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19506 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c 19507
ab876106 19508 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
0462169c
SN
19509
19510 if (!is_weak)
19511 {
6e1eaca9
RE
19512 if (aarch64_track_speculation)
19513 {
19514 /* Emit an explicit compare instruction, so that we can correctly
19515 track the condition codes. */
19516 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19517 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19518 }
19519 else
19520 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19521
0462169c
SN
19522 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19523 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
f7df4a84 19524 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
0462169c
SN
19525 }
19526 else
b7e560de 19527 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
0462169c
SN
19528
19529 emit_label (label2);
b7e560de 19530
17f47f86
KT
19531 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
19532 to set the condition flags. If this is not used it will be removed by
19533 later passes. */
19534 if (strong_zero_p)
b7e560de
RH
19535 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19536
ab876106
MW
19537 /* Emit any final barrier needed for a __sync operation. */
19538 if (is_mm_sync (model))
19539 aarch64_emit_post_barrier (model);
0462169c 19540}
9cd7b720 19541
0462169c
SN
19542/* Split an atomic operation. */
19543
19544void
19545aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9cd7b720 19546 rtx value, rtx model_rtx, rtx cond)
0462169c 19547{
e5e07b68
WD
19548 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19549 gcc_assert (epilogue_completed);
19550
ef4bddc2
RS
19551 machine_mode mode = GET_MODE (mem);
19552 machine_mode wmode = (mode == DImode ? DImode : SImode);
f70fb3b6
MW
19553 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19554 const bool is_sync = is_mm_sync (model);
5d8a22a5
DM
19555 rtx_code_label *label;
19556 rtx x;
0462169c 19557
9cd7b720 19558 /* Split the atomic operation into a sequence. */
0462169c
SN
19559 label = gen_label_rtx ();
19560 emit_label (label);
19561
19562 if (new_out)
19563 new_out = gen_lowpart (wmode, new_out);
19564 if (old_out)
19565 old_out = gen_lowpart (wmode, old_out);
19566 else
19567 old_out = new_out;
19568 value = simplify_gen_subreg (wmode, value, mode, 0);
19569
f70fb3b6
MW
19570 /* The initial load can be relaxed for a __sync operation since a final
19571 barrier will be emitted to stop code hoisting. */
19572 if (is_sync)
19573 aarch64_emit_load_exclusive (mode, old_out, mem,
19574 GEN_INT (MEMMODEL_RELAXED));
19575 else
19576 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
0462169c
SN
19577
19578 switch (code)
19579 {
19580 case SET:
19581 new_out = value;
19582 break;
19583
19584 case NOT:
19585 x = gen_rtx_AND (wmode, old_out, value);
f7df4a84 19586 emit_insn (gen_rtx_SET (new_out, x));
0462169c 19587 x = gen_rtx_NOT (wmode, new_out);
f7df4a84 19588 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
19589 break;
19590
19591 case MINUS:
19592 if (CONST_INT_P (value))
19593 {
19594 value = GEN_INT (-INTVAL (value));
19595 code = PLUS;
19596 }
19597 /* Fall through. */
19598
19599 default:
19600 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
f7df4a84 19601 emit_insn (gen_rtx_SET (new_out, x));
0462169c
SN
19602 break;
19603 }
19604
19605 aarch64_emit_store_exclusive (mode, cond, mem,
19606 gen_lowpart (mode, new_out), model_rtx);
19607
6e1eaca9
RE
19608 if (aarch64_track_speculation)
19609 {
19610 /* Emit an explicit compare instruction, so that we can correctly
19611 track the condition codes. */
19612 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19613 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19614 }
19615 else
19616 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19617
0462169c
SN
19618 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19619 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
f7df4a84 19620 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
f70fb3b6
MW
19621
19622 /* Emit any final barrier needed for a __sync operation. */
19623 if (is_sync)
19624 aarch64_emit_post_barrier (model);
0462169c
SN
19625}
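/* Illustrative sketch, not part of the GCC sources: for a relaxed SImode
   fetch-and-add, the split above produces a load-exclusive/store-exclusive
   retry loop along the lines of (register numbers are arbitrary):

     .loop:
	ldxr	w0, [x2]	// old_out
	add	w1, w0, w3	// new_out = old_out <op> value
	stxr	w4, w1, [x2]	// cond: 0 on success
	cbnz	w4, .loop

   with LD[A]XR/ST[L]XR variants chosen from MODEL_RTX and a final barrier
   appended for __sync operations.  */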
19626
c2ec330c
AL
19627static void
19628aarch64_init_libfuncs (void)
19629{
19630 /* Half-precision float operations. The compiler handles all operations
19631 with NULL libfuncs by converting to SFmode. */
19632
19633 /* Conversions. */
19634 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19635 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19636
19637 /* Arithmetic. */
19638 set_optab_libfunc (add_optab, HFmode, NULL);
19639 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19640 set_optab_libfunc (smul_optab, HFmode, NULL);
19641 set_optab_libfunc (neg_optab, HFmode, NULL);
19642 set_optab_libfunc (sub_optab, HFmode, NULL);
19643
19644 /* Comparisons. */
19645 set_optab_libfunc (eq_optab, HFmode, NULL);
19646 set_optab_libfunc (ne_optab, HFmode, NULL);
19647 set_optab_libfunc (lt_optab, HFmode, NULL);
19648 set_optab_libfunc (le_optab, HFmode, NULL);
19649 set_optab_libfunc (ge_optab, HFmode, NULL);
19650 set_optab_libfunc (gt_optab, HFmode, NULL);
19651 set_optab_libfunc (unord_optab, HFmode, NULL);
19652}
19653
43e9d192 19654/* Target hook for c_mode_for_suffix. */
ef4bddc2 19655static machine_mode
43e9d192
IB
19656aarch64_c_mode_for_suffix (char suffix)
19657{
19658 if (suffix == 'q')
19659 return TFmode;
19660
19661 return VOIDmode;
19662}
19663
3520f7cc
JG
19664/* We can only represent floating point constants which will fit in
19665 "quarter-precision" values. These values are characterised by
19666 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
19667 by:
19668
19669 (-1)^s * (n/16) * 2^r
19670
19671 Where:
19672 's' is the sign bit.
19673 'n' is an integer in the range 16 <= n <= 31.
19674 'r' is an integer in the range -3 <= r <= 4. */
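/* Illustrative sketch, not part of the GCC sources: a direct reading of the
   formula above.  For example 1.0 (n=16, r=0), 0.5 (n=16, r=-1) and
   2.5 (n=20, r=1) are representable; the smallest positive value is
   16/16 * 2^-3 = 0.125 and the largest is 31/16 * 2^4 = 31.0.  The function
   name below is hypothetical; it simply enumerates the 128 representable
   magnitudes.  */
static bool
example_quarter_precision_p (double x)
{
  if (x < 0)
    x = -x;
  if (x == 0)
    return false;
  for (int r = -3; r <= 4; r++)
    {
      /* SCALE = 2^r, computed without libm.  */
      double scale = 1.0;
      for (int i = 0; i < r; i++)
	scale *= 2.0;
      for (int i = 0; i > r; i--)
	scale *= 0.5;
      for (int n = 16; n <= 31; n++)
	if (x == (n / 16.0) * scale)
	  return true;
    }
  return false;
}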
19675
19676/* Return true iff X can be represented as a quarter-precision
19677 floating point immediate operand. Note, we cannot represent 0.0. */
19678bool
19679aarch64_float_const_representable_p (rtx x)
19680{
19681 /* This represents our current view of how many bits
19682 make up the mantissa. */
19683 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
ba96cdfb 19684 int exponent;
3520f7cc 19685 unsigned HOST_WIDE_INT mantissa, mask;
3520f7cc 19686 REAL_VALUE_TYPE r, m;
807e902e 19687 bool fail;
3520f7cc 19688
d29f7dd5 19689 x = unwrap_const_vec_duplicate (x);
3520f7cc
JG
19690 if (!CONST_DOUBLE_P (x))
19691 return false;
19692
a4518821
RS
19693 if (GET_MODE (x) == VOIDmode
19694 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
94bfa2da
TV
19695 return false;
19696
34a72c33 19697 r = *CONST_DOUBLE_REAL_VALUE (x);
3520f7cc
JG
19698
19699 /* We cannot represent infinities, NaNs or +/-zero. We won't
19700 know if we have +zero until we analyse the mantissa, but we
19701 can reject the other invalid values. */
19702 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19703 || REAL_VALUE_MINUS_ZERO (r))
19704 return false;
19705
ba96cdfb 19706 /* Extract exponent. */
3520f7cc
JG
19707 r = real_value_abs (&r);
19708 exponent = REAL_EXP (&r);
19709
19710 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19711 highest (sign) bit, with a fixed binary point at bit point_pos.
19712 m1 holds the low part of the mantissa, m2 the high part.
19713 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19714 bits for the mantissa, this can fail (low bits will be lost). */
19715 real_ldexp (&m, &r, point_pos - exponent);
807e902e 19716 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
3520f7cc
JG
19717
19718 /* If the low part of the mantissa has bits set we cannot represent
19719 the value. */
d9074b29 19720 if (w.ulow () != 0)
3520f7cc
JG
19721 return false;
19722 /* We have rejected the lower HOST_WIDE_INT, so update our
19723 understanding of how many bits lie in the mantissa and
19724 look only at the high HOST_WIDE_INT. */
807e902e 19725 mantissa = w.elt (1);
3520f7cc
JG
19726 point_pos -= HOST_BITS_PER_WIDE_INT;
19727
19728 /* We can only represent values with a mantissa of the form 1.xxxx. */
19729 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19730 if ((mantissa & mask) != 0)
19731 return false;
19732
19733 /* Having filtered unrepresentable values, we may now remove all
19734 but the highest 5 bits. */
19735 mantissa >>= point_pos - 5;
19736
19737 /* We cannot represent the value 0.0, so reject it. This is handled
19738 elsewhere. */
19739 if (mantissa == 0)
19740 return false;
19741
19742 /* Then, as bit 4 is always set, we can mask it off, leaving
19743 the mantissa in the range [0, 15]. */
19744 mantissa &= ~(1 << 4);
19745 gcc_assert (mantissa <= 15);
19746
19747 /* GCC internally does not use IEEE754-like encoding (where normalized
19748 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
19749 Our mantissa values are shifted 4 places to the left relative to
19750 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19751 by 5 places to correct for GCC's representation. */
19752 exponent = 5 - exponent;
19753
19754 return (exponent >= 0 && exponent <= 7);
19755}
19756
ab6501d7
SD
19757/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19758 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
19759 output MOVI/MVNI, ORR or BIC immediate. */
3520f7cc 19760char*
b187677b 19761aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
ab6501d7 19762 enum simd_immediate_check which)
3520f7cc 19763{
3ea63f60 19764 bool is_valid;
3520f7cc 19765 static char templ[40];
3520f7cc 19766 const char *mnemonic;
e4f0f84d 19767 const char *shift_op;
3520f7cc 19768 unsigned int lane_count = 0;
81c2dfb9 19769 char element_char;
3520f7cc 19770
b187677b 19771 struct simd_immediate_info info;
48063b9d
IB
19772
19773 /* This will return true to show const_vector is legal for use as either
ab6501d7
SD
19774 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19775 It will also update INFO to show how the immediate should be generated.
19776 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
b187677b 19777 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
3520f7cc
JG
19778 gcc_assert (is_valid);
19779
b187677b
RS
19780 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19781 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
48063b9d 19782
b187677b 19783 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
3520f7cc 19784 {
1da83cce
RS
19785 gcc_assert (info.insn == simd_immediate_info::MOV
19786 && info.u.mov.shift == 0);
0d8e1702
KT
19787 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19788 move immediate path. */
1da83cce
RS
19789 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19790 info.u.mov.value = GEN_INT (0);
48063b9d
IB
19791 else
19792 {
83faf7d0 19793 const unsigned int buf_size = 20;
48063b9d 19794 char float_buf[buf_size] = {'\0'};
34a72c33 19795 real_to_decimal_for_mode (float_buf,
1da83cce 19796 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
b187677b 19797 buf_size, buf_size, 1, info.elt_mode);
48063b9d
IB
19798
19799 if (lane_count == 1)
19800 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19801 else
19802 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
81c2dfb9 19803 lane_count, element_char, float_buf);
48063b9d
IB
19804 return templ;
19805 }
3520f7cc 19806 }
3520f7cc 19807
1da83cce 19808 gcc_assert (CONST_INT_P (info.u.mov.value));
ab6501d7
SD
19809
19810 if (which == AARCH64_CHECK_MOV)
19811 {
b187677b 19812 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
1da83cce
RS
19813 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19814 ? "msl" : "lsl");
ab6501d7
SD
19815 if (lane_count == 1)
19816 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
1da83cce
RS
19817 mnemonic, UINTVAL (info.u.mov.value));
19818 else if (info.u.mov.shift)
ab6501d7
SD
19819 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19820 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
1da83cce
RS
19821 element_char, UINTVAL (info.u.mov.value), shift_op,
19822 info.u.mov.shift);
ab6501d7
SD
19823 else
19824 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19825 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
1da83cce 19826 element_char, UINTVAL (info.u.mov.value));
ab6501d7 19827 }
3520f7cc 19828 else
ab6501d7
SD
19829 {
19830 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
b187677b 19831 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
1da83cce 19832 if (info.u.mov.shift)
ab6501d7
SD
19833 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19834 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
1da83cce
RS
19835 element_char, UINTVAL (info.u.mov.value), "lsl",
19836 info.u.mov.shift);
ab6501d7
SD
19837 else
19838 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19839 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
1da83cce 19840 element_char, UINTVAL (info.u.mov.value));
ab6501d7 19841 }
3520f7cc
JG
19842 return templ;
19843}
19844
b7342d25 19845char*
77e994c9 19846aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
b7342d25 19847{
a2170965
TC
19848
19849 /* If a floating point number was passed and we desire to use it in an
19850 integer mode do the conversion to integer. */
19851 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19852 {
19853 unsigned HOST_WIDE_INT ival;
19854 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19855 gcc_unreachable ();
19856 immediate = gen_int_mode (ival, mode);
19857 }
19858
ef4bddc2 19859 machine_mode vmode;
a2170965
TC
19860 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
19861 a 128-bit vector mode. */
19862 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
b7342d25 19863
a2170965 19864 vmode = aarch64_simd_container_mode (mode, width);
b7342d25 19865 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
b187677b 19866 return aarch64_output_simd_mov_immediate (v_op, width);
b7342d25
IB
19867}
19868
43cacb12
RS
19869/* Return the output string to use for moving immediate CONST_VECTOR
19870 into an SVE register. */
19871
19872char *
19873aarch64_output_sve_mov_immediate (rtx const_vector)
19874{
19875 static char templ[40];
19876 struct simd_immediate_info info;
19877 char element_char;
19878
19879 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19880 gcc_assert (is_valid);
19881
19882 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19883
1044fa32
RS
19884 machine_mode vec_mode = GET_MODE (const_vector);
19885 if (aarch64_sve_pred_mode_p (vec_mode))
19886 {
19887 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
0b1fe8cf
RS
19888 if (info.insn == simd_immediate_info::MOV)
19889 {
19890 gcc_assert (info.u.mov.value == const0_rtx);
19891 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19892 }
1044fa32 19893 else
0b1fe8cf
RS
19894 {
19895 gcc_assert (info.insn == simd_immediate_info::PTRUE);
19896 unsigned int total_bytes;
19897 if (info.u.pattern == AARCH64_SV_ALL
19898 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19899 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19900 total_bytes / GET_MODE_SIZE (info.elt_mode));
19901 else
19902 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19903 svpattern_token (info.u.pattern));
19904 }
1044fa32
RS
19905 return buf;
19906 }
19907
1da83cce 19908 if (info.insn == simd_immediate_info::INDEX)
43cacb12
RS
19909 {
19910 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19911 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
1da83cce
RS
19912 element_char, INTVAL (info.u.index.base),
19913 INTVAL (info.u.index.step));
43cacb12
RS
19914 return templ;
19915 }
19916
19917 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19918 {
1da83cce
RS
19919 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19920 info.u.mov.value = GEN_INT (0);
43cacb12
RS
19921 else
19922 {
19923 const int buf_size = 20;
19924 char float_buf[buf_size] = {};
19925 real_to_decimal_for_mode (float_buf,
1da83cce 19926 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
43cacb12
RS
19927 buf_size, buf_size, 1, info.elt_mode);
19928
19929 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19930 element_char, float_buf);
19931 return templ;
19932 }
19933 }
19934
19935 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
1da83cce 19936 element_char, INTVAL (info.u.mov.value));
43cacb12
RS
19937 return templ;
19938}
19939
624d0f07
RS
19940/* Return the asm template for a PTRUES. CONST_UNSPEC is the
19941 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19942 pattern. */
19943
19944char *
19945aarch64_output_sve_ptrues (rtx const_unspec)
19946{
19947 static char templ[40];
19948
19949 struct simd_immediate_info info;
19950 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19951 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19952
19953 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19954 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19955 svpattern_token (info.u.pattern));
19956 return templ;
19957}
19958
88b08073
JG
19959/* Split operands into moves from op[1] + op[2] into op[0]. */
19960
19961void
19962aarch64_split_combinev16qi (rtx operands[3])
19963{
19964 unsigned int dest = REGNO (operands[0]);
19965 unsigned int src1 = REGNO (operands[1]);
19966 unsigned int src2 = REGNO (operands[2]);
ef4bddc2 19967 machine_mode halfmode = GET_MODE (operands[1]);
462a99aa 19968 unsigned int halfregs = REG_NREGS (operands[1]);
88b08073
JG
19969 rtx destlo, desthi;
19970
19971 gcc_assert (halfmode == V16QImode);
19972
19973 if (src1 == dest && src2 == dest + halfregs)
19974 {
19975 /* No-op move. Can't split to nothing; emit something. */
19976 emit_note (NOTE_INSN_DELETED);
19977 return;
19978 }
19979
19980 /* Preserve register attributes for variable tracking. */
19981 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19982 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19983 GET_MODE_SIZE (halfmode));
19984
19985 /* Special case of reversed high/low parts. */
19986 if (reg_overlap_mentioned_p (operands[2], destlo)
19987 && reg_overlap_mentioned_p (operands[1], desthi))
19988 {
19989 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19990 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
19991 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
19992 }
19993 else if (!reg_overlap_mentioned_p (operands[2], destlo))
19994 {
19995 /* Try to avoid unnecessary moves if part of the result
19996 is in the right place already. */
19997 if (src1 != dest)
19998 emit_move_insn (destlo, operands[1]);
19999 if (src2 != dest + halfregs)
20000 emit_move_insn (desthi, operands[2]);
20001 }
20002 else
20003 {
20004 if (src2 != dest + halfregs)
20005 emit_move_insn (desthi, operands[2]);
20006 if (src1 != dest)
20007 emit_move_insn (destlo, operands[1]);
20008 }
20009}
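/* Illustrative sketch, not part of the GCC sources: the three XORs emitted
   for the reversed high/low case above are the classic in-place exchange of
   two registers without a scratch register.  The helper name below is
   hypothetical; note the trick requires the two operands to be distinct.  */
static void
example_xor_swap (unsigned int *a, unsigned int *b)
{
  /* Assumes A and B point to distinct objects.  */
  *a ^= *b;
  *b ^= *a;
  *a ^= *b;
}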
20010
20011/* vec_perm support. */
20012
88b08073
JG
20013struct expand_vec_perm_d
20014{
20015 rtx target, op0, op1;
e3342de4 20016 vec_perm_indices perm;
ef4bddc2 20017 machine_mode vmode;
43cacb12 20018 unsigned int vec_flags;
88b08073
JG
20019 bool one_vector_p;
20020 bool testing_p;
20021};
20022
7efc03fd
DP
20023static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20024
88b08073
JG
20025/* Generate a variable permutation. */
20026
20027static void
20028aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20029{
ef4bddc2 20030 machine_mode vmode = GET_MODE (target);
88b08073
JG
20031 bool one_vector_p = rtx_equal_p (op0, op1);
20032
20033 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20034 gcc_checking_assert (GET_MODE (op0) == vmode);
20035 gcc_checking_assert (GET_MODE (op1) == vmode);
20036 gcc_checking_assert (GET_MODE (sel) == vmode);
20037 gcc_checking_assert (TARGET_SIMD);
20038
20039 if (one_vector_p)
20040 {
20041 if (vmode == V8QImode)
20042 {
20043 /* Expand the argument to a V16QI mode by duplicating it. */
20044 rtx pair = gen_reg_rtx (V16QImode);
20045 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20046 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20047 }
20048 else
20049 {
20050 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20051 }
20052 }
20053 else
20054 {
20055 rtx pair;
20056
20057 if (vmode == V8QImode)
20058 {
20059 pair = gen_reg_rtx (V16QImode);
20060 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20061 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20062 }
20063 else
20064 {
20065 pair = gen_reg_rtx (OImode);
20066 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20067 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20068 }
20069 }
20070}
20071
80940017
RS
20072/* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20073 NELT is the number of elements in the vector. */
20074
88b08073 20075void
80940017
RS
20076aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20077 unsigned int nelt)
88b08073 20078{
ef4bddc2 20079 machine_mode vmode = GET_MODE (target);
88b08073 20080 bool one_vector_p = rtx_equal_p (op0, op1);
f7c4e5b8 20081 rtx mask;
88b08073
JG
20082
20083 /* The TBL instruction does not use a modulo index, so we must take care
20084 of that ourselves. */
f7c4e5b8
AL
20085 mask = aarch64_simd_gen_const_vector_dup (vmode,
20086 one_vector_p ? nelt - 1 : 2 * nelt - 1);
88b08073
JG
20087 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20088
f7c4e5b8
AL
20089 /* For big-endian, we also need to reverse the index within the vector
20090 (but not which vector). */
20091 if (BYTES_BIG_ENDIAN)
20092 {
20093 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
20094 if (!one_vector_p)
20095 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20096 sel = expand_simple_binop (vmode, XOR, sel, mask,
20097 NULL, 0, OPTAB_LIB_WIDEN);
20098 }
88b08073
JG
20099 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
20100}
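/* Illustrative sketch, not part of the GCC sources: the scalar effect of the
   index massaging above.  With NELT = 16 and two input vectors, index 18
   (element 2 of the second vector) is reduced modulo 32; on big-endian it is
   then XORed with 15, giving 29, which still selects the second vector but
   reverses the element position within it.  The helper name below is
   hypothetical.  */
static unsigned int
example_tbl_index (unsigned int idx, unsigned int nelt,
		   bool one_vector_p, bool big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  idx &= mask;			/* TBL does not use a modulo index.  */
  if (big_endian_p)
    idx ^= nelt - 1;		/* Reverse the index within the vector.  */
  return idx;
}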
20101
43cacb12
RS
20102/* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
20103
20104static void
20105emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20106{
20107 emit_insn (gen_rtx_SET (target,
20108 gen_rtx_UNSPEC (GET_MODE (target),
20109 gen_rtvec (2, op0, op1), code)));
20110}
20111
20112/* Expand an SVE vec_perm with the given operands. */
20113
20114void
20115aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20116{
20117 machine_mode data_mode = GET_MODE (target);
20118 machine_mode sel_mode = GET_MODE (sel);
20119 /* Enforced by the pattern condition. */
20120 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20121
20122 /* Note: vec_perm indices are supposed to wrap when they go beyond the
20123 size of the two value vectors, i.e. the upper bits of the indices
20124 are effectively ignored. SVE TBL instead produces 0 for any
20125 out-of-range indices, so we need to modulo all the vec_perm indices
20126 to ensure they are all in range. */
20127 rtx sel_reg = force_reg (sel_mode, sel);
20128
20129 /* Check if the sel only references the first values vector. */
20130 if (GET_CODE (sel) == CONST_VECTOR
20131 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20132 {
20133 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20134 return;
20135 }
20136
20137 /* Check if the two values vectors are the same. */
20138 if (rtx_equal_p (op0, op1))
20139 {
20140 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20141 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20142 NULL, 0, OPTAB_DIRECT);
20143 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20144 return;
20145 }
20146
20147 /* Run TBL on each value vector and combine the results. */
20148
20149 rtx res0 = gen_reg_rtx (data_mode);
20150 rtx res1 = gen_reg_rtx (data_mode);
20151 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20152 if (GET_CODE (sel) != CONST_VECTOR
20153 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20154 {
20155 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20156 2 * nunits - 1);
20157 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20158 NULL, 0, OPTAB_DIRECT);
20159 }
20160 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20161 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20162 NULL, 0, OPTAB_DIRECT);
20163 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20164 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20165 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20166 else
20167 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20168}
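/* Illustrative sketch, not part of the GCC sources: a scalar model of the
   two-input case above.  Because SVE TBL returns zero for out-of-range
   indices, the two per-input lookups select disjoint elements and can be
   combined with an inclusive OR.  SEL_I is assumed to have already been
   reduced modulo 2 * NUNITS, as in the code above; the helper name is
   hypothetical.  */
static unsigned int
example_sve_two_input_tbl (const unsigned int *op0, const unsigned int *op1,
			   unsigned int nunits, unsigned int sel_i)
{
  unsigned int res0 = sel_i < nunits ? op0[sel_i] : 0;
  /* SUB wraps to a large out-of-range value when SEL_I < NUNITS.  */
  unsigned int sub = sel_i - nunits;
  unsigned int res1 = sub < nunits ? op1[sub] : 0;
  return res0 | res1;
}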
20169
cc4d934f
JG
20170/* Recognize patterns suitable for the TRN instructions. */
20171static bool
20172aarch64_evpc_trn (struct expand_vec_perm_d *d)
20173{
6a70badb
RS
20174 HOST_WIDE_INT odd;
20175 poly_uint64 nelt = d->perm.length ();
cc4d934f 20176 rtx out, in0, in1, x;
ef4bddc2 20177 machine_mode vmode = d->vmode;
cc4d934f
JG
20178
20179 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20180 return false;
20181
20182 /* Note that these are little-endian tests.
20183 We correct for big-endian later. */
6a70badb
RS
20184 if (!d->perm[0].is_constant (&odd)
20185 || (odd != 0 && odd != 1)
326ac20e
RS
20186 || !d->perm.series_p (0, 2, odd, 2)
20187 || !d->perm.series_p (1, 2, nelt + odd, 2))
cc4d934f 20188 return false;
cc4d934f
JG
20189
20190 /* Success! */
20191 if (d->testing_p)
20192 return true;
20193
20194 in0 = d->op0;
20195 in1 = d->op1;
43cacb12
RS
20196 /* We don't need a big-endian lane correction for SVE; see the comment
20197 at the head of aarch64-sve.md for details. */
20198 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20199 {
20200 x = in0, in0 = in1, in1 = x;
20201 odd = !odd;
20202 }
20203 out = d->target;
20204
3f8334a5
RS
20205 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20206 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
cc4d934f
JG
20207 return true;
20208}
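/* Illustrative example, not part of the GCC sources: on V4SI with inputs
   {a0,a1,a2,a3} and {b0,b1,b2,b3}, the selector {0,4,2,6} is matched with
   ODD = 0 and becomes TRN1, giving {a0,b0,a2,b2}, while {1,5,3,7} is matched
   with ODD = 1 and becomes TRN2, giving {a1,b1,a3,b3}.  */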
20209
7efc03fd
DP
20210/* Try to re-encode the PERM constant so it combines odd and even elements.
20211 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
20212 We retry with this new constant with the full suite of patterns. */
20213static bool
20214aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20215{
20216 expand_vec_perm_d newd;
20217 unsigned HOST_WIDE_INT nelt;
20218
20219 if (d->vec_flags != VEC_ADVSIMD)
20220 return false;
20221
20222 /* Get the new mode. Always twice the size of the inner
20223 and half the elements. */
20224 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20225 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20226 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20227 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20228
20229 if (new_mode == word_mode)
20230 return false;
20231
20232 /* to_constant is safe since this routine is specific to Advanced SIMD
20233 vectors. */
20234 nelt = d->perm.length ().to_constant ();
20235
20236 vec_perm_builder newpermconst;
20237 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20238
20239 /* Convert the perm constant if we can. Require even, odd as the pairs. */
20240 for (unsigned int i = 0; i < nelt; i += 2)
20241 {
20242 poly_int64 elt0 = d->perm[i];
20243 poly_int64 elt1 = d->perm[i + 1];
20244 poly_int64 newelt;
20245 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20246 return false;
20247 newpermconst.quick_push (newelt.to_constant ());
20248 }
20249 newpermconst.finalize ();
20250
20251 newd.vmode = new_mode;
20252 newd.vec_flags = VEC_ADVSIMD;
20253 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20254 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20255 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20256 newd.testing_p = d->testing_p;
20257 newd.one_vector_p = d->one_vector_p;
20258
20259 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20260 return aarch64_expand_vec_perm_const_1 (&newd);
20261}
20262
cc4d934f
JG
20263/* Recognize patterns suitable for the UZP instructions. */
20264static bool
20265aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20266{
6a70badb 20267 HOST_WIDE_INT odd;
cc4d934f 20268 rtx out, in0, in1, x;
ef4bddc2 20269 machine_mode vmode = d->vmode;
cc4d934f
JG
20270
20271 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20272 return false;
20273
20274 /* Note that these are little-endian tests.
20275 We correct for big-endian later. */
6a70badb
RS
20276 if (!d->perm[0].is_constant (&odd)
20277 || (odd != 0 && odd != 1)
326ac20e 20278 || !d->perm.series_p (0, 1, odd, 2))
cc4d934f 20279 return false;
cc4d934f
JG
20280
20281 /* Success! */
20282 if (d->testing_p)
20283 return true;
20284
20285 in0 = d->op0;
20286 in1 = d->op1;
43cacb12
RS
20287 /* We don't need a big-endian lane correction for SVE; see the comment
20288 at the head of aarch64-sve.md for details. */
20289 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20290 {
20291 x = in0, in0 = in1, in1 = x;
20292 odd = !odd;
20293 }
20294 out = d->target;
20295
3f8334a5
RS
20296 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20297 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
cc4d934f
JG
20298 return true;
20299}
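/* Illustrative example, not part of the GCC sources: on V4SI with inputs
   {a0,a1,a2,a3} and {b0,b1,b2,b3}, the selector {0,2,4,6} is matched with
   ODD = 0 and becomes UZP1, giving {a0,a2,b0,b2}, while {1,3,5,7} becomes
   UZP2, giving {a1,a3,b1,b3}.  */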
20300
20301/* Recognize patterns suitable for the ZIP instructions. */
20302static bool
20303aarch64_evpc_zip (struct expand_vec_perm_d *d)
20304{
6a70badb
RS
20305 unsigned int high;
20306 poly_uint64 nelt = d->perm.length ();
cc4d934f 20307 rtx out, in0, in1, x;
ef4bddc2 20308 machine_mode vmode = d->vmode;
cc4d934f
JG
20309
20310 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20311 return false;
20312
20313 /* Note that these are little-endian tests.
20314 We correct for big-endian later. */
6a70badb
RS
20315 poly_uint64 first = d->perm[0];
20316 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20317 || !d->perm.series_p (0, 2, first, 1)
20318 || !d->perm.series_p (1, 2, first + nelt, 1))
cc4d934f 20319 return false;
6a70badb 20320 high = maybe_ne (first, 0U);
cc4d934f
JG
20321
20322 /* Success! */
20323 if (d->testing_p)
20324 return true;
20325
20326 in0 = d->op0;
20327 in1 = d->op1;
43cacb12
RS
20328 /* We don't need a big-endian lane correction for SVE; see the comment
20329 at the head of aarch64-sve.md for details. */
20330 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
cc4d934f
JG
20331 {
20332 x = in0, in0 = in1, in1 = x;
20333 high = !high;
20334 }
20335 out = d->target;
20336
3f8334a5
RS
20337 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20338 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
cc4d934f
JG
20339 return true;
20340}
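/* Illustrative example, not part of the GCC sources: on V4SI with inputs
   {a0,a1,a2,a3} and {b0,b1,b2,b3}, the selector {0,4,1,5} is matched with
   FIRST = 0 and becomes ZIP1, giving {a0,b0,a1,b1}, while {2,6,3,7} is
   matched with FIRST = NELT/2 and becomes ZIP2, giving {a2,b2,a3,b3}.  */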
20341
ae0533da
AL
20342/* Recognize patterns for the EXT insn. */
20343
20344static bool
20345aarch64_evpc_ext (struct expand_vec_perm_d *d)
20346{
6a70badb 20347 HOST_WIDE_INT location;
ae0533da
AL
20348 rtx offset;
20349
6a70badb
RS
20350 /* The first element always refers to the first vector.
20351 Check if the extracted indices are increasing by one. */
43cacb12
RS
20352 if (d->vec_flags == VEC_SVE_PRED
20353 || !d->perm[0].is_constant (&location)
6a70badb 20354 || !d->perm.series_p (0, 1, location, 1))
326ac20e 20355 return false;
ae0533da 20356
ae0533da
AL
20357 /* Success! */
20358 if (d->testing_p)
20359 return true;
20360
b31e65bb 20361 /* The case where (location == 0) is a no-op for both big- and little-endian,
43cacb12 20362 and is removed by the mid-end at optimization levels -O1 and higher.
b31e65bb 20363
43cacb12
RS
20364 We don't need a big-endian lane correction for SVE; see the comment
20365 at the head of aarch64-sve.md for details. */
20366 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
ae0533da
AL
20367 {
20368 /* After setup, we want the high elements of the first vector (stored
20369 at the LSB end of the register), and the low elements of the second
20370 vector (stored at the MSB end of the register). So swap. */
cb5c6c29 20371 std::swap (d->op0, d->op1);
6a70badb
RS
20372 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20373 to_constant () is safe since this is restricted to Advanced SIMD
20374 vectors. */
20375 location = d->perm.length ().to_constant () - location;
ae0533da
AL
20376 }
20377
20378 offset = GEN_INT (location);
3f8334a5
RS
20379 emit_set_insn (d->target,
20380 gen_rtx_UNSPEC (d->vmode,
20381 gen_rtvec (3, d->op0, d->op1, offset),
20382 UNSPEC_EXT));
ae0533da
AL
20383 return true;
20384}
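/* Illustrative example, not part of the GCC sources: on V4SI with inputs
   {a0,a1,a2,a3} and {b0,b1,b2,b3}, the selector {1,2,3,4} is matched with
   LOCATION = 1 and becomes an EXT producing {a1,a2,a3,b0}, i.e. the
   concatenation of the two inputs shifted down by one element.  */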
20385
43cacb12
RS
20386/* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20387 within each 64-bit, 32-bit or 16-bit granule. */
923fcec3
AL
20388
20389static bool
43cacb12 20390aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
923fcec3 20391{
6a70badb
RS
20392 HOST_WIDE_INT diff;
20393 unsigned int i, size, unspec;
43cacb12 20394 machine_mode pred_mode;
923fcec3 20395
43cacb12
RS
20396 if (d->vec_flags == VEC_SVE_PRED
20397 || !d->one_vector_p
98452668
AC
20398 || !d->perm[0].is_constant (&diff)
20399 || !diff)
923fcec3
AL
20400 return false;
20401
3f8334a5
RS
20402 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
20403 if (size == 8)
43cacb12
RS
20404 {
20405 unspec = UNSPEC_REV64;
20406 pred_mode = VNx2BImode;
20407 }
3f8334a5 20408 else if (size == 4)
43cacb12
RS
20409 {
20410 unspec = UNSPEC_REV32;
20411 pred_mode = VNx4BImode;
20412 }
3f8334a5 20413 else if (size == 2)
43cacb12
RS
20414 {
20415 unspec = UNSPEC_REV16;
20416 pred_mode = VNx8BImode;
20417 }
3f8334a5
RS
20418 else
20419 return false;
923fcec3 20420
326ac20e
RS
20421 unsigned int step = diff + 1;
20422 for (i = 0; i < step; ++i)
20423 if (!d->perm.series_p (i, step, diff - i, step))
20424 return false;
923fcec3
AL
20425
20426 /* Success! */
20427 if (d->testing_p)
20428 return true;
20429
43cacb12
RS
20430 if (d->vec_flags == VEC_SVE_DATA)
20431 {
d7a09c44
RS
20432 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
20433 rtx target = gen_reg_rtx (int_mode);
20434 if (BYTES_BIG_ENDIAN)
20435 /* The act of taking a subreg between INT_MODE and d->vmode
20436 is itself a reversing operation on big-endian targets;
20437 see the comment at the head of aarch64-sve.md for details.
20438 First reinterpret OP0 as INT_MODE without using a subreg
20439 and without changing the contents. */
20440 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
20441 else
20442 {
20443 /* For SVE we use REV[BHW] unspecs derived from the element size
20444 of d->vmode and vector modes whose elements have SIZE bytes.
20445 This ensures that the vector modes match the predicate modes. */
20446 int unspec = aarch64_sve_rev_unspec (d->vmode);
20447 rtx pred = aarch64_ptrue_reg (pred_mode);
20448 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
20449 gen_lowpart (int_mode, d->op0)));
20450 }
20451 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20452 return true;
43cacb12 20453 }
d7a09c44 20454 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
43cacb12
RS
20455 emit_set_insn (d->target, src);
20456 return true;
20457}
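/* Illustrative example, not part of the GCC sources: on V4SI the selector
   {1,0,3,2} is matched with DIFF = 1 (64-bit granules) and becomes REV64,
   reversing the pair of 32-bit elements within each 64-bit chunk.  */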
20458
20459/* Recognize patterns for the REV insn, which reverses elements within
20460 a full vector. */
20461
20462static bool
20463aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20464{
20465 poly_uint64 nelt = d->perm.length ();
20466
28350fd1 20467 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
43cacb12
RS
20468 return false;
20469
20470 if (!d->perm.series_p (0, 1, nelt - 1, -1))
20471 return false;
20472
20473 /* Success! */
20474 if (d->testing_p)
20475 return true;
20476
20477 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20478 emit_set_insn (d->target, src);
923fcec3
AL
20479 return true;
20480}
20481
91bd4114
JG
20482static bool
20483aarch64_evpc_dup (struct expand_vec_perm_d *d)
20484{
91bd4114
JG
20485 rtx out = d->target;
20486 rtx in0;
6a70badb 20487 HOST_WIDE_INT elt;
ef4bddc2 20488 machine_mode vmode = d->vmode;
91bd4114
JG
20489 rtx lane;
20490
43cacb12
RS
20491 if (d->vec_flags == VEC_SVE_PRED
20492 || d->perm.encoding ().encoded_nelts () != 1
6a70badb 20493 || !d->perm[0].is_constant (&elt))
326ac20e
RS
20494 return false;
20495
43cacb12
RS
20496 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20497 return false;
20498
326ac20e
RS
20499 /* Success! */
20500 if (d->testing_p)
20501 return true;
20502
91bd4114
JG
20503 /* The generic preparation in aarch64_expand_vec_perm_const_1
20504 swaps the operand order and the permute indices if it finds
20505 d->perm[0] to be in the second operand. Thus, we can always
20506 use d->op0 and need not do any extra arithmetic to get the
20507 correct lane number. */
20508 in0 = d->op0;
f901401e 20509 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
91bd4114 20510
3f8334a5
RS
20511 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20512 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20513 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
91bd4114
JG
20514 return true;
20515}
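/* Illustrative example, not part of the GCC sources: on V4SI the selector
   {2,2,2,2} is matched with ELT = 2 (a duplicated element has a single
   encoded value) and becomes a DUP of lane 2 of the first input.  */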
20516
88b08073
JG
20517static bool
20518aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20519{
43cacb12 20520 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
ef4bddc2 20521 machine_mode vmode = d->vmode;
6a70badb
RS
20522
20523 /* Make sure that the indices are constant. */
20524 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20525 for (unsigned int i = 0; i < encoded_nelts; ++i)
20526 if (!d->perm[i].is_constant ())
20527 return false;
88b08073 20528
88b08073
JG
20529 if (d->testing_p)
20530 return true;
20531
20532 /* Generic code will try constant permutation twice: once with the
20533 original mode and again with the elements lowered to QImode.
20534 So wait and don't do the selector expansion ourselves. */
20535 if (vmode != V8QImode && vmode != V16QImode)
20536 return false;
20537
6a70badb
RS
20538 /* to_constant is safe since this routine is specific to Advanced SIMD
20539 vectors. */
20540 unsigned int nelt = d->perm.length ().to_constant ();
20541 for (unsigned int i = 0; i < nelt; ++i)
20542 /* If big-endian and two vectors we end up with a weird mixed-endian
20543 mode on NEON. Reverse the index within each word but not the word
20544 itself. to_constant is safe because we checked is_constant above. */
20545 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20546 ? d->perm[i].to_constant () ^ (nelt - 1)
20547 : d->perm[i].to_constant ());
bbcc9c00 20548
88b08073
JG
20549 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20550 sel = force_reg (vmode, sel);
20551
20552 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20553 return true;
20554}
20555
43cacb12
RS
20556/* Try to implement D using an SVE TBL instruction. */
20557
20558static bool
20559aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20560{
20561 unsigned HOST_WIDE_INT nelt;
20562
20563 /* Permuting two variable-length vectors could overflow the
20564 index range. */
20565 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20566 return false;
20567
20568 if (d->testing_p)
20569 return true;
20570
d083ee47 20571 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
43cacb12 20572 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
e25c95ef
RS
20573 if (d->one_vector_p)
20574 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20575 else
20576 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
43cacb12
RS
20577 return true;
20578}
20579
9556ef20
PK
20580/* Try to implement D using SVE SEL instruction. */
20581
20582static bool
20583aarch64_evpc_sel (struct expand_vec_perm_d *d)
20584{
20585 machine_mode vmode = d->vmode;
20586 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20587
20588 if (d->vec_flags != VEC_SVE_DATA
20589 || unit_size > 8)
20590 return false;
20591
20592 int n_patterns = d->perm.encoding ().npatterns ();
20593 poly_int64 vec_len = d->perm.length ();
20594
20595 for (int i = 0; i < n_patterns; ++i)
20596 if (!known_eq (d->perm[i], i)
20597 && !known_eq (d->perm[i], vec_len + i))
20598 return false;
20599
20600 for (int i = n_patterns; i < n_patterns * 2; i++)
20601 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20602 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20603 return false;
20604
20605 if (d->testing_p)
20606 return true;
20607
cc68f7c2 20608 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
9556ef20 20609
b2f5b380 20610 /* Build a predicate that is true when op0 elements should be used. */
9556ef20
PK
20611 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20612 for (int i = 0; i < n_patterns * 2; i++)
20613 {
20614 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20615 : CONST0_RTX (BImode);
20616 builder.quick_push (elem);
20617 }
20618
20619 rtx const_vec = builder.build ();
20620 rtx pred = force_reg (pred_mode, const_vec);
b2f5b380
RS
20621 /* TARGET = PRED ? OP0 : OP1. */
20622 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
9556ef20
PK
20623 return true;
20624}
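/* Illustrative example, not part of the GCC sources: for an SVE mode with N
   elements, a blend selector of the form {0, N+1, 2, N+3, ...} is matched
   here and becomes a SEL with predicate {1, 0, 1, 0, ...}, taking even
   elements from the first input and odd elements from the second.  */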
20625
88b08073
JG
20626static bool
20627aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20628{
20629 /* The pattern matching functions above are written to look for a small
20630 number to begin the sequence (0, 1, N/2). If we begin with an index
20631 from the second operand, we can swap the operands. */
6a70badb
RS
20632 poly_int64 nelt = d->perm.length ();
20633 if (known_ge (d->perm[0], nelt))
88b08073 20634 {
e3342de4 20635 d->perm.rotate_inputs (1);
cb5c6c29 20636 std::swap (d->op0, d->op1);
88b08073
JG
20637 }
20638
43cacb12
RS
20639 if ((d->vec_flags == VEC_ADVSIMD
20640 || d->vec_flags == VEC_SVE_DATA
20641 || d->vec_flags == VEC_SVE_PRED)
20642 && known_gt (nelt, 1))
cc4d934f 20643 {
43cacb12
RS
20644 if (aarch64_evpc_rev_local (d))
20645 return true;
20646 else if (aarch64_evpc_rev_global (d))
923fcec3
AL
20647 return true;
20648 else if (aarch64_evpc_ext (d))
ae0533da 20649 return true;
f901401e
AL
20650 else if (aarch64_evpc_dup (d))
20651 return true;
ae0533da 20652 else if (aarch64_evpc_zip (d))
cc4d934f
JG
20653 return true;
20654 else if (aarch64_evpc_uzp (d))
20655 return true;
20656 else if (aarch64_evpc_trn (d))
20657 return true;
9556ef20
PK
20658 else if (aarch64_evpc_sel (d))
20659 return true;
7efc03fd
DP
20660 else if (aarch64_evpc_reencode (d))
20661 return true;
43cacb12
RS
20662 if (d->vec_flags == VEC_SVE_DATA)
20663 return aarch64_evpc_sve_tbl (d);
4ec8bb67 20664 else if (d->vec_flags == VEC_ADVSIMD)
43cacb12 20665 return aarch64_evpc_tbl (d);
cc4d934f 20666 }
88b08073
JG
20667 return false;
20668}
20669
f151c9e1 20670/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
88b08073 20671
f151c9e1
RS
20672static bool
20673aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20674 rtx op1, const vec_perm_indices &sel)
88b08073
JG
20675{
20676 struct expand_vec_perm_d d;
88b08073 20677
326ac20e 20678 /* Check whether the mask can be applied to a single vector. */
e25c95ef
RS
20679 if (sel.ninputs () == 1
20680 || (op0 && rtx_equal_p (op0, op1)))
326ac20e
RS
20681 d.one_vector_p = true;
20682 else if (sel.all_from_input_p (0))
88b08073 20683 {
326ac20e
RS
20684 d.one_vector_p = true;
20685 op1 = op0;
88b08073 20686 }
326ac20e 20687 else if (sel.all_from_input_p (1))
88b08073 20688 {
88b08073 20689 d.one_vector_p = true;
326ac20e 20690 op0 = op1;
88b08073 20691 }
326ac20e
RS
20692 else
20693 d.one_vector_p = false;
88b08073 20694
326ac20e
RS
20695 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20696 sel.nelts_per_input ());
20697 d.vmode = vmode;
43cacb12 20698 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
326ac20e
RS
20699 d.target = target;
20700 d.op0 = op0;
20701 d.op1 = op1;
20702 d.testing_p = !target;
e3342de4 20703
f151c9e1
RS
20704 if (!d.testing_p)
20705 return aarch64_expand_vec_perm_const_1 (&d);
88b08073 20706
326ac20e 20707 rtx_insn *last = get_last_insn ();
f151c9e1 20708 bool ret = aarch64_expand_vec_perm_const_1 (&d);
326ac20e 20709 gcc_assert (last == get_last_insn ());
88b08073
JG
20710
20711 return ret;
20712}
20713
73e3da51
RS
20714/* Generate a byte permute mask for a register of mode MODE,
20715 which has NUNITS units. */
20716
668046d1 20717rtx
73e3da51 20718aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
668046d1
DS
20719{
20720 /* We have to reverse each vector because we don't have
20721 a permuted load that can reverse-load according to ABI rules. */
20722 rtx mask;
20723 rtvec v = rtvec_alloc (16);
73e3da51
RS
20724 unsigned int i, j;
20725 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
668046d1
DS
20726
20727 gcc_assert (BYTES_BIG_ENDIAN);
20728 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20729
20730 for (i = 0; i < nunits; i++)
20731 for (j = 0; j < usize; j++)
20732 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20733 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20734 return force_reg (V16QImode, mask);
20735}
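/* Illustrative sketch, not part of the GCC sources: the byte indices the
   loop above generates.  For a mode with 4-byte units the 16 mask bytes are
   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, i.e. each element is
   byte-reversed in place.  The helper name below is hypothetical.  */
static void
example_reverse_mask_indices (unsigned int usize, unsigned int nunits,
			      unsigned char out[16])
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      out[i * usize + j] = (unsigned char) ((i + 1) * usize - 1 - j);
}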
20736
4a942af6 20737/* Expand an SVE integer comparison using the SVE equivalent of:
f22d7973 20738
4a942af6
RS
20739 (set TARGET (CODE OP0 OP1)). */
20740
20741void
20742aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
f22d7973 20743{
4a942af6
RS
20744 machine_mode pred_mode = GET_MODE (target);
20745 machine_mode data_mode = GET_MODE (op0);
00fa90d9
RS
20746 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20747 op0, op1);
20748 if (!rtx_equal_p (target, res))
20749 emit_move_insn (target, res);
f22d7973
RS
20750}
20751
43cacb12
RS
20752/* Return the UNSPEC_COND_* code for comparison CODE. */
20753
20754static unsigned int
20755aarch64_unspec_cond_code (rtx_code code)
20756{
20757 switch (code)
20758 {
20759 case NE:
cb18e86d 20760 return UNSPEC_COND_FCMNE;
43cacb12 20761 case EQ:
cb18e86d 20762 return UNSPEC_COND_FCMEQ;
43cacb12 20763 case LT:
cb18e86d 20764 return UNSPEC_COND_FCMLT;
43cacb12 20765 case GT:
cb18e86d 20766 return UNSPEC_COND_FCMGT;
43cacb12 20767 case LE:
cb18e86d 20768 return UNSPEC_COND_FCMLE;
43cacb12 20769 case GE:
cb18e86d 20770 return UNSPEC_COND_FCMGE;
4a942af6
RS
20771 case UNORDERED:
20772 return UNSPEC_COND_FCMUO;
43cacb12
RS
20773 default:
20774 gcc_unreachable ();
20775 }
20776}
20777
f22d7973 20778/* Emit:
43cacb12 20779
4a942af6 20780 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 20781
4a942af6
RS
20782 where <X> is the operation associated with comparison CODE.
20783 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
f22d7973
RS
20784
20785static void
4a942af6
RS
20786aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20787 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 20788{
4a942af6 20789 rtx flag = gen_int_mode (known_ptrue_p, SImode);
f22d7973 20790 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
4a942af6 20791 gen_rtvec (4, pred, flag, op0, op1),
f22d7973
RS
20792 aarch64_unspec_cond_code (code));
20793 emit_set_insn (target, unspec);
43cacb12
RS
20794}
20795
f22d7973 20796/* Emit the SVE equivalent of:
43cacb12 20797
4a942af6
RS
20798 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20799 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
f22d7973 20800 (set TARGET (ior:PRED_MODE TMP1 TMP2))
43cacb12 20801
4a942af6
RS
20802 where <Xi> is the operation associated with comparison CODEi.
20803 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
20804
20805static void
4a942af6
RS
20806aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20807 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 20808{
4a942af6 20809 machine_mode pred_mode = GET_MODE (pred);
43cacb12 20810 rtx tmp1 = gen_reg_rtx (pred_mode);
4a942af6 20811 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
43cacb12 20812 rtx tmp2 = gen_reg_rtx (pred_mode);
4a942af6 20813 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
f22d7973 20814 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
43cacb12
RS
20815}
20816
f22d7973 20817/* Emit the SVE equivalent of:
43cacb12 20818
4a942af6 20819 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
f22d7973 20820 (set TARGET (not TMP))
43cacb12 20821
4a942af6
RS
20822 where <X> is the operation associated with comparison CODE.
20823 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
43cacb12
RS
20824
20825static void
4a942af6
RS
20826aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20827 bool known_ptrue_p, rtx op0, rtx op1)
43cacb12 20828{
4a942af6 20829 machine_mode pred_mode = GET_MODE (pred);
f22d7973 20830 rtx tmp = gen_reg_rtx (pred_mode);
4a942af6 20831 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
f22d7973 20832 aarch64_emit_unop (target, one_cmpl_optab, tmp);
43cacb12
RS
20833}
20834
f22d7973 20835/* Expand an SVE floating-point comparison using the SVE equivalent of:
43cacb12 20836
f22d7973 20837 (set TARGET (CODE OP0 OP1))
43cacb12
RS
20838
20839 If CAN_INVERT_P is true, the caller can also handle inverted results;
20840 return true if the result is in fact inverted. */
20841
20842bool
20843aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20844 rtx op0, rtx op1, bool can_invert_p)
20845{
20846 machine_mode pred_mode = GET_MODE (target);
20847 machine_mode data_mode = GET_MODE (op0);
20848
16de3637 20849 rtx ptrue = aarch64_ptrue_reg (pred_mode);
43cacb12
RS
20850 switch (code)
20851 {
20852 case UNORDERED:
20853 /* UNORDERED has no immediate form. */
20854 op1 = force_reg (data_mode, op1);
f22d7973 20855 /* fall through */
43cacb12
RS
20856 case LT:
20857 case LE:
20858 case GT:
20859 case GE:
20860 case EQ:
20861 case NE:
f22d7973
RS
20862 {
20863 /* There is native support for the comparison. */
4a942af6 20864 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
20865 return false;
20866 }
43cacb12
RS
20867
20868 case LTGT:
20869 /* This is a trapping operation (LT or GT). */
4a942af6 20870 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
43cacb12
RS
20871 return false;
20872
20873 case UNEQ:
20874 if (!flag_trapping_math)
20875 {
20876 /* This would trap for signaling NaNs. */
20877 op1 = force_reg (data_mode, op1);
4a942af6
RS
20878 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20879 ptrue, true, op0, op1);
43cacb12
RS
20880 return false;
20881 }
20882 /* fall through */
43cacb12
RS
20883 case UNLT:
20884 case UNLE:
20885 case UNGT:
20886 case UNGE:
f22d7973
RS
20887 if (flag_trapping_math)
20888 {
20889 /* Work out which elements are ordered. */
20890 rtx ordered = gen_reg_rtx (pred_mode);
20891 op1 = force_reg (data_mode, op1);
4a942af6
RS
20892 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20893 ptrue, true, op0, op1);
f22d7973
RS
20894
20895 /* Test the opposite condition for the ordered elements,
20896 then invert the result. */
20897 if (code == UNEQ)
20898 code = NE;
20899 else
20900 code = reverse_condition_maybe_unordered (code);
20901 if (can_invert_p)
20902 {
4a942af6
RS
20903 aarch64_emit_sve_fp_cond (target, code,
20904 ordered, false, op0, op1);
f22d7973
RS
20905 return true;
20906 }
4a942af6
RS
20907 aarch64_emit_sve_invert_fp_cond (target, code,
20908 ordered, false, op0, op1);
f22d7973
RS
20909 return false;
20910 }
20911 break;
20912
20913 case ORDERED:
20914 /* ORDERED has no immediate form. */
20915 op1 = force_reg (data_mode, op1);
20916 break;
43cacb12
RS
20917
20918 default:
20919 gcc_unreachable ();
20920 }
f22d7973
RS
20921
20922 /* There is native support for the inverse comparison. */
20923 code = reverse_condition_maybe_unordered (code);
20924 if (can_invert_p)
20925 {
4a942af6 20926 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973
RS
20927 return true;
20928 }
4a942af6 20929 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
f22d7973 20930 return false;
43cacb12
RS
20931}
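
/* Illustrative sketch (not part of GCC): the scalar semantics behind the
   flag_trapping_math path above for UNGE.  The expander computes the
   ordered lanes with a quiet compare, tests the opposite condition (LT)
   only under that predicate, and inverts the result; the predication is
   shown here as a plain &&, since the C99 comparison macros are already
   quiet.  */
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

static bool
unge (double a, double b)
{
  bool ordered = !isunordered (a, b);            /* Quiet compare.  */
  bool lt_on_ordered = ordered && isless (a, b);
  return !lt_on_ordered;   /* UNGE == !(ordered && a < b).  */
}

int
main (void)
{
  printf ("%d %d %d\n", unge (3.0, 2.0), unge (1.0, 2.0), unge (NAN, 2.0));
  /* Prints: 1 0 1.  */
  return 0;
}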
20932
20933/* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
20934 of the data being selected and CMP_MODE is the mode of the values being
20935 compared. */
20936
20937void
20938aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
20939 rtx *ops)
20940{
10116ec1 20941 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
43cacb12
RS
20942 rtx pred = gen_reg_rtx (pred_mode);
20943 if (FLOAT_MODE_P (cmp_mode))
20944 {
20945 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
20946 ops[4], ops[5], true))
20947 std::swap (ops[1], ops[2]);
20948 }
20949 else
20950 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
20951
d29f7dd5
RS
20952 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
20953 ops[1] = force_reg (data_mode, ops[1]);
20954 /* The "false" value can only be zero if the "true" value is a constant. */
20955 if (register_operand (ops[1], data_mode)
20956 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
20957 ops[2] = force_reg (data_mode, ops[2]);
20958
43cacb12
RS
20959 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
20960 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
20961}
20962
99e1629f
RS
20963/* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
20964 true. However due to issues with register allocation it is preferable
20965 to avoid tying integer scalar and FP scalar modes. Executing integer
20966 operations in general registers is better than treating them as scalar
20967 vector operations. This reduces latency and avoids redundant int<->FP
20968 moves. So tie modes if they are either the same class, or vector modes
20969 with other vector modes, vector structs or any scalar mode. */
97e1ad78 20970
99e1629f 20971static bool
ef4bddc2 20972aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
97e1ad78
JG
20973{
20974 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
20975 return true;
20976
20977 /* We specifically want to allow elements of "structure" modes to
20978 be tieable to the structure. This more general condition allows
43cacb12
RS
20979 other rarer situations too. The reason we don't extend this to
20980 predicate modes is that there are no predicate structure modes
20981 nor any specific instructions for extracting part of a predicate
20982 register. */
20983 if (aarch64_vector_data_mode_p (mode1)
20984 && aarch64_vector_data_mode_p (mode2))
61f17a5c
WD
20985 return true;
20986
20987 /* Also allow any scalar modes with vectors. */
20988 if (aarch64_vector_mode_supported_p (mode1)
20989 || aarch64_vector_mode_supported_p (mode2))
97e1ad78
JG
20990 return true;
20991
20992 return false;
20993}
20994
e2c75eea
JG
20995/* Return a new RTX holding the result of moving POINTER forward by
20996 AMOUNT bytes. */
20997
20998static rtx
6a70badb 20999aarch64_move_pointer (rtx pointer, poly_int64 amount)
e2c75eea
JG
21000{
21001 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21002
21003 return adjust_automodify_address (pointer, GET_MODE (pointer),
21004 next, amount);
21005}
21006
21007/* Return a new RTX holding the result of moving POINTER forward by the
21008 size of the mode it points to. */
21009
21010static rtx
21011aarch64_progress_pointer (rtx pointer)
21012{
6a70badb 21013 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
e2c75eea
JG
21014}
21015
21016/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21017 MODE bytes. */
21018
21019static void
21020aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
ef4bddc2 21021 machine_mode mode)
e2c75eea
JG
21022{
21023 rtx reg = gen_reg_rtx (mode);
21024
21025 /* "Cast" the pointers to the correct mode. */
21026 *src = adjust_address (*src, mode, 0);
21027 *dst = adjust_address (*dst, mode, 0);
21028 /* Emit the memcpy. */
21029 emit_move_insn (reg, *src);
21030 emit_move_insn (*dst, reg);
21031 /* Move the pointers forward. */
21032 *src = aarch64_progress_pointer (*src);
21033 *dst = aarch64_progress_pointer (*dst);
21034}
21035
76715c32 21036/* Expand cpymem, as if from a __builtin_memcpy. Return true if
e2c75eea
JG
21037 we succeed, otherwise return false. */
21038
21039bool
76715c32 21040aarch64_expand_cpymem (rtx *operands)
e2c75eea 21041{
89c52e5e 21042 int n, mode_bits;
e2c75eea
JG
21043 rtx dst = operands[0];
21044 rtx src = operands[1];
21045 rtx base;
89c52e5e 21046 machine_mode cur_mode = BLKmode, next_mode;
e2c75eea
JG
21047 bool speed_p = !optimize_function_for_size_p (cfun);
21048
21049 /* When optimizing for size, give a better estimate of the length of a
89c52e5e
TC
21050 memcpy call, but use the default otherwise. Moves larger than 8 bytes
21051 will always require an even number of instructions, and each operation
21052 requires both a load and a store, so divide the max number by 2. */
21053 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
e2c75eea
JG
21054
21055 /* We can't do anything smart if the amount to copy is not constant. */
21056 if (!CONST_INT_P (operands[2]))
21057 return false;
21058
89c52e5e 21059 n = INTVAL (operands[2]);
e2c75eea 21060
89c52e5e
TC
21061 /* Try to keep the number of instructions low. For all cases we will do at
21062 most two moves for the residual amount, since we'll always overlap the
21063 remainder. */
21064 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
e2c75eea
JG
21065 return false;
21066
21067 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21068 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21069
21070 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21071 src = adjust_automodify_address (src, VOIDmode, base, 0);
21072
89c52e5e
TC
21073 /* Convert n to bits to make the rest of the code simpler. */
21074 n = n * BITS_PER_UNIT;
e2c75eea 21075
f7e1d19d
TC
21076 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
21077 larger than TImode, but we should not use them for loads/stores here. */
21078 const int copy_limit = GET_MODE_BITSIZE (TImode);
21079
89c52e5e 21080 while (n > 0)
e2c75eea 21081 {
89c52e5e
TC
21082 /* Find the largest mode in which to do the copy without over-reading
21083 or over-writing. */
21084 opt_scalar_int_mode mode_iter;
21085 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
f7e1d19d 21086 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
89c52e5e 21087 cur_mode = mode_iter.require ();
e2c75eea 21088
89c52e5e 21089 gcc_assert (cur_mode != BLKmode);
e2c75eea 21090
89c52e5e
TC
21091 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21092 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
e2c75eea 21093
89c52e5e 21094 n -= mode_bits;
e2c75eea 21095
89c52e5e
TC
21096 /* Do certain trailing copies as overlapping if it's going to be
21097 cheaper, i.e. fewer instructions are needed. For instance, for a 15
21098 byte copy it's more efficient to do two overlapping 8 byte copies than
21099 8 + 6 + 1. */
f7e1d19d 21100 if (n > 0 && n <= 8 * BITS_PER_UNIT)
89c52e5e 21101 {
f7e1d19d
TC
21102 next_mode = smallest_mode_for_size (n, MODE_INT);
21103 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
89c52e5e
TC
21104 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21105 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21106 n = n_bits;
e2c75eea
JG
21107 }
21108 }
21109
21110 return true;
21111}
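
/* Illustrative sketch (not part of GCC): the chunk-selection logic of
   aarch64_expand_cpymem above, restated on plain byte counts.  The chunk
   sizes 1/2/4/8/16 stand in for QI/HI/SI/DI/TImode, and the tail is
   widened to the next chunk size with the offset pulled back so that the
   last two copies overlap, e.g. a 15-byte copy becomes an 8-byte copy at
   offset 0 and another at offset 7.  */
#include <stdio.h>

static void
plan_copy (int n)
{
  int offset = 0;
  while (n > 0)
    {
      /* Largest chunk (up to 16 bytes) that does not over-read.  */
      int chunk = 16;
      while (chunk > n)
        chunk /= 2;
      printf ("copy %d bytes at offset %d\n", chunk, offset);
      offset += chunk;
      n -= chunk;
      /* Overlap the tail: widen it to the covering chunk size and step
         the offset back accordingly.  */
      if (n > 0 && n <= 8)
        {
          int wide = 1;
          while (wide < n)
            wide *= 2;
          offset -= wide - n;
          n = wide;
        }
    }
}

int
main (void)
{
  plan_copy (15);   /* 8 bytes at 0, then 8 bytes at 7.  */
  return 0;
}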
21112
141a3ccf
KT
21113/* Split a DImode store of a CONST_INT SRC to MEM DST as two
21114 SImode stores. Handle the case when the constant has identical
21115 bottom and top halves. This is beneficial when the two stores can be
21116 merged into an STP and we avoid synthesising potentially expensive
21117 immediates twice. Return true if such a split is possible. */
21118
21119bool
21120aarch64_split_dimode_const_store (rtx dst, rtx src)
21121{
21122 rtx lo = gen_lowpart (SImode, src);
21123 rtx hi = gen_highpart_mode (SImode, DImode, src);
21124
21125 bool size_p = optimize_function_for_size_p (cfun);
21126
21127 if (!rtx_equal_p (lo, hi))
21128 return false;
21129
21130 unsigned int orig_cost
21131 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21132 unsigned int lo_cost
21133 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21134
21135 /* We want to transform:
21136 MOV x1, 49370
21137 MOVK x1, 0x140, lsl 16
21138 MOVK x1, 0xc0da, lsl 32
21139 MOVK x1, 0x140, lsl 48
21140 STR x1, [x0]
21141 into:
21142 MOV w1, 49370
21143 MOVK w1, 0x140, lsl 16
21144 STP w1, w1, [x0]
21145 So we want to perform this only when we save two instructions
21146 or more. When optimizing for size, however, accept any code size
21147 savings we can. */
21148 if (size_p && orig_cost <= lo_cost)
21149 return false;
21150
21151 if (!size_p
21152 && (orig_cost <= lo_cost + 1))
21153 return false;
21154
21155 rtx mem_lo = adjust_address (dst, SImode, 0);
21156 if (!aarch64_mem_pair_operand (mem_lo, SImode))
21157 return false;
21158
21159 rtx tmp_reg = gen_reg_rtx (SImode);
21160 aarch64_expand_mov_immediate (tmp_reg, lo);
21161 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21162 /* Don't emit an explicit store pair as this may not always be profitable.
21163 Let the sched-fusion logic decide whether to merge them. */
21164 emit_move_insn (mem_lo, tmp_reg);
21165 emit_move_insn (mem_hi, tmp_reg);
21166
21167 return true;
21168}
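
/* Illustrative sketch (not part of GCC): the "identical halves" test that
   gates the split above, on a plain uint64_t.  The first example value is
   the constant from the comment (lower half 0x0140c0da, i.e. MOV 49370
   plus MOVK 0x140), so it qualifies for the SImode + STP form.  */
#include <stdint.h>
#include <stdio.h>

static int
halves_identical (uint64_t x)
{
  uint32_t lo = (uint32_t) x;
  uint32_t hi = (uint32_t) (x >> 32);
  return lo == hi;
}

int
main (void)
{
  printf ("%d\n", halves_identical (UINT64_C (0x0140c0da0140c0da)));   /* 1 */
  printf ("%d\n", halves_identical (UINT64_C (0x0000000100000002)));   /* 0 */
  return 0;
}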
21169
30c46053
MC
21170/* Generate RTL for a conditional branch with rtx comparison CODE in
21171 mode CC_MODE. The destination of the unlikely conditional branch
21172 is LABEL_REF. */
21173
21174void
21175aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21176 rtx label_ref)
21177{
21178 rtx x;
21179 x = gen_rtx_fmt_ee (code, VOIDmode,
21180 gen_rtx_REG (cc_mode, CC_REGNUM),
21181 const0_rtx);
21182
21183 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21184 gen_rtx_LABEL_REF (VOIDmode, label_ref),
21185 pc_rtx);
21186 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21187}
21188
21189/* Generate DImode scratch registers for 128-bit (TImode) addition.
21190
21191 OP1 represents the TImode destination operand 1
21192 OP2 represents the TImode destination operand 2
21193 LOW_DEST represents the low half (DImode) of TImode operand 0
21194 LOW_IN1 represents the low half (DImode) of TImode operand 1
21195 LOW_IN2 represents the low half (DImode) of TImode operand 2
21196 HIGH_DEST represents the high half (DImode) of TImode operand 0
21197 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21198 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21199
21200void
21201aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21202 rtx *low_in1, rtx *low_in2,
21203 rtx *high_dest, rtx *high_in1,
21204 rtx *high_in2)
21205{
21206 *low_dest = gen_reg_rtx (DImode);
21207 *low_in1 = gen_lowpart (DImode, op1);
21208 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21209 subreg_lowpart_offset (DImode, TImode));
21210 *high_dest = gen_reg_rtx (DImode);
21211 *high_in1 = gen_highpart (DImode, op1);
21212 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21213 subreg_highpart_offset (DImode, TImode));
21214}
21215
21216/* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21217
21218 This function differs from 'aarch64_addti_scratch_regs' in that
21219 OP1 can be an immediate constant (zero). We must call
21220 subreg_highpart_offset with DImode and TImode arguments, otherwise
21221 VOIDmode would be used for the const_int, triggering an internal
21222 error in subreg_size_highpart_offset, which does not expect a size of zero.
21223
21224 OP1 represents the TImode destination operand 1
21225 OP2 represents the TImode destination operand 2
21226 LOW_DEST represents the low half (DImode) of TImode operand 0
21227 LOW_IN1 represents the low half (DImode) of TImode operand 1
21228 LOW_IN2 represents the low half (DImode) of TImode operand 2
21229 HIGH_DEST represents the high half (DImode) of TImode operand 0
21230 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21231 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21232
21233
21234void
21235aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21236 rtx *low_in1, rtx *low_in2,
21237 rtx *high_dest, rtx *high_in1,
21238 rtx *high_in2)
21239{
21240 *low_dest = gen_reg_rtx (DImode);
21241 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21242 subreg_lowpart_offset (DImode, TImode));
21243
21244 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21245 subreg_lowpart_offset (DImode, TImode));
21246 *high_dest = gen_reg_rtx (DImode);
21247
21248 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21249 subreg_highpart_offset (DImode, TImode));
21250 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21251 subreg_highpart_offset (DImode, TImode));
21252}
21253
21254/* Generate RTL for 128-bit (TImode) subtraction with overflow.
21255
21256 OP0 represents the TImode destination operand 0
21257 LOW_DEST represents the low half (DImode) of TImode operand 0
21258 LOW_IN1 represents the low half (DImode) of TImode operand 1
21259 LOW_IN2 represents the low half (DImode) of TImode operand 2
21260 HIGH_DEST represents the high half (DImode) of TImode operand 0
21261 HIGH_IN1 represents the high half (DImode) of TImode operand 1
a58fe3c5
RE
21262 HIGH_IN2 represents the high half (DImode) of TImode operand 2
21263 UNSIGNED_P is true if the operation is being performed on unsigned
21264 values. */
30c46053
MC
21265void
21266aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21267 rtx low_in2, rtx high_dest, rtx high_in1,
a58fe3c5 21268 rtx high_in2, bool unsigned_p)
30c46053
MC
21269{
21270 if (low_in2 == const0_rtx)
21271 {
21272 low_dest = low_in1;
a58fe3c5
RE
21273 high_in2 = force_reg (DImode, high_in2);
21274 if (unsigned_p)
21275 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21276 else
21277 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
30c46053
MC
21278 }
21279 else
21280 {
d80f0a8d
JJ
21281 if (aarch64_plus_immediate (low_in2, DImode))
21282 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21283 GEN_INT (-INTVAL (low_in2))));
21284 else
30c46053 21285 {
d80f0a8d
JJ
21286 low_in2 = force_reg (DImode, low_in2);
21287 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
30c46053 21288 }
d80f0a8d 21289 high_in2 = force_reg (DImode, high_in2);
a58fe3c5
RE
21290
21291 if (unsigned_p)
21292 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21293 else
21294 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
30c46053
MC
21295 }
21296
21297 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21298 emit_move_insn (gen_highpart (DImode, op0), high_dest);
21299
21300}
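
/* Illustrative sketch (not part of GCC): the double-word subtraction the
   expander above emits, written out on plain uint64_t halves.  The low
   halves are subtracted first and the borrow is then propagated into the
   high halves, mirroring a SUBS followed by a carry-in subtract.  */
#include <stdint.h>
#include <stdio.h>

static void
sub128 (uint64_t lo1, uint64_t hi1, uint64_t lo2, uint64_t hi2,
        uint64_t *lo, uint64_t *hi)
{
  *lo = lo1 - lo2;
  uint64_t borrow = lo1 < lo2;   /* On AArch64 this is the inverted C flag.  */
  *hi = hi1 - hi2 - borrow;
}

int
main (void)
{
  uint64_t lo, hi;
  sub128 (0, 1, 1, 0, &lo, &hi);   /* (1 << 64) - 1 */
  printf ("%llx %llx\n", (unsigned long long) hi, (unsigned long long) lo);
  return 0;
}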
21301
a3125fc2
CL
21302/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
21303
21304static unsigned HOST_WIDE_INT
21305aarch64_asan_shadow_offset (void)
21306{
10078f3e
AP
21307 if (TARGET_ILP32)
21308 return (HOST_WIDE_INT_1 << 29);
21309 else
21310 return (HOST_WIDE_INT_1 << 36);
a3125fc2
CL
21311}
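
/* Illustrative sketch (not part of GCC): how the constant returned above
   is used by the usual ASan shadow mapping, assuming the default shadow
   scale of 3, i.e. shadow = (addr >> 3) + offset.  The 1 << 36 value
   matches the LP64 case above; 1 << 29 would be used for ILP32.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
asan_shadow_addr (uint64_t addr)
{
  const uint64_t shadow_offset = UINT64_C (1) << 36;
  return (addr >> 3) + shadow_offset;
}

int
main (void)
{
  printf ("%#llx\n", (unsigned long long) asan_shadow_addr (0x400000));
  return 0;
}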
21312
5f3bc026 21313static rtx
cb4347e8 21314aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
5f3bc026
ZC
21315 int code, tree treeop0, tree treeop1)
21316{
c8012fbc
WD
21317 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21318 rtx op0, op1;
5f3bc026 21319 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 21320 insn_code icode;
5f3bc026
ZC
21321 struct expand_operand ops[4];
21322
5f3bc026
ZC
21323 start_sequence ();
21324 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21325
21326 op_mode = GET_MODE (op0);
21327 if (op_mode == VOIDmode)
21328 op_mode = GET_MODE (op1);
21329
21330 switch (op_mode)
21331 {
4e10a5a7
RS
21332 case E_QImode:
21333 case E_HImode:
21334 case E_SImode:
5f3bc026
ZC
21335 cmp_mode = SImode;
21336 icode = CODE_FOR_cmpsi;
21337 break;
21338
4e10a5a7 21339 case E_DImode:
5f3bc026
ZC
21340 cmp_mode = DImode;
21341 icode = CODE_FOR_cmpdi;
21342 break;
21343
4e10a5a7 21344 case E_SFmode:
786e3c06
WD
21345 cmp_mode = SFmode;
21346 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21347 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21348 break;
21349
4e10a5a7 21350 case E_DFmode:
786e3c06
WD
21351 cmp_mode = DFmode;
21352 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21353 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21354 break;
21355
5f3bc026
ZC
21356 default:
21357 end_sequence ();
21358 return NULL_RTX;
21359 }
21360
c8012fbc
WD
21361 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21362 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
5f3bc026
ZC
21363 if (!op0 || !op1)
21364 {
21365 end_sequence ();
21366 return NULL_RTX;
21367 }
21368 *prep_seq = get_insns ();
21369 end_sequence ();
21370
c8012fbc
WD
21371 create_fixed_operand (&ops[0], op0);
21372 create_fixed_operand (&ops[1], op1);
5f3bc026
ZC
21373
21374 start_sequence ();
c8012fbc 21375 if (!maybe_expand_insn (icode, 2, ops))
5f3bc026
ZC
21376 {
21377 end_sequence ();
21378 return NULL_RTX;
21379 }
21380 *gen_seq = get_insns ();
21381 end_sequence ();
21382
c8012fbc
WD
21383 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21384 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
5f3bc026
ZC
21385}
21386
21387static rtx
cb4347e8
TS
21388aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21389 int cmp_code, tree treeop0, tree treeop1, int bit_code)
5f3bc026 21390{
c8012fbc
WD
21391 rtx op0, op1, target;
21392 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
5f3bc026 21393 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
c8012fbc 21394 insn_code icode;
5f3bc026 21395 struct expand_operand ops[6];
c8012fbc 21396 int aarch64_cond;
5f3bc026 21397
cb4347e8 21398 push_to_sequence (*prep_seq);
5f3bc026
ZC
21399 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21400
21401 op_mode = GET_MODE (op0);
21402 if (op_mode == VOIDmode)
21403 op_mode = GET_MODE (op1);
21404
21405 switch (op_mode)
21406 {
4e10a5a7
RS
21407 case E_QImode:
21408 case E_HImode:
21409 case E_SImode:
5f3bc026 21410 cmp_mode = SImode;
5f3bc026
ZC
21411 break;
21412
4e10a5a7 21413 case E_DImode:
5f3bc026 21414 cmp_mode = DImode;
5f3bc026
ZC
21415 break;
21416
4e10a5a7 21417 case E_SFmode:
786e3c06
WD
21418 cmp_mode = SFmode;
21419 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
21420 break;
21421
4e10a5a7 21422 case E_DFmode:
786e3c06
WD
21423 cmp_mode = DFmode;
21424 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
786e3c06
WD
21425 break;
21426
5f3bc026
ZC
21427 default:
21428 end_sequence ();
21429 return NULL_RTX;
21430 }
21431
865257c4
RS
21432 icode = code_for_ccmp (cc_mode, cmp_mode);
21433
5f3bc026
ZC
21434 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21435 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21436 if (!op0 || !op1)
21437 {
21438 end_sequence ();
21439 return NULL_RTX;
21440 }
21441 *prep_seq = get_insns ();
21442 end_sequence ();
21443
21444 target = gen_rtx_REG (cc_mode, CC_REGNUM);
c8012fbc 21445 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
5f3bc026 21446
c8012fbc
WD
21447 if (bit_code != AND)
21448 {
865257c4
RS
21449 /* Treat the ccmp patterns as canonical and use them where possible,
21450 but fall back to ccmp_rev patterns if there's no other option. */
21451 rtx_code prev_code = GET_CODE (prev);
21452 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21453 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21454 && !(prev_code == EQ
21455 || prev_code == NE
21456 || prev_code == ORDERED
21457 || prev_code == UNORDERED))
21458 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21459 else
21460 {
21461 rtx_code code = reverse_condition (prev_code);
21462 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21463 }
c8012fbc
WD
21464 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21465 }
21466
21467 create_fixed_operand (&ops[0], XEXP (prev, 0));
5f3bc026
ZC
21468 create_fixed_operand (&ops[1], target);
21469 create_fixed_operand (&ops[2], op0);
21470 create_fixed_operand (&ops[3], op1);
c8012fbc
WD
21471 create_fixed_operand (&ops[4], prev);
21472 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
5f3bc026 21473
cb4347e8 21474 push_to_sequence (*gen_seq);
5f3bc026
ZC
21475 if (!maybe_expand_insn (icode, 6, ops))
21476 {
21477 end_sequence ();
21478 return NULL_RTX;
21479 }
21480
21481 *gen_seq = get_insns ();
21482 end_sequence ();
21483
c8012fbc 21484 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
5f3bc026
ZC
21485}
21486
21487#undef TARGET_GEN_CCMP_FIRST
21488#define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21489
21490#undef TARGET_GEN_CCMP_NEXT
21491#define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21492
6a569cdd
KT
21493/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21494 instruction fusion of some sort. */
21495
21496static bool
21497aarch64_macro_fusion_p (void)
21498{
b175b679 21499 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
6a569cdd
KT
21500}
21501
21502
21503/* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21504 should be kept together during scheduling. */
21505
21506static bool
21507aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21508{
21509 rtx set_dest;
21510 rtx prev_set = single_set (prev);
21511 rtx curr_set = single_set (curr);
21512 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21513 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21514
21515 if (!aarch64_macro_fusion_p ())
21516 return false;
21517
d7b03373 21518 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
6a569cdd
KT
21519 {
21520 /* We are trying to match:
21521 prev (mov) == (set (reg r0) (const_int imm16))
21522 curr (movk) == (set (zero_extract (reg r0)
21523 (const_int 16)
21524 (const_int 16))
21525 (const_int imm16_1)) */
21526
21527 set_dest = SET_DEST (curr_set);
21528
21529 if (GET_CODE (set_dest) == ZERO_EXTRACT
21530 && CONST_INT_P (SET_SRC (curr_set))
21531 && CONST_INT_P (SET_SRC (prev_set))
21532 && CONST_INT_P (XEXP (set_dest, 2))
21533 && INTVAL (XEXP (set_dest, 2)) == 16
21534 && REG_P (XEXP (set_dest, 0))
21535 && REG_P (SET_DEST (prev_set))
21536 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21537 {
21538 return true;
21539 }
21540 }
21541
d7b03373 21542 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
9bbe08fe
KT
21543 {
21544
21545 /* We're trying to match:
21546 prev (adrp) == (set (reg r1)
21547 (high (symbol_ref ("SYM"))))
21548 curr (add) == (set (reg r0)
21549 (lo_sum (reg r1)
21550 (symbol_ref ("SYM"))))
21551 Note that r0 need not necessarily be the same as r1, especially
21552 during pre-regalloc scheduling. */
21553
21554 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21555 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21556 {
21557 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21558 && REG_P (XEXP (SET_SRC (curr_set), 0))
21559 && REGNO (XEXP (SET_SRC (curr_set), 0))
21560 == REGNO (SET_DEST (prev_set))
21561 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21562 XEXP (SET_SRC (curr_set), 1)))
21563 return true;
21564 }
21565 }
21566
d7b03373 21567 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
cd0cb232
KT
21568 {
21569
21570 /* We're trying to match:
21571 prev (movk) == (set (zero_extract (reg r0)
21572 (const_int 16)
21573 (const_int 32))
21574 (const_int imm16_1))
21575 curr (movk) == (set (zero_extract (reg r0)
21576 (const_int 16)
21577 (const_int 48))
21578 (const_int imm16_2)) */
21579
21580 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21581 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21582 && REG_P (XEXP (SET_DEST (prev_set), 0))
21583 && REG_P (XEXP (SET_DEST (curr_set), 0))
21584 && REGNO (XEXP (SET_DEST (prev_set), 0))
21585 == REGNO (XEXP (SET_DEST (curr_set), 0))
21586 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21587 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21588 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21589 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21590 && CONST_INT_P (SET_SRC (prev_set))
21591 && CONST_INT_P (SET_SRC (curr_set)))
21592 return true;
21593
21594 }
d7b03373 21595 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
d8354ad7
KT
21596 {
21597 /* We're trying to match:
21598 prev (adrp) == (set (reg r0)
21599 (high (symbol_ref ("SYM"))))
21600 curr (ldr) == (set (reg r1)
21601 (mem (lo_sum (reg r0)
21602 (symbol_ref ("SYM")))))
21603 or
21604 curr (ldr) == (set (reg r1)
21605 (zero_extend (mem
21606 (lo_sum (reg r0)
21607 (symbol_ref ("SYM")))))) */
21608 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21609 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21610 {
21611 rtx curr_src = SET_SRC (curr_set);
21612
21613 if (GET_CODE (curr_src) == ZERO_EXTEND)
21614 curr_src = XEXP (curr_src, 0);
21615
21616 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21617 && REG_P (XEXP (XEXP (curr_src, 0), 0))
21618 && REGNO (XEXP (XEXP (curr_src, 0), 0))
21619 == REGNO (SET_DEST (prev_set))
21620 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21621 XEXP (SET_SRC (prev_set), 0)))
21622 return true;
21623 }
21624 }
cd0cb232 21625
a4f3fa71 21626 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
d7b03373 21627 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
a4f3fa71
WD
21628 && prev_set && curr_set && any_condjump_p (curr)
21629 && GET_CODE (SET_SRC (prev_set)) == COMPARE
21630 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21631 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21632 return true;
21633
21634 /* Fuse flag-setting ALU instructions and conditional branch. */
21635 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
3759108f
AP
21636 && any_condjump_p (curr))
21637 {
509f819a
N
21638 unsigned int condreg1, condreg2;
21639 rtx cc_reg_1;
21640 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21641 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21642
21643 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21644 && prev
21645 && modified_in_p (cc_reg_1, prev))
21646 {
f8a27206
AP
21647 enum attr_type prev_type = get_attr_type (prev);
21648
509f819a
N
21649 /* FIXME: this misses some instructions which are considered simple
21650 arithmetic for ThunderX. Simple shifts are missed here. */
21651 if (prev_type == TYPE_ALUS_SREG
21652 || prev_type == TYPE_ALUS_IMM
21653 || prev_type == TYPE_LOGICS_REG
21654 || prev_type == TYPE_LOGICS_IMM)
21655 return true;
21656 }
3759108f
AP
21657 }
21658
a4f3fa71 21659 /* Fuse ALU instructions and CBZ/CBNZ. */
bee7e0fc
AP
21660 if (prev_set
21661 && curr_set
a4f3fa71 21662 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
00c7c57f
JB
21663 && any_condjump_p (curr))
21664 {
21665 /* We're trying to match:
21666 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21667 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
21668 (const_int 0))
21669 (label_ref ("SYM"))
21670 (pc)) */
21671 if (SET_DEST (curr_set) == (pc_rtx)
21672 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21673 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21674 && REG_P (SET_DEST (prev_set))
21675 && REGNO (SET_DEST (prev_set))
21676 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21677 {
21678 /* Fuse ALU operations followed by conditional branch instruction. */
21679 switch (get_attr_type (prev))
21680 {
21681 case TYPE_ALU_IMM:
21682 case TYPE_ALU_SREG:
21683 case TYPE_ADC_REG:
21684 case TYPE_ADC_IMM:
21685 case TYPE_ADCS_REG:
21686 case TYPE_ADCS_IMM:
21687 case TYPE_LOGIC_REG:
21688 case TYPE_LOGIC_IMM:
21689 case TYPE_CSEL:
21690 case TYPE_ADR:
21691 case TYPE_MOV_IMM:
21692 case TYPE_SHIFT_REG:
21693 case TYPE_SHIFT_IMM:
21694 case TYPE_BFM:
21695 case TYPE_RBIT:
21696 case TYPE_REV:
21697 case TYPE_EXTEND:
21698 return true;
21699
21700 default:;
21701 }
21702 }
21703 }
21704
6a569cdd
KT
21705 return false;
21706}
21707
f2879a90
KT
21708/* Return true iff the instruction fusion described by OP is enabled. */
21709
21710bool
21711aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21712{
21713 return (aarch64_tune_params.fusible_ops & op) != 0;
21714}
21715
350013bc
BC
21716/* If MEM is in the form of [base+offset], extract the two parts
21717 of the address and store them in BASE and OFFSET; otherwise return false
21718 after clearing BASE and OFFSET. */
21719
21720bool
21721extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21722{
21723 rtx addr;
21724
21725 gcc_assert (MEM_P (mem));
21726
21727 addr = XEXP (mem, 0);
21728
21729 if (REG_P (addr))
21730 {
21731 *base = addr;
21732 *offset = const0_rtx;
21733 return true;
21734 }
21735
21736 if (GET_CODE (addr) == PLUS
21737 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21738 {
21739 *base = XEXP (addr, 0);
21740 *offset = XEXP (addr, 1);
21741 return true;
21742 }
21743
21744 *base = NULL_RTX;
21745 *offset = NULL_RTX;
21746
21747 return false;
21748}
21749
21750/* Types for scheduling fusion. */
21751enum sched_fusion_type
21752{
21753 SCHED_FUSION_NONE = 0,
21754 SCHED_FUSION_LD_SIGN_EXTEND,
21755 SCHED_FUSION_LD_ZERO_EXTEND,
21756 SCHED_FUSION_LD,
21757 SCHED_FUSION_ST,
21758 SCHED_FUSION_NUM
21759};
21760
21761/* If INSN is a load or store of address in the form of [base+offset],
21762 extract the two parts and store them in BASE and OFFSET. Return the
21763 scheduling fusion type of this INSN. */
21764
21765static enum sched_fusion_type
21766fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21767{
21768 rtx x, dest, src;
21769 enum sched_fusion_type fusion = SCHED_FUSION_LD;
21770
21771 gcc_assert (INSN_P (insn));
21772 x = PATTERN (insn);
21773 if (GET_CODE (x) != SET)
21774 return SCHED_FUSION_NONE;
21775
21776 src = SET_SRC (x);
21777 dest = SET_DEST (x);
21778
abc52318
KT
21779 machine_mode dest_mode = GET_MODE (dest);
21780
21781 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
350013bc
BC
21782 return SCHED_FUSION_NONE;
21783
21784 if (GET_CODE (src) == SIGN_EXTEND)
21785 {
21786 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21787 src = XEXP (src, 0);
21788 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21789 return SCHED_FUSION_NONE;
21790 }
21791 else if (GET_CODE (src) == ZERO_EXTEND)
21792 {
21793 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21794 src = XEXP (src, 0);
21795 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21796 return SCHED_FUSION_NONE;
21797 }
21798
21799 if (GET_CODE (src) == MEM && REG_P (dest))
21800 extract_base_offset_in_addr (src, base, offset);
21801 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21802 {
21803 fusion = SCHED_FUSION_ST;
21804 extract_base_offset_in_addr (dest, base, offset);
21805 }
21806 else
21807 return SCHED_FUSION_NONE;
21808
21809 if (*base == NULL_RTX || *offset == NULL_RTX)
21810 fusion = SCHED_FUSION_NONE;
21811
21812 return fusion;
21813}
21814
21815/* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21816
21817 Currently we only support fusing ldr or str instructions, so FUSION_PRI
21818 and PRI are only calculated for these instructions. For other instructions,
21819 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
21820 types of instruction fusion can be added by returning different priorities.
21821
21822 It's important that irrelevant instructions get the largest FUSION_PRI. */
21823
21824static void
21825aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21826 int *fusion_pri, int *pri)
21827{
21828 int tmp, off_val;
21829 rtx base, offset;
21830 enum sched_fusion_type fusion;
21831
21832 gcc_assert (INSN_P (insn));
21833
21834 tmp = max_pri - 1;
21835 fusion = fusion_load_store (insn, &base, &offset);
21836 if (fusion == SCHED_FUSION_NONE)
21837 {
21838 *pri = tmp;
21839 *fusion_pri = tmp;
21840 return;
21841 }
21842
21843 /* Set FUSION_PRI according to fusion type and base register. */
21844 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21845
21846 /* Calculate PRI. */
21847 tmp /= 2;
21848
21849 /* INSN with smaller offset goes first. */
21850 off_val = (int)(INTVAL (offset));
21851 if (off_val >= 0)
21852 tmp -= (off_val & 0xfffff);
21853 else
21854 tmp += ((- off_val) & 0xfffff);
21855
21856 *pri = tmp;
21857 return;
21858}
21859
9bca63d4
WD
21860/* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21861 Adjust priority of sha1h instructions so they are scheduled before
21862 other SHA1 instructions. */
21863
21864static int
21865aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21866{
21867 rtx x = PATTERN (insn);
21868
21869 if (GET_CODE (x) == SET)
21870 {
21871 x = SET_SRC (x);
21872
21873 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21874 return priority + 10;
21875 }
21876
21877 return priority;
21878}
21879
350013bc
BC
21880/* Given OPERANDS of consecutive load/store, check if we can merge
21881 them into ldp/stp. LOAD is true if they are load instructions.
21882 MODE is the mode of memory operands. */
21883
21884bool
21885aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
b8506a8a 21886 machine_mode mode)
350013bc
BC
21887{
21888 HOST_WIDE_INT offval_1, offval_2, msize;
21889 enum reg_class rclass_1, rclass_2;
21890 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21891
21892 if (load)
21893 {
21894 mem_1 = operands[1];
21895 mem_2 = operands[3];
21896 reg_1 = operands[0];
21897 reg_2 = operands[2];
21898 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
21899 if (REGNO (reg_1) == REGNO (reg_2))
21900 return false;
21901 }
21902 else
21903 {
21904 mem_1 = operands[0];
21905 mem_2 = operands[2];
21906 reg_1 = operands[1];
21907 reg_2 = operands[3];
21908 }
21909
bf84ac44
AP
21910 /* The mems cannot be volatile. */
21911 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21912 return false;
21913
54700e2e
AP
21914 /* If we have SImode and slow unaligned ldp,
21915 require the alignment to be at least 8 bytes. */
21916 if (mode == SImode
21917 && (aarch64_tune_params.extra_tuning_flags
21918 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21919 && !optimize_size
21920 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
21921 return false;
21922
350013bc
BC
21923 /* Check if the addresses are in the form of [base+offset]. */
21924 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
21925 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
21926 return false;
21927 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
21928 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
21929 return false;
21930
21931 /* Check if the bases are same. */
21932 if (!rtx_equal_p (base_1, base_2))
21933 return false;
21934
dfe1da23
JW
21935 /* The operands must be of the same size. */
21936 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
21937 GET_MODE_SIZE (GET_MODE (mem_2))));
21938
350013bc
BC
21939 offval_1 = INTVAL (offset_1);
21940 offval_2 = INTVAL (offset_2);
6a70badb
RS
21941 /* We should only be trying this for fixed-sized modes. There is no
21942 SVE LDP/STP instruction. */
21943 msize = GET_MODE_SIZE (mode).to_constant ();
350013bc
BC
21944 /* Check if the offsets are consecutive. */
21945 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
21946 return false;
21947
21948 /* Check if the addresses are clobbered by load. */
21949 if (load)
21950 {
21951 if (reg_mentioned_p (reg_1, mem_1))
21952 return false;
21953
21954 /* In increasing order, the last load can clobber the address. */
21955 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
9b56ec11 21956 return false;
350013bc
BC
21957 }
21958
9b56ec11
JW
21959 /* One of the memory accesses must be a mempair operand.
21960 If it is not the first one, they need to be swapped by the
21961 peephole. */
21962 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
21963 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
21964 return false;
21965
350013bc
BC
21966 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
21967 rclass_1 = FP_REGS;
21968 else
21969 rclass_1 = GENERAL_REGS;
21970
21971 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
21972 rclass_2 = FP_REGS;
21973 else
21974 rclass_2 = GENERAL_REGS;
21975
21976 /* Check if the registers are of same class. */
21977 if (rclass_1 != rclass_2)
21978 return false;
21979
21980 return true;
21981}
21982
9b56ec11
JW
21983/* Given OPERANDS of consecutive load/store that can be merged,
21984 swap them if they are not in ascending order. */
21985void
21986aarch64_swap_ldrstr_operands (rtx* operands, bool load)
21987{
21988 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
21989 HOST_WIDE_INT offval_1, offval_2;
21990
21991 if (load)
21992 {
21993 mem_1 = operands[1];
21994 mem_2 = operands[3];
21995 }
21996 else
21997 {
21998 mem_1 = operands[0];
21999 mem_2 = operands[2];
22000 }
22001
22002 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22003 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22004
22005 offval_1 = INTVAL (offset_1);
22006 offval_2 = INTVAL (offset_2);
22007
22008 if (offval_1 > offval_2)
22009 {
22010 /* Irrespective of whether this is a load or a store,
22011 we do the same swap. */
22012 std::swap (operands[0], operands[2]);
22013 std::swap (operands[1], operands[3]);
22014 }
22015}
22016
d0b51297
JW
22017/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22018 comparison between the two. */
22019int
22020aarch64_host_wide_int_compare (const void *x, const void *y)
22021{
22022 return wi::cmps (* ((const HOST_WIDE_INT *) x),
22023 * ((const HOST_WIDE_INT *) y));
22024}
22025
22026/* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22027 other pointing to a REG rtx containing an offset, compare the offsets
22028 of the two pairs.
22029
22030 Return:
22031
22032 1 iff offset (X) > offset (Y)
22033 0 iff offset (X) == offset (Y)
22034 -1 iff offset (X) < offset (Y) */
22035int
22036aarch64_ldrstr_offset_compare (const void *x, const void *y)
22037{
22038 const rtx * operands_1 = (const rtx *) x;
22039 const rtx * operands_2 = (const rtx *) y;
22040 rtx mem_1, mem_2, base, offset_1, offset_2;
22041
22042 if (MEM_P (operands_1[0]))
22043 mem_1 = operands_1[0];
22044 else
22045 mem_1 = operands_1[1];
22046
22047 if (MEM_P (operands_2[0]))
22048 mem_2 = operands_2[0];
22049 else
22050 mem_2 = operands_2[1];
22051
22052 /* Extract the offsets. */
22053 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22054 extract_base_offset_in_addr (mem_2, &base, &offset_2);
22055
22056 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22057
22058 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22059}
22060
350013bc
BC
22061/* Given OPERANDS of consecutive load/store, check if we can merge
22062 them into ldp/stp by adjusting the offset. LOAD is true if they
22063 are load instructions. MODE is the mode of memory operands.
22064
22065 Given below consecutive stores:
22066
22067 str w1, [xb, 0x100]
22068 str w1, [xb, 0x104]
22069 str w1, [xb, 0x108]
22070 str w1, [xb, 0x10c]
22071
22072 Though the offsets are out of the range supported by stp, we can
22073 still pair them after adjusting the offset, like:
22074
22075 add scratch, xb, 0x100
22076 stp w1, w1, [scratch]
22077 stp w1, w1, [scratch, 0x8]
22078
22079 The peephole patterns detecting this opportunity should guarantee
22080 the scratch register is available. */
22081
22082bool
22083aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
146c2e3a 22084 scalar_mode mode)
350013bc 22085{
34d7854d
JW
22086 const int num_insns = 4;
22087 enum reg_class rclass;
22088 HOST_WIDE_INT offvals[num_insns], msize;
22089 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
350013bc
BC
22090
22091 if (load)
22092 {
34d7854d
JW
22093 for (int i = 0; i < num_insns; i++)
22094 {
22095 reg[i] = operands[2 * i];
22096 mem[i] = operands[2 * i + 1];
22097
22098 gcc_assert (REG_P (reg[i]));
22099 }
d0b51297
JW
22100
22101 /* Do not attempt to merge the loads if the loads clobber each other. */
22102 for (int i = 0; i < 8; i += 2)
22103 for (int j = i + 2; j < 8; j += 2)
22104 if (reg_overlap_mentioned_p (operands[i], operands[j]))
22105 return false;
350013bc
BC
22106 }
22107 else
34d7854d
JW
22108 for (int i = 0; i < num_insns; i++)
22109 {
22110 mem[i] = operands[2 * i];
22111 reg[i] = operands[2 * i + 1];
22112 }
350013bc 22113
34d7854d
JW
22114 /* Skip if memory operand is by itself valid for ldp/stp. */
22115 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
bf84ac44
AP
22116 return false;
22117
34d7854d
JW
22118 for (int i = 0; i < num_insns; i++)
22119 {
22120 /* The mems cannot be volatile. */
22121 if (MEM_VOLATILE_P (mem[i]))
22122 return false;
22123
22124 /* Check if the addresses are in the form of [base+offset]. */
22125 extract_base_offset_in_addr (mem[i], base + i, offset + i);
22126 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22127 return false;
22128 }
22129
363b395b
JW
22130 /* Check if the registers are of same class. */
22131 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22132 ? FP_REGS : GENERAL_REGS;
22133
22134 for (int i = 1; i < num_insns; i++)
22135 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22136 {
22137 if (rclass != FP_REGS)
22138 return false;
22139 }
22140 else
22141 {
22142 if (rclass != GENERAL_REGS)
22143 return false;
22144 }
22145
22146 /* Only the last register in the order in which they occur
22147 may be clobbered by the load. */
22148 if (rclass == GENERAL_REGS && load)
22149 for (int i = 0; i < num_insns - 1; i++)
34d7854d
JW
22150 if (reg_mentioned_p (reg[i], mem[i]))
22151 return false;
350013bc
BC
22152
22153 /* Check if the bases are same. */
34d7854d
JW
22154 for (int i = 0; i < num_insns - 1; i++)
22155 if (!rtx_equal_p (base[i], base[i + 1]))
22156 return false;
22157
22158 for (int i = 0; i < num_insns; i++)
22159 offvals[i] = INTVAL (offset[i]);
350013bc 22160
350013bc 22161 msize = GET_MODE_SIZE (mode);
d0b51297
JW
22162
22163 /* Check if the offsets can be put in the right order to do a ldp/stp. */
34d7854d
JW
22164 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22165 aarch64_host_wide_int_compare);
d0b51297
JW
22166
22167 if (!(offvals[1] == offvals[0] + msize
22168 && offvals[3] == offvals[2] + msize))
350013bc
BC
22169 return false;
22170
d0b51297
JW
22171 /* Check that offsets are within range of each other. The ldp/stp
22172 instructions have 7 bit immediate offsets, so use 0x80. */
22173 if (offvals[2] - offvals[0] >= msize * 0x80)
22174 return false;
350013bc 22175
d0b51297
JW
22176 /* The offsets must be aligned with respect to each other. */
22177 if (offvals[0] % msize != offvals[2] % msize)
22178 return false;
22179
54700e2e
AP
22180 /* If we have SImode and slow unaligned ldp,
22181 require the alignment to be at least 8 bytes. */
22182 if (mode == SImode
22183 && (aarch64_tune_params.extra_tuning_flags
34d7854d 22184 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
54700e2e 22185 && !optimize_size
34d7854d 22186 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
54700e2e
AP
22187 return false;
22188
350013bc
BC
22189 return true;
22190}
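
/* Illustrative sketch (not part of GCC): the offset checks performed above,
   on plain integers.  Four accesses of size MSIZE can be rewritten as two
   LDP/STPs when, once sorted, they form two consecutive pairs, the pairs
   sit within the 7-bit-immediate range of each other, and the pairs are
   mutually aligned.  */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int
cmp_hwi (const void *x, const void *y)
{
  long long a = *(const long long *) x, b = *(const long long *) y;
  return (a > b) - (a < b);
}

static bool
offsets_ok_for_two_pairs (long long off[4], long long msize)
{
  qsort (off, 4, sizeof (off[0]), cmp_hwi);
  if (off[1] != off[0] + msize || off[3] != off[2] + msize)
    return false;
  if (off[2] - off[0] >= msize * 0x80)   /* 7-bit immediate range.  */
    return false;
  return off[0] % msize == off[2] % msize;
}

int
main (void)
{
  long long offs[4] = { 0x104, 0x100, 0x10c, 0x108 };
  printf ("%d\n", offsets_ok_for_two_pairs (offs, 4));   /* Prints: 1 */
  return 0;
}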
22191
22192/* Given OPERANDS of consecutive load/store, this function pairs them
d0b51297
JW
22193 into LDP/STP after adjusting the offset. It depends on the fact
22194 that the operands can be sorted so the offsets are correct for STP.
350013bc
BC
22195 MODE is the mode of memory operands. CODE is the rtl operator
22196 which should be applied to all memory operands, it's SIGN_EXTEND,
22197 ZERO_EXTEND or UNKNOWN. */
22198
22199bool
22200aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
146c2e3a 22201 scalar_mode mode, RTX_CODE code)
350013bc 22202{
d0b51297 22203 rtx base, offset_1, offset_3, t1, t2;
350013bc 22204 rtx mem_1, mem_2, mem_3, mem_4;
d0b51297
JW
22205 rtx temp_operands[8];
22206 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22207 stp_off_upper_limit, stp_off_lower_limit, msize;
9b56ec11 22208
d0b51297
JW
22209 /* We make changes on a copy as we may still bail out. */
22210 for (int i = 0; i < 8; i ++)
22211 temp_operands[i] = operands[i];
9b56ec11 22212
d0b51297
JW
22213 /* Sort the operands. */
22214 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
9b56ec11 22215
f6af9c21
RE
22216 /* Copy the memory operands so that if we have to bail for some
22217 reason the original addresses are unchanged. */
350013bc
BC
22218 if (load)
22219 {
f6af9c21
RE
22220 mem_1 = copy_rtx (temp_operands[1]);
22221 mem_2 = copy_rtx (temp_operands[3]);
22222 mem_3 = copy_rtx (temp_operands[5]);
22223 mem_4 = copy_rtx (temp_operands[7]);
350013bc
BC
22224 }
22225 else
22226 {
f6af9c21
RE
22227 mem_1 = copy_rtx (temp_operands[0]);
22228 mem_2 = copy_rtx (temp_operands[2]);
22229 mem_3 = copy_rtx (temp_operands[4]);
22230 mem_4 = copy_rtx (temp_operands[6]);
350013bc
BC
22231 gcc_assert (code == UNKNOWN);
22232 }
22233
9b56ec11 22234 extract_base_offset_in_addr (mem_1, &base, &offset_1);
d0b51297
JW
22235 extract_base_offset_in_addr (mem_3, &base, &offset_3);
22236 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22237 && offset_3 != NULL_RTX);
350013bc 22238
d0b51297 22239 /* Adjust offset so it can fit in LDP/STP instruction. */
350013bc 22240 msize = GET_MODE_SIZE (mode);
d0b51297
JW
22241 stp_off_upper_limit = msize * (0x40 - 1);
22242 stp_off_lower_limit = - msize * 0x40;
350013bc 22243
d0b51297
JW
22244 off_val_1 = INTVAL (offset_1);
22245 off_val_3 = INTVAL (offset_3);
22246
22247 /* The base offset is optimally half way between the two STP/LDP offsets. */
22248 if (msize <= 4)
22249 base_off = (off_val_1 + off_val_3) / 2;
22250 else
22251 /* However, due to issues with negative LDP/STP offset generation for
22252 larger modes (DF, DI and vector modes), we must not use negative
22253 addresses smaller than 9 signed unadjusted bits can store. This
22254 provides the most range in this case. */
22255 base_off = off_val_1;
22256
22257 /* Adjust the base so that it is aligned with the addresses but still
22258 optimal. */
22259 if (base_off % msize != off_val_1 % msize)
22260 /* Fix the offset, bearing in mind we want to make it bigger not
22261 smaller. */
22262 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22263 else if (msize <= 4)
22264 /* The negative range of LDP/STP is one larger than the positive range. */
22265 base_off += msize;
22266
22267 /* Check if base offset is too big or too small. We can attempt to resolve
22268 this issue by setting it to the maximum value and seeing if the offsets
22269 still fit. */
22270 if (base_off >= 0x1000)
350013bc 22271 {
d0b51297
JW
22272 base_off = 0x1000 - 1;
22273 /* We must still make sure that the base offset is aligned with respect
700d4cb0 22274 to the address. But it may not be made any bigger. */
d0b51297 22275 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
22276 }
22277
d0b51297
JW
22278 /* Likewise for the case where the base is too small. */
22279 if (base_off <= -0x1000)
350013bc 22280 {
d0b51297
JW
22281 base_off = -0x1000 + 1;
22282 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
350013bc
BC
22283 }
22284
d0b51297
JW
22285 /* Offset of the first STP/LDP. */
22286 new_off_1 = off_val_1 - base_off;
22287
22288 /* Offset of the second STP/LDP. */
22289 new_off_3 = off_val_3 - base_off;
350013bc 22290
d0b51297
JW
22291 /* The offsets must be within the range of the LDP/STP instructions. */
22292 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22293 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
350013bc
BC
22294 return false;
22295
d0b51297
JW
22296 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22297 new_off_1), true);
22298 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22299 new_off_1 + msize), true);
22300 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22301 new_off_3), true);
22302 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22303 new_off_3 + msize), true);
22304
22305 if (!aarch64_mem_pair_operand (mem_1, mode)
22306 || !aarch64_mem_pair_operand (mem_3, mode))
22307 return false;
350013bc
BC
22308
22309 if (code == ZERO_EXTEND)
22310 {
22311 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22312 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22313 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22314 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22315 }
22316 else if (code == SIGN_EXTEND)
22317 {
22318 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22319 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22320 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22321 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22322 }
22323
22324 if (load)
22325 {
d0b51297 22326 operands[0] = temp_operands[0];
350013bc 22327 operands[1] = mem_1;
d0b51297 22328 operands[2] = temp_operands[2];
350013bc 22329 operands[3] = mem_2;
d0b51297 22330 operands[4] = temp_operands[4];
350013bc 22331 operands[5] = mem_3;
d0b51297 22332 operands[6] = temp_operands[6];
350013bc
BC
22333 operands[7] = mem_4;
22334 }
22335 else
22336 {
22337 operands[0] = mem_1;
d0b51297 22338 operands[1] = temp_operands[1];
350013bc 22339 operands[2] = mem_2;
d0b51297 22340 operands[3] = temp_operands[3];
350013bc 22341 operands[4] = mem_3;
d0b51297 22342 operands[5] = temp_operands[5];
350013bc 22343 operands[6] = mem_4;
d0b51297 22344 operands[7] = temp_operands[7];
350013bc
BC
22345 }
22346
22347 /* Emit adjusting instruction. */
d0b51297 22348 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
350013bc 22349 /* Emit ldp/stp instructions. */
f7df4a84
RS
22350 t1 = gen_rtx_SET (operands[0], operands[1]);
22351 t2 = gen_rtx_SET (operands[2], operands[3]);
350013bc 22352 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
f7df4a84
RS
22353 t1 = gen_rtx_SET (operands[4], operands[5]);
22354 t2 = gen_rtx_SET (operands[6], operands[7]);
350013bc
BC
22355 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22356 return true;
22357}
22358
76a34e3f
RS
22359/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
22360 it isn't worth branching around empty masked ops (including masked
22361 stores). */
22362
22363static bool
22364aarch64_empty_mask_is_expensive (unsigned)
22365{
22366 return false;
22367}
22368
1b1e81f8
JW
22369/* Return 1 if pseudo register should be created and used to hold
22370 GOT address for PIC code. */
22371
22372bool
22373aarch64_use_pseudo_pic_reg (void)
22374{
22375 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22376}
22377
7b841a12
JW
22378/* Implement TARGET_UNSPEC_MAY_TRAP_P. */
22379
22380static int
22381aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22382{
22383 switch (XINT (x, 1))
22384 {
22385 case UNSPEC_GOTSMALLPIC:
22386 case UNSPEC_GOTSMALLPIC28K:
22387 case UNSPEC_GOTTINYPIC:
22388 return 0;
22389 default:
22390 break;
22391 }
22392
22393 return default_unspec_may_trap_p (x, flags);
22394}
22395
39252973
KT
22396
22397/* If X is a positive CONST_DOUBLE with a value that is a power of 2
22398 return the log2 of that value. Otherwise return -1. */
22399
22400int
22401aarch64_fpconst_pow_of_2 (rtx x)
22402{
22403 const REAL_VALUE_TYPE *r;
22404
22405 if (!CONST_DOUBLE_P (x))
22406 return -1;
22407
22408 r = CONST_DOUBLE_REAL_VALUE (x);
22409
22410 if (REAL_VALUE_NEGATIVE (*r)
22411 || REAL_VALUE_ISNAN (*r)
22412 || REAL_VALUE_ISINF (*r)
22413 || !real_isinteger (r, DFmode))
22414 return -1;
22415
22416 return exact_log2 (real_to_integer (r));
22417}
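
/* Illustrative sketch (not part of GCC): the same "positive integral power
   of two" test on a host double, using frexp instead of the REAL_VALUE
   machinery.  Values below 1.0 are rejected to mirror the real_isinteger
   requirement above.  */
#include <math.h>
#include <stdio.h>

static int
fpconst_pow_of_2 (double x)
{
  int exp;
  if (!(x >= 1.0) || isinf (x))   /* Also rejects NaN.  */
    return -1;
  return frexp (x, &exp) == 0.5 ? exp - 1 : -1;
}

int
main (void)
{
  printf ("%d %d %d\n",
          fpconst_pow_of_2 (8.0),    /* 3 */
          fpconst_pow_of_2 (1.0),    /* 0 */
          fpconst_pow_of_2 (6.0));   /* -1 */
  return 0;
}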
22418
188d0079
JH
22419/* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22420 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
22421 return n. Otherwise return -1. */
22422
22423int
22424aarch64_fpconst_pow2_recip (rtx x)
22425{
22426 REAL_VALUE_TYPE r0;
22427
22428 if (!CONST_DOUBLE_P (x))
22429 return -1;
22430
22431 r0 = *CONST_DOUBLE_REAL_VALUE (x);
22432 if (exact_real_inverse (DFmode, &r0)
22433 && !REAL_VALUE_NEGATIVE (r0))
22434 {
22435 int ret = exact_log2 (real_to_integer (&r0));
22436 if (ret >= 1 && ret <= 32)
22437 return ret;
22438 }
22439 return -1;
22440}
22441
39252973
KT
22442/* If X is a vector of equal CONST_DOUBLE values and that value is
22443 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
22444
22445int
22446aarch64_vec_fpconst_pow_of_2 (rtx x)
22447{
6a70badb
RS
22448 int nelts;
22449 if (GET_CODE (x) != CONST_VECTOR
22450 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
39252973
KT
22451 return -1;
22452
22453 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22454 return -1;
22455
22456 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22457 if (firstval <= 0)
22458 return -1;
22459
6a70badb 22460 for (int i = 1; i < nelts; i++)
39252973
KT
22461 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22462 return -1;
22463
22464 return firstval;
22465}
22466
11e554b3
JG
22467/* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22468 to float.
22469
22470 __fp16 always promotes through this hook.
22471 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22472 through the generic excess precision logic rather than here. */
22473
c2ec330c
AL
22474static tree
22475aarch64_promoted_type (const_tree t)
22476{
11e554b3
JG
22477 if (SCALAR_FLOAT_TYPE_P (t)
22478 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
c2ec330c 22479 return float_type_node;
11e554b3 22480
c2ec330c
AL
22481 return NULL_TREE;
22482}
ee62a5a6
RS
22483
22484/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22485
22486static bool
9acc9cbe 22487aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
ee62a5a6
RS
22488 optimization_type opt_type)
22489{
22490 switch (op)
22491 {
22492 case rsqrt_optab:
9acc9cbe 22493 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
ee62a5a6
RS
22494
22495 default:
22496 return true;
22497 }
22498}
22499
43cacb12
RS
22500/* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22501
22502static unsigned int
22503aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22504 int *offset)
22505{
22506 /* Polynomial invariant 1 == (VG / 2) - 1. */
22507 gcc_assert (i == 1);
22508 *factor = 2;
22509 *offset = 1;
22510 return AARCH64_DWARF_VG;
22511}
22512
11e554b3
JG
22513/* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22514 if MODE is HFmode, and punt to the generic implementation otherwise. */
22515
22516static bool
7c5bd57a 22517aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
11e554b3
JG
22518{
22519 return (mode == HFmode
22520 ? true
22521 : default_libgcc_floating_mode_supported_p (mode));
22522}
22523
2e5f8203
JG
22524/* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22525 if MODE is HFmode, and punt to the generic implementation otherwise. */
22526
22527static bool
18e2a8b8 22528aarch64_scalar_mode_supported_p (scalar_mode mode)
22529{
22530 return (mode == HFmode
22531 ? true
22532 : default_scalar_mode_supported_p (mode));
22533}
22534
11e554b3
JG
22535/* Set the value of FLT_EVAL_METHOD.
22536 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22537
22538 0: evaluate all operations and constants, whose semantic type has at
22539 most the range and precision of type float, to the range and
22540 precision of float; evaluate all other operations and constants to
22541 the range and precision of the semantic type;
22542
22543 N, where _FloatN is a supported interchange floating type:
22544 evaluate all operations and constants, whose semantic type has at
22545 most the range and precision of the _FloatN type, to the range and
22546 precision of the _FloatN type; evaluate all other operations and
22547 constants to the range and precision of the semantic type;
22548
22549 If we have the ARMv8.2-A extensions then we support _Float16 in native
22550 precision, so we should set this to 16. Otherwise, we support the type,
22551 but want to evaluate expressions in float precision, so set this to
22552 0. */
22553
22554static enum flt_eval_method
22555aarch64_excess_precision (enum excess_precision_type type)
22556{
22557 switch (type)
22558 {
22559 case EXCESS_PRECISION_TYPE_FAST:
22560 case EXCESS_PRECISION_TYPE_STANDARD:
22561 /* We can calculate either in 16-bit range and precision or
22562 32-bit range and precision. Make that decision based on whether
22563 we have native support for the ARMv8.2-A 16-bit floating-point
22564 instructions or not. */
22565 return (TARGET_FP_F16INST
22566 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22567 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22568 case EXCESS_PRECISION_TYPE_IMPLICIT:
22569 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22570 default:
22571 gcc_unreachable ();
22572 }
22573 return FLT_EVAL_METHOD_UNPREDICTABLE;
22574}
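/* For illustration, consider:

     _Float16 a, b, c;
     _Float16 d = a * b + c;

   When the ARMv8.2-A FP16 instructions are available (TARGET_FP_F16INST),
   the intermediate product stays in _Float16 and the expression can map to
   half-precision fmul/fadd.  Otherwise the intermediates are widened to
   float and rounded back to _Float16 only at the final assignment, as
   selected above for the FAST and STANDARD cases.  */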
22575
b48d6421
KT
22576/* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
22577 scheduled for speculative execution. Reject the long-running division
22578 and square-root instructions. */
22579
22580static bool
22581aarch64_sched_can_speculate_insn (rtx_insn *insn)
22582{
22583 switch (get_attr_type (insn))
22584 {
22585 case TYPE_SDIV:
22586 case TYPE_UDIV:
22587 case TYPE_FDIVS:
22588 case TYPE_FDIVD:
22589 case TYPE_FSQRTS:
22590 case TYPE_FSQRTD:
22591 case TYPE_NEON_FP_SQRT_S:
22592 case TYPE_NEON_FP_SQRT_D:
22593 case TYPE_NEON_FP_SQRT_S_Q:
22594 case TYPE_NEON_FP_SQRT_D_Q:
22595 case TYPE_NEON_FP_DIV_S:
22596 case TYPE_NEON_FP_DIV_D:
22597 case TYPE_NEON_FP_DIV_S_Q:
22598 case TYPE_NEON_FP_DIV_D_Q:
22599 return false;
22600 default:
22601 return true;
22602 }
22603}
22604
43cacb12
RS
22605/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
22606
22607static int
22608aarch64_compute_pressure_classes (reg_class *classes)
22609{
22610 int i = 0;
22611 classes[i++] = GENERAL_REGS;
22612 classes[i++] = FP_REGS;
22613 /* PR_REGS isn't a useful pressure class because many predicate pseudo
22614 registers need to go in PR_LO_REGS at some point during their
22615 lifetime. Splitting it into two halves has the effect of making
22616 all predicates count against PR_LO_REGS, so that we try whenever
22617 possible to restrict the number of live predicates to 8. This
22618 greatly reduces the amount of spilling in certain loops. */
22619 classes[i++] = PR_LO_REGS;
22620 classes[i++] = PR_HI_REGS;
22621 return i;
22622}
22623
22624/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
22625
22626static bool
22627aarch64_can_change_mode_class (machine_mode from,
22628 machine_mode to, reg_class_t)
22629{
76607e7e
RS
22630 unsigned int from_flags = aarch64_classify_vector_mode (from);
22631 unsigned int to_flags = aarch64_classify_vector_mode (to);
22632
22633 bool from_sve_p = (from_flags & VEC_ANY_SVE);
22634 bool to_sve_p = (to_flags & VEC_ANY_SVE);
22635
22636 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22637 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22638
38e62001
RS
22639 bool from_pred_p = (from_flags & VEC_SVE_PRED);
22640 bool to_pred_p = (to_flags & VEC_SVE_PRED);
22641
22642 /* Don't allow changes between predicate modes and other modes.
22643 Only predicate registers can hold predicate modes and only
22644 non-predicate registers can hold non-predicate modes, so any
22645 attempt to mix them would require a round trip through memory. */
22646 if (from_pred_p != to_pred_p)
22647 return false;
22648
76607e7e
RS
22649 /* Don't allow changes between partial SVE modes and other modes.
22650 The contents of partial SVE modes are distributed evenly across
22651 the register, whereas GCC expects them to be clustered together. */
22652 if (from_partial_sve_p != to_partial_sve_p)
22653 return false;
22654
22655 /* Similarly reject changes between partial SVE modes that have
22656 different patterns of significant and insignificant bits. */
22657 if (from_partial_sve_p
22658 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22659 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22660 return false;
22661
38e62001
RS
22662 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22663 {
22664 /* Don't allow changes between SVE modes and other modes that might
22665 be bigger than 128 bits. In particular, OImode, CImode and XImode
22666 divide into 128-bit quantities while SVE modes divide into
22667 BITS_PER_SVE_VECTOR quantities. */
22668 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22669 return false;
22670 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22671 return false;
22672 }
22673
002092be
RS
22674 if (BYTES_BIG_ENDIAN)
22675 {
002092be
RS
22676 /* Don't allow changes between SVE data modes and non-SVE modes.
22677 See the comment at the head of aarch64-sve.md for details. */
22678 if (from_sve_p != to_sve_p)
22679 return false;
22680
22681 /* Don't allow changes in element size: lane 0 of the new vector
22682 would not then be lane 0 of the old vector. See the comment
22683 above aarch64_maybe_expand_sve_subreg_move for a more detailed
22684 description.
22685
22686 In the worst case, this forces a register to be spilled in
22687 one mode and reloaded in the other, which handles the
22688 endianness correctly. */
22689 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22690 return false;
22691 }
43cacb12
RS
22692 return true;
22693}
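/* For example, on big-endian targets a change from VNx4SImode to VNx8HImode
   is rejected above because the element sizes differ (4 bytes vs. 2 bytes);
   in the worst case the register must be spilled in one mode and reloaded in
   the other.  On little-endian targets the same change is allowed.  */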
22694
5cce8171
RS
22695/* Implement TARGET_EARLY_REMAT_MODES. */
22696
22697static void
22698aarch64_select_early_remat_modes (sbitmap modes)
22699{
22700 /* SVE values are not normally live across a call, so it should be
22701 worth doing early rematerialization even in VL-specific mode. */
22702 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22703 if (aarch64_sve_mode_p ((machine_mode) i))
22704 bitmap_set_bit (modes, i);
5cce8171
RS
22705}
22706
c0111dc4
RE
22707/* Override the default target speculation_safe_value. */
22708static rtx
22709aarch64_speculation_safe_value (machine_mode mode,
22710 rtx result, rtx val, rtx failval)
22711{
22712 /* Maybe we should warn if falling back to hard barriers. They are
22713 likely to be noticeably more expensive than the alternative below. */
22714 if (!aarch64_track_speculation)
22715 return default_speculation_safe_value (mode, result, val, failval);
22716
22717 if (!REG_P (val))
22718 val = copy_to_mode_reg (mode, val);
22719
22720 if (!aarch64_reg_or_zero (failval, mode))
22721 failval = copy_to_mode_reg (mode, failval);
22722
21cebf90 22723 emit_insn (gen_despeculate_copy (mode, result, val, failval));
c0111dc4
RE
22724 return result;
22725}
22726
2d56d6ba
KT
22727/* Implement TARGET_ESTIMATED_POLY_VALUE.
22728 Look into the tuning structure for an estimate.
22729 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22730 Advanced SIMD 128 bits. */
22731
22732static HOST_WIDE_INT
22733aarch64_estimated_poly_value (poly_int64 val)
22734{
22735 enum aarch64_sve_vector_bits_enum width_source
22736 = aarch64_tune_params.sve_width;
22737
22738 /* If we still don't have an estimate, use the default. */
22739 if (width_source == SVE_SCALABLE)
22740 return default_estimated_poly_value (val);
22741
22742 HOST_WIDE_INT over_128 = width_source - 128;
22743 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
22744}
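/* Worked example: if the selected tuning reports sve_width == SVE_256,
   then over_128 == 128 and a size such as 16 + 16x (the number of bytes in
   an SVE vector) is estimated as 16 + 16 * 128 / 128 == 32 bytes, i.e. the
   heuristics assume 256-bit vectors without fixing the runtime vector
   length.  */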
22745
d9186814
SE
22746
22747/* Return true for types that could be supported as SIMD return or
22748 argument types. */
22749
22750static bool
22751supported_simd_type (tree t)
22752{
22753 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22754 {
22755 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22756 return s == 1 || s == 2 || s == 4 || s == 8;
22757 }
22758 return false;
22759}
22760
22761/* Return true for types that are currently supported as SIMD return
22762 or argument types. */
22763
22764static bool
22765currently_supported_simd_type (tree t, tree b)
22766{
22767 if (COMPLEX_FLOAT_TYPE_P (t))
22768 return false;
22769
22770 if (TYPE_SIZE (t) != TYPE_SIZE (b))
22771 return false;
22772
22773 return supported_simd_type (t);
22774}
22775
22776/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
22777
22778static int
22779aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22780 struct cgraph_simd_clone *clonei,
22781 tree base_type, int num)
22782{
22783 tree t, ret_type, arg_type;
22784 unsigned int elt_bits, vec_bits, count;
22785
22786 if (!TARGET_SIMD)
22787 return 0;
22788
22789 if (clonei->simdlen
22790 && (clonei->simdlen < 2
22791 || clonei->simdlen > 1024
22792 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22793 {
22794 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22795 "unsupported simdlen %d", clonei->simdlen);
22796 return 0;
22797 }
22798
22799 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22800 if (TREE_CODE (ret_type) != VOID_TYPE
22801 && !currently_supported_simd_type (ret_type, base_type))
22802 {
22803 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22804 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22805 "GCC does not currently support mixed size types "
22806 "for %<simd%> functions");
22807 else if (supported_simd_type (ret_type))
22808 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22809 "GCC does not currently support return type %qT "
22810 "for %<simd%> functions", ret_type);
22811 else
22812 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22813 "unsupported return type %qT for %<simd%> functions",
22814 ret_type);
22815 return 0;
22816 }
22817
22818 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22819 {
22820 arg_type = TREE_TYPE (t);
22821
22822 if (!currently_supported_simd_type (arg_type, base_type))
22823 {
22824 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22825 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22826 "GCC does not currently support mixed size types "
22827 "for %<simd%> functions");
22828 else
22829 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22830 "GCC does not currently support argument type %qT "
22831 "for %<simd%> functions", arg_type);
22832 return 0;
22833 }
22834 }
22835
22836 clonei->vecsize_mangle = 'n';
22837 clonei->mask_mode = VOIDmode;
22838 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22839 if (clonei->simdlen == 0)
22840 {
22841 count = 2;
22842 vec_bits = (num == 0 ? 64 : 128);
22843 clonei->simdlen = vec_bits / elt_bits;
22844 }
22845 else
22846 {
22847 count = 1;
22848 vec_bits = clonei->simdlen * elt_bits;
22849 if (vec_bits != 64 && vec_bits != 128)
22850 {
22851 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22852 "GCC does not currently support simdlen %d for type %qT",
22853 clonei->simdlen, base_type);
22854 return 0;
22855 }
22856 }
22857 clonei->vecsize_int = vec_bits;
22858 clonei->vecsize_float = vec_bits;
22859 return count;
22860}
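/* For example, a declaration such as:

     #pragma omp declare simd
     float foo (float x, float y);

   carries no explicit simdlen, so two Advanced SIMD clones are produced: a
   64-bit one with simdlen 2 and a 128-bit one with simdlen 4.  An explicit
   `simdlen(4)' requests a single 128-bit clone, while `simdlen(16)' is
   rejected above because 16 * 32 bits fits in neither a 64-bit nor a
   128-bit vector.  */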
22861
22862/* Implement TARGET_SIMD_CLONE_ADJUST. */
22863
22864static void
22865aarch64_simd_clone_adjust (struct cgraph_node *node)
22866{
22867 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22868 use the correct ABI. */
22869
22870 tree t = TREE_TYPE (node->decl);
22871 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22872 TYPE_ATTRIBUTES (t));
22873}
22874
22875/* Implement TARGET_SIMD_CLONE_USABLE. */
22876
22877static int
22878aarch64_simd_clone_usable (struct cgraph_node *node)
22879{
22880 switch (node->simdclone->vecsize_mangle)
22881 {
22882 case 'n':
22883 if (!TARGET_SIMD)
22884 return -1;
22885 return 0;
22886 default:
22887 gcc_unreachable ();
22888 }
22889}
22890
497f281c
SE
22891/* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22892
22893static int
22894aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22895{
31427b97
RS
22896 auto check_attr = [&](const char *name) {
22897 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
22898 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
22899 if (!attr1 && !attr2)
22900 return true;
22901
22902 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
22903 };
22904
22905 if (!check_attr ("aarch64_vector_pcs"))
22906 return 0;
22907 if (!check_attr ("Advanced SIMD type"))
22908 return 0;
22909 return 1;
22910}
22911
3bac1e20
SE
22912/* Implement TARGET_GET_MULTILIB_ABI_NAME */
22913
22914static const char *
22915aarch64_get_multilib_abi_name (void)
22916{
22917 if (TARGET_BIG_END)
22918 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
22919 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
22920}
22921
e76c8e56
JJ
22922/* Implement TARGET_STACK_PROTECT_GUARD. For a global-variable-based
22923 guard, use the default implementation; otherwise return a null tree. */
22925static tree
22926aarch64_stack_protect_guard (void)
22927{
22928 if (aarch64_stack_protector_guard == SSP_GLOBAL)
22929 return default_stack_protect_guard ();
22930
22931 return NULL_TREE;
22932}
22933
98698967
SMW
22934/* Return the diagnostic message string if conversion from FROMTYPE to
22935 TOTYPE is not allowed, NULL otherwise. */
22936
22937static const char *
22938aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
22939{
22940 if (element_mode (fromtype) != element_mode (totype))
22941 {
22942 /* Do not allow conversions to/from BFmode scalar types. */
22943 if (TYPE_MODE (fromtype) == BFmode)
22944 return N_("invalid conversion from type %<bfloat16_t%>");
22945 if (TYPE_MODE (totype) == BFmode)
22946 return N_("invalid conversion to type %<bfloat16_t%>");
22947 }
22948
22949 /* Conversion allowed. */
22950 return NULL;
22951}
22952
22953/* Return the diagnostic message string if the unary operation OP is
22954 not permitted on TYPE, NULL otherwise. */
22955
22956static const char *
22957aarch64_invalid_unary_op (int op, const_tree type)
22958{
22959 /* Reject all single-operand operations on BFmode except for &. */
22960 if (element_mode (type) == BFmode && op != ADDR_EXPR)
22961 return N_("operation not permitted on type %<bfloat16_t%>");
22962
22963 /* Operation allowed. */
22964 return NULL;
22965}
22966
22967/* Return the diagnostic message string if the binary operation OP is
22968 not permitted on TYPE1 and TYPE2, NULL otherwise. */
22969
22970static const char *
22971aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
22972 const_tree type2)
22973{
22974 /* Reject all 2-operand operations on BFmode. */
22975 if (element_mode (type1) == BFmode
22976 || element_mode (type2) == BFmode)
22977 return N_("operation not permitted on type %<bfloat16_t%>");
22978
38e62001
RS
22979 if (VECTOR_TYPE_P (type1)
22980 && VECTOR_TYPE_P (type2)
22981 && !TYPE_INDIVISIBLE_P (type1)
22982 && !TYPE_INDIVISIBLE_P (type2)
22983 && (aarch64_sve::builtin_type_p (type1)
22984 != aarch64_sve::builtin_type_p (type2)))
22985 return N_("cannot combine GNU and SVE vectors in a binary operation");
22986
98698967
SMW
22987 /* Operation allowed. */
22988 return NULL;
22989}
22990
32efff9f
SD
22991/* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
22992 section at the end if needed. */
22993#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
22994#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
22995#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
22996void
22997aarch64_file_end_indicate_exec_stack ()
22998{
22999 file_end_indicate_exec_stack ();
23000
23001 unsigned feature_1_and = 0;
23002 if (aarch64_bti_enabled ())
23003 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23004
23005 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23006 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23007
23008 if (feature_1_and)
23009 {
23010 /* Generate .note.gnu.property section. */
23011 switch_to_section (get_section (".note.gnu.property",
23012 SECTION_NOTYPE, NULL));
23013
23014 /* PT_NOTE header: namesz, descsz, type.
23015 namesz = 4 ("GNU\0")
23016 descsz = 16 (Size of the program property array)
23017 [(12 + padding) * Number of array elements]
23018 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
23019 assemble_align (POINTER_SIZE);
23020 assemble_integer (GEN_INT (4), 4, 32, 1);
23021 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23022 assemble_integer (GEN_INT (5), 4, 32, 1);
23023
23024 /* PT_NOTE name. */
23025 assemble_string ("GNU", 4);
23026
23027 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23028 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23029 datasz = 4
23030 data = feature_1_and. */
23031 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23032 assemble_integer (GEN_INT (4), 4, 32, 1);
23033 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23034
23035 /* Pad the size of the note to the required alignment. */
23036 assemble_align (POINTER_SIZE);
23037 }
23038}
23039#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23040#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23041#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
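/* For reference, with -mbranch-protection=standard (BTI and PAC both
   enabled, LP64) the note emitted above is roughly equivalent to the
   following; the exact directive spelling depends on the configured
   assembler:

	.section	.note.gnu.property,"a"
	.align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// pr_datasz
	.word	3		// BTI | PAC
	.align	3
*/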
e76c8e56 23042
be178ecd
MM
23043/* Helper function for straight line speculation.
23044 Return what barrier should be emitted for straight line speculation
23045 mitigation.
23046 When not mitigating against straight line speculation this function returns
23047 an empty string.
23048 When mitigating against straight line speculation, use:
23049 * SB when the v8.5-A SB extension is enabled.
23050 * DSB+ISB otherwise. */
23051const char *
23052aarch64_sls_barrier (int mitigation_required)
23053{
23054 return mitigation_required
23055 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23056 : "";
23057}
23058
96b7f495
MM
23059static GTY (()) tree aarch64_sls_shared_thunks[30];
23060static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23061const char *indirect_symbol_names[30] = {
23062 "__call_indirect_x0",
23063 "__call_indirect_x1",
23064 "__call_indirect_x2",
23065 "__call_indirect_x3",
23066 "__call_indirect_x4",
23067 "__call_indirect_x5",
23068 "__call_indirect_x6",
23069 "__call_indirect_x7",
23070 "__call_indirect_x8",
23071 "__call_indirect_x9",
23072 "__call_indirect_x10",
23073 "__call_indirect_x11",
23074 "__call_indirect_x12",
23075 "__call_indirect_x13",
23076 "__call_indirect_x14",
23077 "__call_indirect_x15",
23078 "", /* "__call_indirect_x16", */
23079 "", /* "__call_indirect_x17", */
23080 "__call_indirect_x18",
23081 "__call_indirect_x19",
23082 "__call_indirect_x20",
23083 "__call_indirect_x21",
23084 "__call_indirect_x22",
23085 "__call_indirect_x23",
23086 "__call_indirect_x24",
23087 "__call_indirect_x25",
23088 "__call_indirect_x26",
23089 "__call_indirect_x27",
23090 "__call_indirect_x28",
23091 "__call_indirect_x29",
23092};
23093
23094/* Function to create a BLR thunk. This thunk is used to mitigate straight
23095 line speculation. Instead of a simple BLR that can be speculated past,
23096 we emit a BL to this thunk, and this thunk contains a BR to the relevant
23097 register. These thunks have the relevant speculation barriers put after
23098 their indirect branch so that speculation is blocked.
23099
23100 We use such a thunk so the speculation barriers are kept off the
23101 architecturally executed path in order to reduce the performance overhead.
23102
23103 When optimizing for size we use stubs shared by the linked object.
23104 When optimizing for performance we emit stubs for each function in the hope
23105 that the branch predictor can better train on jumps specific to a given
23106 function. */
23107rtx
23108aarch64_sls_create_blr_label (int regnum)
23109{
23110 gcc_assert (STUB_REGNUM_P (regnum));
23111 if (optimize_function_for_size_p (cfun))
23112 {
23113 /* For the thunks shared between different functions in this compilation
23114 unit we use a named symbol -- this is just for users to more easily
23115 understand the generated assembly. */
23116 aarch64_sls_shared_thunks_needed = true;
23117 const char *thunk_name = indirect_symbol_names[regnum];
23118 if (aarch64_sls_shared_thunks[regnum] == NULL)
23119 {
23120 /* Build a decl representing this function stub and record it for
23121 later. We build a decl here so we can use the GCC machinery for
23122 handling sections automatically (through `get_named_section` and
23123 `make_decl_one_only`). That saves us a lot of trouble handling
23124 the specifics of different output file formats. */
23125 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23126 get_identifier (thunk_name),
23127 build_function_type_list (void_type_node,
23128 NULL_TREE));
23129 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23130 NULL_TREE, void_type_node);
23131 TREE_PUBLIC (decl) = 1;
23132 TREE_STATIC (decl) = 1;
23133 DECL_IGNORED_P (decl) = 1;
23134 DECL_ARTIFICIAL (decl) = 1;
23135 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23136 resolve_unique_section (decl, 0, false);
23137 aarch64_sls_shared_thunks[regnum] = decl;
23138 }
23139
23140 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23141 }
23142
23143 if (cfun->machine->call_via[regnum] == NULL)
23144 cfun->machine->call_via[regnum]
23145 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23146 return cfun->machine->call_via[regnum];
23147}
23148
23149/* Helper function for aarch64_sls_emit_blr_function_thunks and
23150 aarch64_sls_emit_shared_blr_thunks below. */
23151static void
23152aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23153{
23154 /* Copy the target register into x16 and branch via x16 so this
23155 transformation does not prevent jumping to `BTI c` instructions. */
23156 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23157 asm_fprintf (out_file, "\tbr\tx16\n");
23158}
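/* Putting the pieces together: with -mharden-sls=blr an indirect call that
   would have been emitted as

	blr	x1

   is instead emitted as a BL to a stub (shown here in its shared,
   size-optimized form), schematically:

	bl	__call_indirect_x1
	...
   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb
*/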
23159
23160/* Emit all BLR stubs for this particular function.
23161 Here we emit all the BLR stubs needed for the current function. Since we
23162 emit these stubs in a consecutive block we know there will be no speculation
23163 gadgets between each stub, and hence we only emit a speculation barrier at
23164 the end of the stub sequences.
23165
23166 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
23167void
23168aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23169{
23170 if (! aarch64_harden_sls_blr_p ())
23171 return;
23172
23173 bool any_functions_emitted = false;
23174 /* We must save and restore the current function section since this assembly
23175 is emitted at the end of the function. This means it can be emitted *just
23176 after* the cold section of a function. That cold part would be emitted in
23177 a different section. That switch would trigger a `.cfi_endproc` directive
23178 to be emitted in the original section and a `.cfi_startproc` directive to
23179 be emitted in the new section. Switching to the original section without
23180 restoring would mean that the `.cfi_endproc` emitted as a function ends
23181 would happen in a different section -- leaving an unmatched
23182 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23183 in the standard text section. */
23184 section *save_text_section = in_section;
23185 switch_to_section (function_section (current_function_decl));
23186 for (int regnum = 0; regnum < 30; ++regnum)
23187 {
23188 rtx specu_label = cfun->machine->call_via[regnum];
23189 if (specu_label == NULL)
23190 continue;
23191
23192 targetm.asm_out.print_operand (out_file, specu_label, 0);
23193 asm_fprintf (out_file, ":\n");
23194 aarch64_sls_emit_function_stub (out_file, regnum);
23195 any_functions_emitted = true;
23196 }
23197 if (any_functions_emitted)
23198 /* Can use the SB if needs be here, since this stub will only be used
23199 by the current function, and hence for the current target. */
23200 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23201 switch_to_section (save_text_section);
23202}
23203
23204/* Emit shared BLR stubs for the current compilation unit.
23205 Over the course of compiling this unit we may have converted some BLR
23206 instructions to a BL to a shared stub function. This is where we emit those
23207 stub functions.
23208 This function is for the stubs shared between different functions in this
23209 compilation unit. We share when optimizing for size instead of speed.
23210
23211 This function is called through the TARGET_ASM_FILE_END hook. */
23212void
23213aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23214{
23215 if (! aarch64_sls_shared_thunks_needed)
23216 return;
23217
23218 for (int regnum = 0; regnum < 30; ++regnum)
23219 {
23220 tree decl = aarch64_sls_shared_thunks[regnum];
23221 if (!decl)
23222 continue;
23223
23224 const char *name = indirect_symbol_names[regnum];
23225 switch_to_section (get_named_section (decl, NULL, 0));
23226 ASM_OUTPUT_ALIGN (out_file, 2);
23227 targetm.asm_out.globalize_label (out_file, name);
23228 /* This emits a directive only if the compiler is configured for an
23229 assembler that can handle visibility directives. */
23230 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23231 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23232 ASM_OUTPUT_LABEL (out_file, name);
23233 aarch64_sls_emit_function_stub (out_file, regnum);
23234 /* Use the most conservative target to ensure it can always be used by any
23235 function in the translation unit. */
23236 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23237 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23238 }
23239}
23240
23241/* Implement TARGET_ASM_FILE_END. */
23242void
23243aarch64_asm_file_end ()
23244{
23245 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23246 /* Since this function will be called for the ASM_FILE_END hook, we ensure
23247 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23248 for FreeBSD) still gets called. */
23249#ifdef TARGET_ASM_FILE_END
23250 TARGET_ASM_FILE_END ();
23251#endif
23252}
23253
23254const char *
23255aarch64_indirect_call_asm (rtx addr)
23256{
23257 gcc_assert (REG_P (addr));
23258 if (aarch64_harden_sls_blr_p ())
23259 {
23260 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23261 output_asm_insn ("bl\t%0", &stub_label);
23262 }
23263 else
23264 output_asm_insn ("blr\t%0", &addr);
23265 return "";
23266}
23267
51b86113
DM
23268/* Target-specific selftests. */
23269
23270#if CHECKING_P
23271
23272namespace selftest {
23273
23274/* Selftest for the RTL loader.
23275 Verify that the RTL loader copes with a dump from
23276 print_rtx_function. This is essentially just a test that class
23277 function_reader can handle a real dump, but it also verifies
23278 that lookup_reg_by_dump_name correctly handles hard regs.
23279 The presence of hard reg names in the dump means that the test is
23280 target-specific, hence it is in this file. */
23281
23282static void
23283aarch64_test_loading_full_dump ()
23284{
23285 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23286
23287 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23288
23289 rtx_insn *insn_1 = get_insn_by_uid (1);
23290 ASSERT_EQ (NOTE, GET_CODE (insn_1));
23291
23292 rtx_insn *insn_15 = get_insn_by_uid (15);
23293 ASSERT_EQ (INSN, GET_CODE (insn_15));
23294 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23295
23296 /* Verify crtl->return_rtx. */
23297 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23298 ASSERT_EQ (0, REGNO (crtl->return_rtx));
23299 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23300}
23301
23302/* Run all target-specific selftests. */
23303
23304static void
23305aarch64_run_selftests (void)
23306{
23307 aarch64_test_loading_full_dump ();
23308}
23309
23310} // namespace selftest
23311
23312#endif /* #if CHECKING_P */
23313
cd0b2d36
RR
23314#undef TARGET_STACK_PROTECT_GUARD
23315#define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23316
43e9d192
IB
23317#undef TARGET_ADDRESS_COST
23318#define TARGET_ADDRESS_COST aarch64_address_cost
23319
23320/* This hook determines whether unnamed bitfields affect the alignment
23321 of the containing structure. The hook returns true if the structure
23322 should inherit the alignment requirements of an unnamed bitfield's
23323 type. */
23324#undef TARGET_ALIGN_ANON_BITFIELD
23325#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23326
23327#undef TARGET_ASM_ALIGNED_DI_OP
23328#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23329
23330#undef TARGET_ASM_ALIGNED_HI_OP
23331#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23332
23333#undef TARGET_ASM_ALIGNED_SI_OP
23334#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23335
23336#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23337#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23338 hook_bool_const_tree_hwi_hwi_const_tree_true
23339
e1c1ecb0
KT
23340#undef TARGET_ASM_FILE_START
23341#define TARGET_ASM_FILE_START aarch64_start_file
23342
43e9d192
IB
23343#undef TARGET_ASM_OUTPUT_MI_THUNK
23344#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23345
23346#undef TARGET_ASM_SELECT_RTX_SECTION
23347#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23348
23349#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23350#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23351
c292cfe5
SN
23352#undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23353#define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23354
43e9d192
IB
23355#undef TARGET_BUILD_BUILTIN_VA_LIST
23356#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23357
23358#undef TARGET_CALLEE_COPIES
7256c719 23359#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
43e9d192
IB
23360
23361#undef TARGET_CAN_ELIMINATE
23362#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23363
1fd8d40c
KT
23364#undef TARGET_CAN_INLINE_P
23365#define TARGET_CAN_INLINE_P aarch64_can_inline_p
23366
43e9d192
IB
23367#undef TARGET_CANNOT_FORCE_CONST_MEM
23368#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23369
50487d79
EM
23370#undef TARGET_CASE_VALUES_THRESHOLD
23371#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23372
43e9d192
IB
23373#undef TARGET_CONDITIONAL_REGISTER_USAGE
23374#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23375
38e62001
RS
23376#undef TARGET_MEMBER_TYPE_FORCES_BLK
23377#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23378
43e9d192
IB
23379/* Only the least significant bit is used for initialization guard
23380 variables. */
23381#undef TARGET_CXX_GUARD_MASK_BIT
23382#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23383
23384#undef TARGET_C_MODE_FOR_SUFFIX
23385#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23386
23387#ifdef TARGET_BIG_ENDIAN_DEFAULT
23388#undef TARGET_DEFAULT_TARGET_FLAGS
23389#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23390#endif
23391
23392#undef TARGET_CLASS_MAX_NREGS
23393#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23394
119103ca
JG
23395#undef TARGET_BUILTIN_DECL
23396#define TARGET_BUILTIN_DECL aarch64_builtin_decl
23397
a6fc00da
BH
23398#undef TARGET_BUILTIN_RECIPROCAL
23399#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23400
11e554b3
JG
23401#undef TARGET_C_EXCESS_PRECISION
23402#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23403
43e9d192
IB
23404#undef TARGET_EXPAND_BUILTIN
23405#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23406
23407#undef TARGET_EXPAND_BUILTIN_VA_START
23408#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23409
9697e620
JG
23410#undef TARGET_FOLD_BUILTIN
23411#define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23412
43e9d192
IB
23413#undef TARGET_FUNCTION_ARG
23414#define TARGET_FUNCTION_ARG aarch64_function_arg
23415
23416#undef TARGET_FUNCTION_ARG_ADVANCE
23417#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23418
23419#undef TARGET_FUNCTION_ARG_BOUNDARY
23420#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23421
76b0cbf8
RS
23422#undef TARGET_FUNCTION_ARG_PADDING
23423#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23424
43cacb12
RS
23425#undef TARGET_GET_RAW_RESULT_MODE
23426#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23427#undef TARGET_GET_RAW_ARG_MODE
23428#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23429
43e9d192
IB
23430#undef TARGET_FUNCTION_OK_FOR_SIBCALL
23431#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23432
23433#undef TARGET_FUNCTION_VALUE
23434#define TARGET_FUNCTION_VALUE aarch64_function_value
23435
23436#undef TARGET_FUNCTION_VALUE_REGNO_P
23437#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23438
fc72cba7
AL
23439#undef TARGET_GIMPLE_FOLD_BUILTIN
23440#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
0ac198d3 23441
43e9d192
IB
23442#undef TARGET_GIMPLIFY_VA_ARG_EXPR
23443#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23444
23445#undef TARGET_INIT_BUILTINS
23446#define TARGET_INIT_BUILTINS aarch64_init_builtins
23447
c64f7d37
WD
23448#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23449#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23450 aarch64_ira_change_pseudo_allocno_class
23451
43e9d192
IB
23452#undef TARGET_LEGITIMATE_ADDRESS_P
23453#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23454
23455#undef TARGET_LEGITIMATE_CONSTANT_P
23456#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23457
491ec060
WD
23458#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23459#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23460 aarch64_legitimize_address_displacement
23461
43e9d192
IB
23462#undef TARGET_LIBGCC_CMP_RETURN_MODE
23463#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23464
11e554b3
JG
23465#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23466#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23467aarch64_libgcc_floating_mode_supported_p
23468
ac2b960f
YZ
23469#undef TARGET_MANGLE_TYPE
23470#define TARGET_MANGLE_TYPE aarch64_mangle_type
23471
98698967
SMW
23472#undef TARGET_INVALID_CONVERSION
23473#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23474
23475#undef TARGET_INVALID_UNARY_OP
23476#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23477
23478#undef TARGET_INVALID_BINARY_OP
23479#define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23480
65ef05d0
RS
23481#undef TARGET_VERIFY_TYPE_CONTEXT
23482#define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23483
43e9d192
IB
23484#undef TARGET_MEMORY_MOVE_COST
23485#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23486
26e0ff94
WD
23487#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23488#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23489
43e9d192
IB
23490#undef TARGET_MUST_PASS_IN_STACK
23491#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23492
23493/* This target hook should return true if accesses to volatile bitfields
23494 should use the narrowest mode possible. It should return false if these
23495 accesses should use the bitfield container type. */
23496#undef TARGET_NARROW_VOLATILE_BITFIELD
23497#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23498
23499#undef TARGET_OPTION_OVERRIDE
23500#define TARGET_OPTION_OVERRIDE aarch64_override_options
23501
23502#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23503#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23504 aarch64_override_options_after_change
23505
361fb3ee
KT
23506#undef TARGET_OPTION_SAVE
23507#define TARGET_OPTION_SAVE aarch64_option_save
23508
23509#undef TARGET_OPTION_RESTORE
23510#define TARGET_OPTION_RESTORE aarch64_option_restore
23511
23512#undef TARGET_OPTION_PRINT
23513#define TARGET_OPTION_PRINT aarch64_option_print
23514
5a2c8331
KT
23515#undef TARGET_OPTION_VALID_ATTRIBUTE_P
23516#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
23517
d78006d9
KT
23518#undef TARGET_SET_CURRENT_FUNCTION
23519#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
23520
43e9d192
IB
23521#undef TARGET_PASS_BY_REFERENCE
23522#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
23523
23524#undef TARGET_PREFERRED_RELOAD_CLASS
23525#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
23526
cee66c68
WD
23527#undef TARGET_SCHED_REASSOCIATION_WIDTH
23528#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
23529
c2ec330c
AL
23530#undef TARGET_PROMOTED_TYPE
23531#define TARGET_PROMOTED_TYPE aarch64_promoted_type
23532
43e9d192
IB
23533#undef TARGET_SECONDARY_RELOAD
23534#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
23535
23536#undef TARGET_SHIFT_TRUNCATION_MASK
23537#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
23538
23539#undef TARGET_SETUP_INCOMING_VARARGS
23540#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
23541
23542#undef TARGET_STRUCT_VALUE_RTX
23543#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
23544
23545#undef TARGET_REGISTER_MOVE_COST
23546#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
23547
23548#undef TARGET_RETURN_IN_MEMORY
23549#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
23550
23551#undef TARGET_RETURN_IN_MSB
23552#define TARGET_RETURN_IN_MSB aarch64_return_in_msb
23553
23554#undef TARGET_RTX_COSTS
7cc2145f 23555#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
43e9d192 23556
2e5f8203
JG
23557#undef TARGET_SCALAR_MODE_SUPPORTED_P
23558#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
23559
d126a4ae
AP
23560#undef TARGET_SCHED_ISSUE_RATE
23561#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
23562
d0bc0cb6
RS
23563#undef TARGET_SCHED_VARIABLE_ISSUE
23564#define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
23565
d03f7e44
MK
23566#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
23567#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
23568 aarch64_sched_first_cycle_multipass_dfa_lookahead
23569
2d6bc7fa
KT
23570#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
23571#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
23572 aarch64_first_cycle_multipass_dfa_lookahead_guard
23573
827ab47a
KT
23574#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
23575#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
23576 aarch64_get_separate_components
23577
23578#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
23579#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
23580 aarch64_components_for_bb
23581
23582#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
23583#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
23584 aarch64_disqualify_components
23585
23586#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
23587#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
23588 aarch64_emit_prologue_components
23589
23590#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
23591#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
23592 aarch64_emit_epilogue_components
23593
23594#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
23595#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
23596 aarch64_set_handled_components
23597
43e9d192
IB
23598#undef TARGET_TRAMPOLINE_INIT
23599#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
23600
23601#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
23602#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
23603
23604#undef TARGET_VECTOR_MODE_SUPPORTED_P
23605#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
23606
482b2b43
RS
23607#undef TARGET_COMPATIBLE_VECTOR_TYPES_P
23608#define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
23609
7df76747
N
23610#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
23611#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
23612 aarch64_builtin_support_vector_misalignment
23613
9f4cbab8
RS
23614#undef TARGET_ARRAY_MODE
23615#define TARGET_ARRAY_MODE aarch64_array_mode
23616
43e9d192
IB
23617#undef TARGET_ARRAY_MODE_SUPPORTED_P
23618#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
23619
8990e73a
TB
23620#undef TARGET_VECTORIZE_ADD_STMT_COST
23621#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
23622
23623#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
23624#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
23625 aarch64_builtin_vectorization_cost
23626
43e9d192
IB
23627#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
23628#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
23629
42fc9a7f
JG
23630#undef TARGET_VECTORIZE_BUILTINS
23631#define TARGET_VECTORIZE_BUILTINS
23632
23633#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
23634#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
23635 aarch64_builtin_vectorized_function
23636
e021fb86
RS
23637#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
23638#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
23639 aarch64_autovectorize_vector_modes
3b357264 23640
aa87aced
KV
23641#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
23642#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
23643 aarch64_atomic_assign_expand_fenv
23644
43e9d192
IB
23645/* Section anchor support. */
23646
23647#undef TARGET_MIN_ANCHOR_OFFSET
23648#define TARGET_MIN_ANCHOR_OFFSET -256
23649
23650/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
23651 byte offset; we can do much more for larger data types, but have no way
23652 to determine the size of the access. We assume accesses are aligned. */
23653#undef TARGET_MAX_ANCHOR_OFFSET
23654#define TARGET_MAX_ANCHOR_OFFSET 4095
23655
db0253a4
TB
23656#undef TARGET_VECTOR_ALIGNMENT
23657#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
23658
43cacb12
RS
23659#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
23660#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
23661 aarch64_vectorize_preferred_vector_alignment
db0253a4
TB
23662#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
23663#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
23664 aarch64_simd_vector_alignment_reachable
23665
88b08073
JG
23666/* vec_perm support. */
23667
f151c9e1
RS
23668#undef TARGET_VECTORIZE_VEC_PERM_CONST
23669#define TARGET_VECTORIZE_VEC_PERM_CONST \
23670 aarch64_vectorize_vec_perm_const
88b08073 23671
74166aab
RS
23672#undef TARGET_VECTORIZE_RELATED_MODE
23673#define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
43cacb12
RS
23674#undef TARGET_VECTORIZE_GET_MASK_MODE
23675#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
76a34e3f
RS
23676#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
23677#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
23678 aarch64_empty_mask_is_expensive
6a86928d
RS
23679#undef TARGET_PREFERRED_ELSE_VALUE
23680#define TARGET_PREFERRED_ELSE_VALUE \
23681 aarch64_preferred_else_value
43cacb12 23682
c2ec330c
AL
23683#undef TARGET_INIT_LIBFUNCS
23684#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
70f09188 23685
706b2314 23686#undef TARGET_FIXED_CONDITION_CODE_REGS
70f09188
AP
23687#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
23688
5cb74e90
RR
23689#undef TARGET_FLAGS_REGNUM
23690#define TARGET_FLAGS_REGNUM CC_REGNUM
23691
78607708
TV
23692#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
23693#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
23694
a3125fc2
CL
23695#undef TARGET_ASAN_SHADOW_OFFSET
23696#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
23697
0c4ec427
RE
23698#undef TARGET_LEGITIMIZE_ADDRESS
23699#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
23700
b48d6421
KT
23701#undef TARGET_SCHED_CAN_SPECULATE_INSN
23702#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
23703
594bdd53
FY
23704#undef TARGET_CAN_USE_DOLOOP_P
23705#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
23706
9bca63d4
WD
23707#undef TARGET_SCHED_ADJUST_PRIORITY
23708#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
23709
6a569cdd
KT
23710#undef TARGET_SCHED_MACRO_FUSION_P
23711#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
23712
23713#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
23714#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
23715
350013bc
BC
23716#undef TARGET_SCHED_FUSION_PRIORITY
23717#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
23718
7b841a12
JW
23719#undef TARGET_UNSPEC_MAY_TRAP_P
23720#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
23721
1b1e81f8
JW
23722#undef TARGET_USE_PSEUDO_PIC_REG
23723#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
23724
cc8ca59e
JB
23725#undef TARGET_PRINT_OPERAND
23726#define TARGET_PRINT_OPERAND aarch64_print_operand
23727
23728#undef TARGET_PRINT_OPERAND_ADDRESS
23729#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
23730
ee62a5a6
RS
23731#undef TARGET_OPTAB_SUPPORTED_P
23732#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
23733
43203dea
RR
23734#undef TARGET_OMIT_STRUCT_RETURN_REG
23735#define TARGET_OMIT_STRUCT_RETURN_REG true
23736
43cacb12
RS
23737#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
23738#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
23739 aarch64_dwarf_poly_indeterminate_value
23740
f46fe37e
EB
23741/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
23742#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
23743#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
23744
c43f4279
RS
23745#undef TARGET_HARD_REGNO_NREGS
23746#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
f939c3e6
RS
23747#undef TARGET_HARD_REGNO_MODE_OK
23748#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
23749
99e1629f
RS
23750#undef TARGET_MODES_TIEABLE_P
23751#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
23752
80ec73f4
RS
23753#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
23754#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
23755 aarch64_hard_regno_call_part_clobbered
23756
5a5a3bc5
RS
23757#undef TARGET_INSN_CALLEE_ABI
23758#define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
b3650d40 23759
58e17cf8
RS
23760#undef TARGET_CONSTANT_ALIGNMENT
23761#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
23762
8c6e3b23
TC
23763#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
23764#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
23765 aarch64_stack_clash_protection_alloca_probe_range
23766
43cacb12
RS
23767#undef TARGET_COMPUTE_PRESSURE_CLASSES
23768#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
23769
23770#undef TARGET_CAN_CHANGE_MODE_CLASS
23771#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
23772
5cce8171
RS
23773#undef TARGET_SELECT_EARLY_REMAT_MODES
23774#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
23775
c0111dc4
RE
23776#undef TARGET_SPECULATION_SAFE_VALUE
23777#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
23778
2d56d6ba
KT
23779#undef TARGET_ESTIMATED_POLY_VALUE
23780#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
23781
a0d0b980
SE
23782#undef TARGET_ATTRIBUTE_TABLE
23783#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
23784
d9186814
SE
23785#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
23786#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
23787 aarch64_simd_clone_compute_vecsize_and_simdlen
23788
23789#undef TARGET_SIMD_CLONE_ADJUST
23790#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
23791
23792#undef TARGET_SIMD_CLONE_USABLE
23793#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
23794
497f281c
SE
23795#undef TARGET_COMP_TYPE_ATTRIBUTES
23796#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
23797
3bac1e20
SE
23798#undef TARGET_GET_MULTILIB_ABI_NAME
23799#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
23800
002ffd3c
RS
23801#undef TARGET_FNTYPE_ABI
23802#define TARGET_FNTYPE_ABI aarch64_fntype_abi
23803
51b86113
DM
23804#if CHECKING_P
23805#undef TARGET_RUN_TARGET_SELFTESTS
23806#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
23807#endif /* #if CHECKING_P */
23808
8fc16d72
ST
23809#undef TARGET_ASM_POST_CFI_STARTPROC
23810#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
23811
c600df9a
RS
23812#undef TARGET_STRICT_ARGUMENT_NAMING
23813#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
23814
1a7a35c7
RH
23815#undef TARGET_MD_ASM_ADJUST
23816#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
23817
96b7f495
MM
23818#undef TARGET_ASM_FILE_END
23819#define TARGET_ASM_FILE_END aarch64_asm_file_end
23820
23821#undef TARGET_ASM_FUNCTION_EPILOGUE
23822#define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
23823
43e9d192
IB
23824struct gcc_target targetm = TARGET_INITIALIZER;
23825
23826#include "gt-aarch64.h"