/* Copyright (C) 2016-2022 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes.  */

/* We want GET_MODE_SIZE et al to return integers, please.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
#include "dwarf2.h"
#include "gimple.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

/* }}}  */
/* {{{ Global variables.  */

/* Constants used by FP instructions.  */

static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;

/* Holds the ISA variant, derived from the command line parameters.  */

enum gcn_isa gcn_isa = ISA_GCN3;  /* Default to GCN3.  */

/* Reserve this much space for LDS (for propagating variables from
   worker-single mode to worker-partitioned mode), per workgroup.  Global
   analysis could calculate an exact bound, but we don't do that yet.

   We want to permit full occupancy, so size accordingly.  */

/* Use this as a default, but allow it to grow if the user requests a large
   amount of gang-private shared-memory space.  */
static int acc_lds_size = 0x600;

#define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */

#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
		  : flag_openmp ? OMP_LDS_SIZE \
		  : OTHER_LDS_SIZE)

static int gang_private_hwm = 32;
static hash_map<tree, int> lds_allocs;

/* The number of registers usable by normal non-kernel functions.
   The SGPR count includes any special extra registers such as VCC.  */

#define MAX_NORMAL_SGPR_COUNT	62  // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT	24

/* }}}  */
/* {{{ Initialization and options.  */

/* Initialize machine_function.  */

static struct machine_function *
gcn_init_machine_status (void)
{
  struct machine_function *f;

  f = ggc_cleared_alloc<machine_function> ();

  if (TARGET_GCN3)
    f->use_flat_addressing = true;

  return f;
}

/* Implement TARGET_OPTION_OVERRIDE.

   Override option settings where defaults are variable, or we have specific
   needs to consider.  */

static void
gcn_option_override (void)
{
  init_machine_status = gcn_init_machine_status;

  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
  if (!flag_pie)
    flag_pie = 2;
  if (!flag_pic)
    flag_pic = flag_pie;

  gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3
	     : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5
	     : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5
	     : gcn_arch == PROCESSOR_GFX908 ? ISA_CDNA1
	     : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2
	     : ISA_UNKNOWN);
  gcc_assert (gcn_isa != ISA_UNKNOWN);

  /* The default stack size needs to be small for offload kernels because
     there may be many, many threads.  Also, a smaller stack gives a
     measurable performance boost.  But, a small stack is insufficient
     for running the testsuite, so we use a larger default for the
     stand-alone case.  */
  if (stack_size_opt == -1)
    {
      if (flag_openacc || flag_openmp)
	/* 512 bytes per work item = 32kB total.  */
	stack_size_opt = 512 * 64;
      else
	/* 1MB total.  */
	stack_size_opt = 1048576;
    }

  /* Reserve 1KiB (somewhat arbitrarily) of LDS space for reduction results
     and worker broadcasts.  */
  if (gang_private_size_opt == -1)
    gang_private_size_opt = 512;
  else if (gang_private_size_opt < gang_private_hwm)
    gang_private_size_opt = gang_private_hwm;
  else if (gang_private_size_opt >= acc_lds_size - 1024)
    {
      /* We need some space for reductions and worker broadcasting.  If the
	 user requests a large amount of gang-private LDS space, we might not
	 have enough left for the former.  Increase the LDS allocation in that
	 case, although this may reduce the maximum occupancy on the
	 hardware.  */
      acc_lds_size = gang_private_size_opt + 1024;
      if (acc_lds_size > 32768)
	acc_lds_size = 32768;
    }

  /* The xnack option is a placeholder, for now.  */
  if (flag_xnack)
    sorry ("XNACK support");
}

/* }}}  */
/* {{{ Attributes.  */

/* This table defines the arguments that are permitted in
   __attribute__ ((amdgpu_hsa_kernel (...))).

   The names and values correspond to the HSA metadata that is encoded
   into the assembler file and binary.  */

static const struct gcn_kernel_arg_type
{
  const char *name;
  const char *header_pseudo;
  machine_mode mode;

  /* This should be set to -1 or -2 for a dynamically allocated register
     number.  Use -1 if this argument contributes to the user_sgpr_count,
     -2 otherwise.  */
  int fixed_regno;
} gcn_kernel_arg_types[] = {
  {"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
  {"private_segment_buffer",
   ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
  {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
  {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
  {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
  {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
  {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
  {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
  {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
  {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
  {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
  {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
  {"private_segment_wave_offset",
   ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
  {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 14
  {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 15
  {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};

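/* The set of kernel arguments requested for every kernel by default, before
   any explicit settings from the amdgpu_hsa_kernel attribute are applied
   (see gcn_parse_amdgpu_hsa_kernel_attribute below).  */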
static const long default_requested_args
  = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
    | (1 << DISPATCH_PTR_ARG)
    | (1 << QUEUE_PTR_ARG)
    | (1 << KERNARG_SEGMENT_PTR_ARG)
    | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
    | (1 << WORKGROUP_ID_X_ARG)
    | (1 << WORK_ITEM_ID_X_ARG)
    | (1 << WORK_ITEM_ID_Y_ARG)
    | (1 << WORK_ITEM_ID_Z_ARG);

/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
   This function also sets the default values for some arguments.

   Return true if an error is diagnosed; otherwise ARGS is populated.  */

static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
				       tree list)
{
  bool err = false;
  args->requested = default_requested_args;
  args->nargs = 0;

  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    args->reg[a] = -1;

  for (; list; list = TREE_CHAIN (list))
    {
      const char *str;
      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
	{
	  error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
		 "arguments");
	  break;
	}
      str = TREE_STRING_POINTER (TREE_VALUE (list));
      int a;
      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
	{
	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
	    break;
	}
      if (a == GCN_KERNEL_ARG_TYPES)
	{
	  error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
		 str);
	  err = true;
	  break;
	}
      if (args->requested & (1 << a))
	{
	  error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
		 "attribute", str);
	  err = true;
	  break;
	}
      args->requested |= (1 << a);
      args->order[args->nargs++] = a;
    }

  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
     requesting WORK_ITEM_ID_X_ARG.  */
  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
    args->requested |= (1 << WORK_ITEM_ID_X_ARG);

  int sgpr_regno = FIRST_SGPR_REG;
  args->nsgprs = 0;
  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    {
      if (!(args->requested & (1 << a)))
	continue;

      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
      else
	{
	  int reg_count;

	  switch (gcn_kernel_arg_types[a].mode)
	    {
	    case E_SImode:
	      reg_count = 1;
	      break;
	    case E_DImode:
	      reg_count = 2;
	      break;
	    case E_TImode:
	      reg_count = 4;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  args->reg[a] = sgpr_regno;
	  sgpr_regno += reg_count;
	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
	    args->nsgprs += reg_count;
	}
    }
  if (sgpr_regno > FIRST_SGPR_REG + 16)
    {
      error ("too many arguments passed in sgpr registers");
    }
  return err;
}

/* Referenced by TARGET_ATTRIBUTE_TABLE.

   Validates target specific attributes.  */

static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
					tree args, int, bool *no_add_attrs)
{
  if (!FUNC_OR_METHOD_TYPE_P (*node))
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Can combine regparm with all attributes but fastcall, and thiscall.  */
  if (is_attribute_p ("gcnhsa_kernel", name))
    {
      struct gcn_kernel_args kernelarg;

      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
	*no_add_attrs = true;

      return NULL_TREE;
    }

  return NULL_TREE;
}

/* Implement TARGET_ATTRIBUTE_TABLE.

   Create target-specific __attribute__ types.  */

static const struct attribute_spec gcn_attribute_table[] = {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
   true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
  /* End element.  */
  {NULL, 0, 0, false, false, false, false, NULL, NULL}
};

/* }}}  */
/* {{{ Registers and modes.  */

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  */

bool
gcn_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == BImode
	  || mode == QImode
	  || mode == HImode /* || mode == HFmode  */
	  || mode == SImode || mode == SFmode
	  || mode == DImode || mode == DFmode
	  || mode == TImode);
}

/* Implement TARGET_CLASS_MAX_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   a register of class RCLASS.  */

static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
  /* Scalar registers are 32bit, vector registers are in fact tuples of
     64 lanes.  */
  if (rclass == VGPR_REGS)
    {
      if (vgpr_1reg_mode_p (mode))
	return 1;
      if (vgpr_2reg_mode_p (mode))
	return 2;
      /* TImode is used by DImode compare_and_swap.  */
      if (mode == TImode)
	return 4;
    }
  else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
    return 2;
  return CEIL (GET_MODE_SIZE (mode), 4);
}

/* Implement TARGET_HARD_REGNO_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   REGNO.  */

unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
  return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}

/* Implement TARGET_HARD_REGNO_MODE_OK.

   Return true if REGNO can hold value in MODE.  */

bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
  /* Treat a complex mode as if it were a scalar mode of the same overall
     size for the purposes of allocating hard registers.  */
  if (COMPLEX_MODE_P (mode))
    switch (mode)
      {
      case E_CQImode:
      case E_CHImode:
	mode = SImode;
	break;
      case E_CSImode:
	mode = DImode;
	break;
      case E_CDImode:
	mode = TImode;
	break;
      case E_HCmode:
	mode = SFmode;
	break;
      case E_SCmode:
	mode = DFmode;
	break;
      default:
	/* Not supported.  */
	return false;
      }

  switch (regno)
    {
    case FLAT_SCRATCH_LO_REG:
    case XNACK_MASK_LO_REG:
    case TBA_LO_REG:
    case TMA_LO_REG:
      return (mode == SImode || mode == DImode);
    case VCC_LO_REG:
    case EXEC_LO_REG:
      return (mode == BImode || mode == SImode || mode == DImode);
    case M0_REG:
    case FLAT_SCRATCH_HI_REG:
    case XNACK_MASK_HI_REG:
    case TBA_HI_REG:
    case TMA_HI_REG:
      return mode == SImode;
    case VCC_HI_REG:
      return false;
    case EXEC_HI_REG:
      return mode == SImode /*|| mode == V32BImode */ ;
    case SCC_REG:
    case VCCZ_REG:
    case EXECZ_REG:
      return mode == BImode;
    }
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return true;
  if (SGPR_REGNO_P (regno))
    /* We restrict double register values to aligned registers.  */
    return (sgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
  if (VGPR_REGNO_P (regno))
    /* Vector instructions do not care about the alignment of register
       pairs, but where there is no 64-bit instruction, many of the
       define_split do not work if the input and output registers partially
       overlap.  We tried to fix this with early clobber and match
       constraints, but it was bug prone, added complexity, and conflicts
       with the 'U0' constraints on vec_merge.
       Therefore, we restrict ourselves to aligned registers.  */
    return (vgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
	    /* TImode is used by DImode compare_and_swap.  */
	    || (mode == TImode
		&& !((regno - FIRST_VGPR_REG) & 3)));
  return false;
}

/* Implement REGNO_REG_CLASS via gcn.h.

   Return smallest class containing REGNO.  */

enum reg_class
gcn_regno_reg_class (int regno)
{
  switch (regno)
    {
    case SCC_REG:
      return SCC_CONDITIONAL_REG;
    case VCC_LO_REG:
    case VCC_HI_REG:
      return VCC_CONDITIONAL_REG;
    case VCCZ_REG:
      return VCCZ_CONDITIONAL_REG;
    case EXECZ_REG:
      return EXECZ_CONDITIONAL_REG;
    case EXEC_LO_REG:
    case EXEC_HI_REG:
      return EXEC_MASK_REG;
    }
  if (VGPR_REGNO_P (regno))
    return VGPR_REGS;
  if (SGPR_REGNO_P (regno))
    return SGPR_REGS;
  if (regno < FIRST_VGPR_REG)
    return GENERAL_REGS;
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return AFP_REGS;
  return ALL_REGS;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.

   GCC assumes that lowpart contains first part of value as stored in memory.
   This is not the case for vector registers.  */

bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
			   reg_class_t regclass)
{
  if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
    return true;
  return (gcn_class_max_nregs (regclass, from)
	  == gcn_class_max_nregs (regclass, to));
}

/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.

   When this hook returns true for MODE, the compiler allows
   registers explicitly used in the rtl to be used as spill registers
   but prevents the compiler from extending the lifetime of these
   registers.  */

bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
  /* We allocate into exec and vcc regs.  Those make small register class.  */
  return mode == DImode || mode == SImode;
}

/* Implement TARGET_CLASS_LIKELY_SPILLED_P.

   Returns true if pseudos that have been assigned to registers of class RCLASS
   would likely be spilled because registers of RCLASS are needed for spill
   registers.  */

static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
  return (rclass == EXEC_MASK_REG
	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}

/* Implement TARGET_MODES_TIEABLE_P.

   Returns true if a value of MODE1 is accessible in MODE2 without
   copying.  */

bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}

/* Implement TARGET_TRULY_NOOP_TRUNCATION.

   Returns true if it is safe to “convert” a value of INPREC bits to one of
   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
   it as if it had only OUTPREC bits.  */

bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
  return ((inprec <= 32) && (outprec <= inprec));
}

/* Return N-th part of value occupying multiple registers.  */

rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
  if (GET_MODE_SIZE (mode) >= 256)
    {
      /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */

      if (REG_P (op))
	{
	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
	}
      if (GET_CODE (op) == CONST_VECTOR)
	{
	  int units = GET_MODE_NUNITS (mode);
	  rtvec v = rtvec_alloc (units);

	  for (int i = 0; i < units; ++i)
	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
						 CONST_VECTOR_ELT (op, i), n);

	  return gen_rtx_CONST_VECTOR (V64SImode, v);
	}
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (V64SImode);
      gcc_unreachable ();
    }
  else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
    {
      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
      return gen_rtx_REG (SImode, REGNO (op) + n);
    }
  else
    {
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (SImode);

      /* If it's a constant then let's assume it is of the largest mode
	 available, otherwise simplify_gen_subreg will fail.  */
      if (mode == VOIDmode && CONST_INT_P (op))
	mode = DImode;
      return simplify_gen_subreg (SImode, op, mode, n * 4);
    }
}

/* Return N-th part of value occupying multiple registers.  */

rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
  return simplify_gen_subreg (DImode, op, mode, n * 8);
}

/* Return true if OP can be split into subregs or high/low parts.
   This is always true for scalars, but not normally true for vectors.
   However, for vectors in hardregs we can use the low and high registers.  */

bool
gcn_can_split_p (machine_mode, rtx op)
{
  if (vgpr_vector_mode_p (GET_MODE (op)))
    {
      if (GET_CODE (op) == SUBREG)
	op = SUBREG_REG (op);
      if (!REG_P (op))
	return true;
      return REGNO (op) <= FIRST_PSEUDO_REGISTER;
    }
  return true;
}

/* Implement TARGET_SPILL_CLASS.

   Return class of registers which could be used for pseudo of MODE
   and of class RCLASS for spilling instead of memory.  Return NO_REGS
   if it is not possible or non-profitable.  */

static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
{
  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
      || c == VCC_CONDITIONAL_REG)
    return SGPR_REGS;
  else
    return NO_REGS;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.

   Change allocno class for given pseudo from allocno and best class
   calculated by IRA.  */

static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
				     reg_class_t best_cl)
{
  /* Avoid returning classes that contain both vgpr and sgpr registers.  */
  if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
    return cl;
  if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
      && best_cl != ALL_GPR_REGS)
    return best_cl;

  machine_mode mode = PSEUDO_REGNO_MODE (regno);
  if (vgpr_vector_mode_p (mode))
    return VGPR_REGS;

  return GENERAL_REGS;
}

/* Create a new DImode pseudo reg and emit an instruction to initialize
   it to VAL.  */

static rtx
get_exec (int64_t val)
{
  rtx reg = gen_reg_rtx (DImode);
  emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
  return reg;
}

/* Return value of scalar exec register.  */

rtx
gcn_scalar_exec ()
{
  return const1_rtx;
}

/* Return pseudo holding scalar exec register.  */

rtx
gcn_scalar_exec_reg ()
{
  return get_exec (1);
}

/* Return value of full exec register.  */

rtx
gcn_full_exec ()
{
  return constm1_rtx;
}

/* Return pseudo holding full exec register.  */

rtx
gcn_full_exec_reg ()
{
  return get_exec (-1);
}

/* }}}  */
/* {{{ Immediate constants.  */

/* Initialize shared numeric constants.  */

static void
init_ext_gcn_constants (void)
{
  real_from_integer (&dconst4, DFmode, 4, SIGNED);

  /* FIXME: this constant probably does not match what hardware really loads.
     Reality check it eventually.  */
  real_from_string (&dconst1over2pi,
		    "0.1591549430918953357663423455968866839");
  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);

  ext_gcn_constants_init = 1;
}

/* Return non-zero if X is a constant that can appear as an inline operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)
   Or a vector of those.
   The value returned should be the encoding of this constant.  */

int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return 0;
      n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return 0;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return 0;
      return 1;
    }

  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return 0;

  const REAL_VALUE_TYPE *r;

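  /* The values returned below are the operand encodings used for the FP
     inline constants: 0 -> 128, +/-0.5 -> 240/241, +/-1 -> 242/243,
     +/-2 -> 244/245, +/-4 -> 246/247 and 1/(2*pi) -> 248.  */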
  if (x == CONST0_RTX (mode))
    return 128;
  if (x == CONST1_RTX (mode))
    return 242;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (real_identical (r, &dconstm1))
    return 243;

  if (real_identical (r, &dconsthalf))
    return 240;
  if (real_identical (r, &dconstm1))
    return 243;
  if (real_identical (r, &dconst2))
    return 244;
  if (real_identical (r, &dconst4))
    return 246;
  if (real_identical (r, &dconst1over2pi))
    return 248;
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();
  real_value_negate (r);
  if (real_identical (r, &dconsthalf))
    return 241;
  if (real_identical (r, &dconst2))
    return 245;
  if (real_identical (r, &dconst4))
    return 247;

  /* FIXME: add 4, -4 and 1/(2*PI).  */

  return 0;
}

/* Return non-zero if X is a constant that can appear as an immediate operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)
   Or a vector of those.
   The value returned should be the encoding of this constant.  */

bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return false;
      n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return false;

  if (gcn_inline_fp_constant_p (x, false))
    return true;
  /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
  return (mode != DFmode);
}

/* Return true if X is a constant representable as an inline immediate
   constant in a 32-bit instruction encoding.  */

bool
gcn_inline_constant_p (rtx x)
{
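  /* Integer inline constants cover the range -16..64 inclusive.  */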
  if (GET_CODE (x) == CONST_INT)
    return INTVAL (x) >= -16 && INTVAL (x) <= 64;
  if (GET_CODE (x) == CONST_DOUBLE)
    return gcn_inline_fp_constant_p (x, false);
  if (GET_CODE (x) == CONST_VECTOR)
    {
      int n;
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return 1;
    }
  return false;
}

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding.  */

bool
gcn_constant_p (rtx x)
{
  switch (GET_CODE (x))
    {
    case CONST_INT:
      return true;

    case CONST_DOUBLE:
      return gcn_fp_constant_p (x, false);

    case CONST_VECTOR:
      {
	int n;
	if (!vgpr_vector_mode_p (GET_MODE (x)))
	  return false;
	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
	if (!n)
	  return false;
	for (int i = 1; i < 64; i++)
	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	    return false;
	return true;
      }

    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      ;
    }

  return false;
}

/* Return true if X is a constant representable as two inline immediate
   constants in a 64-bit instruction that is split into two 32-bit
   instructions.
   When MIXED is set, the low-part is permitted to use the full 32-bits.  */

bool
gcn_inline_constant64_p (rtx x, bool mixed)
{
  if (GET_CODE (x) == CONST_VECTOR)
    {
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;

      return true;
    }

  if (GET_CODE (x) != CONST_INT)
    return false;

  rtx val_lo = gcn_operand_part (DImode, x, 0);
  rtx val_hi = gcn_operand_part (DImode, x, 1);
  return ((mixed || gcn_inline_constant_p (val_lo))
	  && gcn_inline_constant_p (val_hi));
}

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding where the hardware will
   extend the immediate to 64-bits.  */

bool
gcn_constant64_p (rtx x)
{
  if (!gcn_constant_p (x))
    return false;

  if (GET_CODE (x) != CONST_INT)
    return true;

  /* Negative numbers are only allowed if they can be encoded within src0,
     because the 32-bit immediates do not get sign-extended.
     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
     assembler will use a src0 inline immediate and that will get
     sign-extended.  */
  HOST_WIDE_INT val = INTVAL (x);
  return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
	  || gcn_inline_constant_p (x));	/* Src0.  */
}

/* Implement TARGET_LEGITIMATE_CONSTANT_P.

   Returns true if X is a legitimate constant for a MODE immediate operand.  */

bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
  return gcn_constant_p (x);
}

/* Return true if X is a CONST_VECTOR of single constant.  */

static bool
single_cst_vector_p (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;
  for (int i = 1; i < 64; i++)
    if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
      return false;
  return true;
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, int a)
{
  /*if (!a)
    return CONST0_RTX (mode);
  if (a == -1)
    return CONSTM1_RTX (mode);
  if (a == 1)
    return CONST1_RTX (mode);
  if (a == 2)
    return CONST2_RTX (mode);*/

  int units = GET_MODE_NUNITS (mode);
  machine_mode innermode = GET_MODE_INNER (mode);

  rtx tem;
  if (FLOAT_MODE_P (innermode))
    {
      REAL_VALUE_TYPE rv;
      real_from_integer (&rv, NULL, a, SIGNED);
      tem = const_double_from_real_value (rv, innermode);
    }
  else
    tem = gen_int_mode (a, innermode);

  rtvec v = rtvec_alloc (units);
  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = tem;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
  int units = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (units);

  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = a;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create an undefined vector value, used where an insn operand is
   optional.  */

rtx
gcn_gen_undef (machine_mode mode)
{
  return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}

/* }}}  */
/* {{{ Addresses, pointers and moves.  */

/* Return true if REG is a valid place to store a pointer,
   for instructions that require an SGPR.
   FIXME rename.  */

static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return (SGPR_REGNO_P (regno) || regno == M0_REG
	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}

/* Return true if REG is a valid place to store a pointer,
   for instructions that require a VGPR.  */

static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return VGPR_REGNO_P (regno);
}

/* Return true if X would be valid inside a MEM using the Flat address
   space.  */

bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
  bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);

  if (vec_mode && gcn_address_register_p (x, DImode, false))
    return true;

  if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
    return true;

  if (TARGET_GCN5_PLUS
      && GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if X would be valid inside a MEM using the Scalar Flat
   address space.  */

bool
gcn_scalar_flat_address_p (rtx x)
{
  if (gcn_address_register_p (x, DImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if MEM X would be valid for the Scalar Flat address space.  */

bool
gcn_scalar_flat_mem_p (rtx x)
{
  if (!MEM_P (x))
    return false;

  if (GET_MODE_SIZE (GET_MODE (x)) < 4)
    return false;

  return gcn_scalar_flat_address_p (XEXP (x, 0));
}

/* Return true if X would be valid inside a MEM using the LDS or GDS
   address spaces.  */

bool
gcn_ds_address_p (rtx x)
{
  if (gcn_vec_address_register_p (x, SImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if ADDR would be valid inside a MEM using the Global
   address space.  */

bool
gcn_global_address_p (rtx addr)
{
  if (gcn_address_register_p (addr, DImode, false)
      || gcn_vec_address_register_p (addr, DImode, false))
    return true;

  if (GET_CODE (addr) == PLUS)
    {
      rtx base = XEXP (addr, 0);
      rtx offset = XEXP (addr, 1);
      bool immediate_p = (CONST_INT_P (offset)
			  && INTVAL (offset) >= -(1 << 12)
			  && INTVAL (offset) < (1 << 12));

      if ((gcn_address_register_p (base, DImode, false)
	   || gcn_vec_address_register_p (base, DImode, false))
	  && immediate_p)
	/* SGPR + CONST or VGPR + CONST  */
	return true;

      if (gcn_address_register_p (base, DImode, false)
	  && gcn_vgpr_register_operand (offset, SImode))
	/* SGPR + VGPR  */
	return true;

      if (GET_CODE (base) == PLUS
	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
	  && immediate_p)
	/* (SGPR + VGPR) + CONST  */
	return true;
    }

  return false;
}

/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.

   Recognizes RTL expressions that are valid memory addresses for an
   instruction.  The MODE argument is the machine mode for the MEM
   expression that wants to use this address.

   It only recognizes address in canonical form.  LEGITIMIZE_ADDRESS should
   convert common non-canonical forms to canonical form so that they will
   be recognized.  */

static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
				     addr_space_t as)
{
  /* All vector instructions need to work on addresses in registers.  */
  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
    return false;

  if (AS_SCALAR_FLAT_P (as))
    {
      if (mode == QImode || mode == HImode)
	return 0;

      switch (GET_CODE (x))
	{
	case REG:
	  return gcn_address_register_p (x, DImode, strict);
	/* Addresses are in the form BASE+OFFSET
	   OFFSET is either 20bit unsigned immediate, SGPR or M0.
	   Writes and atomics do not accept SGPR.  */
	case PLUS:
	  {
	    rtx x0 = XEXP (x, 0);
	    rtx x1 = XEXP (x, 1);
	    if (!gcn_address_register_p (x0, DImode, strict))
	      return false;
	    /* FIXME: This is disabled because of the mode mismatch between
	       SImode (for the address or m0 register) and the DImode PLUS.
	       We'll need a zero_extend or similar.

	    if (gcn_m0_register_p (x1, SImode, strict)
		|| gcn_address_register_p (x1, SImode, strict))
	      return true;
	    else*/
	    if (GET_CODE (x1) == CONST_INT)
	      {
		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
		    /* The low bits of the offset are ignored, even when
		       they're meant to realign the pointer.  */
		    && !(INTVAL (x1) & 0x3))
		  return true;
	      }
	    return false;
	  }

	default:
	  break;
	}
    }
  else if (AS_SCRATCH_P (as))
    return gcn_address_register_p (x, SImode, strict);
  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
    {
      if (TARGET_GCN3 || GET_CODE (x) == REG)
	return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
		? gcn_address_register_p (x, DImode, strict)
		: gcn_vec_address_register_p (x, DImode, strict));
      else
	{
	  gcc_assert (TARGET_GCN5_PLUS);

	  if (GET_CODE (x) == PLUS)
	    {
	      rtx x1 = XEXP (x, 1);

	      if (VECTOR_MODE_P (mode)
		  ? !gcn_address_register_p (x, DImode, strict)
		  : !gcn_vec_address_register_p (x, DImode, strict))
		return false;

	      if (GET_CODE (x1) == CONST_INT)
		{
		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
		      /* The low bits of the offset are ignored, even when
			 they're meant to realign the pointer.  */
		      && !(INTVAL (x1) & 0x3))
		    return true;
		}
	    }
	  return false;
	}
    }
  else if (AS_GLOBAL_P (as))
    {
      gcc_assert (TARGET_GCN5_PLUS);

      if (GET_CODE (x) == REG)
	return (gcn_address_register_p (x, DImode, strict)
		|| (!VECTOR_MODE_P (mode)
		    && gcn_vec_address_register_p (x, DImode, strict)));
      else if (GET_CODE (x) == PLUS)
	{
	  rtx base = XEXP (x, 0);
	  rtx offset = XEXP (x, 1);

	  bool immediate_p = (GET_CODE (offset) == CONST_INT
			      /* Signed 13-bit immediate.  */
			      && INTVAL (offset) >= -(1 << 12)
			      && INTVAL (offset) < (1 << 12)
			      /* The low bits of the offset are ignored, even
				 when they're meant to realign the
				 pointer.  */
			      && !(INTVAL (offset) & 0x3));

	  if (!VECTOR_MODE_P (mode))
	    {
	      if ((gcn_address_register_p (base, DImode, strict)
		   || gcn_vec_address_register_p (base, DImode, strict))
		  && immediate_p)
		/* SGPR + CONST or VGPR + CONST  */
		return true;

	      if (gcn_address_register_p (base, DImode, strict)
		  && gcn_vgpr_register_operand (offset, SImode))
		/* SGPR + VGPR  */
		return true;

	      if (GET_CODE (base) == PLUS
		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
		  && immediate_p)
		/* (SGPR + VGPR) + CONST  */
		return true;
	    }
	  else
	    {
	      if (gcn_address_register_p (base, DImode, strict)
		  && immediate_p)
		/* SGPR + CONST  */
		return true;
	    }
	}
      else
	return false;
    }
  else if (AS_ANY_DS_P (as))
    switch (GET_CODE (x))
      {
      case REG:
	return (VECTOR_MODE_P (mode)
		? gcn_address_register_p (x, SImode, strict)
		: gcn_vec_address_register_p (x, SImode, strict));
      /* Addresses are in the form BASE+OFFSET
	 OFFSET is either 20bit unsigned immediate, SGPR or M0.
	 Writes and atomics do not accept SGPR.  */
      case PLUS:
	{
	  rtx x0 = XEXP (x, 0);
	  rtx x1 = XEXP (x, 1);
	  if (!gcn_vec_address_register_p (x0, DImode, strict))
	    return false;
	  if (GET_CODE (x1) == REG)
	    {
	      if (GET_CODE (x1) != REG
		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
		      && !gcn_ssrc_register_operand (x1, DImode)))
		return false;
	    }
	  else if (GET_CODE (x1) == CONST_VECTOR
		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
		   && single_cst_vector_p (x1))
	    {
	      x1 = CONST_VECTOR_ELT (x1, 0);
	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
		return true;
	    }
	  return false;
	}

      default:
	break;
      }
  else
    gcc_unreachable ();
  return false;
}

/* Implement TARGET_ADDR_SPACE_POINTER_MODE.

   Return the appropriate mode for a named address pointer.  */

static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
{
  switch (addrspace)
    {
    case ADDR_SPACE_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return SImode;
    case ADDR_SPACE_DEFAULT:
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_SCALAR_FLAT:
      return DImode;
    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.

   Return the appropriate mode for a named address space address.  */

static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
{
  return gcn_addr_space_pointer_mode (addrspace);
}

/* Implement TARGET_ADDR_SPACE_SUBSET_P.

   Determine if one named address space is a subset of another.  */

static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
{
  if (subset == superset)
    return true;
  /* FIXME is this true?  */
  if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
    return true;
  return false;
}

/* Convert from one address space to another.  */

static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
{
  gcc_assert (POINTER_TYPE_P (from_type));
  gcc_assert (POINTER_TYPE_P (to_type));

  addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
  addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));

  if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
    {
      rtx queue = gen_rtx_REG (DImode,
			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
      rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
				   gen_rtx_PLUS (DImode, queue,
						 gen_int_mode (64, SImode)));
      rtx tmp = gen_reg_rtx (DImode);

      emit_move_insn (gen_lowpart (SImode, tmp), op);
      emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
		      group_seg_aperture_hi);

      return tmp;
    }
  else if (as_from == as_to)
    return op;
  else
    gcc_unreachable ();
}

/* Implement TARGET_ADDR_SPACE_DEBUG.

   Return the dwarf address space class for each hardware address space.  */

static int
gcn_addr_space_debug (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
      return DW_ADDR_none;
    case ADDR_SPACE_GLOBAL:
      return 1;		// DW_ADDR_LLVM_global
    case ADDR_SPACE_LDS:
      return 3;		// DW_ADDR_LLVM_group
    case ADDR_SPACE_SCRATCH:
      return 4;		// DW_ADDR_LLVM_private
    case ADDR_SPACE_GDS:
      return 0x8000;	// DW_ADDR_AMDGPU_region
    }
  gcc_unreachable ();
}


/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h

   Return true if REGNO is OK for memory addressing.  */

bool
gcn_regno_mode_code_ok_for_base_p (int regno,
				   machine_mode, addr_space_t as, int, int)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  if (AS_FLAT_P (as))
    return (VGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_SCALAR_FLAT_P (as))
    return (SGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_GLOBAL_P (as))
    {
      return (SGPR_REGNO_P (regno)
	      || VGPR_REGNO_P (regno)
	      || regno == ARG_POINTER_REGNUM
	      || regno == FRAME_POINTER_REGNUM);
    }
  else
    /* For now.  */
    return false;
}

/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.

   Return a suitable register class for memory addressing.  */

reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
			      int ic)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
      return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_SCRATCH:
      return SGPR_REGS;
      break;
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : VGPR_REGS);
    case ADDR_SPACE_GLOBAL:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : ALL_GPR_REGS);
    }
  gcc_unreachable ();
}

/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.

   Return true if REGNO is OK for index of memory addressing.  */

bool
regno_ok_for_index_p (int regno)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  return regno == M0_REG || VGPR_REGNO_P (regno);
}

/* Generate move which uses the exec flags.  If EXEC is NULL, then it is
   assumed that all lanes normally relevant to the mode of the move are
   affected.  If PREV is NULL, then a sensible default is supplied for
   the inactive lanes.  */

static rtx
gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
{
  machine_mode mode = GET_MODE (op0);

  if (vgpr_vector_mode_p (mode))
    {
      if (exec && exec != CONSTM1_RTX (DImode))
	{
	  if (!prev)
	    prev = op0;
	}
      else
	{
	  if (!prev)
	    prev = gcn_gen_undef (mode);
	  exec = gcn_full_exec_reg ();
	}

      rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));

      return gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (2, set,
					  gen_rtx_CLOBBER (VOIDmode,
							   gen_rtx_SCRATCH (V64DImode))));
    }

  return (gen_rtx_PARALLEL
	  (VOIDmode,
	   gen_rtvec (2, gen_rtx_SET (op0, op1),
		      gen_rtx_USE (VOIDmode,
				   exec ? exec : gcn_scalar_exec ()))));
}

/* Generate masked move.  */

static rtx
gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
{
  if (exec)
    return (gen_rtx_SET (op0,
			 gen_rtx_VEC_MERGE (GET_MODE (op0),
					    gen_rtx_VEC_DUPLICATE (GET_MODE
								   (op0), op1),
					    op2, exec)));
  else
    return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
}

/* Expand vector init of OP0 by VEC.
   Implements vec_init instruction pattern.  */

void
gcn_expand_vector_init (rtx op0, rtx vec)
{
  int64_t initialized_mask = 0;
  int64_t curr_mask = 1;
  machine_mode mode = GET_MODE (op0);

  rtx val = XVECEXP (vec, 0, 0);

  for (int i = 1; i < 64; i++)
    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
      curr_mask |= (int64_t) 1 << i;

  if (gcn_constant_p (val))
    emit_move_insn (op0, gcn_vec_constant (mode, val));
  else
    {
      val = force_reg (GET_MODE_INNER (mode), val);
      emit_insn (gen_duplicate_load (op0, val));
    }
  initialized_mask |= curr_mask;
  for (int i = 1; i < 64; i++)
    if (!(initialized_mask & ((int64_t) 1 << i)))
      {
	curr_mask = (int64_t) 1 << i;
	rtx val = XVECEXP (vec, 0, i);

	for (int j = i + 1; j < 64; j++)
	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
	    curr_mask |= (int64_t) 1 << j;
	if (gcn_constant_p (val))
	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
					get_exec (curr_mask)));
	else
	  {
	    val = force_reg (GET_MODE_INNER (mode), val);
	    emit_insn (gen_duplicate_load (op0, val, op0,
					   get_exec (curr_mask)));
	  }
	initialized_mask |= curr_mask;
      }
}

/* Load vector constant where n-th lane contains BASE+n*VAL.  */

static rtx
strided_constant (machine_mode mode, int base, int val)
{
  rtx x = gen_reg_rtx (mode);
  emit_move_insn (x, gcn_vec_constant (mode, base));
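  /* Add VAL*32 to the upper 32 lanes, VAL*16 to alternating groups of 16
     lanes, and so on down to VAL*1 for the odd lanes, using the exec masks
     to select the lanes; after all six steps lane n holds BASE + n*VAL.  */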
1746 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1747 x, get_exec (0xffffffff00000000)));
1748 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1749 x, get_exec (0xffff0000ffff0000)));
1750 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1751 x, get_exec (0xff00ff00ff00ff00)));
1752 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1753 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1754 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1755 x, get_exec (0xcccccccccccccccc)));
1756 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1757 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1758 return x;
1759}
1760
1761/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1762
1763static rtx
1764gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1765 addr_space_t as)
1766{
1767 switch (as)
1768 {
1769 case ADDR_SPACE_DEFAULT:
1770 return gcn_addr_space_legitimize_address (x, old, mode,
1771 DEFAULT_ADDR_SPACE);
1772 case ADDR_SPACE_SCALAR_FLAT:
1773 case ADDR_SPACE_SCRATCH:
1774 /* Instructions working on vectors need the address to be in
1775 a register. */
1776 if (vgpr_vector_mode_p (mode))
1777 return force_reg (GET_MODE (x), x);
1778
1779 return x;
1780 case ADDR_SPACE_FLAT:
1781 case ADDR_SPACE_FLAT_SCRATCH:
1782 case ADDR_SPACE_GLOBAL:
1783 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1784 case ADDR_SPACE_LDS:
1785 case ADDR_SPACE_GDS:
1786 /* FIXME: LDS support offsets, handle them!. */
1787 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1788 {
1789 rtx addrs = gen_reg_rtx (V64SImode);
1790 rtx base = force_reg (SImode, x);
1791 rtx offsets = strided_constant (V64SImode, 0,
1792 GET_MODE_UNIT_SIZE (mode));
1793
1794 emit_insn (gen_vec_duplicatev64si (addrs, base));
1795 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1796 return addrs;
1797 }
1798 return x;
1799 }
1800 gcc_unreachable ();
1801}
1802
1803/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1804 proper vector of stepped addresses.
1805
1806 MEM will be a DImode address of a vector in an SGPR.
1807 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1808
1809rtx
1810gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1811 rtx tmp)
1812{
1813 gcc_assert (MEM_P (mem));
1814 rtx mem_base = XEXP (mem, 0);
1815 rtx mem_index = NULL_RTX;
1816
1817 if (!TARGET_GCN5_PLUS)
1818 {
1819 /* gcn_addr_space_legitimize_address should have put the address in a
1820 register. If not, it is too late to do anything about it. */
1821 gcc_assert (REG_P (mem_base));
1822 }
1823
1824 if (GET_CODE (mem_base) == PLUS)
1825 {
1826 mem_index = XEXP (mem_base, 1);
1827 mem_base = XEXP (mem_base, 0);
1828 }
1829
1830 /* RF and RM base registers for vector modes should be always an SGPR. */
1831 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1832 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1833
1834 machine_mode inner = GET_MODE_INNER (mode);
1835 int shift = exact_log2 (GET_MODE_SIZE (inner));
1836 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1837 rtx undef_v64si = gcn_gen_undef (V64SImode);
1838 rtx new_base = NULL_RTX;
1839 addr_space_t as = MEM_ADDR_SPACE (mem);
1840
1841 rtx tmplo = (REG_P (tmp)
1842 ? gcn_operand_part (V64DImode, tmp, 0)
1843 : gen_reg_rtx (V64SImode));
1844
1845 /* tmplo[:] = ramp[:] << shift */
1846 if (exec)
1847 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1848 gen_int_mode (shift, SImode),
1849 undef_v64si, exec));
1850 else
1851 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1852
1853 if (AS_FLAT_P (as))
1854 {
75d0b3d7
AS
1855 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1856
5326695a
AS
1857 if (REG_P (tmp))
1858 {
5326695a
AS
1859 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1860 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1861 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1862
1863 /* tmphi[:] = mem_base_hi */
1864 if (exec)
1865 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1866 undef_v64si, exec));
1867 else
1868 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1869
1870 /* tmp[:] += zext (mem_base) */
1871 if (exec)
1872 {
5326695a
AS
1873 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1874 vcc, undef_v64si, exec));
1875 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1876 vcc, vcc, undef_v64si, exec));
1877 }
1878 else
75d0b3d7 1879 emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
5326695a
AS
1880 }
1881 else
1882 {
1883 tmp = gen_reg_rtx (V64DImode);
1884 if (exec)
75d0b3d7
AS
1885 emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1886 (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1887 exec));
5326695a 1888 else
75d0b3d7 1889 emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
5326695a
AS
1890 }
1891
1892 new_base = tmp;
1893 }
1894 else if (AS_ANY_DS_P (as))
1895 {
1896 if (!exec)
1897 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1898 else
1899 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1900 gcn_gen_undef (V64SImode), exec));
1901 new_base = tmplo;
1902 }
1903 else
1904 {
1905 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1906 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1907 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1908 }
1909
1910 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1911 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1912 (mem_index ? mem_index
1913 : const0_rtx)));
1914}
1915
1916/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1917 suitable for the given address space. This is indented for use in
1918 gather/scatter patterns.
1919
1920 The offsets may be signed or unsigned, according to UNSIGNED_P.
1921 If EXEC is set then _exec patterns will be used, otherwise plain.
1922
1923 Return values.
1924 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1925 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1926
1927rtx
1928gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1929 bool unsigned_p, rtx exec)
1930{
5326695a
AS
1931 rtx tmpsi = gen_reg_rtx (V64SImode);
1932 rtx tmpdi = gen_reg_rtx (V64DImode);
1933 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1934 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1935
1936 if (CONST_INT_P (scale)
1937 && INTVAL (scale) > 0
1938 && exact_log2 (INTVAL (scale)) >= 0)
1939 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1940 GEN_INT (exact_log2 (INTVAL (scale)))));
1941 else
1942 (exec
1943 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1944 exec))
1945 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1946
1947 /* "Global" instructions do not support negative register offsets. */
1948 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1949 {
1950 if (unsigned_p)
1951 (exec
1952 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1953 undefdi, exec))
1954 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1955 else
1956 (exec
1957 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1958 undefdi, exec))
1959 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1960 return tmpdi;
1961 }
1962 else if (as == ADDR_SPACE_GLOBAL)
1963 return tmpsi;
1964
1965 gcc_unreachable ();
1966}
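/* Illustrative sketch (added commentary, not part of the original source):
   gcn_expand_scaled_offsets builds, per vector lane, the address
   base + extend (offsets[i] * scale), zero- or sign-extending the 32-bit
   scaled offset according to UNSIGNED_P; for ADDR_SPACE_GLOBAL with
   unsigned offsets it simply returns the 32-bit scaled offsets.  The
   hypothetical helper below (name and types are not part of GCC, and it
   assumes <stdint.h>) models that arithmetic with plain integers.  */
#if 0
static void
example_scaled_offsets (uint64_t base, const int32_t offsets[64],
			int32_t scale, int unsigned_p, uint64_t addrs[64])
{
  for (int lane = 0; lane < 64; lane++)
    {
      int32_t scaled = offsets[lane] * scale;
      /* FLAT addressing (or signed offsets) needs full 64-bit addresses.  */
      addrs[lane] = base + (unsigned_p
			    ? (uint64_t) (uint32_t) scaled   /* zext */
			    : (uint64_t) (int64_t) scaled);  /* sext */
    }
}
#endif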
1967
1968/* Return true if a move from OP0 to OP1 is known to be executed in the
1969 vector unit. */
1970
1971bool
1972gcn_vgpr_move_p (rtx op0, rtx op1)
1973{
1974 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1975 return true;
1976 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1977 return true;
1978 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1979 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1980 || vgpr_vector_mode_p (GET_MODE (op0)));
1981}
1982
1983/* Return true if a move from OP0 to OP1 is known to be executed in the
1984 scalar unit. Used in the machine description. */
1985
1986bool
1987gcn_sgpr_move_p (rtx op0, rtx op1)
1988{
1989 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1990 return true;
1991 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1992 return true;
1993 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1994 || VGPR_REGNO_P (REGNO (op0)))
1995 return false;
1996 if (REG_P (op1)
1997 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1998 && !VGPR_REGNO_P (REGNO (op1)))
1999 return true;
2000 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
2001}
2002
2003/* Implement TARGET_SECONDARY_RELOAD.
2004
2005 The address space determines which registers can be used for loads and
2006 stores. */
2007
2008static reg_class_t
2009gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2010 machine_mode reload_mode, secondary_reload_info *sri)
2011{
2012 reg_class_t result = NO_REGS;
2013 bool spilled_pseudo =
2014 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2015
2016 if (dump_file && (dump_flags & TDF_DETAILS))
2017 {
2018 fprintf (dump_file, "gcn_secondary_reload: ");
2019 dump_value_slim (dump_file, x, 1);
2020 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2021 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2022 if (REG_P (x) || GET_CODE (x) == SUBREG)
2023 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2024 (true_regnum (x) >= 0
2025 && true_regnum (x) < FIRST_PSEUDO_REGISTER
2026 ? reg_names[true_regnum (x)]
2027 : (spilled_pseudo ? "stack spill" : "??")));
2028 fprintf (dump_file, "\n");
2029 }
2030
2031 /* Some callers don't use or initialize icode. */
2032 sri->icode = CODE_FOR_nothing;
2033
2034 if (MEM_P (x) || spilled_pseudo)
2035 {
2036 addr_space_t as = DEFAULT_ADDR_SPACE;
2037
2038 /* If we have a spilled pseudo, we can't find the address space
2039 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2040 ADDR_SPACE_GLOBAL for GCN5. */
2041 if (MEM_P (x))
2042 as = MEM_ADDR_SPACE (x);
2043
2044 if (as == ADDR_SPACE_DEFAULT)
2045 as = DEFAULT_ADDR_SPACE;
2046
2047 switch (as)
2048 {
2049 case ADDR_SPACE_SCALAR_FLAT:
2050 result =
2051 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2052 break;
2053 case ADDR_SPACE_FLAT:
2054 case ADDR_SPACE_FLAT_SCRATCH:
2055 case ADDR_SPACE_GLOBAL:
2056 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2057 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2058 {
2059 if (in_p)
2060 switch (reload_mode)
2061 {
2062 case E_V64SImode:
2063 sri->icode = CODE_FOR_reload_inv64si;
2064 break;
2065 case E_V64SFmode:
2066 sri->icode = CODE_FOR_reload_inv64sf;
2067 break;
2068 case E_V64HImode:
2069 sri->icode = CODE_FOR_reload_inv64hi;
2070 break;
2071 case E_V64HFmode:
2072 sri->icode = CODE_FOR_reload_inv64hf;
2073 break;
2074 case E_V64QImode:
2075 sri->icode = CODE_FOR_reload_inv64qi;
2076 break;
2077 case E_V64DImode:
2078 sri->icode = CODE_FOR_reload_inv64di;
2079 break;
2080 case E_V64DFmode:
2081 sri->icode = CODE_FOR_reload_inv64df;
2082 break;
2083 default:
2084 gcc_unreachable ();
2085 }
2086 else
2087 switch (reload_mode)
2088 {
2089 case E_V64SImode:
2090 sri->icode = CODE_FOR_reload_outv64si;
2091 break;
2092 case E_V64SFmode:
2093 sri->icode = CODE_FOR_reload_outv64sf;
2094 break;
2095 case E_V64HImode:
2096 sri->icode = CODE_FOR_reload_outv64hi;
2097 break;
2098 case E_V64HFmode:
2099 sri->icode = CODE_FOR_reload_outv64hf;
2100 break;
2101 case E_V64QImode:
2102 sri->icode = CODE_FOR_reload_outv64qi;
2103 break;
2104 case E_V64DImode:
2105 sri->icode = CODE_FOR_reload_outv64di;
2106 break;
2107 case E_V64DFmode:
2108 sri->icode = CODE_FOR_reload_outv64df;
2109 break;
2110 default:
2111 gcc_unreachable ();
2112 }
2113 break;
2114 }
2115 /* Fallthrough. */
2116 case ADDR_SPACE_LDS:
2117 case ADDR_SPACE_GDS:
2118 case ADDR_SPACE_SCRATCH:
2119 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2120 break;
2121 }
2122 }
2123
2124 if (dump_file && (dump_flags & TDF_DETAILS))
2125 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2126 get_insn_name (sri->icode));
2127
2128 return result;
2129}
2130
2131/* Update register usage after having seen the compiler flags and kernel
2132 attributes. We typically want to fix registers that contain values
2133 set by the HSA runtime. */
2134
2135static void
2136gcn_conditional_register_usage (void)
2137{
342f9464
KCY
2138 if (!cfun || !cfun->machine)
2139 return;
5326695a 2140
342f9464
KCY
2141 if (cfun->machine->normal_function)
2142 {
2143 /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
f062c3f1 2144 for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
87fdbe69 2145 i <= LAST_SGPR_REG; i++)
342f9464 2146 fixed_regs[i] = 1, call_used_regs[i] = 1;
5326695a 2147
87fdbe69
KCY
2148 for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2149 i <= LAST_VGPR_REG; i++)
342f9464 2150 fixed_regs[i] = 1, call_used_regs[i] = 1;
5326695a 2151
5326695a
AS
2152 return;
2153 }
2154
342f9464
KCY
2155 /* If the set of requested args is the default set, nothing more needs to
2156 be done. */
2157 if (cfun->machine->args.requested == default_requested_args)
2158 return;
2159
2160 /* Requesting a set of args different from the default violates the ABI. */
2161 if (!leaf_function_p ())
2162 warning (0, "A non-default set of initial values has been requested, "
55308fc2 2163 "which violates the ABI");
342f9464
KCY
2164
2165 for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2166 fixed_regs[i] = 0;
2167
5326695a
AS
2168 /* Fix the runtime argument register containing values that may be
2169 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2170 needed after the prologue so there's no need to fix them. */
2171 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2172 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2173 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2174 {
342f9464
KCY
2175 /* The upper 32 bits of the 64-bit descriptor are not used, so allow
2176 the containing registers to be used for other purposes. */
5326695a
AS
2177 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2178 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
5326695a
AS
2179 }
2180 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2181 {
2182 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2183 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2184 }
2185 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2186 {
2187 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2188 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2189 }
2190 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2191 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2192 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2193 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2194 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2195 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2196 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2197 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
5326695a
AS
2198}
2199
2200/* Determine if a load or store is valid, according to the register classes
2201 and address space. Used primarily by the machine description to decide
2202 when to split a move into two steps. */
2203
2204bool
2205gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2206{
2207 if (!MEM_P (dest) && !MEM_P (src))
2208 return true;
2209
2210 if (MEM_P (dest)
2211 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2212 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2213 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2214 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2215 && gcn_vgpr_register_operand (src, mode))
2216 return true;
2217 else if (MEM_P (src)
2218 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2219 && (gcn_flat_address_p (XEXP (src, 0), mode)
2220 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2221 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2222 && gcn_vgpr_register_operand (dest, mode))
2223 return true;
2224
2225 if (MEM_P (dest)
2226 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2227 && (gcn_global_address_p (XEXP (dest, 0))
2228 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2229 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2230 && gcn_vgpr_register_operand (src, mode))
2231 return true;
2232 else if (MEM_P (src)
2233 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2234 && (gcn_global_address_p (XEXP (src, 0))
2235 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2236 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2237 && gcn_vgpr_register_operand (dest, mode))
2238 return true;
2239
2240 if (MEM_P (dest)
2241 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2242 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2243 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2244 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2245 && gcn_ssrc_register_operand (src, mode))
2246 return true;
2247 else if (MEM_P (src)
2248 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2249 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2250 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2251 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2252 && gcn_sdst_register_operand (dest, mode))
2253 return true;
2254
2255 if (MEM_P (dest)
2256 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2257 && gcn_ds_address_p (XEXP (dest, 0))
2258 && gcn_vgpr_register_operand (src, mode))
2259 return true;
2260 else if (MEM_P (src)
2261 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2262 && gcn_ds_address_p (XEXP (src, 0))
2263 && gcn_vgpr_register_operand (dest, mode))
2264 return true;
2265
2266 return false;
2267}
2268
2269/* }}} */
2270/* {{{ Functions and ABI. */
2271
2272/* Implement TARGET_FUNCTION_VALUE.
2273
2274 Define how to find the value returned by a function.
2275 The register location is always the same, but the mode depends on
2276 VALTYPE. */
2277
2278static rtx
2279gcn_function_value (const_tree valtype, const_tree, bool)
2280{
2281 machine_mode mode = TYPE_MODE (valtype);
2282
2283 if (INTEGRAL_TYPE_P (valtype)
2284 && GET_MODE_CLASS (mode) == MODE_INT
2285 && GET_MODE_SIZE (mode) < 4)
2286 mode = SImode;
2287
4e191462 2288 return gen_rtx_REG (mode, RETURN_VALUE_REG);
5326695a
AS
2289}
2290
2291/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2292
2293 Return true if N is a possible register number for the function return
2294 value. */
2295
2296static bool
2297gcn_function_value_regno_p (const unsigned int n)
2298{
2299 return n == RETURN_VALUE_REG;
2300}
2301
0ffef200
RS
2302/* Calculate the number of registers required to hold function argument
2303 ARG. */
5326695a
AS
2304
2305static int
0ffef200 2306num_arg_regs (const function_arg_info &arg)
5326695a 2307{
0ffef200 2308 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2309 return 0;
2310
0ffef200 2311 int size = arg.promoted_size_in_bytes ();
4e191462
AS
2312 int regsize = UNITS_PER_WORD * (VECTOR_MODE_P (arg.mode)
2313 ? GET_MODE_NUNITS (arg.mode) : 1);
2314 return (size + regsize - 1) / regsize;
5326695a
AS
2315}
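/* Illustrative sketch (added commentary, not part of the original source):
   num_arg_regs rounds the promoted argument size up to a whole number of
   registers, where a "register" is one word per vector lane for vector
   modes.  The hypothetical helper below (not part of GCC) shows the same
   round-up with plain integers: e.g. a 12-byte argument with 4-byte
   registers needs 3 registers.  */
#if 0
static int
example_num_arg_regs (int size_in_bytes, int bytes_per_reg)
{
  /* Round up to whole registers.  */
  return (size_in_bytes + bytes_per_reg - 1) / bytes_per_reg;
}
#endif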
2316
2317/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2318
2319 Return true if the location where a function argument is passed
2320 depends on whether or not it is a named argument.
2321
2322 For gcn, we know how to handle functions declared as stdarg: by
2323 passing an extra pointer to the unnamed arguments. However, the
2324 Fortran frontend can produce a different situation, where a
2325 function pointer is declared with no arguments, but the actual
2326 function and calls to it take more arguments. In that case, we
2327 want to ensure the call matches the definition of the function. */
2328
2329static bool
2330gcn_strict_argument_naming (cumulative_args_t cum_v)
2331{
2332 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2333
2334 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2335}
2336
2337/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2338
2339 See comment on gcn_strict_argument_naming. */
2340
2341static bool
2342gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2343{
2344 return !gcn_strict_argument_naming (cum_v);
2345}
2346
2347/* Implement TARGET_FUNCTION_ARG.
2348
2349 Return an RTX indicating whether a function argument is passed in a register
2350 and if so, which register. */
2351
2352static rtx
6783fdb7 2353gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2354{
2355 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2356 if (cum->normal_function)
2357 {
6783fdb7 2358 if (!arg.named || arg.end_marker_p ())
5326695a
AS
2359 return 0;
2360
0ffef200 2361 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2362 return 0;
2363
4e191462
AS
2364 int first_reg = (VECTOR_MODE_P (arg.mode)
2365 ? FIRST_VPARM_REG : FIRST_PARM_REG);
2366 int cum_num = (VECTOR_MODE_P (arg.mode)
2367 ? cum->vnum : cum->num);
2368 int reg_num = first_reg + cum_num;
0ffef200 2369 int num_regs = num_arg_regs (arg);
5326695a
AS
2370 if (num_regs > 0)
2371 while (reg_num % num_regs != 0)
2372 reg_num++;
4e191462 2373 if (reg_num + num_regs <= first_reg + NUM_PARM_REGS)
6783fdb7 2374 return gen_rtx_REG (arg.mode, reg_num);
5326695a
AS
2375 }
2376 else
2377 {
2378 if (cum->num >= cum->args.nargs)
2379 {
6783fdb7
RS
2380 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2381 & -(TYPE_ALIGN (arg.type) / 8);
5326695a
AS
2382 cfun->machine->kernarg_segment_alignment
2383 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
6783fdb7 2384 TYPE_ALIGN (arg.type) / 8);
5326695a
AS
2385 rtx addr = gen_rtx_REG (DImode,
2386 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2387 if (cum->offset)
2388 addr = gen_rtx_PLUS (DImode, addr,
2389 gen_int_mode (cum->offset, DImode));
6783fdb7
RS
2390 rtx mem = gen_rtx_MEM (arg.mode, addr);
2391 set_mem_attributes (mem, arg.type, 1);
5326695a
AS
2392 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2393 MEM_READONLY_P (mem) = 1;
2394 return mem;
2395 }
2396
2397 int a = cum->args.order[cum->num];
6783fdb7 2398 if (arg.mode != gcn_kernel_arg_types[a].mode)
5326695a
AS
2399 {
2400 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2401 return 0;
2402 }
2403 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2404 cum->args.reg[a]);
2405 }
2406 return 0;
2407}
2408
2409/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2410
2411 Updates the summarizer variable pointed to by CUM_V to advance past an
2412 argument in the argument list. */
2413
2414static void
6930c98c
RS
2415gcn_function_arg_advance (cumulative_args_t cum_v,
2416 const function_arg_info &arg)
5326695a
AS
2417{
2418 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2419
2420 if (cum->normal_function)
2421 {
6930c98c 2422 if (!arg.named)
5326695a
AS
2423 return;
2424
4e191462
AS
2425 int first_reg = (VECTOR_MODE_P (arg.mode)
2426 ? FIRST_VPARM_REG : FIRST_PARM_REG);
2427 int *cum_num = (VECTOR_MODE_P (arg.mode)
2428 ? &cum->vnum : &cum->num);
0ffef200 2429 int num_regs = num_arg_regs (arg);
5326695a 2430 if (num_regs > 0)
4e191462
AS
2431 while ((first_reg + *cum_num) % num_regs != 0)
2432 (*cum_num)++;
2433 *cum_num += num_regs;
5326695a
AS
2434 }
2435 else
2436 {
2437 if (cum->num < cum->args.nargs)
2438 cum->num++;
2439 else
2440 {
6930c98c 2441 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
5326695a
AS
2442 cfun->machine->kernarg_segment_byte_size = cum->offset;
2443 }
2444 }
2445}
2446
2447/* Implement TARGET_ARG_PARTIAL_BYTES.
2448
2449 Returns the number of bytes at the beginning of an argument that must be put
2450 in registers. The value must be zero for arguments that are passed entirely
2451 in registers or that are entirely pushed on the stack. */
2452
2453static int
a7c81bc1 2454gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2455{
2456 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2457
a7c81bc1 2458 if (!arg.named)
5326695a
AS
2459 return 0;
2460
0ffef200 2461 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2462 return 0;
2463
4e191462
AS
2464 int cum_num = (VECTOR_MODE_P (arg.mode) ? cum->vnum : cum->num);
2465 int regsize = UNITS_PER_WORD * (VECTOR_MODE_P (arg.mode)
2466 ? GET_MODE_NUNITS (arg.mode) : 1);
2467
2468 if (cum_num >= NUM_PARM_REGS)
5326695a
AS
2469 return 0;
2470
2471 /* If the argument fits entirely in registers, return 0. */
4e191462 2472 if (cum_num + num_arg_regs (arg) <= NUM_PARM_REGS)
5326695a
AS
2473 return 0;
2474
4e191462 2475 return (NUM_PARM_REGS - cum_num) * regsize;
5326695a
AS
2476}
2477
2478/* A normal function which takes a pointer argument (to a scalar) may be
2479 passed a pointer to LDS space (via a high-bits-set aperture), and that only
2480 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
2481 function has an incoming pointer-to-scalar parameter. */
2482
2483static void
2484gcn_detect_incoming_pointer_arg (tree fndecl)
2485{
2486 gcc_assert (cfun && cfun->machine);
2487
2488 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2489 arg;
2490 arg = TREE_CHAIN (arg))
2491 if (POINTER_TYPE_P (TREE_VALUE (arg))
2492 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2493 cfun->machine->use_flat_addressing = true;
2494}
2495
2496/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2497
2498 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2499 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2500
2501void
2502gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2503 tree fntype /* tree ptr for function decl */ ,
2504 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2505 tree fndecl, int caller)
2506{
2507 memset (cum, 0, sizeof (*cum));
2508 cum->fntype = fntype;
2509 if (libname)
2510 {
2511 gcc_assert (cfun && cfun->machine);
2512 cum->normal_function = true;
2513 if (!caller)
2514 {
2515 cfun->machine->normal_function = true;
2516 gcn_detect_incoming_pointer_arg (fndecl);
2517 }
2518 return;
2519 }
2520 tree attr = NULL;
2521 if (fndecl)
2522 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2523 if (fndecl && !attr)
2524 attr = lookup_attribute ("amdgpu_hsa_kernel",
2525 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2526 if (!attr && fntype)
2527 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2528 /* Handle main () as a kernel, so we can run the testsuite.
2529 Handle OpenACC kernels similarly to main. */
2530 if (!attr && !caller && fndecl
2531 && (MAIN_NAME_P (DECL_NAME (fndecl))
2532 || lookup_attribute ("omp target entrypoint",
2533 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2534 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2535 else
2536 {
2537 if (!attr || caller)
2538 {
2539 gcc_assert (cfun && cfun->machine);
2540 cum->normal_function = true;
2541 if (!caller)
2542 cfun->machine->normal_function = true;
2543 }
2544 gcn_parse_amdgpu_hsa_kernel_attribute
2545 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2546 }
2547 cfun->machine->args = cum->args;
2548 if (!caller && cfun->machine->normal_function)
2549 gcn_detect_incoming_pointer_arg (fndecl);
3ed8f692
KCY
2550
2551 reinit_regs ();
5326695a
AS
2552}
2553
2554static bool
2555gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2556{
2557 machine_mode mode = TYPE_MODE (type);
2558 HOST_WIDE_INT size = int_size_in_bytes (type);
2559
2560 if (AGGREGATE_TYPE_P (type))
2561 return true;
2562
2563 if (mode == BLKmode)
2564 return true;
2565
4e191462
AS
2566 if ((!VECTOR_TYPE_P (type) && size > 2 * UNITS_PER_WORD)
2567 || size > 2 * UNITS_PER_WORD * 64)
5326695a
AS
2568 return true;
2569
2570 return false;
2571}
2572
2573/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2574
2575 Return the mode to use for outgoing function arguments. */
2576
2577machine_mode
2578gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2579 int *ARG_UNUSED (punsignedp),
2580 const_tree ARG_UNUSED (funtype),
2581 int ARG_UNUSED (for_return))
2582{
2583 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2584 return SImode;
2585
2586 return mode;
2587}
2588
2589/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2590
2591 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2592 ARGS_GROW_DOWNWARDS. */
2593
2594static tree
2595gcn_gimplify_va_arg_expr (tree valist, tree type,
2596 gimple_seq *ARG_UNUSED (pre_p),
2597 gimple_seq *ARG_UNUSED (post_p))
2598{
2599 tree ptr = build_pointer_type (type);
2600 tree valist_type;
2601 tree t, u;
2602 bool indirect;
2603
fde65a89 2604 indirect = pass_va_arg_by_reference (type);
5326695a
AS
2605 if (indirect)
2606 {
2607 type = ptr;
2608 ptr = build_pointer_type (type);
2609 }
2610 valist_type = TREE_TYPE (valist);
2611
2612 /* Args grow down. Not handled by generic routines. */
2613
2614 u = fold_convert (sizetype, size_in_bytes (type));
2615 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2616 t = fold_build_pointer_plus (valist, u);
2617
2618 /* Align to 8 byte boundary. */
2619
2620 u = build_int_cst (TREE_TYPE (t), -8);
2621 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2622 t = fold_convert (valist_type, t);
2623
2624 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2625
2626 t = fold_convert (ptr, t);
2627 t = build_va_arg_indirect_ref (t);
2628
2629 if (indirect)
2630 t = build_va_arg_indirect_ref (t);
2631
2632 return t;
2633}
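/* Illustrative sketch (added commentary, not part of the original source):
   because arguments grow downwards here, va_arg first moves the arg
   pointer *down* by the argument size, rounds it down to an 8-byte
   boundary, and then reads the value at the new address.  A hypothetical
   scalar model of that pointer arithmetic (assuming <stddef.h> and
   <stdint.h>; the helper name is made up):  */
#if 0
static void *
example_va_arg_down (char **ap, size_t size)
{
  char *p = *ap - size;				/* args grow down */
  p = (char *) ((uintptr_t) p & ~(uintptr_t) 7);	/* align down to 8 */
  *ap = p;
  return p;
}
#endif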
2634
955cd057
TB
2635/* Return 1 if TRAIT NAME is present in the OpenMP context's
2636 device trait set, return 0 if not present in any OpenMP context in the
2637 whole translation unit, or -1 if not present in the current OpenMP context
2638 but might be present in another OpenMP context in the same TU. */
2639
2640int
2641gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2642 const char *name)
2643{
2644 switch (trait)
2645 {
2646 case omp_device_kind:
2647 return strcmp (name, "gpu") == 0;
2648 case omp_device_arch:
ebe5dace 2649 return strcmp (name, "amdgcn") == 0 || strcmp (name, "gcn") == 0;
955cd057 2650 case omp_device_isa:
955cd057
TB
2651 if (strcmp (name, "fiji") == 0)
2652 return gcn_arch == PROCESSOR_FIJI;
2653 if (strcmp (name, "gfx900") == 0)
f062c3f1 2654 return gcn_arch == PROCESSOR_VEGA10;
955cd057 2655 if (strcmp (name, "gfx906") == 0)
f062c3f1 2656 return gcn_arch == PROCESSOR_VEGA20;
3535402e
AS
2657 if (strcmp (name, "gfx908") == 0)
2658 return gcn_arch == PROCESSOR_GFX908;
cde52d3a
AS
2659 if (strcmp (name, "gfx90a") == 0)
2660 return gcn_arch == PROCESSOR_GFX90a;
955cd057
TB
2661 return 0;
2662 default:
2663 gcc_unreachable ();
2664 }
2665}
2666
5326695a
AS
2667/* Calculate stack offsets needed to create prologues and epilogues. */
2668
2669static struct machine_function *
2670gcn_compute_frame_offsets (void)
2671{
2672 machine_function *offsets = cfun->machine;
2673
2674 if (reload_completed)
2675 return offsets;
2676
2677 offsets->need_frame_pointer = frame_pointer_needed;
2678
2679 offsets->outgoing_args_size = crtl->outgoing_args_size;
2680 offsets->pretend_size = crtl->args.pretend_args_size;
2681
2682 offsets->local_vars = get_frame_size ();
2683
2684 offsets->lr_needs_saving = (!leaf_function_p ()
2685 || df_regs_ever_live_p (LR_REGNUM)
2686 || df_regs_ever_live_p (LR_REGNUM + 1));
2687
2688 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2689
2690 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
a365fa06 2691 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2692 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2693 && frame_pointer_needed))
2694 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2695
2696 /* Round up to 64-bit boundary to maintain stack alignment. */
2697 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2698
2699 return offsets;
2700}
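/* Illustrative sketch (added commentary, not part of the original source):
   the prologue later adjusts SP by pretend_size + callee_saves
   + local_vars + outgoing_args_size, where callee_saves counts 8 bytes
   for the link register when it needs saving, 4 bytes per saved scalar
   register, 256 bytes per saved vector register (64 lanes x 4 bytes),
   rounded up to 8 bytes.  A hypothetical model of that arithmetic:  */
#if 0
static int
example_frame_size (int pretend, int lr_saved, int n_sgprs, int n_vgprs,
		    int local_vars, int outgoing_args)
{
  int callee_saves = (lr_saved ? 8 : 0) + n_sgprs * 4 + n_vgprs * 256;
  callee_saves = (callee_saves + 7) & ~7;	/* keep 64-bit alignment */
  return pretend + callee_saves + local_vars + outgoing_args;
}
#endif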
2701
2702/* Insert code into the prologue or epilogue to store or load any
2703 callee-save register to/from the stack.
2704
2705 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2706
2707static void
2708move_callee_saved_registers (rtx sp, machine_function *offsets,
2709 bool prologue)
2710{
2711 int regno, offset, saved_scalars;
2712 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2713 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2714 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2715 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2716 HOST_WIDE_INT exec_set = 0;
2717 int offreg_set = 0;
251697a6 2718 auto_vec<int> saved_sgprs;
5326695a
AS
2719
2720 start_sequence ();
2721
2722 /* Move scalars into two vector registers. */
2723 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
a365fa06 2724 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2725 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2726 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2727 && offsets->need_frame_pointer))
2728 {
2729 rtx reg = gen_rtx_REG (SImode, regno);
2730 rtx vreg = gen_rtx_REG (V64SImode,
2731 VGPR_REGNO (6 + (saved_scalars / 64)));
2732 int lane = saved_scalars % 64;
2733
2734 if (prologue)
251697a6
HAQ
2735 {
2736 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2737 saved_sgprs.safe_push (regno);
2738 }
5326695a
AS
2739 else
2740 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2741
2742 saved_scalars++;
2743 }
2744
2745 rtx move_scalars = get_insns ();
2746 end_sequence ();
2747 start_sequence ();
2748
2749 /* Ensure that all vector lanes are moved. */
2750 exec_set = -1;
2751 emit_move_insn (exec, GEN_INT (exec_set));
2752
2753 /* Set up a vector stack pointer. */
2754 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2755 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2756 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2757 gcn_gen_undef (V64SImode), exec));
2758 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2759 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2760 exec));
2761 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2762 gcn_operand_part (V64SImode, vsp, 0),
2763 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2764 exec));
2765 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2766 gcn_operand_part (V64SImode, vsp, 1),
2767 const0_rtx, vcc, vcc,
2768 gcn_gen_undef (V64SImode), exec));
2769
2770 /* Move vectors. */
251697a6 2771 for (regno = FIRST_VGPR_REG, offset = 0;
5326695a 2772 regno < FIRST_PSEUDO_REGISTER; regno++)
a365fa06 2773 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2774 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2775 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2776 {
2777 rtx reg = gen_rtx_REG (V64SImode, regno);
2778 int size = 256;
2779
2780 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2781 size = saved_scalars * 4;
2782 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2783 size = (saved_scalars - 64) * 4;
2784
2785 if (size != 256 || exec_set != -1)
2786 {
2787 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2788 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2789 }
2790
2791 if (prologue)
251697a6
HAQ
2792 {
2793 rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
2794 (vsp, const0_rtx, reg, as, const0_rtx,
2795 exec));
2796
2797 /* Add CFI metadata. */
2798 rtx note;
2799 if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
2800 {
2801 int start = (regno == VGPR_REGNO (7) ? 64 : 0);
2802 int count = MIN (saved_scalars - start, 64);
2803 int add_lr = (regno == VGPR_REGNO (6)
58d50a5d 2804 && offsets->lr_needs_saving);
251697a6
HAQ
2805 int lrdest = -1;
2806 rtvec seq = rtvec_alloc (count + add_lr);
2807
2808 /* Add a REG_FRAME_RELATED_EXPR entry for each scalar
2809 register that was saved in this batch. */
2810 for (int idx = 0; idx < count; idx++)
2811 {
2812 int stackaddr = offset + idx * 4;
2813 rtx dest = gen_rtx_MEM (SImode,
2814 gen_rtx_PLUS
2815 (DImode, sp,
2816 GEN_INT (stackaddr)));
2817 rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
2818 rtx set = gen_rtx_SET (dest, src);
2819 RTX_FRAME_RELATED_P (set) = 1;
2820 RTVEC_ELT (seq, idx) = set;
2821
2822 if (saved_sgprs[start + idx] == LINK_REGNUM)
2823 lrdest = stackaddr;
2824 }
2825
2826 /* Add an additional expression for DWARF_LINK_REGISTER if
2827 LINK_REGNUM was saved. */
2828 if (lrdest != -1)
2829 {
2830 rtx dest = gen_rtx_MEM (DImode,
2831 gen_rtx_PLUS
2832 (DImode, sp,
2833 GEN_INT (lrdest)));
2834 rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
2835 rtx set = gen_rtx_SET (dest, src);
2836 RTX_FRAME_RELATED_P (set) = 1;
2837 RTVEC_ELT (seq, count) = set;
2838 }
2839
2840 note = gen_rtx_SEQUENCE (VOIDmode, seq);
2841 }
2842 else
2843 {
2844 rtx dest = gen_rtx_MEM (V64SImode,
2845 gen_rtx_PLUS (DImode, sp,
2846 GEN_INT (offset)));
2847 rtx src = gen_rtx_REG (V64SImode, regno);
2848 note = gen_rtx_SET (dest, src);
2849 }
2850 RTX_FRAME_RELATED_P (insn) = 1;
2851 add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
2852 }
5326695a
AS
2853 else
2854 emit_insn (gen_gatherv64si_insn_1offset_exec
2855 (reg, vsp, const0_rtx, as, const0_rtx,
2856 gcn_gen_undef (V64SImode), exec));
2857
2858 /* Move our VSP to the next stack entry. */
2859 if (offreg_set != size)
2860 {
2861 offreg_set = size;
2862 emit_move_insn (offreg, GEN_INT (size));
2863 }
2864 if (exec_set != -1)
2865 {
2866 exec_set = -1;
2867 emit_move_insn (exec, GEN_INT (exec_set));
2868 }
2869 emit_insn (gen_addv64si3_vcc_dup_exec
2870 (gcn_operand_part (V64SImode, vsp, 0),
2871 offreg, gcn_operand_part (V64SImode, vsp, 0),
2872 vcc, gcn_gen_undef (V64SImode), exec));
2873 emit_insn (gen_addcv64si3_exec
2874 (gcn_operand_part (V64SImode, vsp, 1),
2875 gcn_operand_part (V64SImode, vsp, 1),
2876 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2877
2878 offset += size;
2879 }
2880
2881 rtx move_vectors = get_insns ();
2882 end_sequence ();
2883
2884 if (prologue)
2885 {
2886 emit_insn (move_scalars);
2887 emit_insn (move_vectors);
2888 }
2889 else
2890 {
2891 emit_insn (move_vectors);
2892 emit_insn (move_scalars);
2893 }
2894}
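/* Illustrative sketch (added commentary, not part of the original source):
   when a vector register holds fewer than 64 saved scalars, only the
   occupied lanes are stored or loaded, by building an EXEC mask with the
   low size/4 bits set (4 bytes per lane); a full register keeps the
   all-ones mask.  A hypothetical model of that mask computation:  */
#if 0
static unsigned long long
example_partial_exec_mask (int bytes_in_reg)
{
  int lanes = bytes_in_reg / 4;
  return lanes >= 64 ? ~0ULL : (1ULL << lanes) - 1;
}
#endif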
2895
2896/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2897
2898 For a non-kernel function, the stack layout looks like this (interim),
2899 growing *upwards*:
2900
2901 hi | + ...
2902 |__________________| <-- current SP
2903 | outgoing args |
2904 |__________________|
2905 | (alloca space) |
2906 |__________________|
2907 | local vars |
2908 |__________________| <-- FP/hard FP
2909 | callee-save regs |
2910 |__________________| <-- soft arg pointer
2911 | pretend args |
2912 |__________________| <-- incoming SP
2913 | incoming args |
2914 lo |..................|
2915
2916 This implies arguments (beyond the first N in registers) must grow
2917 downwards (as, apparently, PA has them do).
2918
2919 For a kernel function we have the simpler:
2920
2921 hi | + ...
2922 |__________________| <-- current SP
2923 | outgoing args |
2924 |__________________|
2925 | (alloca space) |
2926 |__________________|
2927 | local vars |
2928 lo |__________________| <-- FP/hard FP
2929
2930*/
2931
2932void
2933gcn_expand_prologue ()
2934{
2935 machine_function *offsets = gcn_compute_frame_offsets ();
2936
2937 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2938 {
2939 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
eff23b79
AS
2940 rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
2941 rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
5326695a 2942 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
eff23b79
AS
2943 rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
2944 rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
5326695a
AS
2945
2946 start_sequence ();
2947
2948 if (offsets->pretend_size > 0)
2949 {
2950 /* FIXME: Do the actual saving of register pretend args to the stack.
2951 Register order needs consideration. */
2952 }
2953
2954 /* Save callee-save regs. */
2955 move_callee_saved_registers (sp, offsets, true);
2956
2957 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2958 + offsets->callee_saves
2959 + offsets->local_vars + offsets->outgoing_args_size;
2960 if (sp_adjust > 0)
eff23b79
AS
2961 {
2962 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2963 we use a split add explicitly, and specify the DImode add in
2964 the note. */
2965 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2966 rtx adjustment = gen_int_mode (sp_adjust, SImode);
2967 rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
2968 adjustment, scc));
22f201e4
HAQ
2969 if (!offsets->need_frame_pointer)
2970 {
2971 RTX_FRAME_RELATED_P (insn) = 1;
2972 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
2973 gen_rtx_SET (sp,
2974 gen_rtx_PLUS (DImode, sp,
2975 adjustment)));
2976 }
eff23b79
AS
2977 emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
2978 }
5326695a
AS
2979
2980 if (offsets->need_frame_pointer)
eff23b79
AS
2981 {
2982 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2983 we use a split add explicitly, and specify the DImode add in
2984 the note. */
2985 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2986 int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
2987 rtx adjustment = gen_int_mode (fp_adjust, SImode);
2988 rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo,
2989 adjustment, scc));
eff23b79
AS
2990 emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
2991 (fp_adjust < 0 ? GEN_INT (-1)
2992 : const0_rtx),
2993 scc, scc));
22f201e4
HAQ
2994
2995 /* Set the CFA to the entry stack address, as an offset from the
2996 frame pointer. This is preferred because the frame pointer is
2997 saved in each frame, whereas the stack pointer is not. */
2998 RTX_FRAME_RELATED_P (insn) = 1;
2999 add_reg_note (insn, REG_CFA_DEF_CFA,
3000 gen_rtx_PLUS (DImode, fp,
3001 GEN_INT (-(offsets->pretend_size
3002 + offsets->callee_saves))));
eff23b79 3003 }
5326695a
AS
3004
3005 rtx_insn *seq = get_insns ();
3006 end_sequence ();
3007
5326695a
AS
3008 emit_insn (seq);
3009 }
3010 else
3011 {
3012 rtx wave_offset = gen_rtx_REG (SImode,
3013 cfun->machine->args.
3014 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
3015
5326695a
AS
3016 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3017 {
3018 rtx fs_init_lo =
3019 gen_rtx_REG (SImode,
3020 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3021 rtx fs_init_hi =
3022 gen_rtx_REG (SImode,
3023 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3024 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3025 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3026
3027 /*rtx queue = gen_rtx_REG(DImode,
3028 cfun->machine->args.reg[QUEUE_PTR_ARG]);
3029 rtx aperture = gen_rtx_MEM (SImode,
3030 gen_rtx_PLUS (DImode, queue,
3031 gen_int_mode (68, SImode)));
3032 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3033
3034 /* Set up flat_scratch. */
3035 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3036 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3037 gen_int_mode (8, SImode)));
3038 emit_move_insn (fs_reg_lo, fs_init_hi);
3039 }
3040
3041 /* Set up frame pointer and stack pointer. */
3042 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
eff23b79
AS
3043 rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3044 rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
5326695a
AS
3045 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3046 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3047 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3048
3049 HOST_WIDE_INT sp_adjust = (offsets->local_vars
3050 + offsets->outgoing_args_size);
3051
3052 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
3053 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
3054 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
3055 gen_int_mode (0xffff, SImode)));
3258c2d6
AS
3056 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3057 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3058 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
5326695a 3059
eff23b79
AS
3060 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3061 a split add explicitly, and specify the DImode add in the note.
3062 The DWARF info expects that the callee-save data is in the frame,
3063 even though it isn't (because this is the entry point), so we
3064 make a notional adjustment to the DWARF frame offset here. */
3065 rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3066 DImode);
3067 rtx insn;
5326695a 3068 if (sp_adjust > 0)
eff23b79
AS
3069 {
3070 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3071 rtx adjustment = gen_int_mode (sp_adjust, DImode);
3072 insn = emit_insn (gen_addsi3_scalar_carry(sp_lo, fp_lo, adjustment,
3073 scc));
3074 emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3075 }
5326695a 3076 else
eff23b79
AS
3077 insn = emit_move_insn (sp, fp);
3078 RTX_FRAME_RELATED_P (insn) = 1;
3079 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3080 gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3081 dbg_adjustment)));
5326695a 3082
22f201e4
HAQ
3083 if (offsets->need_frame_pointer)
3084 {
3085 /* Set the CFA to the entry stack address, as an offset from the
3086 frame pointer. This is necessary when alloca is used, and
3087 harmless otherwise. */
3088 rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3089 add_reg_note (insn, REG_CFA_DEF_CFA,
3090 gen_rtx_PLUS (DImode, fp, neg_adjust));
3091 }
3092
5326695a
AS
3093 /* Make sure the flat scratch reg doesn't get optimised away. */
3094 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3095 }
3096
3097 /* Ensure that the scheduler doesn't do anything unexpected. */
3098 emit_insn (gen_blockage ());
3099
cde52d3a
AS
3100 if (TARGET_M0_LDS_LIMIT)
3101 {
3102 /* m0 is initialized for the usual LDS DS and FLAT memory case.
3103 The low-part is the address of the topmost addressable byte, which is
3104 size-1. The high-part is an offset and should be zero. */
3105 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3106 gen_int_mode (LDS_SIZE, SImode));
3107
3108 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3109 }
5326695a 3110
cde52d3a
AS
3111 if (TARGET_PACKED_WORK_ITEMS
3112 && cfun && cfun->machine && !cfun->machine->normal_function)
3113 {
3114 /* v0 contains the X, Y and Z dimensions all in one.
3115 Expand them out for ABI compatibility. */
3116 /* TODO: implement and use zero_extract. */
3117 rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
3118 emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
3119 gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
3120 emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
3121 emit_insn (gen_prologue_use (v1));
3122
3123 rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
3124 emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
3125 gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
3126 emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
3127 emit_insn (gen_prologue_use (v2));
3128 }
5326695a
AS
3129
3130 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3131 {
3132 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
3133 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3134 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3135 "gomp_gcn_enter_kernel"));
3136 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3137 }
3138}
3139
3140/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
3141
3142 See gcn_expand_prologue for stack details. */
3143
3144void
3145gcn_expand_epilogue (void)
3146{
3147 /* Ensure that the scheduler doesn't do anything unexpected. */
3148 emit_insn (gen_blockage ());
3149
3150 if (!cfun || !cfun->machine || cfun->machine->normal_function)
3151 {
3152 machine_function *offsets = gcn_compute_frame_offsets ();
3153 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3154 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3155
3156 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3157
3158 if (offsets->need_frame_pointer)
3159 {
3160 /* Restore old SP from the frame pointer. */
3161 if (sp_adjust > 0)
3162 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3163 else
3164 emit_move_insn (sp, fp);
3165 }
3166 else
3167 {
3168 /* Restore old SP from current SP. */
3169 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3170
3171 if (sp_adjust > 0)
3172 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3173 }
3174
3175 move_callee_saved_registers (sp, offsets, false);
3176
3177 /* There's no explicit use of the link register on the return insn. Emit
3178 one here instead. */
3179 if (offsets->lr_needs_saving)
3180 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3181
3182 /* Similar for frame pointer. */
3183 if (offsets->need_frame_pointer)
3184 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3185 }
3186 else if (flag_openmp)
3187 {
3188 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
3189 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3190 emit_move_insn (fn_reg,
3191 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3192 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3193 }
3194 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3195 {
3196 /* Assume that an exit value compatible with gcn-run is expected.
3197 That is, the third input parameter is an int*.
3198
3199 We can't allocate any new registers, but the kernarg_reg is
3200 dead after this, so we'll use that. */
3201 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3202 [KERNARG_SEGMENT_PTR_ARG]);
3203 rtx retptr_mem = gen_rtx_MEM (DImode,
3204 gen_rtx_PLUS (DImode, kernarg_reg,
3205 GEN_INT (16)));
3206 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3207 emit_move_insn (kernarg_reg, retptr_mem);
3208
3209 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
4e191462 3210 rtx scalar_retval = gen_rtx_REG (SImode, FIRST_PARM_REG);
5326695a 3211 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
4e191462
AS
3212 emit_move_insn (scalar_retval, gen_rtx_REG (SImode, RETURN_VALUE_REG));
3213 emit_move_insn (retval_mem, scalar_retval);
5326695a
AS
3214 }
3215
3216 emit_jump_insn (gen_gcn_return ());
3217}
3218
22f201e4
HAQ
3219/* Implement TARGET_FRAME_POINTER_REQUIRED.
3220
3221 Return true if the frame pointer should not be eliminated. */
3222
3223bool
3224gcn_frame_pointer_rqd (void)
3225{
3226 /* GDB needs the frame pointer in order to unwind properly,
3227 but that's not important for the entry point, unless alloca is used.
3228 It's not important for code execution, so we should respect the
3229 -fomit-frame-pointer flag. */
3230 return (!flag_omit_frame_pointer
3231 && cfun
3232 && (cfun->calls_alloca
3233 || (cfun->machine && cfun->machine->normal_function)));
3234}
3235
5326695a
AS
3236/* Implement TARGET_CAN_ELIMINATE.
3237
3238 Return true if the compiler is allowed to try to replace register number
3239 FROM_REG with register number TO_REG.
3240
3241 FIXME: is the default "true" not enough? Should this be a negative set? */
3242
3243bool
3244gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3245{
3246 return (to_reg == HARD_FRAME_POINTER_REGNUM
3247 || to_reg == STACK_POINTER_REGNUM);
3248}
3249
3250/* Implement INITIAL_ELIMINATION_OFFSET.
3251
3252 Returns the initial difference between the specified pair of registers, in
3253 terms of stack position. */
3254
3255HOST_WIDE_INT
3256gcn_initial_elimination_offset (int from, int to)
3257{
3258 machine_function *offsets = gcn_compute_frame_offsets ();
3259
3260 switch (from)
3261 {
3262 case ARG_POINTER_REGNUM:
3263 if (to == STACK_POINTER_REGNUM)
3264 return -(offsets->callee_saves + offsets->local_vars
3265 + offsets->outgoing_args_size);
3266 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3267 return -offsets->callee_saves;
3268 else
3269 gcc_unreachable ();
3270 break;
3271
3272 case FRAME_POINTER_REGNUM:
3273 if (to == STACK_POINTER_REGNUM)
3274 return -(offsets->local_vars + offsets->outgoing_args_size);
3275 else if (to == HARD_FRAME_POINTER_REGNUM)
3276 return 0;
3277 else
3278 gcc_unreachable ();
3279 break;
3280
3281 default:
3282 gcc_unreachable ();
3283 }
3284}
3285
3286/* Implement HARD_REGNO_RENAME_OK.
3287
3288 Return true if it is permissible to rename a hard register from
3289 FROM_REG to TO_REG. */
3290
3291bool
3292gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3293{
3294 if (from_reg == SCC_REG
3295 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3296 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3297 || to_reg == SCC_REG
3298 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3299 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3300 return false;
3301
3302 /* Allow the link register to be used if it was saved. */
3303 if ((to_reg & ~1) == LINK_REGNUM)
3304 return !cfun || cfun->machine->lr_needs_saving;
3305
3306 /* Allow the registers used for the static chain to be used if the chain is
3307 not in active use. */
3308 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3309 return !cfun
3310 || !(cfun->static_chain_decl
3311 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3312 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3313
3314 return true;
3315}
3316
3317/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3318
3319 Which mode is required for saving NREGS of a pseudo-register in
3320 call-clobbered hard register REGNO. */
3321
3322machine_mode
3323gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3324 machine_mode regmode)
3325{
737d6a1a 3326 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
5326695a
AS
3327
3328 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3329 result = (nregs == 1 ? SImode : DImode);
3330
3331 return result;
3332}
3333
3334/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3335
3336 Output assembler code for a block containing the constant parts
3337 of a trampoline, leaving space for the variable parts. */
3338
3339static void
3340gcn_asm_trampoline_template (FILE *f)
3341{
3342 /* The source operand of the move instructions must be a 32-bit
3343 constant following the opcode. */
3344 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3345 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3346 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3347 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3348 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3349}
3350
3351/* Implement TARGET_TRAMPOLINE_INIT.
3352
3353 Emit RTL insns to initialize the variable parts of a trampoline.
3354 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3355 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3356 to be passed to the target function. */
3357
3358static void
3359gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3360{
b7c28a47
AS
3361 if (TARGET_GCN5_PLUS)
3362 sorry ("nested function trampolines not supported on GCN5 due to"
3363 " non-executable stacks");
3364
5326695a
AS
3365 emit_block_move (m_tramp, assemble_trampoline_template (),
3366 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3367
3368 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3369 rtx chain_value_reg = copy_to_reg (chain_value);
3370 rtx fnaddr_reg = copy_to_reg (fnaddr);
3371
3372 for (int i = 0; i < 4; i++)
3373 {
3374 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3375 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3376 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3377 }
3378
3379 rtx tramp_addr = XEXP (m_tramp, 0);
3380 emit_insn (gen_clear_icache (tramp_addr,
3381 plus_constant (ptr_mode, tramp_addr,
3382 TRAMPOLINE_SIZE)));
3383}
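/* Illustrative sketch (added commentary, not part of the original source):
   the template emitted above is four s_mov_b32 instructions plus an
   s_setpc_b64; the init code patches the 32-bit literal that follows each
   s_mov_b32 opcode, at byte offsets 4, 12, 20 and 28: static chain lo/hi
   first, then the target address lo/hi.  A hypothetical byte-level model
   (assuming <stdint.h> and <string.h>; the helper name is made up):  */
#if 0
static void
example_patch_trampoline (unsigned char *tramp, uint64_t chain, uint64_t fn)
{
  uint32_t words[4] = { (uint32_t) chain, (uint32_t) (chain >> 32),
			(uint32_t) fn, (uint32_t) (fn >> 32) };
  for (int i = 0; i < 4; i++)
    memcpy (tramp + i * 8 + 4, &words[i], 4);	/* literal after opcode */
}
#endif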
3384
3385/* }}} */
3386/* {{{ Miscellaneous. */
3387
3388/* Implement TARGET_CANNOT_COPY_INSN_P.
3389
3390 Return true if INSN must not be duplicated. */
3391
3392static bool
3393gcn_cannot_copy_insn_p (rtx_insn *insn)
3394{
3395 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3396 return true;
3397
3398 return false;
3399}
3400
3401/* Implement TARGET_DEBUG_UNWIND_INFO.
3402
3403 Defines the mechanism that will be used for describing frame unwind
3404 information to the debugger. */
3405
3406static enum unwind_info_type
3407gcn_debug_unwind_info ()
3408{
251697a6 3409 return UI_DWARF2;
5326695a
AS
3410}
3411
3412/* Determine if there is a suitable hardware conversion instruction.
3413 Used primarily by the machine description. */
3414
3415bool
3416gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3417{
3418 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3419 return false;
3420
3421 if (VECTOR_MODE_P (from))
3422 {
3423 from = GET_MODE_INNER (from);
3424 to = GET_MODE_INNER (to);
3425 }
3426
3427 switch (op)
3428 {
3429 case fix_trunc_cvt:
3430 case fixuns_trunc_cvt:
3431 if (GET_MODE_CLASS (from) != MODE_FLOAT
3432 || GET_MODE_CLASS (to) != MODE_INT)
3433 return false;
3434 break;
3435 case float_cvt:
3436 case floatuns_cvt:
3437 if (GET_MODE_CLASS (from) != MODE_INT
3438 || GET_MODE_CLASS (to) != MODE_FLOAT)
3439 return false;
3440 break;
3441 case extend_cvt:
3442 if (GET_MODE_CLASS (from) != MODE_FLOAT
3443 || GET_MODE_CLASS (to) != MODE_FLOAT
3444 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3445 return false;
3446 break;
3447 case trunc_cvt:
3448 if (GET_MODE_CLASS (from) != MODE_FLOAT
3449 || GET_MODE_CLASS (to) != MODE_FLOAT
3450 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3451 return false;
3452 break;
3453 }
3454
3455 return ((to == HImode && from == HFmode)
3456 || (to == SImode && (from == SFmode || from == DFmode))
3457 || (to == HFmode && (from == HImode || from == SFmode))
3458 || (to == SFmode && (from == SImode || from == HFmode
3459 || from == DFmode))
3460 || (to == DFmode && (from == SImode || from == SFmode)));
3461}
3462
76d46331
KCY
3463/* Implement TARGET_EMUTLS_VAR_INIT.
3464
3465 Disable emutls (gthr-gcn.h does not support it, yet). */
3466
3467tree
3468gcn_emutls_var_init (tree, tree decl, tree)
3469{
3470 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
6349e4f0 3471 return NULL_TREE;
76d46331
KCY
3472}
3473
5326695a
AS
3474/* }}} */
3475/* {{{ Costs. */
3476
3477/* Implement TARGET_RTX_COSTS.
3478
3479 Compute a (partial) cost for rtx X. Return true if the complete
3480 cost has been computed, and false if subexpressions should be
3481 scanned. In either case, *TOTAL contains the cost result. */
3482
3483static bool
3484gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3485{
3486 enum rtx_code code = GET_CODE (x);
3487 switch (code)
3488 {
3489 case CONST:
3490 case CONST_DOUBLE:
3491 case CONST_VECTOR:
3492 case CONST_INT:
3493 if (gcn_inline_constant_p (x))
3494 *total = 0;
3495 else if (code == CONST_INT
3496 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3497 *total = 1;
3498 else if (gcn_constant_p (x))
3499 *total = 2;
3500 else
3501 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3502 return true;
3503
3504 case DIV:
3505 *total = 100;
3506 return false;
3507
3508 default:
3509 *total = 3;
3510 return false;
3511 }
3512}
3513
3514/* Implement TARGET_MEMORY_MOVE_COST.
3515
3516 Return the cost of moving data of mode M between a
3517 register and memory. A value of 2 is the default; this cost is
3518 relative to those in `REGISTER_MOVE_COST'.
3519
3520 This function is used extensively by register_move_cost, which is used to
3521 build tables at startup. Make it inline in this case.
3522 When IN is 2, return maximum of in and out move cost.
3523
3524 If moving between registers and memory is more expensive than
3525 between two registers, you should define this macro to express the
3526 relative cost.
3527
3528 Model also increased moving costs of QImode registers in non
3529 Q_REGS classes. */
3530
3531#define LOAD_COST 32
3532#define STORE_COST 32
3533static int
3534gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3535{
3536 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3537 switch (regclass)
3538 {
3539 case SCC_CONDITIONAL_REG:
3540 case VCCZ_CONDITIONAL_REG:
3541 case VCC_CONDITIONAL_REG:
3542 case EXECZ_CONDITIONAL_REG:
3543 case ALL_CONDITIONAL_REGS:
3544 case SGPR_REGS:
3545 case SGPR_EXEC_REGS:
3546 case EXEC_MASK_REG:
3547 case SGPR_VOP_SRC_REGS:
3548 case SGPR_MEM_SRC_REGS:
3549 case SGPR_SRC_REGS:
3550 case SGPR_DST_REGS:
3551 case GENERAL_REGS:
3552 case AFP_REGS:
3553 if (!in)
3554 return (STORE_COST + 2) * nregs;
3555 return LOAD_COST * nregs;
3556 case VGPR_REGS:
3557 if (in)
3558 return (LOAD_COST + 2) * nregs;
3559 return STORE_COST * nregs;
3560 case ALL_REGS:
3561 case ALL_GPR_REGS:
3562 case SRCDST_REGS:
3563 if (in)
3564 return (LOAD_COST + 2) * nregs;
3565 return (STORE_COST + 2) * nregs;
3566 default:
3567 gcc_unreachable ();
3568 }
3569}
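/* Illustrative sketch (added commentary, not part of the original source):
   the cost above scales with the number of 32-bit registers the mode
   occupies: e.g. a DImode value (8 bytes, so nregs == 2) loaded into an
   SGPR class costs LOAD_COST * 2 == 64, while storing it costs
   (STORE_COST + 2) * 2 == 68.  A hypothetical model of the scalar case
   (the helper name is made up):  */
#if 0
static int
example_sgpr_move_cost (int mode_size_bytes, int is_load)
{
  int nregs = (mode_size_bytes + 3) / 4;	/* CEIL (size, 4) */
  return is_load ? 32 * nregs : (32 + 2) * nregs;
}
#endif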
3570
3571/* Implement TARGET_REGISTER_MOVE_COST.
3572
3573 Return the cost of moving data from a register in class CLASS1 to
3574 one in class CLASS2. Base value is 2. */
3575
3576static int
3577gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3578{
3579 /* Increase cost of moving from and to vector registers. While this is
3580 fast in hardware (I think), it has the hidden cost of setting up the exec
3581 flags. */
3582 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3583 return 4;
3584 return 2;
3585}
3586
3587/* }}} */
3588/* {{{ Builtins. */
3589
3590/* Type codes used by GCN built-in definitions. */
3591
3592enum gcn_builtin_type_index
3593{
3594 GCN_BTI_END_OF_PARAMS,
3595
3596 GCN_BTI_VOID,
3597 GCN_BTI_BOOL,
3598 GCN_BTI_INT,
3599 GCN_BTI_UINT,
3600 GCN_BTI_SIZE_T,
3601 GCN_BTI_LLINT,
3602 GCN_BTI_LLUINT,
3603 GCN_BTI_EXEC,
3604
3605 GCN_BTI_SF,
3606 GCN_BTI_V64SI,
3607 GCN_BTI_V64SF,
3608 GCN_BTI_V64PTR,
3609 GCN_BTI_SIPTR,
3610 GCN_BTI_SFPTR,
3611 GCN_BTI_VOIDPTR,
3612
3613 GCN_BTI_LDS_VOIDPTR,
3614
3615 GCN_BTI_MAX
3616};
3617
3618static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3619
3620#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3621#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3622#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3623#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3624#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3625#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3626#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3627#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3628#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3629
3630static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3631 struct gcn_builtin_description *);
3632static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3633 struct gcn_builtin_description *);
3634
3635struct gcn_builtin_description;
3636typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3637 struct gcn_builtin_description *);
3638
3639enum gcn_builtin_type
3640{
3641 B_UNIMPLEMENTED, /* Sorry out */
3642 B_INSN, /* Emit a pattern */
3643 B_OVERLOAD /* Placeholder for an overloaded function */
3644};
3645
3646struct gcn_builtin_description
3647{
3648 int fcode;
3649 int icode;
3650 const char *name;
3651 enum gcn_builtin_type type;
3652 /* The first element of parm is always the return type. The rest
3653 are a zero-terminated list of parameters. */
3654 int parm[6];
3655 gcn_builtin_expander expander;
3656};
3657
3658/* Read in the GCN builtins from gcn-builtins.def. */
3659
3660extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3661
3662struct gcn_builtin_description gcn_builtins[] = {
3663#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3664 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3665
3666#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3667 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3668 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3669 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3670 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3671 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3672 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3673 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3674 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3675
3676#include "gcn-builtins.def"
3677#undef DEF_BUILTIN_BINOP_INT_FP
3678#undef DEF_BUILTIN
3679};
3680
3681static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3682
3683/* Implement TARGET_BUILTIN_DECL.
3684
3685 Return the GCN builtin for CODE. */
3686
3687tree
3688gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3689{
3690 if (code >= GCN_BUILTIN_MAX)
3691 return error_mark_node;
3692
3693 return gcn_builtin_decls[code];
3694}
3695
3696/* Helper function for gcn_init_builtins. */
3697
3698static void
3699gcn_init_builtin_types (void)
3700{
3701 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3702 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3703 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3704 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3705 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3706 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3707 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3708
3709 exec_type_node = unsigned_intDI_type_node;
3710 sf_type_node = float32_type_node;
3711 v64si_type_node = build_vector_type (intSI_type_node, 64);
3712 v64sf_type_node = build_vector_type (float_type_node, 64);
3713 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3714 /*build_pointer_type
3715 (integer_type_node) */
3716 , 64);
3717 tree tmp = build_distinct_type_copy (intSI_type_node);
3718 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3719 siptr_type_node = build_pointer_type (tmp);
3720
3721 tmp = build_distinct_type_copy (float_type_node);
3722 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3723 sfptr_type_node = build_pointer_type (tmp);
3724
3725 tmp = build_distinct_type_copy (void_type_node);
3726 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3727 voidptr_type_node = build_pointer_type (tmp);
3728
3729 tmp = build_distinct_type_copy (void_type_node);
3730 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3731 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3732}
3733
3734/* Implement TARGET_INIT_BUILTINS.
3735
3736 Set up all builtin functions for this target. */
3737
3738static void
3739gcn_init_builtins (void)
3740{
3741 gcn_init_builtin_types ();
3742
3743 struct gcn_builtin_description *d;
3744 unsigned int i;
3745 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3746 {
3747 tree p;
3748 char name[64]; /* build_function will make a copy. */
3749 int parm;
3750
3751 /* FIXME: Is this necessary/useful? */
3752 if (d->name == 0)
3753 continue;
3754
3755 /* Find last parm. */
3756 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3757 ;
3758
3759 p = void_list_node;
3760 while (parm > 1)
3761 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3762
3763 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3764
3765 sprintf (name, "__builtin_gcn_%s", d->name);
3766 gcn_builtin_decls[i]
3767 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3768
3769 /* These builtins don't throw. */
3770 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3771 }
3772
3773 /* These builtins need to take/return an LDS pointer: override the generic
3774 versions here. */
3775
3776 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3777 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3778
3779 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3780 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3781 false);
3782
3783 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3784 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3785 false);
3786
3787 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3788 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3789}
3790
3791/* Implement TARGET_INIT_LIBFUNCS. */
3792
3793static void
3794gcn_init_libfuncs (void)
3795{
 3796   /* BITS_PER_WORD * 2 is 64 bits, which causes
 3797      optabs-libfuncs.cc:gen_int_libfunc to omit TImode (i.e. 128-bit)
 3798      libcalls that we need to support operations on that type.  Initialise
 3799      them here instead.  */
3800 set_optab_libfunc (udiv_optab, TImode, "__udivti3");
3801 set_optab_libfunc (umod_optab, TImode, "__umodti3");
3802 set_optab_libfunc (sdiv_optab, TImode, "__divti3");
3803 set_optab_libfunc (smod_optab, TImode, "__modti3");
3804 set_optab_libfunc (smul_optab, TImode, "__multi3");
3805 set_optab_libfunc (addv_optab, TImode, "__addvti3");
3806 set_optab_libfunc (subv_optab, TImode, "__subvti3");
3807 set_optab_libfunc (negv_optab, TImode, "__negvti2");
3808 set_optab_libfunc (absv_optab, TImode, "__absvti2");
3809 set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
3810 set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
3811 set_optab_libfunc (clz_optab, TImode, "__clzti2");
3812 set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
3813 set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
3814 set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
3815 set_optab_libfunc (parity_optab, TImode, "__parityti2");
3816 set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
3817}
3818
3819/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3820 not require taking the address of any object, other than the memory
3821 cell being operated on.
3822
3823 Helper function for gcn_expand_builtin_1. */
3824
3825static rtx
3826gcn_expand_cmp_swap (tree exp, rtx target)
3827{
3828 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3829 addr_space_t as
3830 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3831 machine_mode as_mode = gcn_addr_space_address_mode (as);
3832
3833 if (!target)
3834 target = gen_reg_rtx (mode);
3835
3836 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3837 NULL_RTX, as_mode, EXPAND_NORMAL);
3838 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3839 NULL_RTX, mode, EXPAND_NORMAL);
3840 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3841 NULL_RTX, mode, EXPAND_NORMAL);
3842 rtx pat;
3843
3844 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3845 set_mem_addr_space (mem, as);
3846
3847 if (!REG_P (cmp))
3848 cmp = copy_to_mode_reg (mode, cmp);
3849 if (!REG_P (src))
3850 src = copy_to_mode_reg (mode, src);
3851
3852 if (mode == SImode)
3853 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3854 else
3855 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3856
3857 emit_insn (pat);
3858
3859 return target;
3860}
3861
3862/* Expand many different builtins.
3863
3864 Intended for use in gcn-builtins.def. */
3865
3866static rtx
3867gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3868 machine_mode /*mode */ , int ignore,
3869 struct gcn_builtin_description *)
3870{
3871 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
 3872   switch (DECL_MD_FUNCTION_CODE (fndecl))
3873 {
3874 case GCN_BUILTIN_FLAT_LOAD_INT32:
3875 {
3876 if (ignore)
3877 return target;
3878 /*rtx exec = */
3879 force_reg (DImode,
3880 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3881 EXPAND_NORMAL));
3882 /*rtx ptr = */
3883 force_reg (V64DImode,
3884 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3885 EXPAND_NORMAL));
3886 /*emit_insn (gen_vector_flat_loadv64si
3887 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3888 return target;
3889 }
3890 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3891 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3892 {
3893 if (ignore)
3894 return target;
3895 rtx exec = force_reg (DImode,
3896 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3897 DImode,
3898 EXPAND_NORMAL));
3899 rtx ptr = force_reg (DImode,
3900 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3901 V64DImode,
3902 EXPAND_NORMAL));
3903 rtx offsets = force_reg (V64SImode,
3904 expand_expr (CALL_EXPR_ARG (exp, 2),
3905 NULL_RTX, V64DImode,
3906 EXPAND_NORMAL));
3907 rtx addrs = gen_reg_rtx (V64DImode);
3908 rtx tmp = gen_reg_rtx (V64SImode);
3909 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3910 GEN_INT (2),
3911 gcn_gen_undef (V64SImode), exec));
3912 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3913 gcn_gen_undef (V64DImode),
3914 exec));
3915 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3916 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3917 /* FIXME: set attributes. */
3918 emit_insn (gen_mov_with_exec (target, mem, exec));
3919 return target;
3920 }
3921 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3922 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3923 {
3924 rtx exec = force_reg (DImode,
3925 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3926 DImode,
3927 EXPAND_NORMAL));
3928 rtx ptr = force_reg (DImode,
3929 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3930 V64DImode,
3931 EXPAND_NORMAL));
3932 rtx offsets = force_reg (V64SImode,
3933 expand_expr (CALL_EXPR_ARG (exp, 2),
3934 NULL_RTX, V64DImode,
3935 EXPAND_NORMAL));
3936 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3937 3)));
3938 rtx val = force_reg (vmode,
3939 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3940 vmode,
3941 EXPAND_NORMAL));
3942 rtx addrs = gen_reg_rtx (V64DImode);
3943 rtx tmp = gen_reg_rtx (V64SImode);
3944 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3945 GEN_INT (2),
3946 gcn_gen_undef (V64SImode), exec));
3947 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3948 gcn_gen_undef (V64DImode),
3949 exec));
3950 rtx mem = gen_rtx_MEM (vmode, addrs);
3951 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3952 /* FIXME: set attributes. */
3953 emit_insn (gen_mov_with_exec (mem, val, exec));
3954 return target;
3955 }
3956 case GCN_BUILTIN_SQRTVF:
3957 {
3958 if (ignore)
3959 return target;
3960 rtx exec = gcn_full_exec_reg ();
3961 rtx arg = force_reg (V64SFmode,
3962 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3963 V64SFmode,
3964 EXPAND_NORMAL));
3965 emit_insn (gen_sqrtv64sf2_exec
3966 (target, arg, gcn_gen_undef (V64SFmode), exec));
3967 return target;
3968 }
3969 case GCN_BUILTIN_SQRTF:
3970 {
3971 if (ignore)
3972 return target;
3973 rtx arg = force_reg (SFmode,
3974 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3975 SFmode,
3976 EXPAND_NORMAL));
3977 emit_insn (gen_sqrtsf2 (target, arg));
3978 return target;
3979 }
3980 case GCN_BUILTIN_OMP_DIM_SIZE:
3981 {
3982 if (ignore)
3983 return target;
3984 emit_insn (gen_oacc_dim_size (target,
3985 expand_expr (CALL_EXPR_ARG (exp, 0),
3986 NULL_RTX, SImode,
3987 EXPAND_NORMAL)));
3988 return target;
3989 }
3990 case GCN_BUILTIN_OMP_DIM_POS:
3991 {
3992 if (ignore)
3993 return target;
3994 emit_insn (gen_oacc_dim_pos (target,
3995 expand_expr (CALL_EXPR_ARG (exp, 0),
3996 NULL_RTX, SImode,
3997 EXPAND_NORMAL)));
3998 return target;
3999 }
4000 case GCN_BUILTIN_CMP_SWAP:
4001 case GCN_BUILTIN_CMP_SWAPLL:
4002 return gcn_expand_cmp_swap (exp, target);
4003
4004 case GCN_BUILTIN_ACC_SINGLE_START:
4005 {
4006 if (ignore)
4007 return target;
4008
4009 rtx wavefront = gcn_oacc_dim_pos (1);
4010 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
4011 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
4012 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
4013 return cc;
4014 }
4015
4016 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
4017 {
4018 rtx blk = force_reg (SImode,
4019 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4020 SImode, EXPAND_NORMAL));
4021 rtx wavefront = gcn_oacc_dim_pos (1);
4022 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
4023 rtx not_zero = gen_label_rtx ();
4024 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
4025 emit_move_insn (blk, const0_rtx);
4026 emit_label (not_zero);
4027 return blk;
4028 }
4029
4030 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
4031 return target;
4032
4033 case GCN_BUILTIN_ACC_BARRIER:
4034 emit_insn (gen_gcn_wavefront_barrier ());
4035 return target;
4036
4037 default:
4038 gcc_unreachable ();
4039 }
4040}
4041
4042/* Expansion of simple arithmetic and bit binary operation builtins.
4043
4044 Intended for use with gcn_builtins table. */
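/* Note: argument 0 of these builtins is the EXEC mask, arguments 1 and 2 are
   the operands, and an optional argument 3 supplies the value retained in
   lanes the mask switches off.  */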
4045
4046static rtx
4047gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
4048 machine_mode /*mode */ , int ignore,
4049 struct gcn_builtin_description *d)
4050{
4051 int icode = d->icode;
4052 if (ignore)
4053 return target;
4054
4055 rtx exec = force_reg (DImode,
4056 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4057 EXPAND_NORMAL));
4058
4059 machine_mode m1 = insn_data[icode].operand[1].mode;
4060 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
4061 EXPAND_NORMAL);
4062 if (!insn_data[icode].operand[1].predicate (arg1, m1))
4063 arg1 = force_reg (m1, arg1);
4064
4065 machine_mode m2 = insn_data[icode].operand[2].mode;
4066 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
4067 EXPAND_NORMAL);
4068 if (!insn_data[icode].operand[2].predicate (arg2, m2))
4069 arg2 = force_reg (m2, arg2);
4070
4071 rtx arg_prev;
4072 if (call_expr_nargs (exp) == 4)
4073 {
4074 machine_mode m_prev = insn_data[icode].operand[4].mode;
4075 arg_prev = force_reg (m_prev,
4076 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4077 m_prev, EXPAND_NORMAL));
4078 }
4079 else
4080 arg_prev = gcn_gen_undef (GET_MODE (target));
4081
4082 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
4083 emit_insn (pat);
4084 return target;
4085}
4086
4087/* Implement TARGET_EXPAND_BUILTIN.
4088
4089 Expand an expression EXP that calls a built-in function, with result going
4090 to TARGET if that's convenient (and in mode MODE if that's convenient).
4091 SUBTARGET may be used as the target for computing one of EXP's operands.
4092 IGNORE is nonzero if the value is to be ignored. */
4093
4094rtx
4095gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
4096 int ignore)
4097{
4098 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
 4099   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
4100 struct gcn_builtin_description *d;
4101
4102 gcc_assert (fcode < GCN_BUILTIN_MAX);
4103 d = &gcn_builtins[fcode];
4104
4105 if (d->type == B_UNIMPLEMENTED)
4106 sorry ("Builtin not implemented");
4107
4108 return d->expander (exp, target, subtarget, mode, ignore, d);
4109}
4110
4111/* }}} */
4112/* {{{ Vectorization. */
4113
4114/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
4115
4116 A vector mask is a value that holds one boolean result for every element in
4117 a vector. */
4118
4119opt_machine_mode
 4120 gcn_vectorize_get_mask_mode (machine_mode)
4121{
4122 /* GCN uses a DImode bit-mask. */
4123 return DImode;
4124}
4125
4126/* Return an RTX that references a vector with the i-th lane containing
4127 PERM[i]*4.
4128
4129 Helper function for gcn_vectorize_vec_perm_const. */
4130
4131static rtx
4132gcn_make_vec_perm_address (unsigned int *perm)
4133{
4134 rtx x = gen_reg_rtx (V64SImode);
4135 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
4136
4137 /* Permutation addresses use byte addressing. With each vector lane being
4138 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
4139 so only set those.
4140
 4141      The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
4142 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
4143 only take one source vector, the most-significant bit can be ignored
4144 here. Instead, we can use EXEC masking to select the relevant part of
4145 each source vector after they are permuted separately. */
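  /* Worked example (added for clarity): if perm[j] == 5 then the byte address
     is 5*4 == 20 == 0b10100, so lane j accumulates 4 (bit 2) and 16 (bit 4)
     over the iterations below.  */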
4146 uint64_t bit_mask = 1 << 2;
4147 for (int i = 2; i < 8; i++, bit_mask <<= 1)
4148 {
4149 uint64_t exec_mask = 0;
4150 uint64_t lane_mask = 1;
4151 for (int j = 0; j < 64; j++, lane_mask <<= 1)
4152 if ((perm[j] * 4) & bit_mask)
4153 exec_mask |= lane_mask;
4154
4155 if (exec_mask)
4156 emit_insn (gen_addv64si3_exec (x, x,
4157 gcn_vec_constant (V64SImode,
4158 bit_mask),
4159 x, get_exec (exec_mask)));
4160 }
4161
4162 return x;
4163}
4164
4165/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
4166
4167 Return true if permutation with SEL is possible.
4168
4169 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
4170 permutations. */
4171
4172static bool
4173gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
4174 rtx dst, rtx src0, rtx src1,
4175 const vec_perm_indices & sel)
4176{
4177 if (vmode != op_mode)
4178 return false;
4179
4180 unsigned int nelt = GET_MODE_NUNITS (vmode);
4181
4182 gcc_assert (VECTOR_MODE_P (vmode));
4183 gcc_assert (nelt <= 64);
4184 gcc_assert (sel.length () == nelt);
4185
4186 if (!dst)
4187 {
4188 /* All vector permutations are possible on this architecture,
4189 with varying degrees of efficiency depending on the permutation. */
4190 return true;
4191 }
4192
4193 unsigned int perm[64];
4194 for (unsigned int i = 0; i < nelt; ++i)
4195 perm[i] = sel[i] & (2 * nelt - 1);
4196 for (unsigned int i = nelt; i < 64; ++i)
4197 perm[i] = 0;
 4198
4199 src0 = force_reg (vmode, src0);
4200 src1 = force_reg (vmode, src1);
4201
4202 /* Make life a bit easier by swapping operands if necessary so that
4203 the first element always comes from src0. */
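  /* For example, with nelt == 64 a selector beginning {64, 1, ...} becomes
     {0, 65, ...} once src0 and src1 have been swapped below.  */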
4204 if (perm[0] >= nelt)
4205 {
 4206       std::swap (src0, src1);
4207
4208 for (unsigned int i = 0; i < nelt; ++i)
4209 if (perm[i] < nelt)
4210 perm[i] += nelt;
4211 else
4212 perm[i] -= nelt;
4213 }
4214
4215 /* TODO: There are more efficient ways to implement certain permutations
4216 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
4217 this more inefficient generic approach is used. */
4218
4219 int64_t src1_lanes = 0;
4220 int64_t lane_bit = 1;
4221
4222 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
4223 {
4224 /* Set the bits for lanes from src1. */
4225 if (perm[i] >= nelt)
4226 src1_lanes |= lane_bit;
4227 }
4228
4229 rtx addr = gcn_make_vec_perm_address (perm);
4230 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
4231
4232 switch (vmode)
4233 {
4234 case E_V64QImode:
4235 ds_bpermute = gen_ds_bpermutev64qi;
4236 break;
4237 case E_V64HImode:
4238 ds_bpermute = gen_ds_bpermutev64hi;
4239 break;
4240 case E_V64SImode:
4241 ds_bpermute = gen_ds_bpermutev64si;
4242 break;
4243 case E_V64HFmode:
4244 ds_bpermute = gen_ds_bpermutev64hf;
4245 break;
4246 case E_V64SFmode:
4247 ds_bpermute = gen_ds_bpermutev64sf;
4248 break;
4249 case E_V64DImode:
4250 ds_bpermute = gen_ds_bpermutev64di;
4251 break;
4252 case E_V64DFmode:
4253 ds_bpermute = gen_ds_bpermutev64df;
4254 break;
4255 default:
4256 gcc_assert (false);
4257 }
4258
4259 /* Load elements from src0 to dst. */
4260 gcc_assert (~src1_lanes);
4261 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
4262
4263 /* Load elements from src1 to dst. */
4264 if (src1_lanes)
4265 {
4266 /* Masking a lane masks both the destination and source lanes for
4267 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
4268 then add an extra masked move to merge the results of permuting
4269 the two source vectors together.
4270 */
4271 rtx tmp = gen_reg_rtx (vmode);
4272 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
4273 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
4274 }
4275
4276 return true;
4277}
4278
4279/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
4280
4281 Return nonzero if vector MODE is supported with at least move
4282 instructions. */
4283
4284static bool
4285gcn_vector_mode_supported_p (machine_mode mode)
4286{
4287 return (mode == V64QImode || mode == V64HImode
4288 || mode == V64SImode || mode == V64DImode
4289 || mode == V64SFmode || mode == V64DFmode);
4290}
4291
4292/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4293
4294 Enables autovectorization for all supported modes. */
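/* For instance, SImode maps to V64SImode (a 256-byte vector) and DFmode to
   V64DFmode (512 bytes); the lane count is always 64.  */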
4295
4296static machine_mode
4297gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4298{
4299 switch (mode)
4300 {
4301 case E_QImode:
4302 return V64QImode;
4303 case E_HImode:
4304 return V64HImode;
4305 case E_SImode:
4306 return V64SImode;
4307 case E_DImode:
4308 return V64DImode;
4309 case E_SFmode:
4310 return V64SFmode;
4311 case E_DFmode:
4312 return V64DFmode;
4313 default:
4314 return word_mode;
4315 }
4316}
4317
4318/* Implement TARGET_VECTORIZE_RELATED_MODE.
4319
4320 All GCN vectors are 64-lane, so this is simpler than other architectures.
4321 In particular, we do *not* want to match vector bit-size. */
4322
4323static opt_machine_mode
4324gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
4325 scalar_mode element_mode, poly_uint64 nunits)
4326{
4327 if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4328 return VOIDmode;
4329
4330 machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4331 if (!VECTOR_MODE_P (pref_mode))
4332 return VOIDmode;
4333
4334 return pref_mode;
4335}
4336
4337/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4338
4339 Returns the preferred alignment in bits for accesses to vectors of type type
4340 in vectorized code. This might be less than or greater than the ABI-defined
4341 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4342 of a single element, in which case the vectorizer will not try to optimize
4343 for alignment. */
4344
4345static poly_uint64
4346gcn_preferred_vector_alignment (const_tree type)
4347{
4348 return TYPE_ALIGN (TREE_TYPE (type));
4349}
4350
4351/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4352
4353 Return true if the target supports misaligned vector store/load of a
4354 specific factor denoted in the misalignment parameter. */
4355
4356static bool
4357gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4358 const_tree type, int misalignment,
4359 bool is_packed)
4360{
4361 if (is_packed)
4362 return false;
4363
4364 /* If the misalignment is unknown, we should be able to handle the access
4365 so long as it is not to a member of a packed data structure. */
4366 if (misalignment == -1)
4367 return true;
4368
4369 /* Return true if the misalignment is a multiple of the natural alignment
4370 of the vector's element type. This is probably always going to be
4371 true in practice, since we've already established that this isn't a
4372 packed access. */
4373 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4374}
4375
4376/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4377
4378 Return true if vector alignment is reachable (by peeling N iterations) for
4379 the given scalar type TYPE. */
4380
4381static bool
4382gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4383{
4384 /* Vectors which aren't in packed structures will not be less aligned than
4385 the natural alignment of their element type, so this is safe. */
4386 return !is_packed;
4387}
4388
4389/* Generate DPP instructions used for vector reductions.
4390
4391 The opcode is given by INSN.
4392 The first operand of the operation is shifted right by SHIFT vector lanes.
4393 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4394 broadcast the next row (thereby acting like a shift of 16 for the end of
4395 each row). If SHIFT is 32, lane 31 is broadcast to all the
4396 following lanes (thereby acting like a shift of 32 for lane 63). */
4397
4398char *
4399gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4400 int unspec, int shift)
4401{
 4402   static char buf[128];
4403 const char *dpp;
4404 const char *vcc_in = "";
4405 const char *vcc_out = "";
4406
4407 /* Add the vcc operand if needed. */
4408 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4409 {
4410 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4411 vcc_in = ", vcc";
4412
4413 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4414 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4415 vcc_out = ", vcc";
4416 }
4417
4418 /* Add the DPP modifiers. */
4419 switch (shift)
4420 {
4421 case 1:
4422 dpp = "row_shr:1 bound_ctrl:0";
4423 break;
4424 case 2:
4425 dpp = "row_shr:2 bound_ctrl:0";
4426 break;
4427 case 4:
4428 dpp = "row_shr:4 bank_mask:0xe";
4429 break;
4430 case 8:
4431 dpp = "row_shr:8 bank_mask:0xc";
4432 break;
4433 case 16:
4434 dpp = "row_bcast:15 row_mask:0xa";
4435 break;
4436 case 32:
4437 dpp = "row_bcast:31 row_mask:0xc";
4438 break;
4439 default:
4440 gcc_unreachable ();
4441 }
4442
4443 if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4444 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4445 insn, dpp, insn, dpp);
4446 else if (unspec == UNSPEC_MOV_DPP_SHR)
4447 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4448 else
4449 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
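      /* Illustrative only (the mnemonic is hypothetical): a carry-out add
	 with SHIFT == 1 would give
	 "v_add_co_u32\t%0, vcc, %1, %2 row_shr:1 bound_ctrl:0".  */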
4450
4451 return buf;
4452}
4453
4454/* Generate vector reductions in terms of DPP instructions.
4455
4456 The vector register SRC of mode MODE is reduced using the operation given
4457 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4458 register. */
4459
4460rtx
4461gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4462{
4463 machine_mode orig_mode = mode;
4464 bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4465 || unspec == UNSPEC_SMAX_DPP_SHR
4466 || unspec == UNSPEC_UMIN_DPP_SHR
4467 || unspec == UNSPEC_UMAX_DPP_SHR)
4468 && (mode == V64DImode
4469 || mode == V64DFmode))
4470 || (unspec == UNSPEC_PLUS_DPP_SHR
4471 && mode == V64DFmode));
4472 rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4473 : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4474 : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4475 : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4476 : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4477 : UNKNOWN);
4478 bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4479 || unspec == UNSPEC_SMAX_DPP_SHR
4480 || unspec == UNSPEC_UMIN_DPP_SHR
4481 || unspec == UNSPEC_UMAX_DPP_SHR)
4482 && (mode == V64QImode
4483 || mode == V64HImode));
4484 bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4485 || unspec == UNSPEC_UMAX_DPP_SHR);
4486 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4487 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4488 && (TARGET_GCN3 || mode == V64DImode);
4489
4490 if (use_plus_carry)
4491 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4492
4493 if (use_extends)
4494 {
4495 rtx tmp = gen_reg_rtx (V64SImode);
4496 convert_move (tmp, src, unsignedp);
4497 src = tmp;
4498 mode = V64SImode;
4499 }
4500
4501 /* Perform reduction by first performing the reduction operation on every
4502 pair of lanes, then on every pair of results from the previous
4503 iteration (thereby effectively reducing every 4 lanes) and so on until
4504 all lanes are reduced. */
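  /* Concretely, six steps with shift = 1, 2, 4, 8, 16 and 32 combine all 64
     lanes; the final scalar value ends up in lane 63.  */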
 4505   rtx in, out = force_reg (mode, src);
4506 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4507 {
4508 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4509 in = out;
4510 out = gen_reg_rtx (mode);
4511
4512 if (use_moves)
 4513 	{
4514 rtx tmp = gen_reg_rtx (mode);
4515 emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4516 emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
 4517 	}
4518 else
4519 {
4520 rtx insn = gen_rtx_SET (out,
4521 gen_rtx_UNSPEC (mode,
4522 gen_rtvec (3, in, in,
4523 shift_val),
4524 unspec));
4525
4526 /* Add clobber for instructions that set the carry flags. */
4527 if (use_plus_carry)
4528 {
4529 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4530 gen_rtx_REG (DImode, VCC_REG));
4531 insn = gen_rtx_PARALLEL (VOIDmode,
4532 gen_rtvec (2, insn, clobber));
4533 }
 4534
4535 emit_insn (insn);
4536 }
4537 }
 4538
4539 if (use_extends)
4540 {
4541 rtx tmp = gen_reg_rtx (orig_mode);
4542 convert_move (tmp, out, unsignedp);
4543 out = tmp;
4544 }
4545
 4546   return out;
4547}
4548
4549/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4550
4551int
4552gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4553 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4554{
4555 /* Always vectorize. */
4556 return 1;
4557}
4558
4559/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
4560
4561static int
4562gcn_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *ARG_UNUSED (node),
4563 struct cgraph_simd_clone *clonei,
4564 tree base_type,
4565 int ARG_UNUSED (num))
4566{
4567 unsigned int elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
4568
4569 if (known_eq (clonei->simdlen, 0U))
4570 clonei->simdlen = 64;
4571 else if (maybe_ne (clonei->simdlen, 64U))
4572 {
4573 /* Note that x86 has a similar message that is likely to trigger on
4574 sizes that are OK for gcn; the user can't win. */
4575 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
4576 "unsupported simdlen %wd (amdgcn)",
4577 clonei->simdlen.to_constant ());
4578 return 0;
4579 }
4580
4581 clonei->vecsize_mangle = 'n';
4582 clonei->vecsize_int = 0;
4583 clonei->vecsize_float = 0;
4584
4585 /* DImode ought to be more natural here, but VOIDmode produces better code,
4586 at present, due to the shift-and-test steps not being optimized away
4587 inside the in-branch clones. */
4588 clonei->mask_mode = VOIDmode;
4589
4590 return 1;
4591}
4592
4593/* Implement TARGET_SIMD_CLONE_ADJUST. */
4594
4595static void
4596gcn_simd_clone_adjust (struct cgraph_node *ARG_UNUSED (node))
4597{
4598 /* This hook has to be defined when
4599 TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN is defined, but we don't
4600 need it to do anything yet. */
4601}
4602
4603/* Implement TARGET_SIMD_CLONE_USABLE. */
4604
4605static int
4606gcn_simd_clone_usable (struct cgraph_node *ARG_UNUSED (node))
4607{
4608 /* We don't need to do anything here because
4609 gcn_simd_clone_compute_vecsize_and_simdlen currently only returns one
4610 possibility. */
4611 return 0;
4612}
4613
4614/* }}} */
4615/* {{{ md_reorg pass. */
4616
4617/* Identify VMEM instructions from their "type" attribute. */
4618
4619static bool
4620gcn_vmem_insn_p (attr_type type)
4621{
4622 switch (type)
4623 {
4624 case TYPE_MUBUF:
4625 case TYPE_MTBUF:
4626 case TYPE_FLAT:
4627 return true;
4628 case TYPE_UNKNOWN:
4629 case TYPE_SOP1:
4630 case TYPE_SOP2:
4631 case TYPE_SOPK:
4632 case TYPE_SOPC:
4633 case TYPE_SOPP:
4634 case TYPE_SMEM:
4635 case TYPE_DS:
4636 case TYPE_VOP2:
4637 case TYPE_VOP1:
4638 case TYPE_VOPC:
4639 case TYPE_VOP3A:
4640 case TYPE_VOP3B:
4641 case TYPE_VOP_SDWA:
4642 case TYPE_VOP_DPP:
4643 case TYPE_MULT:
4644 case TYPE_VMULT:
4645 return false;
4646 }
4647 gcc_unreachable ();
4648 return false;
4649}
4650
4651/* If INSN sets the EXEC register to a constant value, return the value,
4652 otherwise return zero. */
4653
4654static int64_t
4655gcn_insn_exec_value (rtx_insn *insn)
4656{
4657 if (!NONDEBUG_INSN_P (insn))
4658 return 0;
4659
4660 rtx pattern = PATTERN (insn);
4661
4662 if (GET_CODE (pattern) == SET)
4663 {
4664 rtx dest = XEXP (pattern, 0);
4665 rtx src = XEXP (pattern, 1);
4666
4667 if (GET_MODE (dest) == DImode
4668 && REG_P (dest) && REGNO (dest) == EXEC_REG
4669 && CONST_INT_P (src))
4670 return INTVAL (src);
4671 }
4672
4673 return 0;
4674}
4675
4676/* Sets the EXEC register before INSN to the value that it had after
4677 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4678 known, otherwise it returns zero. */
4679
4680static int64_t
4681gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4682 bool curr_exec_known, bool &last_exec_def_saved)
4683{
4684 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4685 rtx exec;
4686
4687 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4688
4689 if (exec_value)
4690 {
4691 /* If the EXEC value is a constant and it happens to be the same as the
4692 current EXEC value, the restore can be skipped. */
4693 if (curr_exec_known && exec_value == curr_exec)
4694 return exec_value;
4695
4696 exec = GEN_INT (exec_value);
4697 }
4698 else
4699 {
4700 /* If the EXEC value is not a constant, save it in a register after the
4701 point of definition. */
4702 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4703
4704 if (!last_exec_def_saved)
4705 {
4706 start_sequence ();
4707 emit_move_insn (exec_save_reg, exec_reg);
4708 rtx_insn *seq = get_insns ();
4709 end_sequence ();
4710
4711 emit_insn_after (seq, last_exec_def);
4712 if (dump_file && (dump_flags & TDF_DETAILS))
4713 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4714 INSN_UID (last_exec_def));
4715
4716 last_exec_def_saved = true;
4717 }
4718
4719 exec = exec_save_reg;
4720 }
4721
4722 /* Restore EXEC register before the usage. */
4723 start_sequence ();
4724 emit_move_insn (exec_reg, exec);
4725 rtx_insn *seq = get_insns ();
4726 end_sequence ();
4727 emit_insn_before (seq, insn);
4728
4729 if (dump_file && (dump_flags & TDF_DETAILS))
4730 {
4731 if (exec_value)
4732 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4733 exec_value, INSN_UID (insn));
4734 else
4735 fprintf (dump_file,
4736 "Restoring EXEC from saved value before insn %d.\n",
4737 INSN_UID (insn));
4738 }
4739
4740 return exec_value;
4741}
4742
4743/* Implement TARGET_MACHINE_DEPENDENT_REORG.
4744
4745 Ensure that pipeline dependencies and lane masking are set correctly. */
4746
4747static void
4748gcn_md_reorg (void)
4749{
4750 basic_block bb;
4751 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4752 regset_head live;
4753
4754 INIT_REG_SET (&live);
4755
4756 compute_bb_for_insn ();
4757
4758 if (!optimize)
4759 {
4760 split_all_insns ();
4761 if (dump_file && (dump_flags & TDF_DETAILS))
4762 {
4763 fprintf (dump_file, "After split:\n");
4764 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4765 }
4766
4767 /* Update data-flow information for split instructions. */
4768 df_insn_rescan_all ();
4769 }
4770
4771 df_live_add_problem ();
4772 df_live_set_all_dirty ();
4773 df_analyze ();
4774
4775 /* This pass ensures that the EXEC register is set correctly, according
4776 to the "exec" attribute. However, care must be taken so that the
4777 value that reaches explicit uses of the EXEC register remains the
4778 same as before.
4779 */
4780
4781 FOR_EACH_BB_FN (bb, cfun)
4782 {
4783 if (dump_file && (dump_flags & TDF_DETAILS))
4784 fprintf (dump_file, "BB %d:\n", bb->index);
4785
4786 rtx_insn *insn, *curr;
4787 rtx_insn *last_exec_def = BB_HEAD (bb);
4788 bool last_exec_def_saved = false;
4789 bool curr_exec_explicit = true;
4790 bool curr_exec_known = true;
4791 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4792 after last_exec_def is executed'. */
4793
4794 bitmap live_in = DF_LR_IN (bb);
4795 bool exec_live_on_entry = false;
4796 if (bitmap_bit_p (live_in, EXEC_LO_REG)
4797 || bitmap_bit_p (live_in, EXEC_HI_REG))
4798 {
4799 if (dump_file)
4800 fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
4801 (int) bb->index);
4802 exec_live_on_entry = true;
4803 }
4804
4805 FOR_BB_INSNS_SAFE (bb, insn, curr)
4806 {
4807 if (!NONDEBUG_INSN_P (insn))
4808 continue;
4809
4810 if (GET_CODE (PATTERN (insn)) == USE
4811 || GET_CODE (PATTERN (insn)) == CLOBBER)
4812 continue;
4813
4814 HARD_REG_SET defs, uses;
4815 CLEAR_HARD_REG_SET (defs);
4816 CLEAR_HARD_REG_SET (uses);
 4817 	  note_stores (insn, record_hard_reg_sets, &defs);
4818 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4819
4820 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4821 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4822 bool exec_used = (hard_reg_set_intersect_p
4823 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4824 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4825
4826 /* Check the instruction for implicit setting of EXEC via an
4827 attribute. */
4828 attr_exec exec_attr = get_attr_exec (insn);
4829 int64_t new_exec;
4830
4831 switch (exec_attr)
4832 {
4833 case EXEC_NONE:
4834 new_exec = 0;
4835 break;
4836
4837 case EXEC_SINGLE:
4838 /* Instructions that do not involve memory accesses only require
4839 bit 0 of EXEC to be set. */
4840 if (gcn_vmem_insn_p (get_attr_type (insn))
4841 || get_attr_type (insn) == TYPE_DS)
4842 new_exec = 1;
4843 else
4844 new_exec = curr_exec | 1;
4845 break;
4846
4847 case EXEC_FULL:
4848 new_exec = -1;
4849 break;
4850
4851 default: /* Auto-detect what setting is appropriate. */
4852 {
4853 new_exec = 0;
4854
4855 /* If EXEC is referenced explicitly then we don't need to do
4856 anything to set it, so we're done. */
4857 if (exec_used)
4858 break;
4859
4860 /* Scan the insn for VGPRs defs or uses. The mode determines
4861 what kind of exec is needed. */
4862 subrtx_iterator::array_type array;
4863 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4864 {
4865 const_rtx x = *iter;
4866 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4867 {
4868 if (VECTOR_MODE_P (GET_MODE (x)))
4869 {
4870 new_exec = -1;
4871 break;
4872 }
4873 else
4874 new_exec = 1;
4875 }
4876 }
4877 }
4878 break;
4879 }
4880
4881 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4882 {
4883 start_sequence ();
4884 emit_move_insn (exec_reg, GEN_INT (new_exec));
4885 rtx_insn *seq = get_insns ();
4886 end_sequence ();
4887 emit_insn_before (seq, insn);
4888
4889 if (dump_file && (dump_flags & TDF_DETAILS))
4890 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4891 new_exec, INSN_UID (insn));
4892
4893 curr_exec = new_exec;
4894 curr_exec_explicit = false;
4895 curr_exec_known = true;
4896 }
4897 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4898 {
4899 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4900 new_exec, INSN_UID (insn));
4901 }
4902
4903 /* The state of the EXEC register is unknown after a
4904 function call. */
4905 if (CALL_P (insn))
4906 curr_exec_known = false;
4907
4908 /* Handle explicit uses of EXEC. If the instruction is a partial
4909 explicit definition of EXEC, then treat it as an explicit use of
4910 EXEC as well. */
4911 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4912 {
4913 /* An instruction that explicitly uses EXEC should not also
4914 implicitly define it. */
4915 gcc_assert (!exec_used || !new_exec);
4916
4917 if (!curr_exec_known || !curr_exec_explicit)
4918 {
4919 /* Restore the previous explicitly defined value. */
4920 curr_exec = gcn_restore_exec (insn, last_exec_def,
4921 curr_exec, curr_exec_known,
4922 last_exec_def_saved);
4923 curr_exec_explicit = true;
4924 curr_exec_known = true;
4925 }
4926 }
4927
4928 /* Handle explicit definitions of EXEC. */
4929 if (exec_lo_def_p || exec_hi_def_p)
4930 {
4931 last_exec_def = insn;
4932 last_exec_def_saved = false;
4933 curr_exec = gcn_insn_exec_value (insn);
4934 curr_exec_explicit = true;
4935 curr_exec_known = true;
4936
4937 if (dump_file && (dump_flags & TDF_DETAILS))
4938 fprintf (dump_file,
4939 "Found %s definition of EXEC at insn %d.\n",
4940 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4941 INSN_UID (insn));
4942 }
4943
4944 exec_live_on_entry = false;
4945 }
4946
4947 COPY_REG_SET (&live, DF_LR_OUT (bb));
4948 df_simulate_initialize_backwards (bb, &live);
4949
4950 /* If EXEC is live after the basic block, restore the value of EXEC
4951 at the end of the block. */
4952 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4953 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
 4954 	  && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
4955 {
4956 rtx_insn *end_insn = BB_END (bb);
4957
4958 /* If the instruction is not a jump instruction, do the restore
4959 after the last instruction in the basic block. */
4960 if (NONJUMP_INSN_P (end_insn))
4961 end_insn = NEXT_INSN (end_insn);
4962
4963 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4964 curr_exec_known, last_exec_def_saved);
4965 }
4966 }
4967
4968 CLEAR_REG_SET (&live);
4969
4970 /* "Manually Inserted Wait States (NOPs)."
4971
4972 GCN hardware detects most kinds of register dependencies, but there
4973 are some exceptions documented in the ISA manual. This pass
4974 detects the missed cases, and inserts the documented number of NOPs
4975 required for correct execution. */
4976
4977 const int max_waits = 5;
4978 struct ilist
4979 {
4980 rtx_insn *insn;
4981 attr_unit unit;
 4982     attr_delayeduse delayeduse;
 4983     HARD_REG_SET writes;
 4984     HARD_REG_SET reads;
4985 int age;
4986 } back[max_waits];
4987 int oldest = 0;
4988 for (int i = 0; i < max_waits; i++)
4989 back[i].insn = NULL;
4990
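  /* back[] is a circular buffer of the most recent instructions; "oldest"
     indexes the slot that will be recycled next.  */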
4991 rtx_insn *insn, *last_insn = NULL;
4992 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4993 {
4994 if (!NONDEBUG_INSN_P (insn))
4995 continue;
4996
4997 if (GET_CODE (PATTERN (insn)) == USE
4998 || GET_CODE (PATTERN (insn)) == CLOBBER)
4999 continue;
5000
5001 attr_type itype = get_attr_type (insn);
5002 attr_unit iunit = get_attr_unit (insn);
 5003       attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
5004 HARD_REG_SET ireads, iwrites;
5005 CLEAR_HARD_REG_SET (ireads);
5006 CLEAR_HARD_REG_SET (iwrites);
 5007       note_stores (insn, record_hard_reg_sets, &iwrites);
5008 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
5009
5010 /* Scan recent previous instructions for dependencies not handled in
5011 hardware. */
5012 int nops_rqd = 0;
5013 for (int i = oldest; i < oldest + max_waits; i++)
5014 {
5015 struct ilist *prev_insn = &back[i % max_waits];
5016
5017 if (!prev_insn->insn)
5018 continue;
5019
5020 /* VALU writes SGPR followed by VMEM reading the same SGPR
5021 requires 5 wait states. */
5022 if ((prev_insn->age + nops_rqd) < 5
5023 && prev_insn->unit == UNIT_VECTOR
5024 && gcn_vmem_insn_p (itype))
5025 {
 5026 	      HARD_REG_SET regs = prev_insn->writes & ireads;
5027 if (hard_reg_set_intersect_p
5028 (regs, reg_class_contents[(int) SGPR_REGS]))
5029 nops_rqd = 5 - prev_insn->age;
5030 }
5031
5032 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
5033 requires 5 wait states. */
5034 if ((prev_insn->age + nops_rqd) < 5
5035 && prev_insn->unit == UNIT_VECTOR
5036 && iunit == UNIT_VECTOR
5037 && ((hard_reg_set_intersect_p
5038 (prev_insn->writes,
5039 reg_class_contents[(int) EXEC_MASK_REG])
5040 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
5041 ||
5042 (hard_reg_set_intersect_p
5043 (prev_insn->writes,
5044 reg_class_contents[(int) VCC_CONDITIONAL_REG])
5045 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
5046 nops_rqd = 5 - prev_insn->age;
5047
5048 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
5049 SGPR/VCC as lane select requires 4 wait states. */
5050 if ((prev_insn->age + nops_rqd) < 4
5051 && prev_insn->unit == UNIT_VECTOR
5052 && get_attr_laneselect (insn) == LANESELECT_YES)
5053 {
 5054 	      HARD_REG_SET regs = prev_insn->writes & ireads;
5055 if (hard_reg_set_intersect_p
5056 (regs, reg_class_contents[(int) SGPR_REGS])
5057 || hard_reg_set_intersect_p
5058 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
5059 nops_rqd = 4 - prev_insn->age;
5060 }
5061
5062 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
5063 requires 2 wait states. */
5064 if ((prev_insn->age + nops_rqd) < 2
5065 && prev_insn->unit == UNIT_VECTOR
5066 && itype == TYPE_VOP_DPP)
5067 {
 5068 	      HARD_REG_SET regs = prev_insn->writes & ireads;
5069 if (hard_reg_set_intersect_p
5070 (regs, reg_class_contents[(int) VGPR_REGS]))
5071 nops_rqd = 2 - prev_insn->age;
5072 }
5073
 5074 	  /* A store whose input registers must not be overwritten by the
 5075 	     following instruction.  */
5076 if ((prev_insn->age + nops_rqd) < 1
5077 && prev_insn->delayeduse == DELAYEDUSE_YES
5078 && ((hard_reg_set_intersect_p
5079 (prev_insn->reads, iwrites))))
5080 nops_rqd = 1 - prev_insn->age;
5081 }
5082
5083 /* Insert the required number of NOPs. */
5084 for (int i = nops_rqd; i > 0; i--)
5085 emit_insn_after (gen_nop (), last_insn);
5086
5087 /* Age the previous instructions. We can also ignore writes to
5088 registers subsequently overwritten. */
5089 HARD_REG_SET written;
5090 CLEAR_HARD_REG_SET (written);
5091 for (int i = oldest + max_waits - 1; i > oldest; i--)
5092 {
5093 struct ilist *prev_insn = &back[i % max_waits];
5094
5095 /* Assume all instructions are equivalent to one "wait", the same
5096 as s_nop. This is probably true for SALU, but not VALU (which
5097 may take longer), so this is not optimal. However, AMD do
5098 not publish the cycle times for instructions. */
5099 prev_insn->age += 1 + nops_rqd;
5100
 5101 	  written |= iwrites;
 5102 	  prev_insn->writes &= ~written;
5103 }
5104
5105 /* Track the current instruction as a previous instruction. */
5106 back[oldest].insn = insn;
5107 back[oldest].unit = iunit;
 5108       back[oldest].delayeduse = idelayeduse;
5109 back[oldest].writes = iwrites;
5110 back[oldest].reads = ireads;
5111 back[oldest].age = 0;
5112 oldest = (oldest + 1) % max_waits;
5113
5114 last_insn = insn;
5115 }
5116}
5117
5118/* }}} */
5119/* {{{ OpenACC / OpenMP. */
5120
5121#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
5122#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
5123#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
5124
5125/* Implement TARGET_GOACC_VALIDATE_DIMS.
5126
5127 Check the launch dimensions provided for an OpenACC compute
5128 region, or routine. */
5129
5130static bool
5131gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
5132 unsigned /*used*/)
5133{
5134 bool changed = false;
 5135   const int max_workers = 16;
 5136
5137 /* The vector size must appear to be 64, to the user, unless this is a
5138 SEQ routine. The real, internal value is always 1, which means use
5139 autovectorization, but the user should not see that. */
5140 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5141 && dims[GOMP_DIM_VECTOR] >= 0)
5142 {
5143 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
5144 && dims[GOMP_DIM_VECTOR] != 64)
5145 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5146 OPT_Wopenacc_dims,
5147 (dims[GOMP_DIM_VECTOR]
5148 ? G_("using %<vector_length (64)%>, ignoring %d")
5149 : G_("using %<vector_length (64)%>, "
5150 "ignoring runtime setting")),
5151 dims[GOMP_DIM_VECTOR]);
5152 dims[GOMP_DIM_VECTOR] = 1;
5153 changed = true;
5154 }
5155
5156 /* Check the num workers is not too large. */
5157 if (dims[GOMP_DIM_WORKER] > max_workers)
5158 {
5159 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5160 OPT_Wopenacc_dims,
 5161 		  "using %<num_workers (%d)%>, ignoring %d",
5162 max_workers, dims[GOMP_DIM_WORKER]);
5163 dims[GOMP_DIM_WORKER] = max_workers;
5164 changed = true;
5165 }
5166
5167 /* Set global defaults. */
5168 if (!decl)
5169 {
5170 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
5171 if (dims[GOMP_DIM_WORKER] < 0)
 5172 	dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
5173 if (dims[GOMP_DIM_GANG] < 0)
5174 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
5175 changed = true;
5176 }
5177
5178 return changed;
5179}
5180
5181/* Helper function for oacc_dim_size instruction.
5182 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
5183
5184rtx
5185gcn_oacc_dim_size (int dim)
5186{
5187 if (dim < 0 || dim > 2)
5188 error ("offload dimension out of range (%d)", dim);
5189
5190 /* Vectors are a special case. */
5191 if (dim == 2)
5192 return const1_rtx; /* Think of this as 1 times 64. */
5193
5194 static int offset[] = {
5195 /* Offsets into dispatch packet. */
5196 12, /* X dim = Gang / Team / Work-group. */
5197 20, /* Z dim = Worker / Thread / Wavefront. */
5198 16 /* Y dim = Vector / SIMD / Work-item. */
5199 };
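  /* These appear to be the 32-bit grid_size_x/z/y fields of the HSA kernel
     dispatch packet, at byte offsets 12, 20 and 16 respectively.  */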
5200 rtx addr = gen_rtx_PLUS (DImode,
5201 gen_rtx_REG (DImode,
5202 cfun->machine->args.
5203 reg[DISPATCH_PTR_ARG]),
5204 GEN_INT (offset[dim]));
5205 return gen_rtx_MEM (SImode, addr);
5206}
5207
5208/* Helper function for oacc_dim_pos instruction.
5209 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
5210
5211rtx
5212gcn_oacc_dim_pos (int dim)
5213{
5214 if (dim < 0 || dim > 2)
5215 error ("offload dimension out of range (%d)", dim);
5216
5217 static const int reg[] = {
5218 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
5219 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
5220 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
5221 };
5222
5223 int reg_num = cfun->machine->args.reg[reg[dim]];
5224
5225 /* The information must have been requested by the kernel. */
5226 gcc_assert (reg_num >= 0);
5227
5228 return gen_rtx_REG (SImode, reg_num);
5229}
5230
5231/* Implement TARGET_GOACC_FORK_JOIN. */
5232
5233static bool
 5234 gcn_fork_join (gcall *call, const int dims[], bool is_fork)
 5235 {
5236 tree arg = gimple_call_arg (call, 2);
5237 unsigned axis = TREE_INT_CST_LOW (arg);
5238
5239 if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
5240 return true;
5241
5326695a
AS
5242 return false;
5243}
5244
5245/* Implement ???????
5246 FIXME make this a real hook.
5247
5248 Adjust FNDECL such that options inherited from the host compiler
5249 are made appropriate for the accelerator compiler. */
5250
5251void
5252gcn_fixup_accel_lto_options (tree fndecl)
5253{
5254 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5255 if (!func_optimize)
5256 return;
5257
5258 tree old_optimize
5259 = build_optimization_node (&global_options, &global_options_set);
5260 tree new_optimize;
5261
5262 /* If the function changed the optimization levels as well as
5263 setting target options, start with the optimizations
5264 specified. */
5265 if (func_optimize != old_optimize)
 5266     cl_optimization_restore (&global_options, &global_options_set,
5267 TREE_OPTIMIZATION (func_optimize));
5268
5269 gcn_option_override ();
5270
5271 /* The target attributes may also change some optimization flags,
5272 so update the optimization options if necessary. */
5273 new_optimize = build_optimization_node (&global_options,
5274 &global_options_set);
5275
5276 if (old_optimize != new_optimize)
5277 {
5278 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
 5279       cl_optimization_restore (&global_options, &global_options_set,
5280 TREE_OPTIMIZATION (old_optimize));
5281 }
5282}
5283
5284/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
5285
5286static void
5287gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
5288 unsigned HOST_WIDE_INT *hi,
5289 int ARG_UNUSED (dims[GOMP_DIM_MAX]),
5290 unsigned HOST_WIDE_INT
5291 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
5292 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
5293{
5294 *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5295 /* !!! We can maybe use dims[] to estimate the maximum number of work
5296 groups/wavefronts/etc. we will launch, and therefore tune the maximum
5297 amount of LDS we should use. For now, use a minimal amount to try to
5298 maximise occupancy. */
5299 *hi = acc_lds_size;
5300 machine_function *machfun = cfun->machine;
5301 machfun->reduction_base = gang_private_size_opt;
5302 machfun->reduction_limit
5303 = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5304}
5305
5306/* }}} */
5307/* {{{ ASM Output. */
5308
5309/* Implement TARGET_ASM_FILE_START.
5310
5311 Print assembler file header text. */
5312
5313static void
5314output_file_start (void)
5315{
5316 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
5317 configuration. In GCC binaries, this is true for SRAM ECC, but not
5318 XNACK. */
5319 const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
5320 const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
5321 : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
5322 : "");
5323
 5324   const char *cpu;
5325 switch (gcn_arch)
5326 {
5327 case PROCESSOR_FIJI:
5328 cpu = "gfx803";
5329 xnack = "";
5330 sram_ecc = "";
5331 break;
5332 case PROCESSOR_VEGA10:
5333 cpu = "gfx900";
 5334       sram_ecc = "";
5335 break;
5336 case PROCESSOR_VEGA20:
5337 cpu = "gfx906";
 5338       sram_ecc = "";
5339 break;
5340 case PROCESSOR_GFX908:
5341 cpu = "gfx908";
 5342       break;
5343 case PROCESSOR_GFX90a:
5344 cpu = "gfx90a";
5345 break;
5346 default: gcc_unreachable ();
5347 }
5348
 5349   fprintf (asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
 5350 	   cpu, sram_ecc, xnack);
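  /* For example, gfx908 with SRAM ECC enabled and XNACK disabled yields
     "amdgcn-unknown-amdhsa--gfx908:sramecc+:xnack-".  */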
5351}
5352
5353/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
5354
5355 Print the initial definition of a function name.
5356
5357 For GCN kernel entry points this includes all the HSA meta-data, special
5358 alignment constraints that don't apply to regular functions, and magic
5359 comments that pass information to mkoffload. */
5360
5361void
5362gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
5363{
5364 int sgpr, vgpr;
5365 bool xnack_enabled = false;
5366
5367 fputs ("\n\n", file);
5368
5369 if (cfun && cfun->machine && cfun->machine->normal_function)
5370 {
5371 fputs ("\t.type\t", file);
5372 assemble_name (file, name);
5373 fputs (",@function\n", file);
5374 assemble_name (file, name);
5375 fputs (":\n", file);
5376 return;
5377 }
5378
5379 /* Determine count of sgpr/vgpr registers by looking for last
5380 one used. */
5381 for (sgpr = 101; sgpr >= 0; sgpr--)
5382 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
5383 break;
5384 sgpr++;
5385 for (vgpr = 255; vgpr >= 0; vgpr--)
5386 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
5387 break;
5388 vgpr++;
5389
5390 if (!leaf_function_p ())
5391 {
5392 /* We can't know how many registers function calls might use. */
5393 if (vgpr < MAX_NORMAL_VGPR_COUNT)
5394 vgpr = MAX_NORMAL_VGPR_COUNT;
5395 if (sgpr < MAX_NORMAL_SGPR_COUNT)
5396 sgpr = MAX_NORMAL_SGPR_COUNT;
5397 }
5398
5399 /* The gfx90a accum_offset field can't represent 0 registers. */
5400 if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4)
5401 vgpr = 4;
5402
5403 fputs ("\t.rodata\n"
5404 "\t.p2align\t6\n"
5405 "\t.amdhsa_kernel\t", file);
5406 assemble_name (file, name);
5407 fputs ("\n", file);
5408 int reg = FIRST_SGPR_REG;
5409 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5410 {
5411 int reg_first = -1;
5412 int reg_last;
5413 if ((cfun->machine->args.requested & (1 << a))
5414 && (gcn_kernel_arg_types[a].fixed_regno < 0))
5415 {
5416 reg_first = reg;
5417 reg_last = (reg_first
5418 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5419 / UNITS_PER_WORD) - 1);
5420 reg = reg_last + 1;
5421 }
5422
5423 if (gcn_kernel_arg_types[a].header_pseudo)
5424 {
5425 fprintf (file, "\t %s%s\t%i",
5426 (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5427 gcn_kernel_arg_types[a].header_pseudo,
5428 (cfun->machine->args.requested & (1 << a)) != 0);
5429 if (reg_first != -1)
5430 {
5431 fprintf (file, " ; (");
5432 for (int i = reg_first; i <= reg_last; ++i)
5433 {
5434 if (i != reg_first)
5435 fprintf (file, ", ");
5436 fprintf (file, "%s", reg_names[i]);
5437 }
5438 fprintf (file, ")");
5439 }
5440 fprintf (file, "\n");
5441 }
5442 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5443 && cfun->machine->args.requested & (1 << a))
 5444 	fprintf (file, "\t    ; %s\t%i (%s)\n",
5445 gcn_kernel_arg_types[a].name,
5446 (cfun->machine->args.requested & (1 << a)) != 0,
5447 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5448 }
 5449   fprintf (file, "\t  .amdhsa_system_vgpr_workitem_id\t%i\n",
5450 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5451 ? 2
5452 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5453 ? 1 : 0);
5454 fprintf (file,
5455 "\t .amdhsa_next_free_vgpr\t%i\n"
5456 "\t .amdhsa_next_free_sgpr\t%i\n"
5457 "\t .amdhsa_reserve_vcc\t1\n"
5458 "\t .amdhsa_reserve_flat_scratch\t0\n"
5459 "\t .amdhsa_reserve_xnack_mask\t%i\n"
5460 "\t .amdhsa_private_segment_fixed_size\t%i\n"
5461 "\t .amdhsa_group_segment_fixed_size\t%u\n"
5462 "\t .amdhsa_float_denorm_mode_32\t3\n"
5463 "\t .amdhsa_float_denorm_mode_16_64\t3\n",
5464 vgpr,
5465 sgpr,
5466 xnack_enabled,
5467 /* workitem_private_segment_bytes_size needs to be
5468 one 64th the wave-front stack size. */
5469 stack_size_opt / 64,
5470 LDS_SIZE);
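 /* Illustrative arithmetic (not part of the original source): assuming the
    wave-front stack size comes from -mstack-size=65536, each of the 64
    lanes receives 65536 / 64 = 1024 bytes of private segment.  */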
5471 if (gcn_arch == PROCESSOR_GFX90a)
5472 fprintf (file,
5473 "\t .amdhsa_accum_offset\t%i\n"
5474 "\t .amdhsa_tg_split\t0\n",
5475 (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs
5476 fputs ("\t.end_amdhsa_kernel\n", file);
5477
5478#if 1
5479 /* The following is YAML embedded in assembler; tabs are not allowed. */
5480 fputs (" .amdgpu_metadata\n"
5481 " amdhsa.version:\n"
5482 " - 1\n"
5483 " - 0\n"
5484 " amdhsa.kernels:\n"
5485 " - .name: ", file);
5486 assemble_name (file, name);
5487 fputs ("\n .symbol: ", file);
5488 assemble_name (file, name);
5489 fprintf (file,
5490 ".kd\n"
5491 " .kernarg_segment_size: %i\n"
5492 " .kernarg_segment_align: %i\n"
5493 " .group_segment_fixed_size: %u\n"
5494 " .private_segment_fixed_size: %i\n"
5495 " .wavefront_size: 64\n"
5496 " .sgpr_count: %i\n"
5497 " .vgpr_count: %i\n"
5498 " .max_flat_workgroup_size: 1024\n",
5499 cfun->machine->kernarg_segment_byte_size,
5500 cfun->machine->kernarg_segment_alignment,
5501 LDS_SIZE,
5502 stack_size_opt / 64,
5503 sgpr, vgpr);
5504 if (gcn_arch == PROCESSOR_GFX90a)
5505 fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet
5506 fputs (" .end_amdgpu_metadata\n", file);
5507#endif
5508
5509 fputs ("\t.text\n", file);
5510 fputs ("\t.align\t256\n", file);
5511 fputs ("\t.type\t", file);
5512 assemble_name (file, name);
5513 fputs (",@function\n", file);
5514 assemble_name (file, name);
5515 fputs (":\n", file);
5516
5517 /* This comment is read by mkoffload. */
5518 if (flag_openacc)
5519 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5520 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5521 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5522 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
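 /* Illustrative example (not part of the original source): a function
    compiled with dimensions gang=1, worker=1, vector=64 and named "foo"
    emits ";; OPENACC-DIMS: 1, 1, 64 : foo" for mkoffload to parse.  */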
5523}
5524
5525/* Implement TARGET_ASM_SELECT_SECTION.
5526
5527 Return the section into which EXP should be placed. */
5528
5529static section *
5530gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5531{
5532 if (TREE_TYPE (exp) != error_mark_node
5533 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5534 {
5535 if (!DECL_P (exp))
5536 return get_section (".lds_bss",
5537 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5538 NULL);
5539
5540 return get_named_section (exp, ".lds_bss", reloc);
5541 }
5542
5543 return default_elf_select_section (exp, reloc, align);
5544}
5545
5546/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5547
5548 Emits custom text into the assembler file at the head of each function. */
5549
5550static void
5551gcn_target_asm_function_prologue (FILE *file)
5552{
5553 machine_function *offsets = gcn_compute_frame_offsets ();
5554
5555 asm_fprintf (file, "\t; using %s addressing in function\n",
5556 offsets->use_flat_addressing ? "flat" : "global");
5557
5558 if (offsets->normal_function)
5559 {
5560 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5561 offsets->need_frame_pointer ? "true" : "false");
5562 asm_fprintf (file, "\t; lr needs saving: %s\n",
5563 offsets->lr_needs_saving ? "true" : "false");
5564 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5565 offsets->outgoing_args_size);
5566 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5567 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5568 asm_fprintf (file, "\t; callee save size: %wd\n",
5569 offsets->callee_saves);
5570 }
5571 else
5572 {
5573 asm_fprintf (file, "\t; HSA kernel entry point\n");
5574 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5575 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5576 offsets->outgoing_args_size);
5577 }
5578}
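/* Illustrative output (not part of the original source): for a kernel entry
   point this hook emits assembler comments along the lines of
   "; using flat addressing in function" and "; HSA kernel entry point"
   followed by the size annotations; they are informational only.  */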
5579
5580/* Helper function for print_operand and print_operand_address.
5581
5582 Print a register as the assembler requires, according to mode and name. */
5583
5584static void
5585print_reg (FILE *file, rtx x)
5586{
5587 machine_mode mode = GET_MODE (x);
5588 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5589 || mode == HFmode || mode == SFmode
5590 || mode == V64SFmode || mode == V64SImode
5591 || mode == V64QImode || mode == V64HImode)
5592 fprintf (file, "%s", reg_names[REGNO (x)]);
5593 else if (mode == DImode || mode == V64DImode
5594 || mode == DFmode || mode == V64DFmode)
5595 {
5596 if (SGPR_REGNO_P (REGNO (x)))
5597 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5598 REGNO (x) - FIRST_SGPR_REG + 1);
5599 else if (VGPR_REGNO_P (REGNO (x)))
5600 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5601 REGNO (x) - FIRST_VGPR_REG + 1);
5602 else if (REGNO (x) == FLAT_SCRATCH_REG)
5603 fprintf (file, "flat_scratch");
5604 else if (REGNO (x) == EXEC_REG)
5605 fprintf (file, "exec");
5606 else if (REGNO (x) == VCC_LO_REG)
5607 fprintf (file, "vcc");
5608 else
5609 fprintf (file, "[%s:%s]",
5610 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5611 }
5612 else if (mode == TImode)
5613 {
5614 if (SGPR_REGNO_P (REGNO (x)))
5615 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5616 REGNO (x) - FIRST_SGPR_REG + 3);
5617 else if (VGPR_REGNO_P (REGNO (x)))
5618 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5619 REGNO (x) - FIRST_VGPR_REG + 3);
5620 else
5621 gcc_unreachable ();
5622 }
5623 else
5624 gcc_unreachable ();
5625}
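/* Illustrative examples (not part of the original source), assuming the
   usual register naming: an SImode value in the third SGPR prints as "s2",
   a DImode value in the same place prints as "s[2:3]", and a TImode value
   starting at v8 prints as "v[8:11]".  */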
5626
5627/* Implement TARGET_SECTION_TYPE_FLAGS.
5628
5629 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5630
5631static unsigned int
5632gcn_section_type_flags (tree decl, const char *name, int reloc)
5633{
5634 if (strcmp (name, ".lds_bss") == 0)
5635 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5636
5637 return default_section_type_flags (decl, name, reloc);
5638}
5639
5640/* Helper function for gcn_asm_output_symbol_ref.
5641
5642 FIXME: This function is used to lay out gang-private variables in LDS
5643 on a per-CU basis.
5644 There may be cases in which gang-private variables in different compilation
5645 units could clobber each other. In that case we should be relying on the
5646 linker to lay out gang-private LDS space, but that doesn't appear to be
5647 possible at present. */
5648
5649static void
5650gcn_print_lds_decl (FILE *f, tree var)
5651{
5652 int *offset;
5653 if ((offset = lds_allocs.get (var)))
5654 fprintf (f, "%u", (unsigned) *offset);
5655 else
5656 {
5657 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5658 tree type = TREE_TYPE (var);
5659 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5660 if (size > align && size > 4 && align < 8)
5661 align = 8;
5662
5663 gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5664
5665 lds_allocs.put (var, gang_private_hwm);
5666 fprintf (f, "%u", gang_private_hwm);
5667 gang_private_hwm += size;
5668 if (gang_private_hwm > gang_private_size_opt)
5669 error ("%d bytes of gang-private data-share memory exhausted"
5670 " (increase with %<-mgang-private-size=%d%>, for example)",
5671 gang_private_size_opt, gang_private_hwm);
5672 }
5673}
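/* Illustrative example (not part of the original source): with
   gang_private_hwm at 36, an 8-byte variable declared with 4-byte alignment
   has its alignment raised to 8, is placed at offset 40, and the high-water
   mark advances to 48.  */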
5674
5675/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5676
5677void
5678gcn_asm_output_symbol_ref (FILE *file, rtx x)
5679{
5680 tree decl;
5681 if (cfun
5682 && (decl = SYMBOL_REF_DECL (x)) != 0
5683 && TREE_CODE (decl) == VAR_DECL
5684 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5685 {
5686 /* LDS symbols (emitted using this hook) are only used at present
5687 to propagate worker values from an active thread to neutered
5688 threads. Use the same offset for each such block, but don't
5689 use zero because null pointers are used to identify the active
5690 thread in GOACC_single_copy_start calls. */
5691 gcn_print_lds_decl (file, decl);
5692 }
5693 else
5694 {
5695 assemble_name (file, XSTR (x, 0));
5696 /* FIXME: See above -- this condition is unreachable. */
5697 if (cfun
5698 && (decl = SYMBOL_REF_DECL (x)) != 0
5699 && TREE_CODE (decl) == VAR_DECL
5700 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5701 fputs ("@abs32", file);
5702 }
5703}
5704
5705/* Implement TARGET_CONSTANT_ALIGNMENT.
5706
5707 Returns the alignment in bits of a constant that is being placed in memory.
5708 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5709 would ordinarily have. */
5710
5711static HOST_WIDE_INT
5712gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5713 HOST_WIDE_INT basic_align)
5714{
5715 return basic_align > 128 ? basic_align : 128;
5716}
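/* Illustrative consequence (not part of the original source): a constant
   that would ordinarily be 4-byte aligned is placed on a 128-bit boundary,
   while constants already requiring more than 128 bits keep their larger
   alignment.  */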
5717
5718/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5719
5720void
5721print_operand_address (FILE *file, rtx mem)
5722{
5723 gcc_assert (MEM_P (mem));
5724
5725 rtx reg;
5726 rtx offset;
5727 addr_space_t as = MEM_ADDR_SPACE (mem);
5728 rtx addr = XEXP (mem, 0);
5729 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5730
5731 if (AS_SCRATCH_P (as))
5732 switch (GET_CODE (addr))
5733 {
5734 case REG:
5735 print_reg (file, addr);
5736 break;
5737
5738 case PLUS:
5739 reg = XEXP (addr, 0);
5740 offset = XEXP (addr, 1);
5741 print_reg (file, reg);
5742 if (GET_CODE (offset) == CONST_INT)
5743 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5744 else
5745 abort ();
5746 break;
5747
5748 default:
5749 debug_rtx (addr);
5750 abort ();
5751 }
5752 else if (AS_ANY_FLAT_P (as))
5753 {
5754 if (GET_CODE (addr) == REG)
5755 print_reg (file, addr);
5756 else
5757 {
5758 gcc_assert (TARGET_GCN5_PLUS);
5759 print_reg (file, XEXP (addr, 0));
5760 }
5761 }
5762 else if (AS_GLOBAL_P (as))
5763 {
5764 gcc_assert (TARGET_GCN5_PLUS);
5765
5766 rtx base = addr;
5767 rtx vgpr_offset = NULL_RTX;
5768
5769 if (GET_CODE (addr) == PLUS)
5770 {
5771 base = XEXP (addr, 0);
5772
5773 if (GET_CODE (base) == PLUS)
5774 {
5775 /* (SGPR + VGPR) + CONST */
5776 vgpr_offset = XEXP (base, 1);
5777 base = XEXP (base, 0);
5778 }
5779 else
5780 {
5781 rtx offset = XEXP (addr, 1);
5782
5783 if (REG_P (offset))
5784 /* SGPR + VGPR */
5785 vgpr_offset = offset;
5786 else if (CONST_INT_P (offset))
5787 /* VGPR + CONST or SGPR + CONST */
5788 ;
5789 else
5790 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5791 }
5792 }
5793
5794 if (REG_P (base))
5795 {
5796 if (VGPR_REGNO_P (REGNO (base)))
5797 print_reg (file, base);
5798 else if (SGPR_REGNO_P (REGNO (base)))
5799 {
5800 /* The assembler requires a 64-bit VGPR pair here, even though
5801 the offset should be only 32-bit. */
5802 if (vgpr_offset == NULL_RTX)
5803 /* In this case, the vector offset is zero, so we use the first
5804 lane of v1, which is initialized to zero. */
5805 fprintf (file, "v1");
5806 else if (REG_P (vgpr_offset)
5807 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5808 fprintf (file, "v%d", REGNO (vgpr_offset) - FIRST_VGPR_REG);
5809 else
5810 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5811 }
5812 }
5813 else
5814 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5815 }
5816 else if (AS_ANY_DS_P (as))
5817 switch (GET_CODE (addr))
5818 {
5819 case REG:
5820 print_reg (file, addr);
5821 break;
5822
5823 case PLUS:
5824 reg = XEXP (addr, 0);
5825 print_reg (file, reg);
5826 break;
5827
5828 default:
5829 debug_rtx (addr);
5830 abort ();
5831 }
5832 else
5833 switch (GET_CODE (addr))
5834 {
5835 case REG:
5836 print_reg (file, addr);
5837 fprintf (file, ", 0");
5838 break;
5839
5840 case PLUS:
5841 reg = XEXP (addr, 0);
5842 offset = XEXP (addr, 1);
5843 print_reg (file, reg);
5844 fprintf (file, ", ");
5845 if (GET_CODE (offset) == REG)
5846 print_reg (file, offset);
5847 else if (GET_CODE (offset) == CONST_INT)
5848 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5849 else
5850 abort ();
5851 break;
5852
5853 default:
5854 debug_rtx (addr);
5855 abort ();
5856 }
5857}
5858
5859/* Implement PRINT_OPERAND via gcn.h.
5860
5861 b - print operand size as untyped operand (b8/b16/b32/b64)
5862 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5863 i - print operand size as untyped operand (i16/b32/i64)
5864 I - print operand size as SI/DI untyped operand (i32/b32/i64)
5865 u - print operand size as untyped operand (u16/u32/u64)
5866 U - print operand size as SI/DI untyped operand (u32/u64)
5867 o - print operand size as memory access size for loads
5868 (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
5869 s - print operand size as memory access size for stores
5870 (byte/short/dword/dwordx2/wordx3/dwordx4)
5871 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5872 c - print inverse conditional code for s_cbranch
5873 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5874 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5875 A - print address in formatting suitable for given address space.
5876 O - print offset:n for data share operations.
5877 ^ - print "_co" suffix for GCN5 mnemonics
5878 g - print "glc", if appropriate for given MEM
5879 */
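/* Illustrative examples of the codes above (not part of the original
   comment): for a DImode register operand "%b0" and "%B0" both print
   "_b64"; "%s0" on a DImode MEM prints "_dwordx2"; "%^" prints "_co"
   only when TARGET_GCN5_PLUS holds.  */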
5880
5881void
5882print_operand (FILE *file, rtx x, int code)
5883{
5884 int xcode = x ? GET_CODE (x) : 0;
5885 bool invert = false;
5886 switch (code)
5887 {
5888 /* Instructions have the following suffixes.
5889 If there are two suffixes, the first is the destination type,
5890 and the second is the source type.
5891
5892 B32 Bitfield (untyped data) 32-bit
5893 B64 Bitfield (untyped data) 64-bit
5894 F16 floating-point 16-bit
5895 F32 floating-point 32-bit (IEEE 754 single-precision float)
5896 F64 floating-point 64-bit (IEEE 754 double-precision float)
5897 I16 signed 16-bit integer
5898 I32 signed 32-bit integer
5899 I64 signed 64-bit integer
5900 U16 unsigned 16-bit integer
5901 U32 unsigned 32-bit integer
5902 U64 unsigned 64-bit integer */
5903
5904 /* Print operand size as untyped suffix. */
5905 case 'b':
5906 {
5907 const char *s = "";
5908 machine_mode mode = GET_MODE (x);
5909 if (VECTOR_MODE_P (mode))
5910 mode = GET_MODE_INNER (mode);
5911 switch (GET_MODE_SIZE (mode))
5912 {
5913 case 1:
5914 s = "_b8";
5915 break;
5916 case 2:
5917 s = "_b16";
5918 break;
5919 case 4:
5920 s = "_b32";
5921 break;
5922 case 8:
5923 s = "_b64";
5924 break;
5925 default:
5926 output_operand_lossage ("invalid operand %%xn code");
5927 return;
5928 }
5929 fputs (s, file);
5930 }
5931 return;
5932 case 'B':
5933 {
5934 const char *s = "";
5935 machine_mode mode = GET_MODE (x);
5936 if (VECTOR_MODE_P (mode))
5937 mode = GET_MODE_INNER (mode);
5938 switch (GET_MODE_SIZE (mode))
5939 {
5940 case 1:
5941 case 2:
5942 case 4:
5943 s = "_b32";
5944 break;
5945 case 8:
5946 s = "_b64";
5947 break;
5948 default:
5949 output_operand_lossage ("invalid operand %%xn code");
5950 return;
5951 }
5952 fputs (s, file);
5953 }
5954 return;
5955 case 'e':
5956 fputs ("sext(", file);
5957 print_operand (file, x, 0);
5958 fputs (")", file);
5959 return;
5960 case 'i':
5961 case 'I':
5962 case 'u':
5963 case 'U':
5964 {
5965 bool signed_p = code == 'i' || code == 'I';
5966 bool min32_p = code == 'I' || code == 'U';
5967 const char *s = "";
5968 machine_mode mode = GET_MODE (x);
5969 if (VECTOR_MODE_P (mode))
5970 mode = GET_MODE_INNER (mode);
5971 if (mode == VOIDmode)
5972 switch (GET_CODE (x))
5973 {
5974 case CONST_INT:
5975 s = signed_p ? "_i32" : "_u32";
5976 break;
5977 case CONST_DOUBLE:
5978 s = "_f64";
5979 break;
5980 default:
5981 output_operand_lossage ("invalid operand %%xn code");
5982 return;
5983 }
5984 else if (FLOAT_MODE_P (mode))
5985 switch (GET_MODE_SIZE (mode))
5986 {
5987 case 2:
5988 s = "_f16";
5989 break;
5990 case 4:
5991 s = "_f32";
5992 break;
5993 case 8:
5994 s = "_f64";
5995 break;
5996 default:
5997 output_operand_lossage ("invalid operand %%xn code");
5998 return;
5999 }
6000 else if (min32_p)
6001 switch (GET_MODE_SIZE (mode))
6002 {
6003 case 1:
6004 case 2:
6005 case 4:
6006 s = signed_p ? "_i32" : "_u32";
6007 break;
6008 case 8:
6009 s = signed_p ? "_i64" : "_u64";
6010 break;
6011 default:
6012 output_operand_lossage ("invalid operand %%xn code");
6013 return;
6014 }
6015 else
6016 switch (GET_MODE_SIZE (mode))
6017 {
6018 case 1:
6019 s = signed_p ? "_i8" : "_u8";
6020 break;
6021 case 2:
6022 s = signed_p ? "_i16" : "_u16";
6023 break;
6024 case 4:
6025 s = signed_p ? "_i32" : "_u32";
6026 break;
6027 case 8:
6028 s = signed_p ? "_i64" : "_u64";
6029 break;
6030 default:
6031 output_operand_lossage ("invalid operand %%xn code");
6032 return;
6033 }
6034 fputs (s, file);
6035 }
6036 return;
6037 /* Print operand size as untyped suffix. */
6038 case 'o':
6039 {
6040 const char *s = 0;
6041 switch (GET_MODE_SIZE (GET_MODE (x)))
6042 {
6043 case 1:
6044 s = "_ubyte";
6045 break;
6046 case 2:
6047 s = "_ushort";
6048 break;
6049 /* The following are full-vector variants. */
6050 case 64:
6051 s = "_ubyte";
6052 break;
6053 case 128:
6054 s = "_ushort";
6055 break;
6056 }
6057
6058 if (s)
6059 {
6060 fputs (s, file);
6061 return;
6062 }
6063
6064 /* Fall-through - the other cases for 'o' are the same as for 's'. */
6065 gcc_fallthrough();
6066 }
6067 case 's':
6068 {
6069 const char *s = "";
6070 switch (GET_MODE_SIZE (GET_MODE (x)))
6071 {
6072 case 1:
6073 s = "_byte";
6074 break;
6075 case 2:
6076 s = "_short";
6077 break;
6078 case 4:
6079 s = "_dword";
6080 break;
6081 case 8:
6082 s = "_dwordx2";
6083 break;
6084 case 12:
6085 s = "_dwordx3";
6086 break;
6087 case 16:
6088 s = "_dwordx4";
6089 break;
6090 case 32:
6091 s = "_dwordx8";
6092 break;
6093 case 64:
6094 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
6095 break;
6096 /* The following are full-vector variants. */
6097 case 128:
6098 s = "_short";
6099 break;
6100 case 256:
6101 s = "_dword";
6102 break;
6103 case 512:
6104 s = "_dwordx2";
6105 break;
6106 default:
6107 output_operand_lossage ("invalid operand %%xn code");
6108 return;
6109 }
6110 fputs (s, file);
6111 }
6112 return;
6113 case 'A':
6114 if (xcode != MEM)
6115 {
6116 output_operand_lossage ("invalid %%xn code");
6117 return;
6118 }
6119 print_operand_address (file, x);
6120 return;
6121 case 'O':
6122 {
6123 if (xcode != MEM)
6124 {
6125 output_operand_lossage ("invalid %%xn code");
6126 return;
6127 }
6128 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
6129 fprintf (file, " gds");
6130
6131 rtx x0 = XEXP (x, 0);
6132 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
6133 {
6134 gcc_assert (TARGET_GCN5_PLUS);
6135
6136 fprintf (file, ", ");
6137
6138 rtx base = x0;
6139 rtx const_offset = NULL_RTX;
6140
6141 if (GET_CODE (base) == PLUS)
6142 {
6143 rtx offset = XEXP (x0, 1);
6144 base = XEXP (x0, 0);
6145
6146 if (GET_CODE (base) == PLUS)
6147 /* (SGPR + VGPR) + CONST */
6148 /* Ignore the VGPR offset for this operand. */
6149 base = XEXP (base, 0);
6150
6151 if (CONST_INT_P (offset))
6152 const_offset = XEXP (x0, 1);
6153 else if (REG_P (offset))
6154 /* SGPR + VGPR */
6155 /* Ignore the VGPR offset for this operand. */
6156 ;
6157 else
6158 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6159 }
6160
6161 if (REG_P (base))
6162 {
6163 if (VGPR_REGNO_P (REGNO (base)))
6164 /* The VGPR address is specified in the %A operand. */
6165 fprintf (file, "off");
6166 else if (SGPR_REGNO_P (REGNO (base)))
6167 print_reg (file, base);
6168 else
6169 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6170 }
6171 else
6172 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6173
6174 if (const_offset != NULL_RTX)
6175 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
6176 INTVAL (const_offset));
6177
6178 return;
6179 }
6180
6181 if (GET_CODE (x0) == REG)
6182 return;
6183 if (GET_CODE (x0) != PLUS)
6184 {
6185 output_operand_lossage ("invalid %%xn code");
6186 return;
6187 }
6188 rtx val = XEXP (x0, 1);
6189 if (GET_CODE (val) == CONST_VECTOR)
6190 val = CONST_VECTOR_ELT (val, 0);
6191 if (GET_CODE (val) != CONST_INT)
6192 {
6193 output_operand_lossage ("invalid %%xn code");
6194 return;
6195 }
6196 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
6197
6198 }
6199 return;
6200 case 'c':
6201 invert = true;
6202 /* Fall through. */
6203 case 'C':
6204 {
6205 const char *s;
6206 bool num = false;
6207 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
6208 {
6209 output_operand_lossage ("invalid %%xn code");
6210 return;
6211 }
6212 switch (REGNO (XEXP (x, 0)))
6213 {
6214 case VCC_REG:
6215 case VCCZ_REG:
6216 s = "_vcc";
6217 break;
6218 case SCC_REG:
6219 /* For some reason llvm-mc insists on scc0 instead of sccz. */
6220 num = true;
6221 s = "_scc";
6222 break;
6223 case EXECZ_REG:
6224 s = "_exec";
6225 break;
6226 default:
6227 output_operand_lossage ("invalid %%xn code");
6228 return;
6229 }
6230 fputs (s, file);
6231 if (xcode == (invert ? NE : EQ))
6232 fputc (num ? '0' : 'z', file);
6233 else
6234 fputs (num ? "1" : "nz", file);
6235 return;
6236 }
6237 case 'D':
6238 {
6239 const char *s;
6240 bool cmp_signed = false;
6241 switch (xcode)
6242 {
6243 case EQ:
6244 s = "_eq_";
6245 break;
6246 case NE:
6247 s = "_lg_";
6248 break;
6249 case LT:
6250 s = "_lt_";
6251 cmp_signed = true;
6252 break;
6253 case LE:
6254 s = "_le_";
6255 cmp_signed = true;
6256 break;
6257 case GT:
6258 s = "_gt_";
6259 cmp_signed = true;
6260 break;
6261 case GE:
6262 s = "_ge_";
6263 cmp_signed = true;
6264 break;
6265 case LTU:
6266 s = "_lt_";
6267 break;
6268 case LEU:
6269 s = "_le_";
6270 break;
6271 case GTU:
6272 s = "_gt_";
6273 break;
6274 case GEU:
6275 s = "_ge_";
6276 break;
6277 default:
6278 output_operand_lossage ("invalid %%xn code");
6279 return;
6280 }
6281 fputs (s, file);
6282 fputc (cmp_signed ? 'i' : 'u', file);
6283
6284 machine_mode mode = GET_MODE (XEXP (x, 0));
6285
6286 if (mode == VOIDmode)
6287 mode = GET_MODE (XEXP (x, 1));
6288
6289 /* If both sides are constants, then assume the instruction is in
6290 SImode since s_cmp can only do integer compares. */
6291 if (mode == VOIDmode)
6292 mode = SImode;
6293
6294 switch (GET_MODE_SIZE (mode))
6295 {
6296 case 4:
6297 s = "32";
6298 break;
6299 case 8:
6300 s = "64";
6301 break;
6302 default:
6303 output_operand_lossage ("invalid operand %%xn code");
6304 return;
6305 }
6306 fputs (s, file);
6307 return;
6308 }
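      /* Illustrative examples (not part of the original source): a signed
         (gt:SI reg reg) comparison prints "_gt_i32", the unsigned GTU form
         prints "_gt_u32", and DImode comparisons use the "64" suffix.  */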
6309 case 'E':
6310 {
6311 const char *s;
6312 bool cmp_signed = false;
6313 machine_mode mode = GET_MODE (XEXP (x, 0));
6314
6315 if (mode == VOIDmode)
6316 mode = GET_MODE (XEXP (x, 1));
6317
6318 /* If both sides are constants, assume the instruction is in SFmode
6319 if either operand is floating point, otherwise assume SImode. */
6320 if (mode == VOIDmode)
6321 {
6322 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
6323 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
6324 mode = SFmode;
6325 else
6326 mode = SImode;
6327 }
6328
6329 /* Use the same format code for vector comparisons. */
6330 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
6331 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
6332 mode = GET_MODE_INNER (mode);
6333
6334 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
6335
6336 switch (xcode)
6337 {
6338 case EQ:
6339 s = "_eq_";
6340 break;
6341 case NE:
6342 s = float_p ? "_neq_" : "_ne_";
6343 break;
6344 case LT:
6345 s = "_lt_";
6346 cmp_signed = true;
6347 break;
6348 case LE:
6349 s = "_le_";
6350 cmp_signed = true;
6351 break;
6352 case GT:
6353 s = "_gt_";
6354 cmp_signed = true;
6355 break;
6356 case GE:
6357 s = "_ge_";
6358 cmp_signed = true;
6359 break;
6360 case LTU:
6361 s = "_lt_";
6362 break;
6363 case LEU:
6364 s = "_le_";
6365 break;
6366 case GTU:
6367 s = "_gt_";
6368 break;
6369 case GEU:
6370 s = "_ge_";
6371 break;
6372 case ORDERED:
6373 s = "_o_";
6374 break;
6375 case UNORDERED:
6376 s = "_u_";
6377 break;
6378 case UNEQ:
6379 s = "_nlg_";
6380 break;
6381 case UNGE:
6382 s = "_nlt_";
6383 break;
6384 case UNGT:
6385 s = "_nle_";
6386 break;
6387 case UNLE:
6388 s = "_ngt_";
6389 break;
6390 case UNLT:
6391 s = "_nge_";
6392 break;
6393 case LTGT:
6394 s = "_lg_";
6395 break;
6396 default:
6397 output_operand_lossage ("invalid %%xn code");
6398 return;
6399 }
6400 fputs (s, file);
6401 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6402
6403 switch (GET_MODE_SIZE (mode))
6404 {
6405 case 1:
6406 output_operand_lossage ("operand %%xn code invalid for QImode");
6407 return;
6408 case 2:
6409 s = "16";
6410 break;
6411 case 4:
6412 s = "32";
6413 break;
6414 case 8:
6415 s = "64";
6416 break;
6417 default:
6418 output_operand_lossage ("invalid operand %%xn code");
6419 return;
6420 }
6421 fputs (s, file);
6422 return;
6423 }
6424 case 'L':
6425 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6426 return;
6427 case 'H':
6428 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6429 return;
6430 case 'R':
6431 /* Print a scalar register number as an integer. Temporary hack. */
6432 gcc_assert (REG_P (x));
6433 fprintf (file, "%u", (int) REGNO (x));
6434 return;
6435 case 'V':
6436 /* Print a vector register number as an integer. Temporary hack. */
6437 gcc_assert (REG_P (x));
6438 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
6439 return;
6440 case 0:
6441 if (xcode == REG)
6442 print_reg (file, x);
6443 else if (xcode == MEM)
6444 output_address (GET_MODE (x), x);
6445 else if (xcode == CONST_INT)
6446 fprintf (file, "%i", (int) INTVAL (x));
6447 else if (xcode == CONST_VECTOR)
6448 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6449 else if (xcode == CONST_DOUBLE)
6450 {
6451 const char *str;
6452 switch (gcn_inline_fp_constant_p (x, false))
6453 {
6454 case 240:
6455 str = "0.5";
6456 break;
6457 case 241:
6458 str = "-0.5";
6459 break;
6460 case 242:
6461 str = "1.0";
6462 break;
6463 case 243:
6464 str = "-1.0";
6465 break;
6466 case 244:
6467 str = "2.0";
6468 break;
6469 case 245:
6470 str = "-2.0";
6471 break;
6472 case 246:
6473 str = "4.0";
6474 break;
6475 case 247:
6476 str = "-4.0";
6477 break;
6478 case 248:
6479 str = "1/pi";
6480 break;
6481 default:
6482 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6483 ? DImode : SImode,
6484 x, GET_MODE (x), 0);
6485 if (ix)
6486 print_operand (file, ix, code);
6487 else
6488 output_operand_lossage ("invalid fp constant");
6489 return;
6490 break;
6491 }
6492 fputs (str, file);
6493 return;
6494 }
6495 else
6496 output_addr_const (file, x);
6497 return;
6498 case '^':
6499 if (TARGET_GCN5_PLUS)
6500 fputs ("_co", file);
6501 return;
6502 case 'g':
6503 gcc_assert (xcode == MEM);
6504 if (MEM_VOLATILE_P (x))
6505 fputs (" glc", file);
6506 return;
6507 default:
6508 output_operand_lossage ("invalid %%xn code");
6509 }
6510 gcc_unreachable ();
6511}
6512
6513/* Implement DBX_REGISTER_NUMBER macro.
6514
6515 Return the DWARF register number that corresponds to the GCC internal
6516 REGNO. */
6517
6518unsigned int
6519gcn_dwarf_register_number (unsigned int regno)
6520{
6521 /* Registers defined in DWARF. */
6522 if (regno == EXEC_LO_REG)
6523 return 17;
6524 /* We need to use a more complex DWARF expression for this
6525 else if (regno == EXEC_HI_REG)
6526 return 17; */
6527 else if (regno == VCC_LO_REG)
6528 return 768;
6529 /* We need to use a more complex DWARF expression for this
6530 else if (regno == VCC_HI_REG)
6531 return 768; */
6532 else if (regno == SCC_REG)
6533 return 128;
6534 else if (regno == DWARF_LINK_REGISTER)
6535 return 16;
6536 else if (SGPR_REGNO_P (regno))
6537 {
6538 if (regno - FIRST_SGPR_REG < 64)
6539 return (regno - FIRST_SGPR_REG + 32);
6540 else
6541 return (regno - FIRST_SGPR_REG + 1024);
6542 }
6543 else if (VGPR_REGNO_P (regno))
6544 return (regno - FIRST_VGPR_REG + 2560);
6545
6546 /* Otherwise, there's nothing sensible to do. */
6547 return regno + 100000;
6548}
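/* Illustrative mappings implied by the code above (not part of the original
   source): s0..s63 map to DWARF registers 32..95, s64 and later start at
   1088, v0 maps to 2560, EXEC_LO to 17, VCC_LO to 768 and SCC to 128.  */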
6549
6550/* Implement TARGET_DWARF_REGISTER_SPAN.
6551
6552 DImode and Vector DImode require additional registers. */
6553
6554static rtx
6555gcn_dwarf_register_span (rtx rtl)
6556{
6557 machine_mode mode = GET_MODE (rtl);
6558
6559 if (VECTOR_MODE_P (mode))
6560 mode = GET_MODE_INNER (mode);
6561
6562 if (GET_MODE_SIZE (mode) != 8)
6563 return NULL_RTX;
6564
6565 unsigned regno = REGNO (rtl);
6566
6567 if (regno == DWARF_LINK_REGISTER)
6568 return NULL_RTX;
6569
6570 rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
6571 XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
6572 XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
6573
6574 return p;
6575}
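/* Illustrative example (not part of the original source): a DFmode value
   held in s[4:5] is described as a PARALLEL of two SImode registers, s4 and
   s5, so consumers of the DWARF info see both 32-bit halves.  */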
6576
6577/* }}} */
6578/* {{{ TARGET hook overrides. */
6579
6580#undef TARGET_ADDR_SPACE_ADDRESS_MODE
6581#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6582#undef TARGET_ADDR_SPACE_DEBUG
6583#define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
6584#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6585#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6586 gcn_addr_space_legitimate_address_p
6587#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6588#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6589#undef TARGET_ADDR_SPACE_POINTER_MODE
6590#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6591#undef TARGET_ADDR_SPACE_SUBSET_P
6592#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6593#undef TARGET_ADDR_SPACE_CONVERT
6594#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6595#undef TARGET_ARG_PARTIAL_BYTES
6596#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6597#undef TARGET_ASM_ALIGNED_DI_OP
6598#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6599#undef TARGET_ASM_FILE_START
6600#define TARGET_ASM_FILE_START output_file_start
6601#undef TARGET_ASM_FUNCTION_PROLOGUE
6602#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6603#undef TARGET_ASM_SELECT_SECTION
6604#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6605#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6606#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6607#undef TARGET_ATTRIBUTE_TABLE
6608#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6609#undef TARGET_BUILTIN_DECL
6610#define TARGET_BUILTIN_DECL gcn_builtin_decl
6611#undef TARGET_CAN_CHANGE_MODE_CLASS
6612#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6613#undef TARGET_CAN_ELIMINATE
6614#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6615#undef TARGET_CANNOT_COPY_INSN_P
6616#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6617#undef TARGET_CLASS_LIKELY_SPILLED_P
6618#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6619#undef TARGET_CLASS_MAX_NREGS
6620#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6621#undef TARGET_CONDITIONAL_REGISTER_USAGE
6622#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6623#undef TARGET_CONSTANT_ALIGNMENT
6624#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6625#undef TARGET_DEBUG_UNWIND_INFO
6626#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6627#undef TARGET_DWARF_REGISTER_SPAN
6628#define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
6629#undef TARGET_EMUTLS_VAR_INIT
6630#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6631#undef TARGET_EXPAND_BUILTIN
6632#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6633#undef TARGET_FRAME_POINTER_REQUIRED
6634#define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
6635#undef TARGET_FUNCTION_ARG
6636#undef TARGET_FUNCTION_ARG_ADVANCE
6637#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6638#define TARGET_FUNCTION_ARG gcn_function_arg
6639#undef TARGET_FUNCTION_VALUE
6640#define TARGET_FUNCTION_VALUE gcn_function_value
6641#undef TARGET_FUNCTION_VALUE_REGNO_P
6642#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6643#undef TARGET_GIMPLIFY_VA_ARG_EXPR
6644#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6645#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6646#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6647#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
6648#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
6649#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
6650#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
6651 gcn_goacc_create_worker_broadcast_record
6652#undef TARGET_GOACC_FORK_JOIN
6653#define TARGET_GOACC_FORK_JOIN gcn_fork_join
6654#undef TARGET_GOACC_REDUCTION
6655#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6656#undef TARGET_GOACC_VALIDATE_DIMS
6657#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6658#undef TARGET_GOACC_SHARED_MEM_LAYOUT
6659#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
6660#undef TARGET_HARD_REGNO_MODE_OK
6661#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6662#undef TARGET_HARD_REGNO_NREGS
6663#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6664#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6665#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6666#undef TARGET_INIT_BUILTINS
6667#define TARGET_INIT_BUILTINS gcn_init_builtins
6668#undef TARGET_INIT_LIBFUNCS
6669#define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
6670#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6671#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6672 gcn_ira_change_pseudo_allocno_class
6673#undef TARGET_LEGITIMATE_CONSTANT_P
6674#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6675#undef TARGET_LRA_P
6676#define TARGET_LRA_P hook_bool_void_true
6677#undef TARGET_MACHINE_DEPENDENT_REORG
6678#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6679#undef TARGET_MEMORY_MOVE_COST
6680#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6681#undef TARGET_MODES_TIEABLE_P
6682#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6683#undef TARGET_OPTION_OVERRIDE
6684#define TARGET_OPTION_OVERRIDE gcn_option_override
6685#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6686#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6687 gcn_pretend_outgoing_varargs_named
6688#undef TARGET_PROMOTE_FUNCTION_MODE
6689#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6690#undef TARGET_REGISTER_MOVE_COST
6691#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6692#undef TARGET_RETURN_IN_MEMORY
6693#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6694#undef TARGET_RTX_COSTS
6695#define TARGET_RTX_COSTS gcn_rtx_costs
6696#undef TARGET_SECONDARY_RELOAD
6697#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6698#undef TARGET_SECTION_TYPE_FLAGS
6699#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6700#undef TARGET_SCALAR_MODE_SUPPORTED_P
6701#define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
6702#undef TARGET_SIMD_CLONE_ADJUST
6703#define TARGET_SIMD_CLONE_ADJUST gcn_simd_clone_adjust
6704#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
6705#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
6706 gcn_simd_clone_compute_vecsize_and_simdlen
6707#undef TARGET_SIMD_CLONE_USABLE
6708#define TARGET_SIMD_CLONE_USABLE gcn_simd_clone_usable
6709#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6710#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6711 gcn_small_register_classes_for_mode_p
6712#undef TARGET_SPILL_CLASS
6713#define TARGET_SPILL_CLASS gcn_spill_class
6714#undef TARGET_STRICT_ARGUMENT_NAMING
6715#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6716#undef TARGET_TRAMPOLINE_INIT
6717#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6718#undef TARGET_TRULY_NOOP_TRUNCATION
6719#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6720#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6721#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6722#undef TARGET_VECTORIZE_GET_MASK_MODE
6723#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6724#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6725#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6726#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6727#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6728 gcn_preferred_vector_alignment
6729#undef TARGET_VECTORIZE_RELATED_MODE
6730#define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6731#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6732#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6733 gcn_vectorize_support_vector_misalignment
6734#undef TARGET_VECTORIZE_VEC_PERM_CONST
6735#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6736#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6737#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6738 gcn_vector_alignment_reachable
6739#undef TARGET_VECTOR_MODE_SUPPORTED_P
6740#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6741
6742struct gcc_target targetm = TARGET_INITIALIZER;
6743
6744#include "gt-gcn.h"
6745/* }}} */