/* Copyright (C) 2016-2023 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes.  */

/* We want GET_MODE_SIZE et al to return integers, please.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"
#include "dwarf2.h"
#include "gimple.h"
#include "cgraph.h"
#include "case-cfn-macros.h"

/* This file should be included last.  */
#include "target-def.h"

/* }}} */
/* {{{ Global variables.  */

/* Constants used by FP instructions.  */

static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;

/* Holds the ISA variant, derived from the command line parameters.  */

enum gcn_isa gcn_isa = ISA_GCN3;	/* Default to GCN3.  */

/* Reserve this much space for LDS (for propagating variables from
   worker-single mode to worker-partitioned mode), per workgroup.  Global
   analysis could calculate an exact bound, but we don't do that yet.

   We want to permit full occupancy, so size accordingly.  */

/* Use this as a default, but allow it to grow if the user requests a large
   amount of gang-private shared-memory space.  */
static int acc_lds_size = 0x600;

#define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */

#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
		  : flag_openmp ? OMP_LDS_SIZE \
		  : OTHER_LDS_SIZE)
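
/* Illustrative summary: with -fopenacc this reserves ACC_LDS_SIZE bytes
   (0x600 by default, possibly grown by gcn_option_override), with -fopenmp
   it reserves OMP_LDS_SIZE, and in any other mode the whole 64KB LDS is
   reserved.  */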

static int gang_private_hwm = 32;
static hash_map<tree, int> lds_allocs;

/* The number of registers usable by normal non-kernel functions.
   The SGPR count includes any special extra registers such as VCC.  */

#define MAX_NORMAL_SGPR_COUNT	62  // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT	24

/* }}} */
/* {{{ Initialization and options.  */

/* Initialize machine_function.  */

static struct machine_function *
gcn_init_machine_status (void)
{
  struct machine_function *f;

  f = ggc_cleared_alloc<machine_function> ();

  if (TARGET_GCN3)
    f->use_flat_addressing = true;

  return f;
}

/* Implement TARGET_OPTION_OVERRIDE.

   Override option settings where defaults are variable, or we have specific
   needs to consider.  */

static void
gcn_option_override (void)
{
  init_machine_status = gcn_init_machine_status;

  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
  if (!flag_pie)
    flag_pie = 2;
  if (!flag_pic)
    flag_pic = flag_pie;

  gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3
	     : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5
	     : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5
	     : gcn_arch == PROCESSOR_GFX908 ? ISA_CDNA1
	     : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2
	     : ISA_UNKNOWN);
  gcc_assert (gcn_isa != ISA_UNKNOWN);

  /* Reserve 1KiB (somewhat arbitrarily) of LDS space for reduction results
     and worker broadcasts.  */
  if (gang_private_size_opt == -1)
    gang_private_size_opt = 512;
  else if (gang_private_size_opt < gang_private_hwm)
    gang_private_size_opt = gang_private_hwm;
  else if (gang_private_size_opt >= acc_lds_size - 1024)
    {
      /* We need some space for reductions and worker broadcasting.  If the
	 user requests a large amount of gang-private LDS space, we might not
	 have enough left for the former.  Increase the LDS allocation in
	 that case, although this may reduce the maximum occupancy on the
	 hardware.  */
      acc_lds_size = gang_private_size_opt + 1024;
      if (acc_lds_size > 32768)
	acc_lds_size = 32768;
    }

  /* The xnack option is a placeholder, for now.  Before removing, update
     gcn-hsa.h's XNACKOPT, gcn.opt's mxnack= default init+descr, and
     invoke.texi's default description.  */
  if (flag_xnack != HSACO_ATTR_OFF)
    sorry ("XNACK support");
}

/* }}} */
/* {{{ Attributes.  */

/* This table defines the arguments that are permitted in
   __attribute__ ((amdgpu_hsa_kernel (...))).

   The names and values correspond to the HSA metadata that is encoded
   into the assembler file and binary.  */

static const struct gcn_kernel_arg_type
{
  const char *name;
  const char *header_pseudo;
  machine_mode mode;

  /* This should be set to -1 or -2 for a dynamically allocated register
     number.  Use -1 if this argument contributes to the user_sgpr_count,
     -2 otherwise.  */
  int fixed_regno;
} gcn_kernel_arg_types[] = {
  {"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
  {"private_segment_buffer",
   ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
  {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
  {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
  {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
  {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
  {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
  {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
  {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
  {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
  {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
  {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
  {"private_segment_wave_offset",
   ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
  {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 14
  {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 15
  {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
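
/* Purely illustrative (not from the sources): a kernel entry point might
   request extra HSA arguments with, e.g.,

     void f (void)
       __attribute__ ((amdgpu_hsa_kernel ("exec", "queue_ptr")));

   where each string must match a .name field in the table above; anything
   not named still receives the defaults from default_requested_args
   below.  */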

static const long default_requested_args
  = (1 << DISPATCH_PTR_ARG)
    | (1 << QUEUE_PTR_ARG)
    | (1 << KERNARG_SEGMENT_PTR_ARG)
    | (1 << WORKGROUP_ID_X_ARG)
    | (1 << WORK_ITEM_ID_X_ARG)
    | (1 << WORK_ITEM_ID_Y_ARG)
    | (1 << WORK_ITEM_ID_Z_ARG);

/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
   This function also sets the default values for some arguments.

   Return true on error; otherwise ARGS is populated.  */

static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
				       tree list)
{
  bool err = false;
  args->requested = default_requested_args;
  args->nargs = 0;

  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    args->reg[a] = -1;

  for (; list; list = TREE_CHAIN (list))
    {
      const char *str;
      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
	{
	  error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
		 "arguments");
	  break;
	}
      str = TREE_STRING_POINTER (TREE_VALUE (list));
      int a;
      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
	{
	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
	    break;
	}
      if (a == GCN_KERNEL_ARG_TYPES)
	{
	  error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
		 str);
	  err = true;
	  break;
	}
      if (args->requested & (1 << a))
	{
	  error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
		 "attribute", str);
	  err = true;
	  break;
	}
      args->requested |= (1 << a);
      args->order[args->nargs++] = a;
    }

  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
     requesting WORK_ITEM_ID_X_ARG.  */
  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
    args->requested |= (1 << WORK_ITEM_ID_X_ARG);

  int sgpr_regno = FIRST_SGPR_REG;
  args->nsgprs = 0;
  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    {
      if (!(args->requested & (1 << a)))
	continue;

      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
      else
	{
	  int reg_count;

	  switch (gcn_kernel_arg_types[a].mode)
	    {
	    case E_SImode:
	      reg_count = 1;
	      break;
	    case E_DImode:
	      reg_count = 2;
	      break;
	    case E_TImode:
	      reg_count = 4;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  args->reg[a] = sgpr_regno;
	  sgpr_regno += reg_count;
	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
	    args->nsgprs += reg_count;
	}
    }
  if (sgpr_regno > FIRST_SGPR_REG + 16)
    {
      error ("too many arguments passed in sgpr registers");
    }
  return err;
}

/* Referenced by TARGET_ATTRIBUTE_TABLE.

   Validates target specific attributes.  */

static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
					tree args, int, bool *no_add_attrs)
{
  if (!FUNC_OR_METHOD_TYPE_P (*node))
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Validate the attribute arguments; the parsed results are discarded.  */
  if (is_attribute_p ("gcnhsa_kernel", name))
    {
      struct gcn_kernel_args kernelarg;

      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
	*no_add_attrs = true;

      return NULL_TREE;
    }

  return NULL_TREE;
}

/* Implement TARGET_ATTRIBUTE_TABLE.

   Create target-specific __attribute__ types.  */

static const struct attribute_spec gcn_attribute_table[] = {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
   true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
  /* End element.  */
  {NULL, 0, 0, false, false, false, false, NULL, NULL}
};

/* }}} */
/* {{{ Registers and modes.  */

/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  */

bool
gcn_scalar_mode_supported_p (scalar_mode mode)
{
  return (mode == BImode
	  || mode == QImode
	  || mode == HImode /* || mode == HFmode  */
	  || mode == SImode || mode == SFmode
	  || mode == DImode || mode == DFmode
	  || mode == TImode);
}

/* Return a vector mode with N lanes of MODE.  */

static machine_mode
VnMODE (int n, machine_mode mode)
{
  switch (mode)
    {
    case E_QImode:
      switch (n)
	{
	case 2: return V2QImode;
	case 4: return V4QImode;
	case 8: return V8QImode;
	case 16: return V16QImode;
	case 32: return V32QImode;
	case 64: return V64QImode;
	}
      break;
    case E_HImode:
      switch (n)
	{
	case 2: return V2HImode;
	case 4: return V4HImode;
	case 8: return V8HImode;
	case 16: return V16HImode;
	case 32: return V32HImode;
	case 64: return V64HImode;
	}
      break;
    case E_HFmode:
      switch (n)
	{
	case 2: return V2HFmode;
	case 4: return V4HFmode;
	case 8: return V8HFmode;
	case 16: return V16HFmode;
	case 32: return V32HFmode;
	case 64: return V64HFmode;
	}
      break;
    case E_SImode:
      switch (n)
	{
	case 2: return V2SImode;
	case 4: return V4SImode;
	case 8: return V8SImode;
	case 16: return V16SImode;
	case 32: return V32SImode;
	case 64: return V64SImode;
	}
      break;
    case E_SFmode:
      switch (n)
	{
	case 2: return V2SFmode;
	case 4: return V4SFmode;
	case 8: return V8SFmode;
	case 16: return V16SFmode;
	case 32: return V32SFmode;
	case 64: return V64SFmode;
	}
      break;
    case E_DImode:
      switch (n)
	{
	case 2: return V2DImode;
	case 4: return V4DImode;
	case 8: return V8DImode;
	case 16: return V16DImode;
	case 32: return V32DImode;
	case 64: return V64DImode;
	}
      break;
    case E_DFmode:
      switch (n)
	{
	case 2: return V2DFmode;
	case 4: return V4DFmode;
	case 8: return V8DFmode;
	case 16: return V16DFmode;
	case 32: return V32DFmode;
	case 64: return V64DFmode;
	}
      break;
    default:
      break;
    }

  return VOIDmode;
}
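
/* For example, VnMODE (64, SImode) yields V64SImode and VnMODE (2, DFmode)
   yields V2DFmode; combinations outside the table yield VOIDmode.  */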

/* Implement TARGET_CLASS_MAX_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   a register of class RCLASS.  */

static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
  /* Scalar registers are 32bit, vector registers are in fact tuples of
     64 lanes.  */
  if (rclass == VGPR_REGS)
    {
      if (vgpr_1reg_mode_p (mode))
	return 1;
      if (vgpr_2reg_mode_p (mode))
	return 2;
      /* TImode is used by DImode compare_and_swap.  */
      if (vgpr_4reg_mode_p (mode))
	return 4;
    }
  else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
    return 2;

  /* Vector modes in SGPRs are not supposed to happen (disallowed by
     gcn_hard_regno_mode_ok), but there are some patterns that have an "Sv"
     constraint and are used by splitters, post-reload.
     This ensures that we don't accidentally mark the following 63 scalar
     registers as "live".  */
  if (rclass == SGPR_REGS && VECTOR_MODE_P (mode))
    return CEIL (GET_MODE_SIZE (GET_MODE_INNER (mode)), 4);

  return CEIL (GET_MODE_SIZE (mode), 4);
}

/* Implement TARGET_HARD_REGNO_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   REGNO.  */

unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
  return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}

/* Implement TARGET_HARD_REGNO_MODE_OK.

   Return true if REGNO can hold value in MODE.  */

bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
  /* Treat a complex mode as if it were a scalar mode of the same overall
     size for the purposes of allocating hard registers.  */
  if (COMPLEX_MODE_P (mode))
    switch (mode)
      {
      case E_CQImode:
      case E_CHImode:
	mode = SImode;
	break;
      case E_CSImode:
	mode = DImode;
	break;
      case E_CDImode:
	mode = TImode;
	break;
      case E_HCmode:
	mode = SFmode;
	break;
      case E_SCmode:
	mode = DFmode;
	break;
      default:
	/* Not supported.  */
	return false;
      }

  switch (regno)
    {
    case FLAT_SCRATCH_LO_REG:
    case XNACK_MASK_LO_REG:
    case TBA_LO_REG:
    case TMA_LO_REG:
      return (mode == SImode || mode == DImode);
    case VCC_LO_REG:
    case EXEC_LO_REG:
      return (mode == BImode || mode == SImode || mode == DImode);
    case M0_REG:
    case FLAT_SCRATCH_HI_REG:
    case XNACK_MASK_HI_REG:
    case TBA_HI_REG:
    case TMA_HI_REG:
      return mode == SImode;
    case VCC_HI_REG:
      return false;
    case EXEC_HI_REG:
      return mode == SImode /*|| mode == V32BImode */;
    case SCC_REG:
    case VCCZ_REG:
    case EXECZ_REG:
      return mode == BImode;
    }
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return true;
  if (SGPR_REGNO_P (regno))
    /* We restrict double register values to aligned registers.  */
    return (sgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
  if (VGPR_REGNO_P (regno))
    /* Vector instructions do not care about the alignment of register
       pairs, but where there is no 64-bit instruction, many of the
       define_split do not work if the input and output registers partially
       overlap.  We tried to fix this with early clobber and match
       constraints, but it was bug prone, added complexity, and conflicted
       with the 'U0' constraints on vec_merge.
       Therefore, we restrict ourselves to aligned registers.  */
    return (vgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
	    /* TImode is used by DImode compare_and_swap,
	       and by DIVMOD V64DImode libfuncs.  */
	    || (!((regno - FIRST_VGPR_REG) & 3) && vgpr_4reg_mode_p (mode)));
  return false;
}

/* Implement REGNO_REG_CLASS via gcn.h.

   Return smallest class containing REGNO.  */

enum reg_class
gcn_regno_reg_class (int regno)
{
  switch (regno)
    {
    case SCC_REG:
      return SCC_CONDITIONAL_REG;
    case VCC_LO_REG:
    case VCC_HI_REG:
      return VCC_CONDITIONAL_REG;
    case VCCZ_REG:
      return VCCZ_CONDITIONAL_REG;
    case EXECZ_REG:
      return EXECZ_CONDITIONAL_REG;
    case EXEC_LO_REG:
    case EXEC_HI_REG:
      return EXEC_MASK_REG;
    }
  if (VGPR_REGNO_P (regno))
    return VGPR_REGS;
  if (SGPR_REGNO_P (regno))
    return SGPR_REGS;
  if (regno < FIRST_VGPR_REG)
    return GENERAL_REGS;
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return AFP_REGS;
  return ALL_REGS;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.

   GCC assumes that the lowpart contains the first part of the value as
   stored in memory.  This is not the case for vector registers.  */

bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
			   reg_class_t regclass)
{
  if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
    return true;

  /* Vector conversions are only valid when changing mode with a fixed number
     of lanes, or changing number of lanes with a fixed mode.  Anything else
     would require actual data movement.  */
  if (VECTOR_MODE_P (from) && VECTOR_MODE_P (to)
      && GET_MODE_NUNITS (from) != GET_MODE_NUNITS (to)
      && GET_MODE_INNER (from) != GET_MODE_INNER (to))
    return false;

  /* Vector/scalar conversions are only permitted when the scalar mode
     is smaller than the inner vector mode.  */
  if ((VECTOR_MODE_P (from) && !VECTOR_MODE_P (to)
       && GET_MODE_SIZE (to) >= GET_MODE_SIZE (GET_MODE_INNER (from)))
      || (VECTOR_MODE_P (to) && !VECTOR_MODE_P (from)
	  && GET_MODE_SIZE (from) >= GET_MODE_SIZE (GET_MODE_INNER (to))))
    return false;

  return (gcn_class_max_nregs (regclass, from)
	  == gcn_class_max_nregs (regclass, to));
}

/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.

   When this hook returns true for MODE, the compiler allows
   registers explicitly used in the rtl to be used as spill registers
   but prevents the compiler from extending the lifetime of these
   registers.  */

bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
  /* We allocate into exec and vcc regs.  Those are small register
     classes.  */
  return mode == DImode || mode == SImode;
}

/* Implement TARGET_CLASS_LIKELY_SPILLED_P.

   Returns true if pseudos that have been assigned to registers of class
   RCLASS would likely be spilled because registers of RCLASS are needed
   for spill registers.  */

static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
  return (rclass == EXEC_MASK_REG
	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}

/* Implement TARGET_MODES_TIEABLE_P.

   Returns true if a value of MODE1 is accessible in MODE2 without
   copying.  */

bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
    {
      int vf1 = (VECTOR_MODE_P (mode1) ? GET_MODE_NUNITS (mode1) : 1);
      int vf2 = (VECTOR_MODE_P (mode2) ? GET_MODE_NUNITS (mode2) : 1);
      machine_mode inner1 = (vf1 > 1 ? GET_MODE_INNER (mode1) : mode1);
      machine_mode inner2 = (vf2 > 1 ? GET_MODE_INNER (mode2) : mode2);

      return (vf1 == vf2 || (inner1 == inner2 && vf2 <= vf1));
    }

  return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}

/* Implement TARGET_TRULY_NOOP_TRUNCATION.

   Returns true if it is safe to "convert" a value of INPREC bits to one of
   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
   it as if it had only OUTPREC bits.  */

bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
  return ((inprec <= 32) && (outprec <= inprec));
}
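
/* Worked example: truncating SImode to QImode is a no-op here (the insn
   simply ignores the upper bits), whereas DImode to SImode is not,
   presumably because 64-bit values occupy a 32-bit register pair.  */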

/* Return the N-th part of a value occupying multiple registers.  */

rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;

  if (vf > 1)
    {
      machine_mode vsimode = VnMODE (vf, SImode);

      if (REG_P (op))
	{
	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
	  return gen_rtx_REG (vsimode, REGNO (op) + n);
	}
      if (GET_CODE (op) == CONST_VECTOR)
	{
	  int units = GET_MODE_NUNITS (mode);
	  rtvec v = rtvec_alloc (units);

	  for (int i = 0; i < units; ++i)
	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
						 CONST_VECTOR_ELT (op, i), n);

	  return gen_rtx_CONST_VECTOR (vsimode, v);
	}
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (vsimode);
      gcc_unreachable ();
    }
  else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
    {
      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
      return gen_rtx_REG (SImode, REGNO (op) + n);
    }
  else
    {
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (SImode);

      /* If it's a constant then let's assume it is of the largest mode
	 available, otherwise simplify_gen_subreg will fail.  */
      if (mode == VOIDmode && CONST_INT_P (op))
	mode = DImode;
      return simplify_gen_subreg (SImode, op, mode, n * 4);
    }
}
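
/* Illustrative example: for a DImode value held in a register pair, part 0
   is an SImode REG for the low-order register and part 1 an SImode REG for
   the high-order one; for a V64DImode value the parts are the V64SImode
   low and high register tuples.  */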

/* Return the N-th doubleword part of a value occupying multiple
   registers.  */

rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
  return simplify_gen_subreg (DImode, op, mode, n * 8);
}

/* Return true if OP can be split into subregs or high/low parts.
   This is always true for scalars, but not normally true for vectors.
   However, for vectors in hardregs we can use the low and high registers.  */

bool
gcn_can_split_p (machine_mode, rtx op)
{
  if (vgpr_vector_mode_p (GET_MODE (op)))
    {
      if (GET_CODE (op) == SUBREG)
	op = SUBREG_REG (op);
      if (!REG_P (op))
	return true;
      return REGNO (op) <= FIRST_PSEUDO_REGISTER;
    }
  return true;
}

/* Implement TARGET_SPILL_CLASS.

   Return class of registers which could be used for pseudo of MODE
   and of class RCLASS for spilling instead of memory.  Return NO_REGS
   if it is not possible or non-profitable.  */

static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode*/)
{
  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
      || c == VCC_CONDITIONAL_REG || c == EXEC_MASK_REG)
    return SGPR_REGS;
  else
    return NO_REGS;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.

   Change allocno class for given pseudo from allocno and best class
   calculated by IRA.  */

static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
				     reg_class_t best_cl)
{
  /* Avoid returning classes that contain both vgpr and sgpr registers.  */
  if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
    return cl;
  if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
      && best_cl != ALL_GPR_REGS)
    return best_cl;

  machine_mode mode = PSEUDO_REGNO_MODE (regno);
  if (vgpr_vector_mode_p (mode))
    return VGPR_REGS;

  return GENERAL_REGS;
}

/* Create a new DImode pseudo reg and emit an instruction to initialize
   it to VAL.  */

rtx
get_exec (int64_t val)
{
  rtx reg = gen_reg_rtx (DImode);
  emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
  return reg;
}

rtx
get_exec (machine_mode mode)
{
  int vf = (VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1);
  return get_exec (0xffffffffffffffffUL >> (64-vf));
}
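
/* For example, get_exec (V16SImode) initializes the new register to 0xffff
   (one EXEC bit per lane), and get_exec (V64SImode) to 0xffffffffffffffff,
   enabling all 64 lanes.  */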

/* }}} */
/* {{{ Immediate constants.  */

/* Initialize shared numeric constants.  */

static void
init_ext_gcn_constants (void)
{
  real_from_integer (&dconst4, DFmode, 4, SIGNED);

  /* FIXME: this constant probably does not match what hardware really loads.
     Reality check it eventually.  */
  real_from_string (&dconst1over2pi,
		    "0.15915494309189532");
  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);

  ext_gcn_constants_init = 1;
}

REAL_VALUE_TYPE
gcn_dconst1over2pi (void)
{
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();
  return dconst1over2pi;
}

/* Return non-zero if X is a constant that can appear as an inline operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4 or 1/(2*pi),
   or a vector of those.
   The value returned should be the encoding of this constant.  */

int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);
  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;

  if (vf > 1)
    mode = GET_MODE_INNER (mode);

  if (vf > 1
      && (mode == HFmode || mode == SFmode || mode == DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return 0;
      n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return 0;
      for (int i = 1; i < vf; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return 0;
      return 1;
    }

  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return 0;

  /* dconst4 and dconst1over2pi are initialized lazily; make sure that has
     happened before comparing against them.  */
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();

  const REAL_VALUE_TYPE *r;

  if (x == CONST0_RTX (mode))
    return 128;
  if (x == CONST1_RTX (mode))
    return 242;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (real_identical (r, &dconsthalf))
    return 240;
  if (real_identical (r, &dconstm1))
    return 243;
  if (real_identical (r, &dconst2))
    return 244;
  if (real_identical (r, &dconst4))
    return 246;
  if (real_identical (r, &dconst1over2pi))
    return 248;

  /* real_value_negate returns the negated value; compare the negation to
     catch -0.5, -2 and -4.  */
  REAL_VALUE_TYPE neg = real_value_negate (r);
  if (real_identical (&neg, &dconsthalf))
    return 241;
  if (real_identical (&neg, &dconst2))
    return 245;
  if (real_identical (&neg, &dconst4))
    return 247;

  return 0;
}

/* Return true if X is a constant that can appear as an immediate operand,
   either as an inline constant (0, 0.5, -0.5, 1, -1, 2, -2, 4, -4,
   1/(2*pi)) or as a literal constant, or a vector of those.  */

bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);
  int vf = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;

  if (vf > 1)
    mode = GET_MODE_INNER (mode);

  if (vf > 1
      && (mode == HFmode || mode == SFmode || mode == DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return false;
      n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return false;
      for (int i = 1; i < vf; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return false;

  if (gcn_inline_fp_constant_p (x, false))
    return true;
  /* FIXME: It is not clear how 32bit immediates are interpreted here.  */
  return (mode != DFmode);
}

/* Return true if X is a constant representable as an inline immediate
   constant in a 32-bit instruction encoding.  */

bool
gcn_inline_constant_p (rtx x)
{
  if (GET_CODE (x) == CONST_INT)
    return INTVAL (x) >= -16 && INTVAL (x) <= 64;
  if (GET_CODE (x) == CONST_DOUBLE)
    return gcn_inline_fp_constant_p (x, false);
  if (GET_CODE (x) == CONST_VECTOR)
    {
      int n;
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
      if (!n)
	return false;
      for (int i = 1; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  return false;
}

/* Return true if X is a constant representable as an immediate constant
   in a 32- or 64-bit instruction encoding.  */

bool
gcn_constant_p (rtx x)
{
  switch (GET_CODE (x))
    {
    case CONST_INT:
      return true;

    case CONST_DOUBLE:
      return gcn_fp_constant_p (x, false);

    case CONST_VECTOR:
      {
	int n;
	if (!vgpr_vector_mode_p (GET_MODE (x)))
	  return false;
	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
	if (!n)
	  return false;
	for (int i = 1; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	    return false;
	return true;
      }

    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      ;
    }

  return false;
}

/* Return true if X is a constant representable as two inline immediate
   constants in a 64-bit instruction that is split into two 32-bit
   instructions.
   When MIXED is set, the low part is permitted to use the full 32 bits.  */

bool
gcn_inline_constant64_p (rtx x, bool mixed)
{
  if (GET_CODE (x) == CONST_VECTOR)
    {
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
	return false;
      for (int i = 1; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;

      return true;
    }

  if (GET_CODE (x) != CONST_INT)
    return false;

  rtx val_lo = gcn_operand_part (DImode, x, 0);
  rtx val_hi = gcn_operand_part (DImode, x, 1);
  return ((mixed || gcn_inline_constant_p (val_lo))
	  && gcn_inline_constant_p (val_hi));
}

/* Return true if X is a constant representable as an immediate constant
   in a 32- or 64-bit instruction encoding where the hardware will
   extend the immediate to 64 bits.  */

bool
gcn_constant64_p (rtx x)
{
  if (!gcn_constant_p (x))
    return false;

  if (GET_CODE (x) != CONST_INT)
    return true;

  /* Negative numbers are only allowed if they can be encoded within src0,
     because the 32-bit immediates do not get sign-extended.
     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
     assembler will use a src0 inline immediate and that will get
     sign-extended.  */
  HOST_WIDE_INT val = INTVAL (x);
  return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
	  || gcn_inline_constant_p (x));	/* Src0.  */
}
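
/* Worked example: 0x7fffffff qualifies (a positive 32-bit literal), and -5
   qualifies (a src0 inline constant), but 0xfffffff8 does not: the
   assembler would encode it as the inline constant -8, which the hardware
   would then sign-extend to 64 bits.  */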

/* Implement TARGET_LEGITIMATE_CONSTANT_P.

   Returns true if X is a legitimate constant for a MODE immediate operand.  */

bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
  return gcn_constant_p (x);
}

/* Return true if X is a CONST_VECTOR that duplicates a single constant.  */

static bool
single_cst_vector_p (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;
  for (int i = 1; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
    if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
      return false;
  return true;
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, int a)
{
  /*if (!a)
    return CONST0_RTX (mode);
  if (a == -1)
    return CONSTM1_RTX (mode);
  if (a == 1)
    return CONST1_RTX (mode);
  if (a == 2)
    return CONST2_RTX (mode);*/

  int units = GET_MODE_NUNITS (mode);
  machine_mode innermode = GET_MODE_INNER (mode);

  rtx tem;
  if (FLOAT_MODE_P (innermode))
    {
      REAL_VALUE_TYPE rv;
      real_from_integer (&rv, NULL, a, SIGNED);
      tem = const_double_from_real_value (rv, innermode);
    }
  else
    tem = gen_int_mode (a, innermode);

  rtvec v = rtvec_alloc (units);
  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = tem;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
  int units = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (units);

  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = a;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create an undefined vector value, used where an insn operand is
   optional.  */

rtx
gcn_gen_undef (machine_mode mode)
{
  return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}

/* }}} */
/* {{{ Utility functions.  */

/* Generalised accessor functions for instruction patterns.
   The machine description '@' prefix does something similar, but as of
   GCC 10 is incompatible with define_subst, and anyway it doesn't
   auto-handle the exec feature.

   Four macros are provided; each function only needs one:

   GEN_VN         - create accessor functions for all sizes of one mode
   GEN_VNM        - create accessor functions for all sizes of all modes
   GEN_VN_NOEXEC  - for insns without "_exec" variants
   GEN_VNM_NOEXEC - likewise

   E.g.  add<mode>3
     GEN_VNM (add, 3, A(rtx dest, rtx s1, rtx s2), A(dest, s1, s2))

     gen_addvNsi3 (dst, a, b)
       -> calls gen_addv64si3, or gen_addv32si3, etc.

     gen_addvNm3 (dst, a, b)
       -> calls gen_addv64qi3, or gen_addv2di3, etc.

   The mode is determined from the first parameter, which must be called
   "dest" (or else the macro doesn't work).

   Each function has two optional parameters at the end: merge_src and exec.
   If exec is non-null, the function will call the "_exec" variant of the
   insn.  If exec is non-null but merge_src is null then an undef unspec
   will be created.

   E.g. cont.
     gen_addvNsi3 (v64sidst, a, b, oldval, exec)
       -> calls gen_addv64si3_exec (v64sidst, a, b, oldval, exec)

     gen_addvNm3 (v2qidst, a, b, NULL, exec)
       -> calls gen_addv2qi3_exec (v2qidst, a, b,
				   gcn_gen_undef (V2QImode), exec)
 */

#define A(...) __VA_ARGS__
#define GEN_VN_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
static rtx \
gen_##PREFIX##vN##SUFFIX (PARAMS) \
{ \
  machine_mode mode = GET_MODE (dest); \
  int n = GET_MODE_NUNITS (mode); \
\
  switch (n) \
    { \
    case 2: return gen_##PREFIX##v2##SUFFIX (ARGS); \
    case 4: return gen_##PREFIX##v4##SUFFIX (ARGS); \
    case 8: return gen_##PREFIX##v8##SUFFIX (ARGS); \
    case 16: return gen_##PREFIX##v16##SUFFIX (ARGS); \
    case 32: return gen_##PREFIX##v32##SUFFIX (ARGS); \
    case 64: return gen_##PREFIX##v64##SUFFIX (ARGS); \
    } \
\
  gcc_unreachable (); \
  return NULL_RTX; \
}

#define GEN_VNM_NOEXEC(PREFIX, SUFFIX, PARAMS, ARGS) \
GEN_VN_NOEXEC (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN_NOEXEC (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
static rtx \
gen_##PREFIX##vNm##SUFFIX (PARAMS) \
{ \
  machine_mode mode = GET_MODE_INNER (GET_MODE (dest)); \
\
  switch (mode) \
    { \
    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS); \
    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS); \
    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS); \
    case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS); \
    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS); \
    case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS); \
    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS); \
    default: \
      break; \
    } \
\
  gcc_unreachable (); \
  return NULL_RTX; \
}

#define GEN_VN(PREFIX, SUFFIX, PARAMS, ARGS) \
static rtx \
gen_##PREFIX##vN##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
{ \
  machine_mode mode = GET_MODE (dest); \
  int n = GET_MODE_NUNITS (mode); \
\
  if (exec && !merge_src) \
    merge_src = gcn_gen_undef (mode); \
\
  if (exec) \
    switch (n) \
      { \
      case 2: return gen_##PREFIX##v2##SUFFIX##_exec (ARGS, merge_src, exec); \
      case 4: return gen_##PREFIX##v4##SUFFIX##_exec (ARGS, merge_src, exec); \
      case 8: return gen_##PREFIX##v8##SUFFIX##_exec (ARGS, merge_src, exec); \
      case 16: return gen_##PREFIX##v16##SUFFIX##_exec (ARGS, merge_src, exec); \
      case 32: return gen_##PREFIX##v32##SUFFIX##_exec (ARGS, merge_src, exec); \
      case 64: return gen_##PREFIX##v64##SUFFIX##_exec (ARGS, merge_src, exec); \
      } \
  else \
    switch (n) \
      { \
      case 2: return gen_##PREFIX##v2##SUFFIX (ARGS); \
      case 4: return gen_##PREFIX##v4##SUFFIX (ARGS); \
      case 8: return gen_##PREFIX##v8##SUFFIX (ARGS); \
      case 16: return gen_##PREFIX##v16##SUFFIX (ARGS); \
      case 32: return gen_##PREFIX##v32##SUFFIX (ARGS); \
      case 64: return gen_##PREFIX##v64##SUFFIX (ARGS); \
      } \
\
  gcc_unreachable (); \
  return NULL_RTX; \
}

#define GEN_VNM(PREFIX, SUFFIX, PARAMS, ARGS) \
GEN_VN (PREFIX, qi##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, hi##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, hf##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, si##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, sf##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, di##SUFFIX, A(PARAMS), A(ARGS)) \
GEN_VN (PREFIX, df##SUFFIX, A(PARAMS), A(ARGS)) \
USE_TI (GEN_VN (PREFIX, ti##SUFFIX, A(PARAMS), A(ARGS))) \
static rtx \
gen_##PREFIX##vNm##SUFFIX (PARAMS, rtx merge_src=NULL, rtx exec=NULL) \
{ \
  machine_mode mode = GET_MODE_INNER (GET_MODE (dest)); \
\
  switch (mode) \
    { \
    case E_QImode: return gen_##PREFIX##vNqi##SUFFIX (ARGS, merge_src, exec); \
    case E_HImode: return gen_##PREFIX##vNhi##SUFFIX (ARGS, merge_src, exec); \
    case E_HFmode: return gen_##PREFIX##vNhf##SUFFIX (ARGS, merge_src, exec); \
    case E_SImode: return gen_##PREFIX##vNsi##SUFFIX (ARGS, merge_src, exec); \
    case E_SFmode: return gen_##PREFIX##vNsf##SUFFIX (ARGS, merge_src, exec); \
    case E_DImode: return gen_##PREFIX##vNdi##SUFFIX (ARGS, merge_src, exec); \
    case E_DFmode: return gen_##PREFIX##vNdf##SUFFIX (ARGS, merge_src, exec); \
    case E_TImode: \
      USE_TI (return gen_##PREFIX##vNti##SUFFIX (ARGS, merge_src, exec);) \
    default: \
      break; \
    } \
\
  gcc_unreachable (); \
  return NULL_RTX; \
}

/* These have TImode support.  */
#define USE_TI(ARGS) ARGS
GEN_VNM (mov,, A(rtx dest, rtx src), A(dest, src))
GEN_VNM (vec_duplicate,, A(rtx dest, rtx src), A(dest, src))

/* These do not have TImode support.  */
#undef USE_TI
#define USE_TI(ARGS)
GEN_VNM (add,3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,si3_vcc_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
	A(dest, src1, src2, vcc))
GEN_VN (add,di3_sext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,di3_vcc_zext_dup, A(rtx dest, rtx src1, rtx src2, rtx vcc),
	A(dest, src1, src2, vcc))
GEN_VN (add,di3_zext_dup2, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (add,di3_vcc_zext_dup2, A(rtx dest, rtx src1, rtx src2, rtx vcc),
	A(dest, src1, src2, vcc))
GEN_VN (addc,si3, A(rtx dest, rtx src1, rtx src2, rtx vccout, rtx vccin),
	A(dest, src1, src2, vccout, vccin))
GEN_VN (and,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (ashl,si3, A(rtx dest, rtx src, rtx shift), A(dest, src, shift))
GEN_VNM_NOEXEC (ds_bpermute,, A(rtx dest, rtx addr, rtx src, rtx exec),
		A(dest, addr, src, exec))
GEN_VNM (gather,_expr, A(rtx dest, rtx addr, rtx as, rtx vol),
	 A(dest, addr, as, vol))
GEN_VN (mul,si3_dup, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN (sub,si3, A(rtx dest, rtx src1, rtx src2), A(dest, src1, src2))
GEN_VN_NOEXEC (vec_series,si, A(rtx dest, rtx x, rtx c), A(dest, x, c))

#undef USE_TI
#undef GEN_VNM
#undef GEN_VN
#undef GET_VN_FN
#undef A

/* Get icode for vector instructions without an optab.  */

#define CODE_FOR(PREFIX, SUFFIX) \
static int \
get_code_for_##PREFIX##vN##SUFFIX (int nunits) \
{ \
  switch (nunits) \
    { \
    case 2: return CODE_FOR_##PREFIX##v2##SUFFIX; \
    case 4: return CODE_FOR_##PREFIX##v4##SUFFIX; \
    case 8: return CODE_FOR_##PREFIX##v8##SUFFIX; \
    case 16: return CODE_FOR_##PREFIX##v16##SUFFIX; \
    case 32: return CODE_FOR_##PREFIX##v32##SUFFIX; \
    case 64: return CODE_FOR_##PREFIX##v64##SUFFIX; \
    } \
\
  gcc_unreachable (); \
  return CODE_FOR_nothing; \
}

#define CODE_FOR_OP(PREFIX) \
	 CODE_FOR (PREFIX, qi) \
	 CODE_FOR (PREFIX, hi) \
	 CODE_FOR (PREFIX, hf) \
	 CODE_FOR (PREFIX, si) \
	 CODE_FOR (PREFIX, sf) \
	 CODE_FOR (PREFIX, di) \
	 CODE_FOR (PREFIX, df) \
	 CODE_FOR (PREFIX, ti) \
static int \
get_code_for_##PREFIX (machine_mode mode) \
{ \
  int vf = GET_MODE_NUNITS (mode); \
  machine_mode smode = GET_MODE_INNER (mode); \
\
  switch (smode) \
    { \
    case E_QImode: return get_code_for_##PREFIX##vNqi (vf); \
    case E_HImode: return get_code_for_##PREFIX##vNhi (vf); \
    case E_HFmode: return get_code_for_##PREFIX##vNhf (vf); \
    case E_SImode: return get_code_for_##PREFIX##vNsi (vf); \
    case E_SFmode: return get_code_for_##PREFIX##vNsf (vf); \
    case E_DImode: return get_code_for_##PREFIX##vNdi (vf); \
    case E_DFmode: return get_code_for_##PREFIX##vNdf (vf); \
    case E_TImode: return get_code_for_##PREFIX##vNti (vf); \
    default: break; \
    } \
\
  gcc_unreachable (); \
  return CODE_FOR_nothing; \
}

CODE_FOR_OP (reload_in)
CODE_FOR_OP (reload_out)

#undef CODE_FOR_OP
#undef CODE_FOR

/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
   series with step STEP, starting at zero.  */

bool
gcn_stepped_zero_int_parallel_p (rtx op, int step)
{
  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
    return false;

  unsigned HOST_WIDE_INT base = 0;
  for (int i = 0; i < XVECLEN (op, 0); ++i)
    if (!CONST_INT_P (XVECEXP (op, 0, i))
	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
      return false;

  return true;
}
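
/* For example, with STEP 1 this accepts (parallel [0 1 2 3 ...]) but
   rejects any series that starts at a non-zero value or skips an
   element.  */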

/* }}} */
/* {{{ Addresses, pointers and moves.  */

/* Return true if REG is a valid place to store a pointer,
   for instructions that require an SGPR.
   FIXME rename.  */

static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return (SGPR_REGNO_P (regno) || regno == M0_REG
	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}

/* Return true if REG is a valid place to store a pointer,
   for instructions that require a VGPR.  */

static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return VGPR_REGNO_P (regno);
}

/* Return true if X would be valid inside a MEM using the Flat address
   space.  */

bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
  bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);

  if (vec_mode && gcn_address_register_p (x, DImode, false))
    return true;

  if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
    return true;

  if (TARGET_GCN5_PLUS
      && GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if X would be valid inside a MEM using the Scalar Flat
   address space.  */

bool
gcn_scalar_flat_address_p (rtx x)
{
  if (gcn_address_register_p (x, DImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if MEM X would be valid for the Scalar Flat address space.  */

bool
gcn_scalar_flat_mem_p (rtx x)
{
  if (!MEM_P (x))
    return false;

  if (GET_MODE_SIZE (GET_MODE (x)) < 4)
    return false;

  return gcn_scalar_flat_address_p (XEXP (x, 0));
}

/* Return true if X would be valid inside a MEM using the LDS or GDS
   address spaces.  */

bool
gcn_ds_address_p (rtx x)
{
  if (gcn_vec_address_register_p (x, SImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if ADDR would be valid inside a MEM using the Global
   address space.  */

bool
gcn_global_address_p (rtx addr)
{
  if (gcn_address_register_p (addr, DImode, false)
      || gcn_vec_address_register_p (addr, DImode, false))
    return true;

  if (GET_CODE (addr) == PLUS)
    {
      rtx base = XEXP (addr, 0);
      rtx offset = XEXP (addr, 1);
      bool immediate_p = (CONST_INT_P (offset)
			  && INTVAL (offset) >= -(1 << 12)
			  && INTVAL (offset) < (1 << 12));

      if ((gcn_address_register_p (base, DImode, false)
	   || gcn_vec_address_register_p (base, DImode, false))
	  && immediate_p)
	/* SGPR + CONST or VGPR + CONST  */
	return true;

      if (gcn_address_register_p (base, DImode, false)
	  && gcn_vgpr_register_operand (offset, SImode))
	/* SGPR + VGPR  */
	return true;

      if (GET_CODE (base) == PLUS
	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
	  && immediate_p)
	/* (SGPR + VGPR) + CONST  */
	return true;
    }

  return false;
}

/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.

   Recognizes RTL expressions that are valid memory addresses for an
   instruction.  The MODE argument is the machine mode for the MEM
   expression that wants to use this address.

   It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS
   should convert common non-canonical forms to canonical form so that
   they will be recognized.  */

static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
				     addr_space_t as)
{
  /* All vector instructions need to work on addresses in registers.  */
  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
    return false;

  if (AS_SCALAR_FLAT_P (as))
    {
      if (mode == QImode || mode == HImode)
	return 0;

      switch (GET_CODE (x))
	{
	case REG:
	  return gcn_address_register_p (x, DImode, strict);
	/* Addresses are in the form BASE+OFFSET.
	   OFFSET is either 20bit unsigned immediate, SGPR or M0.
	   Writes and atomics do not accept SGPR.  */
	case PLUS:
	  {
	    rtx x0 = XEXP (x, 0);
	    rtx x1 = XEXP (x, 1);
	    if (!gcn_address_register_p (x0, DImode, strict))
	      return false;
	    /* FIXME: This is disabled because of the mode mismatch between
	       SImode (for the address or m0 register) and the DImode PLUS.
	       We'll need a zero_extend or similar.

	    if (gcn_m0_register_p (x1, SImode, strict)
		|| gcn_address_register_p (x1, SImode, strict))
	      return true;
	    else*/
	    if (GET_CODE (x1) == CONST_INT)
	      {
		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
		    /* The low bits of the offset are ignored, even when
		       they're meant to realign the pointer.  */
		    && !(INTVAL (x1) & 0x3))
		  return true;
	      }
	    return false;
	  }

	default:
	  break;
	}
    }
  else if (AS_SCRATCH_P (as))
    return gcn_address_register_p (x, SImode, strict);
  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
    {
      if (TARGET_GCN3 || GET_CODE (x) == REG)
	return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
		? gcn_address_register_p (x, DImode, strict)
		: gcn_vec_address_register_p (x, DImode, strict));
      else
	{
	  gcc_assert (TARGET_GCN5_PLUS);

	  if (GET_CODE (x) == PLUS)
	    {
	      rtx x1 = XEXP (x, 1);

	      if (VECTOR_MODE_P (mode)
		  ? !gcn_address_register_p (x, DImode, strict)
		  : !gcn_vec_address_register_p (x, DImode, strict))
		return false;

	      if (GET_CODE (x1) == CONST_INT)
		{
		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
		      /* The low bits of the offset are ignored, even when
			 they're meant to realign the pointer.  */
		      && !(INTVAL (x1) & 0x3))
		    return true;
		}
	    }
	  return false;
	}
    }
  else if (AS_GLOBAL_P (as))
    {
      gcc_assert (TARGET_GCN5_PLUS);

      if (GET_CODE (x) == REG)
	return (gcn_address_register_p (x, DImode, strict)
		|| (!VECTOR_MODE_P (mode)
		    && gcn_vec_address_register_p (x, DImode, strict)));
      else if (GET_CODE (x) == PLUS)
	{
	  rtx base = XEXP (x, 0);
	  rtx offset = XEXP (x, 1);

	  bool immediate_p = (GET_CODE (offset) == CONST_INT
			      /* Signed 13-bit immediate.  */
			      && INTVAL (offset) >= -(1 << 12)
			      && INTVAL (offset) < (1 << 12)
			      /* The low bits of the offset are ignored, even
				 when they're meant to realign the
				 pointer.  */
			      && !(INTVAL (offset) & 0x3));

	  if (!VECTOR_MODE_P (mode))
	    {
	      if ((gcn_address_register_p (base, DImode, strict)
		   || gcn_vec_address_register_p (base, DImode, strict))
		  && immediate_p)
		/* SGPR + CONST or VGPR + CONST  */
		return true;

	      if (gcn_address_register_p (base, DImode, strict)
		  && gcn_vgpr_register_operand (offset, SImode))
		/* SGPR + VGPR  */
		return true;

	      if (GET_CODE (base) == PLUS
		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
		  && immediate_p)
		/* (SGPR + VGPR) + CONST  */
		return true;
	    }
	  else
	    {
	      if (gcn_address_register_p (base, DImode, strict)
		  && immediate_p)
		/* SGPR + CONST  */
		return true;
	    }
	}
      else
	return false;
    }
  else if (AS_ANY_DS_P (as))
    switch (GET_CODE (x))
      {
      case REG:
	return (VECTOR_MODE_P (mode)
		? gcn_address_register_p (x, SImode, strict)
		: gcn_vec_address_register_p (x, SImode, strict));
      /* Addresses are in the form BASE+OFFSET.
	 OFFSET is either 20bit unsigned immediate, SGPR or M0.
	 Writes and atomics do not accept SGPR.  */
      case PLUS:
	{
	  rtx x0 = XEXP (x, 0);
	  rtx x1 = XEXP (x, 1);
	  if (!gcn_vec_address_register_p (x0, DImode, strict))
	    return false;
	  if (GET_CODE (x1) == REG)
	    {
	      if (GET_CODE (x1) != REG
		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
		      && !gcn_ssrc_register_operand (x1, DImode)))
		return false;
	    }
	  else if (GET_CODE (x1) == CONST_VECTOR
		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
		   && single_cst_vector_p (x1))
	    {
	      x1 = CONST_VECTOR_ELT (x1, 0);
	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
		return true;
	    }
	  return false;
	}

      default:
	break;
      }
  else
    gcc_unreachable ();
  return false;
}
1831
1832/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1833
1834 Return the appropriate mode for a named address pointer. */
1835
1836static scalar_int_mode
1837gcn_addr_space_pointer_mode (addr_space_t addrspace)
1838{
1839 switch (addrspace)
1840 {
1841 case ADDR_SPACE_SCRATCH:
1842 case ADDR_SPACE_LDS:
1843 case ADDR_SPACE_GDS:
1844 return SImode;
1845 case ADDR_SPACE_DEFAULT:
1846 case ADDR_SPACE_FLAT:
1847 case ADDR_SPACE_FLAT_SCRATCH:
1848 case ADDR_SPACE_SCALAR_FLAT:
1849 return DImode;
1850 default:
1851 gcc_unreachable ();
1852 }
1853}
1854
1855/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1856
1857 Return the appropriate mode for a named address space address. */
1858
1859static scalar_int_mode
1860gcn_addr_space_address_mode (addr_space_t addrspace)
1861{
1862 return gcn_addr_space_pointer_mode (addrspace);
1863}
1864
1865/* Implement TARGET_ADDR_SPACE_SUBSET_P.
1866
1867 Determine if one named address space is a subset of another. */
1868
1869static bool
1870gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1871{
1872 if (subset == superset)
1873 return true;
1874 /* FIXME: is this true? */
1875 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1876 return true;
1877 return false;
1878}
1879
1880/* Convert from one address space to another. */
1881
1882static rtx
1883gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1884{
1885 gcc_assert (POINTER_TYPE_P (from_type));
1886 gcc_assert (POINTER_TYPE_P (to_type));
1887
1888 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1889 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1890
1891 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1892 {
1893 /* The high bits of the QUEUE_PTR_ARG register are used by
1894 GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */
1895 rtx queue_reg = gen_rtx_REG (DImode,
1896 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1897 rtx queue_ptr = gen_reg_rtx (DImode);
1898 emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff)));
1899 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1900 gen_rtx_PLUS (DImode, queue_ptr,
1901 gen_int_mode (64, SImode)));
1902 rtx tmp = gen_reg_rtx (DImode);
1903
1904 emit_move_insn (gen_lowpart (SImode, tmp), op);
1905 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1906 group_seg_aperture_hi);
1907
1908 return tmp;
1909 }
1910 else if (as_from == as_to)
1911 return op;
1912 else
1913 gcc_unreachable ();
1914}
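/* A minimal sketch of the LDS-to-flat case above, in plain C for clarity
   (queue_base stands for the masked QUEUE_PTR_ARG value; the offset 64
   matches the aperture load emitted above):

     uint32_t aperture_hi = *(uint32_t *) (queue_base + 64);
     uint64_t flat = ((uint64_t) aperture_hi << 32) | (uint32_t) lds_off;

   i.e. the LDS offset provides the low 32 bits and the group-segment
   aperture provides the high 32 bits of the flat pointer.  */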
1915
1916/* Implement TARGET_ADDR_SPACE_DEBUG.
1917
1918 Return the dwarf address space class for each hardware address space. */
1919
1920static int
1921gcn_addr_space_debug (addr_space_t as)
1922{
1923 switch (as)
1924 {
1925 case ADDR_SPACE_DEFAULT:
1926 case ADDR_SPACE_FLAT:
1927 case ADDR_SPACE_SCALAR_FLAT:
1928 case ADDR_SPACE_FLAT_SCRATCH:
1929 return DW_ADDR_none;
1930 case ADDR_SPACE_GLOBAL:
1931 return 1; // DW_ADDR_LLVM_global
1932 case ADDR_SPACE_LDS:
1933 return 3; // DW_ADDR_LLVM_group
1934 case ADDR_SPACE_SCRATCH:
1935 return 4; // DW_ADDR_LLVM_private
1936 case ADDR_SPACE_GDS:
1937 return 0x8000; // DW_ADDR_AMDGPU_region
1938 }
1939 gcc_unreachable ();
1940}
1941
1942
1943/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h.
1944
1945 Return true if REGNO is OK for memory addressing. */
1946
1947bool
1948gcn_regno_mode_code_ok_for_base_p (int regno,
1949 machine_mode, addr_space_t as, int, int)
1950{
1951 if (regno >= FIRST_PSEUDO_REGISTER)
1952 {
1953 if (reg_renumber)
1954 regno = reg_renumber[regno];
1955 else
1956 return true;
1957 }
1958 if (AS_FLAT_P (as))
1959 return (VGPR_REGNO_P (regno)
1960 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1961 else if (AS_SCALAR_FLAT_P (as))
1962 return (SGPR_REGNO_P (regno)
1963 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1964 else if (AS_GLOBAL_P (as))
1965 {
1966 return (SGPR_REGNO_P (regno)
1967 || VGPR_REGNO_P (regno)
1968 || regno == ARG_POINTER_REGNUM
1969 || regno == FRAME_POINTER_REGNUM);
1970 }
1971 else
1972 /* For now. */
1973 return false;
1974}
1975
1976/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1977
1978 Return a suitable register class for memory addressing. */
1979
1980reg_class
1981gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1982 int ic)
1983{
1984 switch (as)
1985 {
1986 case ADDR_SPACE_DEFAULT:
1987 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1988 case ADDR_SPACE_SCALAR_FLAT:
1989 case ADDR_SPACE_SCRATCH:
1990 return SGPR_REGS;
1992 case ADDR_SPACE_FLAT:
1993 case ADDR_SPACE_FLAT_SCRATCH:
1994 case ADDR_SPACE_LDS:
1995 case ADDR_SPACE_GDS:
1996 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1997 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1998 ? SGPR_REGS : VGPR_REGS);
1999 case ADDR_SPACE_GLOBAL:
2000 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2001 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
2002 ? SGPR_REGS : ALL_GPR_REGS);
2003 }
2004 gcc_unreachable ();
2005}
2006
2007/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
2008
2009 Return true if REGNO is OK as an index register for memory addressing. */
2010
2011bool
2012regno_ok_for_index_p (int regno)
2013{
2014 if (regno >= FIRST_PSEUDO_REGISTER)
2015 {
2016 if (reg_renumber)
2017 regno = reg_renumber[regno];
2018 else
2019 return true;
2020 }
2021 return regno == M0_REG || VGPR_REGNO_P (regno);
2022}
2023
2024/* Expand vector init of OP0 by VEC.
2025 Implements vec_init instruction pattern. */
2026
2027void
2028gcn_expand_vector_init (rtx op0, rtx vec)
2029{
2030 rtx val[64];
2031 machine_mode mode = GET_MODE (op0);
2032 int vf = GET_MODE_NUNITS (mode);
2033 machine_mode addrmode = VnMODE (vf, DImode);
2034 machine_mode offsetmode = VnMODE (vf, SImode);
2035
2036 int64_t mem_mask = 0;
2037 int64_t item_mask[64];
2038 rtx ramp = gen_reg_rtx (offsetmode);
2039 rtx addr = gen_reg_rtx (addrmode);
2040
2041 int unit_size = GET_MODE_SIZE (GET_MODE_INNER (GET_MODE (op0)));
2042 emit_insn (gen_mulvNsi3_dup (ramp, gen_rtx_REG (offsetmode, VGPR_REGNO (1)),
2043 GEN_INT (unit_size)));
2044
2045 bool simple_repeat = true;
2046
2047 /* Expand nested vectors into one vector. */
2048 int item_count = XVECLEN (vec, 0);
2049 for (int i = 0, j = 0; i < item_count; i++)
2050 {
2051 rtx item = XVECEXP (vec, 0, i);
2052 machine_mode mode = GET_MODE (item);
2053 int units = VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 1;
2054 item_mask[j] = (((uint64_t)-1)>>(64-units)) << j;
2055
2056 if (simple_repeat && i != 0)
2057 simple_repeat = item == XVECEXP (vec, 0, i-1);
2058
2059 /* If it's a vector of values then copy them into the final location. */
2060 if (GET_CODE (item) == CONST_VECTOR)
2061 {
2062 for (int k = 0; k < units; k++)
2063 val[j++] = XVECEXP (item, 0, k);
2064 continue;
2065 }
2066 /* Otherwise, we have a scalar or an expression that expands... */
2067
2068 if (MEM_P (item))
2069 {
2070 rtx base = XEXP (item, 0);
2071 if (MEM_ADDR_SPACE (item) == DEFAULT_ADDR_SPACE
2072 && REG_P (base))
2073 {
2074 /* We have a simple vector load. We can put the addresses in
2075 the vector, combine it with any other such MEMs, and load it
2076 all with a single gather at the end. */
2077 int64_t mask = ((0xffffffffffffffffUL
2078 >> (64-GET_MODE_NUNITS (mode)))
2079 << j);
2080 rtx exec = get_exec (mask);
2081 emit_insn (gen_subvNsi3
2082 (ramp, ramp,
2083 gcn_vec_constant (offsetmode, j*unit_size),
2084 ramp, exec));
2085 emit_insn (gen_addvNdi3_zext_dup2
2086 (addr, ramp, base,
2087 (mem_mask ? addr : gcn_gen_undef (addrmode)),
2088 exec));
2089 mem_mask |= mask;
2090 }
2091 else
2092 /* The MEM is non-trivial, so let's load it independently. */
2093 item = force_reg (mode, item);
2094 }
2095 else if (!CONST_INT_P (item) && !CONST_DOUBLE_P (item))
2096 /* The item may be a symbol_ref, or something else non-trivial. */
2097 item = force_reg (mode, item);
2098
2099 /* Duplicate the vector across each item.
2100 It is either a smaller vector register that needs shifting,
2101 or a MEM that needs loading. */
2102 val[j] = item;
2103 j += units;
2104 }
2105
2106 int64_t initialized_mask = 0;
2107 rtx prev = NULL;
2108
2109 if (mem_mask)
2110 {
2111 emit_insn (gen_gathervNm_expr
2112 (op0, gen_rtx_PLUS (addrmode, addr,
2113 gen_rtx_VEC_DUPLICATE (addrmode,
2114 const0_rtx)),
2115 GEN_INT (DEFAULT_ADDR_SPACE), GEN_INT (0),
2116 NULL, get_exec (mem_mask)));
2117 prev = op0;
2118 initialized_mask = mem_mask;
2119 }
2120
2121 if (simple_repeat && item_count > 1 && !prev)
2122 {
2123 /* Special case for instances of {A, B, A, B, A, B, ....}, etc. */
2124 rtx src = gen_rtx_SUBREG (mode, val[0], 0);
2125 rtx input_vf_mask = GEN_INT (GET_MODE_NUNITS (GET_MODE (val[0]))-1);
2126
2127 rtx permutation = gen_reg_rtx (VnMODE (vf, SImode));
2128 emit_insn (gen_vec_seriesvNsi (permutation, GEN_INT (0), GEN_INT (1)));
2129 rtx mask_dup = gen_reg_rtx (VnMODE (vf, SImode));
2130 emit_insn (gen_vec_duplicatevNsi (mask_dup, input_vf_mask));
2131 emit_insn (gen_andvNsi3 (permutation, permutation, mask_dup));
2132 emit_insn (gen_ashlvNsi3 (permutation, permutation, GEN_INT (2)));
2133 emit_insn (gen_ds_bpermutevNm (op0, permutation, src, get_exec (mode)));
2134 return;
2135 }
2136
2137 /* Write each value, elementwise, but coalesce matching values into one
2138 instruction, where possible. */
2139 for (int i = 0; i < vf; i++)
2140 if (!(initialized_mask & ((int64_t) 1 << i)))
2141 {
2142 if (gcn_constant_p (val[i]))
2143 emit_insn (gen_movvNm (op0, gcn_vec_constant (mode, val[i]), prev,
2144 get_exec (item_mask[i])));
2145 else if (VECTOR_MODE_P (GET_MODE (val[i]))
2146 && (GET_MODE_NUNITS (GET_MODE (val[i])) == vf
2147 || i == 0))
2148 emit_insn (gen_movvNm (op0, gen_rtx_SUBREG (mode, val[i], 0), prev,
2149 get_exec (item_mask[i])));
2150 else if (VECTOR_MODE_P (GET_MODE (val[i])))
2151 {
2152 rtx permutation = gen_reg_rtx (VnMODE (vf, SImode));
2153 emit_insn (gen_vec_seriesvNsi (permutation, GEN_INT (-i*4),
2154 GEN_INT (4)));
2155 rtx tmp = gen_reg_rtx (mode);
2156 emit_insn (gen_ds_bpermutevNm (tmp, permutation,
2157 gen_rtx_SUBREG (mode, val[i], 0),
2158 get_exec (-1)));
2159 emit_insn (gen_movvNm (op0, tmp, prev, get_exec (item_mask[i])));
2160 }
2161 else
2162 {
2163 rtx reg = force_reg (GET_MODE_INNER (mode), val[i]);
2164 emit_insn (gen_vec_duplicatevNm (op0, reg, prev,
2165 get_exec (item_mask[i])));
5326695a 2166 }
2167
2168 initialized_mask |= item_mask[i];
2169 prev = op0;
2170 }
2171}
2172
2173/* Load vector constant where n-th lane contains BASE+n*VAL. */
2174
2175static rtx
2176strided_constant (machine_mode mode, int base, int val)
2177{
2178 rtx x = gen_reg_rtx (mode);
2179 emit_move_insn (x, gcn_vec_constant (mode, base));
2180 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 32),
2181 x, get_exec (0xffffffff00000000)));
2182 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 16),
2183 x, get_exec (0xffff0000ffff0000)));
2184 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 8),
2185 x, get_exec (0xff00ff00ff00ff00)));
2186 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 4),
2187 x, get_exec (0xf0f0f0f0f0f0f0f0)));
2188 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 2),
2189 x, get_exec (0xcccccccccccccccc)));
2190 emit_insn (gen_addvNm3 (x, x, gcn_vec_constant (mode, val * 1),
2191 x, get_exec (0xaaaaaaaaaaaaaaaa)));
2192 return x;
2193}
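/* Worked example (illustrative): strided_constant (V64SImode, 0, 4)
   produces {0, 4, 8, ..., 252}. Each masked add above contributes
   val * 2^k to exactly those lanes whose lane number has bit k set, so
   lane n accumulates base + n * val using six vector adds instead of 64
   element insertions.  */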
2194
2195/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
2196
2197static rtx
2198gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
2199 addr_space_t as)
2200{
2201 switch (as)
2202 {
2203 case ADDR_SPACE_DEFAULT:
2204 return gcn_addr_space_legitimize_address (x, old, mode,
2205 DEFAULT_ADDR_SPACE);
2206 case ADDR_SPACE_SCALAR_FLAT:
2207 case ADDR_SPACE_SCRATCH:
2208 /* Instructions working on vectors need the address to be in
2209 a register. */
2210 if (vgpr_vector_mode_p (mode))
2211 return force_reg (GET_MODE (x), x);
2212
2213 return x;
2214 case ADDR_SPACE_FLAT:
2215 case ADDR_SPACE_FLAT_SCRATCH:
2216 case ADDR_SPACE_GLOBAL:
2217 return TARGET_GCN3 ? force_reg (DImode, x) : x;
2218 case ADDR_SPACE_LDS:
2219 case ADDR_SPACE_GDS:
2220 /* FIXME: LDS supports offsets; handle them! */
2221 if (vgpr_vector_mode_p (mode)
2222 && GET_MODE_INNER (GET_MODE (x)) != SImode)
2223 {
2224 machine_mode simode = VnMODE (GET_MODE_NUNITS (mode), SImode);
2225 rtx addrs = gen_reg_rtx (simode);
2226 rtx base = force_reg (SImode, x);
2227 rtx offsets = strided_constant (simode, 0,
2228 GET_MODE_UNIT_SIZE (mode));
2229
2230 emit_insn (gen_vec_duplicatevNsi (addrs, base));
2231 emit_insn (gen_addvNsi3 (addrs, offsets, addrs));
2232 return addrs;
2233 }
2234 return x;
2235 }
2236 gcc_unreachable ();
2237}
2238
2239/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:VnDI)) with the
2240 proper vector of stepped addresses.
2241
2242 MEM will be a DImode address of a vector in an SGPR.
2243 TMP will be a VnDImode VGPR pair or (scratch:VnDI). */
2244
2245rtx
2246gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
2247 rtx tmp)
2248{
2249 machine_mode pmode = VnMODE (GET_MODE_NUNITS (mode), DImode);
2250 machine_mode offmode = VnMODE (GET_MODE_NUNITS (mode), SImode);
2251 gcc_assert (MEM_P (mem));
2252 rtx mem_base = XEXP (mem, 0);
2253 rtx mem_index = NULL_RTX;
2254
2255 if (!TARGET_GCN5_PLUS)
2256 {
2257 /* gcn_addr_space_legitimize_address should have put the address in a
2258 register. If not, it is too late to do anything about it. */
2259 gcc_assert (REG_P (mem_base));
2260 }
2261
2262 if (GET_CODE (mem_base) == PLUS)
2263 {
2264 mem_index = XEXP (mem_base, 1);
2265 mem_base = XEXP (mem_base, 0);
2266 }
2267
2268 /* RF and RM base registers for vector modes should always be an SGPR. */
2269 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
2270 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
2271
2272 machine_mode inner = GET_MODE_INNER (mode);
2273 int shift = exact_log2 (GET_MODE_SIZE (inner));
2274 rtx ramp = gen_rtx_REG (offmode, VGPR_REGNO (1));
2275 rtx new_base = NULL_RTX;
2276 addr_space_t as = MEM_ADDR_SPACE (mem);
2277
2278 rtx tmplo = (REG_P (tmp)
2279 ? gcn_operand_part (pmode, tmp, 0)
2280 : gen_reg_rtx (offmode));
2281
2282 /* tmplo[:] = ramp[:] << shift */
2283 emit_insn (gen_ashlvNsi3 (tmplo, ramp,
2284 gen_int_mode (shift, SImode),
2285 NULL, exec));
2286
2287 if (AS_FLAT_P (as))
2288 {
2289 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
2290
2291 if (REG_P (tmp))
2292 {
2293 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
2294 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
2295 rtx tmphi = gcn_operand_part (pmode, tmp, 1);
2296
2297 /* tmphi[:] = mem_base_hi */
2298 emit_insn (gen_vec_duplicatevNsi (tmphi, mem_base_hi, NULL, exec));
2299
2300 /* tmp[:] += zext (mem_base) */
2301 if (exec)
2302 {
2303 emit_insn (gen_addvNsi3_vcc_dup (tmplo, mem_base_lo, tmplo,
2304 vcc, NULL, exec));
2305 emit_insn (gen_addcvNsi3 (tmphi, tmphi, const0_rtx,
2306 vcc, vcc, NULL, exec));
2307 }
2308 else
2309 emit_insn (gen_addvNdi3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
2310 }
2311 else
2312 {
2313 tmp = gen_reg_rtx (pmode);
2314 emit_insn (gen_addvNdi3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc,
2315 NULL, exec));
2316 }
2317
2318 new_base = tmp;
2319 }
2320 else if (AS_ANY_DS_P (as))
2321 {
2322 emit_insn (gen_addvNsi3_dup (tmplo, tmplo, mem_base, NULL, exec));
2323 new_base = tmplo;
2324 }
2325 else
2326 {
2327 mem_base = gen_rtx_VEC_DUPLICATE (pmode, mem_base);
2328 new_base = gen_rtx_PLUS (pmode, mem_base,
2329 gen_rtx_SIGN_EXTEND (pmode, tmplo));
2330 }
2331
2332 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
2333 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
2334 (mem_index ? mem_index
2335 : const0_rtx)));
2336}
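/* For instance (illustrative), a V64SImode access through
   (mem:V64SI (reg:DI Rbase)) with Rbase in an SGPR becomes the per-lane
   address vector

     addr[n] = Rbase + (n << 2)   // lane counter n comes from VGPR1

   where, for FLAT space, the 64-bit add is split into lo/hi halves with
   an explicit VCC carry, as emitted above.  */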
2337
2338/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
2339 suitable for the given address space. This is intended for use in
2340 gather/scatter patterns.
2341
2342 The offsets may be signed or unsigned, according to UNSIGNED_P.
2343 If EXEC is set then _exec patterns will be used, otherwise plain.
2344
2345 Return values:
2346 ADDR_SPACE_FLAT - return VnDImode vector of absolute addresses.
2347 ADDR_SPACE_GLOBAL - return VnSImode vector of offsets. */
2348
2349rtx
2350gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
2351 bool unsigned_p, rtx exec)
2352{
2353 int vf = GET_MODE_NUNITS (GET_MODE (offsets));
2354 rtx tmpsi = gen_reg_rtx (VnMODE (vf, SImode));
2355 rtx tmpdi = gen_reg_rtx (VnMODE (vf, DImode));
2356
2357 if (CONST_INT_P (scale)
2358 && INTVAL (scale) > 0
2359 && exact_log2 (INTVAL (scale)) >= 0)
2360 emit_insn (gen_ashlvNsi3 (tmpsi, offsets,
2361 GEN_INT (exact_log2 (INTVAL (scale))),
2362 NULL, exec));
2363 else
2364 emit_insn (gen_mulvNsi3_dup (tmpsi, offsets, scale, NULL, exec));
2365
2366 /* "Global" instructions do not support negative register offsets. */
2367 if (as == ADDR_SPACE_FLAT || !unsigned_p)
2368 {
2369 if (unsigned_p)
2370 emit_insn (gen_addvNdi3_zext_dup2 (tmpdi, tmpsi, base, NULL, exec));
2371 else
2372 emit_insn (gen_addvNdi3_sext_dup2 (tmpdi, tmpsi, base, NULL, exec));
2373 return tmpdi;
2374 }
2375 else if (as == ADDR_SPACE_GLOBAL)
2376 return tmpsi;
2377
2378 gcc_unreachable ();
2379}
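/* Example (illustrative): gathering 32-bit elements with offsets
   {0, 1, 2, ...} and scale 4 shifts the offsets to byte offsets
   {0, 4, 8, ...}; ADDR_SPACE_GLOBAL then consumes these VnSImode offsets
   directly (when unsigned), while ADDR_SPACE_FLAT widens them (zero- or
   sign-extended per UNSIGNED_P) and adds a broadcast of BASE to give
   absolute VnDImode addresses.  */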
2380
2381/* Return true if a move from OP0 to OP1 is known to be executed in the
2382 vector unit. */
2383
2384bool
2385gcn_vgpr_move_p (rtx op0, rtx op1)
2386{
2387 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
2388 return true;
2389 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
2390 return true;
2391 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
2392 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
2393 || vgpr_vector_mode_p (GET_MODE (op0)));
2394}
2395
2396/* Return true if a move from OP0 to OP1 is known to be executed in the
2397 scalar unit. Used in the machine description. */
2398
2399bool
2400gcn_sgpr_move_p (rtx op0, rtx op1)
2401{
2402 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
2403 return true;
2404 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
2405 return true;
2406 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
2407 || VGPR_REGNO_P (REGNO (op0)))
2408 return false;
2409 if (REG_P (op1)
2410 && REGNO (op1) < FIRST_PSEUDO_REGISTER
2411 && !VGPR_REGNO_P (REGNO (op1)))
2412 return true;
2413 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
2414}
2415
2416/* Implement TARGET_SECONDARY_RELOAD.
2417
2418 The address space determines which registers can be used for loads and
2419 stores. */
2420
2421static reg_class_t
2422gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2423 machine_mode reload_mode, secondary_reload_info *sri)
2424{
2425 reg_class_t result = NO_REGS;
2426 bool spilled_pseudo =
2427 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2428
2429 if (dump_file && (dump_flags & TDF_DETAILS))
2430 {
2431 fprintf (dump_file, "gcn_secondary_reload: ");
2432 dump_value_slim (dump_file, x, 1);
2433 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2434 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2435 if (REG_P (x) || GET_CODE (x) == SUBREG)
2436 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2437 (true_regnum (x) >= 0
2438 && true_regnum (x) < FIRST_PSEUDO_REGISTER
2439 ? reg_names[true_regnum (x)]
2440 : (spilled_pseudo ? "stack spill" : "??")));
2441 fprintf (dump_file, "\n");
2442 }
2443
2444 /* Some callers don't use or initialize icode. */
2445 sri->icode = CODE_FOR_nothing;
2446
2447 if (MEM_P (x) || spilled_pseudo)
2448 {
2449 addr_space_t as = DEFAULT_ADDR_SPACE;
2450
2451 /* If we have a spilled pseudo, we can't find the address space
2452 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2453 ADDR_SPACE_GLOBAL for GCN5. */
2454 if (MEM_P (x))
2455 as = MEM_ADDR_SPACE (x);
2456
2457 if (as == ADDR_SPACE_DEFAULT)
2458 as = DEFAULT_ADDR_SPACE;
2459
2460 switch (as)
2461 {
2462 case ADDR_SPACE_SCALAR_FLAT:
2463 result =
2464 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2465 break;
2466 case ADDR_SPACE_FLAT:
2467 case ADDR_SPACE_FLAT_SCRATCH:
2468 case ADDR_SPACE_GLOBAL:
2469 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2470 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2471 {
2472 if (in_p)
2473 sri->icode = get_code_for_reload_in (reload_mode);
2474 else
2475 sri->icode = get_code_for_reload_out (reload_mode);
2476 break;
2477 }
2478 /* Fallthrough. */
2479 case ADDR_SPACE_LDS:
2480 case ADDR_SPACE_GDS:
2481 case ADDR_SPACE_SCRATCH:
2482 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2483 break;
2484 }
2485 }
2486
2487 if (dump_file && (dump_flags & TDF_DETAILS))
2488 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2489 get_insn_name (sri->icode));
2490
2491 return result;
2492}
2493
2494/* Update register usage after having seen the compiler flags and kernel
2495 attributes. We typically want to fix registers that contain values
2496 set by the HSA runtime. */
2497
2498static void
2499gcn_conditional_register_usage (void)
2500{
2501 if (!cfun || !cfun->machine)
2502 return;
2503
2504 if (cfun->machine->normal_function)
2505 {
2506 /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
2507 for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
2508 i <= LAST_SGPR_REG; i++)
2509 fixed_regs[i] = 1, call_used_regs[i] = 1;
2510
2511 for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2512 i <= LAST_VGPR_REG; i++)
2513 fixed_regs[i] = 1, call_used_regs[i] = 1;
2514
2515 return;
2516 }
2517
2518 /* If the set of requested args is the default set, nothing more needs to
2519 be done. */
2520 if (cfun->machine->args.requested == default_requested_args)
2521 return;
2522
2523 /* Requesting a set of args different from the default violates the ABI. */
2524 if (!leaf_function_p ())
2525 warning (0, "A non-default set of initial values has been requested, "
2526 "which violates the ABI");
2527
2528 for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2529 fixed_regs[i] = 0;
2530
2531 /* Fix the runtime argument register containing values that may be
2532 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2533 needed after the prologue so there's no need to fix them. */
2534 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2535 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2536 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2537 {
2538 /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2539 the containing registers to be used for other purposes. */
2540 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2541 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2542 }
2543 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2544 {
2545 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2546 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2547 }
2548 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2549 {
2550 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2551 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2552 }
2553 if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
2554 {
2555 fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG]] = 1;
2556 fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG] + 1] = 1;
2557 }
2558 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2559 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2560 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2561 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2562 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2563 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2564 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2565 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2566}
2567
2568/* Determine if a load or store is valid, according to the register classes
2569 and address space. Used primarily by the machine description to decide
2570 when to split a move into two steps. */
2571
2572bool
2573gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2574{
2575 if (!MEM_P (dest) && !MEM_P (src))
2576 return true;
2577
2578 if (MEM_P (dest)
2579 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2580 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2581 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2582 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2583 && gcn_vgpr_register_operand (src, mode))
2584 return true;
2585 else if (MEM_P (src)
2586 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2587 && (gcn_flat_address_p (XEXP (src, 0), mode)
2588 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2589 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2590 && gcn_vgpr_register_operand (dest, mode))
2591 return true;
2592
2593 if (MEM_P (dest)
2594 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2595 && (gcn_global_address_p (XEXP (dest, 0))
2596 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2597 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2598 && gcn_vgpr_register_operand (src, mode))
2599 return true;
2600 else if (MEM_P (src)
2601 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2602 && (gcn_global_address_p (XEXP (src, 0))
2603 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2604 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2605 && gcn_vgpr_register_operand (dest, mode))
2606 return true;
2607
2608 if (MEM_P (dest)
2609 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2610 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2611 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2612 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2613 && gcn_ssrc_register_operand (src, mode))
2614 return true;
2615 else if (MEM_P (src)
2616 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2617 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2618 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2619 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2620 && gcn_sdst_register_operand (dest, mode))
2621 return true;
2622
2623 if (MEM_P (dest)
2624 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2625 && gcn_ds_address_p (XEXP (dest, 0))
2626 && gcn_vgpr_register_operand (src, mode))
2627 return true;
2628 else if (MEM_P (src)
2629 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2630 && gcn_ds_address_p (XEXP (src, 0))
2631 && gcn_vgpr_register_operand (dest, mode))
2632 return true;
2633
2634 return false;
2635}
2636
2637/* }}} */
2638/* {{{ Functions and ABI. */
2639
2640/* Implement TARGET_FUNCTION_VALUE.
2641
2642 Define how to find the value returned by a function.
2643 The register location is always the same, but the mode depends on
2644 VALTYPE. */
2645
2646static rtx
2647gcn_function_value (const_tree valtype, const_tree, bool)
2648{
2649 machine_mode mode = TYPE_MODE (valtype);
2650
2651 if (INTEGRAL_TYPE_P (valtype)
2652 && GET_MODE_CLASS (mode) == MODE_INT
2653 && GET_MODE_SIZE (mode) < 4)
2654 mode = SImode;
2655
2656 return gen_rtx_REG (mode, RETURN_VALUE_REG);
2657}
2658
2659/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2660
2661 Return true if N is a possible register number for the function return
2662 value. */
2663
2664static bool
2665gcn_function_value_regno_p (const unsigned int n)
2666{
2667 return n == RETURN_VALUE_REG;
2668}
2669
2670/* Calculate the number of registers required to hold function argument
2671 ARG. */
2672
2673static int
0ffef200 2674num_arg_regs (const function_arg_info &arg)
5326695a 2675{
2676 if (targetm.calls.must_pass_in_stack (arg))
2677 return 0;
2678
2679 int size = arg.promoted_size_in_bytes ();
2680 int regsize = UNITS_PER_WORD * (VECTOR_MODE_P (arg.mode)
2681 ? GET_MODE_NUNITS (arg.mode) : 1);
2682 return (size + regsize - 1) / regsize;
2683}
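/* Worked example (illustrative, with UNITS_PER_WORD == 4): an 8-byte
   scalar argument needs (8 + 4 - 1) / 4 = 2 registers, while a V64SImode
   argument has regsize = 4 * 64 = 256 and size 256, so it occupies
   exactly one vector register.  */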
2684
2685/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2686
2687 Return true if the location where a function argument is passed
2688 depends on whether or not it is a named argument.
2689
2690 For gcn, we know how to handle functions declared as stdarg: by
2691 passing an extra pointer to the unnamed arguments. However, the
2692 Fortran frontend can produce a different situation, where a
2693 function pointer is declared with no arguments, but the actual
2694 function and calls to it take more arguments. In that case, we
2695 want to ensure the call matches the definition of the function. */
2696
2697static bool
2698gcn_strict_argument_naming (cumulative_args_t cum_v)
2699{
2700 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2701
2702 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2703}
2704
2705/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2706
2707 See comment on gcn_strict_argument_naming. */
2708
2709static bool
2710gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2711{
2712 return !gcn_strict_argument_naming (cum_v);
2713}
2714
2715/* Implement TARGET_FUNCTION_ARG.
2716
2717 Return an RTX indicating whether a function argument is passed in a register
2718 and if so, which register. */
2719
2720static rtx
6783fdb7 2721gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2722{
2723 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2724 if (cum->normal_function)
2725 {
2726 if (!arg.named || arg.end_marker_p ())
2727 return 0;
2728
2729 if (targetm.calls.must_pass_in_stack (arg))
2730 return 0;
2731
2732 int first_reg = (VECTOR_MODE_P (arg.mode)
2733 ? FIRST_VPARM_REG : FIRST_PARM_REG);
2734 int cum_num = (VECTOR_MODE_P (arg.mode)
2735 ? cum->vnum : cum->num);
2736 int reg_num = first_reg + cum_num;
2737 int num_regs = num_arg_regs (arg);
2738 if (num_regs > 0)
2739 while (reg_num % num_regs != 0)
2740 reg_num++;
2741 if (reg_num + num_regs <= first_reg + NUM_PARM_REGS)
2742 return gen_rtx_REG (arg.mode, reg_num);
2743 }
2744 else
2745 {
2746 if (cum->num >= cum->args.nargs)
2747 {
2748 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2749 & -(TYPE_ALIGN (arg.type) / 8);
2750 cfun->machine->kernarg_segment_alignment
2751 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2752 TYPE_ALIGN (arg.type) / 8);
2753 rtx addr = gen_rtx_REG (DImode,
2754 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2755 if (cum->offset)
2756 addr = gen_rtx_PLUS (DImode, addr,
2757 gen_int_mode (cum->offset, DImode));
2758 rtx mem = gen_rtx_MEM (arg.mode, addr);
2759 set_mem_attributes (mem, arg.type, 1);
2760 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2761 MEM_READONLY_P (mem) = 1;
2762 return mem;
2763 }
2764
2765 int a = cum->args.order[cum->num];
2766 if (arg.mode != gcn_kernel_arg_types[a].mode)
2767 {
2768 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2769 return 0;
2770 }
2771 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2772 cum->args.reg[a]);
2773 }
2774 return 0;
2775}
2776
2777/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2778
2779 Updates the summarizer variable pointed to by CUM_V to advance past an
2780 argument in the argument list. */
2781
2782static void
2783gcn_function_arg_advance (cumulative_args_t cum_v,
2784 const function_arg_info &arg)
2785{
2786 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2787
2788 if (cum->normal_function)
2789 {
2790 if (!arg.named)
2791 return;
2792
2793 int first_reg = (VECTOR_MODE_P (arg.mode)
2794 ? FIRST_VPARM_REG : FIRST_PARM_REG);
2795 int *cum_num = (VECTOR_MODE_P (arg.mode)
2796 ? &cum->vnum : &cum->num);
2797 int num_regs = num_arg_regs (arg);
2798 if (num_regs > 0)
2799 while ((first_reg + *cum_num) % num_regs != 0)
2800 (*cum_num)++;
2801 *cum_num += num_regs;
2802 }
2803 else
2804 {
2805 if (cum->num < cum->args.nargs)
2806 cum->num++;
2807 else
2808 {
2809 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2810 cfun->machine->kernarg_segment_byte_size = cum->offset;
2811 }
2812 }
2813}
2814
2815/* Implement TARGET_ARG_PARTIAL_BYTES.
2816
2817 Returns the number of bytes at the beginning of an argument that must be put
2818 in registers. The value must be zero for arguments that are passed entirely
2819 in registers or that are entirely pushed on the stack. */
2820
2821static int
a7c81bc1 2822gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2823{
2824 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2825
2826 if (!arg.named)
2827 return 0;
2828
2829 if (targetm.calls.must_pass_in_stack (arg))
2830 return 0;
2831
2832 int cum_num = (VECTOR_MODE_P (arg.mode) ? cum->vnum : cum->num);
2833 int regsize = UNITS_PER_WORD * (VECTOR_MODE_P (arg.mode)
2834 ? GET_MODE_NUNITS (arg.mode) : 1);
2835
2836 if (cum_num >= NUM_PARM_REGS)
2837 return 0;
2838
2839 /* If the argument fits entirely in registers, return 0. */
2840 if (cum_num + num_arg_regs (arg) <= NUM_PARM_REGS)
2841 return 0;
2842
2843 return (NUM_PARM_REGS - cum_num) * regsize;
2844}
2845
2846/* A normal function which takes a pointer argument may be passed a pointer to
2847 LDS space (via a high-bits-set aperture), and that only works with FLAT
2848 addressing, not GLOBAL. Force FLAT addressing if the function has an
2849 incoming pointer parameter. NOTE: This is a heuristic that works in the
2850 offloading case, but in general, a function might read global pointer
2851 variables, etc. that may refer to LDS space or other special memory areas
2852 not supported by GLOBAL instructions, and then this argument check would not
2853 suffice. */
2854
2855static void
2856gcn_detect_incoming_pointer_arg (tree fndecl)
2857{
2858 gcc_assert (cfun && cfun->machine);
2859
2860 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2861 arg;
2862 arg = TREE_CHAIN (arg))
2863 if (POINTER_TYPE_P (TREE_VALUE (arg)))
2864 cfun->machine->use_flat_addressing = true;
2865}
2866
2867/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2868
2869 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2870 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2871
2872void
2873gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2874 tree fntype /* tree ptr for function decl */ ,
2875 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2876 tree fndecl, int caller)
2877{
2878 memset (cum, 0, sizeof (*cum));
2879 cum->fntype = fntype;
2880 if (libname)
2881 {
2882 gcc_assert (cfun && cfun->machine);
2883 cum->normal_function = true;
2884 if (!caller)
2885 {
2886 cfun->machine->normal_function = true;
2887 gcn_detect_incoming_pointer_arg (fndecl);
2888 }
2889 return;
2890 }
2891 tree attr = NULL;
2892 if (fndecl)
2893 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2894 if (fndecl && !attr)
2895 attr = lookup_attribute ("amdgpu_hsa_kernel",
2896 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2897 if (!attr && fntype)
2898 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2899 /* Handle main () as a kernel, so we can run the testsuite.
2900 Handle OpenACC kernels similarly to main. */
2901 if (!attr && !caller && fndecl
2902 && (MAIN_NAME_P (DECL_NAME (fndecl))
2903 || lookup_attribute ("omp target entrypoint",
2904 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2905 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2906 else
2907 {
2908 if (!attr || caller)
2909 {
2910 gcc_assert (cfun && cfun->machine);
2911 cum->normal_function = true;
2912 if (!caller)
2913 cfun->machine->normal_function = true;
2914 }
2915 gcn_parse_amdgpu_hsa_kernel_attribute
2916 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2917 }
2918 cfun->machine->args = cum->args;
2919 if (!caller && cfun->machine->normal_function)
2920 gcn_detect_incoming_pointer_arg (fndecl);
2921
2922 reinit_regs ();
2923}
2924
2925static bool
2926gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2927{
2928 machine_mode mode = TYPE_MODE (type);
2929 HOST_WIDE_INT size = int_size_in_bytes (type);
2930
2931 if (AGGREGATE_TYPE_P (type))
2932 return true;
2933
2934 if (mode == BLKmode)
2935 return true;
2936
2937 if ((!VECTOR_TYPE_P (type) && size > 2 * UNITS_PER_WORD)
2938 || size > 2 * UNITS_PER_WORD * 64)
2939 return true;
2940
2941 return false;
2942}
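/* For example (illustrative): any aggregate is returned in memory; an
   8-byte scalar fits 2 * UNITS_PER_WORD and is returned in
   RETURN_VALUE_REG; only vector types may use the larger
   2 * UNITS_PER_WORD * 64 limit.  */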
2943
2944/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2945
2946 Return the mode to use for outgoing function arguments. */
2947
2948machine_mode
2949gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2950 int *ARG_UNUSED (punsignedp),
2951 const_tree ARG_UNUSED (funtype),
2952 int ARG_UNUSED (for_return))
2953{
2954 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2955 return SImode;
2956
2957 return mode;
2958}
2959
2960/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2961
2962 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2963 ARGS_GROW_DOWNWARDS. */
2964
2965static tree
2966gcn_gimplify_va_arg_expr (tree valist, tree type,
2967 gimple_seq *ARG_UNUSED (pre_p),
2968 gimple_seq *ARG_UNUSED (post_p))
2969{
2970 tree ptr = build_pointer_type (type);
2971 tree valist_type;
2972 tree t, u;
2973 bool indirect;
2974
2975 indirect = pass_va_arg_by_reference (type);
2976 if (indirect)
2977 {
2978 type = ptr;
2979 ptr = build_pointer_type (type);
2980 }
2981 valist_type = TREE_TYPE (valist);
2982
2983 /* Args grow down. Not handled by generic routines. */
2984
2985 u = fold_convert (sizetype, size_in_bytes (type));
2986 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2987 t = fold_build_pointer_plus (valist, u);
2988
2989 /* Align to 8 byte boundary. */
2990
2991 u = build_int_cst (TREE_TYPE (t), -8);
2992 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2993 t = fold_convert (valist_type, t);
2994
2995 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2996
2997 t = fold_convert (ptr, t);
2998 t = build_va_arg_indirect_ref (t);
2999
3000 if (indirect)
3001 t = build_va_arg_indirect_ref (t);
3002
3003 return t;
3004}
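/* Sketch of the pointer arithmetic built above (args grow downwards;
   illustrative only):

     valist = (valist - sizeof (type)) & -8;  // step down, 8-byte align
     result = *(type *) valist;               // one more deref if indirect

   which corresponds to the NEGATE_EXPR, BIT_AND_EXPR and
   build_va_arg_indirect_ref trees above.  */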
3005
3006/* Return 1 if TRAIT NAME is present in the OpenMP context's
3007 device trait set, return 0 if not present in any OpenMP context in the
3008 whole translation unit, or -1 if not present in the current OpenMP context
3009 but might be present in another OpenMP context in the same TU. */
3010
3011int
3012gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
3013 const char *name)
3014{
3015 switch (trait)
3016 {
3017 case omp_device_kind:
3018 return strcmp (name, "gpu") == 0;
3019 case omp_device_arch:
3020 return strcmp (name, "amdgcn") == 0 || strcmp (name, "gcn") == 0;
3021 case omp_device_isa:
3022 if (strcmp (name, "fiji") == 0 || strcmp (name, "gfx803") == 0)
3023 return gcn_arch == PROCESSOR_FIJI;
3024 if (strcmp (name, "gfx900") == 0)
3025 return gcn_arch == PROCESSOR_VEGA10;
3026 if (strcmp (name, "gfx906") == 0)
3027 return gcn_arch == PROCESSOR_VEGA20;
3028 if (strcmp (name, "gfx908") == 0)
3029 return gcn_arch == PROCESSOR_GFX908;
3030 if (strcmp (name, "gfx90a") == 0)
3031 return gcn_arch == PROCESSOR_GFX90a;
3032 return 0;
3033 default:
3034 gcc_unreachable ();
3035 }
3036}
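/* This backs OpenMP context selectors; e.g. a generic usage example
   (the function name is hypothetical, not defined in this file):

     #pragma omp declare variant (gfx906_fn) \
             match (device={isa("gfx906")})

   would resolve here to 1 only when compiling for PROCESSOR_VEGA20.  */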
3037
3038/* Calculate stack offsets needed to create prologues and epilogues. */
3039
3040static struct machine_function *
3041gcn_compute_frame_offsets (void)
3042{
3043 machine_function *offsets = cfun->machine;
3044
3045 if (reload_completed)
3046 return offsets;
3047
3048 offsets->need_frame_pointer = frame_pointer_needed;
3049
3050 offsets->outgoing_args_size = crtl->outgoing_args_size;
3051 offsets->pretend_size = crtl->args.pretend_args_size;
3052
3053 offsets->local_vars = get_frame_size ();
3054
3055 offsets->lr_needs_saving = (!leaf_function_p ()
3056 || df_regs_ever_live_p (LR_REGNUM)
3057 || df_regs_ever_live_p (LR_REGNUM + 1));
3058
3059 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
3060
3061 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3062 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
3063 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
3064 && frame_pointer_needed))
3065 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
3066
3067 /* Round up to 64-bit boundary to maintain stack alignment. */
3068 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
3069
3070 return offsets;
3071}
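/* Example of the accounting above (illustrative): a function that must
   save LR (8 bytes) and two SGPRs (4 bytes each) gets callee_saves = 16
   after rounding up to the 64-bit boundary, and each live callee-saved
   VGPR adds 256 bytes (64 lanes x 4 bytes).  */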
3072
3073/* Insert code into the prologue or epilogue to store or load any
3074 callee-save register to/from the stack.
3075
3076 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
3077
3078static void
3079move_callee_saved_registers (rtx sp, machine_function *offsets,
3080 bool prologue)
3081{
3082 int regno, offset, saved_scalars;
3083 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
3084 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
3085 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
3086 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
3087 HOST_WIDE_INT exec_set = 0;
3088 int offreg_set = 0;
3089 auto_vec<int> saved_sgprs;
3090
3091 start_sequence ();
3092
3093 /* Move scalars into two vector registers. */
3094 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
3095 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
3096 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
3097 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
3098 && offsets->need_frame_pointer))
3099 {
3100 rtx reg = gen_rtx_REG (SImode, regno);
3101 rtx vreg = gen_rtx_REG (V64SImode,
3102 VGPR_REGNO (6 + (saved_scalars / 64)));
3103 int lane = saved_scalars % 64;
3104
3105 if (prologue)
3106 {
3107 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
3108 saved_sgprs.safe_push (regno);
3109 }
3110 else
3111 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
3112
3113 saved_scalars++;
3114 }
3115
3116 rtx move_scalars = get_insns ();
3117 end_sequence ();
3118 start_sequence ();
3119
3120 /* Ensure that all vector lanes are moved. */
3121 exec_set = -1;
3122 emit_move_insn (exec, GEN_INT (exec_set));
3123
3124 /* Set up a vector stack pointer. */
3125 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
3126 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
3127 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
3128 gcn_gen_undef (V64SImode), exec));
3129 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
3130 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
3131 exec));
3132 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
3133 gcn_operand_part (V64SImode, vsp, 0),
3134 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
3135 exec));
3136 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
3137 gcn_operand_part (V64SImode, vsp, 1),
3138 const0_rtx, vcc, vcc,
3139 gcn_gen_undef (V64SImode), exec));
3140
3141 /* Move vectors. */
3142 for (regno = FIRST_VGPR_REG, offset = 0;
3143 regno < FIRST_PSEUDO_REGISTER; regno++)
3144 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
3145 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
3146 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
3147 {
3148 rtx reg = gen_rtx_REG (V64SImode, regno);
3149 int size = 256;
3150
3151 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
3152 size = saved_scalars * 4;
3153 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
3154 size = (saved_scalars - 64) * 4;
3155
3156 if (size != 256 || exec_set != -1)
3157 {
3158 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
3159 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
3160 }
3161
3162 if (prologue)
251697a6
HAQ
3163 {
3164 rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
3165 (vsp, const0_rtx, reg, as, const0_rtx,
3166 exec));
3167
3168 /* Add CFI metadata. */
3169 rtx note;
3170 if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
3171 {
3172 int start = (regno == VGPR_REGNO (7) ? 64 : 0);
3173 int count = MIN (saved_scalars - start, 64);
3174 int add_lr = (regno == VGPR_REGNO (6)
3175 && offsets->lr_needs_saving);
3176 int lrdest = -1;
3177 rtvec seq = rtvec_alloc (count + add_lr);
3178
3179 /* Add an REG_FRAME_RELATED_EXPR entry for each scalar
3180 register that was saved in this batch. */
3181 for (int idx = 0; idx < count; idx++)
3182 {
3183 int stackaddr = offset + idx * 4;
3184 rtx dest = gen_rtx_MEM (SImode,
3185 gen_rtx_PLUS
3186 (DImode, sp,
3187 GEN_INT (stackaddr)));
3188 rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
3189 rtx set = gen_rtx_SET (dest, src);
3190 RTX_FRAME_RELATED_P (set) = 1;
3191 RTVEC_ELT (seq, idx) = set;
3192
3193 if (saved_sgprs[start + idx] == LINK_REGNUM)
3194 lrdest = stackaddr;
3195 }
3196
3197 /* Add an additional expression for DWARF_LINK_REGISTER if
3198 LINK_REGNUM was saved. */
3199 if (lrdest != -1)
3200 {
3201 rtx dest = gen_rtx_MEM (DImode,
3202 gen_rtx_PLUS
3203 (DImode, sp,
3204 GEN_INT (lrdest)));
3205 rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
3206 rtx set = gen_rtx_SET (dest, src);
3207 RTX_FRAME_RELATED_P (set) = 1;
3208 RTVEC_ELT (seq, count) = set;
3209 }
3210
3211 note = gen_rtx_SEQUENCE (VOIDmode, seq);
3212 }
3213 else
3214 {
3215 rtx dest = gen_rtx_MEM (V64SImode,
3216 gen_rtx_PLUS (DImode, sp,
3217 GEN_INT (offset)));
3218 rtx src = gen_rtx_REG (V64SImode, regno);
3219 note = gen_rtx_SET (dest, src);
3220 }
3221 RTX_FRAME_RELATED_P (insn) = 1;
3222 add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
3223 }
3224 else
3225 emit_insn (gen_gatherv64si_insn_1offset_exec
3226 (reg, vsp, const0_rtx, as, const0_rtx,
3227 gcn_gen_undef (V64SImode), exec));
3228
3229 /* Move our VSP to the next stack entry. */
3230 if (offreg_set != size)
3231 {
3232 offreg_set = size;
3233 emit_move_insn (offreg, GEN_INT (size));
3234 }
3235 if (exec_set != -1)
3236 {
3237 exec_set = -1;
3238 emit_move_insn (exec, GEN_INT (exec_set));
3239 }
3240 emit_insn (gen_addv64si3_vcc_dup_exec
3241 (gcn_operand_part (V64SImode, vsp, 0),
3242 offreg, gcn_operand_part (V64SImode, vsp, 0),
3243 vcc, gcn_gen_undef (V64SImode), exec));
3244 emit_insn (gen_addcv64si3_exec
3245 (gcn_operand_part (V64SImode, vsp, 1),
3246 gcn_operand_part (V64SImode, vsp, 1),
3247 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
3248
3249 offset += size;
3250 }
3251
3252 rtx move_vectors = get_insns ();
3253 end_sequence ();
3254
3255 if (prologue)
3256 {
3257 emit_insn (move_scalars);
3258 emit_insn (move_vectors);
3259 }
3260 else
3261 {
3262 emit_insn (move_vectors);
3263 emit_insn (move_scalars);
3264 }
3265
3266 /* This happens when a new register becomes "live" after reload.
3267 Check your splitters! */
3268 gcc_assert (offset <= offsets->callee_saves);
3269}
3270
3271/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
3272
3273 For a non-kernel function, the stack layout looks like this (interim),
3274 growing *upwards*:
3275
3276 hi | + ...
3277 |__________________| <-- current SP
3278 | outgoing args |
3279 |__________________|
3280 | (alloca space) |
3281 |__________________|
3282 | local vars |
3283 |__________________| <-- FP/hard FP
3284 | callee-save regs |
3285 |__________________| <-- soft arg pointer
3286 | pretend args |
3287 |__________________| <-- incoming SP
3288 | incoming args |
3289 lo |..................|
3290
3291 This implies arguments (beyond the first N in registers) must grow
3292 downwards (as, apparently, PA has them do).
3293
3294 For a kernel function we have the simpler:
3295
3296 hi | + ...
3297 |__________________| <-- current SP
3298 | outgoing args |
3299 |__________________|
3300 | (alloca space) |
3301 |__________________|
3302 | local vars |
3303 lo |__________________| <-- FP/hard FP
3304
3305*/
3306
3307void
3308gcn_expand_prologue ()
3309{
3310 machine_function *offsets = gcn_compute_frame_offsets ();
3311
3312 if (!cfun || !cfun->machine || cfun->machine->normal_function)
3313 {
3314 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3315 rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
3316 rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
3317 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3318 rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
3319 rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
3320
3321 start_sequence ();
3322
3323 if (offsets->pretend_size > 0)
3324 {
3325 /* FIXME: Do the actual saving of register pretend args to the stack.
3326 Register order needs consideration. */
3327 }
3328
3329 /* Save callee-save regs. */
3330 move_callee_saved_registers (sp, offsets, true);
3331
3332 HOST_WIDE_INT sp_adjust = offsets->pretend_size
3333 + offsets->callee_saves
3334 + offsets->local_vars + offsets->outgoing_args_size;
3335 if (sp_adjust > 0)
3336 {
3337 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
3338 we use the split add explicitly, and specify the DImode add in
3339 the note. */
3340 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3341 rtx adjustment = gen_int_mode (sp_adjust, SImode);
3342 rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
3343 adjustment, scc));
3344 if (!offsets->need_frame_pointer)
3345 {
3346 RTX_FRAME_RELATED_P (insn) = 1;
3347 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3348 gen_rtx_SET (sp,
3349 gen_rtx_PLUS (DImode, sp,
3350 adjustment)));
3351 }
3352 emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
3353 }
3354
3355 if (offsets->need_frame_pointer)
3356 {
3357 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
3358 we use the split add explicitly, and specify the DImode add in
3359 the note. */
3360 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3361 int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
3362 rtx adjustment = gen_int_mode (fp_adjust, SImode);
3363 rtx insn = emit_insn (gen_addsi3_scalar_carry (fp_lo, sp_lo,
3364 adjustment, scc));
3365 emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
3366 (fp_adjust < 0 ? GEN_INT (-1)
3367 : const0_rtx),
3368 scc, scc));
3369
3370 /* Set the CFA to the entry stack address, as an offset from the
3371 frame pointer. This is preferred because the frame pointer is
3372 saved in each frame, whereas the stack pointer is not. */
3373 RTX_FRAME_RELATED_P (insn) = 1;
3374 add_reg_note (insn, REG_CFA_DEF_CFA,
3375 gen_rtx_PLUS (DImode, fp,
3376 GEN_INT (-(offsets->pretend_size
3377 + offsets->callee_saves))));
3378 }
3379
3380 rtx_insn *seq = get_insns ();
3381 end_sequence ();
3382
3383 emit_insn (seq);
3384 }
3385 else
3386 {
3387 if (TARGET_PACKED_WORK_ITEMS)
3388 {
3389 /* v0 contains the X, Y and Z dimensions all in one.
3390 Expand them out for ABI compatibility. */
3391 /* TODO: implement and use zero_extract. */
3392 rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
3393 emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
3394 gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
3395 emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
3396 emit_insn (gen_prologue_use (v1));
3397
3398 rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
3399 emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
3400 gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
3401 emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
3402 emit_insn (gen_prologue_use (v2));
3403 }
3404
3405 /* We no longer use the private segment for the stack (it's not
3406 accessible to reverse offload), so we must calculate a wave offset
3407 from the grid dimensions and stack size, which is calculated on the
3408 host, and passed in the kernargs region.
3409 See libgomp-gcn.h for details. */
3410 rtx wave_offset = gen_rtx_REG (SImode, FIRST_PARM_REG);
3411
3412 rtx num_waves_mem = gcn_oacc_dim_size (1);
3413 rtx num_waves = gen_rtx_REG (SImode, FIRST_PARM_REG+1);
3414 set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
3415 emit_move_insn (num_waves, num_waves_mem);
3416
3417 rtx workgroup_num = gcn_oacc_dim_pos (0);
3418 rtx wave_num = gen_rtx_REG (SImode, FIRST_PARM_REG+2);
3419 emit_move_insn (wave_num, gcn_oacc_dim_pos (1));
3420
3421 rtx thread_id = gen_rtx_REG (SImode, FIRST_PARM_REG+3);
3422 emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
3423 emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
3424
3425 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3426 [KERNARG_SEGMENT_PTR_ARG]);
3427 rtx stack_size_mem = gen_rtx_MEM (SImode,
3428 gen_rtx_PLUS (DImode, kernarg_reg,
3429 GEN_INT (52)));
3430 set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
3431 emit_move_insn (wave_offset, stack_size_mem);
3432
3433 emit_insn (gen_mulsi3 (wave_offset, wave_offset, thread_id));
3434
3435 /* The FLAT_SCRATCH_INIT is not usually needed, but can be enabled
3436 via the function attributes. */
3437 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3438 {
3439 rtx fs_init_lo =
3440 gen_rtx_REG (SImode,
3441 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3442 rtx fs_init_hi =
3443 gen_rtx_REG (SImode,
3444 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3445 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3446 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3447
3448 /*rtx queue = gen_rtx_REG(DImode,
3449 cfun->machine->args.reg[QUEUE_PTR_ARG]);
3450 rtx aperture = gen_rtx_MEM (SImode,
3451 gen_rtx_PLUS (DImode, queue,
3452 gen_int_mode (68, SImode)));
3453 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3454
3455 /* Set up flat_scratch. */
3456 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3457 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3458 gen_int_mode (8, SImode)));
3459 emit_move_insn (fs_reg_lo, fs_init_hi);
3460 }
3461
3462 /* Set up frame pointer and stack pointer. */
3463 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
3464 rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3465 rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
3466 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3467 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3468 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3469
3470 HOST_WIDE_INT sp_adjust = (offsets->local_vars
3471 + offsets->outgoing_args_size);
3472
3473 /* Initialize FP and SP from space allocated on the host. */
3474 rtx stack_addr_mem = gen_rtx_MEM (DImode,
3475 gen_rtx_PLUS (DImode, kernarg_reg,
3476 GEN_INT (40)));
3477 set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
3478 emit_move_insn (fp, stack_addr_mem);
3479 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3480 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3481 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
3482
3483 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3484 the split add explicitly, and specify the DImode add in the note.
3485 The DWARF info expects that the callee-save data is in the frame,
3486 even though it isn't (because this is the entry point), so we
3487 make a notional adjustment to the DWARF frame offset here. */
3488 rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3489 DImode);
3490 rtx insn;
3491 if (sp_adjust > 0)
3492 {
3493 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3494 rtx adjustment = gen_int_mode (sp_adjust, DImode);
3495 insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, fp_lo, adjustment,
3496 scc));
3497 emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3498 }
3499 else
3500 insn = emit_move_insn (sp, fp);
3501 RTX_FRAME_RELATED_P (insn) = 1;
3502 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3503 gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3504 dbg_adjustment)));
3505
3506 if (offsets->need_frame_pointer)
3507 {
3508 /* Set the CFA to the entry stack address, as an offset from the
3509 frame pointer. This is necessary when alloca is used, and
3510 harmless otherwise. */
3511 rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3512 add_reg_note (insn, REG_CFA_DEF_CFA,
3513 gen_rtx_PLUS (DImode, fp, neg_adjust));
3514 }
3515
3516 /* Make sure the flat scratch reg doesn't get optimised away. */
3517 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3518 }
3519
3520 /* Ensure that the scheduler doesn't do anything unexpected. */
3521 emit_insn (gen_blockage ());
3522
3523 if (TARGET_M0_LDS_LIMIT)
3524 {
3525 /* m0 is initialized for the usual LDS DS and FLAT memory case.
3526 The low-part is the address of the topmost addressable byte, which is
3527 size-1. The high-part is an offset and should be zero. */
3528 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3529 gen_int_mode (LDS_SIZE, SImode));
3530
3531 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3532 }
3533
3534 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3535 {
3536 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
3537 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3538 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3539 "gomp_gcn_enter_kernel"));
3540 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3541 }
3542}
3543
3544/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
3545
3546 See gcn_expand_prologue for stack details. */
3547
3548void
3549gcn_expand_epilogue (void)
3550{
3551 /* Ensure that the scheduler doesn't do anything unexpected. */
3552 emit_insn (gen_blockage ());
3553
3554 if (!cfun || !cfun->machine || cfun->machine->normal_function)
3555 {
3556 machine_function *offsets = gcn_compute_frame_offsets ();
3557 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3558 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3559
3560 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3561
3562 if (offsets->need_frame_pointer)
3563 {
3564 /* Restore old SP from the frame pointer. */
3565 if (sp_adjust > 0)
3566 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3567 else
3568 emit_move_insn (sp, fp);
3569 }
3570 else
3571 {
3572 /* Restore old SP from current SP. */
3573 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3574
3575 if (sp_adjust > 0)
3576 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3577 }
3578
3579 move_callee_saved_registers (sp, offsets, false);
3580
3581 /* There's no explicit use of the link register on the return insn. Emit
3582 one here instead. */
3583 if (offsets->lr_needs_saving)
3584 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3585
3586 /* Similar for frame pointer. */
3587 if (offsets->need_frame_pointer)
3588 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3589 }
3590 else if (flag_openmp)
3591 {
3592 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
3593 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3594 emit_move_insn (fn_reg,
3595 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3596 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3597 }
3598 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3599 {
3600 /* Assume that an exit value compatible with gcn-run is expected.
3601 That is, the third input parameter is an int*.
3602
3603 We can't allocate any new registers, but the kernarg_reg is
3604 dead after this, so we'll use that. */
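      /* (The byte offset 16 below assumes the first three gcn-run
	 parameters occupy 64-bit slots in the kernarg segment.) */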
3605 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3606 [KERNARG_SEGMENT_PTR_ARG]);
3607 rtx retptr_mem = gen_rtx_MEM (DImode,
3608 gen_rtx_PLUS (DImode, kernarg_reg,
3609 GEN_INT (16)));
3610 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3611 emit_move_insn (kernarg_reg, retptr_mem);
3612
3613 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
3614 rtx scalar_retval = gen_rtx_REG (SImode, FIRST_PARM_REG);
3615 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
3616 emit_move_insn (scalar_retval, gen_rtx_REG (SImode, RETURN_VALUE_REG));
3617 emit_move_insn (retval_mem, scalar_retval);
3618 }
3619
3620 emit_jump_insn (gen_gcn_return ());
3621}
3622
3623/* Implement TARGET_FRAME_POINTER_REQUIRED.
3624
3625 Return true if the frame pointer should not be eliminated. */
3626
3627bool
3628gcn_frame_pointer_rqd (void)
3629{
3630 /* GDB needs the frame pointer in order to unwind properly,
3631 but that's not important for the entry point, unless alloca is used.
3632 It's not important for code execution, so we should respect the
3633 -fomit-frame-pointer flag. */
3634 return (!flag_omit_frame_pointer
3635 && cfun
3636 && (cfun->calls_alloca
3637 || (cfun->machine && cfun->machine->normal_function)));
3638}
3639
3640/* Implement TARGET_CAN_ELIMINATE.
3641
3642 Return true if the compiler is allowed to try to replace register number
3643 FROM_REG with register number TO_REG.
3644
3645 FIXME: is the default "true" not enough? Should this be a negative set? */
3646
3647bool
3648gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3649{
3650 return (to_reg == HARD_FRAME_POINTER_REGNUM
3651 || to_reg == STACK_POINTER_REGNUM);
3652}
3653
3654/* Implement INITIAL_ELIMINATION_OFFSET.
3655
3656 Returns the initial difference between the specified pair of registers, in
3657 terms of stack position. */
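/* A sketch of the frame layout these offsets imply (the GCN stack grows
 upwards, so each region sits above the one before):

 arg pointer (the incoming stack pointer)
 callee-saved registers (offsets->callee_saves bytes)
 frame pointer
 local variables (offsets->local_vars bytes)
 outgoing arguments (offsets->outgoing_args_size bytes)
 stack pointer */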
3658
3659HOST_WIDE_INT
3660gcn_initial_elimination_offset (int from, int to)
3661{
3662 machine_function *offsets = gcn_compute_frame_offsets ();
3663
3664 switch (from)
3665 {
3666 case ARG_POINTER_REGNUM:
3667 if (to == STACK_POINTER_REGNUM)
3668 return -(offsets->callee_saves + offsets->local_vars
3669 + offsets->outgoing_args_size);
3670 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3671 return -offsets->callee_saves;
3672 else
3673 gcc_unreachable ();
3674 break;
3675
3676 case FRAME_POINTER_REGNUM:
3677 if (to == STACK_POINTER_REGNUM)
3678 return -(offsets->local_vars + offsets->outgoing_args_size);
3679 else if (to == HARD_FRAME_POINTER_REGNUM)
3680 return 0;
3681 else
3682 gcc_unreachable ();
3683 break;
3684
3685 default:
3686 gcc_unreachable ();
3687 }
3688}
3689
3690/* Implement HARD_REGNO_RENAME_OK.
3691
3692 Return true if it is permissible to rename a hard register from
3693 FROM_REG to TO_REG. */
3694
3695bool
3696gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3697{
3698 if (from_reg == SCC_REG
3699 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3700 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3701 || to_reg == SCC_REG
3702 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3703 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3704 return false;
3705
3706 /* Allow the link register to be used if it was saved. */
3707 if ((to_reg & ~1) == LINK_REGNUM)
3708 return !cfun || cfun->machine->lr_needs_saving;
3709
3710 /* Allow the registers used for the static chain to be used if the chain is
3711 not in active use. */
3712 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3713 return !cfun
3714 || !(cfun->static_chain_decl
3715 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3716 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3717
3718 return true;
3719}
3720
3721/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3722
3723 Which mode is required for saving NREGS of a pseudo-register in
3724 call-clobbered hard register REGNO. */
3725
3726machine_mode
3727gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3728 machine_mode regmode)
3729{
3730 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3731
3732 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3733 result = (nregs == 1 ? SImode : DImode);
3734
3735 return result;
3736}
3737
3738/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3739
3740 Output assembler code for a block containing the constant parts
3741 of a trampoline, leaving space for the variable parts. */
3742
3743static void
3744gcn_asm_trampoline_template (FILE *f)
3745{
3746 /* The source operand of the move instructions must be a 32-bit
3747 constant following the opcode. */
3748 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3749 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3750 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3751 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3752 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3753}
3754
3755/* Implement TARGET_TRAMPOLINE_INIT.
3756
3757 Emit RTL insns to initialize the variable parts of a trampoline.
3758 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3759 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3760 to be passed to the target function. */
3761
3762static void
3763gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3764{
3765 if (TARGET_GCN5_PLUS)
3766 sorry ("nested function trampolines not supported on GCN5 due to"
3767 " non-executable stacks");
3768
3769 emit_block_move (m_tramp, assemble_trampoline_template (),
3770 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3771
3772 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3773 rtx chain_value_reg = copy_to_reg (chain_value);
3774 rtx fnaddr_reg = copy_to_reg (fnaddr);
3775
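 /* Each s_mov_b32 in the template is 8 bytes: a 32-bit opcode followed
 by its 32-bit literal, so the literals to patch sit at byte offsets
 4, 12, 20 and 28. The first two receive the static chain value and
 the last two the function address. */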
3776 for (int i = 0; i < 4; i++)
3777 {
3778 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3779 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3780 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3781 }
3782
3783 rtx tramp_addr = XEXP (m_tramp, 0);
3784 emit_insn (gen_clear_icache (tramp_addr,
3785 plus_constant (ptr_mode, tramp_addr,
3786 TRAMPOLINE_SIZE)));
3787}
3788
3789/* Implement TARGET_EXPAND_DIVMOD_LIBFUNC.
3790
3791 There are divmod libfuncs for all modes except TImode. They return the
3792 two values packed into a larger integer/vector. */
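/* For example (following the code below, with the libfuncs registered in
 gcn_init_libfuncs): __divmodsi4 returns a DImode value in the register
 pair starting at RETURN_VALUE_REG, so the quotient is read from
 RETURN_VALUE_REG and the remainder from RETURN_VALUE_REG + 1. For
 DImode operands the result is TImode and the remainder starts at
 RETURN_VALUE_REG + 2. */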
3793
3794void
3795gcn_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, rtx op1,
3796 rtx *quot, rtx *rem)
3797{
3798 machine_mode innermode = (VECTOR_MODE_P (mode)
3799 ? GET_MODE_INNER (mode) : mode);
3800 machine_mode wideinnermode = VOIDmode;
3801 machine_mode widemode = VOIDmode;
3802
3803 switch (innermode)
3804 {
3805 case E_QImode:
3806 case E_HImode:
3807 case E_SImode:
3808 wideinnermode = DImode;
3809 break;
3810 case E_DImode:
3811 wideinnermode = TImode;
3812 break;
3813 default:
3814 gcc_unreachable ();
3815 }
3816
3817 if (VECTOR_MODE_P (mode))
3818 widemode = VnMODE (GET_MODE_NUNITS (mode), wideinnermode);
3819 else
3820 widemode = wideinnermode;
3821
3822 emit_library_call_value (libfunc, gen_rtx_REG (widemode, RETURN_VALUE_REG),
3823 LCT_NORMAL, widemode, op0, mode, op1, mode);
3824
3825 *quot = gen_rtx_REG (mode, RETURN_VALUE_REG);
3826 *rem = gen_rtx_REG (mode,
3827 RETURN_VALUE_REG + (wideinnermode == TImode ? 2 : 1));
3828}
3829
3830/* }}} */
3831/* {{{ Miscellaneous. */
3832
3833/* Implement TARGET_CANNOT_COPY_INSN_P.
3834
3835 Return true if INSN must not be duplicated. */
3836
3837static bool
3838gcn_cannot_copy_insn_p (rtx_insn *insn)
3839{
3840 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3841 return true;
3842
3843 return false;
3844}
3845
3846/* Implement TARGET_DEBUG_UNWIND_INFO.
3847
3848 Defines the mechanism that will be used for describing frame unwind
3849 information to the debugger. */
3850
3851static enum unwind_info_type
3852gcn_debug_unwind_info ()
3853{
3854 return UI_DWARF2;
3855}
3856
3857/* Determine if there is a suitable hardware conversion instruction.
3858 Used primarily by the machine description. */
3859
3860bool
3861gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3862{
3863 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3864 return false;
3865
3866 if (VECTOR_MODE_P (from))
3867 {
3868 if (GET_MODE_NUNITS (from) != GET_MODE_NUNITS (to))
3869 return false;
3870
3871 from = GET_MODE_INNER (from);
3872 to = GET_MODE_INNER (to);
3873 }
3874
3875 switch (op)
3876 {
3877 case fix_trunc_cvt:
3878 case fixuns_trunc_cvt:
3879 if (GET_MODE_CLASS (from) != MODE_FLOAT
3880 || GET_MODE_CLASS (to) != MODE_INT)
3881 return false;
3882 break;
3883 case float_cvt:
3884 case floatuns_cvt:
3885 if (GET_MODE_CLASS (from) != MODE_INT
3886 || GET_MODE_CLASS (to) != MODE_FLOAT)
3887 return false;
3888 break;
3889 case extend_cvt:
3890 if (GET_MODE_CLASS (from) != MODE_FLOAT
3891 || GET_MODE_CLASS (to) != MODE_FLOAT
3892 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3893 return false;
3894 break;
3895 case trunc_cvt:
3896 if (GET_MODE_CLASS (from) != MODE_FLOAT
3897 || GET_MODE_CLASS (to) != MODE_FLOAT
3898 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3899 return false;
3900 break;
3901 }
3902
3903 return ((to == HImode && from == HFmode)
3904 || (to == SImode && (from == SFmode || from == DFmode))
3905 || (to == HFmode && (from == HImode || from == SFmode))
3906 || (to == SFmode && (from == SImode || from == HFmode
3907 || from == DFmode))
3908 || (to == DFmode && (from == SImode || from == SFmode)));
3909}
3910
3911/* Implement TARGET_EMUTLS_VAR_INIT.
3912
3913 Disable emutls (gthr-gcn.h does not support it, yet). */
3914
3915tree
3916gcn_emutls_var_init (tree, tree decl, tree)
3917{
3918 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3919 return NULL_TREE;
3920}
3921
3922/* }}} */
3923/* {{{ Costs. */
3924
3925/* Implement TARGET_RTX_COSTS.
3926
3927 Compute a (partial) cost for rtx X. Return true if the complete
3928 cost has been computed, and false if subexpressions should be
3929 scanned. In either case, *TOTAL contains the cost result. */
3930
3931static bool
3932gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3933{
3934 enum rtx_code code = GET_CODE (x);
3935 switch (code)
3936 {
3937 case CONST:
3938 case CONST_DOUBLE:
3939 case CONST_VECTOR:
3940 case CONST_INT:
3941 if (gcn_inline_constant_p (x))
3942 *total = 0;
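      /* The next test accepts exactly the signed 16-bit immediates,
	 i.e. -0x8000 <= INTVAL (x) < 0x8000. */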
3943 else if (code == CONST_INT
3944 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3945 *total = 1;
3946 else if (gcn_constant_p (x))
3947 *total = 2;
3948 else
3949 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3950 return true;
3951
3952 case DIV:
3953 *total = 100;
3954 return false;
3955
3956 default:
3957 *total = 3;
3958 return false;
3959 }
3960}
3961
3962/* Implement TARGET_MEMORY_MOVE_COST.
3963
3964 Return the cost of moving data of mode M between a
3965 register and memory. A value of 2 is the default; this cost is
3966 relative to those in `REGISTER_MOVE_COST'.
3967
3968 This function is used extensively by register_move_cost, which is used
3969 to build tables at startup, so keep it fast.
3970 IN selects between the cost of a load (true) and a store (false).
3971
3972 If moving between registers and memory is more expensive than
3973 between two registers, you should define this macro to express the
3974 relative cost. */
3978
3979#define LOAD_COST 32
3980#define STORE_COST 32
3981static int
3982gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3983{
3984 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3985 switch (regclass)
3986 {
3987 case SCC_CONDITIONAL_REG:
3988 case VCCZ_CONDITIONAL_REG:
3989 case VCC_CONDITIONAL_REG:
3990 case EXECZ_CONDITIONAL_REG:
3991 case ALL_CONDITIONAL_REGS:
3992 case SGPR_REGS:
3993 case SGPR_EXEC_REGS:
3994 case EXEC_MASK_REG:
3995 case SGPR_VOP_SRC_REGS:
3996 case SGPR_MEM_SRC_REGS:
3997 case SGPR_SRC_REGS:
3998 case SGPR_DST_REGS:
3999 case GENERAL_REGS:
4000 case AFP_REGS:
4001 if (!in)
4002 return (STORE_COST + 2) * nregs;
4003 return LOAD_COST * nregs;
4004 case VGPR_REGS:
4005 if (in)
4006 return (LOAD_COST + 2) * nregs;
4007 return STORE_COST * nregs;
4008 case ALL_REGS:
4009 case ALL_GPR_REGS:
4010 case SRCDST_REGS:
4011 if (in)
4012 return (LOAD_COST + 2) * nregs;
4013 return (STORE_COST + 2) * nregs;
4014 default:
4015 gcc_unreachable ();
4016 }
4017}
4018
4019/* Implement TARGET_REGISTER_MOVE_COST.
4020
4021 Return the cost of moving data from a register in class CLASS1 to
4022 one in class CLASS2. Base value is 2. */
4023
4024static int
4025gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
4026{
4027 /* Increase cost of moving from and to vector registers. While this is
4028 fast in hardware (I think), it has the hidden cost of setting up the exec
4029 flags. */
4030 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
4031 return 4;
4032 return 2;
4033}
4034
4035/* }}} */
4036/* {{{ Builtins. */
4037
4038/* Type codes used by GCN built-in definitions. */
4039
4040enum gcn_builtin_type_index
4041{
4042 GCN_BTI_END_OF_PARAMS,
4043
4044 GCN_BTI_VOID,
4045 GCN_BTI_BOOL,
4046 GCN_BTI_INT,
4047 GCN_BTI_UINT,
4048 GCN_BTI_SIZE_T,
4049 GCN_BTI_LLINT,
4050 GCN_BTI_LLUINT,
4051 GCN_BTI_EXEC,
4052
4053 GCN_BTI_SF,
4054 GCN_BTI_V64SI,
4055 GCN_BTI_V64SF,
4056 GCN_BTI_V64DF,
4057 GCN_BTI_V64PTR,
4058 GCN_BTI_SIPTR,
4059 GCN_BTI_SFPTR,
4060 GCN_BTI_VOIDPTR,
4061
4062 GCN_BTI_LDS_VOIDPTR,
4063
4064 GCN_BTI_MAX
4065};
4066
4067static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
4068
4069#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
4070#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
4071#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
4072#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
4073#define v64df_type_node (gcn_builtin_types[GCN_BTI_V64DF])
4074#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
4075#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
4076#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
4077#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
4078#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
4079
4080static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
4081 struct gcn_builtin_description *);
4082static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
4083 struct gcn_builtin_description *);
4084
4085struct gcn_builtin_description;
4086typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
4087 struct gcn_builtin_description *);
4088
4089enum gcn_builtin_type
4090{
4091 B_UNIMPLEMENTED, /* Sorry out */
4092 B_INSN, /* Emit a pattern */
4093 B_OVERLOAD /* Placeholder for an overloaded function */
4094};
4095
4096struct gcn_builtin_description
4097{
4098 int fcode;
4099 int icode;
4100 const char *name;
4101 enum gcn_builtin_type type;
4102 /* The first element of parm is always the return type. The rest
4103 are a zero terminated list of parameters. */
4104 int parm[6];
4105 gcn_builtin_expander expander;
4106};
4107
4108/* Read in the GCN builtins from gcn-builtins.def. */
4109
4110extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
4111
4112struct gcn_builtin_description gcn_builtins[] = {
4113#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
4114 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
4115
4116#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
4117 {GCN_BUILTIN_ ## fcode ## _V64SI, \
4118 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
4119 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
4120 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
4121 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
4122 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
4123 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
4124 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
4125
4126#include "gcn-builtins.def"
4127#undef DEF_BUILTIN_BINOP_INT_FP
4128#undef DEF_BUILTIN
4129};
4130
4131static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
4132
4133/* Implement TARGET_BUILTIN_DECL.
4134
4135 Return the GCN builtin for CODE. */
4136
4137tree
4138gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4139{
4140 if (code >= GCN_BUILTIN_MAX)
4141 return error_mark_node;
4142
4143 return gcn_builtin_decls[code];
4144}
4145
4146/* Helper function for gcn_init_builtins. */
4147
4148static void
4149gcn_init_builtin_types (void)
4150{
4151 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
4152 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
4153 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
4154 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
4155 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
4156 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
4157 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
4158
4159 exec_type_node = unsigned_intDI_type_node;
4160 sf_type_node = float32_type_node;
4161 v64si_type_node = build_vector_type (intSI_type_node, 64);
4162 v64sf_type_node = build_vector_type (float_type_node, 64);
4163 v64df_type_node = build_vector_type (double_type_node, 64);
4164 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
4165 /*build_pointer_type
4166 (integer_type_node) */
4167 , 64);
4168 tree tmp = build_distinct_type_copy (intSI_type_node);
4169 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_DEFAULT;
4170 siptr_type_node = build_pointer_type (tmp);
4171
4172 tmp = build_distinct_type_copy (float_type_node);
4173 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_DEFAULT;
4174 sfptr_type_node = build_pointer_type (tmp);
4175
4176 tmp = build_distinct_type_copy (void_type_node);
4177 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_DEFAULT;
4178 voidptr_type_node = build_pointer_type (tmp);
4179
4180 tmp = build_distinct_type_copy (void_type_node);
4181 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
4182 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
4183}
4184
4185/* Implement TARGET_INIT_BUILTINS.
4186
4187 Set up all builtin functions for this target. */
4188
4189static void
4190gcn_init_builtins (void)
4191{
4192 gcn_init_builtin_types ();
4193
4194 struct gcn_builtin_description *d;
4195 unsigned int i;
4196 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
4197 {
4198 tree p;
4199 char name[64]; /* build_function will make a copy. */
4200 int parm;
4201
4202 /* FIXME: Is this necessary/useful? */
4203 if (d->name == 0)
4204 continue;
4205
4206 /* Find last parm. */
4207 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
4208 ;
4209
4210 p = void_list_node;
4211 while (parm > 1)
4212 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
4213
4214 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
4215
4216 sprintf (name, "__builtin_gcn_%s", d->name);
4217 gcn_builtin_decls[i]
4218 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
4219
4220 /* These builtins don't throw. */
4221 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
4222 }
4223
4224 /* These builtins need to take/return an LDS pointer: override the generic
4225 versions here. */
4226
4227 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
4228 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
4229
4230 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
4231 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
4232 false);
4233
4234 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
4235 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
4236 false);
4237
4238 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
4239 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
4240}
4241
4242/* Implement TARGET_INIT_LIBFUNCS. */
4243
4244static void
4245gcn_init_libfuncs (void)
4246{
4247 /* BITS_PER_WORD * 2 is 64 bits, which causes
4248 optabs-libfuncs.cc:gen_int_libfunc to omit TImode (i.e. 128-bit)
4249 libcalls that we need to support operations for that type. Initialise
4250 them here instead. */
4251 set_optab_libfunc (udiv_optab, TImode, "__udivti3");
4252 set_optab_libfunc (umod_optab, TImode, "__umodti3");
4253 set_optab_libfunc (sdiv_optab, TImode, "__divti3");
4254 set_optab_libfunc (smod_optab, TImode, "__modti3");
4255 set_optab_libfunc (smul_optab, TImode, "__multi3");
4256 set_optab_libfunc (addv_optab, TImode, "__addvti3");
4257 set_optab_libfunc (subv_optab, TImode, "__subvti3");
4258 set_optab_libfunc (negv_optab, TImode, "__negvti2");
4259 set_optab_libfunc (absv_optab, TImode, "__absvti2");
4260 set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
4261 set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
4262 set_optab_libfunc (clz_optab, TImode, "__clzti2");
4263 set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
4264 set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
4265 set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
4266 set_optab_libfunc (parity_optab, TImode, "__parityti2");
4267 set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
4268
4269 set_optab_libfunc (sdivmod_optab, SImode, "__divmodsi4");
4270 set_optab_libfunc (udivmod_optab, SImode, "__udivmodsi4");
4271 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
4272 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
4273
4274 set_optab_libfunc (sdiv_optab, V2QImode, "__divv2qi3");
4275 set_optab_libfunc (udiv_optab, V2QImode, "__udivv2qi3");
4276 set_optab_libfunc (smod_optab, V2QImode, "__modv2qi3");
4277 set_optab_libfunc (umod_optab, V2QImode, "__umodv2qi3");
4278#if 0
4279 set_optab_libfunc (sdivmod_optab, V2QImode, "__divmodv2qi4");
4280 set_optab_libfunc (udivmod_optab, V2QImode, "__udivmodv2qi4");
4281#endif
4282 set_optab_libfunc (sdiv_optab, V4QImode, "__divv4qi3");
4283 set_optab_libfunc (udiv_optab, V4QImode, "__udivv4qi3");
4284 set_optab_libfunc (smod_optab, V4QImode, "__modv4qi3");
4285 set_optab_libfunc (umod_optab, V4QImode, "__umodv4qi3");
4286#if 0
4287 set_optab_libfunc (sdivmod_optab, V4QImode, "__divmodv4qi4");
4288 set_optab_libfunc (udivmod_optab, V4QImode, "__udivmodv4qi4");
4289#endif
4290 set_optab_libfunc (sdiv_optab, V8QImode, "__divv8qi3");
4291 set_optab_libfunc (udiv_optab, V8QImode, "__udivv8qi3");
4292 set_optab_libfunc (smod_optab, V8QImode, "__modv8qi3");
4293 set_optab_libfunc (umod_optab, V8QImode, "__umodv8qi3");
4294#if 0
4295 set_optab_libfunc (sdivmod_optab, V8QImode, "__divmodv8qi4");
4296 set_optab_libfunc (udivmod_optab, V8QImode, "__udivmodv8qi4");
4297#endif
4298 set_optab_libfunc (sdiv_optab, V16QImode, "__divv16qi3");
4299 set_optab_libfunc (udiv_optab, V16QImode, "__udivv16qi3");
4300 set_optab_libfunc (smod_optab, V16QImode, "__modv16qi3");
4301 set_optab_libfunc (umod_optab, V16QImode, "__umodv16qi3");
4302#if 0
4303 set_optab_libfunc (sdivmod_optab, V16QImode, "__divmodv16qi4");
4304 set_optab_libfunc (udivmod_optab, V16QImode, "__udivmodv16qi4");
4305#endif
4306 set_optab_libfunc (sdiv_optab, V32QImode, "__divv32qi3");
4307 set_optab_libfunc (udiv_optab, V32QImode, "__udivv32qi3");
4308 set_optab_libfunc (smod_optab, V32QImode, "__modv32qi3");
4309 set_optab_libfunc (umod_optab, V32QImode, "__umodv32qi3");
4310#if 0
4311 set_optab_libfunc (sdivmod_optab, V32QImode, "__divmodv32qi4");
4312 set_optab_libfunc (udivmod_optab, V32QImode, "__udivmodv32qi4");
4313#endif
4314 set_optab_libfunc (sdiv_optab, V64QImode, "__divv64qi3");
4315 set_optab_libfunc (udiv_optab, V64QImode, "__udivv64qi3");
4316 set_optab_libfunc (smod_optab, V64QImode, "__modv64qi3");
4317 set_optab_libfunc (umod_optab, V64QImode, "__umodv64qi3");
4318#if 0
4319 set_optab_libfunc (sdivmod_optab, V64QImode, "__divmodv64qi4");
4320 set_optab_libfunc (udivmod_optab, V64QImode, "__udivmodv64qi4");
4321#endif
4322
4323 set_optab_libfunc (sdiv_optab, V2HImode, "__divv2hi3");
4324 set_optab_libfunc (udiv_optab, V2HImode, "__udivv2hi3");
4325 set_optab_libfunc (smod_optab, V2HImode, "__modv2hi3");
4326 set_optab_libfunc (umod_optab, V2HImode, "__umodv2hi3");
4327#if 0
4328 set_optab_libfunc (sdivmod_optab, V2HImode, "__divmodv2hi4");
4329 set_optab_libfunc (udivmod_optab, V2HImode, "__udivmodv2hi4");
4330#endif
4331 set_optab_libfunc (sdiv_optab, V4HImode, "__divv4hi3");
4332 set_optab_libfunc (udiv_optab, V4HImode, "__udivv4hi3");
4333 set_optab_libfunc (smod_optab, V4HImode, "__modv4hi3");
4334 set_optab_libfunc (umod_optab, V4HImode, "__umodv4hi3");
4335#if 0
4336 set_optab_libfunc (sdivmod_optab, V4HImode, "__divmodv4hi4");
4337 set_optab_libfunc (udivmod_optab, V4HImode, "__udivmodv4hi4");
4338#endif
4339 set_optab_libfunc (sdiv_optab, V8HImode, "__divv8hi3");
4340 set_optab_libfunc (udiv_optab, V8HImode, "__udivv8hi3");
4341 set_optab_libfunc (smod_optab, V8HImode, "__modv8hi3");
4342 set_optab_libfunc (umod_optab, V8HImode, "__umodv8hi3");
4343#if 0
4344 set_optab_libfunc (sdivmod_optab, V8HImode, "__divmodv8hi4");
4345 set_optab_libfunc (udivmod_optab, V8HImode, "__udivmodv8hi4");
4346#endif
4347 set_optab_libfunc (sdiv_optab, V16HImode, "__divv16hi3");
4348 set_optab_libfunc (udiv_optab, V16HImode, "__udivv16hi3");
4349 set_optab_libfunc (smod_optab, V16HImode, "__modv16hi3");
4350 set_optab_libfunc (umod_optab, V16HImode, "__umodv16hi3");
4351#if 0
4352 set_optab_libfunc (sdivmod_optab, V16HImode, "__divmodv16hi4");
4353 set_optab_libfunc (udivmod_optab, V16HImode, "__udivmodv16hi4");
4354#endif
4355 set_optab_libfunc (sdiv_optab, V32HImode, "__divv32hi3");
4356 set_optab_libfunc (udiv_optab, V32HImode, "__udivv32hi3");
4357 set_optab_libfunc (smod_optab, V32HImode, "__modv32hi3");
4358 set_optab_libfunc (umod_optab, V32HImode, "__umodv32hi3");
4359#if 0
4360 set_optab_libfunc (sdivmod_optab, V32HImode, "__divmodv32hi4");
4361 set_optab_libfunc (udivmod_optab, V32HImode, "__udivmodv32hi4");
4362#endif
4363 set_optab_libfunc (sdiv_optab, V64HImode, "__divv64hi3");
4364 set_optab_libfunc (udiv_optab, V64HImode, "__udivv64hi3");
4365 set_optab_libfunc (smod_optab, V64HImode, "__modv64hi3");
4366 set_optab_libfunc (umod_optab, V64HImode, "__umodv64hi3");
4367#if 0
4368 set_optab_libfunc (sdivmod_optab, V64HImode, "__divmodv64hi4");
4369 set_optab_libfunc (udivmod_optab, V64HImode, "__udivmodv64hi4");
4370#endif
4371
4372 set_optab_libfunc (sdiv_optab, V2SImode, "__divv2si3");
4373 set_optab_libfunc (udiv_optab, V2SImode, "__udivv2si3");
4374 set_optab_libfunc (smod_optab, V2SImode, "__modv2si3");
4375 set_optab_libfunc (umod_optab, V2SImode, "__umodv2si3");
4376#if 0
4377 set_optab_libfunc (sdivmod_optab, V2SImode, "__divmodv2si4");
4378 set_optab_libfunc (udivmod_optab, V2SImode, "__udivmodv2si4");
4379#endif
4380 set_optab_libfunc (sdiv_optab, V4SImode, "__divv4si3");
4381 set_optab_libfunc (udiv_optab, V4SImode, "__udivv4si3");
4382 set_optab_libfunc (smod_optab, V4SImode, "__modv4si3");
4383 set_optab_libfunc (umod_optab, V4SImode, "__umodv4si3");
4384#if 0
4385 set_optab_libfunc (sdivmod_optab, V4SImode, "__divmodv4si4");
4386 set_optab_libfunc (udivmod_optab, V4SImode, "__udivmodv4si4");
4387#endif
4388 set_optab_libfunc (sdiv_optab, V8SImode, "__divv8si3");
4389 set_optab_libfunc (udiv_optab, V8SImode, "__udivv8si3");
4390 set_optab_libfunc (smod_optab, V8SImode, "__modv8si3");
4391 set_optab_libfunc (umod_optab, V8SImode, "__umodv8si3");
4392#if 0
4393 set_optab_libfunc (sdivmod_optab, V8SImode, "__divmodv8si4");
4394 set_optab_libfunc (udivmod_optab, V8SImode, "__udivmodv8si4");
4395#endif
4396 set_optab_libfunc (sdiv_optab, V16SImode, "__divv16si3");
4397 set_optab_libfunc (udiv_optab, V16SImode, "__udivv16si3");
4398 set_optab_libfunc (smod_optab, V16SImode, "__modv16si3");
4399 set_optab_libfunc (umod_optab, V16SImode, "__umodv16si3");
4400#if 0
4401 set_optab_libfunc (sdivmod_optab, V16SImode, "__divmodv16si4");
4402 set_optab_libfunc (udivmod_optab, V16SImode, "__udivmodv16si4");
4403#endif
4404 set_optab_libfunc (sdiv_optab, V32SImode, "__divv32si3");
4405 set_optab_libfunc (udiv_optab, V32SImode, "__udivv32si3");
4406 set_optab_libfunc (smod_optab, V32SImode, "__modv32si3");
4407 set_optab_libfunc (umod_optab, V32SImode, "__umodv32si3");
4408#if 0
4409 set_optab_libfunc (sdivmod_optab, V32SImode, "__divmodv32si4");
4410 set_optab_libfunc (udivmod_optab, V32SImode, "__udivmodv32si4");
4411#endif
4412 set_optab_libfunc (sdiv_optab, V64SImode, "__divv64si3");
4413 set_optab_libfunc (udiv_optab, V64SImode, "__udivv64si3");
4414 set_optab_libfunc (smod_optab, V64SImode, "__modv64si3");
4415 set_optab_libfunc (umod_optab, V64SImode, "__umodv64si3");
4416#if 0
4417 set_optab_libfunc (sdivmod_optab, V64SImode, "__divmodv64si4");
4418 set_optab_libfunc (udivmod_optab, V64SImode, "__udivmodv64si4");
4419#endif
4420
4421 set_optab_libfunc (sdiv_optab, V2DImode, "__divv2di3");
4422 set_optab_libfunc (udiv_optab, V2DImode, "__udivv2di3");
4423 set_optab_libfunc (smod_optab, V2DImode, "__modv2di3");
4424 set_optab_libfunc (umod_optab, V2DImode, "__umodv2di3");
4425#if 0
4426 set_optab_libfunc (sdivmod_optab, V2DImode, "__divmodv2di4");
4427 set_optab_libfunc (udivmod_optab, V2DImode, "__udivmodv2di4");
4428#endif
4429 set_optab_libfunc (sdiv_optab, V4DImode, "__divv4di3");
4430 set_optab_libfunc (udiv_optab, V4DImode, "__udivv4di3");
4431 set_optab_libfunc (smod_optab, V4DImode, "__modv4di3");
4432 set_optab_libfunc (umod_optab, V4DImode, "__umodv4di3");
4433#if 0
4434 set_optab_libfunc (sdivmod_optab, V4DImode, "__divmodv4di4");
4435 set_optab_libfunc (udivmod_optab, V4DImode, "__udivmodv4di4");
4436#endif
4437 set_optab_libfunc (sdiv_optab, V8DImode, "__divv8di3");
4438 set_optab_libfunc (udiv_optab, V8DImode, "__udivv8di3");
4439 set_optab_libfunc (smod_optab, V8DImode, "__modv8di3");
4440 set_optab_libfunc (umod_optab, V8DImode, "__umodv8di3");
4441#if 0
4442 set_optab_libfunc (sdivmod_optab, V8DImode, "__divmodv8di4");
4443 set_optab_libfunc (udivmod_optab, V8DImode, "__udivmodv8di4");
4444#endif
4445 set_optab_libfunc (sdiv_optab, V16DImode, "__divv16di3");
4446 set_optab_libfunc (udiv_optab, V16DImode, "__udivv16di3");
4447 set_optab_libfunc (smod_optab, V16DImode, "__modv16di3");
4448 set_optab_libfunc (umod_optab, V16DImode, "__umodv16di3");
4449#if 0
4450 set_optab_libfunc (sdivmod_optab, V16DImode, "__divmodv16di4");
4451 set_optab_libfunc (udivmod_optab, V16DImode, "__udivmodv16di4");
4452#endif
4453 set_optab_libfunc (sdiv_optab, V32DImode, "__divv32di3");
4454 set_optab_libfunc (udiv_optab, V32DImode, "__udivv32di3");
4455 set_optab_libfunc (smod_optab, V32DImode, "__modv32di3");
4456 set_optab_libfunc (umod_optab, V32DImode, "__umodv32di3");
4457#if 0
4458 set_optab_libfunc (sdivmod_optab, V32DImode, "__divmodv32di4");
4459 set_optab_libfunc (udivmod_optab, V32DImode, "__udivmodv32di4");
4460#endif
4461 set_optab_libfunc (sdiv_optab, V64DImode, "__divv64di3");
4462 set_optab_libfunc (udiv_optab, V64DImode, "__udivv64di3");
4463 set_optab_libfunc (smod_optab, V64DImode, "__modv64di3");
4464 set_optab_libfunc (umod_optab, V64DImode, "__umodv64di3");
4465#if 0
4466 set_optab_libfunc (sdivmod_optab, V64DImode, "__divmodv64di4");
4467 set_optab_libfunc (udivmod_optab, V64DImode, "__udivmodv64di4");
4468#endif
4469}
4470
4471/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
4472 not require taking the address of any object, other than the memory
4473 cell being operated on.
4474
4475 Helper function for gcn_expand_builtin_1. */
4476
4477static rtx
4478gcn_expand_cmp_swap (tree exp, rtx target)
4479{
4480 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4481 addr_space_t as
4482 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
4483 machine_mode as_mode = gcn_addr_space_address_mode (as);
4484
4485 if (!target)
4486 target = gen_reg_rtx (mode);
4487
4488 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
4489 NULL_RTX, as_mode, EXPAND_NORMAL);
4490 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4491 NULL_RTX, mode, EXPAND_NORMAL);
4492 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4493 NULL_RTX, mode, EXPAND_NORMAL);
4494 rtx pat;
4495
4496 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
4497 set_mem_addr_space (mem, as);
4498
4499 if (!REG_P (cmp))
4500 cmp = copy_to_mode_reg (mode, cmp);
4501 if (!REG_P (src))
4502 src = copy_to_mode_reg (mode, src);
4503
4504 if (mode == SImode)
4505 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
4506 else
4507 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
4508
4509 emit_insn (pat);
4510
4511 return target;
4512}
4513
4514/* Expand many different builtins.
4515
4516 Intended for use in gcn-builtins.def. */
4517
4518static rtx
4519gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
4520 machine_mode /*mode */ , int ignore,
4521 struct gcn_builtin_description *)
4522{
4523 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4524 switch (DECL_MD_FUNCTION_CODE (fndecl))
4525 {
4526 case GCN_BUILTIN_FLAT_LOAD_INT32:
4527 {
4528 if (ignore)
4529 return target;
4530 /*rtx exec = */
4531 force_reg (DImode,
4532 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4533 EXPAND_NORMAL));
4534 /*rtx ptr = */
4535 force_reg (V64DImode,
4536 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
4537 EXPAND_NORMAL));
4538 /*emit_insn (gen_vector_flat_loadv64si
4539 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
4540 return target;
4541 }
4542 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
4543 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
4544 {
4545 if (ignore)
4546 return target;
4547 rtx exec = force_reg (DImode,
4548 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4549 DImode,
4550 EXPAND_NORMAL));
4551 rtx ptr = force_reg (DImode,
4552 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
4553 V64DImode,
4554 EXPAND_NORMAL));
4555 rtx offsets = force_reg (V64SImode,
4556 expand_expr (CALL_EXPR_ARG (exp, 2),
4557 NULL_RTX, V64DImode,
4558 EXPAND_NORMAL));
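	/* Scale the SImode element offsets to byte offsets (shift left
	   by 2), then zero-extend and add them to the base pointer to
	   form the per-lane addresses. */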
4559 rtx addrs = gen_reg_rtx (V64DImode);
4560 rtx tmp = gen_reg_rtx (V64SImode);
4561 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
4562 GEN_INT (2),
4563 gcn_gen_undef (V64SImode), exec));
4564 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
4565 gcn_gen_undef (V64DImode),
4566 exec));
4567 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
4568 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
4569 /* FIXME: set attributes. */
4570 emit_insn (gen_movvNm (target, mem, NULL, exec));
4571 return target;
4572 }
4573 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
4574 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
4575 {
4576 rtx exec = force_reg (DImode,
4577 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4578 DImode,
4579 EXPAND_NORMAL));
4580 rtx ptr = force_reg (DImode,
4581 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
4582 V64DImode,
4583 EXPAND_NORMAL));
4584 rtx offsets = force_reg (V64SImode,
4585 expand_expr (CALL_EXPR_ARG (exp, 2),
4586 NULL_RTX, V64DImode,
4587 EXPAND_NORMAL));
4588 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
4589 3)));
4590 rtx val = force_reg (vmode,
4591 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4592 vmode,
4593 EXPAND_NORMAL));
4594 rtx addrs = gen_reg_rtx (V64DImode);
4595 rtx tmp = gen_reg_rtx (V64SImode);
4596 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
4597 GEN_INT (2),
4598 gcn_gen_undef (V64SImode), exec));
4599 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
4600 gcn_gen_undef (V64DImode),
4601 exec));
4602 rtx mem = gen_rtx_MEM (vmode, addrs);
4603 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
4604 /* FIXME: set attributes. */
4605 emit_insn (gen_movvNm (mem, val, NULL, exec));
4606 return target;
4607 }
4608 case GCN_BUILTIN_SQRTVF:
4609 {
4610 if (ignore)
4611 return target;
4612 rtx arg = force_reg (V64SFmode,
4613 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4614 V64SFmode,
4615 EXPAND_NORMAL));
4616 emit_insn (gen_sqrtv64sf2 (target, arg));
4617 return target;
4618 }
4619 case GCN_BUILTIN_SQRTF:
4620 {
4621 if (ignore)
4622 return target;
4623 rtx arg = force_reg (SFmode,
4624 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4625 SFmode,
4626 EXPAND_NORMAL));
4627 emit_insn (gen_sqrtsf2 (target, arg));
4628 return target;
4629 }
4630 case GCN_BUILTIN_FABSVF:
4631 {
4632 if (ignore)
4633 return target;
4634 rtx arg = force_reg (V64SFmode,
4635 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4636 V64SFmode,
4637 EXPAND_NORMAL));
4638 emit_insn (gen_absv64sf2 (target, arg));
4639 return target;
4640 }
4641 case GCN_BUILTIN_FABSV:
4642 {
4643 if (ignore)
4644 return target;
4645 rtx arg = force_reg (V64DFmode,
4646 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4647 V64DFmode,
4648 EXPAND_NORMAL));
4649 emit_insn (gen_absv64df2 (target, arg));
4650 return target;
4651 }
4652 case GCN_BUILTIN_FLOORVF:
4653 {
4654 if (ignore)
4655 return target;
4656 rtx arg = force_reg (V64SFmode,
4657 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4658 V64SFmode,
4659 EXPAND_NORMAL));
4660 emit_insn (gen_floorv64sf2 (target, arg));
4661 return target;
4662 }
4663 case GCN_BUILTIN_FLOORV:
4664 {
4665 if (ignore)
4666 return target;
4667 rtx arg = force_reg (V64DFmode,
4668 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4669 V64DFmode,
4670 EXPAND_NORMAL));
4671 emit_insn (gen_floorv64df2 (target, arg));
4672 return target;
4673 }
4674 case GCN_BUILTIN_LDEXPVF:
4675 {
4676 if (ignore)
4677 return target;
4678 rtx arg1 = force_reg (V64SFmode,
4679 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4680 V64SFmode,
4681 EXPAND_NORMAL));
4682 rtx arg2 = force_reg (V64SImode,
4683 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
4684 V64SImode,
4685 EXPAND_NORMAL));
4686 emit_insn (gen_ldexpv64sf3 (target, arg1, arg2));
4687 return target;
4688 }
4689 case GCN_BUILTIN_LDEXPV:
4690 {
4691 if (ignore)
4692 return target;
4693 rtx arg1 = force_reg (V64DFmode,
4694 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4695 V64DFmode,
4696 EXPAND_NORMAL));
4697 rtx arg2 = force_reg (V64SImode,
4698 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
4699 V64SImode,
4700 EXPAND_NORMAL));
4701 emit_insn (gen_ldexpv64df3 (target, arg1, arg2));
4702 return target;
4703 }
4704 case GCN_BUILTIN_FREXPVF_EXP:
4705 {
4706 if (ignore)
4707 return target;
4708 rtx arg = force_reg (V64SFmode,
4709 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4710 V64SFmode,
4711 EXPAND_NORMAL));
4712 emit_insn (gen_frexpv64sf_exp2 (target, arg));
4713 return target;
4714 }
4715 case GCN_BUILTIN_FREXPVF_MANT:
4716 {
4717 if (ignore)
4718 return target;
4719 rtx arg = force_reg (V64SFmode,
4720 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4721 V64SFmode,
4722 EXPAND_NORMAL));
4723 emit_insn (gen_frexpv64sf_mant2 (target, arg));
4724 return target;
4725 }
4726 case GCN_BUILTIN_FREXPV_EXP:
4727 {
4728 if (ignore)
4729 return target;
4730 rtx arg = force_reg (V64DFmode,
4731 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4732 V64DFmode,
4733 EXPAND_NORMAL));
4734 emit_insn (gen_frexpv64df_exp2 (target, arg));
4735 return target;
4736 }
4737 case GCN_BUILTIN_FREXPV_MANT:
4738 {
4739 if (ignore)
4740 return target;
4741 rtx arg = force_reg (V64DFmode,
4742 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4743 V64DFmode,
4744 EXPAND_NORMAL));
4745 emit_insn (gen_frexpv64df_mant2 (target, arg));
4746 return target;
4747 }
4748 case GCN_BUILTIN_OMP_DIM_SIZE:
4749 {
4750 if (ignore)
4751 return target;
4752 emit_insn (gen_oacc_dim_size (target,
4753 expand_expr (CALL_EXPR_ARG (exp, 0),
4754 NULL_RTX, SImode,
4755 EXPAND_NORMAL)));
4756 return target;
4757 }
4758 case GCN_BUILTIN_OMP_DIM_POS:
4759 {
4760 if (ignore)
4761 return target;
4762 emit_insn (gen_oacc_dim_pos (target,
4763 expand_expr (CALL_EXPR_ARG (exp, 0),
4764 NULL_RTX, SImode,
4765 EXPAND_NORMAL)));
4766 return target;
4767 }
4768 case GCN_BUILTIN_CMP_SWAP:
4769 case GCN_BUILTIN_CMP_SWAPLL:
4770 return gcn_expand_cmp_swap (exp, target);
4771
4772 case GCN_BUILTIN_ACC_SINGLE_START:
4773 {
4774 if (ignore)
4775 return target;
4776
4777 rtx wavefront = gcn_oacc_dim_pos (1);
4778 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
4779 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
4780 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
4781 return cc;
4782 }
4783
4784 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
4785 {
4786 rtx blk = force_reg (SImode,
4787 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
4788 SImode, EXPAND_NORMAL));
4789 rtx wavefront = gcn_oacc_dim_pos (1);
4790 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
4791 rtx not_zero = gen_label_rtx ();
4792 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
4793 emit_move_insn (blk, const0_rtx);
4794 emit_label (not_zero);
4795 return blk;
4796 }
4797
4798 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
4799 return target;
4800
4801 case GCN_BUILTIN_ACC_BARRIER:
4802 emit_insn (gen_gcn_wavefront_barrier ());
4803 return target;
4804
4805 case GCN_BUILTIN_GET_STACK_LIMIT:
4806 {
4807 /* stackbase = (stack_segment_decr & 0x0000ffffffffffff)
4808 + stack_wave_offset);
4809 seg_size = dispatch_ptr->private_segment_size;
4810 stacklimit = stackbase + seg_size*64;
4811 with segsize = *(uint32_t *) ((char *) dispatch_ptr
4812 + 6*sizeof(int16_t) + 3*sizeof(int32_t));
4813 cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. */
4814 rtx ptr;
4815 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
4816 && cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
4817 {
4818 rtx num_waves_mem = gcn_oacc_dim_size (1);
4819 rtx num_waves = gen_reg_rtx (SImode);
4820 set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
4821 emit_move_insn (num_waves, num_waves_mem);
4822
4823 rtx workgroup_num = gcn_oacc_dim_pos (0);
4824 rtx wave_num = gen_reg_rtx (SImode);
4825 emit_move_insn (wave_num, gcn_oacc_dim_pos (1));
4826
4827 rtx thread_id = gen_reg_rtx (SImode);
4828 emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
4829 emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
4830
4831 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
4832 [KERNARG_SEGMENT_PTR_ARG]);
4833 rtx stack_size_mem = gen_rtx_MEM (SImode,
4834 gen_rtx_PLUS (DImode,
4835 kernarg_reg,
4836 GEN_INT (52)));
4837 set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
4838 rtx stack_size = gen_reg_rtx (SImode);
4839 emit_move_insn (stack_size, stack_size_mem);
4840
4841 rtx wave_offset = gen_reg_rtx (SImode);
4842 emit_insn (gen_mulsi3 (wave_offset, stack_size, thread_id));
4843
4844 rtx stack_limit_offset = gen_reg_rtx (SImode);
4845 emit_insn (gen_addsi3 (stack_limit_offset, wave_offset,
4846 stack_size));
4847
4848 rtx stack_limit_offset_di = gen_reg_rtx (DImode);
4849 emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 4),
4850 const0_rtx);
4851 emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 0),
4852 stack_limit_offset);
4853
4854 rtx stack_addr_mem = gen_rtx_MEM (DImode,
4855 gen_rtx_PLUS (DImode,
4856 kernarg_reg,
4857 GEN_INT (40)));
4858 set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
4859 rtx stack_addr = gen_reg_rtx (DImode);
4860 emit_move_insn (stack_addr, stack_addr_mem);
4861
4862 ptr = gen_rtx_PLUS (DImode, stack_addr, stack_limit_offset_di);
4863 }
4864 else
4865 {
4866 ptr = gen_reg_rtx (DImode);
4867 emit_move_insn (ptr, const0_rtx);
4868 }
4869 return ptr;
4870 }
4871 case GCN_BUILTIN_KERNARG_PTR:
4872 {
4873 rtx ptr;
4874 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
4875 ptr = gen_rtx_REG (DImode,
4876 cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]);
4877 else
4878 {
4879 ptr = gen_reg_rtx (DImode);
4880 emit_move_insn (ptr, const0_rtx);
4881 }
4882 return ptr;
4883 }
4884 case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P:
4885 {
4886 /* Stash a marker in the unused upper 16 bits of s[0:1] to indicate
4887 whether it was the first call. */
4888 rtx result = gen_reg_rtx (BImode);
4889 emit_move_insn (result, const0_rtx);
4890 if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
4891 {
4892 rtx not_first = gen_label_rtx ();
4893 rtx reg = gen_rtx_REG (DImode,
4894 cfun->machine->args.reg[QUEUE_PTR_ARG]);
4895 reg = gcn_operand_part (DImode, reg, 1);
4896 rtx cmp = force_reg (SImode,
4897 gen_rtx_LSHIFTRT (SImode, reg, GEN_INT (16)));
4898 emit_insn (gen_cstoresi4 (result, gen_rtx_NE (BImode, cmp,
4899 GEN_INT (12345)),
4900 cmp, GEN_INT (12345)));
4901 emit_jump_insn (gen_cjump (not_first, gen_rtx_EQ (BImode, result,
4902 const0_rtx),
4903 result));
4904 emit_move_insn (reg,
4905 force_reg (SImode,
4906 gen_rtx_IOR (SImode,
4907 gen_rtx_AND (SImode, reg, GEN_INT (0x0000ffff)),
4908 GEN_INT (12345L << 16))));
4909 emit_insn (gen_rtx_USE (VOIDmode, reg));
4910 emit_label (not_first);
4911 }
4912 return result;
4913 }
4914 default:
4915 gcc_unreachable ();
4916 }
4917}
4918
4919/* Expansion of simple arithmetic and bit binary operation builtins.
4920
4921 Intended for use with gcn_builtins table. */
4922
4923static rtx
4924gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
4925 machine_mode /*mode */ , int ignore,
4926 struct gcn_builtin_description *d)
4927{
4928 int icode = d->icode;
4929 if (ignore)
4930 return target;
4931
4932 rtx exec = force_reg (DImode,
4933 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4934 EXPAND_NORMAL));
4935
4936 machine_mode m1 = insn_data[icode].operand[1].mode;
4937 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
4938 EXPAND_NORMAL);
4939 if (!insn_data[icode].operand[1].predicate (arg1, m1))
4940 arg1 = force_reg (m1, arg1);
4941
4942 machine_mode m2 = insn_data[icode].operand[2].mode;
4943 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
4944 EXPAND_NORMAL);
4945 if (!insn_data[icode].operand[2].predicate (arg2, m2))
4946 arg2 = force_reg (m2, arg2);
4947
4948 rtx arg_prev;
4949 if (call_expr_nargs (exp) == 4)
4950 {
4951 machine_mode m_prev = insn_data[icode].operand[4].mode;
4952 arg_prev = force_reg (m_prev,
4953 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4954 m_prev, EXPAND_NORMAL));
4955 }
4956 else
4957 arg_prev = gcn_gen_undef (GET_MODE (target));
4958
4959 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
4960 emit_insn (pat);
4961 return target;
4962}
4963
4964/* Implement TARGET_EXPAND_BUILTIN.
4965
4966 Expand an expression EXP that calls a built-in function, with result going
4967 to TARGET if that's convenient (and in mode MODE if that's convenient).
4968 SUBTARGET may be used as the target for computing one of EXP's operands.
4969 IGNORE is nonzero if the value is to be ignored. */
4970
4971rtx
4972gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
4973 int ignore)
4974{
4975 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4976 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
4977 struct gcn_builtin_description *d;
4978
4979 gcc_assert (fcode < GCN_BUILTIN_MAX);
4980 d = &gcn_builtins[fcode];
4981
4982 if (d->type == B_UNIMPLEMENTED)
4983 sorry ("Builtin not implemented");
4984
4985 return d->expander (exp, target, subtarget, mode, ignore, d);
4986}
4987
4988/* }}} */
4989/* {{{ Vectorization. */
4990
4991/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
4992
4993 A vector mask is a value that holds one boolean result for every element in
4994 a vector. */
4995
4996opt_machine_mode
4997 gcn_vectorize_get_mask_mode (machine_mode)
4998{
4999 /* GCN uses a DImode bit-mask. */
5000 return DImode;
5001}
5002
5003/* Return an RTX that references a vector with the i-th lane containing
5004 PERM[i]*4.
5005
5006 Helper function for gcn_vectorize_vec_perm_const. */
5007
5008static rtx
5009 gcn_make_vec_perm_address (unsigned int *perm, int nelt)
5010 {
5011 machine_mode mode = VnMODE (nelt, SImode);
5012 rtx x = gen_reg_rtx (mode);
5013 emit_move_insn (x, gcn_vec_constant (mode, 0));
5014
5015 /* Permutation addresses use byte addressing. With each vector lane being
5016 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
5017 so only set those.
5018
5019 The permutation given to the vec_perm* patterns range from 0 to 2N-1 to
5020 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
5021 only take one source vector, the most-significant bit can be ignored
5022 here. Instead, we can use EXEC masking to select the relevant part of
5023 each source vector after they are permuted separately. */
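 /* For example, with nelt == 64, a lane selecting element 5 needs byte
 address 20 (binary 010100), so the loop below accumulates bit 2 and
 bit 4 into that lane of the address vector. */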
5024 uint64_t bit_mask = 1 << 2;
5025 for (int i = 2; i < 8; i++, bit_mask <<= 1)
5026 {
5027 uint64_t exec_mask = 0;
5028 uint64_t lane_mask = 1;
5029 for (int j = 0; j < nelt; j++, lane_mask <<= 1)
5030 if (((perm[j] % nelt) * 4) & bit_mask)
5031 exec_mask |= lane_mask;
5032
5033 if (exec_mask)
5034 emit_insn (gen_addvNsi3 (x, x, gcn_vec_constant (mode, bit_mask),
5035 x, get_exec (exec_mask)));
5036 }
5037
5038 return x;
5039}
5040
5041/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
5042
5043 Return true if permutation with SEL is possible.
5044
5045 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
5046 permutations. */
5047
5048static bool
5049gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
5050 rtx dst, rtx src0, rtx src1,
5051 const vec_perm_indices & sel)
5052{
ae8decf1
PK
5053 if (vmode != op_mode)
5054 return false;
5055
5326695a
AS
5056 unsigned int nelt = GET_MODE_NUNITS (vmode);
5057
5058 gcc_assert (VECTOR_MODE_P (vmode));
5059 gcc_assert (nelt <= 64);
5060 gcc_assert (sel.length () == nelt);
5061
5062 if (!dst)
5063 {
5064 /* All vector permutations are possible on this architecture,
5065 with varying degrees of efficiency depending on the permutation. */
5066 return true;
5067 }
5068
5069 unsigned int perm[64];
5070 for (unsigned int i = 0; i < nelt; ++i)
5071 perm[i] = sel[i] & (2 * nelt - 1);
5072 for (unsigned int i = nelt; i < 64; ++i)
5073 perm[i] = 0;
5074
5075 src0 = force_reg (vmode, src0);
5076 src1 = force_reg (vmode, src1);
5077
5078 /* Make life a bit easier by swapping operands if necessary so that
5079 the first element always comes from src0. */
5080 if (perm[0] >= nelt)
5081 {
5082 std::swap (src0, src1);
5083
5084 for (unsigned int i = 0; i < nelt; ++i)
5085 if (perm[i] < nelt)
5086 perm[i] += nelt;
5087 else
5088 perm[i] -= nelt;
5089 }
5090
5091 /* TODO: There are more efficient ways to implement certain permutations
5092 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
5093 this more inefficient generic approach is used. */
5094
5095 int64_t src1_lanes = 0;
5096 int64_t lane_bit = 1;
5097
5098 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
5099 {
5100 /* Set the bits for lanes from src1. */
5101 if (perm[i] >= nelt)
5102 src1_lanes |= lane_bit;
5103 }
5104
5105 rtx addr = gcn_make_vec_perm_address (perm, nelt);
5106
5107 /* Load elements from src0 to dst. */
5108 gcc_assert ((~src1_lanes) & (0xffffffffffffffffUL >> (64-nelt)));
5109 emit_insn (gen_ds_bpermutevNm (dst, addr, src0, get_exec (vmode)));
5110
5111 /* Load elements from src1 to dst. */
5112 if (src1_lanes)
5113 {
5114 /* Masking a lane masks both the destination and source lanes for
5115 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
5116 then add an extra masked move to merge the results of permuting
5117 the two source vectors together.
5118 */
5119 rtx tmp = gen_reg_rtx (vmode);
5120 emit_insn (gen_ds_bpermutevNm (tmp, addr, src1, get_exec (vmode)));
5121 emit_insn (gen_movvNm (dst, tmp, dst, get_exec (src1_lanes)));
5122 }
5123
5124 return true;
5125}
5126
5127/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
5128
5129 Return nonzero if vector MODE is supported with at least move
5130 instructions. */
5131
5132static bool
5133gcn_vector_mode_supported_p (machine_mode mode)
5134{
5135 return (mode == V64QImode || mode == V64HImode
5136 || mode == V64SImode || mode == V64DImode
5137 || mode == V64SFmode || mode == V64DFmode
5138 || mode == V32QImode || mode == V32HImode
5139 || mode == V32SImode || mode == V32DImode
5140 || mode == V32SFmode || mode == V32DFmode
5141 || mode == V16QImode || mode == V16HImode
5142 || mode == V16SImode || mode == V16DImode
5143 || mode == V16SFmode || mode == V16DFmode
5144 || mode == V8QImode || mode == V8HImode
5145 || mode == V8SImode || mode == V8DImode
5146 || mode == V8SFmode || mode == V8DFmode
5147 || mode == V4QImode || mode == V4HImode
5148 || mode == V4SImode || mode == V4DImode
5149 || mode == V4SFmode || mode == V4DFmode
5150 || mode == V2QImode || mode == V2HImode
5151 || mode == V2SImode || mode == V2DImode
8aeabd9f
AS
5152 || mode == V2SFmode || mode == V2DFmode
5153 /* TImode vectors are allowed to exist for divmod, but there
5154 are almost no instructions defined for them, and the
5155 autovectorizer does not use them. */
5156 || mode == V64TImode || mode == V32TImode
5157 || mode == V16TImode || mode == V8TImode
5158 || mode == V4TImode || mode == V2TImode);
5326695a
AS
5159}
5160
5161/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
5162
5163 Enables autovectorization for all supported modes. */
5164
5165static machine_mode
5166gcn_vectorize_preferred_simd_mode (scalar_mode mode)
5167{
5168 switch (mode)
5169 {
5170 case E_QImode:
5171 return V64QImode;
5172 case E_HImode:
5173 return V64HImode;
5174 case E_SImode:
5175 return V64SImode;
5176 case E_DImode:
5177 return V64DImode;
5178 case E_SFmode:
5179 return V64SFmode;
5180 case E_DFmode:
5181 return V64DFmode;
5182 default:
5183 return word_mode;
5184 }
5185}
5186
45381d6f
AS
5187/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES.
5188
5189 Try all the vector modes. */
5190
5191static unsigned int
5192gcn_autovectorize_vector_modes (vector_modes *modes, bool ARG_UNUSED (all))
5193{
5194 modes->safe_push (V64QImode);
5195 modes->safe_push (V64HImode);
5196 modes->safe_push (V64SImode);
5197 modes->safe_push (V64SFmode);
5198 modes->safe_push (V64DImode);
5199 modes->safe_push (V64DFmode);
5200
5201 modes->safe_push (V32QImode);
5202 modes->safe_push (V32HImode);
5203 modes->safe_push (V32SImode);
5204 modes->safe_push (V32SFmode);
5205 modes->safe_push (V32DImode);
5206 modes->safe_push (V32DFmode);
5207
5208 modes->safe_push (V16QImode);
5209 modes->safe_push (V16HImode);
5210 modes->safe_push (V16SImode);
5211 modes->safe_push (V16SFmode);
5212 modes->safe_push (V16DImode);
5213 modes->safe_push (V16DFmode);
5214
5215 modes->safe_push (V8QImode);
5216 modes->safe_push (V8HImode);
5217 modes->safe_push (V8SImode);
5218 modes->safe_push (V8SFmode);
5219 modes->safe_push (V8DImode);
5220 modes->safe_push (V8DFmode);
5221
5222 modes->safe_push (V4QImode);
5223 modes->safe_push (V4HImode);
5224 modes->safe_push (V4SImode);
5225 modes->safe_push (V4SFmode);
5226 modes->safe_push (V4DImode);
5227 modes->safe_push (V4DFmode);
5228
5229 modes->safe_push (V2QImode);
5230 modes->safe_push (V2HImode);
5231 modes->safe_push (V2SImode);
5232 modes->safe_push (V2SFmode);
5233 modes->safe_push (V2DImode);
5234 modes->safe_push (V2DFmode);
5235
5236 /* We shouldn't need VECT_COMPARE_COSTS as they should all cost the same. */
5237 return 0;
5238}
5239
2b99bed8
AS
5240/* Implement TARGET_VECTORIZE_RELATED_MODE.
5241
5242   GCN vectors are defined by lane count, not bit-size, so this is simpler
5243   than on other architectures: we do *not* want to match vector bit-size. */
5244
5245static opt_machine_mode
45381d6f 5246gcn_related_vector_mode (machine_mode vector_mode,
dd455df7 5247 scalar_mode element_mode, poly_uint64 nunits)
2b99bed8 5248{
45381d6f 5249 int n = nunits.to_constant ();
2b99bed8 5250
45381d6f
AS
5251 if (n == 0)
5252 n = GET_MODE_NUNITS (vector_mode);
2b99bed8 5253
45381d6f 5254 return VnMODE (n, element_mode);
2b99bed8
AS
5255}
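/* Worked example (illustrative): the DFmode-element mode related to
   V64SImode is V64DFmode -- the lane count is preserved and the bit-size
   doubles -- whereas a bit-size-preserving target would have returned
   V32DFmode instead.  */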
5256
5326695a
AS
5257/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
5258
5259 Returns the preferred alignment in bits for accesses to vectors of type type
5260 in vectorized code. This might be less than or greater than the ABI-defined
5261 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
5262 of a single element, in which case the vectorizer will not try to optimize
5263 for alignment. */
5264
5265static poly_uint64
5266gcn_preferred_vector_alignment (const_tree type)
5267{
5268 return TYPE_ALIGN (TREE_TYPE (type));
5269}
5270
5271/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
5272
5273 Return true if the target supports misaligned vector store/load of a
5274 specific factor denoted in the misalignment parameter. */
5275
5276static bool
5277gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
5278 const_tree type, int misalignment,
5279 bool is_packed)
5280{
5281 if (is_packed)
5282 return false;
5283
5284 /* If the misalignment is unknown, we should be able to handle the access
5285 so long as it is not to a member of a packed data structure. */
5286 if (misalignment == -1)
5287 return true;
5288
5289 /* Return true if the misalignment is a multiple of the natural alignment
5290 of the vector's element type. This is probably always going to be
5291 true in practice, since we've already established that this isn't a
5292 packed access. */
5293 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
5294}
5295
5296/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
5297
5298 Return true if vector alignment is reachable (by peeling N iterations) for
5299 the given scalar type TYPE. */
5300
5301static bool
5302gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
5303{
5304 /* Vectors which aren't in packed structures will not be less aligned than
5305 the natural alignment of their element type, so this is safe. */
5306 return !is_packed;
5307}
5308
1bde3ace
AJ
5309/* Generate DPP pairwise swap instruction.
5310 This instruction swaps the values in each even lane with the value in the
5311 next one:
5312 a, b, c, d -> b, a, d, c.
5313 The opcode is given by INSN. */
5314
5315char *
5316gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
5317 int ARG_UNUSED (unspec))
5318{
5319 static char buf[128];
5320 const char *dpp;
5321
5322 /* Add the DPP modifiers. */
5323 dpp = "quad_perm:[1,0,3,2]";
5324
5325 if (vgpr_2reg_mode_p (mode))
5326 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
5327 insn, dpp, insn, dpp);
5328 else
5329 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
5330
5331 return buf;
5332}
5333
5334/* Generate DPP distribute even instruction.
5335 This instruction copies the value in each even lane to the next one:
5336 a, b, c, d -> a, a, c, c.
5337 The opcode is given by INSN. */
5338
5339char *
5340gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
5341 int ARG_UNUSED (unspec))
5342{
5343 static char buf[128];
5344 const char *dpp;
5345
5346 /* Add the DPP modifiers. */
5347 dpp = "quad_perm:[0,0,2,2]";
5348
5349 if (vgpr_2reg_mode_p (mode))
5350 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
5351 insn, dpp, insn, dpp);
5352 else
5353 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
5354
5355 return buf;
5356}
5357
5358/* Generate DPP distribute odd instruction.
5359   This instruction copies the value in each odd lane to the previous one:
5360 a, b, c, d -> b, b, d, d.
5361 The opcode is given by INSN. */
5362
5363char *
5364gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
5365 int ARG_UNUSED (unspec))
5366{
5367 static char buf[128];
5368 const char *dpp;
5369
5370 /* Add the DPP modifiers. */
5371 dpp = "quad_perm:[1,1,3,3]";
5372
5373 if (vgpr_2reg_mode_p (mode))
5374 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
5375 insn, dpp, insn, dpp);
5376 else
5377 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
5378
5379 return buf;
5380}
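/* For illustration only (not compiled): quad_perm applies one 4-entry
   selector within each aligned group of four lanes, so [1,0,3,2] swaps
   pairs, [0,0,2,2] distributes the even lanes, and [1,1,3,3] distributes
   the odd lanes, as emitted by the three helpers above.  */
#if 0
static void
sketch_quad_perm (const int sel[4], const int *src, int *dst, unsigned n)
{
  /* N must be a multiple of 4; lane I reads from its quad's base plus
     the selector entry for its position within the quad.  */
  for (unsigned i = 0; i < n; i++)
    dst[i] = src[(i & ~3u) + sel[i & 3u]];
}
#endif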
5381
5326695a
AS
5382/* Generate DPP instructions used for vector reductions.
5383
5384 The opcode is given by INSN.
5385 The first operand of the operation is shifted right by SHIFT vector lanes.
5386 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
5387   broadcast to the next row (thereby acting like a shift of 16 at the end of
5388 each row). If SHIFT is 32, lane 31 is broadcast to all the
5389 following lanes (thereby acting like a shift of 32 for lane 63). */
5390
5391char *
5392gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
5393 int unspec, int shift)
5394{
a5879399 5395 static char buf[128];
5326695a
AS
5396 const char *dpp;
5397 const char *vcc_in = "";
5398 const char *vcc_out = "";
5399
5400 /* Add the vcc operand if needed. */
5401 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
5402 {
5403 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
5404 vcc_in = ", vcc";
5405
5406 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
5407 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
5408 vcc_out = ", vcc";
5409 }
5410
5411 /* Add the DPP modifiers. */
5412 switch (shift)
5413 {
5414 case 1:
5415 dpp = "row_shr:1 bound_ctrl:0";
5416 break;
5417 case 2:
5418 dpp = "row_shr:2 bound_ctrl:0";
5419 break;
5420 case 4:
5421 dpp = "row_shr:4 bank_mask:0xe";
5422 break;
5423 case 8:
5424 dpp = "row_shr:8 bank_mask:0xc";
5425 break;
5426 case 16:
5427 dpp = "row_bcast:15 row_mask:0xa";
5428 break;
5429 case 32:
5430 dpp = "row_bcast:31 row_mask:0xc";
5431 break;
5432 default:
5433 gcc_unreachable ();
5434 }
5435
a5879399
AS
5436 if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
5437 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
5438 insn, dpp, insn, dpp);
5439 else if (unspec == UNSPEC_MOV_DPP_SHR)
5440 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
5441 else
5442 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
5326695a
AS
5443
5444 return buf;
5445}
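/* For illustration only (not compiled): a simplified model of row_shr:N,
   which makes lane I read lane I-N within its 16-lane row; lanes with no
   in-row source are handled by bound_ctrl/bank_mask in hardware (modelled
   here, loosely, as keeping the source value).  The row_bcast variants
   then carry data across row boundaries for the reduction expander
   below.  */
#if 0
static void
sketch_row_shr (const int *src, int *dst, unsigned n, unsigned shift)
{
  /* Assumes SHIFT < 16, as for the row_shr cases above.  */
  for (unsigned i = 0; i < n; i++)
    dst[i] = ((i & 15u) >= shift) ? src[i - shift] : src[i];
}
#endif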
5446
5447/* Generate vector reductions in terms of DPP instructions.
5448
5449 The vector register SRC of mode MODE is reduced using the operation given
5450 by UNSPEC, and the scalar result is returned in lane 63 of a vector
f539029c 5451 register (or lane 31, 15, 7, 3, 1 for partial vectors). */
5326695a
AS
5452
5453rtx
5454gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
5455{
a5879399 5456 machine_mode orig_mode = mode;
f539029c
AS
5457 machine_mode scalar_mode = GET_MODE_INNER (mode);
5458 int vf = GET_MODE_NUNITS (mode);
a5879399
AS
5459 bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
5460 || unspec == UNSPEC_SMAX_DPP_SHR
5461 || unspec == UNSPEC_UMIN_DPP_SHR
5462 || unspec == UNSPEC_UMAX_DPP_SHR)
f539029c
AS
5463 && (scalar_mode == DImode
5464 || scalar_mode == DFmode))
a5879399 5465 || (unspec == UNSPEC_PLUS_DPP_SHR
f539029c 5466 && scalar_mode == DFmode));
a5879399
AS
5467 rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
5468 : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
5469 : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
5470 : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
5471 : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
5472 : UNKNOWN);
5473 bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
5474 || unspec == UNSPEC_SMAX_DPP_SHR
5475 || unspec == UNSPEC_UMIN_DPP_SHR
5476 || unspec == UNSPEC_UMAX_DPP_SHR)
f539029c
AS
5477 && (scalar_mode == QImode
5478 || scalar_mode == HImode));
a5879399
AS
5479 bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
5480 || unspec == UNSPEC_UMAX_DPP_SHR);
5326695a
AS
5481 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
5482 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
f539029c 5483 && (TARGET_GCN3 || scalar_mode == DImode);
5326695a
AS
5484
5485 if (use_plus_carry)
5486 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
5487
a5879399
AS
5488 if (use_extends)
5489 {
f539029c
AS
5490 mode = VnMODE (vf, SImode);
5491 rtx tmp = gen_reg_rtx (mode);
a5879399
AS
5492 convert_move (tmp, src, unsignedp);
5493 src = tmp;
a5879399
AS
5494 }
5495
5326695a
AS
5496 /* Perform reduction by first performing the reduction operation on every
5497 pair of lanes, then on every pair of results from the previous
5498 iteration (thereby effectively reducing every 4 lanes) and so on until
5499 all lanes are reduced. */
d51cad0b 5500 rtx in, out = force_reg (mode, src);
f539029c
AS
5501 int iterations = exact_log2 (vf);
5502 for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
5326695a
AS
5503 {
5504 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
a5879399
AS
5505 in = out;
5506 out = gen_reg_rtx (mode);
5507
5508 if (use_moves)
5326695a 5509 {
a5879399
AS
5510 rtx tmp = gen_reg_rtx (mode);
5511 emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
5512 emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
5326695a 5513 }
a5879399
AS
5514 else
5515 {
5516 rtx insn = gen_rtx_SET (out,
5517 gen_rtx_UNSPEC (mode,
5518 gen_rtvec (3, in, in,
5519 shift_val),
5520 unspec));
5521
5522 /* Add clobber for instructions that set the carry flags. */
5523 if (use_plus_carry)
5524 {
5525 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
5526 gen_rtx_REG (DImode, VCC_REG));
5527 insn = gen_rtx_PARALLEL (VOIDmode,
5528 gen_rtvec (2, insn, clobber));
5529 }
5326695a 5530
a5879399
AS
5531 emit_insn (insn);
5532 }
5533 }
5326695a 5534
a5879399
AS
5535 if (use_extends)
5536 {
5537 rtx tmp = gen_reg_rtx (orig_mode);
5538 convert_move (tmp, out, unsignedp);
5539 out = tmp;
5326695a
AS
5540 }
5541
a5879399 5542 return out;
5326695a
AS
5543}
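/* For illustration only (not compiled, assumes <stdint.h>): the loop
   above forms a log2(VF) shift-and-combine tree.  This scalar model
   shows the data movement for a plain addition; after exact_log2(VF)
   steps the full sum has accumulated into the last lane, matching the
   "result in lane VF-1" convention documented above.  */
#if 0
static int64_t
sketch_reduc_plus (int64_t *lanes, unsigned vf)
{
  /* VF must be a power of 2.  Walk lanes downwards so that every lane
     still reads pre-step values from below itself.  */
  for (unsigned shift = 1; shift < vf; shift <<= 1)
    for (unsigned i = vf - 1; i >= shift; i--)
      lanes[i] += lanes[i - shift];
  return lanes[vf - 1];
}
#endif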
5544
5545/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
5546
5547int
5548gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
5549 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
5550{
5551 /* Always vectorize. */
5552 return 1;
5553}
5554
b73c49f6
AS
5555/* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
5556
5557static int
5558gcn_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *ARG_UNUSED (node),
5559 struct cgraph_simd_clone *clonei,
12a10856 5560 tree ARG_UNUSED (base_type),
309e2d95
SL
5561 int ARG_UNUSED (num),
5562 bool explicit_p)
b73c49f6 5563{
b73c49f6
AS
5564 if (known_eq (clonei->simdlen, 0U))
5565 clonei->simdlen = 64;
5566 else if (maybe_ne (clonei->simdlen, 64U))
5567 {
5568 /* Note that x86 has a similar message that is likely to trigger on
5569 sizes that are OK for gcn; the user can't win. */
309e2d95
SL
5570 if (explicit_p)
5571 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
5572 "unsupported simdlen %wd (amdgcn)",
5573 clonei->simdlen.to_constant ());
b73c49f6
AS
5574 return 0;
5575 }
5576
5577 clonei->vecsize_mangle = 'n';
5578 clonei->vecsize_int = 0;
5579 clonei->vecsize_float = 0;
5580
5581 /* DImode ought to be more natural here, but VOIDmode produces better code,
5582 at present, due to the shift-and-test steps not being optimized away
5583 inside the in-branch clones. */
5584 clonei->mask_mode = VOIDmode;
5585
5586 return 1;
5587}
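/* Worked example (assumed user source, for illustration): a function
   declared with "#pragma omp declare simd" and no simdlen clause is
   cloned at simdlen 64 by the hook above, while an explicit simdlen(32)
   clause is rejected with the warning -- only 64-lane clones exist.  */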
5588
5589/* Implement TARGET_SIMD_CLONE_ADJUST. */
5590
5591static void
5592gcn_simd_clone_adjust (struct cgraph_node *ARG_UNUSED (node))
5593{
5594 /* This hook has to be defined when
5595 TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN is defined, but we don't
5596 need it to do anything yet. */
5597}
5598
5599/* Implement TARGET_SIMD_CLONE_USABLE. */
5600
5601static int
5602gcn_simd_clone_usable (struct cgraph_node *ARG_UNUSED (node))
5603{
5604 /* We don't need to do anything here because
5605 gcn_simd_clone_compute_vecsize_and_simdlen currently only returns one
5606 possibility. */
5607 return 0;
5608}
5609
ce9cd725
KCY
5610tree mathfn_built_in_explicit (tree, combined_fn);
5611
5612/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION.
5613 Return the function declaration of the vectorized version of the builtin
5614 in the math library if available. */
5615
5616tree
5617gcn_vectorize_builtin_vectorized_function (unsigned int fn, tree type_out,
5618 tree type_in)
5619{
5620 if (TREE_CODE (type_out) != VECTOR_TYPE
5621 || TREE_CODE (type_in) != VECTOR_TYPE)
5622 return NULL_TREE;
5623
5624 machine_mode out_mode = TYPE_MODE (TREE_TYPE (type_out));
5625 int out_n = TYPE_VECTOR_SUBPARTS (type_out);
ce9cd725
KCY
5626 combined_fn cfn = combined_fn (fn);
5627
5628 /* Keep this consistent with the list of vectorized math routines. */
5629 int implicit_p;
5630 switch (fn)
5631 {
5632 CASE_CFN_ACOS:
5633 CASE_CFN_ACOSH:
5634 CASE_CFN_ASIN:
5635 CASE_CFN_ASINH:
5636 CASE_CFN_ATAN:
5637 CASE_CFN_ATAN2:
5638 CASE_CFN_ATANH:
5639 CASE_CFN_COPYSIGN:
5640 CASE_CFN_COS:
5641 CASE_CFN_COSH:
5642 CASE_CFN_ERF:
5643 CASE_CFN_EXP:
5644 CASE_CFN_EXP2:
5645 CASE_CFN_FINITE:
5646 CASE_CFN_FMOD:
5647 CASE_CFN_GAMMA:
5648 CASE_CFN_HYPOT:
5649 CASE_CFN_ISNAN:
5650 CASE_CFN_LGAMMA:
5651 CASE_CFN_LOG:
5652 CASE_CFN_LOG10:
5653 CASE_CFN_LOG2:
5654 CASE_CFN_POW:
5655 CASE_CFN_REMAINDER:
5656 CASE_CFN_RINT:
5657 CASE_CFN_SIN:
5658 CASE_CFN_SINH:
5659 CASE_CFN_SQRT:
5660 CASE_CFN_TAN:
5661 CASE_CFN_TANH:
5662 CASE_CFN_TGAMMA:
5663 implicit_p = 1;
5664 break;
5665
5666 CASE_CFN_SCALB:
5667 CASE_CFN_SIGNIFICAND:
5668 implicit_p = 0;
5669 break;
5670
5671 default:
5672 return NULL_TREE;
5673 }
5674
5675 tree out_t_node = (out_mode == DFmode) ? double_type_node : float_type_node;
5676 tree fndecl = implicit_p ? mathfn_built_in (out_t_node, cfn)
5677 : mathfn_built_in_explicit (out_t_node, cfn);
5678
5679 const char *bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
5680 char name[20];
5681 sprintf (name, out_mode == DFmode ? "v%ddf_%s" : "v%dsf_%s",
5682 out_n, bname + 10);
5683
5684 unsigned arity = 0;
5685 for (tree args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
5686 arity++;
5687
5688 tree fntype = (arity == 1)
5689 ? build_function_type_list (type_out, type_in, NULL)
5690 : build_function_type_list (type_out, type_in, type_in, NULL);
5691
5692 /* Build a function declaration for the vectorized function. */
5693 tree new_fndecl = build_decl (BUILTINS_LOCATION,
5694 FUNCTION_DECL, get_identifier (name), fntype);
5695 TREE_PUBLIC (new_fndecl) = 1;
5696 DECL_EXTERNAL (new_fndecl) = 1;
5697 DECL_IS_NOVOPS (new_fndecl) = 1;
5698 TREE_READONLY (new_fndecl) = 1;
5699
5700 return new_fndecl;
5701}
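/* Worked example (illustrative; assumes the math library provides such
   routines): vectorizing cos for V64DFmode selects __builtin_cos, skips
   the 10-character "__builtin_" prefix, and so asks for the external
   routine "v64df_cos"; the V64SFmode cosf case likewise becomes
   "v64sf_cosf".  */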
5702
5703/* Implement TARGET_LIBC_HAS_FUNCTION. */
5704
5705bool
5706gcn_libc_has_function (enum function_class fn_class,
5707 tree type)
5708{
5709 return bsd_libc_has_function (fn_class, type);
5710}
5711
5326695a
AS
5712/* }}} */
5713/* {{{ md_reorg pass. */
5714
5715/* Identify VMEM instructions from their "type" attribute. */
5716
5717static bool
5718gcn_vmem_insn_p (attr_type type)
5719{
5720 switch (type)
5721 {
5722 case TYPE_MUBUF:
5723 case TYPE_MTBUF:
5724 case TYPE_FLAT:
5725 return true;
5726 case TYPE_UNKNOWN:
5727 case TYPE_SOP1:
5728 case TYPE_SOP2:
5729 case TYPE_SOPK:
5730 case TYPE_SOPC:
5731 case TYPE_SOPP:
5732 case TYPE_SMEM:
5733 case TYPE_DS:
5734 case TYPE_VOP2:
5735 case TYPE_VOP1:
5736 case TYPE_VOPC:
5737 case TYPE_VOP3A:
5738 case TYPE_VOP3B:
5739 case TYPE_VOP_SDWA:
5740 case TYPE_VOP_DPP:
5741 case TYPE_MULT:
5742 case TYPE_VMULT:
5743 return false;
5744 }
5745 gcc_unreachable ();
5746 return false;
5747}
5748
5749/* If INSN sets the EXEC register to a constant value, return the value,
5750 otherwise return zero. */
5751
5752static int64_t
5753gcn_insn_exec_value (rtx_insn *insn)
5754{
5755 if (!NONDEBUG_INSN_P (insn))
5756 return 0;
5757
5758 rtx pattern = PATTERN (insn);
5759
5760 if (GET_CODE (pattern) == SET)
5761 {
5762 rtx dest = XEXP (pattern, 0);
5763 rtx src = XEXP (pattern, 1);
5764
5765 if (GET_MODE (dest) == DImode
5766 && REG_P (dest) && REGNO (dest) == EXEC_REG
5767 && CONST_INT_P (src))
5768 return INTVAL (src);
5769 }
5770
5771 return 0;
5772}
5773
5774/* Sets the EXEC register before INSN to the value that it had after
5775 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
5776 known, otherwise it returns zero. */
5777
5778static int64_t
5779gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
5780 bool curr_exec_known, bool &last_exec_def_saved)
5781{
5782 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
5783 rtx exec;
5784
5785 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
5786
5787 if (exec_value)
5788 {
5789 /* If the EXEC value is a constant and it happens to be the same as the
5790 current EXEC value, the restore can be skipped. */
5791 if (curr_exec_known && exec_value == curr_exec)
5792 return exec_value;
5793
5794 exec = GEN_INT (exec_value);
5795 }
5796 else
5797 {
5798 /* If the EXEC value is not a constant, save it in a register after the
5799 point of definition. */
5800 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
5801
5802 if (!last_exec_def_saved)
5803 {
5804 start_sequence ();
5805 emit_move_insn (exec_save_reg, exec_reg);
5806 rtx_insn *seq = get_insns ();
5807 end_sequence ();
5808
5809 emit_insn_after (seq, last_exec_def);
5810 if (dump_file && (dump_flags & TDF_DETAILS))
5811 fprintf (dump_file, "Saving EXEC after insn %d.\n",
5812 INSN_UID (last_exec_def));
5813
5814 last_exec_def_saved = true;
5815 }
5816
5817 exec = exec_save_reg;
5818 }
5819
5820 /* Restore EXEC register before the usage. */
5821 start_sequence ();
5822 emit_move_insn (exec_reg, exec);
5823 rtx_insn *seq = get_insns ();
5824 end_sequence ();
5825 emit_insn_before (seq, insn);
5826
5827 if (dump_file && (dump_flags & TDF_DETAILS))
5828 {
5829 if (exec_value)
5830 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
5831 exec_value, INSN_UID (insn));
5832 else
5833 fprintf (dump_file,
5834 "Restoring EXEC from saved value before insn %d.\n",
5835 INSN_UID (insn));
5836 }
5837
5838 return exec_value;
5839}
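/* For illustration only (not compiled, assumes <stdint.h>): the
   auto-detect case in gcn_md_reorg below sizes the EXEC mask from the
   widest vector register an insn mentions; a VF-lane operation needs
   the low VF bits of EXEC set.  */
#if 0
static uint64_t
sketch_exec_mask (unsigned vf)
{
  /* 1 <= VF <= 64; e.g. VF=4 yields 0xf.  */
  return 0xffffffffffffffffUL >> (64 - vf);
}
#endif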
5840
5841/* Implement TARGET_MACHINE_DEPENDENT_REORG.
5842
5843 Ensure that pipeline dependencies and lane masking are set correctly. */
5844
5845static void
5846gcn_md_reorg (void)
5847{
5848 basic_block bb;
5849 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
5326695a
AS
5850 regset_head live;
5851
5852 INIT_REG_SET (&live);
5853
5854 compute_bb_for_insn ();
5855
5856 if (!optimize)
5857 {
5858 split_all_insns ();
5859 if (dump_file && (dump_flags & TDF_DETAILS))
5860 {
5861 fprintf (dump_file, "After split:\n");
5862 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
5863 }
5864
5865 /* Update data-flow information for split instructions. */
5866 df_insn_rescan_all ();
5867 }
5868
3df6fac0
JB
5869 df_live_add_problem ();
5870 df_live_set_all_dirty ();
5326695a
AS
5871 df_analyze ();
5872
5873 /* This pass ensures that the EXEC register is set correctly, according
5874 to the "exec" attribute. However, care must be taken so that the
5875 value that reaches explicit uses of the EXEC register remains the
5876 same as before.
5877 */
5878
5879 FOR_EACH_BB_FN (bb, cfun)
5880 {
5881 if (dump_file && (dump_flags & TDF_DETAILS))
5882 fprintf (dump_file, "BB %d:\n", bb->index);
5883
5884 rtx_insn *insn, *curr;
5885 rtx_insn *last_exec_def = BB_HEAD (bb);
5886 bool last_exec_def_saved = false;
5887 bool curr_exec_explicit = true;
5888 bool curr_exec_known = true;
5889 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
5890 after last_exec_def is executed'. */
5891
3df6fac0
JB
5892 bitmap live_in = DF_LR_IN (bb);
5893 bool exec_live_on_entry = false;
5894 if (bitmap_bit_p (live_in, EXEC_LO_REG)
5895 || bitmap_bit_p (live_in, EXEC_HI_REG))
5896 {
5897 if (dump_file)
5898 fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
5899 (int) bb->index);
5900 exec_live_on_entry = true;
5901 }
5902
5326695a
AS
5903 FOR_BB_INSNS_SAFE (bb, insn, curr)
5904 {
5905 if (!NONDEBUG_INSN_P (insn))
5906 continue;
5907
5908 if (GET_CODE (PATTERN (insn)) == USE
5909 || GET_CODE (PATTERN (insn)) == CLOBBER)
5910 continue;
5911
5912 HARD_REG_SET defs, uses;
5913 CLEAR_HARD_REG_SET (defs);
5914 CLEAR_HARD_REG_SET (uses);
e8448ba5 5915 note_stores (insn, record_hard_reg_sets, &defs);
5326695a
AS
5916 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
5917
5918 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
5919 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
5920 bool exec_used = (hard_reg_set_intersect_p
5921 (uses, reg_class_contents[(int) EXEC_MASK_REG])
5922 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
5923
5924 /* Check the instruction for implicit setting of EXEC via an
5925 attribute. */
5926 attr_exec exec_attr = get_attr_exec (insn);
5927 int64_t new_exec;
5928
5929 switch (exec_attr)
5930 {
5931 case EXEC_NONE:
5932 new_exec = 0;
5933 break;
5934
5935 case EXEC_SINGLE:
5936 /* Instructions that do not involve memory accesses only require
5937 bit 0 of EXEC to be set. */
5938 if (gcn_vmem_insn_p (get_attr_type (insn))
5939 || get_attr_type (insn) == TYPE_DS)
5940 new_exec = 1;
5941 else
5942 new_exec = curr_exec | 1;
5943 break;
5944
5945 case EXEC_FULL:
5946 new_exec = -1;
5947 break;
5948
5949 default: /* Auto-detect what setting is appropriate. */
5950 {
5951 new_exec = 0;
5952
5953 /* If EXEC is referenced explicitly then we don't need to do
5954 anything to set it, so we're done. */
5955 if (exec_used)
5956 break;
5957
5958 /* Scan the insn for VGPRs defs or uses. The mode determines
5959 what kind of exec is needed. */
5960 subrtx_iterator::array_type array;
5961 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
5962 {
5963 const_rtx x = *iter;
5964 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
5965 {
5966 if (VECTOR_MODE_P (GET_MODE (x)))
5967 {
45381d6f
AS
5968 int vf = GET_MODE_NUNITS (GET_MODE (x));
5969 new_exec = MAX ((uint64_t)new_exec,
5970 0xffffffffffffffffUL >> (64-vf));
5326695a 5971 }
45381d6f 5972 else if (new_exec == 0)
5326695a
AS
5973 new_exec = 1;
5974 }
5975 }
5976 }
5977 break;
5978 }
5979
5980 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
5981 {
5982 start_sequence ();
5983 emit_move_insn (exec_reg, GEN_INT (new_exec));
5984 rtx_insn *seq = get_insns ();
5985 end_sequence ();
5986 emit_insn_before (seq, insn);
5987
5988 if (dump_file && (dump_flags & TDF_DETAILS))
5989 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
5990 new_exec, INSN_UID (insn));
5991
5992 curr_exec = new_exec;
5993 curr_exec_explicit = false;
5994 curr_exec_known = true;
5995 }
5996 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
5997 {
5998 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
5999 new_exec, INSN_UID (insn));
6000 }
6001
6002 /* The state of the EXEC register is unknown after a
6003 function call. */
6004 if (CALL_P (insn))
6005 curr_exec_known = false;
6006
6007 /* Handle explicit uses of EXEC. If the instruction is a partial
6008 explicit definition of EXEC, then treat it as an explicit use of
6009 EXEC as well. */
6010 if (exec_used || exec_lo_def_p != exec_hi_def_p)
6011 {
6012 /* An instruction that explicitly uses EXEC should not also
6013 implicitly define it. */
6014 gcc_assert (!exec_used || !new_exec);
6015
6016 if (!curr_exec_known || !curr_exec_explicit)
6017 {
6018 /* Restore the previous explicitly defined value. */
6019 curr_exec = gcn_restore_exec (insn, last_exec_def,
6020 curr_exec, curr_exec_known,
6021 last_exec_def_saved);
6022 curr_exec_explicit = true;
6023 curr_exec_known = true;
6024 }
6025 }
6026
6027 /* Handle explicit definitions of EXEC. */
6028 if (exec_lo_def_p || exec_hi_def_p)
6029 {
6030 last_exec_def = insn;
6031 last_exec_def_saved = false;
6032 curr_exec = gcn_insn_exec_value (insn);
6033 curr_exec_explicit = true;
6034 curr_exec_known = true;
6035
6036 if (dump_file && (dump_flags & TDF_DETAILS))
6037 fprintf (dump_file,
6038 "Found %s definition of EXEC at insn %d.\n",
6039 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
6040 INSN_UID (insn));
6041 }
3df6fac0
JB
6042
6043 exec_live_on_entry = false;
5326695a
AS
6044 }
6045
6046 COPY_REG_SET (&live, DF_LR_OUT (bb));
6047 df_simulate_initialize_backwards (bb, &live);
6048
6049 /* If EXEC is live after the basic block, restore the value of EXEC
6050 at the end of the block. */
6051 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
6052 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
3df6fac0 6053 && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
5326695a
AS
6054 {
6055 rtx_insn *end_insn = BB_END (bb);
6056
6057 /* If the instruction is not a jump instruction, do the restore
6058 after the last instruction in the basic block. */
6059 if (NONJUMP_INSN_P (end_insn))
6060 end_insn = NEXT_INSN (end_insn);
6061
6062 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
6063 curr_exec_known, last_exec_def_saved);
6064 }
6065 }
6066
6067 CLEAR_REG_SET (&live);
6068
6069 /* "Manually Inserted Wait States (NOPs)."
6070
6071 GCN hardware detects most kinds of register dependencies, but there
6072 are some exceptions documented in the ISA manual. This pass
6073 detects the missed cases, and inserts the documented number of NOPs
6074 required for correct execution. */
6075
6076 const int max_waits = 5;
6077 struct ilist
6078 {
6079 rtx_insn *insn;
6080 attr_unit unit;
930c5599 6081 attr_delayeduse delayeduse;
5326695a 6082 HARD_REG_SET writes;
930c5599 6083 HARD_REG_SET reads;
5326695a
AS
6084 int age;
6085 } back[max_waits];
6086 int oldest = 0;
6087 for (int i = 0; i < max_waits; i++)
6088 back[i].insn = NULL;
6089
6090 rtx_insn *insn, *last_insn = NULL;
6091 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
6092 {
6093 if (!NONDEBUG_INSN_P (insn))
6094 continue;
6095
6096 if (GET_CODE (PATTERN (insn)) == USE
6097 || GET_CODE (PATTERN (insn)) == CLOBBER)
6098 continue;
6099
6100 attr_type itype = get_attr_type (insn);
6101 attr_unit iunit = get_attr_unit (insn);
930c5599 6102 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
cfdc45f7 6103 int ivccwait = get_attr_vccwait (insn);
5326695a
AS
6104 HARD_REG_SET ireads, iwrites;
6105 CLEAR_HARD_REG_SET (ireads);
6106 CLEAR_HARD_REG_SET (iwrites);
e8448ba5 6107 note_stores (insn, record_hard_reg_sets, &iwrites);
5326695a
AS
6108 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
6109
6110 /* Scan recent previous instructions for dependencies not handled in
6111 hardware. */
6112 int nops_rqd = 0;
6113 for (int i = oldest; i < oldest + max_waits; i++)
6114 {
6115 struct ilist *prev_insn = &back[i % max_waits];
6116
6117 if (!prev_insn->insn)
6118 continue;
6119
6120 /* VALU writes SGPR followed by VMEM reading the same SGPR
6121 requires 5 wait states. */
6122 if ((prev_insn->age + nops_rqd) < 5
6123 && prev_insn->unit == UNIT_VECTOR
6124 && gcn_vmem_insn_p (itype))
6125 {
dc333d8f 6126 HARD_REG_SET regs = prev_insn->writes & ireads;
5326695a
AS
6127 if (hard_reg_set_intersect_p
6128 (regs, reg_class_contents[(int) SGPR_REGS]))
6129 nops_rqd = 5 - prev_insn->age;
6130 }
6131
6132 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
6133 requires 5 wait states. */
6134 if ((prev_insn->age + nops_rqd) < 5
6135 && prev_insn->unit == UNIT_VECTOR
6136 && iunit == UNIT_VECTOR
6137 && ((hard_reg_set_intersect_p
6138 (prev_insn->writes,
6139 reg_class_contents[(int) EXEC_MASK_REG])
6140 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
6141 ||
6142 (hard_reg_set_intersect_p
6143 (prev_insn->writes,
6144 reg_class_contents[(int) VCC_CONDITIONAL_REG])
6145 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
6146 nops_rqd = 5 - prev_insn->age;
6147
6148 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
6149 SGPR/VCC as lane select requires 4 wait states. */
6150 if ((prev_insn->age + nops_rqd) < 4
6151 && prev_insn->unit == UNIT_VECTOR
6152 && get_attr_laneselect (insn) == LANESELECT_YES)
6153 {
dc333d8f 6154 HARD_REG_SET regs = prev_insn->writes & ireads;
5326695a
AS
6155 if (hard_reg_set_intersect_p
6156 (regs, reg_class_contents[(int) SGPR_REGS])
6157 || hard_reg_set_intersect_p
6158 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
6159 nops_rqd = 4 - prev_insn->age;
6160 }
6161
6162 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
6163 requires 2 wait states. */
6164 if ((prev_insn->age + nops_rqd) < 2
6165 && prev_insn->unit == UNIT_VECTOR
6166 && itype == TYPE_VOP_DPP)
6167 {
dc333d8f 6168 HARD_REG_SET regs = prev_insn->writes & ireads;
5326695a
AS
6169 if (hard_reg_set_intersect_p
6170 (regs, reg_class_contents[(int) VGPR_REGS]))
6171 nops_rqd = 2 - prev_insn->age;
6172 }
930c5599
AS
6173
6174	  /* A store with delayed use requires that its input registers are not
6175	     overwritten by the following instruction.  */
6176 if ((prev_insn->age + nops_rqd) < 1
6177 && prev_insn->delayeduse == DELAYEDUSE_YES
6178 && ((hard_reg_set_intersect_p
6179 (prev_insn->reads, iwrites))))
6180 nops_rqd = 1 - prev_insn->age;
cfdc45f7
AS
6181
6182	  /* An instruction that reads VCC requires that VCC was not written
6183	     too close before the use.  */
6184 if (prev_insn->age < ivccwait
6185 && (hard_reg_set_intersect_p
6186 (prev_insn->writes,
6187 reg_class_contents[(int)VCC_CONDITIONAL_REG])))
6188 nops_rqd = ivccwait - prev_insn->age;
5326695a
AS
6189 }
6190
6191 /* Insert the required number of NOPs. */
6192 for (int i = nops_rqd; i > 0; i--)
6193 emit_insn_after (gen_nop (), last_insn);
6194
6195 /* Age the previous instructions. We can also ignore writes to
6196 registers subsequently overwritten. */
6197 HARD_REG_SET written;
6198 CLEAR_HARD_REG_SET (written);
6199 for (int i = oldest + max_waits - 1; i > oldest; i--)
6200 {
6201 struct ilist *prev_insn = &back[i % max_waits];
6202
6203 /* Assume all instructions are equivalent to one "wait", the same
6204 as s_nop. This is probably true for SALU, but not VALU (which
6205 may take longer), so this is not optimal. However, AMD do
6206 not publish the cycle times for instructions. */
6207 prev_insn->age += 1 + nops_rqd;
6208
44942965 6209 written |= iwrites;
d15e5131 6210 prev_insn->writes &= ~written;
5326695a
AS
6211 }
6212
6213 /* Track the current instruction as a previous instruction. */
6214 back[oldest].insn = insn;
6215 back[oldest].unit = iunit;
930c5599 6216 back[oldest].delayeduse = idelayeduse;
6576d245
RS
6217 back[oldest].writes = iwrites;
6218 back[oldest].reads = ireads;
5326695a
AS
6219 back[oldest].age = 0;
6220 oldest = (oldest + 1) % max_waits;
6221
6222 last_insn = insn;
6223 }
6224}
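/* For illustration only (not compiled): the hazard window above is a
   ring buffer of the last MAX_WAITS instructions, each aged by one plus
   any NOPs just emitted, so "age + nops_rqd < N" asks whether fewer
   than N wait states currently separate a writer from this reader.  */
#if 0
static int
sketch_nops_required (int writer_age, int states_required)
{
  /* E.g. a VALU write of an SGPR followed two insns later by a VMEM
     read of the same SGPR needs 5 wait states, hence 5 - 2 = 3 NOPs.  */
  return writer_age < states_required ? states_required - writer_age : 0;
}
#endif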
6225
6226/* }}} */
6227/* {{{ OpenACC / OpenMP. */
6228
6229#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
6230#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
6231#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
6232
6233/* Implement TARGET_GOACC_VALIDATE_DIMS.
6234
6235 Check the launch dimensions provided for an OpenACC compute
6236 region, or routine. */
6237
6238static bool
6239gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
6240 unsigned /*used*/)
6241{
6242 bool changed = false;
c408512e 6243 const int max_workers = 16;
fe22e0d4 6244
5326695a
AS
6245 /* The vector size must appear to be 64, to the user, unless this is a
6246 SEQ routine. The real, internal value is always 1, which means use
6247 autovectorization, but the user should not see that. */
6248 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
6249 && dims[GOMP_DIM_VECTOR] >= 0)
6250 {
6251 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
6252 && dims[GOMP_DIM_VECTOR] != 64)
6253 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
6254 OPT_Wopenacc_dims,
6255 (dims[GOMP_DIM_VECTOR]
55308fc2
AS
6256 ? G_("using %<vector_length (64)%>, ignoring %d")
6257 : G_("using %<vector_length (64)%>, "
5326695a
AS
6258 "ignoring runtime setting")),
6259 dims[GOMP_DIM_VECTOR]);
6260 dims[GOMP_DIM_VECTOR] = 1;
6261 changed = true;
6262 }
6263
6264 /* Check the num workers is not too large. */
6265 if (dims[GOMP_DIM_WORKER] > max_workers)
6266 {
6267 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
6268 OPT_Wopenacc_dims,
55308fc2 6269 "using %<num_workers (%d)%>, ignoring %d",
5326695a
AS
6270 max_workers, dims[GOMP_DIM_WORKER]);
6271 dims[GOMP_DIM_WORKER] = max_workers;
6272 changed = true;
6273 }
6274
6275 /* Set global defaults. */
6276 if (!decl)
6277 {
6278 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
6279 if (dims[GOMP_DIM_WORKER] < 0)
c408512e 6280 dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
5326695a
AS
6281 if (dims[GOMP_DIM_GANG] < 0)
6282 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
6283 changed = true;
6284 }
6285
6286 return changed;
6287}
6288
6289/* Helper function for oacc_dim_size instruction.
6290 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
6291
6292rtx
6293gcn_oacc_dim_size (int dim)
6294{
6295 if (dim < 0 || dim > 2)
6296 error ("offload dimension out of range (%d)", dim);
6297
6298 /* Vectors are a special case. */
6299 if (dim == 2)
6300 return const1_rtx; /* Think of this as 1 times 64. */
6301
6302 static int offset[] = {
6303 /* Offsets into dispatch packet. */
6304 12, /* X dim = Gang / Team / Work-group. */
6305 20, /* Z dim = Worker / Thread / Wavefront. */
6306 16 /* Y dim = Vector / SIMD / Work-item. */
6307 };
6308 rtx addr = gen_rtx_PLUS (DImode,
6309 gen_rtx_REG (DImode,
6310 cfun->machine->args.
6311 reg[DISPATCH_PTR_ARG]),
6312 GEN_INT (offset[dim]));
6f83861c
TB
6313 rtx mem = gen_rtx_MEM (SImode, addr);
6314 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
6315 return mem;
5326695a
AS
6316}
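/* For illustration only (not compiled, assumes <stdint.h>): offsets
   12/16/20 above address the 32-bit grid_size_{x,y,z} members of
   hsa_kernel_dispatch_packet_t -- layout assumed from the HSA runtime
   specification, as the type is not defined in this file -- which is
   why the load is made in SImode.  */
#if 0
struct sketch_hsa_dispatch_packet
{
  uint16_t header, setup;
  uint16_t workgroup_size_x, workgroup_size_y, workgroup_size_z;
  uint16_t reserved0;
  uint32_t grid_size_x;	/* Offset 12: gang / team / work-group.  */
  uint32_t grid_size_y;	/* Offset 16: vector / SIMD / work-item.  */
  uint32_t grid_size_z;	/* Offset 20: worker / thread / wavefront.  */
};
#endif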
6317
6318/* Helper function for oacc_dim_pos instruction.
6319 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
6320
6321rtx
6322gcn_oacc_dim_pos (int dim)
6323{
6324 if (dim < 0 || dim > 2)
6325 error ("offload dimension out of range (%d)", dim);
6326
6327 static const int reg[] = {
6328 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
6329 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
6330 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
6331 };
6332
6333 int reg_num = cfun->machine->args.reg[reg[dim]];
6334
6335 /* The information must have been requested by the kernel. */
6336 gcc_assert (reg_num >= 0);
6337
6338 return gen_rtx_REG (SImode, reg_num);
6339}
6340
6341/* Implement TARGET_GOACC_FORK_JOIN. */
6342
6343static bool
2961ac45 6344gcn_fork_join (gcall *call, const int dims[], bool is_fork)
5326695a 6345{
2961ac45
JB
6346 tree arg = gimple_call_arg (call, 2);
6347 unsigned axis = TREE_INT_CST_LOW (arg);
6348
6349 if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
6350 return true;
6351
5326695a
AS
6352 return false;
6353}
6354
6355/* Implement ???????
6356 FIXME make this a real hook.
6357
6358 Adjust FNDECL such that options inherited from the host compiler
6359 are made appropriate for the accelerator compiler. */
6360
6361void
6362gcn_fixup_accel_lto_options (tree fndecl)
6363{
6364 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
6365 if (!func_optimize)
6366 return;
6367
ba948b37
JJ
6368 tree old_optimize
6369 = build_optimization_node (&global_options, &global_options_set);
5326695a
AS
6370 tree new_optimize;
6371
6372 /* If the function changed the optimization levels as well as
6373 setting target options, start with the optimizations
6374 specified. */
6375 if (func_optimize != old_optimize)
ba948b37 6376 cl_optimization_restore (&global_options, &global_options_set,
5326695a
AS
6377 TREE_OPTIMIZATION (func_optimize));
6378
6379 gcn_option_override ();
6380
6381 /* The target attributes may also change some optimization flags,
6382 so update the optimization options if necessary. */
ba948b37
JJ
6383 new_optimize = build_optimization_node (&global_options,
6384 &global_options_set);
5326695a
AS
6385
6386 if (old_optimize != new_optimize)
6387 {
6388 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
ba948b37 6389 cl_optimization_restore (&global_options, &global_options_set,
5326695a
AS
6390 TREE_OPTIMIZATION (old_optimize));
6391 }
6392}
6393
2a3f9f65
JB
6394/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
6395
6396static void
6397gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
6398 unsigned HOST_WIDE_INT *hi,
6399 int ARG_UNUSED (dims[GOMP_DIM_MAX]),
6400 unsigned HOST_WIDE_INT
6401 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
6402 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
6403{
6404 *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
6405 /* !!! We can maybe use dims[] to estimate the maximum number of work
6406 groups/wavefronts/etc. we will launch, and therefore tune the maximum
6407 amount of LDS we should use. For now, use a minimal amount to try to
6408 maximise occupancy. */
6409 *hi = acc_lds_size;
6410 machine_function *machfun = cfun->machine;
6411 machfun->reduction_base = gang_private_size_opt;
6412 machfun->reduction_limit
6413 = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
6414}
6415
5326695a
AS
6416/* }}} */
6417/* {{{ ASM Output. */
6418
6419/* Implement TARGET_ASM_FILE_START.
6420
6421 Print assembler file header text. */
6422
6423static void
6424output_file_start (void)
6425{
8086230e 6426 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
366e3d30
TB
6427 configuration. */
6428 const char *xnack = (flag_xnack == HSACO_ATTR_ON ? ":xnack+"
6429 : flag_xnack == HSACO_ATTR_OFF ? ":xnack-"
6430 : "");
6431 const char *sram_ecc = (flag_sram_ecc == HSACO_ATTR_ON ? ":sramecc+"
6432 : flag_sram_ecc == HSACO_ATTR_OFF ? ":sramecc-"
8086230e
AS
6433 : "");
6434
dd455df7 6435 const char *cpu;
f062c3f1
AS
6436 switch (gcn_arch)
6437 {
1af16666
AS
6438 case PROCESSOR_FIJI:
6439 cpu = "gfx803";
8086230e
AS
6440 xnack = "";
6441 sram_ecc = "";
1af16666
AS
6442 break;
6443 case PROCESSOR_VEGA10:
6444 cpu = "gfx900";
8086230e 6445 sram_ecc = "";
1af16666
AS
6446 break;
6447 case PROCESSOR_VEGA20:
6448 cpu = "gfx906";
8086230e 6449 sram_ecc = "";
1af16666
AS
6450 break;
6451 case PROCESSOR_GFX908:
6452 cpu = "gfx908";
1af16666 6453 break;
cde52d3a
AS
6454 case PROCESSOR_GFX90a:
6455 cpu = "gfx90a";
6456 break;
f062c3f1
AS
6457 default: gcc_unreachable ();
6458 }
6459
aad32a00 6460 fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
8086230e 6461 cpu, sram_ecc, xnack);
5326695a
AS
6462}
6463
6464/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
6465
6466 Print the initial definition of a function name.
6467
6468 For GCN kernel entry points this includes all the HSA meta-data, special
6469 alignment constraints that don't apply to regular functions, and magic
6470 comments that pass information to mkoffload. */
6471
6472void
6473gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
6474{
6475 int sgpr, vgpr;
366e3d30 6476 bool xnack_enabled = TARGET_XNACK;
f062c3f1
AS
6477
6478 fputs ("\n\n", file);
5326695a
AS
6479
6480 if (cfun && cfun->machine && cfun->machine->normal_function)
6481 {
6482 fputs ("\t.type\t", file);
6483 assemble_name (file, name);
6484 fputs (",@function\n", file);
6485 assemble_name (file, name);
6486 fputs (":\n", file);
6487 return;
6488 }
6489
6490 /* Determine count of sgpr/vgpr registers by looking for last
6491 one used. */
6492 for (sgpr = 101; sgpr >= 0; sgpr--)
6493 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
6494 break;
6495 sgpr++;
6496 for (vgpr = 255; vgpr >= 0; vgpr--)
6497 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
6498 break;
6499 vgpr++;
6500
5326695a
AS
6501 if (!leaf_function_p ())
6502 {
6503 /* We can't know how many registers function calls might use. */
87fdbe69
KCY
6504 if (vgpr < MAX_NORMAL_VGPR_COUNT)
6505 vgpr = MAX_NORMAL_VGPR_COUNT;
f062c3f1
AS
6506 if (sgpr < MAX_NORMAL_SGPR_COUNT)
6507 sgpr = MAX_NORMAL_SGPR_COUNT;
5326695a
AS
6508 }
6509
cde52d3a
AS
6510 /* The gfx90a accum_offset field can't represent 0 registers. */
6511 if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4)
6512 vgpr = 4;
6513
f062c3f1
AS
6514 fputs ("\t.rodata\n"
6515 "\t.p2align\t6\n"
6516 "\t.amdhsa_kernel\t", file);
5326695a
AS
6517 assemble_name (file, name);
6518 fputs ("\n", file);
5326695a
AS
6519 int reg = FIRST_SGPR_REG;
6520 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
6521 {
6522 int reg_first = -1;
6523 int reg_last;
6524 if ((cfun->machine->args.requested & (1 << a))
6525 && (gcn_kernel_arg_types[a].fixed_regno < 0))
6526 {
6527 reg_first = reg;
6528 reg_last = (reg_first
6529 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
6530 / UNITS_PER_WORD) - 1);
6531 reg = reg_last + 1;
6532 }
6533
6534 if (gcn_kernel_arg_types[a].header_pseudo)
6535 {
f062c3f1
AS
6536 fprintf (file, "\t %s%s\t%i",
6537 (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5326695a
AS
6538 gcn_kernel_arg_types[a].header_pseudo,
6539 (cfun->machine->args.requested & (1 << a)) != 0);
6540 if (reg_first != -1)
6541 {
6542 fprintf (file, " ; (");
6543 for (int i = reg_first; i <= reg_last; ++i)
6544 {
6545 if (i != reg_first)
6546 fprintf (file, ", ");
6547 fprintf (file, "%s", reg_names[i]);
6548 }
6549 fprintf (file, ")");
6550 }
6551 fprintf (file, "\n");
6552 }
6553 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
6554 && cfun->machine->args.requested & (1 << a))
f062c3f1 6555 fprintf (file, "\t ; %s\t%i (%s)\n",
5326695a
AS
6556 gcn_kernel_arg_types[a].name,
6557 (cfun->machine->args.requested & (1 << a)) != 0,
6558 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
6559 }
f062c3f1 6560 fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
5326695a
AS
6561 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
6562 ? 2
6563 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
6564 ? 1 : 0);
f062c3f1
AS
6565 fprintf (file,
6566 "\t .amdhsa_next_free_vgpr\t%i\n"
6567 "\t .amdhsa_next_free_sgpr\t%i\n"
6568 "\t .amdhsa_reserve_vcc\t1\n"
6569 "\t .amdhsa_reserve_flat_scratch\t0\n"
6570 "\t .amdhsa_reserve_xnack_mask\t%i\n"
f6fff8a6 6571 "\t .amdhsa_private_segment_fixed_size\t0\n"
f062c3f1
AS
6572 "\t .amdhsa_group_segment_fixed_size\t%u\n"
6573 "\t .amdhsa_float_denorm_mode_32\t3\n"
6574 "\t .amdhsa_float_denorm_mode_16_64\t3\n",
6575 vgpr,
6576 sgpr,
6577 xnack_enabled,
f062c3f1 6578 LDS_SIZE);
cde52d3a
AS
6579 if (gcn_arch == PROCESSOR_GFX90a)
6580 fprintf (file,
6581 "\t .amdhsa_accum_offset\t%i\n"
6582 "\t .amdhsa_tg_split\t0\n",
6583 (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs
f062c3f1
AS
6584 fputs ("\t.end_amdhsa_kernel\n", file);
6585
6586#if 1
6587 /* The following is YAML embedded in assembler; tabs are not allowed. */
6588 fputs (" .amdgpu_metadata\n"
6589 " amdhsa.version:\n"
6590 " - 1\n"
6591 " - 0\n"
6592 " amdhsa.kernels:\n"
6593 " - .name: ", file);
6594 assemble_name (file, name);
6595 fputs ("\n .symbol: ", file);
6596 assemble_name (file, name);
6597 fprintf (file,
6598 ".kd\n"
6599 " .kernarg_segment_size: %i\n"
6600 " .kernarg_segment_align: %i\n"
6601 " .group_segment_fixed_size: %u\n"
f6fff8a6 6602 " .private_segment_fixed_size: 0\n"
f062c3f1
AS
6603 " .wavefront_size: 64\n"
6604 " .sgpr_count: %i\n"
6605 " .vgpr_count: %i\n"
6606 " .max_flat_workgroup_size: 1024\n",
6607 cfun->machine->kernarg_segment_byte_size,
5326695a 6608 cfun->machine->kernarg_segment_alignment,
f062c3f1 6609 LDS_SIZE,
f062c3f1 6610 sgpr, vgpr);
cde52d3a
AS
6611 if (gcn_arch == PROCESSOR_GFX90a)
6612 fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet
f062c3f1
AS
6613 fputs (" .end_amdgpu_metadata\n", file);
6614#endif
6615
6616 fputs ("\t.text\n", file);
6617 fputs ("\t.align\t256\n", file);
6618 fputs ("\t.type\t", file);
6619 assemble_name (file, name);
6620 fputs (",@function\n", file);
6621 assemble_name (file, name);
6622 fputs (":\n", file);
5326695a
AS
6623
6624 /* This comment is read by mkoffload. */
6625 if (flag_openacc)
6626 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
6627 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
6628 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
6629 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
6630}
6631
6632/* Implement TARGET_ASM_SELECT_SECTION.
6633
6634 Return the section into which EXP should be placed. */
6635
6636static section *
6637gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
6638{
6639 if (TREE_TYPE (exp) != error_mark_node
6640 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
6641 {
6642 if (!DECL_P (exp))
6643 return get_section (".lds_bss",
6644 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
6645 NULL);
6646
6647 return get_named_section (exp, ".lds_bss", reloc);
6648 }
6649
6650 return default_elf_select_section (exp, reloc, align);
6651}
6652
6653/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
6654
6655 Emits custom text into the assembler file at the head of each function. */
6656
6657static void
6658gcn_target_asm_function_prologue (FILE *file)
6659{
6660 machine_function *offsets = gcn_compute_frame_offsets ();
6661
6662 asm_fprintf (file, "\t; using %s addressing in function\n",
6663 offsets->use_flat_addressing ? "flat" : "global");
6664
6665 if (offsets->normal_function)
6666 {
6667 asm_fprintf (file, "\t; frame pointer needed: %s\n",
6668 offsets->need_frame_pointer ? "true" : "false");
6669 asm_fprintf (file, "\t; lr needs saving: %s\n",
6670 offsets->lr_needs_saving ? "true" : "false");
6671 asm_fprintf (file, "\t; outgoing args size: %wd\n",
6672 offsets->outgoing_args_size);
6673 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
6674 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
6675 asm_fprintf (file, "\t; callee save size: %wd\n",
6676 offsets->callee_saves);
6677 }
6678 else
6679 {
6680 asm_fprintf (file, "\t; HSA kernel entry point\n");
6681 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
6682 asm_fprintf (file, "\t; outgoing args size: %wd\n",
6683 offsets->outgoing_args_size);
5326695a
AS
6684 }
6685}
6686
6687/* Helper function for print_operand and print_operand_address.
6688
6689 Print a register as the assembler requires, according to mode and name. */
6690
6691static void
6692print_reg (FILE *file, rtx x)
6693{
6694 machine_mode mode = GET_MODE (x);
45381d6f
AS
6695 if (VECTOR_MODE_P (mode))
6696 mode = GET_MODE_INNER (mode);
5326695a 6697 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
45381d6f 6698 || mode == HFmode || mode == SFmode)
5326695a 6699 fprintf (file, "%s", reg_names[REGNO (x)]);
45381d6f 6700 else if (mode == DImode || mode == DFmode)
5326695a
AS
6701 {
6702 if (SGPR_REGNO_P (REGNO (x)))
6703 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
6704 REGNO (x) - FIRST_SGPR_REG + 1);
6705 else if (VGPR_REGNO_P (REGNO (x)))
6706 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
6707 REGNO (x) - FIRST_VGPR_REG + 1);
6708 else if (REGNO (x) == FLAT_SCRATCH_REG)
6709 fprintf (file, "flat_scratch");
6710 else if (REGNO (x) == EXEC_REG)
6711 fprintf (file, "exec");
6712 else if (REGNO (x) == VCC_LO_REG)
6713 fprintf (file, "vcc");
6714 else
6715 fprintf (file, "[%s:%s]",
6716 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
6717 }
6718 else if (mode == TImode)
6719 {
6720 if (SGPR_REGNO_P (REGNO (x)))
6721 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
6722 REGNO (x) - FIRST_SGPR_REG + 3);
6723 else if (VGPR_REGNO_P (REGNO (x)))
6724 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
6725 REGNO (x) - FIRST_VGPR_REG + 3);
6726 else
6727 gcc_unreachable ();
6728 }
6729 else
6730 gcc_unreachable ();
6731}
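/* Worked examples (illustrative): a DImode value in the SGPR pair
   starting at s4 prints as "s[4:5]"; a TImode value in VGPRs starting
   at v8 prints as "v[8:11]"; DImode in VCC_LO prints as "vcc".  */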
6732
6733/* Implement TARGET_SECTION_TYPE_FLAGS.
6734
6735 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
6736
6737static unsigned int
6738gcn_section_type_flags (tree decl, const char *name, int reloc)
6739{
6740 if (strcmp (name, ".lds_bss") == 0)
6741 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
6742
6743 return default_section_type_flags (decl, name, reloc);
6744}
6745
6746/* Helper function for gcn_asm_output_symbol_ref.
6747
2a3f9f65
JB
6748 FIXME: This function is used to lay out gang-private variables in LDS
6749 on a per-CU basis.
6750 There may be cases in which gang-private variables in different compilation
6751 units could clobber each other. In that case we should be relying on the
6752 linker to lay out gang-private LDS space, but that doesn't appear to be
6753 possible at present. */
5326695a
AS
6754
6755static void
6756gcn_print_lds_decl (FILE *f, tree var)
6757{
6758 int *offset;
2a3f9f65 6759 if ((offset = lds_allocs.get (var)))
5326695a
AS
6760 fprintf (f, "%u", (unsigned) *offset);
6761 else
6762 {
6763 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
6764 tree type = TREE_TYPE (var);
6765 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6766 if (size > align && size > 4 && align < 8)
6767 align = 8;
6768
2a3f9f65 6769 gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5326695a 6770
2a3f9f65
JB
6771 lds_allocs.put (var, gang_private_hwm);
6772 fprintf (f, "%u", gang_private_hwm);
6773 gang_private_hwm += size;
6774 if (gang_private_hwm > gang_private_size_opt)
2579d612
TS
6775 error ("%d bytes of gang-private data-share memory exhausted"
6776 " (increase with %<-mgang-private-size=%d%>, for example)",
6777 gang_private_size_opt, gang_private_hwm);
5326695a
AS
6778 }
6779}
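/* For illustration only (not compiled): gcn_print_lds_decl above is a
   bump allocator over the gang-private LDS region; rounding the
   high-water mark up to an alignment boundary uses the usual
   power-of-two identity.  */
#if 0
static unsigned
sketch_align_up (unsigned offset, unsigned align)
{
  /* ALIGN must be a power of 2, as GCC alignments always are.  */
  return (offset + align - 1) & ~(align - 1);
}
#endif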
6780
6781/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
6782
6783void
6784gcn_asm_output_symbol_ref (FILE *file, rtx x)
6785{
6786 tree decl;
9200b53a
JB
6787 if (cfun
6788 && (decl = SYMBOL_REF_DECL (x)) != 0
9907413a 6789 && VAR_P (decl)
5326695a
AS
6790 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
6791 {
6792 /* LDS symbols (emitted using this hook) are only used at present
6793 to propagate worker values from an active thread to neutered
6794 threads. Use the same offset for each such block, but don't
6795 use zero because null pointers are used to identify the active
6796 thread in GOACC_single_copy_start calls. */
6797 gcn_print_lds_decl (file, decl);
6798 }
6799 else
6800 {
6801 assemble_name (file, XSTR (x, 0));
6802 /* FIXME: See above -- this condition is unreachable. */
9200b53a
JB
6803 if (cfun
6804 && (decl = SYMBOL_REF_DECL (x)) != 0
9907413a 6805 && VAR_P (decl)
5326695a
AS
6806 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
6807 fputs ("@abs32", file);
6808 }
6809}
6810
6811/* Implement TARGET_CONSTANT_ALIGNMENT.
6812
6813 Returns the alignment in bits of a constant that is being placed in memory.
6814 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
6815 would ordinarily have. */
6816
6817static HOST_WIDE_INT
6818gcn_constant_alignment (const_tree ARG_UNUSED (constant),
6819 HOST_WIDE_INT basic_align)
6820{
6821 return basic_align > 128 ? basic_align : 128;
6822}
6823
6824/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
6825
6826void
6827print_operand_address (FILE *file, rtx mem)
6828{
6829 gcc_assert (MEM_P (mem));
6830
6831 rtx reg;
6832 rtx offset;
6833 addr_space_t as = MEM_ADDR_SPACE (mem);
6834 rtx addr = XEXP (mem, 0);
6835 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
6836
6837 if (AS_SCRATCH_P (as))
6838 switch (GET_CODE (addr))
6839 {
6840 case REG:
6841 print_reg (file, addr);
6842 break;
6843
6844 case PLUS:
6845 reg = XEXP (addr, 0);
6846 offset = XEXP (addr, 1);
6847 print_reg (file, reg);
6848 if (GET_CODE (offset) == CONST_INT)
6849 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
6850 else
6851 abort ();
6852 break;
6853
6854 default:
6855 debug_rtx (addr);
6856 abort ();
6857 }
6858 else if (AS_ANY_FLAT_P (as))
6859 {
6860 if (GET_CODE (addr) == REG)
6861 print_reg (file, addr);
6862 else
6863 {
6864 gcc_assert (TARGET_GCN5_PLUS);
6865 print_reg (file, XEXP (addr, 0));
6866 }
6867 }
6868 else if (AS_GLOBAL_P (as))
6869 {
6870 gcc_assert (TARGET_GCN5_PLUS);
6871
6872 rtx base = addr;
6873 rtx vgpr_offset = NULL_RTX;
6874
6875 if (GET_CODE (addr) == PLUS)
6876 {
6877 base = XEXP (addr, 0);
6878
6879 if (GET_CODE (base) == PLUS)
6880 {
6881 /* (SGPR + VGPR) + CONST */
6882 vgpr_offset = XEXP (base, 1);
6883 base = XEXP (base, 0);
6884 }
6885 else
6886 {
6887 rtx offset = XEXP (addr, 1);
6888
6889 if (REG_P (offset))
6890 /* SGPR + VGPR */
6891 vgpr_offset = offset;
6892 else if (CONST_INT_P (offset))
6893 /* VGPR + CONST or SGPR + CONST */
6894 ;
6895 else
6896 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6897 }
6898 }
6899
6900 if (REG_P (base))
6901 {
6902 if (VGPR_REGNO_P (REGNO (base)))
6903 print_reg (file, base);
6904 else if (SGPR_REGNO_P (REGNO (base)))
6905 {
6906 /* The assembler requires a 64-bit VGPR pair here, even though
6907 the offset should be only 32-bit. */
6908 if (vgpr_offset == NULL_RTX)
f6e20012
KCY
6909 /* In this case, the vector offset is zero, so we use the first
6910 lane of v1, which is initialized to zero. */
8086230e 6911 fprintf (file, "v1");
5326695a
AS
6912 else if (REG_P (vgpr_offset)
6913 && VGPR_REGNO_P (REGNO (vgpr_offset)))
8086230e 6914 fprintf (file, "v%d", REGNO (vgpr_offset) - FIRST_VGPR_REG);
5326695a
AS
6915 else
6916 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6917 }
6918 }
6919 else
6920 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6921 }
6922 else if (AS_ANY_DS_P (as))
6923 switch (GET_CODE (addr))
6924 {
6925 case REG:
6926 print_reg (file, addr);
6927 break;
6928
6929 case PLUS:
6930 reg = XEXP (addr, 0);
6931 print_reg (file, reg);
6932 break;
6933
6934 default:
6935 debug_rtx (addr);
6936 abort ();
6937 }
6938 else
6939 switch (GET_CODE (addr))
6940 {
6941 case REG:
6942 print_reg (file, addr);
6943 fprintf (file, ", 0");
6944 break;
6945
6946 case PLUS:
6947 reg = XEXP (addr, 0);
6948 offset = XEXP (addr, 1);
6949 print_reg (file, reg);
6950 fprintf (file, ", ");
6951 if (GET_CODE (offset) == REG)
6952 print_reg (file, reg);
6953 else if (GET_CODE (offset) == CONST_INT)
6954 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
6955 else
6956 abort ();
6957 break;
6958
6959 default:
6960 debug_rtx (addr);
6961 abort ();
6962 }
6963}

/* Implement PRINT_OPERAND via gcn.h.

   b - print operand size as untyped operand (b8/b16/b32/b64)
   B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
   e - print "sext(...)" around the operand
   i - print operand size as untyped operand (i8/i16/i32/i64)
   I - print operand size as SI/DI untyped operand (i32/i64)
   u - print operand size as untyped operand (u8/u16/u32/u64)
   U - print operand size as SI/DI untyped operand (u32/u64)
   o - print operand size as memory access size for loads
       (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
   s - print operand size as memory access size for stores
       (byte/short/dword/dwordx2/dwordx3/dwordx4)
   C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
   c - print inverse conditional code for s_cbranch
   D - print conditional code for s_cmp (eq_u64/lg_u64...)
   E - print conditional code for v_cmp (eq_u64/ne_u64...)
   A - print address in formatting suitable for given address space.
   O - print offset:n for data share operations.
   ^ - print "_co" suffix for GCN5 mnemonics
   g - print "glc", if appropriate for given MEM
   L - print low-part of a multi-reg value
   H - print second part of a multi-reg value (high-part of 2-reg value)
   J - print third part of a multi-reg value
   K - print fourth part of a multi-reg value
   R - print a scalar register number as an integer
   V - print a vector register number as an integer */
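
/* For example, in a machine-description template (hypothetical, shown only
   to illustrate the codes above): "v_cmp%E1\tvcc, %2, %3" prints %E1 as
   "_lt_i32" for an SImode signed less-than comparison in operand 1, giving
   "v_cmp_lt_i32", while "%D1" would produce the corresponding s_cmp
   suffix.  */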

void
print_operand (FILE *file, rtx x, int code)
{
  int xcode = x ? GET_CODE (x) : 0;
  bool invert = false;
  switch (code)
    {
      /* Instructions have the following suffixes.
         If there are two suffixes, the first is the destination type,
         and the second is the source type.

         B32 Bitfield (untyped data) 32-bit
         B64 Bitfield (untyped data) 64-bit
         F16 floating-point 16-bit
         F32 floating-point 32-bit (IEEE 754 single-precision float)
         F64 floating-point 64-bit (IEEE 754 double-precision float)
         I16 signed 16-bit integer
         I32 signed 32-bit integer
         I64 signed 64-bit integer
         U16 unsigned 16-bit integer
         U32 unsigned 32-bit integer
         U64 unsigned 64-bit integer */

      /* Print operand size as untyped suffix. */
    case 'b':
      {
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        switch (GET_MODE_SIZE (mode))
          {
          case 1:
            s = "_b8";
            break;
          case 2:
            s = "_b16";
            break;
          case 4:
            s = "_b32";
            break;
          case 8:
            s = "_b64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
    case 'B':
      {
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        switch (GET_MODE_SIZE (mode))
          {
          case 1:
          case 2:
          case 4:
            s = "_b32";
            break;
          case 8:
            s = "_b64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
    case 'e':
      fputs ("sext(", file);
      print_operand (file, x, 0);
      fputs (")", file);
      return;
    case 'i':
    case 'I':
    case 'u':
    case 'U':
      {
        /* 'i' and 'I' request signed suffixes, per the legend above;
           'u' and 'U' request unsigned ones. */
        bool signed_p = code == 'i' || code == 'I';
        bool min32_p = code == 'I' || code == 'U';
        const char *s = "";
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);
        if (mode == VOIDmode)
          switch (GET_CODE (x))
            {
            case CONST_INT:
              s = signed_p ? "_i32" : "_u32";
              break;
            case CONST_DOUBLE:
              s = "_f64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else if (FLOAT_MODE_P (mode))
          switch (GET_MODE_SIZE (mode))
            {
            case 2:
              s = "_f16";
              break;
            case 4:
              s = "_f32";
              break;
            case 8:
              s = "_f64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else if (min32_p)
          switch (GET_MODE_SIZE (mode))
            {
            case 1:
            case 2:
            case 4:
              s = signed_p ? "_i32" : "_u32";
              break;
            case 8:
              s = signed_p ? "_i64" : "_u64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        else
          switch (GET_MODE_SIZE (mode))
            {
            case 1:
              s = signed_p ? "_i8" : "_u8";
              break;
            case 2:
              s = signed_p ? "_i16" : "_u16";
              break;
            case 4:
              s = signed_p ? "_i32" : "_u32";
              break;
            case 8:
              s = signed_p ? "_i64" : "_u64";
              break;
            default:
              output_operand_lossage ("invalid operand %%xn code");
              return;
            }
        fputs (s, file);
      }
      return;
      /* Print operand size as memory access size (loads; the dword cases
         fall through to 's'). */
    case 'o':
      {
        const char *s = 0;
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);

        switch (mode)
          {
          case E_QImode:
            s = "_ubyte";
            break;
          case E_HImode:
          case E_HFmode:
            s = "_ushort";
            break;
          default:
            break;
          }

        if (s)
          {
            fputs (s, file);
            return;
          }

        /* Fall-through - the other cases for 'o' are the same as for 's'. */
        gcc_fallthrough ();
      }
    case 's':
      {
        const char *s;
        machine_mode mode = GET_MODE (x);
        if (VECTOR_MODE_P (mode))
          mode = GET_MODE_INNER (mode);

        switch (mode)
          {
          case E_QImode:
            s = "_byte";
            break;
          case E_HImode:
          case E_HFmode:
            s = "_short";
            break;
          case E_SImode:
          case E_SFmode:
            s = "_dword";
            break;
          case E_DImode:
          case E_DFmode:
            s = "_dwordx2";
            break;
          case E_TImode:
            s = "_dwordx4";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
      }
      return;
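      /* For example, a DFmode operand prints as "_dwordx2" under both '%o'
         and '%s', so a (hypothetical) "flat_store%s1" template becomes
         "flat_store_dwordx2"; only the sub-word cases differ between loads
         ("_ubyte"/"_ushort") and stores ("_byte"/"_short").  */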
    case 'A':
      if (xcode != MEM)
        {
          output_operand_lossage ("invalid %%xn code");
          return;
        }
      print_operand_address (file, x);
      return;
    case 'O':
      {
        if (xcode != MEM)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        if (AS_GDS_P (MEM_ADDR_SPACE (x)))
          fprintf (file, " gds");

        rtx x0 = XEXP (x, 0);
        if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
          {
            gcc_assert (TARGET_GCN5_PLUS);

            fprintf (file, ", ");

            rtx base = x0;
            rtx const_offset = NULL_RTX;

            if (GET_CODE (base) == PLUS)
              {
                rtx offset = XEXP (x0, 1);
                base = XEXP (x0, 0);

                if (GET_CODE (base) == PLUS)
                  /* (SGPR + VGPR) + CONST */
                  /* Ignore the VGPR offset for this operand. */
                  base = XEXP (base, 0);

                if (CONST_INT_P (offset))
                  const_offset = XEXP (x0, 1);
                else if (REG_P (offset))
                  /* SGPR + VGPR */
                  /* Ignore the VGPR offset for this operand. */
                  ;
                else
                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
              }

            if (REG_P (base))
              {
                if (VGPR_REGNO_P (REGNO (base)))
                  /* The VGPR address is specified in the %A operand. */
                  fprintf (file, "off");
                else if (SGPR_REGNO_P (REGNO (base)))
                  print_reg (file, base);
                else
                  output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
              }
            else
              output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");

            if (const_offset != NULL_RTX)
              fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
                       INTVAL (const_offset));

            return;
          }

        if (GET_CODE (x0) == REG)
          return;
        if (GET_CODE (x0) != PLUS)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        rtx val = XEXP (x0, 1);
        if (GET_CODE (val) == CONST_VECTOR)
          val = CONST_VECTOR_ELT (val, 0);
        if (GET_CODE (val) != CONST_INT)
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
      }
      return;
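      /* Taken together with '%A': a global access prints as, for example,
         "global_load_dword v0, v2, s[6:7] offset:16", where '%A' supplies
         the VGPR part ("v2") and '%O' supplies ", s[6:7] offset:16"
         (register numbers illustrative only).  */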
    case 'c':
      invert = true;
      /* Fall through. */
    case 'C':
      {
        const char *s;
        bool num = false;
        if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
          {
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        switch (REGNO (XEXP (x, 0)))
          {
          case VCC_REG:
          case VCCZ_REG:
            s = "_vcc";
            break;
          case SCC_REG:
            /* For some reason llvm-mc insists on scc0 instead of sccz. */
            num = true;
            s = "_scc";
            break;
          case EXECZ_REG:
            s = "_exec";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        if (xcode == (invert ? NE : EQ))
          fputc (num ? '0' : 'z', file);
        else
          fputs (num ? "1" : "nz", file);
        return;
      }
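      /* Thus (eq (reg vcc) (const_int 0)) prints as "_vccz" under '%C' and
         as "_vccnz" under '%c', yielding e.g. "s_cbranch_vccz" in a branch
         template.  */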
    case 'D':
      {
        const char *s;
        bool cmp_signed = false;
        switch (xcode)
          {
          case EQ:
            s = "_eq_";
            break;
          case NE:
            s = "_lg_";
            break;
          case LT:
            s = "_lt_";
            cmp_signed = true;
            break;
          case LE:
            s = "_le_";
            cmp_signed = true;
            break;
          case GT:
            s = "_gt_";
            cmp_signed = true;
            break;
          case GE:
            s = "_ge_";
            cmp_signed = true;
            break;
          case LTU:
            s = "_lt_";
            break;
          case LEU:
            s = "_le_";
            break;
          case GTU:
            s = "_gt_";
            break;
          case GEU:
            s = "_ge_";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        fputc (cmp_signed ? 'i' : 'u', file);

        machine_mode mode = GET_MODE (XEXP (x, 0));

        if (mode == VOIDmode)
          mode = GET_MODE (XEXP (x, 1));

        /* If both sides are constants, then assume the instruction is in
           SImode since s_cmp can only do integer compares. */
        if (mode == VOIDmode)
          mode = SImode;

        switch (GET_MODE_SIZE (mode))
          {
          case 4:
            s = "32";
            break;
          case 8:
            s = "64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
        return;
      }
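      /* For example, (ge:SI (reg) (reg)) prints "_ge_i32" and
         (geu:SI ...) prints "_ge_u32"; DImode comparisons use the "64"
         forms.  */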
    case 'E':
      {
        const char *s;
        bool cmp_signed = false;
        machine_mode mode = GET_MODE (XEXP (x, 0));

        if (mode == VOIDmode)
          mode = GET_MODE (XEXP (x, 1));

        /* If both sides are constants, assume the instruction is in SFmode
           if either operand is floating point, otherwise assume SImode. */
        if (mode == VOIDmode)
          {
            if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
                || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
              mode = SFmode;
            else
              mode = SImode;
          }

        /* Use the same format code for vector comparisons. */
        if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
            || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
          mode = GET_MODE_INNER (mode);

        bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;

        switch (xcode)
          {
          case EQ:
            s = "_eq_";
            break;
          case NE:
            s = float_p ? "_neq_" : "_ne_";
            break;
          case LT:
            s = "_lt_";
            cmp_signed = true;
            break;
          case LE:
            s = "_le_";
            cmp_signed = true;
            break;
          case GT:
            s = "_gt_";
            cmp_signed = true;
            break;
          case GE:
            s = "_ge_";
            cmp_signed = true;
            break;
          case LTU:
            s = "_lt_";
            break;
          case LEU:
            s = "_le_";
            break;
          case GTU:
            s = "_gt_";
            break;
          case GEU:
            s = "_ge_";
            break;
          case ORDERED:
            s = "_o_";
            break;
          case UNORDERED:
            s = "_u_";
            break;
          case UNEQ:
            s = "_nlg_";
            break;
          case UNGE:
            s = "_nlt_";
            break;
          case UNGT:
            s = "_nle_";
            break;
          case UNLE:
            s = "_ngt_";
            break;
          case UNLT:
            s = "_nge_";
            break;
          case LTGT:
            s = "_lg_";
            break;
          default:
            output_operand_lossage ("invalid %%xn code");
            return;
          }
        fputs (s, file);
        fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);

        switch (GET_MODE_SIZE (mode))
          {
          case 1:
            output_operand_lossage ("operand %%xn code invalid for QImode");
            return;
          case 2:
            s = "16";
            break;
          case 4:
            s = "32";
            break;
          case 8:
            s = "64";
            break;
          default:
            output_operand_lossage ("invalid operand %%xn code");
            return;
          }
        fputs (s, file);
        return;
      }
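      /* The unordered forms use the NaN-aware v_cmp spellings: for
         instance (unle:SF ...) prints "_ngt_f32", "not greater than",
         which is also true when either operand is a NaN.  */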
    case 'L':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
      return;
    case 'H':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
      return;
    case 'J':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 2), 0);
      return;
    case 'K':
      print_operand (file, gcn_operand_part (GET_MODE (x), x, 3), 0);
      return;
    case 'R':
      /* Print a scalar register number as an integer. Temporary hack. */
      gcc_assert (REG_P (x));
      fprintf (file, "%u", (int) REGNO (x));
      return;
    case 'V':
      /* Print a vector register number as an integer. Temporary hack. */
      gcc_assert (REG_P (x));
      fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
      return;
    case 0:
      if (xcode == REG)
        print_reg (file, x);
      else if (xcode == MEM)
        output_address (GET_MODE (x), x);
      else if (xcode == CONST_INT)
        fprintf (file, "%i", (int) INTVAL (x));
      else if (xcode == CONST_VECTOR)
        print_operand (file, CONST_VECTOR_ELT (x, 0), code);
      else if (xcode == CONST_DOUBLE)
        {
          const char *str;
          switch (gcn_inline_fp_constant_p (x, false))
            {
            case 240:
              str = "0.5";
              break;
            case 241:
              str = "-0.5";
              break;
            case 242:
              str = "1.0";
              break;
            case 243:
              str = "-1.0";
              break;
            case 244:
              str = "2.0";
              break;
            case 245:
              str = "-2.0";
              break;
            case 246:
              str = "4.0";
              break;
            case 247:
              str = "-4.0";
              break;
            case 248:
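              /* Encoding 248 is the remaining inline constant, 1/(2*pi),
                 printed as its decimal approximation. */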
              str = "0.15915494";
              break;
            default:
              rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
                                            ? DImode : SImode,
                                            x, GET_MODE (x), 0);
              if (ix)
                print_operand (file, ix, code);
              else
                output_operand_lossage ("invalid fp constant");
              return;
            }
          fputs (str, file);
          return;
        }
      else
        output_addr_const (file, x);
      return;
    case '^':
      if (TARGET_GCN5_PLUS)
        fputs ("_co", file);
      return;
    case 'g':
      gcc_assert (xcode == MEM);
      if (MEM_VOLATILE_P (x))
        fputs (" glc", file);
      return;
    default:
      output_operand_lossage ("invalid %%xn code");
    }
  gcc_unreachable ();
}

/* Implement DEBUGGER_REGNO macro.

   Return the DWARF register number that corresponds to the GCC internal
   REGNO. */

unsigned int
gcn_dwarf_register_number (unsigned int regno)
{
  /* Registers defined in DWARF. */
  if (regno == EXEC_LO_REG)
    return 17;
  /* We need to use a more complex DWARF expression for this
  else if (regno == EXEC_HI_REG)
    return 17; */
  else if (regno == VCC_LO_REG)
    return 768;
  /* We need to use a more complex DWARF expression for this
  else if (regno == VCC_HI_REG)
    return 768; */
  else if (regno == SCC_REG)
    return 128;
  else if (regno == DWARF_LINK_REGISTER)
    return 16;
  else if (SGPR_REGNO_P (regno))
    {
      if (regno - FIRST_SGPR_REG < 64)
        return (regno - FIRST_SGPR_REG + 32);
      else
        return (regno - FIRST_SGPR_REG + 1024);
    }
  else if (VGPR_REGNO_P (regno))
    return (regno - FIRST_VGPR_REG + 2560);

  /* Otherwise, there's nothing sensible to do. */
  return regno + 100000;
}
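
/* So, per the mapping above: SGPR indices below 64 map to DWARF registers
   32..95, higher SGPR indices are biased by 1024, and VGPR index n maps to
   2560 + n, following the AMDGPU DWARF register numbering scheme.  */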

/* Implement TARGET_DWARF_REGISTER_SPAN.

   DImode and Vector DImode require additional registers. */

static rtx
gcn_dwarf_register_span (rtx rtl)
{
  machine_mode mode = GET_MODE (rtl);

  if (VECTOR_MODE_P (mode))
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_SIZE (mode) != 8)
    return NULL_RTX;

  unsigned regno = REGNO (rtl);

  if (regno == DWARF_LINK_REGISTER)
    return NULL_RTX;

  rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
  XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
  XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);

  return p;
}
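
/* For instance, a DImode value in the SGPR pair s[4:5] is described to the
   debugger as a PARALLEL of two SImode registers (s4 and s5), giving each
   32-bit half its own DWARF location.  */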

/* }}} */
/* {{{ TARGET hook overrides. */

#undef TARGET_ADDR_SPACE_ADDRESS_MODE
#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
#undef TARGET_ADDR_SPACE_DEBUG
#define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
  gcn_addr_space_legitimate_address_p
#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
#undef TARGET_ADDR_SPACE_POINTER_MODE
#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
#undef TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
#undef TARGET_ADDR_SPACE_CONVERT
#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START output_file_start
#undef TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
#undef TARGET_ASM_SELECT_SECTION
#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
  gcn_autovectorize_vector_modes
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL gcn_builtin_decl
#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
#undef TARGET_DEBUG_UNWIND_INFO
#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
#undef TARGET_DWARF_REGISTER_SPAN
#define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
#undef TARGET_EMUTLS_VAR_INIT
#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
#undef TARGET_EXPAND_DIVMOD_LIBFUNC
#define TARGET_EXPAND_DIVMOD_LIBFUNC gcn_expand_divmod_libfunc
#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG gcn_function_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE gcn_function_value
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
  gcn_goacc_create_worker_broadcast_record
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
#undef TARGET_GOACC_SHARED_MEM_LAYOUT
#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS gcn_init_builtins
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  gcn_ira_change_pseudo_allocno_class
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION gcn_libc_has_function
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE gcn_option_override
#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
  gcn_pretend_outgoing_varargs_named
#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS gcn_rtx_costs
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
#undef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST gcn_simd_clone_adjust
#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  gcn_simd_clone_compute_vecsize_and_simdlen
#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE gcn_simd_clone_usable
#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
  gcn_small_register_classes_for_mode_p
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS gcn_spill_class
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  gcn_vectorize_builtin_vectorized_function
#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  gcn_preferred_vector_alignment
#undef TARGET_VECTORIZE_RELATED_MODE
#define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  gcn_vectorize_support_vector_misalignment
#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  gcn_vector_alignment_reachable
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-gcn.h"
/* }}} */