gcc/config/gcn/gcn.c
1/* Copyright (C) 2016-2019 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with GCC; see the file COPYING3. If not see
15 <http://www.gnu.org/licenses/>. */
16
17/* {{{ Includes. */
18
19/* We want GET_MODE_SIZE et al to return integers, please. */
20#define IN_TARGET_CODE 1
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "memmodel.h"
28#include "rtl.h"
29#include "tree.h"
30#include "df.h"
31#include "tm_p.h"
32#include "stringpool.h"
33#include "optabs.h"
34#include "regs.h"
35#include "emit-rtl.h"
36#include "recog.h"
37#include "diagnostic-core.h"
38#include "insn-attr.h"
39#include "fold-const.h"
40#include "calls.h"
41#include "explow.h"
42#include "expr.h"
43#include "output.h"
44#include "cfgrtl.h"
45#include "langhooks.h"
46#include "builtins.h"
47#include "omp-general.h"
48#include "print-rtl.h"
49#include "attribs.h"
50#include "varasm.h"
51#include "intl.h"
52#include "rtl-iter.h"
53
54/* This file should be included last. */
55#include "target-def.h"
56
57/* }}} */
58/* {{{ Global variables. */
59
60/* Constants used by FP instructions. */
61
62static REAL_VALUE_TYPE dconst4, dconst1over2pi;
63static bool ext_gcn_constants_init = 0;
64
65/* Holds the ISA variant, derived from the command line parameters. */
66
67int gcn_isa = 3; /* Default to GCN3. */
68
69/* Reserve this much space for LDS (for propagating variables from
70 worker-single mode to worker-partitioned mode), per workgroup. Global
71 analysis could calculate an exact bound, but we don't do that yet.
72
73 We reserve the whole LDS, which also prevents any other workgroup
74 sharing the Compute Unit. */
75
76#define LDS_SIZE 65536
77
78/* }}} */
79/* {{{ Initialization and options. */
80
81/* Initialize machine_function. */
82
83static struct machine_function *
84gcn_init_machine_status (void)
85{
86 struct machine_function *f;
87
88 f = ggc_cleared_alloc<machine_function> ();
89
90 /* Set up LDS allocation for broadcasting for this function. */
91 f->lds_allocated = 32;
92 f->lds_allocs = hash_map<tree, int>::create_ggc (64);
93
94 /* And LDS temporary decls for worker reductions. */
95 vec_alloc (f->reduc_decls, 0);
96
97 if (TARGET_GCN3)
98 f->use_flat_addressing = true;
99
100 return f;
101}
102
103/* Implement TARGET_OPTION_OVERRIDE.
104
105 Override option settings where defaults are variable, or we have specific
106 needs to consider. */
107
108static void
109gcn_option_override (void)
110{
111 init_machine_status = gcn_init_machine_status;
112
113 /* The HSA runtime does not respect ELF load addresses, so force PIE. */
114 if (!flag_pie)
115 flag_pie = 2;
116 if (!flag_pic)
117 flag_pic = flag_pie;
118
119 gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
120
121 /* The default stack size needs to be small for offload kernels because
122 there may be many, many threads. Also, a smaller stack gives a
 123 measurable performance boost. But a small stack is insufficient
 124 for running the testsuite, so we use a larger default for the
 125 stand-alone case. */
126 if (stack_size_opt == -1)
127 {
128 if (flag_openacc || flag_openmp)
129 /* 512 bytes per work item = 32kB total. */
130 stack_size_opt = 512 * 64;
131 else
132 /* 1MB total. */
133 stack_size_opt = 1048576;
134 }
135}
136
137/* }}} */
138/* {{{ Attributes. */
139
140/* This table defines the arguments that are permitted in
141 __attribute__ ((amdgpu_hsa_kernel (...))).
142
143 The names and values correspond to the HSA metadata that is encoded
144 into the assembler file and binary. */
145
146static const struct gcn_kernel_arg_type
147{
148 const char *name;
149 const char *header_pseudo;
150 machine_mode mode;
151
152 /* This should be set to -1 or -2 for a dynamically allocated register
153 number. Use -1 if this argument contributes to the user_sgpr_count,
154 -2 otherwise. */
155 int fixed_regno;
156} gcn_kernel_arg_types[] = {
157 {"exec", NULL, DImode, EXEC_REG},
158#define PRIVATE_SEGMENT_BUFFER_ARG 1
159 {"private_segment_buffer",
160 "enable_sgpr_private_segment_buffer", TImode, -1},
161#define DISPATCH_PTR_ARG 2
162 {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
163#define QUEUE_PTR_ARG 3
164 {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
165#define KERNARG_SEGMENT_PTR_ARG 4
166 {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
167 {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
168#define FLAT_SCRATCH_INIT_ARG 6
169 {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
170#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
171 {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
172 {"grid_workgroup_count_X",
173 "enable_sgpr_grid_workgroup_count_x", SImode, -1},
174 {"grid_workgroup_count_Y",
175 "enable_sgpr_grid_workgroup_count_y", SImode, -1},
176 {"grid_workgroup_count_Z",
177 "enable_sgpr_grid_workgroup_count_z", SImode, -1},
178#define WORKGROUP_ID_X_ARG 11
179 {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
180 {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
181 {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
182 {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
183#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
184 {"private_segment_wave_offset",
185 "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
186#define WORK_ITEM_ID_X_ARG 16
187 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
188#define WORK_ITEM_ID_Y_ARG 17
189 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
190#define WORK_ITEM_ID_Z_ARG 18
191 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
192};
193
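/* Illustrative usage (added note, not part of the original source): a kernel
   that wants the flat-scratch setup registers in addition to the defaults
   below might be declared along these lines:

     void kern (void) __attribute__ ((amdgpu_hsa_kernel ("flat_scratch_init")));

   Each string argument must name one of the gcn_kernel_arg_types entries
   above; naming an argument that is already in the default set is rejected
   as a duplicate.  */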
194static const long default_requested_args
195 = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
196 | (1 << DISPATCH_PTR_ARG)
197 | (1 << QUEUE_PTR_ARG)
198 | (1 << KERNARG_SEGMENT_PTR_ARG)
199 | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
200 | (1 << WORKGROUP_ID_X_ARG)
201 | (1 << WORK_ITEM_ID_X_ARG)
202 | (1 << WORK_ITEM_ID_Y_ARG)
203 | (1 << WORK_ITEM_ID_Z_ARG);
204
205/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
206 This function also sets the default values for some arguments.
207
 208 Return true if an error was diagnosed; otherwise ARGS is populated. */
209
210static bool
211gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
212 tree list)
213{
214 bool err = false;
 215 args->requested = default_requested_args;
216 args->nargs = 0;
217
218 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
219 args->reg[a] = -1;
220
221 for (; list; list = TREE_CHAIN (list))
222 {
223 const char *str;
224 if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
225 {
226 error ("amdgpu_hsa_kernel attribute requires string constant "
227 "arguments");
228 break;
229 }
230 str = TREE_STRING_POINTER (TREE_VALUE (list));
231 int a;
232 for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
233 {
234 if (!strcmp (str, gcn_kernel_arg_types[a].name))
235 break;
236 }
237 if (a == GCN_KERNEL_ARG_TYPES)
238 {
239 error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
240 err = true;
241 break;
242 }
243 if (args->requested & (1 << a))
244 {
245 error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
246 "attribute", str);
247 err = true;
248 break;
249 }
250 args->requested |= (1 << a);
251 args->order[args->nargs++] = a;
252 }
253
254 /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
255 WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
256 requesting WORK_ITEM_ID_X_ARG. */
257 if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
258 args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
259 if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
260 args->requested |= (1 << WORK_ITEM_ID_X_ARG);
261
262 int sgpr_regno = FIRST_SGPR_REG;
263 args->nsgprs = 0;
264 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
265 {
266 if (!(args->requested & (1 << a)))
267 continue;
268
269 if (gcn_kernel_arg_types[a].fixed_regno >= 0)
270 args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
271 else
272 {
273 int reg_count;
274
275 switch (gcn_kernel_arg_types[a].mode)
276 {
277 case E_SImode:
278 reg_count = 1;
279 break;
280 case E_DImode:
281 reg_count = 2;
282 break;
283 case E_TImode:
284 reg_count = 4;
285 break;
286 default:
287 gcc_unreachable ();
288 }
289 args->reg[a] = sgpr_regno;
290 sgpr_regno += reg_count;
291 if (gcn_kernel_arg_types[a].fixed_regno == -1)
292 args->nsgprs += reg_count;
293 }
294 }
295 if (sgpr_regno > FIRST_SGPR_REG + 16)
296 {
297 error ("too many arguments passed in sgpr registers");
298 }
299 return err;
300}
301
302/* Referenced by TARGET_ATTRIBUTE_TABLE.
303
304 Validates target specific attributes. */
305
306static tree
307gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
308 tree args, int, bool *no_add_attrs)
309{
 310 if (!FUNC_OR_METHOD_TYPE_P (*node))
311 {
312 warning (OPT_Wattributes, "%qE attribute only applies to functions",
313 name);
314 *no_add_attrs = true;
315 return NULL_TREE;
316 }
317
318 /* Can combine regparm with all attributes but fastcall, and thiscall. */
319 if (is_attribute_p ("gcnhsa_kernel", name))
320 {
321 struct gcn_kernel_args kernelarg;
322
323 if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
324 *no_add_attrs = true;
325
326 return NULL_TREE;
327 }
328
329 return NULL_TREE;
330}
331
332/* Implement TARGET_ATTRIBUTE_TABLE.
333
334 Create target-specific __attribute__ types. */
335
336static const struct attribute_spec gcn_attribute_table[] = {
337 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
338 affects_type_identity } */
339 {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
340 true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
341 /* End element. */
342 {NULL, 0, 0, false, false, false, false, NULL, NULL}
343};
344
345/* }}} */
346/* {{{ Registers and modes. */
347
348/* Implement TARGET_CLASS_MAX_NREGS.
349
350 Return the number of hard registers needed to hold a value of MODE in
351 a register of class RCLASS. */
352
353static unsigned char
354gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
355{
356 /* Scalar registers are 32bit, vector registers are in fact tuples of
357 64 lanes. */
358 if (rclass == VGPR_REGS)
359 {
360 if (vgpr_1reg_mode_p (mode))
361 return 1;
362 if (vgpr_2reg_mode_p (mode))
363 return 2;
364 /* TImode is used by DImode compare_and_swap. */
365 if (mode == TImode)
366 return 4;
367 }
368 else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
369 return 2;
370 return CEIL (GET_MODE_SIZE (mode), 4);
371}
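/* Worked example (added note, not part of the original source): under the
   rule above, a V64SImode value occupies one VGPR (each vector register is
   a 32-bit lane times 64 lanes), V64DImode occupies two, TImode in a VGPR
   occupies four, and DImode in scalar registers needs CEIL (8, 4) = 2
   SGPRs.  */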
372
373/* Implement TARGET_HARD_REGNO_NREGS.
374
375 Return the number of hard registers needed to hold a value of MODE in
376 REGNO. */
377
378unsigned int
379gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
380{
381 return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
382}
383
384/* Implement TARGET_HARD_REGNO_MODE_OK.
385
386 Return true if REGNO can hold value in MODE. */
387
388bool
389gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
390{
391 /* Treat a complex mode as if it were a scalar mode of the same overall
392 size for the purposes of allocating hard registers. */
393 if (COMPLEX_MODE_P (mode))
394 switch (mode)
395 {
396 case E_CQImode:
397 case E_CHImode:
398 mode = SImode;
399 break;
400 case E_CSImode:
401 mode = DImode;
402 break;
403 case E_CDImode:
404 mode = TImode;
405 break;
406 case E_HCmode:
407 mode = SFmode;
408 break;
409 case E_SCmode:
410 mode = DFmode;
411 break;
412 default:
413 /* Not supported. */
414 return false;
415 }
416
417 switch (regno)
418 {
419 case FLAT_SCRATCH_LO_REG:
420 case XNACK_MASK_LO_REG:
421 case TBA_LO_REG:
422 case TMA_LO_REG:
423 return (mode == SImode || mode == DImode);
424 case VCC_LO_REG:
425 case EXEC_LO_REG:
426 return (mode == BImode || mode == SImode || mode == DImode);
427 case M0_REG:
428 case FLAT_SCRATCH_HI_REG:
429 case XNACK_MASK_HI_REG:
430 case TBA_HI_REG:
431 case TMA_HI_REG:
432 return mode == SImode;
433 case VCC_HI_REG:
434 return false;
435 case EXEC_HI_REG:
436 return mode == SImode /*|| mode == V32BImode */ ;
437 case SCC_REG:
438 case VCCZ_REG:
439 case EXECZ_REG:
440 return mode == BImode;
441 }
442 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
443 return true;
444 if (SGPR_REGNO_P (regno))
445 /* We restrict double register values to aligned registers. */
446 return (sgpr_1reg_mode_p (mode)
447 || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
448 || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
449 if (VGPR_REGNO_P (regno))
450 return (vgpr_1reg_mode_p (mode) || vgpr_2reg_mode_p (mode)
451 /* TImode is used by DImode compare_and_swap. */
452 || mode == TImode);
453 return false;
454}
455
456/* Implement REGNO_REG_CLASS via gcn.h.
457
458 Return smallest class containing REGNO. */
459
460enum reg_class
461gcn_regno_reg_class (int regno)
462{
463 switch (regno)
464 {
465 case SCC_REG:
466 return SCC_CONDITIONAL_REG;
467 case VCC_LO_REG:
468 case VCC_HI_REG:
469 return VCC_CONDITIONAL_REG;
470 case VCCZ_REG:
471 return VCCZ_CONDITIONAL_REG;
472 case EXECZ_REG:
473 return EXECZ_CONDITIONAL_REG;
474 case EXEC_LO_REG:
475 case EXEC_HI_REG:
476 return EXEC_MASK_REG;
477 }
478 if (VGPR_REGNO_P (regno))
479 return VGPR_REGS;
480 if (SGPR_REGNO_P (regno))
481 return SGPR_REGS;
482 if (regno < FIRST_VGPR_REG)
483 return GENERAL_REGS;
484 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
485 return AFP_REGS;
486 return ALL_REGS;
487}
488
489/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
490
 491 GCC assumes that the lowpart contains the first part of a value as stored in memory.
492 This is not the case for vector registers. */
493
494bool
495gcn_can_change_mode_class (machine_mode from, machine_mode to,
496 reg_class_t regclass)
497{
498 if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
499 return true;
500 return (gcn_class_max_nregs (regclass, from)
501 == gcn_class_max_nregs (regclass, to));
502}
503
504/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
505
506 When this hook returns true for MODE, the compiler allows
507 registers explicitly used in the rtl to be used as spill registers
508 but prevents the compiler from extending the lifetime of these
509 registers. */
510
511bool
512gcn_small_register_classes_for_mode_p (machine_mode mode)
513{
 514 /* We allocate into exec and vcc regs, which form a small register class. */
515 return mode == DImode || mode == SImode;
516}
517
518/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
519
520 Returns true if pseudos that have been assigned to registers of class RCLASS
521 would likely be spilled because registers of RCLASS are needed for spill
522 registers. */
523
524static bool
525gcn_class_likely_spilled_p (reg_class_t rclass)
526{
527 return (rclass == EXEC_MASK_REG
528 || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
529}
530
531/* Implement TARGET_MODES_TIEABLE_P.
532
533 Returns true if a value of MODE1 is accessible in MODE2 without
534 copying. */
535
536bool
537gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
538{
539 return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
540 && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
541}
542
543/* Implement TARGET_TRULY_NOOP_TRUNCATION.
544
545 Returns true if it is safe to “convert” a value of INPREC bits to one of
546 OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
547 it as if it had only OUTPREC bits. */
548
549bool
550gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
551{
552 return ((inprec <= 32) && (outprec <= inprec));
553}
554
555/* Return N-th part of value occupying multiple registers. */
556
557rtx
558gcn_operand_part (machine_mode mode, rtx op, int n)
559{
560 if (GET_MODE_SIZE (mode) >= 256)
561 {
562 /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
563
564 if (REG_P (op))
565 {
566 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
567 return gen_rtx_REG (V64SImode, REGNO (op) + n);
568 }
569 if (GET_CODE (op) == CONST_VECTOR)
570 {
571 int units = GET_MODE_NUNITS (mode);
572 rtvec v = rtvec_alloc (units);
573
574 for (int i = 0; i < units; ++i)
575 RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
576 CONST_VECTOR_ELT (op, i), n);
577
578 return gen_rtx_CONST_VECTOR (V64SImode, v);
579 }
580 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
581 return gcn_gen_undef (V64SImode);
582 gcc_unreachable ();
583 }
584 else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
585 {
586 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
587 return gen_rtx_REG (SImode, REGNO (op) + n);
588 }
589 else
590 {
591 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
592 return gcn_gen_undef (SImode);
593
594 /* If it's a constant then let's assume it is of the largest mode
595 available, otherwise simplify_gen_subreg will fail. */
596 if (mode == VOIDmode && CONST_INT_P (op))
597 mode = DImode;
598 return simplify_gen_subreg (SImode, op, mode, n * 4);
599 }
600}
601
602/* Return N-th part of value occupying multiple registers. */
603
604rtx
605gcn_operand_doublepart (machine_mode mode, rtx op, int n)
606{
607 return simplify_gen_subreg (DImode, op, mode, n * 8);
608}
609
610/* Return true if OP can be split into subregs or high/low parts.
611 This is always true for scalars, but not normally true for vectors.
612 However, for vectors in hardregs we can use the low and high registers. */
613
614bool
615gcn_can_split_p (machine_mode, rtx op)
616{
617 if (vgpr_vector_mode_p (GET_MODE (op)))
618 {
619 if (GET_CODE (op) == SUBREG)
620 op = SUBREG_REG (op);
621 if (!REG_P (op))
622 return true;
623 return REGNO (op) <= FIRST_PSEUDO_REGISTER;
624 }
625 return true;
626}
627
628/* Implement TARGET_SPILL_CLASS.
629
630 Return class of registers which could be used for pseudo of MODE
631 and of class RCLASS for spilling instead of memory. Return NO_REGS
632 if it is not possible or non-profitable. */
633
634static reg_class_t
635gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
636{
637 if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
638 || c == VCC_CONDITIONAL_REG)
639 return SGPR_REGS;
640 else
641 return NO_REGS;
642}
643
644/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
645
646 Change allocno class for given pseudo from allocno and best class
647 calculated by IRA. */
648
649static reg_class_t
650gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
651 reg_class_t best_cl)
652{
653 /* Avoid returning classes that contain both vgpr and sgpr registers. */
654 if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
655 return cl;
656 if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
657 && best_cl != ALL_GPR_REGS)
658 return best_cl;
659
660 machine_mode mode = PSEUDO_REGNO_MODE (regno);
661 if (vgpr_vector_mode_p (mode))
662 return VGPR_REGS;
663
664 return GENERAL_REGS;
665}
666
667/* Create a new DImode pseudo reg and emit an instruction to initialize
668 it to VAL. */
669
670static rtx
671get_exec (int64_t val)
672{
673 rtx reg = gen_reg_rtx (DImode);
674 emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
675 return reg;
676}
677
678/* Return value of scalar exec register. */
679
680rtx
681gcn_scalar_exec ()
682{
683 return const1_rtx;
684}
685
686/* Return pseudo holding scalar exec register. */
687
688rtx
689gcn_scalar_exec_reg ()
690{
691 return get_exec (1);
692}
693
694/* Return value of full exec register. */
695
696rtx
697gcn_full_exec ()
698{
699 return constm1_rtx;
700}
701
702/* Return pseudo holding full exec register. */
703
704rtx
705gcn_full_exec_reg ()
706{
707 return get_exec (-1);
708}
709
710/* }}} */
711/* {{{ Immediate constants. */
712
713/* Initialize shared numeric constants. */
714
715static void
716init_ext_gcn_constants (void)
717{
718 real_from_integer (&dconst4, DFmode, 4, SIGNED);
719
720 /* FIXME: this constant probably does not match what hardware really loads.
721 Reality check it eventually. */
722 real_from_string (&dconst1over2pi,
723 "0.1591549430918953357663423455968866839");
724 real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
725
726 ext_gcn_constants_init = 1;
727}
728
 729/* Return non-zero if X is a constant that can appear as an inline operand.
 730 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi),
 731 or a vector of those.
 732 The value returned is the hardware encoding of this constant. */
733
734int
735gcn_inline_fp_constant_p (rtx x, bool allow_vector)
736{
737 machine_mode mode = GET_MODE (x);
738
739 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
740 && allow_vector)
741 {
742 int n;
743 if (GET_CODE (x) != CONST_VECTOR)
744 return 0;
745 n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
746 if (!n)
747 return 0;
748 for (int i = 1; i < 64; i++)
749 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
750 return 0;
751 return 1;
752 }
753
754 if (mode != HFmode && mode != SFmode && mode != DFmode)
755 return 0;
756
757 const REAL_VALUE_TYPE *r;
758
759 if (x == CONST0_RTX (mode))
760 return 128;
761 if (x == CONST1_RTX (mode))
762 return 242;
763
764 r = CONST_DOUBLE_REAL_VALUE (x);
765
766 if (real_identical (r, &dconstm1))
767 return 243;
768
769 if (real_identical (r, &dconsthalf))
770 return 240;
771 if (real_identical (r, &dconstm1))
772 return 243;
773 if (real_identical (r, &dconst2))
774 return 244;
775 if (real_identical (r, &dconst4))
776 return 246;
777 if (real_identical (r, &dconst1over2pi))
778 return 248;
779 if (!ext_gcn_constants_init)
780 init_ext_gcn_constants ();
781 real_value_negate (r);
782 if (real_identical (r, &dconsthalf))
783 return 241;
784 if (real_identical (r, &dconst2))
785 return 245;
786 if (real_identical (r, &dconst4))
787 return 247;
788
789 /* FIXME: add 4, -4 and 1/(2*PI). */
790
791 return 0;
792}
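/* Summary (added note): the encodings returned above are 128 for 0.0,
   240/241 for 0.5/-0.5, 242/243 for 1.0/-1.0, 244/245 for 2.0/-2.0,
   246/247 for 4.0/-4.0, and 248 for 1/(2*pi).  */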
793
 794/* Return true if X is an FP constant that can appear as an immediate
 795 operand: either one of the inline constants above or, except for
 796 DFmode, any constant encodable as a 32-bit literal;
 797 or a vector of those. */
798
799bool
800gcn_fp_constant_p (rtx x, bool allow_vector)
801{
802 machine_mode mode = GET_MODE (x);
803
804 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
805 && allow_vector)
806 {
807 int n;
808 if (GET_CODE (x) != CONST_VECTOR)
809 return false;
810 n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
811 if (!n)
812 return false;
813 for (int i = 1; i < 64; i++)
814 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
815 return false;
816 return true;
817 }
818 if (mode != HFmode && mode != SFmode && mode != DFmode)
819 return false;
820
821 if (gcn_inline_fp_constant_p (x, false))
822 return true;
823 /* FIXME: It is not clear how 32bit immediates are interpreted here. */
824 return (mode != DFmode);
825}
826
827/* Return true if X is a constant representable as an inline immediate
828 constant in a 32-bit instruction encoding. */
829
830bool
831gcn_inline_constant_p (rtx x)
832{
833 if (GET_CODE (x) == CONST_INT)
834 return INTVAL (x) >= -16 && INTVAL (x) < 64;
835 if (GET_CODE (x) == CONST_DOUBLE)
836 return gcn_inline_fp_constant_p (x, false);
837 if (GET_CODE (x) == CONST_VECTOR)
838 {
839 int n;
840 if (!vgpr_vector_mode_p (GET_MODE (x)))
841 return false;
842 n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
843 if (!n)
844 return false;
845 for (int i = 1; i < 64; i++)
846 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
847 return false;
848 return 1;
849 }
850 return false;
851}
852
853/* Return true if X is a constant representable as an immediate constant
854 in a 32 or 64-bit instruction encoding. */
855
856bool
857gcn_constant_p (rtx x)
858{
859 switch (GET_CODE (x))
860 {
861 case CONST_INT:
862 return true;
863
864 case CONST_DOUBLE:
865 return gcn_fp_constant_p (x, false);
866
867 case CONST_VECTOR:
868 {
869 int n;
870 if (!vgpr_vector_mode_p (GET_MODE (x)))
871 return false;
872 n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
873 if (!n)
874 return false;
875 for (int i = 1; i < 64; i++)
876 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
877 return false;
878 return true;
879 }
880
881 case SYMBOL_REF:
882 case LABEL_REF:
883 return true;
884
885 default:
886 ;
887 }
888
889 return false;
890}
891
892/* Return true if X is a constant representable as two inline immediate
893 constants in a 64-bit instruction that is split into two 32-bit
894 instructions. */
895
896bool
897gcn_inline_constant64_p (rtx x)
898{
899 if (GET_CODE (x) == CONST_VECTOR)
900 {
901 if (!vgpr_vector_mode_p (GET_MODE (x)))
902 return false;
903 if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0)))
904 return false;
905 for (int i = 1; i < 64; i++)
906 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
907 return false;
908
909 return true;
910 }
911
912 if (GET_CODE (x) != CONST_INT)
913 return false;
914
915 rtx val_lo = gcn_operand_part (DImode, x, 0);
916 rtx val_hi = gcn_operand_part (DImode, x, 1);
917 return gcn_inline_constant_p (val_lo) && gcn_inline_constant_p (val_hi);
918}
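/* Worked example (added note, not part of the original source): the
   CONST_INT 0x0000002000000001 splits into the 32-bit parts 1 and 32, both
   inside the inline range -16..63, so it qualifies; 0x0000004000000000 does
   not, because its high part, 64, is out of range.  */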
919
920/* Return true if X is a constant representable as an immediate constant
921 in a 32 or 64-bit instruction encoding where the hardware will
922 extend the immediate to 64-bits. */
923
924bool
925gcn_constant64_p (rtx x)
926{
927 if (!gcn_constant_p (x))
928 return false;
929
930 if (GET_CODE (x) != CONST_INT)
931 return true;
932
933 /* Negative numbers are only allowed if they can be encoded within src0,
934 because the 32-bit immediates do not get sign-extended.
935 Unsigned numbers must not be encodable as 32-bit -1..-16, because the
936 assembler will use a src0 inline immediate and that will get
937 sign-extended. */
938 HOST_WIDE_INT val = INTVAL (x);
939 return (((val & 0xffffffff) == val /* Positive 32-bit. */
940 && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
941 || gcn_inline_constant_p (x)); /* Src0. */
942}
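/* Worked example (added note, not part of the original source): 0x12345678
   is accepted as a positive 32-bit literal; 0xfffffff5 is rejected because
   the assembler would encode it as the inline immediate -11 and sign-extend
   it to 64 bits; -5 is accepted because it is itself a valid src0 inline
   constant.  */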
943
944/* Implement TARGET_LEGITIMATE_CONSTANT_P.
945
946 Returns true if X is a legitimate constant for a MODE immediate operand. */
947
948bool
949gcn_legitimate_constant_p (machine_mode, rtx x)
950{
951 return gcn_constant_p (x);
952}
953
 954/* Return true if X is a CONST_VECTOR of a single repeated constant. */
955
956static bool
957single_cst_vector_p (rtx x)
958{
959 if (GET_CODE (x) != CONST_VECTOR)
960 return false;
961 for (int i = 1; i < 64; i++)
962 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
963 return false;
964 return true;
965}
966
967/* Create a CONST_VECTOR of duplicated value A. */
968
969rtx
970gcn_vec_constant (machine_mode mode, int a)
971{
972 /*if (!a)
973 return CONST0_RTX (mode);
974 if (a == -1)
975 return CONSTM1_RTX (mode);
976 if (a == 1)
977 return CONST1_RTX (mode);
978 if (a == 2)
979 return CONST2_RTX (mode);*/
980
981 int units = GET_MODE_NUNITS (mode);
982 rtx tem = gen_int_mode (a, GET_MODE_INNER (mode));
983 rtvec v = rtvec_alloc (units);
984
985 for (int i = 0; i < units; ++i)
986 RTVEC_ELT (v, i) = tem;
987
988 return gen_rtx_CONST_VECTOR (mode, v);
989}
990
991/* Create a CONST_VECTOR of duplicated value A. */
992
993rtx
994gcn_vec_constant (machine_mode mode, rtx a)
995{
996 int units = GET_MODE_NUNITS (mode);
997 rtvec v = rtvec_alloc (units);
998
999 for (int i = 0; i < units; ++i)
1000 RTVEC_ELT (v, i) = a;
1001
1002 return gen_rtx_CONST_VECTOR (mode, v);
1003}
1004
1005/* Create an undefined vector value, used where an insn operand is
1006 optional. */
1007
1008rtx
1009gcn_gen_undef (machine_mode mode)
1010{
1011 return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1012}
1013
1014/* }}} */
1015/* {{{ Addresses, pointers and moves. */
1016
 1017/* Return true if REG is a valid place to store a pointer,
1018 for instructions that require an SGPR.
1019 FIXME rename. */
1020
1021static bool
1022gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1023{
1024 if (GET_CODE (reg) == SUBREG)
1025 reg = SUBREG_REG (reg);
1026
1027 if (!REG_P (reg))
1028 return false;
1029
1030 if (GET_MODE (reg) != mode)
1031 return false;
1032
1033 int regno = REGNO (reg);
1034
1035 if (regno >= FIRST_PSEUDO_REGISTER)
1036 {
1037 if (!strict)
1038 return true;
1039
1040 if (!reg_renumber)
1041 return false;
1042
1043 regno = reg_renumber[regno];
1044 }
1045
1046 return (SGPR_REGNO_P (regno) || regno == M0_REG
1047 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1048}
1049
 1050/* Return true if REG is a valid place to store a pointer,
1051 for instructions that require a VGPR. */
1052
1053static bool
1054gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1055{
1056 if (GET_CODE (reg) == SUBREG)
1057 reg = SUBREG_REG (reg);
1058
1059 if (!REG_P (reg))
1060 return false;
1061
1062 if (GET_MODE (reg) != mode)
1063 return false;
1064
1065 int regno = REGNO (reg);
1066
1067 if (regno >= FIRST_PSEUDO_REGISTER)
1068 {
1069 if (!strict)
1070 return true;
1071
1072 if (!reg_renumber)
1073 return false;
1074
1075 regno = reg_renumber[regno];
1076 }
1077
1078 return VGPR_REGNO_P (regno);
1079}
1080
1081/* Return true if X would be valid inside a MEM using the Flat address
1082 space. */
1083
1084bool
1085gcn_flat_address_p (rtx x, machine_mode mode)
1086{
1087 bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1088 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1089
1090 if (vec_mode && gcn_address_register_p (x, DImode, false))
1091 return true;
1092
1093 if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1094 return true;
1095
1096 if (TARGET_GCN5_PLUS
1097 && GET_CODE (x) == PLUS
1098 && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1099 && CONST_INT_P (XEXP (x, 1)))
1100 return true;
1101
1102 return false;
1103}
1104
1105/* Return true if X would be valid inside a MEM using the Scalar Flat
1106 address space. */
1107
1108bool
1109gcn_scalar_flat_address_p (rtx x)
1110{
1111 if (gcn_address_register_p (x, DImode, false))
1112 return true;
1113
1114 if (GET_CODE (x) == PLUS
1115 && gcn_address_register_p (XEXP (x, 0), DImode, false)
1116 && CONST_INT_P (XEXP (x, 1)))
1117 return true;
1118
1119 return false;
1120}
1121
1122/* Return true if MEM X would be valid for the Scalar Flat address space. */
1123
1124bool
1125gcn_scalar_flat_mem_p (rtx x)
1126{
1127 if (!MEM_P (x))
1128 return false;
1129
1130 if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1131 return false;
1132
1133 return gcn_scalar_flat_address_p (XEXP (x, 0));
1134}
1135
1136/* Return true if X would be valid inside a MEM using the LDS or GDS
1137 address spaces. */
1138
1139bool
1140gcn_ds_address_p (rtx x)
1141{
1142 if (gcn_vec_address_register_p (x, SImode, false))
1143 return true;
1144
1145 if (GET_CODE (x) == PLUS
1146 && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1147 && CONST_INT_P (XEXP (x, 1)))
1148 return true;
1149
1150 return false;
1151}
1152
1153/* Return true if ADDR would be valid inside a MEM using the Global
1154 address space. */
1155
1156bool
1157gcn_global_address_p (rtx addr)
1158{
1159 if (gcn_address_register_p (addr, DImode, false)
1160 || gcn_vec_address_register_p (addr, DImode, false))
1161 return true;
1162
1163 if (GET_CODE (addr) == PLUS)
1164 {
1165 rtx base = XEXP (addr, 0);
1166 rtx offset = XEXP (addr, 1);
1167 bool immediate_p = (CONST_INT_P (offset)
1168 && INTVAL (offset) >= -(1 << 12)
1169 && INTVAL (offset) < (1 << 12));
1170
1171 if ((gcn_address_register_p (base, DImode, false)
1172 || gcn_vec_address_register_p (base, DImode, false))
1173 && immediate_p)
1174 /* SGPR + CONST or VGPR + CONST */
1175 return true;
1176
1177 if (gcn_address_register_p (base, DImode, false)
1178 && gcn_vgpr_register_operand (offset, SImode))
 1179 /* SGPR + VGPR */
1180 return true;
1181
1182 if (GET_CODE (base) == PLUS
1183 && gcn_address_register_p (XEXP (base, 0), DImode, false)
1184 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1185 && immediate_p)
1186 /* (SGPR + VGPR) + CONST */
1187 return true;
1188 }
1189
1190 return false;
1191}
1192
1193/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1194
1195 Recognizes RTL expressions that are valid memory addresses for an
1196 instruction. The MODE argument is the machine mode for the MEM
1197 expression that wants to use this address.
1198
1199 It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should
1200 convert common non-canonical forms to canonical form so that they will
1201 be recognized. */
1202
1203static bool
1204gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1205 addr_space_t as)
1206{
1207 /* All vector instructions need to work on addresses in registers. */
1208 if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1209 return false;
1210
1211 if (AS_SCALAR_FLAT_P (as))
1212 {
1213 if (mode == QImode || mode == HImode)
1214 return 0;
1215
1216 switch (GET_CODE (x))
1217 {
1218 case REG:
1219 return gcn_address_register_p (x, DImode, strict);
 1220 /* Addresses are in the form BASE+OFFSET, where OFFSET is either a
 1221 20-bit unsigned immediate, an SGPR, or M0.
 1222 Writes and atomics do not accept SGPR. */
1223 case PLUS:
1224 {
1225 rtx x0 = XEXP (x, 0);
1226 rtx x1 = XEXP (x, 1);
1227 if (!gcn_address_register_p (x0, DImode, strict))
1228 return false;
1229 /* FIXME: This is disabled because of the mode mismatch between
1230 SImode (for the address or m0 register) and the DImode PLUS.
1231 We'll need a zero_extend or similar.
1232
1233 if (gcn_m0_register_p (x1, SImode, strict)
1234 || gcn_address_register_p (x1, SImode, strict))
1235 return true;
1236 else*/
1237 if (GET_CODE (x1) == CONST_INT)
1238 {
1239 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1240 /* The low bits of the offset are ignored, even when
1241 they're meant to realign the pointer. */
1242 && !(INTVAL (x1) & 0x3))
1243 return true;
1244 }
1245 return false;
1246 }
1247
1248 default:
1249 break;
1250 }
1251 }
1252 else if (AS_SCRATCH_P (as))
1253 return gcn_address_register_p (x, SImode, strict);
1254 else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1255 {
1256 if (TARGET_GCN3 || GET_CODE (x) == REG)
1257 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1258 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1259 ? gcn_address_register_p (x, DImode, strict)
1260 : gcn_vec_address_register_p (x, DImode, strict));
1261 else
1262 {
1263 gcc_assert (TARGET_GCN5_PLUS);
1264
1265 if (GET_CODE (x) == PLUS)
1266 {
1267 rtx x1 = XEXP (x, 1);
1268
1269 if (VECTOR_MODE_P (mode)
1270 ? !gcn_address_register_p (x, DImode, strict)
1271 : !gcn_vec_address_register_p (x, DImode, strict))
1272 return false;
1273
1274 if (GET_CODE (x1) == CONST_INT)
1275 {
1276 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1277 /* The low bits of the offset are ignored, even when
1278 they're meant to realign the pointer. */
1279 && !(INTVAL (x1) & 0x3))
1280 return true;
1281 }
1282 }
1283 return false;
1284 }
1285 }
1286 else if (AS_GLOBAL_P (as))
1287 {
1288 gcc_assert (TARGET_GCN5_PLUS);
1289
1290 if (GET_CODE (x) == REG)
1291 return (gcn_address_register_p (x, DImode, strict)
1292 || (!VECTOR_MODE_P (mode)
1293 && gcn_vec_address_register_p (x, DImode, strict)));
1294 else if (GET_CODE (x) == PLUS)
1295 {
1296 rtx base = XEXP (x, 0);
1297 rtx offset = XEXP (x, 1);
1298
1299 bool immediate_p = (GET_CODE (offset) == CONST_INT
1300 /* Signed 13-bit immediate. */
1301 && INTVAL (offset) >= -(1 << 12)
1302 && INTVAL (offset) < (1 << 12)
1303 /* The low bits of the offset are ignored, even
1304 when they're meant to realign the pointer. */
1305 && !(INTVAL (offset) & 0x3));
1306
1307 if (!VECTOR_MODE_P (mode))
1308 {
1309 if ((gcn_address_register_p (base, DImode, strict)
1310 || gcn_vec_address_register_p (base, DImode, strict))
1311 && immediate_p)
1312 /* SGPR + CONST or VGPR + CONST */
1313 return true;
1314
1315 if (gcn_address_register_p (base, DImode, strict)
1316 && gcn_vgpr_register_operand (offset, SImode))
1317 /* SGPR + VGPR */
1318 return true;
1319
1320 if (GET_CODE (base) == PLUS
1321 && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1322 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1323 && immediate_p)
1324 /* (SGPR + VGPR) + CONST */
1325 return true;
1326 }
1327 else
1328 {
1329 if (gcn_address_register_p (base, DImode, strict)
1330 && immediate_p)
1331 /* SGPR + CONST */
1332 return true;
1333 }
1334 }
1335 else
1336 return false;
1337 }
1338 else if (AS_ANY_DS_P (as))
1339 switch (GET_CODE (x))
1340 {
1341 case REG:
1342 return (VECTOR_MODE_P (mode)
1343 ? gcn_address_register_p (x, SImode, strict)
1344 : gcn_vec_address_register_p (x, SImode, strict));
 1345 /* Addresses are in the form BASE+OFFSET, where OFFSET is either a
 1346 20-bit unsigned immediate, an SGPR, or M0.
 1347 Writes and atomics do not accept SGPR. */
1348 case PLUS:
1349 {
1350 rtx x0 = XEXP (x, 0);
1351 rtx x1 = XEXP (x, 1);
1352 if (!gcn_vec_address_register_p (x0, DImode, strict))
1353 return false;
1354 if (GET_CODE (x1) == REG)
1355 {
1356 if (GET_CODE (x1) != REG
1357 || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1358 && !gcn_ssrc_register_operand (x1, DImode)))
1359 return false;
1360 }
1361 else if (GET_CODE (x1) == CONST_VECTOR
1362 && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1363 && single_cst_vector_p (x1))
1364 {
1365 x1 = CONST_VECTOR_ELT (x1, 0);
1366 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1367 return true;
1368 }
1369 return false;
1370 }
1371
1372 default:
1373 break;
1374 }
1375 else
1376 gcc_unreachable ();
1377 return false;
1378}
1379
1380/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1381
1382 Return the appropriate mode for a named address pointer. */
1383
1384static scalar_int_mode
1385gcn_addr_space_pointer_mode (addr_space_t addrspace)
1386{
1387 switch (addrspace)
1388 {
1389 case ADDR_SPACE_SCRATCH:
1390 case ADDR_SPACE_LDS:
1391 case ADDR_SPACE_GDS:
1392 return SImode;
1393 case ADDR_SPACE_DEFAULT:
1394 case ADDR_SPACE_FLAT:
1395 case ADDR_SPACE_FLAT_SCRATCH:
1396 case ADDR_SPACE_SCALAR_FLAT:
1397 return DImode;
1398 default:
1399 gcc_unreachable ();
1400 }
1401}
1402
1403/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1404
1405 Return the appropriate mode for a named address space address. */
1406
1407static scalar_int_mode
1408gcn_addr_space_address_mode (addr_space_t addrspace)
1409{
1410 return gcn_addr_space_pointer_mode (addrspace);
1411}
1412
1413/* Implement TARGET_ADDR_SPACE_SUBSET_P.
1414
1415 Determine if one named address space is a subset of another. */
1416
1417static bool
1418gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1419{
1420 if (subset == superset)
1421 return true;
1422 /* FIXME is this true? */
1423 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1424 return true;
1425 return false;
1426}
1427
1428/* Convert from one address space to another. */
1429
1430static rtx
1431gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1432{
1433 gcc_assert (POINTER_TYPE_P (from_type));
1434 gcc_assert (POINTER_TYPE_P (to_type));
1435
1436 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1437 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1438
1439 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1440 {
1441 rtx queue = gen_rtx_REG (DImode,
1442 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1443 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1444 gen_rtx_PLUS (DImode, queue,
1445 gen_int_mode (64, SImode)));
1446 rtx tmp = gen_reg_rtx (DImode);
1447
1448 emit_move_insn (gen_lowpart (SImode, tmp), op);
1449 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1450 group_seg_aperture_hi);
1451
1452 return tmp;
1453 }
1454 else if (as_from == as_to)
1455 return op;
1456 else
1457 gcc_unreachable ();
1458}
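/* Explanatory note (added, not part of the original source): an LDS pointer
   is converted to a flat address by placing the 32-bit LDS offset in the low
   half of a 64-bit register and the group-segment aperture, loaded from
   offset 64 of the HSA queue object, in the high half.  */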
1459
1460
1461/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1462
 1463 Return true if REGNO is OK for memory addressing. */
1464
1465bool
1466gcn_regno_mode_code_ok_for_base_p (int regno,
1467 machine_mode, addr_space_t as, int, int)
1468{
1469 if (regno >= FIRST_PSEUDO_REGISTER)
1470 {
1471 if (reg_renumber)
1472 regno = reg_renumber[regno];
1473 else
1474 return true;
1475 }
1476 if (AS_FLAT_P (as))
1477 return (VGPR_REGNO_P (regno)
1478 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1479 else if (AS_SCALAR_FLAT_P (as))
1480 return (SGPR_REGNO_P (regno)
1481 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1482 else if (AS_GLOBAL_P (as))
1483 {
1484 return (SGPR_REGNO_P (regno)
1485 || VGPR_REGNO_P (regno)
1486 || regno == ARG_POINTER_REGNUM
1487 || regno == FRAME_POINTER_REGNUM);
1488 }
1489 else
1490 /* For now. */
1491 return false;
1492}
1493
1494/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1495
1496 Return a suitable register class for memory addressing. */
1497
1498reg_class
1499gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1500 int ic)
1501{
1502 switch (as)
1503 {
1504 case ADDR_SPACE_DEFAULT:
1505 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1506 case ADDR_SPACE_SCALAR_FLAT:
1507 case ADDR_SPACE_SCRATCH:
1508 return SGPR_REGS;
1509 break;
1510 case ADDR_SPACE_FLAT:
1511 case ADDR_SPACE_FLAT_SCRATCH:
1512 case ADDR_SPACE_LDS:
1513 case ADDR_SPACE_GDS:
1514 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1515 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1516 ? SGPR_REGS : VGPR_REGS);
1517 case ADDR_SPACE_GLOBAL:
1518 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1519 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1520 ? SGPR_REGS : ALL_GPR_REGS);
1521 }
1522 gcc_unreachable ();
1523}
1524
1525/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1526
1527 Return true if REGNO is OK for index of memory addressing. */
1528
1529bool
1530regno_ok_for_index_p (int regno)
1531{
1532 if (regno >= FIRST_PSEUDO_REGISTER)
1533 {
1534 if (reg_renumber)
1535 regno = reg_renumber[regno];
1536 else
1537 return true;
1538 }
1539 return regno == M0_REG || VGPR_REGNO_P (regno);
1540}
1541
1542/* Generate move which uses the exec flags. If EXEC is NULL, then it is
1543 assumed that all lanes normally relevant to the mode of the move are
1544 affected. If PREV is NULL, then a sensible default is supplied for
1545 the inactive lanes. */
1546
1547static rtx
1548gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1549{
1550 machine_mode mode = GET_MODE (op0);
1551
1552 if (vgpr_vector_mode_p (mode))
1553 {
1554 if (exec && exec != CONSTM1_RTX (DImode))
1555 {
1556 if (!prev)
1557 prev = op0;
1558 }
1559 else
1560 {
1561 if (!prev)
1562 prev = gcn_gen_undef (mode);
1563 exec = gcn_full_exec_reg ();
1564 }
1565
1566 rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1567
1568 return gen_rtx_PARALLEL (VOIDmode,
1569 gen_rtvec (2, set,
1570 gen_rtx_CLOBBER (VOIDmode,
1571 gen_rtx_SCRATCH (V64DImode))));
1572 }
1573
1574 return (gen_rtx_PARALLEL
1575 (VOIDmode,
1576 gen_rtvec (2, gen_rtx_SET (op0, op1),
1577 gen_rtx_USE (VOIDmode,
1578 exec ? exec : gcn_scalar_exec ()))));
1579}
1580
1581/* Generate masked move. */
1582
1583static rtx
1584gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1585{
1586 if (exec)
1587 return (gen_rtx_SET (op0,
1588 gen_rtx_VEC_MERGE (GET_MODE (op0),
1589 gen_rtx_VEC_DUPLICATE (GET_MODE
1590 (op0), op1),
1591 op2, exec)));
1592 else
1593 return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1594}
1595
1596/* Expand vector init of OP0 by VEC.
1597 Implements vec_init instruction pattern. */
1598
1599void
1600gcn_expand_vector_init (rtx op0, rtx vec)
1601{
1602 int64_t initialized_mask = 0;
1603 int64_t curr_mask = 1;
1604 machine_mode mode = GET_MODE (op0);
1605
1606 rtx val = XVECEXP (vec, 0, 0);
1607
1608 for (int i = 1; i < 64; i++)
1609 if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1610 curr_mask |= (int64_t) 1 << i;
1611
1612 if (gcn_constant_p (val))
1613 emit_move_insn (op0, gcn_vec_constant (mode, val));
1614 else
1615 {
1616 val = force_reg (GET_MODE_INNER (mode), val);
1617 emit_insn (gen_duplicate_load (op0, val));
1618 }
1619 initialized_mask |= curr_mask;
1620 for (int i = 1; i < 64; i++)
1621 if (!(initialized_mask & ((int64_t) 1 << i)))
1622 {
1623 curr_mask = (int64_t) 1 << i;
1624 rtx val = XVECEXP (vec, 0, i);
1625
1626 for (int j = i + 1; j < 64; j++)
1627 if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1628 curr_mask |= (int64_t) 1 << j;
1629 if (gcn_constant_p (val))
1630 emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1631 get_exec (curr_mask)));
1632 else
1633 {
1634 val = force_reg (GET_MODE_INNER (mode), val);
1635 emit_insn (gen_duplicate_load (op0, val, op0,
1636 get_exec (curr_mask)));
1637 }
1638 initialized_mask |= curr_mask;
1639 }
1640}
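/* Illustrative example (added note): initializing a vector whose lanes
   alternate between A and B first broadcasts A to every lane, then performs
   a single masked move of B with EXEC = 0xaaaaaaaaaaaaaaaa, so each distinct
   value in VEC costs one move.  */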
1641
1642/* Load vector constant where n-th lane contains BASE+n*VAL. */
1643
1644static rtx
1645strided_constant (machine_mode mode, int base, int val)
1646{
1647 rtx x = gen_reg_rtx (mode);
1648 emit_move_insn (x, gcn_vec_constant (mode, base));
1649 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1650 x, get_exec (0xffffffff00000000)));
1651 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1652 x, get_exec (0xffff0000ffff0000)));
1653 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1654 x, get_exec (0xff00ff00ff00ff00)));
1655 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1656 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1657 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1658 x, get_exec (0xcccccccccccccccc)));
1659 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1660 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1661 return x;
1662}
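/* Explanatory note (added, not part of the original source): after the six
   masked additions above, lane N holds BASE + N * VAL, because each bit of
   the lane number selects exactly one mask (bit 5 adds VAL*32, bit 4 adds
   VAL*16, ..., bit 0 adds VAL*1).  For instance, strided_constant
   (V64SImode, 0, 4) produces the lane values 0, 4, 8, ..., 252.  */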
1663
1664/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1665
1666static rtx
1667gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1668 addr_space_t as)
1669{
1670 switch (as)
1671 {
1672 case ADDR_SPACE_DEFAULT:
1673 return gcn_addr_space_legitimize_address (x, old, mode,
1674 DEFAULT_ADDR_SPACE);
1675 case ADDR_SPACE_SCALAR_FLAT:
1676 case ADDR_SPACE_SCRATCH:
1677 /* Instructions working on vectors need the address to be in
1678 a register. */
1679 if (vgpr_vector_mode_p (mode))
1680 return force_reg (GET_MODE (x), x);
1681
1682 return x;
1683 case ADDR_SPACE_FLAT:
1684 case ADDR_SPACE_FLAT_SCRATCH:
1685 case ADDR_SPACE_GLOBAL:
1686 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1687 case ADDR_SPACE_LDS:
1688 case ADDR_SPACE_GDS:
 1689 /* FIXME: LDS supports offsets, handle them! */
1690 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1691 {
1692 rtx addrs = gen_reg_rtx (V64SImode);
1693 rtx base = force_reg (SImode, x);
1694 rtx offsets = strided_constant (V64SImode, 0,
1695 GET_MODE_UNIT_SIZE (mode));
1696
1697 emit_insn (gen_vec_duplicatev64si (addrs, base));
1698 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1699 return addrs;
1700 }
1701 return x;
1702 }
1703 gcc_unreachable ();
1704}
1705
1706/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1707 proper vector of stepped addresses.
1708
1709 MEM will be a DImode address of a vector in an SGPR.
1710 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1711
1712rtx
1713gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1714 rtx tmp)
1715{
1716 gcc_assert (MEM_P (mem));
1717 rtx mem_base = XEXP (mem, 0);
1718 rtx mem_index = NULL_RTX;
1719
1720 if (!TARGET_GCN5_PLUS)
1721 {
1722 /* gcn_addr_space_legitimize_address should have put the address in a
1723 register. If not, it is too late to do anything about it. */
1724 gcc_assert (REG_P (mem_base));
1725 }
1726
1727 if (GET_CODE (mem_base) == PLUS)
1728 {
1729 mem_index = XEXP (mem_base, 1);
1730 mem_base = XEXP (mem_base, 0);
1731 }
1732
 1733 /* RF and RM base registers for vector modes should always be an SGPR. */
1734 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1735 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1736
1737 machine_mode inner = GET_MODE_INNER (mode);
1738 int shift = exact_log2 (GET_MODE_SIZE (inner));
1739 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1740 rtx undef_v64si = gcn_gen_undef (V64SImode);
1741 rtx new_base = NULL_RTX;
1742 addr_space_t as = MEM_ADDR_SPACE (mem);
1743
1744 rtx tmplo = (REG_P (tmp)
1745 ? gcn_operand_part (V64DImode, tmp, 0)
1746 : gen_reg_rtx (V64SImode));
1747
1748 /* tmplo[:] = ramp[:] << shift */
1749 if (exec)
1750 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1751 gen_int_mode (shift, SImode),
1752 undef_v64si, exec));
1753 else
1754 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1755
1756 if (AS_FLAT_P (as))
1757 {
1758 if (REG_P (tmp))
1759 {
1760 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1761 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1762 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1763 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1764
1765 /* tmphi[:] = mem_base_hi */
1766 if (exec)
1767 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1768 undef_v64si, exec));
1769 else
1770 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1771
1772 /* tmp[:] += zext (mem_base) */
1773 if (exec)
1774 {
1775 rtx undef_di = gcn_gen_undef (DImode);
1776 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1777 vcc, undef_v64si, exec));
1778 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1779 vcc, vcc, undef_v64si, exec));
1780 }
1781 else
1782 emit_insn (gen_addv64di3_zext_dup (tmp, mem_base_lo, tmp));
1783 }
1784 else
1785 {
1786 tmp = gen_reg_rtx (V64DImode);
1787 if (exec)
1788 emit_insn (gen_addv64di3_zext_dup2_exec (tmp, tmplo, mem_base,
1789 gcn_gen_undef (V64DImode),
1790 exec));
1791 else
1792 emit_insn (gen_addv64di3_zext_dup2 (tmp, tmplo, mem_base));
1793 }
1794
1795 new_base = tmp;
1796 }
1797 else if (AS_ANY_DS_P (as))
1798 {
1799 if (!exec)
1800 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1801 else
1802 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1803 gcn_gen_undef (V64SImode), exec));
1804 new_base = tmplo;
1805 }
1806 else
1807 {
1808 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1809 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1810 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1811 }
1812
1813 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1814 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1815 (mem_index ? mem_index
1816 : const0_rtx)));
1817}
1818
1819/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
 1820 suitable for the given address space. This is intended for use in
1821 gather/scatter patterns.
1822
1823 The offsets may be signed or unsigned, according to UNSIGNED_P.
1824 If EXEC is set then _exec patterns will be used, otherwise plain.
1825
1826 Return values.
1827 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1828 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1829
1830rtx
1831gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1832 bool unsigned_p, rtx exec)
1833{
1834 /* Convert the offsets to V64SImode.
1835 TODO: more conversions will be needed when more types are vectorized. */
1836 if (GET_MODE (offsets) == V64DImode)
1837 {
1838 rtx tmp = gen_reg_rtx (V64SImode);
1839 emit_insn (gen_vec_truncatev64div64si (tmp, offsets));
1840 offsets = tmp;
1841 }
1842
1843 rtx tmpsi = gen_reg_rtx (V64SImode);
1844 rtx tmpdi = gen_reg_rtx (V64DImode);
1845 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1846 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1847
1848 if (CONST_INT_P (scale)
1849 && INTVAL (scale) > 0
1850 && exact_log2 (INTVAL (scale)) >= 0)
1851 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1852 GEN_INT (exact_log2 (INTVAL (scale)))));
1853 else
1854 (exec
1855 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1856 exec))
1857 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1858
1859 /* "Global" instructions do not support negative register offsets. */
1860 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1861 {
1862 if (unsigned_p)
1863 (exec
1864 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1865 undefdi, exec))
1866 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1867 else
1868 (exec
1869 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1870 undefdi, exec))
1871 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1872 return tmpdi;
1873 }
1874 else if (as == ADDR_SPACE_GLOBAL)
1875 return tmpsi;
1876
1877 gcc_unreachable ();
1878}
1879
1880/* Return true if move from OP0 to OP1 is known to be executed in vector
1881 unit. */
1882
1883bool
1884gcn_vgpr_move_p (rtx op0, rtx op1)
1885{
1886 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1887 return true;
1888 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1889 return true;
1890 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1891 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1892 || vgpr_vector_mode_p (GET_MODE (op0)));
1893}
1894
1895/* Return true if move from OP0 to OP1 is known to be executed in scalar
1896 unit. Used in the machine description. */
1897
1898bool
1899gcn_sgpr_move_p (rtx op0, rtx op1)
1900{
1901 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1902 return true;
1903 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1904 return true;
1905 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1906 || VGPR_REGNO_P (REGNO (op0)))
1907 return false;
1908 if (REG_P (op1)
1909 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1910 && !VGPR_REGNO_P (REGNO (op1)))
1911 return true;
1912 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1913}
1914
1915/* Implement TARGET_SECONDARY_RELOAD.
1916
1917 The address space determines which registers can be used for loads and
1918 stores. */
1919
1920static reg_class_t
1921gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
1922 machine_mode reload_mode, secondary_reload_info *sri)
1923{
1924 reg_class_t result = NO_REGS;
1925 bool spilled_pseudo =
1926 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
1927
1928 if (dump_file && (dump_flags & TDF_DETAILS))
1929 {
1930 fprintf (dump_file, "gcn_secondary_reload: ");
1931 dump_value_slim (dump_file, x, 1);
1932 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
1933 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
1934 if (REG_P (x) || GET_CODE (x) == SUBREG)
1935 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
1936 (true_regnum (x) >= 0
1937 && true_regnum (x) < FIRST_PSEUDO_REGISTER
1938 ? reg_names[true_regnum (x)]
1939 : (spilled_pseudo ? "stack spill" : "??")));
1940 fprintf (dump_file, "\n");
1941 }
1942
1943 /* Some callers don't use or initialize icode. */
1944 sri->icode = CODE_FOR_nothing;
1945
1946 if (MEM_P (x) || spilled_pseudo)
1947 {
1948 addr_space_t as = DEFAULT_ADDR_SPACE;
1949
1950 /* If we have a spilled pseudo, we can't find the address space
1951 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
1952 ADDR_SPACE_GLOBAL for GCN5. */
1953 if (MEM_P (x))
1954 as = MEM_ADDR_SPACE (x);
1955
1956 if (as == ADDR_SPACE_DEFAULT)
1957 as = DEFAULT_ADDR_SPACE;
1958
1959 switch (as)
1960 {
1961 case ADDR_SPACE_SCALAR_FLAT:
1962 result =
1963 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
1964 break;
1965 case ADDR_SPACE_FLAT:
1966 case ADDR_SPACE_FLAT_SCRATCH:
1967 case ADDR_SPACE_GLOBAL:
1968 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
1969 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
1970 {
1971 if (in_p)
1972 switch (reload_mode)
1973 {
1974 case E_V64SImode:
1975 sri->icode = CODE_FOR_reload_inv64si;
1976 break;
1977 case E_V64SFmode:
1978 sri->icode = CODE_FOR_reload_inv64sf;
1979 break;
1980 case E_V64HImode:
1981 sri->icode = CODE_FOR_reload_inv64hi;
1982 break;
1983 case E_V64HFmode:
1984 sri->icode = CODE_FOR_reload_inv64hf;
1985 break;
1986 case E_V64QImode:
1987 sri->icode = CODE_FOR_reload_inv64qi;
1988 break;
1989 case E_V64DImode:
1990 sri->icode = CODE_FOR_reload_inv64di;
1991 break;
1992 case E_V64DFmode:
1993 sri->icode = CODE_FOR_reload_inv64df;
1994 break;
1995 default:
1996 gcc_unreachable ();
1997 }
1998 else
1999 switch (reload_mode)
2000 {
2001 case E_V64SImode:
2002 sri->icode = CODE_FOR_reload_outv64si;
2003 break;
2004 case E_V64SFmode:
2005 sri->icode = CODE_FOR_reload_outv64sf;
2006 break;
2007 case E_V64HImode:
2008 sri->icode = CODE_FOR_reload_outv64hi;
2009 break;
2010 case E_V64HFmode:
2011 sri->icode = CODE_FOR_reload_outv64hf;
2012 break;
2013 case E_V64QImode:
2014 sri->icode = CODE_FOR_reload_outv64qi;
2015 break;
2016 case E_V64DImode:
2017 sri->icode = CODE_FOR_reload_outv64di;
2018 break;
2019 case E_V64DFmode:
2020 sri->icode = CODE_FOR_reload_outv64df;
2021 break;
2022 default:
2023 gcc_unreachable ();
2024 }
2025 break;
2026 }
2027 /* Fallthrough. */
2028 case ADDR_SPACE_LDS:
2029 case ADDR_SPACE_GDS:
2030 case ADDR_SPACE_SCRATCH:
2031 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2032 break;
2033 }
2034 }
2035
2036 if (dump_file && (dump_flags & TDF_DETAILS))
2037 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2038 get_insn_name (sri->icode));
2039
2040 return result;
2041}
2042
2043/* Update register usage after having seen the compiler flags and kernel
2044 attributes. We typically want to fix registers that contain values
2045 set by the HSA runtime. */
2046
2047static void
2048gcn_conditional_register_usage (void)
2049{
2050 if (!cfun || !cfun->machine)
2051 return;
2052
2053 if (cfun->machine->normal_function)
2054 {
2055 /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
2056 for (int i = SGPR_REGNO (62); i <= LAST_SGPR_REG; i++)
2057 fixed_regs[i] = 1, call_used_regs[i] = 1;
2058
2059 for (int i = VGPR_REGNO (24); i <= LAST_VGPR_REG; i++)
2060 fixed_regs[i] = 1, call_used_regs[i] = 1;
2061
2062 return;
2063 }
2064
2065 /* If the set of requested args is the default set, nothing more needs to
2066 be done. */
2067 if (cfun->machine->args.requested == default_requested_args)
2068 return;
2069
2070 /* Requesting a set of args different from the default violates the ABI. */
2071 if (!leaf_function_p ())
2072 warning (0, "A non-default set of initial values has been requested, "
2073 "which violates the ABI!");
2074
2075 for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2076 fixed_regs[i] = 0;
2077
2078 /* Fix the runtime argument register containing values that may be
2079 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2080 needed after the prologue so there's no need to fix them. */
2081 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2082 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2083 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2084 {
2085 /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2086 the containing registers to be used for other purposes. */
2087 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2088 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2089 }
2090 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2091 {
2092 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2093 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2094 }
2095 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2096 {
2097 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2098 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2099 }
2100 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2101 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2102 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2103 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2104 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2105 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2106 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2107 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2108
2109 if (TARGET_GCN5_PLUS)
2110 /* v0 is always zero, for global null offsets. */
2111 fixed_regs[VGPR_REGNO (0)] = 1;
2112}
2113
2114/* Determine if a load or store is valid, according to the register classes
2115 and address space. Used primarily by the machine description to decide
2116 when to split a move into two steps. */
2117
2118bool
2119gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2120{
2121 if (!MEM_P (dest) && !MEM_P (src))
2122 return true;
2123
2124 if (MEM_P (dest)
2125 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2126 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2127 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2128 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2129 && gcn_vgpr_register_operand (src, mode))
2130 return true;
2131 else if (MEM_P (src)
2132 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2133 && (gcn_flat_address_p (XEXP (src, 0), mode)
2134 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2135 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2136 && gcn_vgpr_register_operand (dest, mode))
2137 return true;
2138
2139 if (MEM_P (dest)
2140 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2141 && (gcn_global_address_p (XEXP (dest, 0))
2142 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2143 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2144 && gcn_vgpr_register_operand (src, mode))
2145 return true;
2146 else if (MEM_P (src)
2147 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2148 && (gcn_global_address_p (XEXP (src, 0))
2149 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2150 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2151 && gcn_vgpr_register_operand (dest, mode))
2152 return true;
2153
2154 if (MEM_P (dest)
2155 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2156 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2157 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2158 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2159 && gcn_ssrc_register_operand (src, mode))
2160 return true;
2161 else if (MEM_P (src)
2162 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2163 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2164 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2165 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2166 && gcn_sdst_register_operand (dest, mode))
2167 return true;
2168
2169 if (MEM_P (dest)
2170 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2171 && gcn_ds_address_p (XEXP (dest, 0))
2172 && gcn_vgpr_register_operand (src, mode))
2173 return true;
2174 else if (MEM_P (src)
2175 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2176 && gcn_ds_address_p (XEXP (src, 0))
2177 && gcn_vgpr_register_operand (dest, mode))
2178 return true;
2179
2180 return false;
2181}
2182
2183/* }}} */
2184/* {{{ Functions and ABI. */
2185
2186/* Implement TARGET_FUNCTION_VALUE.
2187
2188 Define how to find the value returned by a function.
2189 The register location is always the same, but the mode depends on
2190 VALTYPE. */
2191
2192static rtx
2193gcn_function_value (const_tree valtype, const_tree, bool)
2194{
2195 machine_mode mode = TYPE_MODE (valtype);
2196
2197 if (INTEGRAL_TYPE_P (valtype)
2198 && GET_MODE_CLASS (mode) == MODE_INT
2199 && GET_MODE_SIZE (mode) < 4)
2200 mode = SImode;
2201
2202 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2203}
2204
2205/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2206
2207 Return true if N is a possible register number for the function return
2208 value. */
2209
2210static bool
2211gcn_function_value_regno_p (const unsigned int n)
2212{
2213 return n == RETURN_VALUE_REG;
2214}
2215
2216/* Calculate the number of registers required to hold function argument
2217 ARG. */
2218
2219static int
2220num_arg_regs (const function_arg_info &arg)
2221{
2222 if (targetm.calls.must_pass_in_stack (arg))
2223 return 0;
2224
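  /* For example, with UNITS_PER_WORD == 4 a 12-byte argument needs
     (12 + 4 - 1) / 4 == 3 consecutive registers (illustrative arithmetic
     for the expression below).  */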
2225 int size = arg.promoted_size_in_bytes ();
2226 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2227}
2228
2229/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2230
2231 Return true if the location where a function argument is passed
2232 depends on whether or not it is a named argument.
2233
2234 For gcn, we know how to handle functions declared as stdarg: by
2235 passing an extra pointer to the unnamed arguments. However, the
2236 Fortran frontend can produce a different situation, where a
2237 function pointer is declared with no arguments, but the actual
2238 function and calls to it take more arguments. In that case, we
2239 want to ensure the call matches the definition of the function. */
2240
2241static bool
2242gcn_strict_argument_naming (cumulative_args_t cum_v)
2243{
2244 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2245
2246 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2247}
2248
2249/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2250
2251 See comment on gcn_strict_argument_naming. */
2252
2253static bool
2254gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2255{
2256 return !gcn_strict_argument_naming (cum_v);
2257}
2258
2259/* Implement TARGET_FUNCTION_ARG.
2260
2261 Return an RTX indicating whether a function argument is passed in a register
2262 and if so, which register. */
2263
2264static rtx
2265gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2266{
2267 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2268 if (cum->normal_function)
2269 {
2270 if (!arg.named || arg.end_marker_p ())
2271 return 0;
2272
2273 if (targetm.calls.must_pass_in_stack (arg))
2274 return 0;
2275
2276 int reg_num = FIRST_PARM_REG + cum->num;
2277 int num_regs = num_arg_regs (arg);
2278 if (num_regs > 0)
2279 while (reg_num % num_regs != 0)
2280 reg_num++;
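      /* For example, a two-register argument (num_regs == 2) whose first
	 candidate register is odd is bumped to the next even register so
	 the pair is naturally aligned (illustrative).  */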
2281 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2282 return gen_rtx_REG (arg.mode, reg_num);
2283 }
2284 else
2285 {
2286 if (cum->num >= cum->args.nargs)
2287 {
2288 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2289 & -(TYPE_ALIGN (arg.type) / 8);
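	  /* For example, an offset of 10 with an 8-byte-aligned type rounds
	     up to 16: (10 + 8 - 1) & -8 == 16 (illustrative).  */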
2290 cfun->machine->kernarg_segment_alignment
2291 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2292 TYPE_ALIGN (arg.type) / 8);
2293 rtx addr = gen_rtx_REG (DImode,
2294 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2295 if (cum->offset)
2296 addr = gen_rtx_PLUS (DImode, addr,
2297 gen_int_mode (cum->offset, DImode));
2298 rtx mem = gen_rtx_MEM (arg.mode, addr);
2299 set_mem_attributes (mem, arg.type, 1);
2300 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2301 MEM_READONLY_P (mem) = 1;
2302 return mem;
2303 }
2304
2305 int a = cum->args.order[cum->num];
2306 if (arg.mode != gcn_kernel_arg_types[a].mode)
2307 {
2308 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2309 return 0;
2310 }
2311 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2312 cum->args.reg[a]);
2313 }
2314 return 0;
2315}
2316
2317/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2318
2319 Updates the summarizer variable pointed to by CUM_V to advance past an
2320 argument in the argument list. */
2321
2322static void
2323gcn_function_arg_advance (cumulative_args_t cum_v,
2324 const function_arg_info &arg)
2325{
2326 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2327
2328 if (cum->normal_function)
2329 {
2330 if (!arg.named)
2331 return;
2332
2333 int num_regs = num_arg_regs (arg);
2334 if (num_regs > 0)
2335 while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2336 cum->num++;
2337 cum->num += num_regs;
2338 }
2339 else
2340 {
2341 if (cum->num < cum->args.nargs)
2342 cum->num++;
2343 else
2344 {
2345 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2346 cfun->machine->kernarg_segment_byte_size = cum->offset;
2347 }
2348 }
2349}
2350
2351/* Implement TARGET_ARG_PARTIAL_BYTES.
2352
2353 Returns the number of bytes at the beginning of an argument that must be put
2354 in registers. The value must be zero for arguments that are passed entirely
2355 in registers or that are entirely pushed on the stack. */
2356
2357static int
2358gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2359{
2360 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2361
2362 if (!arg.named)
2363 return 0;
2364
2365 if (targetm.calls.must_pass_in_stack (arg))
2366 return 0;
2367
2368 if (cum->num >= NUM_PARM_REGS)
2369 return 0;
2370
2371 /* If the argument fits entirely in registers, return 0. */
2372 if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2373 return 0;
2374
2375 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2376}
2377
2378/* A normal function which takes a pointer argument (to a scalar) may be
2379 passed a pointer to LDS space (via a high-bits-set aperture), and that only
2380 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
2381 function has an incoming pointer-to-scalar parameter. */
2382
2383static void
2384gcn_detect_incoming_pointer_arg (tree fndecl)
2385{
2386 gcc_assert (cfun && cfun->machine);
2387
2388 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2389 arg;
2390 arg = TREE_CHAIN (arg))
2391 if (POINTER_TYPE_P (TREE_VALUE (arg))
2392 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2393 cfun->machine->use_flat_addressing = true;
2394}
2395
2396/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2397
2398 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2399 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2400
2401void
2402gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2403 tree fntype /* tree ptr for function decl */ ,
2404 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2405 tree fndecl, int caller)
2406{
2407 memset (cum, 0, sizeof (*cum));
2408 cum->fntype = fntype;
2409 if (libname)
2410 {
2411 gcc_assert (cfun && cfun->machine);
2412 cum->normal_function = true;
2413 if (!caller)
2414 {
2415 cfun->machine->normal_function = true;
2416 gcn_detect_incoming_pointer_arg (fndecl);
2417 }
2418 return;
2419 }
2420 tree attr = NULL;
2421 if (fndecl)
2422 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2423 if (fndecl && !attr)
2424 attr = lookup_attribute ("amdgpu_hsa_kernel",
2425 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2426 if (!attr && fntype)
2427 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2428 /* Handle main () as kernel, so we can run testsuite.
2429 Handle OpenACC kernels similarly to main. */
2430 if (!attr && !caller && fndecl
2431 && (MAIN_NAME_P (DECL_NAME (fndecl))
2432 || lookup_attribute ("omp target entrypoint",
2433 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2434 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2435 else
2436 {
2437 if (!attr || caller)
2438 {
2439 gcc_assert (cfun && cfun->machine);
2440 cum->normal_function = true;
2441 if (!caller)
2442 cfun->machine->normal_function = true;
2443 }
2444 gcn_parse_amdgpu_hsa_kernel_attribute
2445 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2446 }
2447 cfun->machine->args = cum->args;
2448 if (!caller && cfun->machine->normal_function)
2449 gcn_detect_incoming_pointer_arg (fndecl);
2450
2451 reinit_regs ();
2452}
2453
2454static bool
2455gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2456{
2457 machine_mode mode = TYPE_MODE (type);
2458 HOST_WIDE_INT size = int_size_in_bytes (type);
2459
2460 if (AGGREGATE_TYPE_P (type))
2461 return true;
2462
2463 if (mode == BLKmode)
2464 return true;
2465
2466 if (size > 2 * UNITS_PER_WORD)
2467 return true;
2468
2469 return false;
2470}
2471
2472/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2473
2474 Return the mode to use for outgoing function arguments. */
2475
2476machine_mode
2477gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2478 int *ARG_UNUSED (punsignedp),
2479 const_tree ARG_UNUSED (funtype),
2480 int ARG_UNUSED (for_return))
2481{
2482 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2483 return SImode;
2484
2485 return mode;
2486}
2487
2488/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2489
2490 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2491 ARGS_GROW_DOWNWARDS. */
2492
2493static tree
2494gcn_gimplify_va_arg_expr (tree valist, tree type,
2495 gimple_seq *ARG_UNUSED (pre_p),
2496 gimple_seq *ARG_UNUSED (post_p))
2497{
2498 tree ptr = build_pointer_type (type);
2499 tree valist_type;
2500 tree t, u;
2501 bool indirect;
2502
2503 indirect = pass_va_arg_by_reference (type);
2504 if (indirect)
2505 {
2506 type = ptr;
2507 ptr = build_pointer_type (type);
2508 }
2509 valist_type = TREE_TYPE (valist);
2510
2511 /* Args grow down. Not handled by generic routines. */
2512
2513 u = fold_convert (sizetype, size_in_bytes (type));
2514 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2515 t = fold_build_pointer_plus (valist, u);
2516
2517 /* Align to 8 byte boundary. */
2518
2519 u = build_int_cst (TREE_TYPE (t), -8);
2520 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
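  /* ANDing with -8 clears the low three bits, so the downward-growing
     argument pointer is rounded down to an 8-byte boundary (for example,
     an address ending in binary ...1011 becomes ...1000).  */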
2521 t = fold_convert (valist_type, t);
2522
2523 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2524
2525 t = fold_convert (ptr, t);
2526 t = build_va_arg_indirect_ref (t);
2527
2528 if (indirect)
2529 t = build_va_arg_indirect_ref (t);
2530
2531 return t;
2532}
2533
2534/* Return 1 if TRAIT NAME is present in the OpenMP context's
2535 device trait set, return 0 if not present in any OpenMP context in the
2536 whole translation unit, or -1 if not present in the current OpenMP context
2537 but might be present in another OpenMP context in the same TU. */
2538
2539int
2540gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2541 const char *name)
2542{
2543 switch (trait)
2544 {
2545 case omp_device_kind:
2546 return strcmp (name, "gpu") == 0;
2547 case omp_device_arch:
2548 return strcmp (name, "gcn") == 0;
2549 case omp_device_isa:
2550 if (strcmp (name, "carrizo") == 0)
2551 return gcn_arch == PROCESSOR_CARRIZO;
2552 if (strcmp (name, "fiji") == 0)
2553 return gcn_arch == PROCESSOR_FIJI;
2554 if (strcmp (name, "gfx900") == 0)
2555 return gcn_arch == PROCESSOR_VEGA;
2556 if (strcmp (name, "gfx906") == 0)
2557 return gcn_arch == PROCESSOR_VEGA;
2558 return 0;
2559 default:
2560 gcc_unreachable ();
2561 }
2562}
2563
2564/* Calculate stack offsets needed to create prologues and epilogues. */
2565
2566static struct machine_function *
2567gcn_compute_frame_offsets (void)
2568{
2569 machine_function *offsets = cfun->machine;
2570
2571 if (reload_completed)
2572 return offsets;
2573
2574 offsets->need_frame_pointer = frame_pointer_needed;
2575
2576 offsets->outgoing_args_size = crtl->outgoing_args_size;
2577 offsets->pretend_size = crtl->args.pretend_args_size;
2578
2579 offsets->local_vars = get_frame_size ();
2580
2581 offsets->lr_needs_saving = (!leaf_function_p ()
2582 || df_regs_ever_live_p (LR_REGNUM)
2583 || df_regs_ever_live_p (LR_REGNUM + 1));
2584
2585 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2586
2587 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2588 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2589 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2590 && frame_pointer_needed))
2591 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2592
2593 /* Round up to 64-bit boundary to maintain stack alignment. */
2594 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
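  /* For example, 12 bytes of callee saves round up to 16:
     (12 + 7) & ~7 == 16 (illustrative).  */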
2595
2596 return offsets;
2597}
2598
2599/* Insert code into the prologue or epilogue to store or load any
2600 callee-save register to/from the stack.
2601
2602 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2603
2604static void
2605move_callee_saved_registers (rtx sp, machine_function *offsets,
2606 bool prologue)
2607{
2608 int regno, offset, saved_scalars;
2609 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2610 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2611 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2612 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2613 HOST_WIDE_INT exec_set = 0;
2614 int offreg_set = 0;
2615
2616 start_sequence ();
2617
2618 /* Move scalars into two vector registers. */
2619 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2620 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2621 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2622 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2623 && offsets->need_frame_pointer))
2624 {
2625 rtx reg = gen_rtx_REG (SImode, regno);
2626 rtx vreg = gen_rtx_REG (V64SImode,
2627 VGPR_REGNO (6 + (saved_scalars / 64)));
2628 int lane = saved_scalars % 64;
2629
2630 if (prologue)
2631 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2632 else
2633 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2634
2635 saved_scalars++;
2636 }
2637
2638 rtx move_scalars = get_insns ();
2639 end_sequence ();
2640 start_sequence ();
2641
2642 /* Ensure that all vector lanes are moved. */
2643 exec_set = -1;
2644 emit_move_insn (exec, GEN_INT (exec_set));
2645
2646 /* Set up a vector stack pointer. */
2647 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2648 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2649 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2650 gcn_gen_undef (V64SImode), exec));
2651 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2652 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2653 exec));
2654 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2655 gcn_operand_part (V64SImode, vsp, 0),
2656 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2657 exec));
2658 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2659 gcn_operand_part (V64SImode, vsp, 1),
2660 const0_rtx, vcc, vcc,
2661 gcn_gen_undef (V64SImode), exec));
2662
2663 /* Move vectors. */
2664 for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
2665 regno < FIRST_PSEUDO_REGISTER; regno++)
2666 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2667 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2668 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2669 {
2670 rtx reg = gen_rtx_REG (V64SImode, regno);
2671 int size = 256;
2672
2673 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2674 size = saved_scalars * 4;
2675 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2676 size = (saved_scalars - 64) * 4;
2677
2678 if (size != 256 || exec_set != -1)
2679 {
2680 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2681 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2682 }
2683
2684 if (prologue)
2685 emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg,
2686 as, const0_rtx, exec));
2687 else
2688 emit_insn (gen_gatherv64si_insn_1offset_exec
2689 (reg, vsp, const0_rtx, as, const0_rtx,
2690 gcn_gen_undef (V64SImode), exec));
2691
2692 /* Move our VSP to the next stack entry. */
2693 if (offreg_set != size)
2694 {
2695 offreg_set = size;
2696 emit_move_insn (offreg, GEN_INT (size));
2697 }
2698 if (exec_set != -1)
2699 {
2700 exec_set = -1;
2701 emit_move_insn (exec, GEN_INT (exec_set));
2702 }
2703 emit_insn (gen_addv64si3_vcc_dup_exec
2704 (gcn_operand_part (V64SImode, vsp, 0),
2705 offreg, gcn_operand_part (V64SImode, vsp, 0),
2706 vcc, gcn_gen_undef (V64SImode), exec));
2707 emit_insn (gen_addcv64si3_exec
2708 (gcn_operand_part (V64SImode, vsp, 1),
2709 gcn_operand_part (V64SImode, vsp, 1),
2710 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2711
2712 offset += size;
2713 }
2714
2715 rtx move_vectors = get_insns ();
2716 end_sequence ();
2717
2718 if (prologue)
2719 {
2720 emit_insn (move_scalars);
2721 emit_insn (move_vectors);
2722 }
2723 else
2724 {
2725 emit_insn (move_vectors);
2726 emit_insn (move_scalars);
2727 }
2728}
2729
2730/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2731
2732 For a non-kernel function, the stack layout looks like this (interim),
2733 growing *upwards*:
2734
2735 hi | + ...
2736 |__________________| <-- current SP
2737 | outgoing args |
2738 |__________________|
2739 | (alloca space) |
2740 |__________________|
2741 | local vars |
2742 |__________________| <-- FP/hard FP
2743 | callee-save regs |
2744 |__________________| <-- soft arg pointer
2745 | pretend args |
2746 |__________________| <-- incoming SP
2747 | incoming args |
2748 lo |..................|
2749
2750 This implies arguments (beyond the first N in registers) must grow
2751 downwards (as, apparently, PA has them do).
2752
2753 For a kernel function we have the simpler:
2754
2755 hi | + ...
2756 |__________________| <-- current SP
2757 | outgoing args |
2758 |__________________|
2759 | (alloca space) |
2760 |__________________|
2761 | local vars |
2762 lo |__________________| <-- FP/hard FP
2763
2764*/
2765
2766void
2767gcn_expand_prologue ()
2768{
2769 machine_function *offsets = gcn_compute_frame_offsets ();
2770
2771 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2772 {
2773 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2774 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2775
2776 start_sequence ();
2777
2778 if (offsets->pretend_size > 0)
2779 {
2780 /* FIXME: Do the actual saving of register pretend args to the stack.
2781 Register order needs consideration. */
2782 }
2783
2784 /* Save callee-save regs. */
2785 move_callee_saved_registers (sp, offsets, true);
2786
2787 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2788 + offsets->callee_saves
2789 + offsets->local_vars + offsets->outgoing_args_size;
2790 if (sp_adjust > 0)
2791 emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode)));
2792
2793 if (offsets->need_frame_pointer)
2794 emit_insn (gen_adddi3_scc (fp, sp,
2795 gen_int_mode
2796 (-(offsets->local_vars +
2797 offsets->outgoing_args_size),
2798 DImode)));
2799
2800 rtx_insn *seq = get_insns ();
2801 end_sequence ();
2802
2803 /* FIXME: Prologue insns should have this flag set for debug output, etc.
2804 but it causes issues for now.
2805 for (insn = seq; insn; insn = NEXT_INSN (insn))
2806 if (INSN_P (insn))
2807 RTX_FRAME_RELATED_P (insn) = 1;*/
2808
2809 emit_insn (seq);
2810 }
2811 else
2812 {
2813 rtx wave_offset = gen_rtx_REG (SImode,
2814 cfun->machine->args.
2815 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
2816
2817 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
2818 {
2819 rtx fs_init_lo =
2820 gen_rtx_REG (SImode,
2821 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
2822 rtx fs_init_hi =
2823 gen_rtx_REG (SImode,
2824 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
2825 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
2826 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
2827
2828 /*rtx queue = gen_rtx_REG(DImode,
2829 cfun->machine->args.reg[QUEUE_PTR_ARG]);
2830 rtx aperture = gen_rtx_MEM (SImode,
2831 gen_rtx_PLUS (DImode, queue,
2832 gen_int_mode (68, SImode)));
2833 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
2834
2835 /* Set up flat_scratch. */
2836 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
2837 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
2838 gen_int_mode (8, SImode)));
2839 emit_move_insn (fs_reg_lo, fs_init_hi);
2840 }
2841
2842 /* Set up frame pointer and stack pointer. */
2843 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
2844 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
2845 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
2846 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
2847
2848 HOST_WIDE_INT sp_adjust = (offsets->local_vars
2849 + offsets->outgoing_args_size);
2850
2851 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
2852 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
2853 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
2854 gen_int_mode (0xffff, SImode)));
2855 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2856 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
2857 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
2858
2859 if (sp_adjust > 0)
2860 emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode)));
2861 else
2862 emit_move_insn (sp, fp);
2863
2864 /* Make sure the flat scratch reg doesn't get optimised away. */
2865 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
2866 }
2867
2868 /* Ensure that the scheduler doesn't do anything unexpected. */
2869 emit_insn (gen_blockage ());
2870
2871 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
2872 gen_int_mode (LDS_SIZE, SImode));
2873
2874 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
2875
2876 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
2877 {
2878 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
2879 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2880 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
2881 "gomp_gcn_enter_kernel"));
2882 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2883 }
2884}
2885
2886/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
2887
2888 See gcn_expand_prologue for stack details. */
2889
2890void
2891gcn_expand_epilogue (void)
2892{
2893 /* Ensure that the scheduler doesn't do anything unexpected. */
2894 emit_insn (gen_blockage ());
2895
2896 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2897 {
2898 machine_function *offsets = gcn_compute_frame_offsets ();
2899 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2900 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2901
2902 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
2903
2904 if (offsets->need_frame_pointer)
2905 {
2906 /* Restore old SP from the frame pointer. */
2907 if (sp_adjust > 0)
2908 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
2909 else
2910 emit_move_insn (sp, fp);
2911 }
2912 else
2913 {
2914 /* Restore old SP from current SP. */
2915 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
2916
2917 if (sp_adjust > 0)
2918 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
2919 }
2920
2921 move_callee_saved_registers (sp, offsets, false);
2922
2923 /* There's no explicit use of the link register on the return insn. Emit
2924 one here instead. */
2925 if (offsets->lr_needs_saving)
2926 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
2927
2928 /* Similar for frame pointer. */
2929 if (offsets->need_frame_pointer)
2930 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
2931 }
2932 else if (flag_openmp)
2933 {
2934 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
2935 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2936 emit_move_insn (fn_reg,
2937 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
2938 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2939 }
2940 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
2941 {
2942 /* Assume that an exit value compatible with gcn-run is expected.
2943 That is, the third input parameter is an int*.
2944
2945 We can't allocate any new registers, but the kernarg_reg is
2946 dead after this, so we'll use that. */
2947 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
2948 [KERNARG_SEGMENT_PTR_ARG]);
2949 rtx retptr_mem = gen_rtx_MEM (DImode,
2950 gen_rtx_PLUS (DImode, kernarg_reg,
2951 GEN_INT (16)));
2952 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
2953 emit_move_insn (kernarg_reg, retptr_mem);
2954
2955 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
2956 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
2957 emit_move_insn (retval_mem,
2958 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
2959 }
2960
2961 emit_jump_insn (gen_gcn_return ());
2962}
2963
2964/* Implement TARGET_CAN_ELIMINATE.
2965
2966 Return true if the compiler is allowed to try to replace register number
2967 FROM_REG with register number TO_REG.
2968
2969 FIXME: is the default "true" not enough? Should this be a negative set? */
2970
2971bool
2972gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
2973{
2974 return (to_reg == HARD_FRAME_POINTER_REGNUM
2975 || to_reg == STACK_POINTER_REGNUM);
2976}
2977
2978/* Implement INITIAL_ELIMINATION_OFFSET.
2979
2980 Returns the initial difference between the specified pair of registers, in
2981 terms of stack position. */
2982
2983HOST_WIDE_INT
2984gcn_initial_elimination_offset (int from, int to)
2985{
2986 machine_function *offsets = gcn_compute_frame_offsets ();
2987
2988 switch (from)
2989 {
2990 case ARG_POINTER_REGNUM:
2991 if (to == STACK_POINTER_REGNUM)
2992 return -(offsets->callee_saves + offsets->local_vars
2993 + offsets->outgoing_args_size);
2994 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
2995 return -offsets->callee_saves;
2996 else
2997 gcc_unreachable ();
2998 break;
2999
3000 case FRAME_POINTER_REGNUM:
3001 if (to == STACK_POINTER_REGNUM)
3002 return -(offsets->local_vars + offsets->outgoing_args_size);
3003 else if (to == HARD_FRAME_POINTER_REGNUM)
3004 return 0;
3005 else
3006 gcc_unreachable ();
3007 break;
3008
3009 default:
3010 gcc_unreachable ();
3011 }
3012}
3013
3014/* Implement HARD_REGNO_RENAME_OK.
3015
3016 Return true if it is permissible to rename a hard register from
3017 FROM_REG to TO_REG. */
3018
3019bool
3020gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3021{
3022 if (from_reg == SCC_REG
3023 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3024 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3025 || to_reg == SCC_REG
3026 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3027 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3028 return false;
3029
3030 /* Allow the link register to be used if it was saved. */
3031 if ((to_reg & ~1) == LINK_REGNUM)
3032 return !cfun || cfun->machine->lr_needs_saving;
3033
3034 /* Allow the registers used for the static chain to be used if the chain is
3035 not in active use. */
3036 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3037 return !cfun
3038 || !(cfun->static_chain_decl
3039 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3040 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3041
3042 return true;
3043}
3044
3045/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3046
3047 Which mode is required for saving NREGS of a pseudo-register in
3048 call-clobbered hard register REGNO. */
3049
3050machine_mode
3051gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3052 machine_mode regmode)
3053{
3054 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3055
3056 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3057 result = (nregs == 1 ? SImode : DImode);
3058
3059 return result;
3060}
3061
3062/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3063
3064 Output assembler code for a block containing the constant parts
3065 of a trampoline, leaving space for the variable parts. */
3066
3067static void
3068gcn_asm_trampoline_template (FILE *f)
3069{
3070 /* The source operand of the move instructions must be a 32-bit
3071 constant following the opcode. */
3072 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3073 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3074 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3075 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3076 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3077}
3078
3079/* Implement TARGET_TRAMPOLINE_INIT.
3080
3081 Emit RTL insns to initialize the variable parts of a trampoline.
3082 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3083 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3084 to be passed to the target function. */
3085
3086static void
3087gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3088{
3089 if (TARGET_GCN5_PLUS)
3090 sorry ("nested function trampolines not supported on GCN5 due to"
3091 " non-executable stacks");
3092
3093 emit_block_move (m_tramp, assemble_trampoline_template (),
3094 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3095
3096 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3097 rtx chain_value_reg = copy_to_reg (chain_value);
3098 rtx fnaddr_reg = copy_to_reg (fnaddr);
3099
3100 for (int i = 0; i < 4; i++)
3101 {
3102 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3103 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3104 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3105 }
3106
3107 rtx tramp_addr = XEXP (m_tramp, 0);
3108 emit_insn (gen_clear_icache (tramp_addr,
3109 plus_constant (ptr_mode, tramp_addr,
3110 TRAMPOLINE_SIZE)));
3111}
3112
3113/* }}} */
3114/* {{{ Miscellaneous. */
3115
3116/* Implement TARGET_CANNOT_COPY_INSN_P.
3117
3118 Return true if INSN must not be duplicated. */
3119
3120static bool
3121gcn_cannot_copy_insn_p (rtx_insn *insn)
3122{
3123 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3124 return true;
3125
3126 return false;
3127}
3128
3129/* Implement TARGET_DEBUG_UNWIND_INFO.
3130
3131 Defines the mechanism that will be used for describing frame unwind
3132 information to the debugger. */
3133
3134static enum unwind_info_type
3135gcn_debug_unwind_info ()
3136{
3137 /* No support for debug info, yet. */
3138 return UI_NONE;
3139}
3140
3141/* Determine if there is a suitable hardware conversion instruction.
3142 Used primarily by the machine description. */
3143
3144bool
3145gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3146{
3147 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3148 return false;
3149
3150 if (VECTOR_MODE_P (from))
3151 {
3152 from = GET_MODE_INNER (from);
3153 to = GET_MODE_INNER (to);
3154 }
3155
3156 switch (op)
3157 {
3158 case fix_trunc_cvt:
3159 case fixuns_trunc_cvt:
3160 if (GET_MODE_CLASS (from) != MODE_FLOAT
3161 || GET_MODE_CLASS (to) != MODE_INT)
3162 return false;
3163 break;
3164 case float_cvt:
3165 case floatuns_cvt:
3166 if (GET_MODE_CLASS (from) != MODE_INT
3167 || GET_MODE_CLASS (to) != MODE_FLOAT)
3168 return false;
3169 break;
3170 case extend_cvt:
3171 if (GET_MODE_CLASS (from) != MODE_FLOAT
3172 || GET_MODE_CLASS (to) != MODE_FLOAT
3173 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3174 return false;
3175 break;
3176 case trunc_cvt:
3177 if (GET_MODE_CLASS (from) != MODE_FLOAT
3178 || GET_MODE_CLASS (to) != MODE_FLOAT
3179 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3180 return false;
3181 break;
3182 }
3183
3184 return ((to == HImode && from == HFmode)
3185 || (to == SImode && (from == SFmode || from == DFmode))
3186 || (to == HFmode && (from == HImode || from == SFmode))
3187 || (to == SFmode && (from == SImode || from == HFmode
3188 || from == DFmode))
3189 || (to == DFmode && (from == SImode || from == SFmode)));
3190}
3191
3192/* Implement TARGET_EMUTLS_VAR_INIT.
3193
3194 Disable emutls (gthr-gcn.h does not support it, yet). */
3195
3196tree
3197gcn_emutls_var_init (tree, tree decl, tree)
3198{
3199 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3200}
3201
3202/* }}} */
3203/* {{{ Costs. */
3204
3205/* Implement TARGET_RTX_COSTS.
3206
3207 Compute a (partial) cost for rtx X. Return true if the complete
3208 cost has been computed, and false if subexpressions should be
3209 scanned. In either case, *TOTAL contains the cost result. */
3210
3211static bool
3212gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3213{
3214 enum rtx_code code = GET_CODE (x);
3215 switch (code)
3216 {
3217 case CONST:
3218 case CONST_DOUBLE:
3219 case CONST_VECTOR:
3220 case CONST_INT:
3221 if (gcn_inline_constant_p (x))
3222 *total = 0;
3223 else if (code == CONST_INT
3224 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3225 *total = 1;
3226 else if (gcn_constant_p (x))
3227 *total = 2;
3228 else
3229 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3230 return true;
3231
3232 case DIV:
3233 *total = 100;
3234 return false;
3235
3236 default:
3237 *total = 3;
3238 return false;
3239 }
3240}
3241
3242/* Implement TARGET_MEMORY_MOVE_COST.
3243
3244 Return the cost of moving data of mode M between a
3245 register and memory. A value of 2 is the default; this cost is
3246 relative to those in `REGISTER_MOVE_COST'.
3247
3248 This function is used extensively by register_move_cost that is used to
3249 build tables at startup. Make it inline in this case.
3250 When IN is 2, return maximum of in and out move cost.
3251
3252 If moving between registers and memory is more expensive than
3253 between two registers, you should define this macro to express the
3254 relative cost.
3255
3256 Model also increased moving costs of QImode registers in non
3257 Q_REGS classes. */
3258
3259#define LOAD_COST 32
3260#define STORE_COST 32
3261static int
3262gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3263{
3264 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3265 switch (regclass)
3266 {
3267 case SCC_CONDITIONAL_REG:
3268 case VCCZ_CONDITIONAL_REG:
3269 case VCC_CONDITIONAL_REG:
3270 case EXECZ_CONDITIONAL_REG:
3271 case ALL_CONDITIONAL_REGS:
3272 case SGPR_REGS:
3273 case SGPR_EXEC_REGS:
3274 case EXEC_MASK_REG:
3275 case SGPR_VOP_SRC_REGS:
3276 case SGPR_MEM_SRC_REGS:
3277 case SGPR_SRC_REGS:
3278 case SGPR_DST_REGS:
3279 case GENERAL_REGS:
3280 case AFP_REGS:
3281 if (!in)
3282 return (STORE_COST + 2) * nregs;
3283 return LOAD_COST * nregs;
3284 case VGPR_REGS:
3285 if (in)
3286 return (LOAD_COST + 2) * nregs;
3287 return STORE_COST * nregs;
3288 case ALL_REGS:
3289 case ALL_GPR_REGS:
3290 case SRCDST_REGS:
3291 if (in)
3292 return (LOAD_COST + 2) * nregs;
3293 return (STORE_COST + 2) * nregs;
3294 default:
3295 gcc_unreachable ();
3296 }
3297}
3298
3299/* Implement TARGET_REGISTER_MOVE_COST.
3300
3301 Return the cost of moving data from a register in class CLASS1 to
3302 one in class CLASS2. Base value is 2. */
3303
3304static int
3305gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3306{
3307 /* Increase cost of moving from and to vector registers. While this is
3308 fast in hardware (I think), it has hidden cost of setting up the exec
3309 flags. */
3310 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3311 return 4;
3312 return 2;
3313}
3314
3315/* }}} */
3316/* {{{ Builtins. */
3317
3318/* Type codes used by GCN built-in definitions. */
3319
3320enum gcn_builtin_type_index
3321{
3322 GCN_BTI_END_OF_PARAMS,
3323
3324 GCN_BTI_VOID,
3325 GCN_BTI_BOOL,
3326 GCN_BTI_INT,
3327 GCN_BTI_UINT,
3328 GCN_BTI_SIZE_T,
3329 GCN_BTI_LLINT,
3330 GCN_BTI_LLUINT,
3331 GCN_BTI_EXEC,
3332
3333 GCN_BTI_SF,
3334 GCN_BTI_V64SI,
3335 GCN_BTI_V64SF,
3336 GCN_BTI_V64PTR,
3337 GCN_BTI_SIPTR,
3338 GCN_BTI_SFPTR,
3339 GCN_BTI_VOIDPTR,
3340
3341 GCN_BTI_LDS_VOIDPTR,
3342
3343 GCN_BTI_MAX
3344};
3345
3346static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3347
3348#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3349#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3350#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3351#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3352#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3353#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3354#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3355#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3356#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3357
3358static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3359 struct gcn_builtin_description *);
3360static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3361 struct gcn_builtin_description *);
3362
3363struct gcn_builtin_description;
3364typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3365 struct gcn_builtin_description *);
3366
3367enum gcn_builtin_type
3368{
3369 B_UNIMPLEMENTED, /* Sorry out */
3370 B_INSN, /* Emit a pattern */
3371 B_OVERLOAD /* Placeholder for an overloaded function */
3372};
3373
3374struct gcn_builtin_description
3375{
3376 int fcode;
3377 int icode;
3378 const char *name;
3379 enum gcn_builtin_type type;
3380 /* The first element of parm is always the return type. The rest
3381 are a zero terminated list of parameters. */
3382 int parm[6];
3383 gcn_builtin_expander expander;
3384};
3385
3386/* Read in the GCN builtins from gcn-builtins.def. */
3387
3388extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3389
3390struct gcn_builtin_description gcn_builtins[] = {
3391#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3392 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3393
3394#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3395 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3396 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3397 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3398 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3399 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3400 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3401 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3402 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3403
3404#include "gcn-builtins.def"
3405#undef DEF_BUILTIN_BINOP_INT_FP
3406#undef DEF_BUILTIN
3407};
3408
3409static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3410
3411/* Implement TARGET_BUILTIN_DECL.
3412
3413 Return the GCN builtin for CODE. */
3414
3415tree
3416gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3417{
3418 if (code >= GCN_BUILTIN_MAX)
3419 return error_mark_node;
3420
3421 return gcn_builtin_decls[code];
3422}
3423
3424/* Helper function for gcn_init_builtins. */
3425
3426static void
3427gcn_init_builtin_types (void)
3428{
3429 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3430 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3431 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3432 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3433 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3434 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3435 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3436
3437 exec_type_node = unsigned_intDI_type_node;
3438 sf_type_node = float32_type_node;
3439 v64si_type_node = build_vector_type (intSI_type_node, 64);
3440 v64sf_type_node = build_vector_type (float_type_node, 64);
3441 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3442 /*build_pointer_type
3443 (integer_type_node) */
3444 , 64);
3445 tree tmp = build_distinct_type_copy (intSI_type_node);
3446 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3447 siptr_type_node = build_pointer_type (tmp);
3448
3449 tmp = build_distinct_type_copy (float_type_node);
3450 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3451 sfptr_type_node = build_pointer_type (tmp);
3452
3453 tmp = build_distinct_type_copy (void_type_node);
3454 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3455 voidptr_type_node = build_pointer_type (tmp);
3456
3457 tmp = build_distinct_type_copy (void_type_node);
3458 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3459 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3460}
3461
3462/* Implement TARGET_INIT_BUILTINS.
3463
3464 Set up all builtin functions for this target. */
3465
3466static void
3467gcn_init_builtins (void)
3468{
3469 gcn_init_builtin_types ();
3470
3471 struct gcn_builtin_description *d;
3472 unsigned int i;
3473 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3474 {
3475 tree p;
3476 char name[64]; /* build_function will make a copy. */
3477 int parm;
3478
3479 /* FIXME: Is this necessary/useful? */
3480 if (d->name == 0)
3481 continue;
3482
3483 /* Find last parm. */
3484 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3485 ;
3486
3487 p = void_list_node;
3488 while (parm > 1)
3489 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3490
3491 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3492
3493 sprintf (name, "__builtin_gcn_%s", d->name);
3494 gcn_builtin_decls[i]
3495 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3496
3497 /* These builtins don't throw. */
3498 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3499 }
3500
3501/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
3502#ifdef BUILT_IN_GOACC_SINGLE_START
3503 /* These builtins need to take/return an LDS pointer: override the generic
3504 versions here. */
3505
3506 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3507 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3508
3509 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3510 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3511 false);
3512
3513 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3514 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3515 false);
3516
3517 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3518 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3519#endif
3520}
3521
3522/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3523 not require taking the address of any object, other than the memory
3524 cell being operated on.
3525
3526 Helper function for gcn_expand_builtin_1. */
3527
3528static rtx
3529gcn_expand_cmp_swap (tree exp, rtx target)
3530{
3531 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3532 addr_space_t as
3533 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3534 machine_mode as_mode = gcn_addr_space_address_mode (as);
3535
3536 if (!target)
3537 target = gen_reg_rtx (mode);
3538
3539 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3540 NULL_RTX, as_mode, EXPAND_NORMAL);
3541 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3542 NULL_RTX, mode, EXPAND_NORMAL);
3543 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3544 NULL_RTX, mode, EXPAND_NORMAL);
3545 rtx pat;
3546
3547 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3548 set_mem_addr_space (mem, as);
3549
3550 if (!REG_P (cmp))
3551 cmp = copy_to_mode_reg (mode, cmp);
3552 if (!REG_P (src))
3553 src = copy_to_mode_reg (mode, src);
3554
3555 if (mode == SImode)
3556 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3557 else
3558 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3559
3560 emit_insn (pat);
3561
3562 return target;
3563}
3564
3565/* Expand many different builtins.
3566
3567 Intended for use in gcn-builtins.def. */
3568
3569static rtx
3570gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3571 machine_mode /*mode */ , int ignore,
3572 struct gcn_builtin_description *)
3573{
3574 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3575 switch (DECL_MD_FUNCTION_CODE (fndecl))
3576 {
3577 case GCN_BUILTIN_FLAT_LOAD_INT32:
3578 {
3579 if (ignore)
3580 return target;
3581 /*rtx exec = */
3582 force_reg (DImode,
3583 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3584 EXPAND_NORMAL));
3585 /*rtx ptr = */
3586 force_reg (V64DImode,
3587 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3588 EXPAND_NORMAL));
3589 /*emit_insn (gen_vector_flat_loadv64si
3590 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3591 return target;
3592 }
3593 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3594 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3595 {
3596 if (ignore)
3597 return target;
3598 rtx exec = force_reg (DImode,
3599 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3600 DImode,
3601 EXPAND_NORMAL));
3602 rtx ptr = force_reg (DImode,
3603 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3604 V64DImode,
3605 EXPAND_NORMAL));
3606 rtx offsets = force_reg (V64SImode,
3607 expand_expr (CALL_EXPR_ARG (exp, 2),
3608 NULL_RTX, V64DImode,
3609 EXPAND_NORMAL));
3610 rtx addrs = gen_reg_rtx (V64DImode);
3611 rtx tmp = gen_reg_rtx (V64SImode);
3612 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3613 GEN_INT (2),
3614 gcn_gen_undef (V64SImode), exec));
3615 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3616 gcn_gen_undef (V64DImode),
3617 exec));
3618 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3619 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3620 /* FIXME: set attributes. */
3621 emit_insn (gen_mov_with_exec (target, mem, exec));
3622 return target;
3623 }
3624 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3625 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3626 {
3627 rtx exec = force_reg (DImode,
3628 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3629 DImode,
3630 EXPAND_NORMAL));
3631 rtx ptr = force_reg (DImode,
3632 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3633 V64DImode,
3634 EXPAND_NORMAL));
3635 rtx offsets = force_reg (V64SImode,
3636 expand_expr (CALL_EXPR_ARG (exp, 2),
3637 NULL_RTX, V64DImode,
3638 EXPAND_NORMAL));
3639 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3640 3)));
3641 rtx val = force_reg (vmode,
3642 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3643 vmode,
3644 EXPAND_NORMAL));
3645 rtx addrs = gen_reg_rtx (V64DImode);
3646 rtx tmp = gen_reg_rtx (V64SImode);
3647 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3648 GEN_INT (2),
3649 gcn_gen_undef (V64SImode), exec));
3650 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3651 gcn_gen_undef (V64DImode),
3652 exec));
3653 rtx mem = gen_rtx_MEM (vmode, addrs);
3654 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3655 /* FIXME: set attributes. */
3656 emit_insn (gen_mov_with_exec (mem, val, exec));
3657 return target;
3658 }
3659 case GCN_BUILTIN_SQRTVF:
3660 {
3661 if (ignore)
3662 return target;
3663 rtx exec = gcn_full_exec_reg ();
3664 rtx arg = force_reg (V64SFmode,
3665 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3666 V64SFmode,
3667 EXPAND_NORMAL));
3668 emit_insn (gen_sqrtv64sf2_exec
3669 (target, arg, gcn_gen_undef (V64SFmode), exec));
3670 return target;
3671 }
3672 case GCN_BUILTIN_SQRTF:
3673 {
3674 if (ignore)
3675 return target;
3676 rtx arg = force_reg (SFmode,
3677 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3678 SFmode,
3679 EXPAND_NORMAL));
3680 emit_insn (gen_sqrtsf2 (target, arg));
3681 return target;
3682 }
3683 case GCN_BUILTIN_OMP_DIM_SIZE:
3684 {
3685 if (ignore)
3686 return target;
3687 emit_insn (gen_oacc_dim_size (target,
3688 expand_expr (CALL_EXPR_ARG (exp, 0),
3689 NULL_RTX, SImode,
3690 EXPAND_NORMAL)));
3691 return target;
3692 }
3693 case GCN_BUILTIN_OMP_DIM_POS:
3694 {
3695 if (ignore)
3696 return target;
3697 emit_insn (gen_oacc_dim_pos (target,
3698 expand_expr (CALL_EXPR_ARG (exp, 0),
3699 NULL_RTX, SImode,
3700 EXPAND_NORMAL)));
3701 return target;
3702 }
3703 case GCN_BUILTIN_CMP_SWAP:
3704 case GCN_BUILTIN_CMP_SWAPLL:
3705 return gcn_expand_cmp_swap (exp, target);
3706
3707 case GCN_BUILTIN_ACC_SINGLE_START:
3708 {
3709 if (ignore)
3710 return target;
3711
3712 rtx wavefront = gcn_oacc_dim_pos (1);
3713 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3714 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3715 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3716 return cc;
3717 }
3718
3719 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3720 {
3721 rtx blk = force_reg (SImode,
3722 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3723 SImode, EXPAND_NORMAL));
3724 rtx wavefront = gcn_oacc_dim_pos (1);
3725 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3726 rtx not_zero = gen_label_rtx ();
3727 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3728 emit_move_insn (blk, const0_rtx);
3729 emit_label (not_zero);
3730 return blk;
3731 }
3732
3733 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3734 return target;
3735
3736 case GCN_BUILTIN_ACC_BARRIER:
3737 emit_insn (gen_gcn_wavefront_barrier ());
3738 return target;
3739
3740 default:
3741 gcc_unreachable ();
3742 }
3743}
3744
3745/* Expansion of simple arithmetic and bit binary operation builtins.
3746
3747 Intended for use with gcn_builtins table. */
3748
3749static rtx
3750gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
3751 machine_mode /*mode */ , int ignore,
3752 struct gcn_builtin_description *d)
3753{
3754 int icode = d->icode;
3755 if (ignore)
3756 return target;
3757
3758 rtx exec = force_reg (DImode,
3759 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3760 EXPAND_NORMAL));
3761
3762 machine_mode m1 = insn_data[icode].operand[1].mode;
3763 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
3764 EXPAND_NORMAL);
3765 if (!insn_data[icode].operand[1].predicate (arg1, m1))
3766 arg1 = force_reg (m1, arg1);
3767
3768 machine_mode m2 = insn_data[icode].operand[2].mode;
3769 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
3770 EXPAND_NORMAL);
3771 if (!insn_data[icode].operand[2].predicate (arg2, m2))
3772 arg2 = force_reg (m2, arg2);
3773
3774 rtx arg_prev;
3775 if (call_expr_nargs (exp) == 4)
3776 {
3777 machine_mode m_prev = insn_data[icode].operand[4].mode;
3778 arg_prev = force_reg (m_prev,
3779 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3780 m_prev, EXPAND_NORMAL));
3781 }
3782 else
3783 arg_prev = gcn_gen_undef (GET_MODE (target));
3784
3785 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
3786 emit_insn (pat);
3787 return target;
3788}
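/* Illustration (builtin name hypothetical): a three-argument call such as
   v = __builtin_gcn_some_binop (mask, a, b);
   expands to the exec-enabled pattern with operands
   (target, a, b, mask, <undef>), so lanes disabled in MASK are left
   undefined, whereas the four-argument form passes its extra argument in
   place of <undef> and those lanes keep that "previous" value instead.  */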
3789
3790/* Implement TARGET_EXPAND_BUILTIN.
3791
3792 Expand an expression EXP that calls a built-in function, with result going
3793 to TARGET if that's convenient (and in mode MODE if that's convenient).
3794 SUBTARGET may be used as the target for computing one of EXP's operands.
3795 IGNORE is nonzero if the value is to be ignored. */
3796
3797rtx
3798gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
3799 int ignore)
3800{
3801 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3802 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
3803 struct gcn_builtin_description *d;
3804
3805 gcc_assert (fcode < GCN_BUILTIN_MAX);
3806 d = &gcn_builtins[fcode];
3807
3808 if (d->type == B_UNIMPLEMENTED)
3809 sorry ("Builtin not implemented");
3810
3811 return d->expander (exp, target, subtarget, mode, ignore, d);
3812}
3813
3814/* }}} */
3815/* {{{ Vectorization. */
3816
3817/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
3818
3819 A vector mask is a value that holds one boolean result for every element in
3820 a vector. */
3821
3822opt_machine_mode
10116ec1 3823gcn_vectorize_get_mask_mode (machine_mode)
5326695a
AS
3824{
3825 /* GCN uses a DImode bit-mask. */
3826 return DImode;
3827}
3828
3829/* Return an RTX that references a vector with the i-th lane containing
3830 PERM[i]*4.
3831
3832 Helper function for gcn_vectorize_vec_perm_const. */
3833
3834static rtx
3835gcn_make_vec_perm_address (unsigned int *perm)
3836{
3837 rtx x = gen_reg_rtx (V64SImode);
3838 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
3839
3840 /* Permutation addresses use byte addressing. With each vector lane being
3841 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
3842 so only set those.
3843
3844 The permutation indices given to the vec_perm* patterns range from 0 to 2N-1 to
3845 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
3846 only take one source vector, the most-significant bit can be ignored
3847 here. Instead, we can use EXEC masking to select the relevant part of
3848 each source vector after they are permuted separately. */
3849 uint64_t bit_mask = 1 << 2;
3850 for (int i = 2; i < 8; i++, bit_mask <<= 1)
3851 {
3852 uint64_t exec_mask = 0;
3853 uint64_t lane_mask = 1;
3854 for (int j = 0; j < 64; j++, lane_mask <<= 1)
3855 if ((perm[j] * 4) & bit_mask)
3856 exec_mask |= lane_mask;
3857
3858 if (exec_mask)
3859 emit_insn (gen_addv64si3_exec (x, x,
3860 gcn_vec_constant (V64SImode,
3861 bit_mask),
3862 x, get_exec (exec_mask)));
3863 }
3864
3865 return x;
3866}
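/* A minimal host-side sketch (purely illustrative, not used anywhere): the
   masked vector adds above accumulate, lane by lane, the value that this
   plain loop would compute, i.e. bits 2..7 of PERM[lane] * 4:

     for (int lane = 0; lane < 64; lane++)
       addr[lane] = (perm[lane] * 4) & 0xfc;

   A selector of 64 or more therefore folds back onto the same byte address
   as its low six bits, which is what allows the separate EXEC-masked merge
   of the two source permutes.  */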
3867
3868/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
3869
3870 Return true if permutation with SEL is possible.
3871
3872 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
3873 permutations. */
3874
3875static bool
3876gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
3877 rtx src0, rtx src1,
3878 const vec_perm_indices & sel)
3879{
3880 unsigned int nelt = GET_MODE_NUNITS (vmode);
3881
3882 gcc_assert (VECTOR_MODE_P (vmode));
3883 gcc_assert (nelt <= 64);
3884 gcc_assert (sel.length () == nelt);
3885
3886 if (!dst)
3887 {
3888 /* All vector permutations are possible on this architecture,
3889 with varying degrees of efficiency depending on the permutation. */
3890 return true;
3891 }
3892
3893 unsigned int perm[64];
3894 for (unsigned int i = 0; i < nelt; ++i)
3895 perm[i] = sel[i] & (2 * nelt - 1);
3896
3897 /* Make life a bit easier by swapping operands if necessary so that
3898 the first element always comes from src0. */
3899 if (perm[0] >= nelt)
3900 {
3901 rtx temp = src0;
3902 src0 = src1;
3903 src1 = temp;
3904
3905 for (unsigned int i = 0; i < nelt; ++i)
3906 if (perm[i] < nelt)
3907 perm[i] += nelt;
3908 else
3909 perm[i] -= nelt;
3910 }
3911
3912 /* TODO: There are more efficient ways to implement certain permutations
3913 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
3914 falling back to this less efficient generic approach. */
3915
3916 int64_t src1_lanes = 0;
3917 int64_t lane_bit = 1;
3918
3919 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
3920 {
3921 /* Set the bits for lanes from src1. */
3922 if (perm[i] >= nelt)
3923 src1_lanes |= lane_bit;
3924 }
3925
3926 rtx addr = gcn_make_vec_perm_address (perm);
3927 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
3928
3929 switch (vmode)
3930 {
3931 case E_V64QImode:
3932 ds_bpermute = gen_ds_bpermutev64qi;
3933 break;
3934 case E_V64HImode:
3935 ds_bpermute = gen_ds_bpermutev64hi;
3936 break;
3937 case E_V64SImode:
3938 ds_bpermute = gen_ds_bpermutev64si;
3939 break;
3940 case E_V64HFmode:
3941 ds_bpermute = gen_ds_bpermutev64hf;
3942 break;
3943 case E_V64SFmode:
3944 ds_bpermute = gen_ds_bpermutev64sf;
3945 break;
3946 case E_V64DImode:
3947 ds_bpermute = gen_ds_bpermutev64di;
3948 break;
3949 case E_V64DFmode:
3950 ds_bpermute = gen_ds_bpermutev64df;
3951 break;
3952 default:
3953 gcc_assert (false);
3954 }
3955
3956 /* Load elements from src0 to dst. */
3957 gcc_assert (~src1_lanes);
3958 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
3959
3960 /* Load elements from src1 to dst. */
3961 if (src1_lanes)
3962 {
3963 /* Masking a lane masks both the destination and source lanes for
3964 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
3965 then add an extra masked move to merge the results of permuting
3966 the two source vectors together.
3967 */
3968 rtx tmp = gen_reg_rtx (vmode);
3969 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
3970 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
3971 }
3972
3973 return true;
3974}
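/* Worked example: for a V64SImode lane reversal, SEL[i] = 63 - i, so every
   element comes from SRC0, src1_lanes stays zero and a single ds_bpermute
   is emitted.  For a blend taking even lanes from SRC0 and odd lanes from
   SRC1, SEL[i] = i + (i & 1) * 64, src1_lanes becomes 0xaaaaaaaaaaaaaaaa
   and the exec-masked move above merges in the second permute's result.  */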
3975
3976/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
3977
3978 Return nonzero if vector MODE is supported with at least move
3979 instructions. */
3980
3981static bool
3982gcn_vector_mode_supported_p (machine_mode mode)
3983{
3984 /* FIXME: Enable V64QImode and V64HImode.
3985 We should support these modes, but vector operations are usually
3986 assumed to automatically truncate types, and GCN does not. We
3987 need to add explicit truncates and/or use SDWA for QI/HI insns. */
3988 return (/* mode == V64QImode || mode == V64HImode
3989 ||*/ mode == V64SImode || mode == V64DImode
3990 || mode == V64SFmode || mode == V64DFmode);
3991}
3992
3993/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
3994
3995 Enables autovectorization for all supported modes. */
3996
3997static machine_mode
3998gcn_vectorize_preferred_simd_mode (scalar_mode mode)
3999{
4000 switch (mode)
4001 {
4002 case E_QImode:
4003 return V64QImode;
4004 case E_HImode:
4005 return V64HImode;
4006 case E_SImode:
4007 return V64SImode;
4008 case E_DImode:
4009 return V64DImode;
4010 case E_SFmode:
4011 return V64SFmode;
4012 case E_DFmode:
4013 return V64DFmode;
4014 default:
4015 return word_mode;
4016 }
4017}
4018
4019/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4020
4021 Returns the preferred alignment in bits for accesses to vectors of type TYPE
4022 in vectorized code. This might be less than or greater than the ABI-defined
4023 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4024 of a single element, in which case the vectorizer will not try to optimize
4025 for alignment. */
4026
4027static poly_uint64
4028gcn_preferred_vector_alignment (const_tree type)
4029{
4030 return TYPE_ALIGN (TREE_TYPE (type));
4031}
4032
4033/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4034
4035 Return true if the target supports misaligned vector store/load of a
4036 specific factor denoted in the misalignment parameter. */
4037
4038static bool
4039gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4040 const_tree type, int misalignment,
4041 bool is_packed)
4042{
4043 if (is_packed)
4044 return false;
4045
4046 /* If the misalignment is unknown, we should be able to handle the access
4047 so long as it is not to a member of a packed data structure. */
4048 if (misalignment == -1)
4049 return true;
4050
4051 /* Return true if the misalignment is a multiple of the natural alignment
4052 of the vector's element type. This is probably always going to be
4053 true in practice, since we've already established that this isn't a
4054 packed access. */
4055 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4056}
4057
4058/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4059
4060 Return true if vector alignment is reachable (by peeling N iterations) for
4061 the given scalar type TYPE. */
4062
4063static bool
4064gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4065{
4066 /* Vectors which aren't in packed structures will not be less aligned than
4067 the natural alignment of their element type, so this is safe. */
4068 return !is_packed;
4069}
4070
4071/* Generate DPP instructions used for vector reductions.
4072
4073 The opcode is given by INSN.
4074 The first operand of the operation is shifted right by SHIFT vector lanes.
4075 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4076 broadcast to the next row (thereby acting like a shift of 16 for the end of
4077 each row). If SHIFT is 32, lane 31 is broadcast to all the
4078 following lanes (thereby acting like a shift of 32 for lane 63). */
4079
4080char *
4081gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4082 int unspec, int shift)
4083{
4084 static char buf[64];
4085 const char *dpp;
4086 const char *vcc_in = "";
4087 const char *vcc_out = "";
4088
4089 /* Add the vcc operand if needed. */
4090 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4091 {
4092 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4093 vcc_in = ", vcc";
4094
4095 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4096 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4097 vcc_out = ", vcc";
4098 }
4099
4100 /* Add the DPP modifiers. */
4101 switch (shift)
4102 {
4103 case 1:
4104 dpp = "row_shr:1 bound_ctrl:0";
4105 break;
4106 case 2:
4107 dpp = "row_shr:2 bound_ctrl:0";
4108 break;
4109 case 4:
4110 dpp = "row_shr:4 bank_mask:0xe";
4111 break;
4112 case 8:
4113 dpp = "row_shr:8 bank_mask:0xc";
4114 break;
4115 case 16:
4116 dpp = "row_bcast:15 row_mask:0xa";
4117 break;
4118 case 32:
4119 dpp = "row_bcast:31 row_mask:0xc";
4120 break;
4121 default:
4122 gcc_unreachable ();
4123 }
4124
4125 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4126
4127 return buf;
4128}
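/* Example of the returned template (mnemonic shown only for illustration):
   for SHIFT 1 and an ordinary vector op the buffer holds
     "<insn>\t%0, %1, %2 row_shr:1 bound_ctrl:0"
   while UNSPEC_PLUS_CARRY_DPP_SHR on an integer vector mode also names the
   implicit carry output, giving
     "<insn>\t%0, vcc, %1, %2 row_shr:1 bound_ctrl:0".  */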
4129
4130/* Generate vector reductions in terms of DPP instructions.
4131
4132 The vector register SRC of mode MODE is reduced using the operation given
4133 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4134 register. */
4135
4136rtx
4137gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4138{
4139 rtx tmp = gen_reg_rtx (mode);
4140 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4141 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4142 && (TARGET_GCN3 || mode == V64DImode);
4143
4144 if (use_plus_carry)
4145 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4146
4147 /* Perform reduction by first performing the reduction operation on every
4148 pair of lanes, then on every pair of results from the previous
4149 iteration (thereby effectively reducing every 4 lanes) and so on until
4150 all lanes are reduced. */
4151 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4152 {
4153 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4154 rtx insn = gen_rtx_SET (tmp,
4155 gen_rtx_UNSPEC (mode,
4156 gen_rtvec (3,
4157 src, src, shift_val),
4158 unspec));
4159
4160 /* Add clobber for instructions that set the carry flags. */
4161 if (use_plus_carry)
4162 {
4163 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4164 gen_rtx_REG (DImode, VCC_REG));
4165 insn = gen_rtx_PARALLEL (VOIDmode,
4166 gen_rtvec (2, insn, clobber));
4167 }
4168
4169 emit_insn (insn);
4170
4171 /* The source operands for every iteration after the first
4172 should be TMP. */
4173 src = tmp;
4174 }
4175
4176 return tmp;
4177}
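/* Worked example, assuming an addition reduction: at each of the six steps
   lane 63 is combined with lane 63 - SHIFT, so it successively holds the
   sum of lanes 62-63, 60-63, 56-63, 48-63, 32-63 and finally 0-63, which
   is why callers read the scalar result from lane 63.  */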
4178
4179/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4180
4181int
4182gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4183 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4184{
4185 /* Always vectorize. */
4186 return 1;
4187}
4188
4189/* }}} */
4190/* {{{ md_reorg pass. */
4191
4192/* Identify VMEM instructions from their "type" attribute. */
4193
4194static bool
4195gcn_vmem_insn_p (attr_type type)
4196{
4197 switch (type)
4198 {
4199 case TYPE_MUBUF:
4200 case TYPE_MTBUF:
4201 case TYPE_FLAT:
4202 return true;
4203 case TYPE_UNKNOWN:
4204 case TYPE_SOP1:
4205 case TYPE_SOP2:
4206 case TYPE_SOPK:
4207 case TYPE_SOPC:
4208 case TYPE_SOPP:
4209 case TYPE_SMEM:
4210 case TYPE_DS:
4211 case TYPE_VOP2:
4212 case TYPE_VOP1:
4213 case TYPE_VOPC:
4214 case TYPE_VOP3A:
4215 case TYPE_VOP3B:
4216 case TYPE_VOP_SDWA:
4217 case TYPE_VOP_DPP:
4218 case TYPE_MULT:
4219 case TYPE_VMULT:
4220 return false;
4221 }
4222 gcc_unreachable ();
4223 return false;
4224}
4225
4226/* If INSN sets the EXEC register to a constant value, return the value,
4227 otherwise return zero. */
4228
4229static int64_t
4230gcn_insn_exec_value (rtx_insn *insn)
4231{
4232 if (!NONDEBUG_INSN_P (insn))
4233 return 0;
4234
4235 rtx pattern = PATTERN (insn);
4236
4237 if (GET_CODE (pattern) == SET)
4238 {
4239 rtx dest = XEXP (pattern, 0);
4240 rtx src = XEXP (pattern, 1);
4241
4242 if (GET_MODE (dest) == DImode
4243 && REG_P (dest) && REGNO (dest) == EXEC_REG
4244 && CONST_INT_P (src))
4245 return INTVAL (src);
4246 }
4247
4248 return 0;
4249}
4250
4251/* Sets the EXEC register before INSN to the value that it had after
4252 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4253 known, otherwise it returns zero. */
4254
4255static int64_t
4256gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4257 bool curr_exec_known, bool &last_exec_def_saved)
4258{
4259 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4260 rtx exec;
4261
4262 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4263
4264 if (exec_value)
4265 {
4266 /* If the EXEC value is a constant and it happens to be the same as the
4267 current EXEC value, the restore can be skipped. */
4268 if (curr_exec_known && exec_value == curr_exec)
4269 return exec_value;
4270
4271 exec = GEN_INT (exec_value);
4272 }
4273 else
4274 {
4275 /* If the EXEC value is not a constant, save it in a register after the
4276 point of definition. */
4277 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4278
4279 if (!last_exec_def_saved)
4280 {
4281 start_sequence ();
4282 emit_move_insn (exec_save_reg, exec_reg);
4283 rtx_insn *seq = get_insns ();
4284 end_sequence ();
4285
4286 emit_insn_after (seq, last_exec_def);
4287 if (dump_file && (dump_flags & TDF_DETAILS))
4288 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4289 INSN_UID (last_exec_def));
4290
4291 last_exec_def_saved = true;
4292 }
4293
4294 exec = exec_save_reg;
4295 }
4296
4297 /* Restore EXEC register before the usage. */
4298 start_sequence ();
4299 emit_move_insn (exec_reg, exec);
4300 rtx_insn *seq = get_insns ();
4301 end_sequence ();
4302 emit_insn_before (seq, insn);
4303
4304 if (dump_file && (dump_flags & TDF_DETAILS))
4305 {
4306 if (exec_value)
4307 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4308 exec_value, INSN_UID (insn));
4309 else
4310 fprintf (dump_file,
4311 "Restoring EXEC from saved value before insn %d.\n",
4312 INSN_UID (insn));
4313 }
4314
4315 return exec_value;
4316}
4317
4318/* Implement TARGET_MACHINE_DEPENDENT_REORG.
4319
4320 Ensure that pipeline dependencies and lane masking are set correctly. */
4321
4322static void
4323gcn_md_reorg (void)
4324{
4325 basic_block bb;
4326 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4327 rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG);
4328 rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG);
4329 regset_head live;
4330
4331 INIT_REG_SET (&live);
4332
4333 compute_bb_for_insn ();
4334
4335 if (!optimize)
4336 {
4337 split_all_insns ();
4338 if (dump_file && (dump_flags & TDF_DETAILS))
4339 {
4340 fprintf (dump_file, "After split:\n");
4341 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4342 }
4343
4344 /* Update data-flow information for split instructions. */
4345 df_insn_rescan_all ();
4346 }
4347
4348 df_analyze ();
4349
4350 /* This pass ensures that the EXEC register is set correctly, according
4351 to the "exec" attribute. However, care must be taken so that the
4352 value that reaches explicit uses of the EXEC register remains the
4353 same as before.
4354 */
4355
4356 FOR_EACH_BB_FN (bb, cfun)
4357 {
4358 if (dump_file && (dump_flags & TDF_DETAILS))
4359 fprintf (dump_file, "BB %d:\n", bb->index);
4360
4361 rtx_insn *insn, *curr;
4362 rtx_insn *last_exec_def = BB_HEAD (bb);
4363 bool last_exec_def_saved = false;
4364 bool curr_exec_explicit = true;
4365 bool curr_exec_known = true;
4366 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4367 after last_exec_def is executed'. */
4368
4369 FOR_BB_INSNS_SAFE (bb, insn, curr)
4370 {
4371 if (!NONDEBUG_INSN_P (insn))
4372 continue;
4373
4374 if (GET_CODE (PATTERN (insn)) == USE
4375 || GET_CODE (PATTERN (insn)) == CLOBBER)
4376 continue;
4377
4378 HARD_REG_SET defs, uses;
4379 CLEAR_HARD_REG_SET (defs);
4380 CLEAR_HARD_REG_SET (uses);
4381 note_stores (insn, record_hard_reg_sets, &defs);
4382 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4383
4384 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4385 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4386 bool exec_used = (hard_reg_set_intersect_p
4387 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4388 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4389
4390 /* Check the instruction for implicit setting of EXEC via an
4391 attribute. */
4392 attr_exec exec_attr = get_attr_exec (insn);
4393 int64_t new_exec;
4394
4395 switch (exec_attr)
4396 {
4397 case EXEC_NONE:
4398 new_exec = 0;
4399 break;
4400
4401 case EXEC_SINGLE:
4402 /* Instructions that do not involve memory accesses only require
4403 bit 0 of EXEC to be set. */
4404 if (gcn_vmem_insn_p (get_attr_type (insn))
4405 || get_attr_type (insn) == TYPE_DS)
4406 new_exec = 1;
4407 else
4408 new_exec = curr_exec | 1;
4409 break;
4410
4411 case EXEC_FULL:
4412 new_exec = -1;
4413 break;
4414
4415 default: /* Auto-detect what setting is appropriate. */
4416 {
4417 new_exec = 0;
4418
4419 /* If EXEC is referenced explicitly then we don't need to do
4420 anything to set it, so we're done. */
4421 if (exec_used)
4422 break;
4423
4424 /* Scan the insn for VGPRs defs or uses. The mode determines
4425 what kind of exec is needed. */
4426 subrtx_iterator::array_type array;
4427 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4428 {
4429 const_rtx x = *iter;
4430 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4431 {
4432 if (VECTOR_MODE_P (GET_MODE (x)))
4433 {
4434 new_exec = -1;
4435 break;
4436 }
4437 else
4438 new_exec = 1;
4439 }
4440 }
4441 }
4442 break;
4443 }
4444
4445 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4446 {
4447 start_sequence ();
4448 emit_move_insn (exec_reg, GEN_INT (new_exec));
4449 rtx_insn *seq = get_insns ();
4450 end_sequence ();
4451 emit_insn_before (seq, insn);
4452
4453 if (dump_file && (dump_flags & TDF_DETAILS))
4454 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4455 new_exec, INSN_UID (insn));
4456
4457 curr_exec = new_exec;
4458 curr_exec_explicit = false;
4459 curr_exec_known = true;
4460 }
4461 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4462 {
4463 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4464 new_exec, INSN_UID (insn));
4465 }
4466
4467 /* The state of the EXEC register is unknown after a
4468 function call. */
4469 if (CALL_P (insn))
4470 curr_exec_known = false;
4471
4472 /* Handle explicit uses of EXEC. If the instruction is a partial
4473 explicit definition of EXEC, then treat it as an explicit use of
4474 EXEC as well. */
4475 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4476 {
4477 /* An instruction that explicitly uses EXEC should not also
4478 implicitly define it. */
4479 gcc_assert (!exec_used || !new_exec);
4480
4481 if (!curr_exec_known || !curr_exec_explicit)
4482 {
4483 /* Restore the previous explicitly defined value. */
4484 curr_exec = gcn_restore_exec (insn, last_exec_def,
4485 curr_exec, curr_exec_known,
4486 last_exec_def_saved);
4487 curr_exec_explicit = true;
4488 curr_exec_known = true;
4489 }
4490 }
4491
4492 /* Handle explicit definitions of EXEC. */
4493 if (exec_lo_def_p || exec_hi_def_p)
4494 {
4495 last_exec_def = insn;
4496 last_exec_def_saved = false;
4497 curr_exec = gcn_insn_exec_value (insn);
4498 curr_exec_explicit = true;
4499 curr_exec_known = true;
4500
4501 if (dump_file && (dump_flags & TDF_DETAILS))
4502 fprintf (dump_file,
4503 "Found %s definition of EXEC at insn %d.\n",
4504 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4505 INSN_UID (insn));
4506 }
4507 }
4508
4509 COPY_REG_SET (&live, DF_LR_OUT (bb));
4510 df_simulate_initialize_backwards (bb, &live);
4511
4512 /* If EXEC is live after the basic block, restore the value of EXEC
4513 at the end of the block. */
4514 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4515 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4516 && (!curr_exec_known || !curr_exec_explicit))
4517 {
4518 rtx_insn *end_insn = BB_END (bb);
4519
4520 /* If the instruction is not a jump instruction, do the restore
4521 after the last instruction in the basic block. */
4522 if (NONJUMP_INSN_P (end_insn))
4523 end_insn = NEXT_INSN (end_insn);
4524
4525 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4526 curr_exec_known, last_exec_def_saved);
4527 }
4528 }
4529
4530 CLEAR_REG_SET (&live);
4531
4532 /* "Manually Inserted Wait States (NOPs)."
4533
4534 GCN hardware detects most kinds of register dependencies, but there
4535 are some exceptions documented in the ISA manual. This pass
4536 detects the missed cases, and inserts the documented number of NOPs
4537 required for correct execution. */
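/* Concrete example (register choice illustrative): a VALU instruction such
   as v_readlane_b32 writing an SGPR, immediately followed by a flat or
   buffer load whose address uses that SGPR, matches the first check below
   and has up to five NOPs inserted between the two instructions.  */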
4538
4539 const int max_waits = 5;
4540 struct ilist
4541 {
4542 rtx_insn *insn;
4543 attr_unit unit;
4544 attr_delayeduse delayeduse;
4545 HARD_REG_SET writes;
4546 HARD_REG_SET reads;
4547 int age;
4548 } back[max_waits];
4549 int oldest = 0;
4550 for (int i = 0; i < max_waits; i++)
4551 back[i].insn = NULL;
4552
4553 rtx_insn *insn, *last_insn = NULL;
4554 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4555 {
4556 if (!NONDEBUG_INSN_P (insn))
4557 continue;
4558
4559 if (GET_CODE (PATTERN (insn)) == USE
4560 || GET_CODE (PATTERN (insn)) == CLOBBER)
4561 continue;
4562
4563 attr_type itype = get_attr_type (insn);
4564 attr_unit iunit = get_attr_unit (insn);
4565 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4566 HARD_REG_SET ireads, iwrites;
4567 CLEAR_HARD_REG_SET (ireads);
4568 CLEAR_HARD_REG_SET (iwrites);
4569 note_stores (insn, record_hard_reg_sets, &iwrites);
4570 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4571
4572 /* Scan recent previous instructions for dependencies not handled in
4573 hardware. */
4574 int nops_rqd = 0;
4575 for (int i = oldest; i < oldest + max_waits; i++)
4576 {
4577 struct ilist *prev_insn = &back[i % max_waits];
4578
4579 if (!prev_insn->insn)
4580 continue;
4581
4582 /* VALU writes SGPR followed by VMEM reading the same SGPR
4583 requires 5 wait states. */
4584 if ((prev_insn->age + nops_rqd) < 5
4585 && prev_insn->unit == UNIT_VECTOR
4586 && gcn_vmem_insn_p (itype))
4587 {
4588 HARD_REG_SET regs = prev_insn->writes & ireads;
4589 if (hard_reg_set_intersect_p
4590 (regs, reg_class_contents[(int) SGPR_REGS]))
4591 nops_rqd = 5 - prev_insn->age;
4592 }
4593
4594 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4595 requires 5 wait states. */
4596 if ((prev_insn->age + nops_rqd) < 5
4597 && prev_insn->unit == UNIT_VECTOR
4598 && iunit == UNIT_VECTOR
4599 && ((hard_reg_set_intersect_p
4600 (prev_insn->writes,
4601 reg_class_contents[(int) EXEC_MASK_REG])
4602 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4603 ||
4604 (hard_reg_set_intersect_p
4605 (prev_insn->writes,
4606 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4607 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4608 nops_rqd = 5 - prev_insn->age;
4609
4610 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4611 SGPR/VCC as lane select requires 4 wait states. */
4612 if ((prev_insn->age + nops_rqd) < 4
4613 && prev_insn->unit == UNIT_VECTOR
4614 && get_attr_laneselect (insn) == LANESELECT_YES)
4615 {
4616 HARD_REG_SET regs = prev_insn->writes & ireads;
4617 if (hard_reg_set_intersect_p
4618 (regs, reg_class_contents[(int) SGPR_REGS])
4619 || hard_reg_set_intersect_p
4620 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4621 nops_rqd = 4 - prev_insn->age;
4622 }
4623
4624 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4625 requires 2 wait states. */
4626 if ((prev_insn->age + nops_rqd) < 2
4627 && prev_insn->unit == UNIT_VECTOR
4628 && itype == TYPE_VOP_DPP)
4629 {
4630 HARD_REG_SET regs = prev_insn->writes & ireads;
4631 if (hard_reg_set_intersect_p
4632 (regs, reg_class_contents[(int) VGPR_REGS]))
4633 nops_rqd = 2 - prev_insn->age;
4634 }
4635
4636 /* A store with the "delayeduse" attribute requires that its input
4637 registers are not overwritten by the following instruction. */
4638 if ((prev_insn->age + nops_rqd) < 1
4639 && prev_insn->delayeduse == DELAYEDUSE_YES
4640 && ((hard_reg_set_intersect_p
4641 (prev_insn->reads, iwrites))))
4642 nops_rqd = 1 - prev_insn->age;
4643 }
4644
4645 /* Insert the required number of NOPs. */
4646 for (int i = nops_rqd; i > 0; i--)
4647 emit_insn_after (gen_nop (), last_insn);
4648
4649 /* Age the previous instructions. We can also ignore writes to
4650 registers subsequently overwritten. */
4651 HARD_REG_SET written;
4652 CLEAR_HARD_REG_SET (written);
4653 for (int i = oldest + max_waits - 1; i > oldest; i--)
4654 {
4655 struct ilist *prev_insn = &back[i % max_waits];
4656
4657 /* Assume all instructions are equivalent to one "wait", the same
4658 as s_nop. This is probably true for SALU, but not VALU (which
4659 may take longer), so this is not optimal. However, AMD do
4660 not publish the cycle times for instructions. */
4661 prev_insn->age += 1 + nops_rqd;
4662
4663 written |= iwrites;
4664 prev_insn->writes &= ~written;
4665 }
4666
4667 /* Track the current instruction as a previous instruction. */
4668 back[oldest].insn = insn;
4669 back[oldest].unit = iunit;
4670 back[oldest].delayeduse = idelayeduse;
4671 back[oldest].writes = iwrites;
4672 back[oldest].reads = ireads;
4673 back[oldest].age = 0;
4674 oldest = (oldest + 1) % max_waits;
4675
4676 last_insn = insn;
4677 }
4678}
4679
4680/* }}} */
4681/* {{{ OpenACC / OpenMP. */
4682
4683#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
4684#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
4685#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
4686
4687/* Implement TARGET_GOACC_VALIDATE_DIMS.
4688
4689 Check the launch dimensions provided for an OpenACC compute
4690 region, or routine. */
4691
4692static bool
4693gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
4694 unsigned /*used*/)
4695{
4696 bool changed = false;
4697
4698 /* FIXME: remove -facc-experimental-workers when they're ready. */
4699 int max_workers = flag_worker_partitioning ? 16 : 1;
4700
4701 gcc_assert (!flag_worker_partitioning);
4702
4703 /* The vector size must appear to be 64, to the user, unless this is a
4704 SEQ routine. The real, internal value is always 1, which means use
4705 autovectorization, but the user should not see that. */
4706 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4707 && dims[GOMP_DIM_VECTOR] >= 0)
4708 {
4709 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
4710 && dims[GOMP_DIM_VECTOR] != 64)
4711 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4712 OPT_Wopenacc_dims,
4713 (dims[GOMP_DIM_VECTOR]
4714 ? G_("using vector_length (64), ignoring %d")
4715 : G_("using vector_length (64), "
4716 "ignoring runtime setting")),
4717 dims[GOMP_DIM_VECTOR]);
4718 dims[GOMP_DIM_VECTOR] = 1;
4719 changed = true;
4720 }
4721
4722 /* Check the num workers is not too large. */
4723 if (dims[GOMP_DIM_WORKER] > max_workers)
4724 {
4725 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4726 OPT_Wopenacc_dims,
4727 "using num_workers (%d), ignoring %d",
4728 max_workers, dims[GOMP_DIM_WORKER]);
4729 dims[GOMP_DIM_WORKER] = max_workers;
4730 changed = true;
4731 }
4732
4733 /* Set global defaults. */
4734 if (!decl)
4735 {
4736 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
4737 if (dims[GOMP_DIM_WORKER] < 0)
4738 dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
4739 ? GCN_DEFAULT_WORKERS : 1);
4740 if (dims[GOMP_DIM_GANG] < 0)
4741 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
4742 changed = true;
4743 }
4744
4745 return changed;
4746}
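/* For example, a compute region requesting vector_length(128) is diagnosed
   as "using vector_length (64), ignoring 128" and GOMP_DIM_VECTOR is reset
   to the internal value 1 (autovectorization); an over-large num_workers
   request is clamped to max_workers in the same way.  */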
4747
4748/* Helper function for oacc_dim_size instruction.
4749 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
4750
4751rtx
4752gcn_oacc_dim_size (int dim)
4753{
4754 if (dim < 0 || dim > 2)
4755 error ("offload dimension out of range (%d)", dim);
4756
4757 /* Vectors are a special case. */
4758 if (dim == 2)
4759 return const1_rtx; /* Think of this as 1 times 64. */
4760
4761 static int offset[] = {
4762 /* Offsets into dispatch packet. */
4763 12, /* X dim = Gang / Team / Work-group. */
4764 20, /* Z dim = Worker / Thread / Wavefront. */
4765 16 /* Y dim = Vector / SIMD / Work-item. */
4766 };
4767 rtx addr = gen_rtx_PLUS (DImode,
4768 gen_rtx_REG (DImode,
4769 cfun->machine->args.
4770 reg[DISPATCH_PTR_ARG]),
4771 GEN_INT (offset[dim]));
4772 return gen_rtx_MEM (SImode, addr);
4773}
4774
4775/* Helper function for oacc_dim_pos instruction.
4776 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
4777
4778rtx
4779gcn_oacc_dim_pos (int dim)
4780{
4781 if (dim < 0 || dim > 2)
4782 error ("offload dimension out of range (%d)", dim);
4783
4784 static const int reg[] = {
4785 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
4786 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
4787 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
4788 };
4789
4790 int reg_num = cfun->machine->args.reg[reg[dim]];
4791
4792 /* The information must have been requested by the kernel. */
4793 gcc_assert (reg_num >= 0);
4794
4795 return gen_rtx_REG (SImode, reg_num);
4796}
4797
4798/* Implement TARGET_GOACC_FORK_JOIN. */
4799
4800static bool
4801gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
4802 bool ARG_UNUSED (is_fork))
4803{
4804 /* GCN does not use the fork/join concept invented for NVPTX.
4805 Instead we use standard autovectorization. */
4806 return false;
4807}
4808
4809/* Implement ???????
4810 FIXME make this a real hook.
4811
4812 Adjust FNDECL such that options inherited from the host compiler
4813 are made appropriate for the accelerator compiler. */
4814
4815void
4816gcn_fixup_accel_lto_options (tree fndecl)
4817{
4818 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4819 if (!func_optimize)
4820 return;
4821
4822 tree old_optimize = build_optimization_node (&global_options);
4823 tree new_optimize;
4824
4825 /* If the function changed the optimization levels as well as
4826 setting target options, start with the optimizations
4827 specified. */
4828 if (func_optimize != old_optimize)
4829 cl_optimization_restore (&global_options,
4830 TREE_OPTIMIZATION (func_optimize));
4831
4832 gcn_option_override ();
4833
4834 /* The target attributes may also change some optimization flags,
4835 so update the optimization options if necessary. */
4836 new_optimize = build_optimization_node (&global_options);
4837
4838 if (old_optimize != new_optimize)
4839 {
4840 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4841 cl_optimization_restore (&global_options,
4842 TREE_OPTIMIZATION (old_optimize));
4843 }
4844}
4845
4846/* }}} */
4847/* {{{ ASM Output. */
4848
4849/* Implement TARGET_ASM_FILE_START.
4850
4851 Print assembler file header text. */
4852
4853static void
4854output_file_start (void)
4855{
4856 fprintf (asm_out_file, "\t.text\n");
4857 fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
4858 fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
4859 fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
4860 fprintf (asm_out_file, "\t.text\n");
4861}
4862
4863/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
4864
4865 Print the initial definition of a function name.
4866
4867 For GCN kernel entry points this includes all the HSA meta-data, special
4868 alignment constraints that don't apply to regular functions, and magic
4869 comments that pass information to mkoffload. */
4870
4871void
4872gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
4873{
4874 int sgpr, vgpr;
4875 bool xnack_enabled = false;
4876 int extra_regs = 0;
4877
4878 if (cfun && cfun->machine && cfun->machine->normal_function)
4879 {
4880 fputs ("\t.type\t", file);
4881 assemble_name (file, name);
4882 fputs (",@function\n", file);
4883 assemble_name (file, name);
4884 fputs (":\n", file);
4885 return;
4886 }
4887
4888 /* Determine count of sgpr/vgpr registers by looking for last
4889 one used. */
4890 for (sgpr = 101; sgpr >= 0; sgpr--)
4891 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
4892 break;
4893 sgpr++;
4894 for (vgpr = 255; vgpr >= 0; vgpr--)
4895 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
4896 break;
4897 vgpr++;
4898
4899 if (xnack_enabled)
4900 extra_regs = 6;
4901 if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
4902 || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
4903 extra_regs = 4;
4904 else if (df_regs_ever_live_p (VCC_LO_REG)
4905 || df_regs_ever_live_p (VCC_HI_REG))
4906 extra_regs = 2;
4907
4908 if (!leaf_function_p ())
4909 {
4910 /* We can't know how many registers function calls might use. */
4911 if (vgpr < 64)
4912 vgpr = 64;
4913 if (sgpr + extra_regs < 102)
4914 sgpr = 102 - extra_regs;
4915 }
4916
4917 fputs ("\t.align\t256\n", file);
4918 fputs ("\t.type\t", file);
4919 assemble_name (file, name);
4920 fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
4921 assemble_name (file, name);
4922 fputs ("\n", file);
4923 assemble_name (file, name);
4924 fputs (":\n", file);
4925 fprintf (file, "\t.amd_kernel_code_t\n"
4926 "\t\tkernel_code_version_major = 1\n"
4927 "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
4928 /* "\t\tmachine_version_major = 8\n"
4929 "\t\tmachine_version_minor = 0\n"
4930 "\t\tmachine_version_stepping = 1\n" */
4931 "\t\tkernel_code_entry_byte_offset = 256\n"
4932 "\t\tkernel_code_prefetch_byte_size = 0\n"
4933 "\t\tmax_scratch_backing_memory_byte_size = 0\n"
4934 "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
4935 "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
4936 "\t\tcompute_pgm_rsrc1_priority = 0\n"
4937 "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
4938 "\t\tcompute_pgm_rsrc1_priv = 0\n"
4939 "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
4940 "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
4941 "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
4942 /* We enable scratch memory. */
4943 "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
4944 "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
4945 "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
4946 "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
4947 "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
4948 "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
4949 "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
4950 "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
4951 "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
4952 "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
4953 (vgpr - 1) / 4,
4954 /* Must match wavefront_sgpr_count */
4955 (sgpr + extra_regs + 7) / 8 - 1,
4956 /* The total number of SGPR user data registers requested. This
4957 number must match the number of user data registers enabled. */
4958 cfun->machine->args.nsgprs);
4959 int reg = FIRST_SGPR_REG;
4960 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
4961 {
4962 int reg_first = -1;
4963 int reg_last;
4964 if ((cfun->machine->args.requested & (1 << a))
4965 && (gcn_kernel_arg_types[a].fixed_regno < 0))
4966 {
4967 reg_first = reg;
4968 reg_last = (reg_first
4969 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
4970 / UNITS_PER_WORD) - 1);
4971 reg = reg_last + 1;
4972 }
4973
4974 if (gcn_kernel_arg_types[a].header_pseudo)
4975 {
4976 fprintf (file, "\t\t%s = %i",
4977 gcn_kernel_arg_types[a].header_pseudo,
4978 (cfun->machine->args.requested & (1 << a)) != 0);
4979 if (reg_first != -1)
4980 {
4981 fprintf (file, " ; (");
4982 for (int i = reg_first; i <= reg_last; ++i)
4983 {
4984 if (i != reg_first)
4985 fprintf (file, ", ");
4986 fprintf (file, "%s", reg_names[i]);
4987 }
4988 fprintf (file, ")");
4989 }
4990 fprintf (file, "\n");
4991 }
4992 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
4993 && cfun->machine->args.requested & (1 << a))
4994 fprintf (file, "\t\t; %s = %i (%s)\n",
4995 gcn_kernel_arg_types[a].name,
4996 (cfun->machine->args.requested & (1 << a)) != 0,
4997 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
4998 }
4999 fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
5000 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5001 ? 2
5002 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5003 ? 1 : 0);
5004 fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
5005 "\t\tprivate_element_size = 1\n"
5006 "\t\tis_ptr64 = 1\n"
5007 "\t\tis_dynamic_callstack = 0\n"
5008 "\t\tis_debug_enabled = 0\n"
5009 "\t\tis_xnack_enabled = %i\n"
5010 "\t\tworkitem_private_segment_byte_size = %i\n"
5011 "\t\tworkgroup_group_segment_byte_size = %u\n"
5012 "\t\tgds_segment_byte_size = 0\n"
5013 "\t\tkernarg_segment_byte_size = %i\n"
5014 "\t\tworkgroup_fbarrier_count = 0\n"
5015 "\t\twavefront_sgpr_count = %i\n"
5016 "\t\tworkitem_vgpr_count = %i\n"
5017 "\t\treserved_vgpr_first = 0\n"
5018 "\t\treserved_vgpr_count = 0\n"
5019 "\t\treserved_sgpr_first = 0\n"
5020 "\t\treserved_sgpr_count = 0\n"
5021 "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
5022 "\t\tdebug_private_segment_buffer_sgpr = 0\n"
5023 "\t\tkernarg_segment_alignment = %i\n"
5024 "\t\tgroup_segment_alignment = 4\n"
5025 "\t\tprivate_segment_alignment = %i\n"
5026 "\t\twavefront_size = 6\n"
5027 "\t\tcall_convention = 0\n"
5028 "\t\truntime_loader_kernel_symbol = 0\n"
5029 "\t.end_amd_kernel_code_t\n", xnack_enabled,
5030 /* workitem_private_segment_bytes_size needs to be
5031 one 64th the wave-front stack size. */
5032 stack_size_opt / 64,
5033 LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
5034 /* Number of scalar registers used by a wavefront. This
5035 includes the special SGPRs for VCC, Flat Scratch (Base,
5036 Size) and XNACK (for GFX8 (VI)+). It does not include the
5037 16 SGPR added if a trap handler is enabled. Must match
5038 compute_pgm_rsrc1.sgprs. */
5039 sgpr + extra_regs, vgpr,
5040 cfun->machine->kernarg_segment_alignment,
5041 crtl->stack_alignment_needed / 8);
5042
5043 /* This comment is read by mkoffload. */
5044 if (flag_openacc)
5045 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5046 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5047 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5048 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5049}
5050
5051/* Implement TARGET_ASM_SELECT_SECTION.
5052
5053 Return the section into which EXP should be placed. */
5054
5055static section *
5056gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5057{
5058 if (TREE_TYPE (exp) != error_mark_node
5059 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5060 {
5061 if (!DECL_P (exp))
5062 return get_section (".lds_bss",
5063 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5064 NULL);
5065
5066 return get_named_section (exp, ".lds_bss", reloc);
5067 }
5068
5069 return default_elf_select_section (exp, reloc, align);
5070}
5071
5072/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5073
5074 Emits custom text into the assembler file at the head of each function. */
5075
5076static void
5077gcn_target_asm_function_prologue (FILE *file)
5078{
5079 machine_function *offsets = gcn_compute_frame_offsets ();
5080
5081 asm_fprintf (file, "\t; using %s addressing in function\n",
5082 offsets->use_flat_addressing ? "flat" : "global");
5083
5084 if (offsets->normal_function)
5085 {
5086 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5087 offsets->need_frame_pointer ? "true" : "false");
5088 asm_fprintf (file, "\t; lr needs saving: %s\n",
5089 offsets->lr_needs_saving ? "true" : "false");
5090 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5091 offsets->outgoing_args_size);
5092 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5093 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5094 asm_fprintf (file, "\t; callee save size: %wd\n",
5095 offsets->callee_saves);
5096 }
5097 else
5098 {
5099 asm_fprintf (file, "\t; HSA kernel entry point\n");
5100 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5101 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5102 offsets->outgoing_args_size);
5103
5104 /* Enable denorms. */
5105 asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
5106 " input and output denorms\n");
5107 asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
5108 }
5109}
5110
5111/* Helper function for print_operand and print_operand_address.
5112
5113 Print a register as the assembler requires, according to mode and name. */
5114
5115static void
5116print_reg (FILE *file, rtx x)
5117{
5118 machine_mode mode = GET_MODE (x);
5119 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5120 || mode == HFmode || mode == SFmode
5121 || mode == V64SFmode || mode == V64SImode
5122 || mode == V64QImode || mode == V64HImode)
5123 fprintf (file, "%s", reg_names[REGNO (x)]);
5124 else if (mode == DImode || mode == V64DImode
5125 || mode == DFmode || mode == V64DFmode)
5126 {
5127 if (SGPR_REGNO_P (REGNO (x)))
5128 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5129 REGNO (x) - FIRST_SGPR_REG + 1);
5130 else if (VGPR_REGNO_P (REGNO (x)))
5131 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5132 REGNO (x) - FIRST_VGPR_REG + 1);
5133 else if (REGNO (x) == FLAT_SCRATCH_REG)
5134 fprintf (file, "flat_scratch");
5135 else if (REGNO (x) == EXEC_REG)
5136 fprintf (file, "exec");
5137 else if (REGNO (x) == VCC_LO_REG)
5138 fprintf (file, "vcc");
5139 else
5140 fprintf (file, "[%s:%s]",
5141 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5142 }
5143 else if (mode == TImode)
5144 {
5145 if (SGPR_REGNO_P (REGNO (x)))
5146 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5147 REGNO (x) - FIRST_SGPR_REG + 3);
5148 else if (VGPR_REGNO_P (REGNO (x)))
5149 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5150 REGNO (x) - FIRST_VGPR_REG + 3);
5151 else
5152 gcc_unreachable ();
5153 }
5154 else
5155 gcc_unreachable ();
5156}
5157
5158/* Implement TARGET_SECTION_TYPE_FLAGS.
5159
5160 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5161
5162static unsigned int
5163gcn_section_type_flags (tree decl, const char *name, int reloc)
5164{
5165 if (strcmp (name, ".lds_bss") == 0)
5166 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5167
5168 return default_section_type_flags (decl, name, reloc);
5169}
5170
5171/* Helper function for gcn_asm_output_symbol_ref.
5172
5173 FIXME: If we want to have propagation blocks allocated separately and
5174 statically like this, it would be better done via symbol refs and the
5175 assembler/linker. This is a temporary hack. */
5176
5177static void
5178gcn_print_lds_decl (FILE *f, tree var)
5179{
5180 int *offset;
5181 machine_function *machfun = cfun->machine;
5182
5183 if ((offset = machfun->lds_allocs->get (var)))
5184 fprintf (f, "%u", (unsigned) *offset);
5185 else
5186 {
5187 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5188 tree type = TREE_TYPE (var);
5189 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5190 if (size > align && size > 4 && align < 8)
5191 align = 8;
5192
5193 machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
5194 & ~(align - 1));
5195
5196 machfun->lds_allocs->put (var, machfun->lds_allocated);
5197 fprintf (f, "%u", machfun->lds_allocated);
5198 machfun->lds_allocated += size;
5199 if (machfun->lds_allocated > LDS_SIZE)
5200 error ("local data-share memory exhausted");
5201 }
5202}
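/* Worked example of the offset arithmetic above: with lds_allocated at 33
   and an 8-byte alignment requirement, (33 + 8 - 1) & ~(8 - 1) rounds the
   offset up to 40; the symbol is printed as 40 and lds_allocated then
   advances to 40 plus the variable's size.  */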
5203
5204/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5205
5206void
5207gcn_asm_output_symbol_ref (FILE *file, rtx x)
5208{
5209 tree decl;
5210 if ((decl = SYMBOL_REF_DECL (x)) != 0
5211 && TREE_CODE (decl) == VAR_DECL
5212 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5213 {
5214 /* LDS symbols (emitted using this hook) are only used at present
5215 to propagate worker values from an active thread to neutered
5216 threads. Use the same offset for each such block, but don't
5217 use zero because null pointers are used to identify the active
5218 thread in GOACC_single_copy_start calls. */
5219 gcn_print_lds_decl (file, decl);
5220 }
5221 else
5222 {
5223 assemble_name (file, XSTR (x, 0));
5224 /* FIXME: See above -- this condition is unreachable. */
5225 if ((decl = SYMBOL_REF_DECL (x)) != 0
5226 && TREE_CODE (decl) == VAR_DECL
5227 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5228 fputs ("@abs32", file);
5229 }
5230}
5231
5232/* Implement TARGET_CONSTANT_ALIGNMENT.
5233
5234 Returns the alignment in bits of a constant that is being placed in memory.
5235 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5236 would ordinarily have. */
5237
5238static HOST_WIDE_INT
5239gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5240 HOST_WIDE_INT basic_align)
5241{
5242 return basic_align > 128 ? basic_align : 128;
5243}
5244
5245/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5246
5247void
5248print_operand_address (FILE *file, rtx mem)
5249{
5250 gcc_assert (MEM_P (mem));
5251
5252 rtx reg;
5253 rtx offset;
5254 addr_space_t as = MEM_ADDR_SPACE (mem);
5255 rtx addr = XEXP (mem, 0);
5256 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5257
5258 if (AS_SCRATCH_P (as))
5259 switch (GET_CODE (addr))
5260 {
5261 case REG:
5262 print_reg (file, addr);
5263 break;
5264
5265 case PLUS:
5266 reg = XEXP (addr, 0);
5267 offset = XEXP (addr, 1);
5268 print_reg (file, reg);
5269 if (GET_CODE (offset) == CONST_INT)
5270 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5271 else
5272 abort ();
5273 break;
5274
5275 default:
5276 debug_rtx (addr);
5277 abort ();
5278 }
5279 else if (AS_ANY_FLAT_P (as))
5280 {
5281 if (GET_CODE (addr) == REG)
5282 print_reg (file, addr);
5283 else
5284 {
5285 gcc_assert (TARGET_GCN5_PLUS);
5286 print_reg (file, XEXP (addr, 0));
5287 }
5288 }
5289 else if (AS_GLOBAL_P (as))
5290 {
5291 gcc_assert (TARGET_GCN5_PLUS);
5292
5293 rtx base = addr;
5294 rtx vgpr_offset = NULL_RTX;
5295
5296 if (GET_CODE (addr) == PLUS)
5297 {
5298 base = XEXP (addr, 0);
5299
5300 if (GET_CODE (base) == PLUS)
5301 {
5302 /* (SGPR + VGPR) + CONST */
5303 vgpr_offset = XEXP (base, 1);
5304 base = XEXP (base, 0);
5305 }
5306 else
5307 {
5308 rtx offset = XEXP (addr, 1);
5309
5310 if (REG_P (offset))
5311 /* SGPR + VGPR */
5312 vgpr_offset = offset;
5313 else if (CONST_INT_P (offset))
5314 /* VGPR + CONST or SGPR + CONST */
5315 ;
5316 else
5317 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5318 }
5319 }
5320
5321 if (REG_P (base))
5322 {
5323 if (VGPR_REGNO_P (REGNO (base)))
5324 print_reg (file, base);
5325 else if (SGPR_REGNO_P (REGNO (base)))
5326 {
5327 /* The assembler requires a 64-bit VGPR pair here, even though
5328 the offset should be only 32-bit. */
5329 if (vgpr_offset == NULL_RTX)
5330 /* In this case, the vector offset is zero, so we use the first
5331 lane of v1, which is initialized to zero. */
5332 fprintf (file, "v[1:2]");
5333 else if (REG_P (vgpr_offset)
5334 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5335 {
5336 fprintf (file, "v[%d:%d]",
5337 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5338 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5339 }
5340 else
5341 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5342 }
5343 }
5344 else
5345 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5346 }
5347 else if (AS_ANY_DS_P (as))
5348 switch (GET_CODE (addr))
5349 {
5350 case REG:
5351 print_reg (file, addr);
5352 break;
5353
5354 case PLUS:
5355 reg = XEXP (addr, 0);
5356 print_reg (file, reg);
5357 break;
5358
5359 default:
5360 debug_rtx (addr);
5361 abort ();
5362 }
5363 else
5364 switch (GET_CODE (addr))
5365 {
5366 case REG:
5367 print_reg (file, addr);
5368 fprintf (file, ", 0");
5369 break;
5370
5371 case PLUS:
5372 reg = XEXP (addr, 0);
5373 offset = XEXP (addr, 1);
5374 print_reg (file, reg);
5375 fprintf (file, ", ");
5376 if (GET_CODE (offset) == REG)
5377 print_reg (file, reg);
5378 else if (GET_CODE (offset) == CONST_INT)
5379 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5380 else
5381 abort ();
5382 break;
5383
5384 default:
5385 debug_rtx (addr);
5386 abort ();
5387 }
5388}
5389
5390/* Implement PRINT_OPERAND via gcn.h.
5391
5392 b - print operand size as untyped operand (b8/b16/b32/b64)
5393 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5394 i - print operand size as signed typed operand (i16/i32/i64)
5395 u - print operand size as unsigned typed operand (u16/u32/u64)
5396 o - print operand size as memory access size for loads
5397 (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
5398 s - print operand size as memory access size for stores
5399 (byte/short/dword/dwordx2/wordx3/dwordx4)
5400 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5401 c - print inverse conditional code for s_cbranch
5402 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5403 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5404 A - print address in formatting suitable for given address space.
5405 O - print offset:n for data share operations.
5406 ^ - print "_co" suffix for GCN5 mnemonics
5407 g - print "glc", if appropriate for given MEM
5408 */
5409
5410void
5411print_operand (FILE *file, rtx x, int code)
5412{
5413 int xcode = x ? GET_CODE (x) : 0;
5414 bool invert = false;
5415 switch (code)
5416 {
5417 /* Instructions have the following suffixes.
5418 If there are two suffixes, the first is the destination type,
5419 and the second is the source type.
5420
5421 B32 Bitfield (untyped data) 32-bit
5422 B64 Bitfield (untyped data) 64-bit
5423 F16 floating-point 16-bit
5424 F32 floating-point 32-bit (IEEE 754 single-precision float)
5425 F64 floating-point 64-bit (IEEE 754 double-precision float)
5426 I16 signed 16-bit integer
5427 I32 signed 32-bit integer
5428 I64 signed 64-bit integer
5429 U16 unsigned 16-bit integer
5430 U32 unsigned 32-bit integer
5431 U64 unsigned 64-bit integer */
5432
5433 /* Print operand size as untyped suffix. */
5434 case 'b':
5435 {
5436 const char *s = "";
5437 machine_mode mode = GET_MODE (x);
5438 if (VECTOR_MODE_P (mode))
5439 mode = GET_MODE_INNER (mode);
5440 switch (GET_MODE_SIZE (mode))
5441 {
5442 case 1:
5443 s = "_b8";
5444 break;
5445 case 2:
5446 s = "_b16";
5447 break;
5448 case 4:
5449 s = "_b32";
5450 break;
5451 case 8:
5452 s = "_b64";
5453 break;
5454 default:
5455 output_operand_lossage ("invalid operand %%xn code");
5456 return;
5457 }
5458 fputs (s, file);
5459 }
5460 return;
5461 case 'B':
5462 {
5463 const char *s = "";
5464 machine_mode mode = GET_MODE (x);
5465 if (VECTOR_MODE_P (mode))
5466 mode = GET_MODE_INNER (mode);
5467 switch (GET_MODE_SIZE (mode))
5468 {
5469 case 1:
5470 case 2:
5471 case 4:
5472 s = "_b32";
5473 break;
5474 case 8:
5475 s = "_b64";
5476 break;
5477 default:
5478 output_operand_lossage ("invalid operand %%xn code");
5479 return;
5480 }
5481 fputs (s, file);
5482 }
5483 return;
5484 case 'e':
5485 fputs ("sext(", file);
5486 print_operand (file, x, 0);
5487 fputs (")", file);
5488 return;
5489 case 'i':
5490 case 'u':
5491 {
5492 bool signed_p = code == 'i';
5493 const char *s = "";
5494 machine_mode mode = GET_MODE (x);
5495 if (VECTOR_MODE_P (mode))
5496 mode = GET_MODE_INNER (mode);
5497 if (mode == VOIDmode)
5498 switch (GET_CODE (x))
5499 {
5500 case CONST_INT:
5501 s = signed_p ? "_i32" : "_u32";
5502 break;
5503 case CONST_DOUBLE:
5504 s = "_f64";
5505 break;
5506 default:
5507 output_operand_lossage ("invalid operand %%xn code");
5508 return;
5509 }
5510 else if (FLOAT_MODE_P (mode))
5511 switch (GET_MODE_SIZE (mode))
5512 {
5513 case 2:
5514 s = "_f16";
5515 break;
5516 case 4:
5517 s = "_f32";
5518 break;
5519 case 8:
5520 s = "_f64";
5521 break;
5522 default:
5523 output_operand_lossage ("invalid operand %%xn code");
5524 return;
5525 }
5526 else
5527 switch (GET_MODE_SIZE (mode))
5528 {
5529 case 1:
5530 s = signed_p ? "_i8" : "_u8";
5531 break;
5532 case 2:
5533 s = signed_p ? "_i16" : "_u16";
5534 break;
5535 case 4:
5536 s = signed_p ? "_i32" : "_u32";
5537 break;
5538 case 8:
5539 s = signed_p ? "_i64" : "_u64";
5540 break;
5541 default:
5542 output_operand_lossage ("invalid operand %%xn code");
5543 return;
5544 }
5545 fputs (s, file);
5546 }
5547 return;
5548 /* Print operand size as memory access suffix for loads. */
5549 case 'o':
5550 {
5551 const char *s = 0;
5552 switch (GET_MODE_SIZE (GET_MODE (x)))
5553 {
5554 case 1:
5555 s = "_ubyte";
5556 break;
5557 case 2:
5558 s = "_ushort";
5559 break;
5560 /* The following are full-vector variants. */
5561 case 64:
5562 s = "_ubyte";
5563 break;
5564 case 128:
5565 s = "_ushort";
5566 break;
5567 }
5568
5569 if (s)
5570 {
5571 fputs (s, file);
5572 return;
5573 }
5574
5575 /* Fall-through - the other cases for 'o' are the same as for 's'. */
5576 gcc_fallthrough();
5577 }
5578 case 's':
5579 {
5580 const char *s = "";
5581 switch (GET_MODE_SIZE (GET_MODE (x)))
5582 {
5583 case 1:
5584 s = "_byte";
5585 break;
5586 case 2:
5587 s = "_short";
5588 break;
5589 case 4:
5590 s = "_dword";
5591 break;
5592 case 8:
5593 s = "_dwordx2";
5594 break;
5595 case 12:
5596 s = "_dwordx3";
5597 break;
5598 case 16:
5599 s = "_dwordx4";
5600 break;
5601 case 32:
5602 s = "_dwordx8";
5603 break;
5604 case 64:
5605 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
5606 break;
5607 /* The following are full-vector variants. */
5608 case 128:
5609 s = "_short";
5610 break;
5611 case 256:
5612 s = "_dword";
5613 break;
5614 case 512:
5615 s = "_dwordx2";
5616 break;
5617 default:
5618 output_operand_lossage ("invalid operand %%xn code");
5619 return;
5620 }
5621 fputs (s, file);
5622 }
5623 return;
5624 case 'A':
5625 if (xcode != MEM)
5626 {
5627 output_operand_lossage ("invalid %%xn code");
5628 return;
5629 }
5630 print_operand_address (file, x);
5631 return;
5632 case 'O':
5633 {
5634 if (xcode != MEM)
5635 {
5636 output_operand_lossage ("invalid %%xn code");
5637 return;
5638 }
5639 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
5640 fprintf (file, " gds");
5641
5642 rtx x0 = XEXP (x, 0);
5643 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
5644 {
5645 gcc_assert (TARGET_GCN5_PLUS);
5646
5647 fprintf (file, ", ");
5648
5649 rtx base = x0;
5650 rtx const_offset = NULL_RTX;
5651
5652 if (GET_CODE (base) == PLUS)
5653 {
5654 rtx offset = XEXP (x0, 1);
5655 base = XEXP (x0, 0);
5656
5657 if (GET_CODE (base) == PLUS)
5658 /* (SGPR + VGPR) + CONST */
5659 /* Ignore the VGPR offset for this operand. */
5660 base = XEXP (base, 0);
5661
5662 if (CONST_INT_P (offset))
5663 const_offset = XEXP (x0, 1);
5664 else if (REG_P (offset))
5665 /* SGPR + VGPR */
5666 /* Ignore the VGPR offset for this operand. */
5667 ;
5668 else
5669 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5670 }
5671
5672 if (REG_P (base))
5673 {
5674 if (VGPR_REGNO_P (REGNO (base)))
5675 /* The VGPR address is specified in the %A operand. */
5676 fprintf (file, "off");
5677 else if (SGPR_REGNO_P (REGNO (base)))
5678 print_reg (file, base);
5679 else
5680 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5681 }
5682 else
5683 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5684
5685 if (const_offset != NULL_RTX)
5686 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
5687 INTVAL (const_offset));
5688
5689 return;
5690 }
5691
5692 if (GET_CODE (x0) == REG)
5693 return;
5694 if (GET_CODE (x0) != PLUS)
5695 {
5696 output_operand_lossage ("invalid %%xn code");
5697 return;
5698 }
5699 rtx val = XEXP (x0, 1);
5700 if (GET_CODE (val) == CONST_VECTOR)
5701 val = CONST_VECTOR_ELT (val, 0);
5702 if (GET_CODE (val) != CONST_INT)
5703 {
5704 output_operand_lossage ("invalid %%xn code");
5705 return;
5706 }
5707 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
5708
5709 }
5710 return;
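    /* Print a scalar condition suffix: the register name (_vcc, _scc or
       _exec) followed by "z"/"0" or "nz"/"1" according to the comparison;
       'c' prints the inverted sense.  */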
5711 case 'c':
5712 invert = true;
5713 /* Fall through. */
5714 case 'C':
5715 {
5716 const char *s;
5717 bool num = false;
5718 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
5719 {
5720 output_operand_lossage ("invalid %%xn code");
5721 return;
5722 }
5723 switch (REGNO (XEXP (x, 0)))
5724 {
5725 case VCC_REG:
5726 case VCCZ_REG:
5727 s = "_vcc";
5728 break;
5729 case SCC_REG:
5730 /* For some reason llvm-mc insists on scc0 instead of sccz. */
5731 num = true;
5732 s = "_scc";
5733 break;
5734 case EXECZ_REG:
5735 s = "_exec";
5736 break;
5737 default:
5738 output_operand_lossage ("invalid %%xn code");
5739 return;
5740 }
5741 fputs (s, file);
5742 if (xcode == (invert ? NE : EQ))
5743 fputc (num ? '0' : 'z', file);
5744 else
5745 fputs (num ? "1" : "nz", file);
5746 return;
5747 }
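    /* Print an s_cmp comparison suffix, e.g. "_lt_i32", chosen from the
       comparison code and the operand size.  */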
5748 case 'D':
5749 {
5750 const char *s;
5751 bool cmp_signed = false;
5752 switch (xcode)
5753 {
5754 case EQ:
5755 s = "_eq_";
5756 break;
5757 case NE:
5758 s = "_lg_";
5759 break;
5760 case LT:
5761 s = "_lt_";
5762 cmp_signed = true;
5763 break;
5764 case LE:
5765 s = "_le_";
5766 cmp_signed = true;
5767 break;
5768 case GT:
5769 s = "_gt_";
5770 cmp_signed = true;
5771 break;
5772 case GE:
5773 s = "_ge_";
5774 cmp_signed = true;
5775 break;
5776 case LTU:
5777 s = "_lt_";
5778 break;
5779 case LEU:
5780 s = "_le_";
5781 break;
5782 case GTU:
5783 s = "_gt_";
5784 break;
5785 case GEU:
5786 s = "_ge_";
5787 break;
5788 default:
5789 output_operand_lossage ("invalid %%xn code");
5790 return;
5791 }
5792 fputs (s, file);
5793 fputc (cmp_signed ? 'i' : 'u', file);
5794
5795 machine_mode mode = GET_MODE (XEXP (x, 0));
5796
5797 if (mode == VOIDmode)
5798 mode = GET_MODE (XEXP (x, 1));
5799
5800 /* If both sides are constants, then assume the instruction is in
5801 SImode since s_cmp can only do integer compares. */
5802 if (mode == VOIDmode)
5803 mode = SImode;
5804
5805 switch (GET_MODE_SIZE (mode))
5806 {
5807 case 4:
5808 s = "32";
5809 break;
5810 case 8:
5811 s = "64";
5812 break;
5813 default:
5814 output_operand_lossage ("invalid operand %%xn code");
5815 return;
5816 }
5817 fputs (s, file);
5818 return;
5819 }
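    /* Print a comparison suffix that also covers floating-point and vector
       modes, e.g. "_lt_f32" or "_ge_u64".  */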
5820 case 'E':
5821 {
5822 const char *s;
5823 bool cmp_signed = false;
5824 machine_mode mode = GET_MODE (XEXP (x, 0));
5825
5826 if (mode == VOIDmode)
5827 mode = GET_MODE (XEXP (x, 1));
5828
5829 /* If both sides are constants, assume the instruction is in SFmode
5830 if either operand is floating point, otherwise assume SImode. */
5831 if (mode == VOIDmode)
5832 {
5833 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
5834 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
5835 mode = SFmode;
5836 else
5837 mode = SImode;
5838 }
5839
5840 /* Use the same format code for vector comparisons. */
5841 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
5842 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
5843 mode = GET_MODE_INNER (mode);
5844
5845 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
5846
5847 switch (xcode)
5848 {
5849 case EQ:
5850 s = "_eq_";
5851 break;
5852 case NE:
5853 s = float_p ? "_neq_" : "_ne_";
5854 break;
5855 case LT:
5856 s = "_lt_";
5857 cmp_signed = true;
5858 break;
5859 case LE:
5860 s = "_le_";
5861 cmp_signed = true;
5862 break;
5863 case GT:
5864 s = "_gt_";
5865 cmp_signed = true;
5866 break;
5867 case GE:
5868 s = "_ge_";
5869 cmp_signed = true;
5870 break;
5871 case LTU:
5872 s = "_lt_";
5873 break;
5874 case LEU:
5875 s = "_le_";
5876 break;
5877 case GTU:
5878 s = "_gt_";
5879 break;
5880 case GEU:
5881 s = "_ge_";
5882 break;
5883 case ORDERED:
5884 s = "_o_";
5885 break;
5886 case UNORDERED:
5887 s = "_u_";
5888 break;
5889 default:
5890 output_operand_lossage ("invalid %%xn code");
5891 return;
5892 }
5893 fputs (s, file);
5894 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
5895
5896 switch (GET_MODE_SIZE (mode))
5897 {
5898 case 1:
5899 s = "32";
5900 break;
5901 case 2:
5902 s = float_p ? "16" : "32";
5903 break;
5904 case 4:
5905 s = "32";
5906 break;
5907 case 8:
5908 s = "64";
5909 break;
5910 default:
5911 output_operand_lossage ("invalid operand %%xn code");
5912 return;
5913 }
5914 fputs (s, file);
5915 return;
5916 }
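    /* 'L' and 'H' print the low and high parts of a multi-register operand.  */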
5917 case 'L':
5918 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
5919 return;
5920 case 'H':
5921 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
5922 return;
5923 case 'R':
5924 /* Print a scalar register number as an integer. Temporary hack. */
5925 gcc_assert (REG_P (x));
5926 fprintf (file, "%u", (int) REGNO (x));
5927 return;
5928 case 'V':
5929 /* Print a vector register number as an integer. Temporary hack. */
5930 gcc_assert (REG_P (x));
5931 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
5932 return;
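    /* No code: print the operand itself -- a register, an address, an
       integer, the first element of a constant vector, or one of the
       representable inline floating-point constants.  */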
5933 case 0:
5934 if (xcode == REG)
5935 print_reg (file, x);
5936 else if (xcode == MEM)
5937 output_address (GET_MODE (x), x);
5938 else if (xcode == CONST_INT)
5939 fprintf (file, "%i", (int) INTVAL (x));
5940 else if (xcode == CONST_VECTOR)
5941 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
5942 else if (xcode == CONST_DOUBLE)
5943 {
5944 const char *str;
5945 switch (gcn_inline_fp_constant_p (x, false))
5946 {
5947 case 240:
5948 str = "0.5";
5949 break;
5950 case 241:
5951 str = "-0.5";
5952 break;
5953 case 242:
5954 str = "1.0";
5955 break;
5956 case 243:
5957 str = "-1.0";
5958 break;
5959 case 244:
5960 str = "2.0";
5961 break;
5962 case 245:
5963 str = "-2.0";
5964 break;
5965 case 246:
5966 str = "4.0";
5967 break;
5968 case 247:
5969 str = "-4.0";
5970 break;
5971 case 248:
5972 str = "1/pi";
5973 break;
5974 default:
5975 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
5976 ? DImode : SImode,
5977 x, GET_MODE (x), 0);
5978		if (ix)
5979 print_operand (file, ix, code);
5980 else
5981		  output_operand_lossage ("invalid fp constant");
5982 return;
5983 break;
5984 }
5985	    fputs (str, file);
5986 return;
5987 }
5988 else
5989 output_addr_const (file, x);
5990 return;
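    /* Append the "_co" opcode suffix (carry-out instruction forms) on GCN5
       and later.  */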
5991 case '^':
5992 if (TARGET_GCN5_PLUS)
5993 fputs ("_co", file);
5994 return;
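    /* Emit the " glc" (globally coherent) modifier for volatile memory
       accesses.  */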
5995 case 'g':
5996 gcc_assert (xcode == MEM);
5997 if (MEM_VOLATILE_P (x))
5998 fputs (" glc", file);
5999 return;
6000 default:
6001 output_operand_lossage ("invalid %%xn code");
6002 }
6003 gcc_unreachable ();
6004}
6005
6006/* }}} */
6007/* {{{ TARGET hook overrides. */
6008
6009#undef TARGET_ADDR_SPACE_ADDRESS_MODE
6010#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6011#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6012#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6013 gcn_addr_space_legitimate_address_p
6014#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6015#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6016#undef TARGET_ADDR_SPACE_POINTER_MODE
6017#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6018#undef TARGET_ADDR_SPACE_SUBSET_P
6019#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6020#undef TARGET_ADDR_SPACE_CONVERT
6021#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6022#undef TARGET_ARG_PARTIAL_BYTES
6023#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6024#undef TARGET_ASM_ALIGNED_DI_OP
6025#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6026#undef TARGET_ASM_FILE_START
6027#define TARGET_ASM_FILE_START output_file_start
6028#undef TARGET_ASM_FUNCTION_PROLOGUE
6029#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6030#undef TARGET_ASM_SELECT_SECTION
6031#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6032#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6033#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6034#undef TARGET_ATTRIBUTE_TABLE
6035#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6036#undef TARGET_BUILTIN_DECL
6037#define TARGET_BUILTIN_DECL gcn_builtin_decl
6038#undef TARGET_CAN_CHANGE_MODE_CLASS
6039#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6040#undef TARGET_CAN_ELIMINATE
6041#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6042#undef TARGET_CANNOT_COPY_INSN_P
6043#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6044#undef TARGET_CLASS_LIKELY_SPILLED_P
6045#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6046#undef TARGET_CLASS_MAX_NREGS
6047#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6048#undef TARGET_CONDITIONAL_REGISTER_USAGE
6049#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6050#undef TARGET_CONSTANT_ALIGNMENT
6051#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6052#undef TARGET_DEBUG_UNWIND_INFO
6053#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6054#undef TARGET_EMUTLS_VAR_INIT
6055#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6056#undef TARGET_EXPAND_BUILTIN
6057#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6058#undef TARGET_FUNCTION_ARG
6059#define TARGET_FUNCTION_ARG gcn_function_arg
6060#undef TARGET_FUNCTION_ARG_ADVANCE
6061#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6062#undef TARGET_FUNCTION_VALUE
6063#define TARGET_FUNCTION_VALUE gcn_function_value
6064#undef TARGET_FUNCTION_VALUE_REGNO_P
6065#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6066#undef TARGET_GIMPLIFY_VA_ARG_EXPR
6067#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6068#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6069#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6070#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
6071#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
6072 gcn_goacc_adjust_propagation_record
6073#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
6074#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
6075#undef TARGET_GOACC_FORK_JOIN
6076#define TARGET_GOACC_FORK_JOIN gcn_fork_join
6077#undef TARGET_GOACC_REDUCTION
6078#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6079#undef TARGET_GOACC_VALIDATE_DIMS
6080#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6081#undef TARGET_HARD_REGNO_MODE_OK
6082#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6083#undef TARGET_HARD_REGNO_NREGS
6084#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6085#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6086#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6087#undef TARGET_INIT_BUILTINS
6088#define TARGET_INIT_BUILTINS gcn_init_builtins
6089#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6090#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6091 gcn_ira_change_pseudo_allocno_class
6092#undef TARGET_LEGITIMATE_CONSTANT_P
6093#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6094#undef TARGET_LRA_P
6095#define TARGET_LRA_P hook_bool_void_true
6096#undef TARGET_MACHINE_DEPENDENT_REORG
6097#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6098#undef TARGET_MEMORY_MOVE_COST
6099#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6100#undef TARGET_MODES_TIEABLE_P
6101#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6102#undef TARGET_OPTION_OVERRIDE
6103#define TARGET_OPTION_OVERRIDE gcn_option_override
6104#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6105#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6106 gcn_pretend_outgoing_varargs_named
6107#undef TARGET_PROMOTE_FUNCTION_MODE
6108#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6109#undef TARGET_REGISTER_MOVE_COST
6110#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6111#undef TARGET_RETURN_IN_MEMORY
6112#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6113#undef TARGET_RTX_COSTS
6114#define TARGET_RTX_COSTS gcn_rtx_costs
6115#undef TARGET_SECONDARY_RELOAD
6116#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6117#undef TARGET_SECTION_TYPE_FLAGS
6118#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6119#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6120#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6121 gcn_small_register_classes_for_mode_p
6122#undef TARGET_SPILL_CLASS
6123#define TARGET_SPILL_CLASS gcn_spill_class
6124#undef TARGET_STRICT_ARGUMENT_NAMING
6125#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6126#undef TARGET_TRAMPOLINE_INIT
6127#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6128#undef TARGET_TRULY_NOOP_TRUNCATION
6129#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6130#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6131#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6132#undef TARGET_VECTORIZE_GET_MASK_MODE
6133#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6134#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6135#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6136#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6137#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6138 gcn_preferred_vector_alignment
6139#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6140#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6141 gcn_vectorize_support_vector_misalignment
6142#undef TARGET_VECTORIZE_VEC_PERM_CONST
6143#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6144#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6145#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6146 gcn_vector_alignment_reachable
6147#undef TARGET_VECTOR_MODE_SUPPORTED_P
6148#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6149
6150struct gcc_target targetm = TARGET_INITIALIZER;
6151
6152#include "gt-gcn.h"
6153/* }}} */