1/* Copyright (C) 2016-2019 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with GCC; see the file COPYING3. If not see
15 <http://www.gnu.org/licenses/>. */
16
17/* {{{ Includes. */
18
19/* We want GET_MODE_SIZE et al to return integers, please. */
20#define IN_TARGET_CODE 1
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "memmodel.h"
28#include "rtl.h"
29#include "tree.h"
30#include "df.h"
31#include "tm_p.h"
32#include "stringpool.h"
33#include "optabs.h"
34#include "regs.h"
35#include "emit-rtl.h"
36#include "recog.h"
37#include "diagnostic-core.h"
38#include "insn-attr.h"
39#include "fold-const.h"
40#include "calls.h"
41#include "explow.h"
42#include "expr.h"
43#include "output.h"
44#include "cfgrtl.h"
45#include "langhooks.h"
46#include "builtins.h"
47#include "omp-general.h"
48#include "print-rtl.h"
49#include "attribs.h"
50#include "varasm.h"
51#include "intl.h"
52#include "rtl-iter.h"
53
54/* This file should be included last. */
55#include "target-def.h"
56
57/* }}} */
58/* {{{ Global variables. */
59
60/* Constants used by FP instructions. */
61
62static REAL_VALUE_TYPE dconst4, dconst1over2pi;
63static bool ext_gcn_constants_init = 0;
64
65/* Holds the ISA variant, derived from the command line parameters. */
66
67int gcn_isa = 3; /* Default to GCN3. */
68
69/* Reserve this much space for LDS (for propagating variables from
70 worker-single mode to worker-partitioned mode), per workgroup. Global
71 analysis could calculate an exact bound, but we don't do that yet.
72
73 We reserve the whole LDS, which also prevents any other workgroup
74 from sharing the Compute Unit. */
75
76#define LDS_SIZE 65536
77
78/* }}} */
79/* {{{ Initialization and options. */
80
81/* Initialize machine_function. */
82
83static struct machine_function *
84gcn_init_machine_status (void)
85{
86 struct machine_function *f;
87
88 f = ggc_cleared_alloc<machine_function> ();
89
90 /* Set up LDS allocation for broadcasting for this function. */
91 f->lds_allocated = 32;
92 f->lds_allocs = hash_map<tree, int>::create_ggc (64);
93
94 /* And LDS temporary decls for worker reductions. */
95 vec_alloc (f->reduc_decls, 0);
96
97 if (TARGET_GCN3)
98 f->use_flat_addressing = true;
99
100 return f;
101}
102
103/* Implement TARGET_OPTION_OVERRIDE.
104
105 Override option settings where defaults are variable, or we have specific
106 needs to consider. */
107
108static void
109gcn_option_override (void)
110{
111 init_machine_status = gcn_init_machine_status;
112
113 /* The HSA runtime does not respect ELF load addresses, so force PIE. */
114 if (!flag_pie)
115 flag_pie = 2;
116 if (!flag_pic)
117 flag_pic = flag_pie;
118
119 gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
120
121 /* The default stack size needs to be small for offload kernels because
122 there may be many, many threads. Also, a smaller stack gives a
123 measurable performance boost. But, a small stack is insufficient
124 for running the testsuite, so we use a larger default for the
125 stand-alone case. */
126 if (stack_size_opt == -1)
127 {
128 if (flag_openacc || flag_openmp)
129 /* 512 bytes per work item = 32kB total. */
130 stack_size_opt = 512 * 64;
131 else
132 /* 1MB total. */
133 stack_size_opt = 1048576;
134 }
135}
136
137/* }}} */
138/* {{{ Attributes. */
139
140/* This table defines the arguments that are permitted in
141 __attribute__ ((amdgpu_hsa_kernel (...))).
142
143 The names and values correspond to the HSA metadata that is encoded
144 into the assembler file and binary. */
145
146static const struct gcn_kernel_arg_type
147{
148 const char *name;
149 const char *header_pseudo;
150 machine_mode mode;
151
152 /* This should be set to -1 or -2 for a dynamically allocated register
153 number. Use -1 if this argument contributes to the user_sgpr_count,
154 -2 otherwise. */
155 int fixed_regno;
156} gcn_kernel_arg_types[] = {
157 {"exec", NULL, DImode, EXEC_REG},
158#define PRIVATE_SEGMENT_BUFFER_ARG 1
159 {"private_segment_buffer",
160 "enable_sgpr_private_segment_buffer", TImode, -1},
161#define DISPATCH_PTR_ARG 2
162 {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
163#define QUEUE_PTR_ARG 3
164 {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
165#define KERNARG_SEGMENT_PTR_ARG 4
166 {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
167 {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
168#define FLAT_SCRATCH_INIT_ARG 6
169 {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
170#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
171 {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
172 {"grid_workgroup_count_X",
173 "enable_sgpr_grid_workgroup_count_x", SImode, -1},
174 {"grid_workgroup_count_Y",
175 "enable_sgpr_grid_workgroup_count_y", SImode, -1},
176 {"grid_workgroup_count_Z",
177 "enable_sgpr_grid_workgroup_count_z", SImode, -1},
178#define WORKGROUP_ID_X_ARG 11
179 {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
180 {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
181 {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
182 {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
183#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
184 {"private_segment_wave_offset",
185 "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
186#define WORK_ITEM_ID_X_ARG 16
187 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
188#define WORK_ITEM_ID_Y_ARG 17
189 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
190#define WORK_ITEM_ID_Z_ARG 18
191 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
192};
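/* Illustrative example (added commentary, not from the original source): a
   kernel entry point can request extra setup registers by naming rows of the
   table above, e.g.

     void __attribute__ ((amdgpu_hsa_kernel ("dispatch_ptr", "queue_ptr")))
     my_kernel (void);

   where "my_kernel" is a hypothetical name.  Each string argument must match
   a .name field in gcn_kernel_arg_types; unknown or duplicated names are
   meant to be diagnosed by gcn_parse_amdgpu_hsa_kernel_attribute below.  */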
193
194/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
195 This function also sets the default values for some arguments.
196
197 Return true if an error is diagnosed; return false on success, with ARGS populated. */
198
199static bool
200gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
201 tree list)
202{
203 bool err = false;
204 args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
205 | (1 << QUEUE_PTR_ARG)
206 | (1 << KERNARG_SEGMENT_PTR_ARG)
207 | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
208 args->nargs = 0;
209
210 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
211 args->reg[a] = -1;
212
213 for (; list; list = TREE_CHAIN (list))
214 {
215 const char *str;
216 if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
217 {
218 error ("amdgpu_hsa_kernel attribute requires string constant "
219 "arguments");
220 break;
221 }
222 str = TREE_STRING_POINTER (TREE_VALUE (list));
223 int a;
224 for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
225 {
226 if (!strcmp (str, gcn_kernel_arg_types[a].name))
227 break;
228 }
229 if (a == GCN_KERNEL_ARG_TYPES)
230 {
231 error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
232 err = true;
233 break;
234 }
235 if (args->requested & (1 << a))
236 {
237 error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
238 "attribute", str);
239 err = true;
240 break;
241 }
242 args->requested |= (1 << a);
243 args->order[args->nargs++] = a;
244 }
245 args->requested |= (1 << WORKGROUP_ID_X_ARG);
246 args->requested |= (1 << WORK_ITEM_ID_Z_ARG);
247
248 /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
249 WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
250 requesting WORK_ITEM_ID_X_ARG. */
251 if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
252 args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
253 if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
254 args->requested |= (1 << WORK_ITEM_ID_X_ARG);
255
256 /* Always enable this so that kernargs is in a predictable place for
257 gomp_print, etc. */
258 args->requested |= (1 << DISPATCH_PTR_ARG);
259
260 int sgpr_regno = FIRST_SGPR_REG;
261 args->nsgprs = 0;
262 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
263 {
264 if (!(args->requested & (1 << a)))
265 continue;
266
267 if (gcn_kernel_arg_types[a].fixed_regno >= 0)
268 args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
269 else
270 {
271 int reg_count;
272
273 switch (gcn_kernel_arg_types[a].mode)
274 {
275 case E_SImode:
276 reg_count = 1;
277 break;
278 case E_DImode:
279 reg_count = 2;
280 break;
281 case E_TImode:
282 reg_count = 4;
283 break;
284 default:
285 gcc_unreachable ();
286 }
287 args->reg[a] = sgpr_regno;
288 sgpr_regno += reg_count;
289 if (gcn_kernel_arg_types[a].fixed_regno == -1)
290 args->nsgprs += reg_count;
291 }
292 }
293 if (sgpr_regno > FIRST_SGPR_REG + 16)
294 {
295 error ("too many arguments passed in sgpr registers");
296 }
297 return err;
298}
299
300/* Referenced by TARGET_ATTRIBUTE_TABLE.
301
302 Validates target specific attributes. */
303
304static tree
305gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
306 tree args, int, bool *no_add_attrs)
307{
308 if (!FUNC_OR_METHOD_TYPE_P (*node))
309 {
310 warning (OPT_Wattributes, "%qE attribute only applies to functions",
311 name);
312 *no_add_attrs = true;
313 return NULL_TREE;
314 }
315
316 /* Can combine regparm with all attributes but fastcall, and thiscall. */
317 if (is_attribute_p ("gcnhsa_kernel", name))
318 {
319 struct gcn_kernel_args kernelarg;
320
321 if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
322 *no_add_attrs = true;
323
324 return NULL_TREE;
325 }
326
327 return NULL_TREE;
328}
329
330/* Implement TARGET_ATTRIBUTE_TABLE.
331
332 Create target-specific __attribute__ types. */
333
334static const struct attribute_spec gcn_attribute_table[] = {
335 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
336 affects_type_identity, handler, exclude } */
337 {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
338 true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
339 /* End element. */
340 {NULL, 0, 0, false, false, false, false, NULL, NULL}
341};
342
343/* }}} */
344/* {{{ Registers and modes. */
345
346/* Implement TARGET_CLASS_MAX_NREGS.
347
348 Return the number of hard registers needed to hold a value of MODE in
349 a register of class RCLASS. */
350
351static unsigned char
352gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
353{
354 /* Scalar registers are 32bit, vector registers are in fact tuples of
355 64 lanes. */
356 if (rclass == VGPR_REGS)
357 {
358 if (vgpr_1reg_mode_p (mode))
359 return 1;
360 if (vgpr_2reg_mode_p (mode))
361 return 2;
362 /* TImode is used by DImode compare_and_swap. */
363 if (mode == TImode)
364 return 4;
365 }
366 else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
367 return 2;
368 return CEIL (GET_MODE_SIZE (mode), 4);
369}
370
371/* Implement TARGET_HARD_REGNO_NREGS.
372
373 Return the number of hard registers needed to hold a value of MODE in
374 REGNO. */
375
376unsigned int
377gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
378{
379 return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
380}
381
382/* Implement TARGET_HARD_REGNO_MODE_OK.
383
384 Return true if REGNO can hold value in MODE. */
385
386bool
387gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
388{
389 /* Treat a complex mode as if it were a scalar mode of the same overall
390 size for the purposes of allocating hard registers. */
391 if (COMPLEX_MODE_P (mode))
392 switch (mode)
393 {
394 case E_CQImode:
395 case E_CHImode:
396 mode = SImode;
397 break;
398 case E_CSImode:
399 mode = DImode;
400 break;
401 case E_CDImode:
402 mode = TImode;
403 break;
404 case E_HCmode:
405 mode = SFmode;
406 break;
407 case E_SCmode:
408 mode = DFmode;
409 break;
410 default:
411 /* Not supported. */
412 return false;
413 }
414
415 switch (regno)
416 {
417 case FLAT_SCRATCH_LO_REG:
418 case XNACK_MASK_LO_REG:
419 case TBA_LO_REG:
420 case TMA_LO_REG:
421 return (mode == SImode || mode == DImode);
422 case VCC_LO_REG:
423 case EXEC_LO_REG:
424 return (mode == BImode || mode == SImode || mode == DImode);
425 case M0_REG:
426 case FLAT_SCRATCH_HI_REG:
427 case XNACK_MASK_HI_REG:
428 case TBA_HI_REG:
429 case TMA_HI_REG:
430 return mode == SImode;
431 case VCC_HI_REG:
432 return false;
433 case EXEC_HI_REG:
434 return mode == SImode /*|| mode == V32BImode */ ;
435 case SCC_REG:
436 case VCCZ_REG:
437 case EXECZ_REG:
438 return mode == BImode;
439 }
440 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
441 return true;
442 if (SGPR_REGNO_P (regno))
443 /* We restrict double register values to aligned registers. */
444 return (sgpr_1reg_mode_p (mode)
445 || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
446 || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
447 if (VGPR_REGNO_P (regno))
448 return (vgpr_1reg_mode_p (mode) || vgpr_2reg_mode_p (mode)
449 /* TImode is used by DImode compare_and_swap. */
450 || mode == TImode);
451 return false;
452}
453
454/* Implement REGNO_REG_CLASS via gcn.h.
455
456 Return smallest class containing REGNO. */
457
458enum reg_class
459gcn_regno_reg_class (int regno)
460{
461 switch (regno)
462 {
463 case SCC_REG:
464 return SCC_CONDITIONAL_REG;
465 case VCC_LO_REG:
466 case VCC_HI_REG:
467 return VCC_CONDITIONAL_REG;
468 case VCCZ_REG:
469 return VCCZ_CONDITIONAL_REG;
470 case EXECZ_REG:
471 return EXECZ_CONDITIONAL_REG;
472 case EXEC_LO_REG:
473 case EXEC_HI_REG:
474 return EXEC_MASK_REG;
475 }
476 if (VGPR_REGNO_P (regno))
477 return VGPR_REGS;
478 if (SGPR_REGNO_P (regno))
479 return SGPR_REGS;
480 if (regno < FIRST_VGPR_REG)
481 return GENERAL_REGS;
482 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
483 return AFP_REGS;
484 return ALL_REGS;
485}
486
487/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
488
489 GCC assumes that the lowpart contains the first part of the value as stored
490 in memory. This is not the case for vector registers. */
491
492bool
493gcn_can_change_mode_class (machine_mode from, machine_mode to,
494 reg_class_t regclass)
495{
496 if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
497 return true;
498 return (gcn_class_max_nregs (regclass, from)
499 == gcn_class_max_nregs (regclass, to));
500}
501
502/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
503
504 When this hook returns true for MODE, the compiler allows
505 registers explicitly used in the rtl to be used as spill registers
506 but prevents the compiler from extending the lifetime of these
507 registers. */
508
509bool
510gcn_small_register_classes_for_mode_p (machine_mode mode)
511{
512 /* We allocate into exec and vcc regs, which form small register classes. */
513 return mode == DImode || mode == SImode;
514}
515
516/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
517
518 Returns true if pseudos that have been assigned to registers of class RCLASS
519 would likely be spilled because registers of RCLASS are needed for spill
520 registers. */
521
522static bool
523gcn_class_likely_spilled_p (reg_class_t rclass)
524{
525 return (rclass == EXEC_MASK_REG
526 || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
527}
528
529/* Implement TARGET_MODES_TIEABLE_P.
530
531 Returns true if a value of MODE1 is accessible in MODE2 without
532 copying. */
533
534bool
535gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
536{
537 return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
538 && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
539}
540
541/* Implement TARGET_TRULY_NOOP_TRUNCATION.
542
543 Returns true if it is safe to "convert" a value of INPREC bits to one of
544 OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
545 it as if it had only OUTPREC bits. */
546
547bool
548gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
549{
550 return ((inprec <= 32) && (outprec <= inprec));
551}
552
553/* Return N-th part of value occupying multiple registers. */
554
555rtx
556gcn_operand_part (machine_mode mode, rtx op, int n)
557{
558 if (GET_MODE_SIZE (mode) >= 256)
559 {
560 /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
561
562 if (REG_P (op))
563 {
564 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
565 return gen_rtx_REG (V64SImode, REGNO (op) + n);
566 }
567 if (GET_CODE (op) == CONST_VECTOR)
568 {
569 int units = GET_MODE_NUNITS (mode);
570 rtvec v = rtvec_alloc (units);
571
572 for (int i = 0; i < units; ++i)
573 RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
574 CONST_VECTOR_ELT (op, i), n);
575
576 return gen_rtx_CONST_VECTOR (V64SImode, v);
577 }
578 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
579 return gcn_gen_undef (V64SImode);
580 gcc_unreachable ();
581 }
582 else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
583 {
584 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
585 return gen_rtx_REG (SImode, REGNO (op) + n);
586 }
587 else
588 {
589 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
590 return gcn_gen_undef (SImode);
591
592 /* If it's a constant then let's assume it is of the largest mode
593 available, otherwise simplify_gen_subreg will fail. */
594 if (mode == VOIDmode && CONST_INT_P (op))
595 mode = DImode;
596 return simplify_gen_subreg (SImode, op, mode, n * 4);
597 }
598}
599
600/* Return N-th part of value occupying multiple registers. */
601
602rtx
603gcn_operand_doublepart (machine_mode mode, rtx op, int n)
604{
605 return simplify_gen_subreg (DImode, op, mode, n * 8);
606}
607
608/* Return true if OP can be split into subregs or high/low parts.
609 This is always true for scalars, but not normally true for vectors.
610 However, for vectors in hardregs we can use the low and high registers. */
611
612bool
613gcn_can_split_p (machine_mode, rtx op)
614{
615 if (vgpr_vector_mode_p (GET_MODE (op)))
616 {
617 if (GET_CODE (op) == SUBREG)
618 op = SUBREG_REG (op);
619 if (!REG_P (op))
620 return true;
621 return REGNO (op) <= FIRST_PSEUDO_REGISTER;
622 }
623 return true;
624}
625
626/* Implement TARGET_SPILL_CLASS.
627
628 Return class of registers which could be used for pseudo of MODE
629 and of class RCLASS for spilling instead of memory. Return NO_REGS
630 if it is not possible or non-profitable. */
631
632static reg_class_t
633gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
634{
635 if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
636 || c == VCC_CONDITIONAL_REG)
637 return SGPR_REGS;
638 else
639 return NO_REGS;
640}
641
642/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
643
644 Change allocno class for given pseudo from allocno and best class
645 calculated by IRA. */
646
647static reg_class_t
648gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
649 reg_class_t best_cl)
650{
651 /* Avoid returning classes that contain both vgpr and sgpr registers. */
652 if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
653 return cl;
654 if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
655 && best_cl != ALL_GPR_REGS)
656 return best_cl;
657
658 machine_mode mode = PSEUDO_REGNO_MODE (regno);
659 if (vgpr_vector_mode_p (mode))
660 return VGPR_REGS;
661
662 return GENERAL_REGS;
663}
664
665/* Create a new DImode pseudo reg and emit an instruction to initialize
666 it to VAL. */
667
668static rtx
669get_exec (int64_t val)
670{
671 rtx reg = gen_reg_rtx (DImode);
672 emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
673 return reg;
674}
675
676/* Return value of scalar exec register. */
677
678rtx
679gcn_scalar_exec ()
680{
681 return const1_rtx;
682}
683
684/* Return pseudo holding scalar exec register. */
685
686rtx
687gcn_scalar_exec_reg ()
688{
689 return get_exec (1);
690}
691
692/* Return value of full exec register. */
693
694rtx
695gcn_full_exec ()
696{
697 return constm1_rtx;
698}
699
700/* Return pseudo holding full exec register. */
701
702rtx
703gcn_full_exec_reg ()
704{
705 return get_exec (-1);
706}
707
708/* }}} */
709/* {{{ Immediate constants. */
710
711/* Initialize shared numeric constants. */
712
713static void
714init_ext_gcn_constants (void)
715{
716 real_from_integer (&dconst4, DFmode, 4, SIGNED);
717
718 /* FIXME: this constant probably does not match what hardware really loads.
719 Reality check it eventually. */
720 real_from_string (&dconst1over2pi,
721 "0.1591549430918953357663423455968866839");
722 real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
723
724 ext_gcn_constants_init = 1;
725}
726
727/* Return non-zero if X is a constant that can appear as an inline operand.
728 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi), or a vector of
729 those.
730 The value returned should be the encoding of this constant. */
731
732int
733gcn_inline_fp_constant_p (rtx x, bool allow_vector)
734{
735 machine_mode mode = GET_MODE (x);
736
737 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
738 && allow_vector)
739 {
740 int n;
741 if (GET_CODE (x) != CONST_VECTOR)
742 return 0;
743 n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
744 if (!n)
745 return 0;
746 for (int i = 1; i < 64; i++)
747 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
748 return 0;
749 return 1;
750 }
751
752 if (mode != HFmode && mode != SFmode && mode != DFmode)
753 return 0;
754
755 const REAL_VALUE_TYPE *r;
756
757 if (x == CONST0_RTX (mode))
758 return 128;
759 if (x == CONST1_RTX (mode))
760 return 242;
761
762 r = CONST_DOUBLE_REAL_VALUE (x);
763
764 if (real_identical (r, &dconstm1))
765 return 243;
766
767 if (real_identical (r, &dconsthalf))
768 return 240;
769 if (real_identical (r, &dconstm1))
770 return 243;
771 if (real_identical (r, &dconst2))
772 return 244;
773 if (real_identical (r, &dconst4))
774 return 246;
775 if (real_identical (r, &dconst1over2pi))
776 return 248;
777 if (!ext_gcn_constants_init)
778 init_ext_gcn_constants ();
779 real_value_negate (r);
780 if (real_identical (r, &dconsthalf))
781 return 241;
782 if (real_identical (r, &dconst2))
783 return 245;
784 if (real_identical (r, &dconst4))
785 return 247;
786
787 /* FIXME: add 4, -4 and 1/(2*PI). */
788
789 return 0;
790}
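/* For reference (a summary added here, derived from the checks above): for a
   scalar constant the encodings returned are 128 for 0.0, 240 for 0.5, 241
   for -0.5, 242 for 1.0, 243 for -1.0, 244 for 2.0, 245 for -2.0, 246 for
   4.0, 247 for -4.0 and 248 for 1/(2*pi); 0 means "not an inline constant",
   and a uniform vector of an inline constant returns 1.  */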
791
792/* Return true if X is a constant that can appear as an immediate operand.
793 This is any inline constant (see above), or any other HFmode or SFmode
794 constant; DFmode constants must be inline constants. A vector of those
795 is also accepted. */
796
797bool
798gcn_fp_constant_p (rtx x, bool allow_vector)
799{
800 machine_mode mode = GET_MODE (x);
801
802 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
803 && allow_vector)
804 {
805 int n;
806 if (GET_CODE (x) != CONST_VECTOR)
807 return false;
808 n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
809 if (!n)
810 return false;
811 for (int i = 1; i < 64; i++)
812 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
813 return false;
814 return true;
815 }
816 if (mode != HFmode && mode != SFmode && mode != DFmode)
817 return false;
818
819 if (gcn_inline_fp_constant_p (x, false))
820 return true;
821 /* FIXME: It is not clear how 32bit immediates are interpreted here. */
822 return (mode != DFmode);
823}
824
825/* Return true if X is a constant representable as an inline immediate
826 constant in a 32-bit instruction encoding. */
827
828bool
829gcn_inline_constant_p (rtx x)
830{
831 if (GET_CODE (x) == CONST_INT)
832 return INTVAL (x) >= -16 && INTVAL (x) < 64;
833 if (GET_CODE (x) == CONST_DOUBLE)
834 return gcn_inline_fp_constant_p (x, false);
835 if (GET_CODE (x) == CONST_VECTOR)
836 {
837 int n;
838 if (!vgpr_vector_mode_p (GET_MODE (x)))
839 return false;
840 n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
841 if (!n)
842 return false;
843 for (int i = 1; i < 64; i++)
844 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
845 return false;
846 return 1;
847 }
848 return false;
849}
850
851/* Return true if X is a constant representable as an immediate constant
852 in a 32 or 64-bit instruction encoding. */
853
854bool
855gcn_constant_p (rtx x)
856{
857 switch (GET_CODE (x))
858 {
859 case CONST_INT:
860 return true;
861
862 case CONST_DOUBLE:
863 return gcn_fp_constant_p (x, false);
864
865 case CONST_VECTOR:
866 {
867 int n;
868 if (!vgpr_vector_mode_p (GET_MODE (x)))
869 return false;
870 n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
871 if (!n)
872 return false;
873 for (int i = 1; i < 64; i++)
874 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
875 return false;
876 return true;
877 }
878
879 case SYMBOL_REF:
880 case LABEL_REF:
881 return true;
882
883 default:
884 ;
885 }
886
887 return false;
888}
889
890/* Return true if X is a constant representable as two inline immediate
891 constants in a 64-bit instruction that is split into two 32-bit
892 instructions. */
893
894bool
895gcn_inline_constant64_p (rtx x)
896{
897 if (GET_CODE (x) == CONST_VECTOR)
898 {
899 if (!vgpr_vector_mode_p (GET_MODE (x)))
900 return false;
901 if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0)))
902 return false;
903 for (int i = 1; i < 64; i++)
904 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
905 return false;
906
907 return true;
908 }
909
910 if (GET_CODE (x) != CONST_INT)
911 return false;
912
913 rtx val_lo = gcn_operand_part (DImode, x, 0);
914 rtx val_hi = gcn_operand_part (DImode, x, 1);
915 return gcn_inline_constant_p (val_lo) && gcn_inline_constant_p (val_hi);
916}
917
918/* Return true if X is a constant representable as an immediate constant
919 in a 32 or 64-bit instruction encoding where the hardware will
920 extend the immediate to 64-bits. */
921
922bool
923gcn_constant64_p (rtx x)
924{
925 if (!gcn_constant_p (x))
926 return false;
927
928 if (GET_CODE (x) != CONST_INT)
929 return true;
930
931 /* Negative numbers are only allowed if they can be encoded within src0,
932 because the 32-bit immediates do not get sign-extended.
933 Unsigned numbers must not be encodable as 32-bit -1..-16, because the
934 assembler will use a src0 inline immediate and that will get
935 sign-extended. */
936 HOST_WIDE_INT val = INTVAL (x);
937 return (((val & 0xffffffff) == val /* Positive 32-bit. */
938 && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
939 || gcn_inline_constant_p (x)); /* Src0. */
940}
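/* Worked example (an illustration added here, not from the original source):
   the value 0xfffffff5 is rejected by gcn_constant64_p because the assembler
   would encode it as the src0 inline immediate -11, which the hardware
   sign-extends to 0xfffffffffffffff5; a value such as 0x7fffffff is accepted
   as a plain positive 32-bit literal.  */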
941
942/* Implement TARGET_LEGITIMATE_CONSTANT_P.
943
944 Returns true if X is a legitimate constant for a MODE immediate operand. */
945
946bool
947gcn_legitimate_constant_p (machine_mode, rtx x)
948{
949 return gcn_constant_p (x);
950}
951
952/* Return true if X is a CONST_VECTOR of a single, duplicated constant. */
953
954static bool
955single_cst_vector_p (rtx x)
956{
957 if (GET_CODE (x) != CONST_VECTOR)
958 return false;
959 for (int i = 1; i < 64; i++)
960 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
961 return false;
962 return true;
963}
964
965/* Create a CONST_VECTOR of duplicated value A. */
966
967rtx
968gcn_vec_constant (machine_mode mode, int a)
969{
970 /*if (!a)
971 return CONST0_RTX (mode);
972 if (a == -1)
973 return CONSTM1_RTX (mode);
974 if (a == 1)
975 return CONST1_RTX (mode);
976 if (a == 2)
977 return CONST2_RTX (mode);*/
978
979 int units = GET_MODE_NUNITS (mode);
980 rtx tem = gen_int_mode (a, GET_MODE_INNER (mode));
981 rtvec v = rtvec_alloc (units);
982
983 for (int i = 0; i < units; ++i)
984 RTVEC_ELT (v, i) = tem;
985
986 return gen_rtx_CONST_VECTOR (mode, v);
987}
988
989/* Create a CONST_VECTOR of duplicated value A. */
990
991rtx
992gcn_vec_constant (machine_mode mode, rtx a)
993{
994 int units = GET_MODE_NUNITS (mode);
995 rtvec v = rtvec_alloc (units);
996
997 for (int i = 0; i < units; ++i)
998 RTVEC_ELT (v, i) = a;
999
1000 return gen_rtx_CONST_VECTOR (mode, v);
1001}
1002
1003/* Create an undefined vector value, used where an insn operand is
1004 optional. */
1005
1006rtx
1007gcn_gen_undef (machine_mode mode)
1008{
1009 return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1010}
1011
1012/* }}} */
1013/* {{{ Addresses, pointers and moves. */
1014
1015/* Return true if REG is a valid place to store a pointer,
1016 for instructions that require an SGPR.
1017 FIXME rename. */
1018
1019static bool
1020gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1021{
1022 if (GET_CODE (reg) == SUBREG)
1023 reg = SUBREG_REG (reg);
1024
1025 if (!REG_P (reg))
1026 return false;
1027
1028 if (GET_MODE (reg) != mode)
1029 return false;
1030
1031 int regno = REGNO (reg);
1032
1033 if (regno >= FIRST_PSEUDO_REGISTER)
1034 {
1035 if (!strict)
1036 return true;
1037
1038 if (!reg_renumber)
1039 return false;
1040
1041 regno = reg_renumber[regno];
1042 }
1043
1044 return (SGPR_REGNO_P (regno) || regno == M0_REG
1045 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1046}
1047
1048/* Return true if REG is a valid place to store a pointer,
1049 for instructions that require a VGPR. */
1050
1051static bool
1052gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1053{
1054 if (GET_CODE (reg) == SUBREG)
1055 reg = SUBREG_REG (reg);
1056
1057 if (!REG_P (reg))
1058 return false;
1059
1060 if (GET_MODE (reg) != mode)
1061 return false;
1062
1063 int regno = REGNO (reg);
1064
1065 if (regno >= FIRST_PSEUDO_REGISTER)
1066 {
1067 if (!strict)
1068 return true;
1069
1070 if (!reg_renumber)
1071 return false;
1072
1073 regno = reg_renumber[regno];
1074 }
1075
1076 return VGPR_REGNO_P (regno);
1077}
1078
1079/* Return true if X would be valid inside a MEM using the Flat address
1080 space. */
1081
1082bool
1083gcn_flat_address_p (rtx x, machine_mode mode)
1084{
1085 bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1086 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1087
1088 if (vec_mode && gcn_address_register_p (x, DImode, false))
1089 return true;
1090
1091 if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1092 return true;
1093
1094 if (TARGET_GCN5_PLUS
1095 && GET_CODE (x) == PLUS
1096 && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1097 && CONST_INT_P (XEXP (x, 1)))
1098 return true;
1099
1100 return false;
1101}
1102
1103/* Return true if X would be valid inside a MEM using the Scalar Flat
1104 address space. */
1105
1106bool
1107gcn_scalar_flat_address_p (rtx x)
1108{
1109 if (gcn_address_register_p (x, DImode, false))
1110 return true;
1111
1112 if (GET_CODE (x) == PLUS
1113 && gcn_address_register_p (XEXP (x, 0), DImode, false)
1114 && CONST_INT_P (XEXP (x, 1)))
1115 return true;
1116
1117 return false;
1118}
1119
1120/* Return true if MEM X would be valid for the Scalar Flat address space. */
1121
1122bool
1123gcn_scalar_flat_mem_p (rtx x)
1124{
1125 if (!MEM_P (x))
1126 return false;
1127
1128 if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1129 return false;
1130
1131 return gcn_scalar_flat_address_p (XEXP (x, 0));
1132}
1133
1134/* Return true if X would be valid inside a MEM using the LDS or GDS
1135 address spaces. */
1136
1137bool
1138gcn_ds_address_p (rtx x)
1139{
1140 if (gcn_vec_address_register_p (x, SImode, false))
1141 return true;
1142
1143 if (GET_CODE (x) == PLUS
1144 && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1145 && CONST_INT_P (XEXP (x, 1)))
1146 return true;
1147
1148 return false;
1149}
1150
1151/* Return true if ADDR would be valid inside a MEM using the Global
1152 address space. */
1153
1154bool
1155gcn_global_address_p (rtx addr)
1156{
1157 if (gcn_address_register_p (addr, DImode, false)
1158 || gcn_vec_address_register_p (addr, DImode, false))
1159 return true;
1160
1161 if (GET_CODE (addr) == PLUS)
1162 {
1163 rtx base = XEXP (addr, 0);
1164 rtx offset = XEXP (addr, 1);
1165 bool immediate_p = (CONST_INT_P (offset)
1166 && INTVAL (offset) >= -(1 << 12)
1167 && INTVAL (offset) < (1 << 12));
1168
1169 if ((gcn_address_register_p (base, DImode, false)
1170 || gcn_vec_address_register_p (base, DImode, false))
1171 && immediate_p)
1172 /* SGPR + CONST or VGPR + CONST */
1173 return true;
1174
1175 if (gcn_address_register_p (base, DImode, false)
1176 && gcn_vgpr_register_operand (offset, SImode))
1177 /* SGPR + VGPR */
1178 return true;
1179
1180 if (GET_CODE (base) == PLUS
1181 && gcn_address_register_p (XEXP (base, 0), DImode, false)
1182 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1183 && immediate_p)
1184 /* (SGPR + VGPR) + CONST */
1185 return true;
1186 }
1187
1188 return false;
1189}
1190
1191/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1192
1193 Recognizes RTL expressions that are valid memory addresses for an
1194 instruction. The MODE argument is the machine mode for the MEM
1195 expression that wants to use this address.
1196
1197 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
1198 convert common non-canonical forms to canonical form so that they will
1199 be recognized. */
1200
1201static bool
1202gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1203 addr_space_t as)
1204{
1205 /* All vector instructions need to work on addresses in registers. */
1206 if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1207 return false;
1208
1209 if (AS_SCALAR_FLAT_P (as))
1210 {
1211 if (mode == QImode || mode == HImode)
1212 return 0;
1213
1214 switch (GET_CODE (x))
1215 {
1216 case REG:
1217 return gcn_address_register_p (x, DImode, strict);
1218 /* Addresses are in the form BASE+OFFSET, where OFFSET is either a
1219 20-bit unsigned immediate, an SGPR, or M0.
1220 Writes and atomics do not accept SGPR. */
1221 case PLUS:
1222 {
1223 rtx x0 = XEXP (x, 0);
1224 rtx x1 = XEXP (x, 1);
1225 if (!gcn_address_register_p (x0, DImode, strict))
1226 return false;
1227 /* FIXME: This is disabled because of the mode mismatch between
1228 SImode (for the address or m0 register) and the DImode PLUS.
1229 We'll need a zero_extend or similar.
1230
1231 if (gcn_m0_register_p (x1, SImode, strict)
1232 || gcn_address_register_p (x1, SImode, strict))
1233 return true;
1234 else*/
1235 if (GET_CODE (x1) == CONST_INT)
1236 {
1237 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1238 /* The low bits of the offset are ignored, even when
1239 they're meant to realign the pointer. */
1240 && !(INTVAL (x1) & 0x3))
1241 return true;
1242 }
1243 return false;
1244 }
1245
1246 default:
1247 break;
1248 }
1249 }
1250 else if (AS_SCRATCH_P (as))
1251 return gcn_address_register_p (x, SImode, strict);
1252 else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1253 {
1254 if (TARGET_GCN3 || GET_CODE (x) == REG)
1255 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1256 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1257 ? gcn_address_register_p (x, DImode, strict)
1258 : gcn_vec_address_register_p (x, DImode, strict));
1259 else
1260 {
1261 gcc_assert (TARGET_GCN5_PLUS);
1262
1263 if (GET_CODE (x) == PLUS)
1264 {
1265 rtx x1 = XEXP (x, 1);
1266
1267 if (VECTOR_MODE_P (mode)
1268 ? !gcn_address_register_p (x, DImode, strict)
1269 : !gcn_vec_address_register_p (x, DImode, strict))
1270 return false;
1271
1272 if (GET_CODE (x1) == CONST_INT)
1273 {
1274 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1275 /* The low bits of the offset are ignored, even when
1276 they're meant to realign the pointer. */
1277 && !(INTVAL (x1) & 0x3))
1278 return true;
1279 }
1280 }
1281 return false;
1282 }
1283 }
1284 else if (AS_GLOBAL_P (as))
1285 {
1286 gcc_assert (TARGET_GCN5_PLUS);
1287
1288 if (GET_CODE (x) == REG)
1289 return (gcn_address_register_p (x, DImode, strict)
1290 || (!VECTOR_MODE_P (mode)
1291 && gcn_vec_address_register_p (x, DImode, strict)));
1292 else if (GET_CODE (x) == PLUS)
1293 {
1294 rtx base = XEXP (x, 0);
1295 rtx offset = XEXP (x, 1);
1296
1297 bool immediate_p = (GET_CODE (offset) == CONST_INT
1298 /* Signed 13-bit immediate. */
1299 && INTVAL (offset) >= -(1 << 12)
1300 && INTVAL (offset) < (1 << 12)
1301 /* The low bits of the offset are ignored, even
1302 when they're meant to realign the pointer. */
1303 && !(INTVAL (offset) & 0x3));
1304
1305 if (!VECTOR_MODE_P (mode))
1306 {
1307 if ((gcn_address_register_p (base, DImode, strict)
1308 || gcn_vec_address_register_p (base, DImode, strict))
1309 && immediate_p)
1310 /* SGPR + CONST or VGPR + CONST */
1311 return true;
1312
1313 if (gcn_address_register_p (base, DImode, strict)
1314 && gcn_vgpr_register_operand (offset, SImode))
1315 /* SGPR + VGPR */
1316 return true;
1317
1318 if (GET_CODE (base) == PLUS
1319 && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1320 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1321 && immediate_p)
1322 /* (SGPR + VGPR) + CONST */
1323 return true;
1324 }
1325 else
1326 {
1327 if (gcn_address_register_p (base, DImode, strict)
1328 && immediate_p)
1329 /* SGPR + CONST */
1330 return true;
1331 }
1332 }
1333 else
1334 return false;
1335 }
1336 else if (AS_ANY_DS_P (as))
1337 switch (GET_CODE (x))
1338 {
1339 case REG:
1340 return (VECTOR_MODE_P (mode)
1341 ? gcn_address_register_p (x, SImode, strict)
1342 : gcn_vec_address_register_p (x, SImode, strict));
1343 /* Addresses are in the form BASE+OFFSET, where OFFSET is either a
1344 20-bit unsigned immediate, an SGPR, or M0.
1345 Writes and atomics do not accept SGPR. */
1346 case PLUS:
1347 {
1348 rtx x0 = XEXP (x, 0);
1349 rtx x1 = XEXP (x, 1);
1350 if (!gcn_vec_address_register_p (x0, DImode, strict))
1351 return false;
1352 if (GET_CODE (x1) == REG)
1353 {
1354 if (GET_CODE (x1) != REG
1355 || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1356 && !gcn_ssrc_register_operand (x1, DImode)))
1357 return false;
1358 }
1359 else if (GET_CODE (x1) == CONST_VECTOR
1360 && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1361 && single_cst_vector_p (x1))
1362 {
1363 x1 = CONST_VECTOR_ELT (x1, 0);
1364 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1365 return true;
1366 }
1367 return false;
1368 }
1369
1370 default:
1371 break;
1372 }
1373 else
1374 gcc_unreachable ();
1375 return false;
1376}
1377
1378/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1379
1380 Return the appropriate mode for a named address pointer. */
1381
1382static scalar_int_mode
1383gcn_addr_space_pointer_mode (addr_space_t addrspace)
1384{
1385 switch (addrspace)
1386 {
1387 case ADDR_SPACE_SCRATCH:
1388 case ADDR_SPACE_LDS:
1389 case ADDR_SPACE_GDS:
1390 return SImode;
1391 case ADDR_SPACE_DEFAULT:
1392 case ADDR_SPACE_FLAT:
1393 case ADDR_SPACE_FLAT_SCRATCH:
1394 case ADDR_SPACE_SCALAR_FLAT:
1395 return DImode;
1396 default:
1397 gcc_unreachable ();
1398 }
1399}
1400
1401/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1402
1403 Return the appropriate mode for a named address space address. */
1404
1405static scalar_int_mode
1406gcn_addr_space_address_mode (addr_space_t addrspace)
1407{
1408 return gcn_addr_space_pointer_mode (addrspace);
1409}
1410
1411/* Implement TARGET_ADDR_SPACE_SUBSET_P.
1412
1413 Determine if one named address space is a subset of another. */
1414
1415static bool
1416gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1417{
1418 if (subset == superset)
1419 return true;
1420 /* FIXME is this true? */
1421 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1422 return true;
1423 return false;
1424}
1425
1426/* Convert from one address space to another. */
1427
1428static rtx
1429gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1430{
1431 gcc_assert (POINTER_TYPE_P (from_type));
1432 gcc_assert (POINTER_TYPE_P (to_type));
1433
1434 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1435 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1436
1437 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1438 {
1439 rtx queue = gen_rtx_REG (DImode,
1440 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1441 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1442 gen_rtx_PLUS (DImode, queue,
1443 gen_int_mode (64, SImode)));
1444 rtx tmp = gen_reg_rtx (DImode);
1445
1446 emit_move_insn (gen_lowpart (SImode, tmp), op);
1447 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1448 group_seg_aperture_hi);
1449
1450 return tmp;
1451 }
1452 else if (as_from == as_to)
1453 return op;
1454 else
1455 gcc_unreachable ();
1456}
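/* Sketch of the LDS-to-flat case above (added commentary, not from the
   original source): the 32-bit LDS offset OP becomes the low half of a
   64-bit flat pointer whose high half is the group segment aperture, read
   from offset 64 of the HSA queue object; e.g. an LDS offset of 0x100 turns
   into (aperture_hi << 32) | 0x100.  */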
1457
1458
1459/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h.
1460
1461 Return true if REGNO is OK for memory addressing. */
1462
1463bool
1464gcn_regno_mode_code_ok_for_base_p (int regno,
1465 machine_mode, addr_space_t as, int, int)
1466{
1467 if (regno >= FIRST_PSEUDO_REGISTER)
1468 {
1469 if (reg_renumber)
1470 regno = reg_renumber[regno];
1471 else
1472 return true;
1473 }
1474 if (AS_FLAT_P (as))
1475 return (VGPR_REGNO_P (regno)
1476 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1477 else if (AS_SCALAR_FLAT_P (as))
1478 return (SGPR_REGNO_P (regno)
1479 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1480 else if (AS_GLOBAL_P (as))
1481 {
1482 return (SGPR_REGNO_P (regno)
1483 || VGPR_REGNO_P (regno)
1484 || regno == ARG_POINTER_REGNUM
1485 || regno == FRAME_POINTER_REGNUM);
1486 }
1487 else
1488 /* For now. */
1489 return false;
1490}
1491
1492/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1493
1494 Return a suitable register class for memory addressing. */
1495
1496reg_class
1497gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1498 int ic)
1499{
1500 switch (as)
1501 {
1502 case ADDR_SPACE_DEFAULT:
1503 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1504 case ADDR_SPACE_SCALAR_FLAT:
1505 case ADDR_SPACE_SCRATCH:
1506 return SGPR_REGS;
1507 break;
1508 case ADDR_SPACE_FLAT:
1509 case ADDR_SPACE_FLAT_SCRATCH:
1510 case ADDR_SPACE_LDS:
1511 case ADDR_SPACE_GDS:
1512 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1513 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1514 ? SGPR_REGS : VGPR_REGS);
1515 case ADDR_SPACE_GLOBAL:
1516 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1517 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1518 ? SGPR_REGS : ALL_GPR_REGS);
1519 }
1520 gcc_unreachable ();
1521}
1522
1523/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1524
1525 Return true if REGNO is OK for index of memory addressing. */
1526
1527bool
1528regno_ok_for_index_p (int regno)
1529{
1530 if (regno >= FIRST_PSEUDO_REGISTER)
1531 {
1532 if (reg_renumber)
1533 regno = reg_renumber[regno];
1534 else
1535 return true;
1536 }
1537 return regno == M0_REG || VGPR_REGNO_P (regno);
1538}
1539
1540/* Generate move which uses the exec flags. If EXEC is NULL, then it is
1541 assumed that all lanes normally relevant to the mode of the move are
1542 affected. If PREV is NULL, then a sensible default is supplied for
1543 the inactive lanes. */
1544
1545static rtx
1546gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1547{
1548 machine_mode mode = GET_MODE (op0);
1549
1550 if (vgpr_vector_mode_p (mode))
1551 {
1552 if (exec && exec != CONSTM1_RTX (DImode))
1553 {
1554 if (!prev)
1555 prev = op0;
1556 }
1557 else
1558 {
1559 if (!prev)
1560 prev = gcn_gen_undef (mode);
1561 exec = gcn_full_exec_reg ();
1562 }
1563
1564 rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1565
1566 return gen_rtx_PARALLEL (VOIDmode,
1567 gen_rtvec (2, set,
1568 gen_rtx_CLOBBER (VOIDmode,
1569 gen_rtx_SCRATCH (V64DImode))));
1570 }
1571
1572 return (gen_rtx_PARALLEL
1573 (VOIDmode,
1574 gen_rtvec (2, gen_rtx_SET (op0, op1),
1575 gen_rtx_USE (VOIDmode,
1576 exec ? exec : gcn_scalar_exec ()))));
1577}
1578
1579/* Generate masked move. */
1580
1581static rtx
1582gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1583{
1584 if (exec)
1585 return (gen_rtx_SET (op0,
1586 gen_rtx_VEC_MERGE (GET_MODE (op0),
1587 gen_rtx_VEC_DUPLICATE (GET_MODE
1588 (op0), op1),
1589 op2, exec)));
1590 else
1591 return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1592}
1593
1594/* Expand vector init of OP0 by VEC.
1595 Implements vec_init instruction pattern. */
1596
1597void
1598gcn_expand_vector_init (rtx op0, rtx vec)
1599{
1600 int64_t initialized_mask = 0;
1601 int64_t curr_mask = 1;
1602 machine_mode mode = GET_MODE (op0);
1603
1604 rtx val = XVECEXP (vec, 0, 0);
1605
1606 for (int i = 1; i < 64; i++)
1607 if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1608 curr_mask |= (int64_t) 1 << i;
1609
1610 if (gcn_constant_p (val))
1611 emit_move_insn (op0, gcn_vec_constant (mode, val));
1612 else
1613 {
1614 val = force_reg (GET_MODE_INNER (mode), val);
1615 emit_insn (gen_duplicate_load (op0, val));
1616 }
1617 initialized_mask |= curr_mask;
1618 for (int i = 1; i < 64; i++)
1619 if (!(initialized_mask & ((int64_t) 1 << i)))
1620 {
1621 curr_mask = (int64_t) 1 << i;
1622 rtx val = XVECEXP (vec, 0, i);
1623
1624 for (int j = i + 1; j < 64; j++)
1625 if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1626 curr_mask |= (int64_t) 1 << j;
1627 if (gcn_constant_p (val))
1628 emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1629 get_exec (curr_mask)));
1630 else
1631 {
1632 val = force_reg (GET_MODE_INNER (mode), val);
1633 emit_insn (gen_duplicate_load (op0, val, op0,
1634 get_exec (curr_mask)));
1635 }
1636 initialized_mask |= curr_mask;
1637 }
1638}
1639
1640/* Load vector constant where n-th lane contains BASE+n*VAL. */
1641
1642static rtx
1643strided_constant (machine_mode mode, int base, int val)
1644{
1645 rtx x = gen_reg_rtx (mode);
1646 emit_move_insn (x, gcn_vec_constant (mode, base));
1647 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1648 x, get_exec (0xffffffff00000000)));
1649 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1650 x, get_exec (0xffff0000ffff0000)));
1651 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1652 x, get_exec (0xff00ff00ff00ff00)));
1653 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1654 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1655 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1656 x, get_exec (0xcccccccccccccccc)));
1657 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1658 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1659 return x;
1660}
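/* Worked example (added commentary, not from the original source): each
   masked add above adds VAL << K to exactly those lanes whose lane index has
   bit K set (0xaaaa... selects bit 0, 0xcccc... bit 1, and so on up to
   0xffffffff00000000 for bit 5), so with BASE == 0 and VAL == 1 lane N ends
   up holding N, i.e. its own lane id.  */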
1661
1662/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1663
1664static rtx
1665gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1666 addr_space_t as)
1667{
1668 switch (as)
1669 {
1670 case ADDR_SPACE_DEFAULT:
1671 return gcn_addr_space_legitimize_address (x, old, mode,
1672 DEFAULT_ADDR_SPACE);
1673 case ADDR_SPACE_SCALAR_FLAT:
1674 case ADDR_SPACE_SCRATCH:
1675 /* Instructions working on vectors need the address to be in
1676 a register. */
1677 if (vgpr_vector_mode_p (mode))
1678 return force_reg (GET_MODE (x), x);
1679
1680 return x;
1681 case ADDR_SPACE_FLAT:
1682 case ADDR_SPACE_FLAT_SCRATCH:
1683 case ADDR_SPACE_GLOBAL:
1684 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1685 case ADDR_SPACE_LDS:
1686 case ADDR_SPACE_GDS:
1687 /* FIXME: LDS supports offsets, handle them! */
1688 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1689 {
1690 rtx addrs = gen_reg_rtx (V64SImode);
1691 rtx base = force_reg (SImode, x);
1692 rtx offsets = strided_constant (V64SImode, 0,
1693 GET_MODE_UNIT_SIZE (mode));
1694
1695 emit_insn (gen_vec_duplicatev64si (addrs, base));
1696 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1697 return addrs;
1698 }
1699 return x;
1700 }
1701 gcc_unreachable ();
1702}
1703
1704/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1705 proper vector of stepped addresses.
1706
1707 MEM will be a DImode address of a vector in an SGPR.
1708 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1709
1710rtx
1711gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1712 rtx tmp)
1713{
1714 gcc_assert (MEM_P (mem));
1715 rtx mem_base = XEXP (mem, 0);
1716 rtx mem_index = NULL_RTX;
1717
1718 if (!TARGET_GCN5_PLUS)
1719 {
1720 /* gcn_addr_space_legitimize_address should have put the address in a
1721 register. If not, it is too late to do anything about it. */
1722 gcc_assert (REG_P (mem_base));
1723 }
1724
1725 if (GET_CODE (mem_base) == PLUS)
1726 {
1727 mem_index = XEXP (mem_base, 1);
1728 mem_base = XEXP (mem_base, 0);
1729 }
1730
1731 /* RF and RM base registers for vector modes should always be an SGPR. */
1732 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1733 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1734
1735 machine_mode inner = GET_MODE_INNER (mode);
1736 int shift = exact_log2 (GET_MODE_SIZE (inner));
1737 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1738 rtx undef_v64si = gcn_gen_undef (V64SImode);
1739 rtx new_base = NULL_RTX;
1740 addr_space_t as = MEM_ADDR_SPACE (mem);
1741
1742 rtx tmplo = (REG_P (tmp)
1743 ? gcn_operand_part (V64DImode, tmp, 0)
1744 : gen_reg_rtx (V64SImode));
1745
1746 /* tmplo[:] = ramp[:] << shift */
1747 if (exec)
1748 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1749 gen_int_mode (shift, SImode),
1750 undef_v64si, exec));
1751 else
1752 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1753
1754 if (AS_FLAT_P (as))
1755 {
1756 if (REG_P (tmp))
1757 {
1758 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1759 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1760 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1761 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1762
1763 /* tmphi[:] = mem_base_hi */
1764 if (exec)
1765 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1766 undef_v64si, exec));
1767 else
1768 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1769
1770 /* tmp[:] += zext (mem_base) */
1771 if (exec)
1772 {
1773 rtx undef_di = gcn_gen_undef (DImode);
1774 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1775 vcc, undef_v64si, exec));
1776 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1777 vcc, vcc, undef_v64si, exec));
1778 }
1779 else
1780 emit_insn (gen_addv64di3_zext_dup (tmp, mem_base_lo, tmp));
1781 }
1782 else
1783 {
1784 tmp = gen_reg_rtx (V64DImode);
1785 if (exec)
1786 emit_insn (gen_addv64di3_zext_dup2_exec (tmp, tmplo, mem_base,
1787 gcn_gen_undef (V64DImode),
1788 exec));
1789 else
1790 emit_insn (gen_addv64di3_zext_dup2 (tmp, tmplo, mem_base));
1791 }
1792
1793 new_base = tmp;
1794 }
1795 else if (AS_ANY_DS_P (as))
1796 {
1797 if (!exec)
1798 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1799 else
1800 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1801 gcn_gen_undef (V64SImode), exec));
1802 new_base = tmplo;
1803 }
1804 else
1805 {
1806 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1807 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1808 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1809 }
1810
1811 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1812 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1813 (mem_index ? mem_index
1814 : const0_rtx)));
1815}
1816
1817/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1818 suitable for the given address space. This is intended for use in
1819 gather/scatter patterns.
1820
1821 The offsets may be signed or unsigned, according to UNSIGNED_P.
1822 If EXEC is set then _exec patterns will be used, otherwise plain.
1823
1824 Return values:
1825 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1826 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1827
1828rtx
1829gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1830 bool unsigned_p, rtx exec)
1831{
1832 /* Convert the offsets to V64SImode.
1833 TODO: more conversions will be needed when more types are vectorized. */
1834 if (GET_MODE (offsets) == V64DImode)
1835 {
1836 rtx tmp = gen_reg_rtx (V64SImode);
1837 emit_insn (gen_vec_truncatev64div64si (tmp, offsets));
1838 offsets = tmp;
1839 }
1840
1841 rtx tmpsi = gen_reg_rtx (V64SImode);
1842 rtx tmpdi = gen_reg_rtx (V64DImode);
1843 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1844 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1845
1846 if (CONST_INT_P (scale)
1847 && INTVAL (scale) > 0
1848 && exact_log2 (INTVAL (scale)) >= 0)
1849 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1850 GEN_INT (exact_log2 (INTVAL (scale)))));
1851 else
1852 (exec
1853 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1854 exec))
1855 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1856
1857 /* "Global" instructions do not support negative register offsets. */
1858 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1859 {
1860 if (unsigned_p)
1861 (exec
1862 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1863 undefdi, exec))
1864 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1865 else
1866 (exec
1867 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1868 undefdi, exec))
1869 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1870 return tmpdi;
1871 }
1872 else if (as == ADDR_SPACE_GLOBAL)
1873 return tmpsi;
1874
1875 gcc_unreachable ();
1876}
1877
1878/* Return true if move from OP0 to OP1 is known to be executed in vector
1879 unit. */
1880
1881bool
1882gcn_vgpr_move_p (rtx op0, rtx op1)
1883{
1884 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1885 return true;
1886 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1887 return true;
1888 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1889 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1890 || vgpr_vector_mode_p (GET_MODE (op0)));
1891}
1892
1893/* Return true if move from OP0 to OP1 is known to be executed in scalar
1894 unit. Used in the machine description. */
1895
1896bool
1897gcn_sgpr_move_p (rtx op0, rtx op1)
1898{
1899 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1900 return true;
1901 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1902 return true;
1903 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1904 || VGPR_REGNO_P (REGNO (op0)))
1905 return false;
1906 if (REG_P (op1)
1907 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1908 && !VGPR_REGNO_P (REGNO (op1)))
1909 return true;
1910 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1911}
1912
1913/* Implement TARGET_SECONDARY_RELOAD.
1914
1915 The address space determines which registers can be used for loads and
1916 stores. */
1917
1918static reg_class_t
1919gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
1920 machine_mode reload_mode, secondary_reload_info *sri)
1921{
1922 reg_class_t result = NO_REGS;
1923 bool spilled_pseudo =
1924 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
1925
1926 if (dump_file && (dump_flags & TDF_DETAILS))
1927 {
1928 fprintf (dump_file, "gcn_secondary_reload: ");
1929 dump_value_slim (dump_file, x, 1);
1930 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
1931 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
1932 if (REG_P (x) || GET_CODE (x) == SUBREG)
1933 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
1934 (true_regnum (x) >= 0
1935 && true_regnum (x) < FIRST_PSEUDO_REGISTER
1936 ? reg_names[true_regnum (x)]
1937 : (spilled_pseudo ? "stack spill" : "??")));
1938 fprintf (dump_file, "\n");
1939 }
1940
1941 /* Some callers don't use or initialize icode. */
1942 sri->icode = CODE_FOR_nothing;
1943
1944 if (MEM_P (x) || spilled_pseudo)
1945 {
1946 addr_space_t as = DEFAULT_ADDR_SPACE;
1947
1948 /* If we have a spilled pseudo, we can't find the address space
1949 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
1950 ADDR_SPACE_GLOBAL for GCN5. */
1951 if (MEM_P (x))
1952 as = MEM_ADDR_SPACE (x);
1953
1954 if (as == ADDR_SPACE_DEFAULT)
1955 as = DEFAULT_ADDR_SPACE;
1956
1957 switch (as)
1958 {
1959 case ADDR_SPACE_SCALAR_FLAT:
1960 result =
1961 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
1962 break;
1963 case ADDR_SPACE_FLAT:
1964 case ADDR_SPACE_FLAT_SCRATCH:
1965 case ADDR_SPACE_GLOBAL:
1966 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
1967 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
1968 {
1969 if (in_p)
1970 switch (reload_mode)
1971 {
1972 case E_V64SImode:
1973 sri->icode = CODE_FOR_reload_inv64si;
1974 break;
1975 case E_V64SFmode:
1976 sri->icode = CODE_FOR_reload_inv64sf;
1977 break;
1978 case E_V64HImode:
1979 sri->icode = CODE_FOR_reload_inv64hi;
1980 break;
1981 case E_V64HFmode:
1982 sri->icode = CODE_FOR_reload_inv64hf;
1983 break;
1984 case E_V64QImode:
1985 sri->icode = CODE_FOR_reload_inv64qi;
1986 break;
1987 case E_V64DImode:
1988 sri->icode = CODE_FOR_reload_inv64di;
1989 break;
1990 case E_V64DFmode:
1991 sri->icode = CODE_FOR_reload_inv64df;
1992 break;
1993 default:
1994 gcc_unreachable ();
1995 }
1996 else
1997 switch (reload_mode)
1998 {
1999 case E_V64SImode:
2000 sri->icode = CODE_FOR_reload_outv64si;
2001 break;
2002 case E_V64SFmode:
2003 sri->icode = CODE_FOR_reload_outv64sf;
2004 break;
2005 case E_V64HImode:
2006 sri->icode = CODE_FOR_reload_outv64hi;
2007 break;
2008 case E_V64HFmode:
2009 sri->icode = CODE_FOR_reload_outv64hf;
2010 break;
2011 case E_V64QImode:
2012 sri->icode = CODE_FOR_reload_outv64qi;
2013 break;
2014 case E_V64DImode:
2015 sri->icode = CODE_FOR_reload_outv64di;
2016 break;
2017 case E_V64DFmode:
2018 sri->icode = CODE_FOR_reload_outv64df;
2019 break;
2020 default:
2021 gcc_unreachable ();
2022 }
2023 break;
2024 }
2025 /* Fallthrough. */
2026 case ADDR_SPACE_LDS:
2027 case ADDR_SPACE_GDS:
2028 case ADDR_SPACE_SCRATCH:
2029 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2030 break;
2031 }
2032 }
2033
2034 if (dump_file && (dump_flags & TDF_DETAILS))
2035 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2036 get_insn_name (sri->icode));
2037
2038 return result;
2039}
2040
2041/* Update register usage after having seen the compiler flags and kernel
2042 attributes. We typically want to fix registers that contain values
2043 set by the HSA runtime. */
2044
2045static void
2046gcn_conditional_register_usage (void)
2047{
2048 int i;
2049
2050 /* FIXME: Do we need to reset fixed_regs? */
2051
2052/* Limit ourselves to 1/16 the register file for maximum-sized workgroups.
2053 There are enough SGPRs not to limit those.
2054 TODO: Adjust this more dynamically. */
2055 for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++)
2056 fixed_regs[i] = 1, call_used_regs[i] = 1;
2057
2058 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2059 {
2060 /* Normal functions can't know what kernel argument registers are
2061 live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs. */
2062 for (i = 0; i < 16; i++)
2063 fixed_regs[FIRST_SGPR_REG + i] = 1;
2064 for (i = 0; i < 3; i++)
2065 fixed_regs[FIRST_VGPR_REG + i] = 1;
2066 return;
2067 }
2068
 2069 /* Fix the runtime argument registers containing values that may be
2070 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2071 needed after the prologue so there's no need to fix them. */
2072 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2073 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2074 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2075 {
2076 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2077 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2078 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 2] = 1;
2079 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 3] = 1;
2080 }
2081 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2082 {
2083 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2084 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2085 }
2086 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2087 {
2088 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2089 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2090 }
2091 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2092 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2093 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2094 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2095 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2096 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2097 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2098 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2099
2100 if (TARGET_GCN5_PLUS)
 2101 /* v0 is always zero, for global null offsets. */
2102 fixed_regs[VGPR_REGNO (0)] = 1;
2103}
2104
2105/* Determine if a load or store is valid, according to the register classes
2106 and address space. Used primarily by the machine description to decide
2107 when to split a move into two steps. */
2108
2109bool
2110gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2111{
2112 if (!MEM_P (dest) && !MEM_P (src))
2113 return true;
2114
2115 if (MEM_P (dest)
2116 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2117 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2118 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2119 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2120 && gcn_vgpr_register_operand (src, mode))
2121 return true;
2122 else if (MEM_P (src)
2123 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2124 && (gcn_flat_address_p (XEXP (src, 0), mode)
2125 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2126 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2127 && gcn_vgpr_register_operand (dest, mode))
2128 return true;
2129
2130 if (MEM_P (dest)
2131 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2132 && (gcn_global_address_p (XEXP (dest, 0))
2133 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2134 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2135 && gcn_vgpr_register_operand (src, mode))
2136 return true;
2137 else if (MEM_P (src)
2138 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2139 && (gcn_global_address_p (XEXP (src, 0))
2140 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2141 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2142 && gcn_vgpr_register_operand (dest, mode))
2143 return true;
2144
2145 if (MEM_P (dest)
2146 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2147 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2148 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2149 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2150 && gcn_ssrc_register_operand (src, mode))
2151 return true;
2152 else if (MEM_P (src)
2153 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2154 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2155 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2156 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2157 && gcn_sdst_register_operand (dest, mode))
2158 return true;
2159
2160 if (MEM_P (dest)
2161 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2162 && gcn_ds_address_p (XEXP (dest, 0))
2163 && gcn_vgpr_register_operand (src, mode))
2164 return true;
2165 else if (MEM_P (src)
2166 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2167 && gcn_ds_address_p (XEXP (src, 0))
2168 && gcn_vgpr_register_operand (dest, mode))
2169 return true;
2170
2171 return false;
2172}
2173
2174/* }}} */
2175/* {{{ Functions and ABI. */
2176
2177/* Implement TARGET_FUNCTION_VALUE.
2178
2179 Define how to find the value returned by a function.
2180 The register location is always the same, but the mode depends on
2181 VALTYPE. */
2182
2183static rtx
2184gcn_function_value (const_tree valtype, const_tree, bool)
2185{
2186 machine_mode mode = TYPE_MODE (valtype);
2187
2188 if (INTEGRAL_TYPE_P (valtype)
2189 && GET_MODE_CLASS (mode) == MODE_INT
2190 && GET_MODE_SIZE (mode) < 4)
2191 mode = SImode;
2192
2193 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2194}
2195
2196/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2197
2198 Return true if N is a possible register number for the function return
2199 value. */
2200
2201static bool
2202gcn_function_value_regno_p (const unsigned int n)
2203{
2204 return n == RETURN_VALUE_REG;
2205}
2206
0ffef200
RS
2207/* Calculate the number of registers required to hold function argument
2208 ARG. */
5326695a
AS
2209
2210static int
0ffef200 2211num_arg_regs (const function_arg_info &arg)
5326695a 2212{
0ffef200 2213 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2214 return 0;
2215
0ffef200 2216 int size = arg.promoted_size_in_bytes ();
5326695a
AS
2217 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2218}
2219
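/* Worked example (editorial note, not from the original sources): assuming
   UNITS_PER_WORD is 4 (32-bit registers), an SImode argument needs
   (4 + 3) / 4 = 1 register, a DImode argument (8 + 3) / 4 = 2, and a
   TImode argument 4; anything that must_pass_in_stack reports as
   stack-only counts as 0 here.  */
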
2220/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2221
2222 Return true if the location where a function argument is passed
 2223 depends on whether or not it is a named argument.
2224
2225 For gcn, we know how to handle functions declared as stdarg: by
2226 passing an extra pointer to the unnamed arguments. However, the
2227 Fortran frontend can produce a different situation, where a
2228 function pointer is declared with no arguments, but the actual
2229 function and calls to it take more arguments. In that case, we
2230 want to ensure the call matches the definition of the function. */
2231
2232static bool
2233gcn_strict_argument_naming (cumulative_args_t cum_v)
2234{
2235 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2236
2237 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2238}
2239
2240/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2241
2242 See comment on gcn_strict_argument_naming. */
2243
2244static bool
2245gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2246{
2247 return !gcn_strict_argument_naming (cum_v);
2248}
2249
2250/* Implement TARGET_FUNCTION_ARG.
2251
2252 Return an RTX indicating whether a function argument is passed in a register
2253 and if so, which register. */
2254
2255static rtx
6783fdb7 2256gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2257{
2258 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2259 if (cum->normal_function)
2260 {
6783fdb7 2261 if (!arg.named || arg.end_marker_p ())
5326695a
AS
2262 return 0;
2263
0ffef200 2264 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2265 return 0;
2266
2267 int reg_num = FIRST_PARM_REG + cum->num;
0ffef200 2268 int num_regs = num_arg_regs (arg);
5326695a
AS
2269 if (num_regs > 0)
2270 while (reg_num % num_regs != 0)
2271 reg_num++;
2272 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
6783fdb7 2273 return gen_rtx_REG (arg.mode, reg_num);
5326695a
AS
2274 }
2275 else
2276 {
2277 if (cum->num >= cum->args.nargs)
2278 {
6783fdb7
RS
2279 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2280 & -(TYPE_ALIGN (arg.type) / 8);
5326695a
AS
2281 cfun->machine->kernarg_segment_alignment
2282 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
6783fdb7 2283 TYPE_ALIGN (arg.type) / 8);
5326695a
AS
2284 rtx addr = gen_rtx_REG (DImode,
2285 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2286 if (cum->offset)
2287 addr = gen_rtx_PLUS (DImode, addr,
2288 gen_int_mode (cum->offset, DImode));
6783fdb7
RS
2289 rtx mem = gen_rtx_MEM (arg.mode, addr);
2290 set_mem_attributes (mem, arg.type, 1);
5326695a
AS
2291 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2292 MEM_READONLY_P (mem) = 1;
2293 return mem;
2294 }
2295
2296 int a = cum->args.order[cum->num];
6783fdb7 2297 if (arg.mode != gcn_kernel_arg_types[a].mode)
5326695a
AS
2298 {
2299 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2300 return 0;
2301 }
2302 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2303 cum->args.reg[a]);
2304 }
2305 return 0;
2306}
2307
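/* Editorial sketch (not part of the original sources): for a normal
   function, the alignment loop above advances reg_num until it is a
   multiple of num_regs, so a two-register DImode argument always starts
   on an even-numbered hard register; if FIRST_PARM_REG + cum->num is odd,
   one register is simply skipped and left unused.  */
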
2308/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2309
2310 Updates the summarizer variable pointed to by CUM_V to advance past an
2311 argument in the argument list. */
2312
2313static void
6930c98c
RS
2314gcn_function_arg_advance (cumulative_args_t cum_v,
2315 const function_arg_info &arg)
5326695a
AS
2316{
2317 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2318
2319 if (cum->normal_function)
2320 {
6930c98c 2321 if (!arg.named)
5326695a
AS
2322 return;
2323
0ffef200 2324 int num_regs = num_arg_regs (arg);
5326695a
AS
2325 if (num_regs > 0)
2326 while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2327 cum->num++;
2328 cum->num += num_regs;
2329 }
2330 else
2331 {
2332 if (cum->num < cum->args.nargs)
2333 cum->num++;
2334 else
2335 {
6930c98c 2336 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
5326695a
AS
2337 cfun->machine->kernarg_segment_byte_size = cum->offset;
2338 }
2339 }
2340}
2341
2342/* Implement TARGET_ARG_PARTIAL_BYTES.
2343
2344 Returns the number of bytes at the beginning of an argument that must be put
2345 in registers. The value must be zero for arguments that are passed entirely
2346 in registers or that are entirely pushed on the stack. */
2347
2348static int
a7c81bc1 2349gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
5326695a
AS
2350{
2351 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2352
a7c81bc1 2353 if (!arg.named)
5326695a
AS
2354 return 0;
2355
0ffef200 2356 if (targetm.calls.must_pass_in_stack (arg))
5326695a
AS
2357 return 0;
2358
2359 if (cum->num >= NUM_PARM_REGS)
2360 return 0;
2361
2362 /* If the argument fits entirely in registers, return 0. */
0ffef200 2363 if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
5326695a
AS
2364 return 0;
2365
2366 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2367}
2368
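/* Worked example (illustrative only): if a single parameter register
   remains free and the next argument needs two, the hook returns
   (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD = 4, i.e. the first four
   bytes travel in the last register and the remainder goes on the
   stack.  */
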
2369/* A normal function which takes a pointer argument (to a scalar) may be
2370 passed a pointer to LDS space (via a high-bits-set aperture), and that only
2371 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
2372 function has an incoming pointer-to-scalar parameter. */
2373
2374static void
2375gcn_detect_incoming_pointer_arg (tree fndecl)
2376{
2377 gcc_assert (cfun && cfun->machine);
2378
2379 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2380 arg;
2381 arg = TREE_CHAIN (arg))
2382 if (POINTER_TYPE_P (TREE_VALUE (arg))
2383 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2384 cfun->machine->use_flat_addressing = true;
2385}
2386
2387/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2388
2389 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2390 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2391
2392void
2393gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2394 tree fntype /* tree ptr for function decl */ ,
2395 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2396 tree fndecl, int caller)
2397{
2398 memset (cum, 0, sizeof (*cum));
2399 cum->fntype = fntype;
2400 if (libname)
2401 {
2402 gcc_assert (cfun && cfun->machine);
2403 cum->normal_function = true;
2404 if (!caller)
2405 {
2406 cfun->machine->normal_function = true;
2407 gcn_detect_incoming_pointer_arg (fndecl);
2408 }
2409 return;
2410 }
2411 tree attr = NULL;
2412 if (fndecl)
2413 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2414 if (fndecl && !attr)
2415 attr = lookup_attribute ("amdgpu_hsa_kernel",
2416 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2417 if (!attr && fntype)
2418 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
 2419 /* Handle main () as a kernel, so we can run the testsuite.
2420 Handle OpenACC kernels similarly to main. */
2421 if (!attr && !caller && fndecl
2422 && (MAIN_NAME_P (DECL_NAME (fndecl))
2423 || lookup_attribute ("omp target entrypoint",
2424 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2425 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2426 else
2427 {
2428 if (!attr || caller)
2429 {
2430 gcc_assert (cfun && cfun->machine);
2431 cum->normal_function = true;
2432 if (!caller)
2433 cfun->machine->normal_function = true;
2434 }
2435 gcn_parse_amdgpu_hsa_kernel_attribute
2436 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2437 }
2438 cfun->machine->args = cum->args;
2439 if (!caller && cfun->machine->normal_function)
2440 gcn_detect_incoming_pointer_arg (fndecl);
3ed8f692
KCY
2441
2442 reinit_regs ();
5326695a
AS
2443}
2444
2445static bool
2446gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2447{
2448 machine_mode mode = TYPE_MODE (type);
2449 HOST_WIDE_INT size = int_size_in_bytes (type);
2450
2451 if (AGGREGATE_TYPE_P (type))
2452 return true;
2453
2454 if (mode == BLKmode)
2455 return true;
2456
2457 if (size > 2 * UNITS_PER_WORD)
2458 return true;
2459
2460 return false;
2461}
2462
2463/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2464
2465 Return the mode to use for outgoing function arguments. */
2466
2467machine_mode
2468gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2469 int *ARG_UNUSED (punsignedp),
2470 const_tree ARG_UNUSED (funtype),
2471 int ARG_UNUSED (for_return))
2472{
2473 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2474 return SImode;
2475
2476 return mode;
2477}
2478
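/* E.g. (editorial note): a char or short argument is widened to SImode
   here, matching the 32-bit scalar registers; SImode and wider modes,
   and all floating-point modes, are passed unchanged.  */
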
2479/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2480
2481 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2482 ARGS_GROW_DOWNWARDS. */
2483
2484static tree
2485gcn_gimplify_va_arg_expr (tree valist, tree type,
2486 gimple_seq *ARG_UNUSED (pre_p),
2487 gimple_seq *ARG_UNUSED (post_p))
2488{
2489 tree ptr = build_pointer_type (type);
2490 tree valist_type;
2491 tree t, u;
2492 bool indirect;
2493
fde65a89 2494 indirect = pass_va_arg_by_reference (type);
5326695a
AS
2495 if (indirect)
2496 {
2497 type = ptr;
2498 ptr = build_pointer_type (type);
2499 }
2500 valist_type = TREE_TYPE (valist);
2501
2502 /* Args grow down. Not handled by generic routines. */
2503
2504 u = fold_convert (sizetype, size_in_bytes (type));
2505 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2506 t = fold_build_pointer_plus (valist, u);
2507
2508 /* Align to 8 byte boundary. */
2509
2510 u = build_int_cst (TREE_TYPE (t), -8);
2511 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2512 t = fold_convert (valist_type, t);
2513
2514 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2515
2516 t = fold_convert (ptr, t);
2517 t = build_va_arg_indirect_ref (t);
2518
2519 if (indirect)
2520 t = build_va_arg_indirect_ref (t);
2521
2522 return t;
2523}
2524
955cd057
TB
2525/* Return 1 if TRAIT NAME is present in the OpenMP context's
2526 device trait set, return 0 if not present in any OpenMP context in the
2527 whole translation unit, or -1 if not present in the current OpenMP context
2528 but might be present in another OpenMP context in the same TU. */
2529
2530int
2531gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2532 const char *name)
2533{
2534 switch (trait)
2535 {
2536 case omp_device_kind:
2537 return strcmp (name, "gpu") == 0;
2538 case omp_device_arch:
2539 return strcmp (name, "gcn") == 0;
2540 case omp_device_isa:
2541 if (strcmp (name, "carrizo") == 0)
2542 return gcn_arch == PROCESSOR_CARRIZO;
2543 if (strcmp (name, "fiji") == 0)
2544 return gcn_arch == PROCESSOR_FIJI;
2545 if (strcmp (name, "gfx900") == 0)
2546 return gcn_arch == PROCESSOR_VEGA;
2547 if (strcmp (name, "gfx906") == 0)
2548 return gcn_arch == PROCESSOR_VEGA;
2549 return 0;
2550 default:
2551 gcc_unreachable ();
2552 }
2553}
2554
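/* Usage sketch (editorial; the variant name below is made up): this hook
   is what resolves an OpenMP context selector such as

     #pragma omp declare variant (foo_gfx900) \
       match (device = {kind("gpu"), arch("gcn"), isa("gfx900")})

   by comparing each string against the target; "gfx900" matches when
   gcn_arch is PROCESSOR_VEGA.  */
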
5326695a
AS
2555/* Calculate stack offsets needed to create prologues and epilogues. */
2556
2557static struct machine_function *
2558gcn_compute_frame_offsets (void)
2559{
2560 machine_function *offsets = cfun->machine;
2561
2562 if (reload_completed)
2563 return offsets;
2564
2565 offsets->need_frame_pointer = frame_pointer_needed;
2566
2567 offsets->outgoing_args_size = crtl->outgoing_args_size;
2568 offsets->pretend_size = crtl->args.pretend_args_size;
2569
2570 offsets->local_vars = get_frame_size ();
2571
2572 offsets->lr_needs_saving = (!leaf_function_p ()
2573 || df_regs_ever_live_p (LR_REGNUM)
2574 || df_regs_ever_live_p (LR_REGNUM + 1));
2575
2576 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2577
2578 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
a365fa06 2579 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2580 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2581 && frame_pointer_needed))
2582 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2583
2584 /* Round up to 64-bit boundary to maintain stack alignment. */
2585 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2586
2587 return offsets;
2588}
2589
2590/* Insert code into the prologue or epilogue to store or load any
2591 callee-save register to/from the stack.
2592
2593 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2594
2595static void
2596move_callee_saved_registers (rtx sp, machine_function *offsets,
2597 bool prologue)
2598{
2599 int regno, offset, saved_scalars;
2600 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2601 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2602 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2603 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2604 HOST_WIDE_INT exec_set = 0;
2605 int offreg_set = 0;
2606
2607 start_sequence ();
2608
2609 /* Move scalars into two vector registers. */
2610 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
a365fa06 2611 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2612 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2613 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2614 && offsets->need_frame_pointer))
2615 {
2616 rtx reg = gen_rtx_REG (SImode, regno);
2617 rtx vreg = gen_rtx_REG (V64SImode,
2618 VGPR_REGNO (6 + (saved_scalars / 64)));
2619 int lane = saved_scalars % 64;
2620
2621 if (prologue)
2622 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2623 else
2624 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2625
2626 saved_scalars++;
2627 }
2628
2629 rtx move_scalars = get_insns ();
2630 end_sequence ();
2631 start_sequence ();
2632
2633 /* Ensure that all vector lanes are moved. */
2634 exec_set = -1;
2635 emit_move_insn (exec, GEN_INT (exec_set));
2636
2637 /* Set up a vector stack pointer. */
2638 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2639 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2640 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2641 gcn_gen_undef (V64SImode), exec));
2642 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2643 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2644 exec));
2645 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2646 gcn_operand_part (V64SImode, vsp, 0),
2647 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2648 exec));
2649 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2650 gcn_operand_part (V64SImode, vsp, 1),
2651 const0_rtx, vcc, vcc,
2652 gcn_gen_undef (V64SImode), exec));
2653
2654 /* Move vectors. */
2655 for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
2656 regno < FIRST_PSEUDO_REGISTER; regno++)
a365fa06 2657 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
5326695a
AS
2658 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2659 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2660 {
2661 rtx reg = gen_rtx_REG (V64SImode, regno);
2662 int size = 256;
2663
2664 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2665 size = saved_scalars * 4;
2666 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2667 size = (saved_scalars - 64) * 4;
2668
2669 if (size != 256 || exec_set != -1)
2670 {
2671 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2672 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2673 }
2674
2675 if (prologue)
2676 emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg,
2677 as, const0_rtx, exec));
2678 else
2679 emit_insn (gen_gatherv64si_insn_1offset_exec
2680 (reg, vsp, const0_rtx, as, const0_rtx,
2681 gcn_gen_undef (V64SImode), exec));
2682
2683 /* Move our VSP to the next stack entry. */
2684 if (offreg_set != size)
2685 {
2686 offreg_set = size;
2687 emit_move_insn (offreg, GEN_INT (size));
2688 }
2689 if (exec_set != -1)
2690 {
2691 exec_set = -1;
2692 emit_move_insn (exec, GEN_INT (exec_set));
2693 }
2694 emit_insn (gen_addv64si3_vcc_dup_exec
2695 (gcn_operand_part (V64SImode, vsp, 0),
2696 offreg, gcn_operand_part (V64SImode, vsp, 0),
2697 vcc, gcn_gen_undef (V64SImode), exec));
2698 emit_insn (gen_addcv64si3_exec
2699 (gcn_operand_part (V64SImode, vsp, 1),
2700 gcn_operand_part (V64SImode, vsp, 1),
2701 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2702
2703 offset += size;
2704 }
2705
2706 rtx move_vectors = get_insns ();
2707 end_sequence ();
2708
2709 if (prologue)
2710 {
2711 emit_insn (move_scalars);
2712 emit_insn (move_vectors);
2713 }
2714 else
2715 {
2716 emit_insn (move_vectors);
2717 emit_insn (move_scalars);
2718 }
2719}
2720
2721/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2722
2723 For a non-kernel function, the stack layout looks like this (interim),
2724 growing *upwards*:
2725
2726 hi | + ...
2727 |__________________| <-- current SP
2728 | outgoing args |
2729 |__________________|
2730 | (alloca space) |
2731 |__________________|
2732 | local vars |
2733 |__________________| <-- FP/hard FP
2734 | callee-save regs |
2735 |__________________| <-- soft arg pointer
2736 | pretend args |
2737 |__________________| <-- incoming SP
2738 | incoming args |
2739 lo |..................|
2740
2741 This implies arguments (beyond the first N in registers) must grow
2742 downwards (as, apparently, PA has them do).
2743
2744 For a kernel function we have the simpler:
2745
2746 hi | + ...
2747 |__________________| <-- current SP
2748 | outgoing args |
2749 |__________________|
2750 | (alloca space) |
2751 |__________________|
2752 | local vars |
2753 lo |__________________| <-- FP/hard FP
2754
2755*/
2756
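/* Worked example (editorial, with made-up sizes): a normal function with
   8 bytes of pretend args, 16 bytes of callee saves, 32 bytes of local
   variables and 64 bytes of outgoing arguments advances the incoming SP
   by 8 + 16 + 32 + 64 = 120 bytes; if a frame pointer is needed it is
   set to SP - (32 + 64), i.e. the base of the local variables in the
   first diagram above.  */
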
2757void
2758gcn_expand_prologue ()
2759{
2760 machine_function *offsets = gcn_compute_frame_offsets ();
2761
2762 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2763 {
2764 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2765 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2766
2767 start_sequence ();
2768
2769 if (offsets->pretend_size > 0)
2770 {
2771 /* FIXME: Do the actual saving of register pretend args to the stack.
2772 Register order needs consideration. */
2773 }
2774
2775 /* Save callee-save regs. */
2776 move_callee_saved_registers (sp, offsets, true);
2777
2778 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2779 + offsets->callee_saves
2780 + offsets->local_vars + offsets->outgoing_args_size;
2781 if (sp_adjust > 0)
2782 emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode)));
2783
2784 if (offsets->need_frame_pointer)
2785 emit_insn (gen_adddi3_scc (fp, sp,
2786 gen_int_mode
2787 (-(offsets->local_vars +
2788 offsets->outgoing_args_size),
2789 DImode)));
2790
2791 rtx_insn *seq = get_insns ();
2792 end_sequence ();
2793
2794 /* FIXME: Prologue insns should have this flag set for debug output, etc.
2795 but it causes issues for now.
2796 for (insn = seq; insn; insn = NEXT_INSN (insn))
2797 if (INSN_P (insn))
2798 RTX_FRAME_RELATED_P (insn) = 1;*/
2799
2800 emit_insn (seq);
2801 }
2802 else
2803 {
2804 rtx wave_offset = gen_rtx_REG (SImode,
2805 cfun->machine->args.
2806 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
2807
5326695a
AS
2808 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
2809 {
2810 rtx fs_init_lo =
2811 gen_rtx_REG (SImode,
2812 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
2813 rtx fs_init_hi =
2814 gen_rtx_REG (SImode,
2815 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
2816 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
2817 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
2818
2819 /*rtx queue = gen_rtx_REG(DImode,
2820 cfun->machine->args.reg[QUEUE_PTR_ARG]);
2821 rtx aperture = gen_rtx_MEM (SImode,
2822 gen_rtx_PLUS (DImode, queue,
2823 gen_int_mode (68, SImode)));
2824 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
2825
2826 /* Set up flat_scratch. */
2827 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
2828 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
2829 gen_int_mode (8, SImode)));
2830 emit_move_insn (fs_reg_lo, fs_init_hi);
2831 }
2832
2833 /* Set up frame pointer and stack pointer. */
2834 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
2835 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
2836 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
2837 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
2838
2839 HOST_WIDE_INT sp_adjust = (offsets->local_vars
2840 + offsets->outgoing_args_size);
2841
2842 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
2843 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
2844 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
2845 gen_int_mode (0xffff, SImode)));
3258c2d6
AS
2846 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2847 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
2848 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
5326695a
AS
2849
2850 if (sp_adjust > 0)
2851 emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode)));
2852 else
2853 emit_move_insn (sp, fp);
2854
2855 /* Make sure the flat scratch reg doesn't get optimised away. */
2856 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
2857 }
2858
2859 /* Ensure that the scheduler doesn't do anything unexpected. */
2860 emit_insn (gen_blockage ());
2861
2862 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
2863 gen_int_mode (LDS_SIZE, SImode));
2864
2865 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
5326695a
AS
2866
2867 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
2868 {
2869 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
2870 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2871 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
2872 "gomp_gcn_enter_kernel"));
2873 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2874 }
2875}
2876
2877/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
2878
2879 See gcn_expand_prologue for stack details. */
2880
2881void
2882gcn_expand_epilogue (void)
2883{
2884 /* Ensure that the scheduler doesn't do anything unexpected. */
2885 emit_insn (gen_blockage ());
2886
2887 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2888 {
2889 machine_function *offsets = gcn_compute_frame_offsets ();
2890 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2891 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2892
2893 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
2894
2895 if (offsets->need_frame_pointer)
2896 {
2897 /* Restore old SP from the frame pointer. */
2898 if (sp_adjust > 0)
2899 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
2900 else
2901 emit_move_insn (sp, fp);
2902 }
2903 else
2904 {
2905 /* Restore old SP from current SP. */
2906 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
2907
2908 if (sp_adjust > 0)
2909 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
2910 }
2911
2912 move_callee_saved_registers (sp, offsets, false);
2913
2914 /* There's no explicit use of the link register on the return insn. Emit
2915 one here instead. */
2916 if (offsets->lr_needs_saving)
2917 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
2918
2919 /* Similar for frame pointer. */
2920 if (offsets->need_frame_pointer)
2921 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
2922 }
2923 else if (flag_openmp)
2924 {
2925 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
2926 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2927 emit_move_insn (fn_reg,
2928 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
2929 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2930 }
2931 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
2932 {
2933 /* Assume that an exit value compatible with gcn-run is expected.
2934 That is, the third input parameter is an int*.
2935
2936 We can't allocate any new registers, but the kernarg_reg is
2937 dead after this, so we'll use that. */
2938 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
2939 [KERNARG_SEGMENT_PTR_ARG]);
2940 rtx retptr_mem = gen_rtx_MEM (DImode,
2941 gen_rtx_PLUS (DImode, kernarg_reg,
2942 GEN_INT (16)));
2943 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
2944 emit_move_insn (kernarg_reg, retptr_mem);
2945
2946 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
2947 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
2948 emit_move_insn (retval_mem,
2949 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
2950 }
2951
2952 emit_jump_insn (gen_gcn_return ());
2953}
2954
2955/* Implement TARGET_CAN_ELIMINATE.
2956
2957 Return true if the compiler is allowed to try to replace register number
2958 FROM_REG with register number TO_REG.
2959
2960 FIXME: is the default "true" not enough? Should this be a negative set? */
2961
2962bool
2963gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
2964{
2965 return (to_reg == HARD_FRAME_POINTER_REGNUM
2966 || to_reg == STACK_POINTER_REGNUM);
2967}
2968
2969/* Implement INITIAL_ELIMINATION_OFFSET.
2970
2971 Returns the initial difference between the specified pair of registers, in
2972 terms of stack position. */
2973
2974HOST_WIDE_INT
2975gcn_initial_elimination_offset (int from, int to)
2976{
2977 machine_function *offsets = gcn_compute_frame_offsets ();
2978
2979 switch (from)
2980 {
2981 case ARG_POINTER_REGNUM:
2982 if (to == STACK_POINTER_REGNUM)
2983 return -(offsets->callee_saves + offsets->local_vars
2984 + offsets->outgoing_args_size);
2985 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
2986 return -offsets->callee_saves;
2987 else
2988 gcc_unreachable ();
2989 break;
2990
2991 case FRAME_POINTER_REGNUM:
2992 if (to == STACK_POINTER_REGNUM)
2993 return -(offsets->local_vars + offsets->outgoing_args_size);
2994 else if (to == HARD_FRAME_POINTER_REGNUM)
2995 return 0;
2996 else
2997 gcc_unreachable ();
2998 break;
2999
3000 default:
3001 gcc_unreachable ();
3002 }
3003}
3004
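/* For instance (editorial, made-up sizes): with 16 bytes of callee saves,
   32 bytes of locals and 64 bytes of outgoing arguments, eliminating the
   argument pointer in favour of the stack pointer offsets addresses by
   -(16 + 32 + 64) = -112 and in favour of the frame pointer by -16,
   while eliminating the frame pointer to the stack pointer offsets them
   by -(32 + 64) = -96.  */
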
3005/* Implement HARD_REGNO_RENAME_OK.
3006
3007 Return true if it is permissible to rename a hard register from
3008 FROM_REG to TO_REG. */
3009
3010bool
3011gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3012{
3013 if (from_reg == SCC_REG
3014 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3015 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3016 || to_reg == SCC_REG
3017 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3018 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3019 return false;
3020
3021 /* Allow the link register to be used if it was saved. */
3022 if ((to_reg & ~1) == LINK_REGNUM)
3023 return !cfun || cfun->machine->lr_needs_saving;
3024
3025 /* Allow the registers used for the static chain to be used if the chain is
3026 not in active use. */
3027 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3028 return !cfun
3029 || !(cfun->static_chain_decl
3030 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3031 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3032
3033 return true;
3034}
3035
3036/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3037
3038 Which mode is required for saving NREGS of a pseudo-register in
3039 call-clobbered hard register REGNO. */
3040
3041machine_mode
3042gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3043 machine_mode regmode)
3044{
737d6a1a 3045 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
5326695a
AS
3046
3047 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3048 result = (nregs == 1 ? SImode : DImode);
3049
3050 return result;
3051}
3052
3053/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3054
3055 Output assembler code for a block containing the constant parts
3056 of a trampoline, leaving space for the variable parts. */
3057
3058static void
3059gcn_asm_trampoline_template (FILE *f)
3060{
3061 /* The source operand of the move instructions must be a 32-bit
3062 constant following the opcode. */
3063 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3064 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3065 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3066 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3067 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3068}
3069
3070/* Implement TARGET_TRAMPOLINE_INIT.
3071
3072 Emit RTL insns to initialize the variable parts of a trampoline.
3073 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3074 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3075 to be passed to the target function. */
3076
3077static void
3078gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3079{
b7c28a47
AS
3080 if (TARGET_GCN5_PLUS)
3081 sorry ("nested function trampolines not supported on GCN5 due to"
3082 " non-executable stacks");
3083
5326695a
AS
3084 emit_block_move (m_tramp, assemble_trampoline_template (),
3085 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3086
3087 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3088 rtx chain_value_reg = copy_to_reg (chain_value);
3089 rtx fnaddr_reg = copy_to_reg (fnaddr);
3090
3091 for (int i = 0; i < 4; i++)
3092 {
3093 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3094 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3095 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3096 }
3097
3098 rtx tramp_addr = XEXP (m_tramp, 0);
3099 emit_insn (gen_clear_icache (tramp_addr,
3100 plus_constant (ptr_mode, tramp_addr,
3101 TRAMPOLINE_SIZE)));
3102}
3103
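/* Illustrative note (editorial): the template above is four 8-byte
   "s_mov_b32 reg, literal" instructions followed by s_setpc_b64, so the
   loop patches the 32-bit literal at byte offset i*8 + 4 of each move:
   i = 0..1 receive the low and high words of the static chain value and
   i = 2..3 the target address, which lands in the CC_SAVE_REG pair used
   by the final indirect branch.  */
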
3104/* }}} */
3105/* {{{ Miscellaneous. */
3106
3107/* Implement TARGET_CANNOT_COPY_INSN_P.
3108
3109 Return true if INSN must not be duplicated. */
3110
3111static bool
3112gcn_cannot_copy_insn_p (rtx_insn *insn)
3113{
3114 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3115 return true;
3116
3117 return false;
3118}
3119
3120/* Implement TARGET_DEBUG_UNWIND_INFO.
3121
3122 Defines the mechanism that will be used for describing frame unwind
3123 information to the debugger. */
3124
3125static enum unwind_info_type
3126gcn_debug_unwind_info ()
3127{
3128 /* No support for debug info, yet. */
3129 return UI_NONE;
3130}
3131
3132/* Determine if there is a suitable hardware conversion instruction.
3133 Used primarily by the machine description. */
3134
3135bool
3136gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3137{
3138 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3139 return false;
3140
3141 if (VECTOR_MODE_P (from))
3142 {
3143 from = GET_MODE_INNER (from);
3144 to = GET_MODE_INNER (to);
3145 }
3146
3147 switch (op)
3148 {
3149 case fix_trunc_cvt:
3150 case fixuns_trunc_cvt:
3151 if (GET_MODE_CLASS (from) != MODE_FLOAT
3152 || GET_MODE_CLASS (to) != MODE_INT)
3153 return false;
3154 break;
3155 case float_cvt:
3156 case floatuns_cvt:
3157 if (GET_MODE_CLASS (from) != MODE_INT
3158 || GET_MODE_CLASS (to) != MODE_FLOAT)
3159 return false;
3160 break;
3161 case extend_cvt:
3162 if (GET_MODE_CLASS (from) != MODE_FLOAT
3163 || GET_MODE_CLASS (to) != MODE_FLOAT
3164 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3165 return false;
3166 break;
3167 case trunc_cvt:
3168 if (GET_MODE_CLASS (from) != MODE_FLOAT
3169 || GET_MODE_CLASS (to) != MODE_FLOAT
3170 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3171 return false;
3172 break;
3173 }
3174
3175 return ((to == HImode && from == HFmode)
3176 || (to == SImode && (from == SFmode || from == DFmode))
3177 || (to == HFmode && (from == HImode || from == SFmode))
3178 || (to == SFmode && (from == SImode || from == HFmode
3179 || from == DFmode))
3180 || (to == DFmode && (from == SImode || from == SFmode)));
3181}
3182
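/* For example (editorial note): gcn_valid_cvt_p (SFmode, SImode,
   fix_trunc_cvt) and gcn_valid_cvt_p (V64SFmode, V64SImode,
   fix_trunc_cvt) both return true, whereas mixing a scalar and a vector
   mode is rejected immediately by the first test above.  */
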
76d46331
KCY
3183/* Implement TARGET_EMUTLS_VAR_INIT.
3184
 3185 Disable emutls (gthr-gcn.h does not support it yet). */
3186
3187tree
3188gcn_emutls_var_init (tree, tree decl, tree)
3189{
3190 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3191}
3192
5326695a
AS
3193/* }}} */
3194/* {{{ Costs. */
3195
3196/* Implement TARGET_RTX_COSTS.
3197
3198 Compute a (partial) cost for rtx X. Return true if the complete
3199 cost has been computed, and false if subexpressions should be
3200 scanned. In either case, *TOTAL contains the cost result. */
3201
3202static bool
3203gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3204{
3205 enum rtx_code code = GET_CODE (x);
3206 switch (code)
3207 {
3208 case CONST:
3209 case CONST_DOUBLE:
3210 case CONST_VECTOR:
3211 case CONST_INT:
3212 if (gcn_inline_constant_p (x))
3213 *total = 0;
3214 else if (code == CONST_INT
3215 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3216 *total = 1;
3217 else if (gcn_constant_p (x))
3218 *total = 2;
3219 else
3220 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3221 return true;
3222
3223 case DIV:
3224 *total = 100;
3225 return false;
3226
3227 default:
3228 *total = 3;
3229 return false;
3230 }
3231}
3232
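/* Worked example (editorial; assumes gcn_inline_constant_p accepts the
   hardware inline range of -16..64): a CONST_INT of 5 costs 0, a
   CONST_INT of 1000 fits the signed 16-bit literal window and costs 1,
   and larger constants cost 2 if gcn_constant_p accepts them, otherwise
   4, or 64 for vector modes.  */
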
3233/* Implement TARGET_MEMORY_MOVE_COST.
3234
3235 Return the cost of moving data of mode M between a
3236 register and memory. A value of 2 is the default; this cost is
3237 relative to those in `REGISTER_MOVE_COST'.
3238
 3239 This function is used extensively by register_move_cost, which is used to
 3240 build tables at startup. Make it inline in this case.
 3241 When IN is 2, return the maximum of the in and out move costs.
3242
3243 If moving between registers and memory is more expensive than
3244 between two registers, you should define this macro to express the
3245 relative cost.
3246
 3247 Also model the increased cost of moving QImode registers in non-Q_REGS
 3248 classes. */
3249
3250#define LOAD_COST 32
3251#define STORE_COST 32
3252static int
3253gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3254{
3255 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3256 switch (regclass)
3257 {
3258 case SCC_CONDITIONAL_REG:
3259 case VCCZ_CONDITIONAL_REG:
3260 case VCC_CONDITIONAL_REG:
3261 case EXECZ_CONDITIONAL_REG:
3262 case ALL_CONDITIONAL_REGS:
3263 case SGPR_REGS:
3264 case SGPR_EXEC_REGS:
3265 case EXEC_MASK_REG:
3266 case SGPR_VOP_SRC_REGS:
3267 case SGPR_MEM_SRC_REGS:
3268 case SGPR_SRC_REGS:
3269 case SGPR_DST_REGS:
3270 case GENERAL_REGS:
3271 case AFP_REGS:
3272 if (!in)
3273 return (STORE_COST + 2) * nregs;
3274 return LOAD_COST * nregs;
3275 case VGPR_REGS:
3276 if (in)
3277 return (LOAD_COST + 2) * nregs;
3278 return STORE_COST * nregs;
3279 case ALL_REGS:
3280 case ALL_GPR_REGS:
3281 case SRCDST_REGS:
3282 if (in)
3283 return (LOAD_COST + 2) * nregs;
3284 return (STORE_COST + 2) * nregs;
3285 default:
3286 gcc_unreachable ();
3287 }
3288}
3289
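/* Worked example (editorial note): a V64SImode value is 256 bytes, so
   nregs = 64; loading it from memory into VGPR_REGS costs
   (LOAD_COST + 2) * 64 = 2176 and storing it costs STORE_COST * 64 =
   2048, both far above the baseline register-register cost of 2,
   reflecting how expensive vector spills are.  */
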
3290/* Implement TARGET_REGISTER_MOVE_COST.
3291
3292 Return the cost of moving data from a register in class CLASS1 to
3293 one in class CLASS2. Base value is 2. */
3294
3295static int
3296gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3297{
 3298 /* Increase the cost of moving to and from vector registers. While this is
 3299 fast in hardware (I think), it has the hidden cost of setting up the exec
 3300 flags. */
3301 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3302 return 4;
3303 return 2;
3304}
3305
3306/* }}} */
3307/* {{{ Builtins. */
3308
3309/* Type codes used by GCN built-in definitions. */
3310
3311enum gcn_builtin_type_index
3312{
3313 GCN_BTI_END_OF_PARAMS,
3314
3315 GCN_BTI_VOID,
3316 GCN_BTI_BOOL,
3317 GCN_BTI_INT,
3318 GCN_BTI_UINT,
3319 GCN_BTI_SIZE_T,
3320 GCN_BTI_LLINT,
3321 GCN_BTI_LLUINT,
3322 GCN_BTI_EXEC,
3323
3324 GCN_BTI_SF,
3325 GCN_BTI_V64SI,
3326 GCN_BTI_V64SF,
3327 GCN_BTI_V64PTR,
3328 GCN_BTI_SIPTR,
3329 GCN_BTI_SFPTR,
3330 GCN_BTI_VOIDPTR,
3331
3332 GCN_BTI_LDS_VOIDPTR,
3333
3334 GCN_BTI_MAX
3335};
3336
3337static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3338
3339#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3340#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3341#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3342#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3343#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3344#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3345#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3346#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3347#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3348
3349static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3350 struct gcn_builtin_description *);
3351static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3352 struct gcn_builtin_description *);
3353
3354struct gcn_builtin_description;
3355typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3356 struct gcn_builtin_description *);
3357
3358enum gcn_builtin_type
3359{
 3360 B_UNIMPLEMENTED, /* Bail out with a "sorry, not implemented" */
3361 B_INSN, /* Emit a pattern */
3362 B_OVERLOAD /* Placeholder for an overloaded function */
3363};
3364
3365struct gcn_builtin_description
3366{
3367 int fcode;
3368 int icode;
3369 const char *name;
3370 enum gcn_builtin_type type;
3371 /* The first element of parm is always the return type. The rest
3372 are a zero terminated list of parameters. */
3373 int parm[6];
3374 gcn_builtin_expander expander;
3375};
3376
3377/* Read in the GCN builtins from gcn-builtins.def. */
3378
3379extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3380
3381struct gcn_builtin_description gcn_builtins[] = {
3382#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3383 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3384
3385#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3386 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3387 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3388 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3389 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3390 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3391 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3392 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3393 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3394
3395#include "gcn-builtins.def"
3396#undef DEF_BUILTIN_BINOP_INT_FP
3397#undef DEF_BUILTIN
3398};
3399
3400static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3401
3402/* Implement TARGET_BUILTIN_DECL.
3403
3404 Return the GCN builtin for CODE. */
3405
3406tree
3407gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3408{
3409 if (code >= GCN_BUILTIN_MAX)
3410 return error_mark_node;
3411
3412 return gcn_builtin_decls[code];
3413}
3414
3415/* Helper function for gcn_init_builtins. */
3416
3417static void
3418gcn_init_builtin_types (void)
3419{
3420 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3421 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3422 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3423 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3424 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3425 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3426 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3427
3428 exec_type_node = unsigned_intDI_type_node;
3429 sf_type_node = float32_type_node;
3430 v64si_type_node = build_vector_type (intSI_type_node, 64);
3431 v64sf_type_node = build_vector_type (float_type_node, 64);
3432 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3433 /*build_pointer_type
3434 (integer_type_node) */
3435 , 64);
3436 tree tmp = build_distinct_type_copy (intSI_type_node);
3437 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3438 siptr_type_node = build_pointer_type (tmp);
3439
3440 tmp = build_distinct_type_copy (float_type_node);
3441 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3442 sfptr_type_node = build_pointer_type (tmp);
3443
3444 tmp = build_distinct_type_copy (void_type_node);
3445 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3446 voidptr_type_node = build_pointer_type (tmp);
3447
3448 tmp = build_distinct_type_copy (void_type_node);
3449 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3450 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3451}
3452
3453/* Implement TARGET_INIT_BUILTINS.
3454
3455 Set up all builtin functions for this target. */
3456
3457static void
3458gcn_init_builtins (void)
3459{
3460 gcn_init_builtin_types ();
3461
3462 struct gcn_builtin_description *d;
3463 unsigned int i;
3464 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3465 {
3466 tree p;
 3467 char name[64]; /* add_builtin_function will make a copy. */
3468 int parm;
3469
3470 /* FIXME: Is this necessary/useful? */
3471 if (d->name == 0)
3472 continue;
3473
3474 /* Find last parm. */
3475 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3476 ;
3477
3478 p = void_list_node;
3479 while (parm > 1)
3480 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3481
3482 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3483
3484 sprintf (name, "__builtin_gcn_%s", d->name);
3485 gcn_builtin_decls[i]
3486 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3487
3488 /* These builtins don't throw. */
3489 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3490 }
3491
3492/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
3493#ifdef BUILT_IN_GOACC_SINGLE_START
3494 /* These builtins need to take/return an LDS pointer: override the generic
3495 versions here. */
3496
3497 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3498 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3499
3500 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3501 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3502 false);
3503
3504 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3505 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3506 false);
3507
3508 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3509 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3510#endif
3511}
3512
3513/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3514 not require taking the address of any object, other than the memory
3515 cell being operated on.
3516
3517 Helper function for gcn_expand_builtin_1. */
3518
3519static rtx
3520gcn_expand_cmp_swap (tree exp, rtx target)
3521{
3522 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3523 addr_space_t as
3524 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3525 machine_mode as_mode = gcn_addr_space_address_mode (as);
3526
3527 if (!target)
3528 target = gen_reg_rtx (mode);
3529
3530 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3531 NULL_RTX, as_mode, EXPAND_NORMAL);
3532 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3533 NULL_RTX, mode, EXPAND_NORMAL);
3534 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3535 NULL_RTX, mode, EXPAND_NORMAL);
3536 rtx pat;
3537
3538 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3539 set_mem_addr_space (mem, as);
3540
3541 if (!REG_P (cmp))
3542 cmp = copy_to_mode_reg (mode, cmp);
3543 if (!REG_P (src))
3544 src = copy_to_mode_reg (mode, src);
3545
3546 if (mode == SImode)
3547 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3548 else
3549 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3550
3551 emit_insn (pat);
3552
3553 return target;
3554}
3555
3556/* Expand many different builtins.
3557
3558 Intended for use in gcn-builtins.def. */
3559
3560static rtx
3561gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3562 machine_mode /*mode */ , int ignore,
3563 struct gcn_builtin_description *)
3564{
3565 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4d732405 3566 switch (DECL_MD_FUNCTION_CODE (fndecl))
5326695a
AS
3567 {
3568 case GCN_BUILTIN_FLAT_LOAD_INT32:
3569 {
3570 if (ignore)
3571 return target;
3572 /*rtx exec = */
3573 force_reg (DImode,
3574 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3575 EXPAND_NORMAL));
3576 /*rtx ptr = */
3577 force_reg (V64DImode,
3578 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3579 EXPAND_NORMAL));
3580 /*emit_insn (gen_vector_flat_loadv64si
3581 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3582 return target;
3583 }
3584 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3585 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3586 {
3587 if (ignore)
3588 return target;
3589 rtx exec = force_reg (DImode,
3590 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3591 DImode,
3592 EXPAND_NORMAL));
3593 rtx ptr = force_reg (DImode,
3594 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3595 V64DImode,
3596 EXPAND_NORMAL));
3597 rtx offsets = force_reg (V64SImode,
3598 expand_expr (CALL_EXPR_ARG (exp, 2),
3599 NULL_RTX, V64DImode,
3600 EXPAND_NORMAL));
3601 rtx addrs = gen_reg_rtx (V64DImode);
3602 rtx tmp = gen_reg_rtx (V64SImode);
3603 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3604 GEN_INT (2),
3605 gcn_gen_undef (V64SImode), exec));
3606 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3607 gcn_gen_undef (V64DImode),
3608 exec));
3609 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3610 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3611 /* FIXME: set attributes. */
3612 emit_insn (gen_mov_with_exec (target, mem, exec));
3613 return target;
3614 }
3615 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3616 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3617 {
3618 rtx exec = force_reg (DImode,
3619 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3620 DImode,
3621 EXPAND_NORMAL));
3622 rtx ptr = force_reg (DImode,
3623 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3624 V64DImode,
3625 EXPAND_NORMAL));
3626 rtx offsets = force_reg (V64SImode,
3627 expand_expr (CALL_EXPR_ARG (exp, 2),
3628 NULL_RTX, V64DImode,
3629 EXPAND_NORMAL));
3630 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3631 3)));
3632 rtx val = force_reg (vmode,
3633 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3634 vmode,
3635 EXPAND_NORMAL));
3636 rtx addrs = gen_reg_rtx (V64DImode);
3637 rtx tmp = gen_reg_rtx (V64SImode);
3638 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3639 GEN_INT (2),
3640 gcn_gen_undef (V64SImode), exec));
3641 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3642 gcn_gen_undef (V64DImode),
3643 exec));
3644 rtx mem = gen_rtx_MEM (vmode, addrs);
3645 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3646 /* FIXME: set attributes. */
3647 emit_insn (gen_mov_with_exec (mem, val, exec));
3648 return target;
3649 }
3650 case GCN_BUILTIN_SQRTVF:
3651 {
3652 if (ignore)
3653 return target;
3654 rtx exec = gcn_full_exec_reg ();
3655 rtx arg = force_reg (V64SFmode,
3656 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3657 V64SFmode,
3658 EXPAND_NORMAL));
3659 emit_insn (gen_sqrtv64sf2_exec
3660 (target, arg, gcn_gen_undef (V64SFmode), exec));
3661 return target;
3662 }
3663 case GCN_BUILTIN_SQRTF:
3664 {
3665 if (ignore)
3666 return target;
3667 rtx arg = force_reg (SFmode,
3668 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3669 SFmode,
3670 EXPAND_NORMAL));
3671 emit_insn (gen_sqrtsf2 (target, arg));
3672 return target;
3673 }
3674 case GCN_BUILTIN_OMP_DIM_SIZE:
3675 {
3676 if (ignore)
3677 return target;
3678 emit_insn (gen_oacc_dim_size (target,
3679 expand_expr (CALL_EXPR_ARG (exp, 0),
3680 NULL_RTX, SImode,
3681 EXPAND_NORMAL)));
3682 return target;
3683 }
3684 case GCN_BUILTIN_OMP_DIM_POS:
3685 {
3686 if (ignore)
3687 return target;
3688 emit_insn (gen_oacc_dim_pos (target,
3689 expand_expr (CALL_EXPR_ARG (exp, 0),
3690 NULL_RTX, SImode,
3691 EXPAND_NORMAL)));
3692 return target;
3693 }
3694 case GCN_BUILTIN_CMP_SWAP:
3695 case GCN_BUILTIN_CMP_SWAPLL:
3696 return gcn_expand_cmp_swap (exp, target);
3697
3698 case GCN_BUILTIN_ACC_SINGLE_START:
3699 {
3700 if (ignore)
3701 return target;
3702
3703 rtx wavefront = gcn_oacc_dim_pos (1);
3704 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3705 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3706 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3707 return cc;
3708 }
3709
3710 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3711 {
3712 rtx blk = force_reg (SImode,
3713 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3714 SImode, EXPAND_NORMAL));
3715 rtx wavefront = gcn_oacc_dim_pos (1);
3716 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3717 rtx not_zero = gen_label_rtx ();
3718 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3719 emit_move_insn (blk, const0_rtx);
3720 emit_label (not_zero);
3721 return blk;
3722 }
3723
3724 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3725 return target;
3726
3727 case GCN_BUILTIN_ACC_BARRIER:
3728 emit_insn (gen_gcn_wavefront_barrier ());
3729 return target;
3730
3731 default:
3732 gcc_unreachable ();
3733 }
3734}
3735
3736/* Expansion of simple arithmetic and bit binary operation builtins.
3737
3738 Intended for use with gcn_builtins table. */
3739
3740static rtx
3741gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
3742 machine_mode /*mode */ , int ignore,
3743 struct gcn_builtin_description *d)
3744{
3745 int icode = d->icode;
3746 if (ignore)
3747 return target;
3748
3749 rtx exec = force_reg (DImode,
3750 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3751 EXPAND_NORMAL));
3752
3753 machine_mode m1 = insn_data[icode].operand[1].mode;
3754 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
3755 EXPAND_NORMAL);
3756 if (!insn_data[icode].operand[1].predicate (arg1, m1))
3757 arg1 = force_reg (m1, arg1);
3758
3759 machine_mode m2 = insn_data[icode].operand[2].mode;
3760 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
3761 EXPAND_NORMAL);
3762 if (!insn_data[icode].operand[2].predicate (arg2, m2))
3763 arg2 = force_reg (m2, arg2);
3764
3765 rtx arg_prev;
3766 if (call_expr_nargs (exp) == 4)
3767 {
3768 machine_mode m_prev = insn_data[icode].operand[4].mode;
3769 arg_prev = force_reg (m_prev,
3770 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3771 m_prev, EXPAND_NORMAL));
3772 }
3773 else
3774 arg_prev = gcn_gen_undef (GET_MODE (target));
3775
3776 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
3777 emit_insn (pat);
3778 return target;
3779}
3780
3781/* Implement TARGET_EXPAND_BUILTIN.
3782
3783 Expand an expression EXP that calls a built-in function, with result going
3784 to TARGET if that's convenient (and in mode MODE if that's convenient).
3785 SUBTARGET may be used as the target for computing one of EXP's operands.
3786 IGNORE is nonzero if the value is to be ignored. */
3787
3788rtx
3789gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
3790 int ignore)
3791{
3792 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3793 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
3794 struct gcn_builtin_description *d;
3795
3796 gcc_assert (fcode < GCN_BUILTIN_MAX);
3797 d = &gcn_builtins[fcode];
3798
3799 if (d->type == B_UNIMPLEMENTED)
3800 sorry ("Builtin not implemented");
3801
3802 return d->expander (exp, target, subtarget, mode, ignore, d);
3803}
3804
3805/* }}} */
3806/* {{{ Vectorization. */
3807
3808/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
3809
3810 A vector mask is a value that holds one boolean result for every element in
3811 a vector. */
3812
3813opt_machine_mode
3814 gcn_vectorize_get_mask_mode (machine_mode)
3815{
3816 /* GCN uses a DImode bit-mask. */
3817 return DImode;
3818}
3819
3820/* Return an RTX that references a vector with the i-th lane containing
3821 PERM[i]*4.
3822
3823 Helper function for gcn_vectorize_vec_perm_const. */
3824
3825static rtx
3826gcn_make_vec_perm_address (unsigned int *perm)
3827{
3828 rtx x = gen_reg_rtx (V64SImode);
3829 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
3830
3831 /* Permutation addresses use byte addressing. With each vector lane being
3832 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
3833 so only set those.
3834
3835 The permutation indices given to the vec_perm* patterns range from 0 to 2N-1 to
3836 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
3837 only take one source vector, the most-significant bit can be ignored
3838 here. Instead, we can use EXEC masking to select the relevant part of
3839 each source vector after they are permuted separately. */
3840 uint64_t bit_mask = 1 << 2;
3841 for (int i = 2; i < 8; i++, bit_mask <<= 1)
3842 {
3843 uint64_t exec_mask = 0;
3844 uint64_t lane_mask = 1;
3845 for (int j = 0; j < 64; j++, lane_mask <<= 1)
3846 if ((perm[j] * 4) & bit_mask)
3847 exec_mask |= lane_mask;
3848
3849 if (exec_mask)
3850 emit_insn (gen_addv64si3_exec (x, x,
3851 gcn_vec_constant (V64SImode,
3852 bit_mask),
3853 x, get_exec (exec_mask)));
3854 }
3855
3856 return x;
3857}
3858
3859/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
3860
3861 Return true if permutation with SEL is possible.
3862
3863 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
3864 permutations. */
3865
3866static bool
3867gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
3868 rtx src0, rtx src1,
3869 const vec_perm_indices & sel)
3870{
3871 unsigned int nelt = GET_MODE_NUNITS (vmode);
3872
3873 gcc_assert (VECTOR_MODE_P (vmode));
3874 gcc_assert (nelt <= 64);
3875 gcc_assert (sel.length () == nelt);
3876
3877 if (!dst)
3878 {
3879 /* All vector permutations are possible on this architecture,
3880 with varying degrees of efficiency depending on the permutation. */
3881 return true;
3882 }
3883
3884 unsigned int perm[64];
3885 for (unsigned int i = 0; i < nelt; ++i)
3886 perm[i] = sel[i] & (2 * nelt - 1);
3887
3888 /* Make life a bit easier by swapping operands if necessary so that
3889 the first element always comes from src0. */
3890 if (perm[0] >= nelt)
3891 {
3892 rtx temp = src0;
3893 src0 = src1;
3894 src1 = temp;
3895
3896 for (unsigned int i = 0; i < nelt; ++i)
3897 if (perm[i] < nelt)
3898 perm[i] += nelt;
3899 else
3900 perm[i] -= nelt;
3901 }
3902
3903 /* TODO: There are more efficient ways to implement certain permutations
3904 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
3905 this more inefficient generic approach is used. */
3906
3907 int64_t src1_lanes = 0;
3908 int64_t lane_bit = 1;
3909
3910 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
3911 {
3912 /* Set the bits for lanes from src1. */
3913 if (perm[i] >= nelt)
3914 src1_lanes |= lane_bit;
3915 }
3916
3917 rtx addr = gcn_make_vec_perm_address (perm);
3918 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
3919
3920 switch (vmode)
3921 {
3922 case E_V64QImode:
3923 ds_bpermute = gen_ds_bpermutev64qi;
3924 break;
3925 case E_V64HImode:
3926 ds_bpermute = gen_ds_bpermutev64hi;
3927 break;
3928 case E_V64SImode:
3929 ds_bpermute = gen_ds_bpermutev64si;
3930 break;
3931 case E_V64HFmode:
3932 ds_bpermute = gen_ds_bpermutev64hf;
3933 break;
3934 case E_V64SFmode:
3935 ds_bpermute = gen_ds_bpermutev64sf;
3936 break;
3937 case E_V64DImode:
3938 ds_bpermute = gen_ds_bpermutev64di;
3939 break;
3940 case E_V64DFmode:
3941 ds_bpermute = gen_ds_bpermutev64df;
3942 break;
3943 default:
3944 gcc_assert (false);
3945 }
3946
3947 /* Load elements from src0 to dst. */
3948 gcc_assert (~src1_lanes);
3949 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
3950
3951 /* Load elements from src1 to dst. */
3952 if (src1_lanes)
3953 {
3954 /* Masking a lane masks both the destination and source lanes for
3955 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
3956 then add an extra masked move to merge the results of permuting
3957 the two source vectors together.
3958 */
3959 rtx tmp = gen_reg_rtx (vmode);
3960 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
3961 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
3962 }
3963
3964 return true;
3965}
3966
3967/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
3968
3969 Return nonzero if vector MODE is supported with at least move
3970 instructions. */
3971
3972static bool
3973gcn_vector_mode_supported_p (machine_mode mode)
3974{
3975 /* FIXME: Enable V64QImode and V64HImode.
3976 We should support these modes, but vector operations are usually
3977 assumed to automatically truncate types, and GCN does not. We
3978 need to add explicit truncates and/or use SDWA for QI/HI insns. */
3979 return (/* mode == V64QImode || mode == V64HImode
3980 ||*/ mode == V64SImode || mode == V64DImode
3981 || mode == V64SFmode || mode == V64DFmode);
3982}
3983
3984/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
3985
3986 Enables autovectorization for all supported modes. */
3987
3988static machine_mode
3989gcn_vectorize_preferred_simd_mode (scalar_mode mode)
3990{
3991 switch (mode)
3992 {
3993 case E_QImode:
3994 return V64QImode;
3995 case E_HImode:
3996 return V64HImode;
3997 case E_SImode:
3998 return V64SImode;
3999 case E_DImode:
4000 return V64DImode;
4001 case E_SFmode:
4002 return V64SFmode;
4003 case E_DFmode:
4004 return V64DFmode;
4005 default:
4006 return word_mode;
4007 }
4008}
4009
4010/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4011
4012 Returns the preferred alignment in bits for accesses to vectors of type TYPE
4013 in vectorized code. This might be less than or greater than the ABI-defined
4014 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4015 of a single element, in which case the vectorizer will not try to optimize
4016 for alignment. */
4017
4018static poly_uint64
4019gcn_preferred_vector_alignment (const_tree type)
4020{
4021 return TYPE_ALIGN (TREE_TYPE (type));
4022}
4023
4024/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4025
4026 Return true if the target supports misaligned vector store/load of a
4027 specific factor denoted in the misalignment parameter. */
4028
4029static bool
4030gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4031 const_tree type, int misalignment,
4032 bool is_packed)
4033{
4034 if (is_packed)
4035 return false;
4036
4037 /* If the misalignment is unknown, we should be able to handle the access
4038 so long as it is not to a member of a packed data structure. */
4039 if (misalignment == -1)
4040 return true;
4041
4042 /* Return true if the misalignment is a multiple of the natural alignment
4043 of the vector's element type. This is probably always going to be
4044 true in practice, since we've already established that this isn't a
4045 packed access. */
4046 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4047}
4048
4049/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4050
4051 Return true if vector alignment is reachable (by peeling N iterations) for
4052 the given scalar type TYPE. */
4053
4054static bool
4055gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4056{
4057 /* Vectors which aren't in packed structures will not be less aligned than
4058 the natural alignment of their element type, so this is safe. */
4059 return !is_packed;
4060}
4061
4062/* Generate DPP instructions used for vector reductions.
4063
4064 The opcode is given by INSN.
4065 The first operand of the operation is shifted right by SHIFT vector lanes.
4066 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4067 broadcast to the next row (thereby acting like a shift of 16 for the end of
4068 each row). If SHIFT is 32, lane 31 is broadcast to all the
4069 following lanes (thereby acting like a shift of 32 for lane 63). */
4070
4071char *
4072gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4073 int unspec, int shift)
4074{
4075 static char buf[64];
4076 const char *dpp;
4077 const char *vcc_in = "";
4078 const char *vcc_out = "";
4079
4080 /* Add the vcc operand if needed. */
4081 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4082 {
4083 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4084 vcc_in = ", vcc";
4085
4086 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4087 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4088 vcc_out = ", vcc";
4089 }
4090
4091 /* Add the DPP modifiers. */
4092 switch (shift)
4093 {
4094 case 1:
4095 dpp = "row_shr:1 bound_ctrl:0";
4096 break;
4097 case 2:
4098 dpp = "row_shr:2 bound_ctrl:0";
4099 break;
4100 case 4:
4101 dpp = "row_shr:4 bank_mask:0xe";
4102 break;
4103 case 8:
4104 dpp = "row_shr:8 bank_mask:0xc";
4105 break;
4106 case 16:
4107 dpp = "row_bcast:15 row_mask:0xa";
4108 break;
4109 case 32:
4110 dpp = "row_bcast:31 row_mask:0xc";
4111 break;
4112 default:
4113 gcc_unreachable ();
4114 }
4115
4116 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4117
4118 return buf;
4119}
4120
4121/* Generate vector reductions in terms of DPP instructions.
4122
4123 The vector register SRC of mode MODE is reduced using the operation given
4124 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4125 register. */
4126
4127rtx
4128gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4129{
4130 rtx tmp = gen_reg_rtx (mode);
4131 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4132 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4133 && (TARGET_GCN3 || mode == V64DImode);
4134
4135 if (use_plus_carry)
4136 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4137
4138 /* Perform reduction by first performing the reduction operation on every
4139 pair of lanes, then on every pair of results from the previous
4140 iteration (thereby effectively reducing every 4 lanes) and so on until
4141 all lanes are reduced. */
4142 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4143 {
4144 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4145 rtx insn = gen_rtx_SET (tmp,
4146 gen_rtx_UNSPEC (mode,
4147 gen_rtvec (3,
4148 src, src, shift_val),
4149 unspec));
4150
4151 /* Add clobber for instructions that set the carry flags. */
4152 if (use_plus_carry)
4153 {
4154 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4155 gen_rtx_REG (DImode, VCC_REG));
4156 insn = gen_rtx_PARALLEL (VOIDmode,
4157 gen_rtvec (2, insn, clobber));
4158 }
4159
4160 emit_insn (insn);
4161
4162 /* The source operands for every iteration after the first
4163 should be TMP. */
4164 src = tmp;
4165 }
4166
4167 return tmp;
4168}
4169
4170/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4171
4172int
4173gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4174 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4175{
4176 /* Always vectorize. */
4177 return 1;
4178}
4179
4180/* }}} */
4181/* {{{ md_reorg pass. */
4182
4183/* Identify VMEM instructions from their "type" attribute. */
4184
4185static bool
4186gcn_vmem_insn_p (attr_type type)
4187{
4188 switch (type)
4189 {
4190 case TYPE_MUBUF:
4191 case TYPE_MTBUF:
4192 case TYPE_FLAT:
4193 return true;
4194 case TYPE_UNKNOWN:
4195 case TYPE_SOP1:
4196 case TYPE_SOP2:
4197 case TYPE_SOPK:
4198 case TYPE_SOPC:
4199 case TYPE_SOPP:
4200 case TYPE_SMEM:
4201 case TYPE_DS:
4202 case TYPE_VOP2:
4203 case TYPE_VOP1:
4204 case TYPE_VOPC:
4205 case TYPE_VOP3A:
4206 case TYPE_VOP3B:
4207 case TYPE_VOP_SDWA:
4208 case TYPE_VOP_DPP:
4209 case TYPE_MULT:
4210 case TYPE_VMULT:
4211 return false;
4212 }
4213 gcc_unreachable ();
4214 return false;
4215}
4216
4217/* If INSN sets the EXEC register to a constant value, return the value,
4218 otherwise return zero. */
4219
4220static int64_t
4221gcn_insn_exec_value (rtx_insn *insn)
4222{
4223 if (!NONDEBUG_INSN_P (insn))
4224 return 0;
4225
4226 rtx pattern = PATTERN (insn);
4227
4228 if (GET_CODE (pattern) == SET)
4229 {
4230 rtx dest = XEXP (pattern, 0);
4231 rtx src = XEXP (pattern, 1);
4232
4233 if (GET_MODE (dest) == DImode
4234 && REG_P (dest) && REGNO (dest) == EXEC_REG
4235 && CONST_INT_P (src))
4236 return INTVAL (src);
4237 }
4238
4239 return 0;
4240}
4241
4242/* Sets the EXEC register before INSN to the value that it had after
4243 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4244 known, otherwise it returns zero. */
4245
4246static int64_t
4247gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4248 bool curr_exec_known, bool &last_exec_def_saved)
4249{
4250 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4251 rtx exec;
4252
4253 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4254
4255 if (exec_value)
4256 {
4257 /* If the EXEC value is a constant and it happens to be the same as the
4258 current EXEC value, the restore can be skipped. */
4259 if (curr_exec_known && exec_value == curr_exec)
4260 return exec_value;
4261
4262 exec = GEN_INT (exec_value);
4263 }
4264 else
4265 {
4266 /* If the EXEC value is not a constant, save it in a register after the
4267 point of definition. */
4268 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4269
4270 if (!last_exec_def_saved)
4271 {
4272 start_sequence ();
4273 emit_move_insn (exec_save_reg, exec_reg);
4274 rtx_insn *seq = get_insns ();
4275 end_sequence ();
4276
4277 emit_insn_after (seq, last_exec_def);
4278 if (dump_file && (dump_flags & TDF_DETAILS))
4279 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4280 INSN_UID (last_exec_def));
4281
4282 last_exec_def_saved = true;
4283 }
4284
4285 exec = exec_save_reg;
4286 }
4287
4288 /* Restore EXEC register before the usage. */
4289 start_sequence ();
4290 emit_move_insn (exec_reg, exec);
4291 rtx_insn *seq = get_insns ();
4292 end_sequence ();
4293 emit_insn_before (seq, insn);
4294
4295 if (dump_file && (dump_flags & TDF_DETAILS))
4296 {
4297 if (exec_value)
4298 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4299 exec_value, INSN_UID (insn));
4300 else
4301 fprintf (dump_file,
4302 "Restoring EXEC from saved value before insn %d.\n",
4303 INSN_UID (insn));
4304 }
4305
4306 return exec_value;
4307}
4308
4309/* Implement TARGET_MACHINE_DEPENDENT_REORG.
4310
4311 Ensure that pipeline dependencies and lane masking are set correctly. */
4312
4313static void
4314gcn_md_reorg (void)
4315{
4316 basic_block bb;
4317 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4318 rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG);
4319 rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG);
4320 regset_head live;
4321
4322 INIT_REG_SET (&live);
4323
4324 compute_bb_for_insn ();
4325
4326 if (!optimize)
4327 {
4328 split_all_insns ();
4329 if (dump_file && (dump_flags & TDF_DETAILS))
4330 {
4331 fprintf (dump_file, "After split:\n");
4332 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4333 }
4334
4335 /* Update data-flow information for split instructions. */
4336 df_insn_rescan_all ();
4337 }
4338
4339 df_analyze ();
4340
4341 /* This pass ensures that the EXEC register is set correctly, according
4342 to the "exec" attribute. However, care must be taken so that the
4343 value that reaches explicit uses of the EXEC register remains the
4344 same as before.
4345 */
4346
4347 FOR_EACH_BB_FN (bb, cfun)
4348 {
4349 if (dump_file && (dump_flags & TDF_DETAILS))
4350 fprintf (dump_file, "BB %d:\n", bb->index);
4351
4352 rtx_insn *insn, *curr;
4353 rtx_insn *last_exec_def = BB_HEAD (bb);
4354 bool last_exec_def_saved = false;
4355 bool curr_exec_explicit = true;
4356 bool curr_exec_known = true;
4357 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4358 after last_exec_def is executed'. */
4359
4360 FOR_BB_INSNS_SAFE (bb, insn, curr)
4361 {
4362 if (!NONDEBUG_INSN_P (insn))
4363 continue;
4364
4365 if (GET_CODE (PATTERN (insn)) == USE
4366 || GET_CODE (PATTERN (insn)) == CLOBBER)
4367 continue;
4368
4369 HARD_REG_SET defs, uses;
4370 CLEAR_HARD_REG_SET (defs);
4371 CLEAR_HARD_REG_SET (uses);
4372 note_stores (insn, record_hard_reg_sets, &defs);
4373 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4374
4375 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4376 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4377 bool exec_used = (hard_reg_set_intersect_p
4378 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4379 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4380
4381 /* Check the instruction for implicit setting of EXEC via an
4382 attribute. */
4383 attr_exec exec_attr = get_attr_exec (insn);
4384 int64_t new_exec;
4385
4386 switch (exec_attr)
4387 {
4388 case EXEC_NONE:
4389 new_exec = 0;
4390 break;
4391
4392 case EXEC_SINGLE:
4393 /* Instructions that do not involve memory accesses only require
4394 bit 0 of EXEC to be set. */
4395 if (gcn_vmem_insn_p (get_attr_type (insn))
4396 || get_attr_type (insn) == TYPE_DS)
4397 new_exec = 1;
4398 else
4399 new_exec = curr_exec | 1;
4400 break;
4401
4402 case EXEC_FULL:
4403 new_exec = -1;
4404 break;
4405
4406 default: /* Auto-detect what setting is appropriate. */
4407 {
4408 new_exec = 0;
4409
4410 /* If EXEC is referenced explicitly then we don't need to do
4411 anything to set it, so we're done. */
4412 if (exec_used)
4413 break;
4414
4415 /* Scan the insn for VGPRs defs or uses. The mode determines
4416 what kind of exec is needed. */
4417 subrtx_iterator::array_type array;
4418 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4419 {
4420 const_rtx x = *iter;
4421 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4422 {
4423 if (VECTOR_MODE_P (GET_MODE (x)))
4424 {
4425 new_exec = -1;
4426 break;
4427 }
4428 else
4429 new_exec = 1;
4430 }
4431 }
4432 }
4433 break;
4434 }
4435
4436 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4437 {
4438 start_sequence ();
4439 emit_move_insn (exec_reg, GEN_INT (new_exec));
4440 rtx_insn *seq = get_insns ();
4441 end_sequence ();
4442 emit_insn_before (seq, insn);
4443
4444 if (dump_file && (dump_flags & TDF_DETAILS))
4445 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4446 new_exec, INSN_UID (insn));
4447
4448 curr_exec = new_exec;
4449 curr_exec_explicit = false;
4450 curr_exec_known = true;
4451 }
4452 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4453 {
4454 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4455 new_exec, INSN_UID (insn));
4456 }
4457
4458 /* The state of the EXEC register is unknown after a
4459 function call. */
4460 if (CALL_P (insn))
4461 curr_exec_known = false;
4462
4463 /* Handle explicit uses of EXEC. If the instruction is a partial
4464 explicit definition of EXEC, then treat it as an explicit use of
4465 EXEC as well. */
4466 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4467 {
4468 /* An instruction that explicitly uses EXEC should not also
4469 implicitly define it. */
4470 gcc_assert (!exec_used || !new_exec);
4471
4472 if (!curr_exec_known || !curr_exec_explicit)
4473 {
4474 /* Restore the previous explicitly defined value. */
4475 curr_exec = gcn_restore_exec (insn, last_exec_def,
4476 curr_exec, curr_exec_known,
4477 last_exec_def_saved);
4478 curr_exec_explicit = true;
4479 curr_exec_known = true;
4480 }
4481 }
4482
4483 /* Handle explicit definitions of EXEC. */
4484 if (exec_lo_def_p || exec_hi_def_p)
4485 {
4486 last_exec_def = insn;
4487 last_exec_def_saved = false;
4488 curr_exec = gcn_insn_exec_value (insn);
4489 curr_exec_explicit = true;
4490 curr_exec_known = true;
4491
4492 if (dump_file && (dump_flags & TDF_DETAILS))
4493 fprintf (dump_file,
4494 "Found %s definition of EXEC at insn %d.\n",
4495 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4496 INSN_UID (insn));
4497 }
4498 }
4499
4500 COPY_REG_SET (&live, DF_LR_OUT (bb));
4501 df_simulate_initialize_backwards (bb, &live);
4502
4503 /* If EXEC is live after the basic block, restore the value of EXEC
4504 at the end of the block. */
4505 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4506 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4507 && (!curr_exec_known || !curr_exec_explicit))
4508 {
4509 rtx_insn *end_insn = BB_END (bb);
4510
4511 /* If the instruction is not a jump instruction, do the restore
4512 after the last instruction in the basic block. */
4513 if (NONJUMP_INSN_P (end_insn))
4514 end_insn = NEXT_INSN (end_insn);
4515
4516 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4517 curr_exec_known, last_exec_def_saved);
4518 }
4519 }
4520
4521 CLEAR_REG_SET (&live);
4522
4523 /* "Manually Inserted Wait States (NOPs)."
4524
4525 GCN hardware detects most kinds of register dependencies, but there
4526 are some exceptions documented in the ISA manual. This pass
4527 detects the missed cases, and inserts the documented number of NOPs
4528 required for correct execution. */
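/* Example of the bookkeeping below: if a VALU instruction wrote an SGPR
   two instructions earlier (prev_insn->age == 2) and the current
   instruction is a VMEM access reading that SGPR, the 5-wait-state rule
   gives nops_rqd = 5 - 2 = 3, so three s_nop instructions are emitted
   before the VMEM instruction.  */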
4529
4530 const int max_waits = 5;
4531 struct ilist
4532 {
4533 rtx_insn *insn;
4534 attr_unit unit;
4535 attr_delayeduse delayeduse;
4536 HARD_REG_SET writes;
4537 HARD_REG_SET reads;
4538 int age;
4539 } back[max_waits];
4540 int oldest = 0;
4541 for (int i = 0; i < max_waits; i++)
4542 back[i].insn = NULL;
4543
4544 rtx_insn *insn, *last_insn = NULL;
4545 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4546 {
4547 if (!NONDEBUG_INSN_P (insn))
4548 continue;
4549
4550 if (GET_CODE (PATTERN (insn)) == USE
4551 || GET_CODE (PATTERN (insn)) == CLOBBER)
4552 continue;
4553
4554 attr_type itype = get_attr_type (insn);
4555 attr_unit iunit = get_attr_unit (insn);
4556 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4557 HARD_REG_SET ireads, iwrites;
4558 CLEAR_HARD_REG_SET (ireads);
4559 CLEAR_HARD_REG_SET (iwrites);
4560 note_stores (insn, record_hard_reg_sets, &iwrites);
4561 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4562
4563 /* Scan recent previous instructions for dependencies not handled in
4564 hardware. */
4565 int nops_rqd = 0;
4566 for (int i = oldest; i < oldest + max_waits; i++)
4567 {
4568 struct ilist *prev_insn = &back[i % max_waits];
4569
4570 if (!prev_insn->insn)
4571 continue;
4572
4573 /* VALU writes SGPR followed by VMEM reading the same SGPR
4574 requires 5 wait states. */
4575 if ((prev_insn->age + nops_rqd) < 5
4576 && prev_insn->unit == UNIT_VECTOR
4577 && gcn_vmem_insn_p (itype))
4578 {
4579 HARD_REG_SET regs = prev_insn->writes & ireads;
4580 if (hard_reg_set_intersect_p
4581 (regs, reg_class_contents[(int) SGPR_REGS]))
4582 nops_rqd = 5 - prev_insn->age;
4583 }
4584
4585 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4586 requires 5 wait states. */
4587 if ((prev_insn->age + nops_rqd) < 5
4588 && prev_insn->unit == UNIT_VECTOR
4589 && iunit == UNIT_VECTOR
4590 && ((hard_reg_set_intersect_p
4591 (prev_insn->writes,
4592 reg_class_contents[(int) EXEC_MASK_REG])
4593 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4594 ||
4595 (hard_reg_set_intersect_p
4596 (prev_insn->writes,
4597 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4598 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4599 nops_rqd = 5 - prev_insn->age;
4600
4601 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4602 SGPR/VCC as lane select requires 4 wait states. */
4603 if ((prev_insn->age + nops_rqd) < 4
4604 && prev_insn->unit == UNIT_VECTOR
4605 && get_attr_laneselect (insn) == LANESELECT_YES)
4606 {
4607 HARD_REG_SET regs = prev_insn->writes & ireads;
4608 if (hard_reg_set_intersect_p
4609 (regs, reg_class_contents[(int) SGPR_REGS])
4610 || hard_reg_set_intersect_p
4611 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4612 nops_rqd = 4 - prev_insn->age;
4613 }
4614
4615 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4616 requires 2 wait states. */
4617 if ((prev_insn->age + nops_rqd) < 2
4618 && prev_insn->unit == UNIT_VECTOR
4619 && itype == TYPE_VOP_DPP)
4620 {
4621 HARD_REG_SET regs = prev_insn->writes & ireads;
4622 if (hard_reg_set_intersect_p
4623 (regs, reg_class_contents[(int) VGPR_REGS]))
4624 nops_rqd = 2 - prev_insn->age;
4625 }
4625 }
4626
4627 /* Stores require that their input registers are not overwritten by the
4628 following instruction. */
4629 if ((prev_insn->age + nops_rqd) < 1
4630 && prev_insn->delayeduse == DELAYEDUSE_YES
4631 && ((hard_reg_set_intersect_p
4632 (prev_insn->reads, iwrites))))
4633 nops_rqd = 1 - prev_insn->age;
4634 }
4635
4636 /* Insert the required number of NOPs. */
4637 for (int i = nops_rqd; i > 0; i--)
4638 emit_insn_after (gen_nop (), last_insn);
4639
4640 /* Age the previous instructions. We can also ignore writes to
4641 registers subsequently overwritten. */
4642 HARD_REG_SET written;
4643 CLEAR_HARD_REG_SET (written);
4644 for (int i = oldest + max_waits - 1; i > oldest; i--)
4645 {
4646 struct ilist *prev_insn = &back[i % max_waits];
4647
4648 /* Assume all instructions are equivalent to one "wait", the same
4649 as s_nop. This is probably true for SALU, but not VALU (which
4650 may take longer), so this is not optimal. However, AMD do
4651 not publish the cycle times for instructions. */
4652 prev_insn->age += 1 + nops_rqd;
4653
4654 written |= iwrites;
4655 prev_insn->writes &= ~written;
4656 }
4657
4658 /* Track the current instruction as a previous instruction. */
4659 back[oldest].insn = insn;
4660 back[oldest].unit = iunit;
4661 back[oldest].delayeduse = idelayeduse;
4662 back[oldest].writes = iwrites;
4663 back[oldest].reads = ireads;
4664 back[oldest].age = 0;
4665 oldest = (oldest + 1) % max_waits;
4666
4667 last_insn = insn;
4668 }
4669}
4670
4671/* }}} */
4672/* {{{ OpenACC / OpenMP. */
4673
4674#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
4675#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
4676#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
4677
4678/* Implement TARGET_GOACC_VALIDATE_DIMS.
4679
4680 Check the launch dimensions provided for an OpenACC compute
4681 region, or routine. */
4682
4683static bool
4684gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
4685 unsigned /*used*/)
4686{
4687 bool changed = false;
4688
4689 /* FIXME: remove -facc-experimental-workers when they're ready. */
4690 int max_workers = flag_worker_partitioning ? 16 : 1;
4691
4692 gcc_assert (!flag_worker_partitioning);
4693
4694 /* The vector size must appear to be 64, to the user, unless this is a
4695 SEQ routine. The real, internal value is always 1, which means use
4696 autovectorization, but the user should not see that. */
4697 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4698 && dims[GOMP_DIM_VECTOR] >= 0)
4699 {
4700 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
4701 && dims[GOMP_DIM_VECTOR] != 64)
4702 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4703 OPT_Wopenacc_dims,
4704 (dims[GOMP_DIM_VECTOR]
4705 ? G_("using vector_length (64), ignoring %d")
4706 : G_("using vector_length (64), "
4707 "ignoring runtime setting")),
4708 dims[GOMP_DIM_VECTOR]);
4709 dims[GOMP_DIM_VECTOR] = 1;
4710 changed = true;
4711 }
4712
4713 /* Check the num workers is not too large. */
4714 if (dims[GOMP_DIM_WORKER] > max_workers)
4715 {
4716 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4717 OPT_Wopenacc_dims,
4718 "using num_workers (%d), ignoring %d",
4719 max_workers, dims[GOMP_DIM_WORKER]);
4720 dims[GOMP_DIM_WORKER] = max_workers;
4721 changed = true;
4722 }
4723
4724 /* Set global defaults. */
4725 if (!decl)
4726 {
4727 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
4728 if (dims[GOMP_DIM_WORKER] < 0)
4729 dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
4730 ? GCN_DEFAULT_WORKERS : 1);
4731 if (dims[GOMP_DIM_GANG] < 0)
4732 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
4733 changed = true;
4734 }
4735
4736 return changed;
4737}
4738
4739/* Helper function for oacc_dim_size instruction.
4740 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
4741
4742rtx
4743gcn_oacc_dim_size (int dim)
4744{
4745 if (dim < 0 || dim > 2)
4746 error ("offload dimension out of range (%d)", dim);
4747
4748 /* Vectors are a special case. */
4749 if (dim == 2)
4750 return const1_rtx; /* Think of this as 1 times 64. */
4751
4752 static int offset[] = {
4753 /* Offsets into dispatch packet. */
4754 12, /* X dim = Gang / Team / Work-group. */
4755 20, /* Z dim = Worker / Thread / Wavefront. */
4756 16 /* Y dim = Vector / SIMD / Work-item. */
4757 };
4758 rtx addr = gen_rtx_PLUS (DImode,
4759 gen_rtx_REG (DImode,
4760 cfun->machine->args.
4761 reg[DISPATCH_PTR_ARG]),
4762 GEN_INT (offset[dim]));
4763 return gen_rtx_MEM (SImode, addr);
4764}
4765
4766/* Helper function for oacc_dim_pos instruction.
4767 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
4768
4769rtx
4770gcn_oacc_dim_pos (int dim)
4771{
4772 if (dim < 0 || dim > 2)
4773 error ("offload dimension out of range (%d)", dim);
4774
4775 static const int reg[] = {
4776 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
4777 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
4778 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
4779 };
4780
4781 int reg_num = cfun->machine->args.reg[reg[dim]];
4782
4783 /* The information must have been requested by the kernel. */
4784 gcc_assert (reg_num >= 0);
4785
4786 return gen_rtx_REG (SImode, reg_num);
4787}
4788
4789/* Implement TARGET_GOACC_FORK_JOIN. */
4790
4791static bool
4792gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
4793 bool ARG_UNUSED (is_fork))
4794{
4795 /* GCN does not use the fork/join concept invented for NVPTX.
4796 Instead we use standard autovectorization. */
4797 return false;
4798}
4799
4800/* Implement ???????
4801 FIXME make this a real hook.
4802
4803 Adjust FNDECL such that options inherited from the host compiler
4804 are made appropriate for the accelerator compiler. */
4805
4806void
4807gcn_fixup_accel_lto_options (tree fndecl)
4808{
4809 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4810 if (!func_optimize)
4811 return;
4812
4813 tree old_optimize = build_optimization_node (&global_options);
4814 tree new_optimize;
4815
4816 /* If the function changed the optimization levels as well as
4817 setting target options, start with the optimizations
4818 specified. */
4819 if (func_optimize != old_optimize)
4820 cl_optimization_restore (&global_options,
4821 TREE_OPTIMIZATION (func_optimize));
4822
4823 gcn_option_override ();
4824
4825 /* The target attributes may also change some optimization flags,
4826 so update the optimization options if necessary. */
4827 new_optimize = build_optimization_node (&global_options);
4828
4829 if (old_optimize != new_optimize)
4830 {
4831 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4832 cl_optimization_restore (&global_options,
4833 TREE_OPTIMIZATION (old_optimize));
4834 }
4835}
4836
4837/* }}} */
4838/* {{{ ASM Output. */
4839
4840/* Implement TARGET_ASM_FILE_START.
4841
4842 Print assembler file header text. */
4843
4844static void
4845output_file_start (void)
4846{
4847 fprintf (asm_out_file, "\t.text\n");
4848 fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
4849 fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
4850 fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
4851 fprintf (asm_out_file, "\t.text\n");
4852}
4853
4854/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
4855
4856 Print the initial definition of a function name.
4857
4858 For GCN kernel entry points this includes all the HSA meta-data, special
4859 alignment constraints that don't apply to regular functions, and magic
4860 comments that pass information to mkoffload. */
4861
4862void
4863gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
4864{
4865 int sgpr, vgpr;
4866 bool xnack_enabled = false;
4867 int extra_regs = 0;
4868
4869 if (cfun && cfun->machine && cfun->machine->normal_function)
4870 {
4871 fputs ("\t.type\t", file);
4872 assemble_name (file, name);
4873 fputs (",@function\n", file);
4874 assemble_name (file, name);
4875 fputs (":\n", file);
4876 return;
4877 }
4878
4879 /* Determine count of sgpr/vgpr registers by looking for last
4880 one used. */
4881 for (sgpr = 101; sgpr >= 0; sgpr--)
4882 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
4883 break;
4884 sgpr++;
4885 for (vgpr = 255; vgpr >= 0; vgpr--)
4886 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
4887 break;
4888 vgpr++;
4889
4890 if (xnack_enabled)
4891 extra_regs = 6;
4892 if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
4893 || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
4894 extra_regs = 4;
4895 else if (df_regs_ever_live_p (VCC_LO_REG)
4896 || df_regs_ever_live_p (VCC_HI_REG))
4897 extra_regs = 2;
4898
4899 if (!leaf_function_p ())
4900 {
4901 /* We can't know how many registers function calls might use. */
4902 if (vgpr < 64)
4903 vgpr = 64;
4904 if (sgpr + extra_regs < 102)
4905 sgpr = 102 - extra_regs;
4906 }
4907
4908 fputs ("\t.align\t256\n", file);
4909 fputs ("\t.type\t", file);
4910 assemble_name (file, name);
4911 fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
4912 assemble_name (file, name);
4913 fputs ("\n", file);
4914 assemble_name (file, name);
4915 fputs (":\n", file);
4916 fprintf (file, "\t.amd_kernel_code_t\n"
4917 "\t\tkernel_code_version_major = 1\n"
4918 "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
4919 /* "\t\tmachine_version_major = 8\n"
4920 "\t\tmachine_version_minor = 0\n"
4921 "\t\tmachine_version_stepping = 1\n" */
4922 "\t\tkernel_code_entry_byte_offset = 256\n"
4923 "\t\tkernel_code_prefetch_byte_size = 0\n"
4924 "\t\tmax_scratch_backing_memory_byte_size = 0\n"
4925 "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
4926 "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
4927 "\t\tcompute_pgm_rsrc1_priority = 0\n"
4928 "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
4929 "\t\tcompute_pgm_rsrc1_priv = 0\n"
4930 "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
4931 "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
4932 "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
4933 /* We enable scratch memory. */
4934 "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
4935 "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
4936 "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
4937 "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
4938 "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
4939 "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
4940 "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
4941 "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
4942 "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
4943 "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
4944 (vgpr - 1) / 4,
4945 /* Must match wavefront_sgpr_count */
4946 (sgpr + extra_regs + 7) / 8 - 1,
4947 /* The total number of SGPR user data registers requested. This
4948 number must match the number of user data registers enabled. */
4949 cfun->machine->args.nsgprs);
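/* Example of the granule encodings above: with vgpr == 64 the VGPR field
   is (64 - 1) / 4 == 15 (allocation granules of 4 VGPRs), and with
   sgpr + extra_regs == 102 the SGPR field is (102 + 7) / 8 - 1 == 12
   (granules of 8 SGPRs).  */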
4950 int reg = FIRST_SGPR_REG;
4951 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
4952 {
4953 int reg_first = -1;
4954 int reg_last;
4955 if ((cfun->machine->args.requested & (1 << a))
4956 && (gcn_kernel_arg_types[a].fixed_regno < 0))
4957 {
4958 reg_first = reg;
4959 reg_last = (reg_first
4960 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
4961 / UNITS_PER_WORD) - 1);
4962 reg = reg_last + 1;
4963 }
4964
4965 if (gcn_kernel_arg_types[a].header_pseudo)
4966 {
4967 fprintf (file, "\t\t%s = %i",
4968 gcn_kernel_arg_types[a].header_pseudo,
4969 (cfun->machine->args.requested & (1 << a)) != 0);
4970 if (reg_first != -1)
4971 {
4972 fprintf (file, " ; (");
4973 for (int i = reg_first; i <= reg_last; ++i)
4974 {
4975 if (i != reg_first)
4976 fprintf (file, ", ");
4977 fprintf (file, "%s", reg_names[i]);
4978 }
4979 fprintf (file, ")");
4980 }
4981 fprintf (file, "\n");
4982 }
4983 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
4984 && cfun->machine->args.requested & (1 << a))
4985 fprintf (file, "\t\t; %s = %i (%s)\n",
4986 gcn_kernel_arg_types[a].name,
4987 (cfun->machine->args.requested & (1 << a)) != 0,
4988 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
4989 }
4990 fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
4991 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
4992 ? 2
4993 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
4994 ? 1 : 0);
4995 fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
4996 "\t\tprivate_element_size = 1\n"
4997 "\t\tis_ptr64 = 1\n"
4998 "\t\tis_dynamic_callstack = 0\n"
4999 "\t\tis_debug_enabled = 0\n"
5000 "\t\tis_xnack_enabled = %i\n"
5001 "\t\tworkitem_private_segment_byte_size = %i\n"
5002 "\t\tworkgroup_group_segment_byte_size = %u\n"
5003 "\t\tgds_segment_byte_size = 0\n"
5004 "\t\tkernarg_segment_byte_size = %i\n"
5005 "\t\tworkgroup_fbarrier_count = 0\n"
5006 "\t\twavefront_sgpr_count = %i\n"
5007 "\t\tworkitem_vgpr_count = %i\n"
5008 "\t\treserved_vgpr_first = 0\n"
5009 "\t\treserved_vgpr_count = 0\n"
5010 "\t\treserved_sgpr_first = 0\n"
5011 "\t\treserved_sgpr_count = 0\n"
5012 "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
5013 "\t\tdebug_private_segment_buffer_sgpr = 0\n"
5014 "\t\tkernarg_segment_alignment = %i\n"
5015 "\t\tgroup_segment_alignment = 4\n"
5016 "\t\tprivate_segment_alignment = %i\n"
5017 "\t\twavefront_size = 6\n"
5018 "\t\tcall_convention = 0\n"
5019 "\t\truntime_loader_kernel_symbol = 0\n"
5020 "\t.end_amd_kernel_code_t\n", xnack_enabled,
5021 /* workitem_private_segment_bytes_size needs to be
5022 one 64th the wave-front stack size. */
5023 stack_size_opt / 64,
5024 LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
5025 /* Number of scalar registers used by a wavefront. This
5026 includes the special SGPRs for VCC, Flat Scratch (Base,
5027 Size) and XNACK (for GFX8 (VI)+). It does not include the
5028 16 SGPR added if a trap handler is enabled. Must match
5029 compute_pgm_rsrc1.sgprs. */
5030 sgpr + extra_regs, vgpr,
5031 cfun->machine->kernarg_segment_alignment,
5032 crtl->stack_alignment_needed / 8);
5033
5034 /* This comment is read by mkoffload. */
5035 if (flag_openacc)
5036 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5037 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5038 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5039 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5040}
5041
5042/* Implement TARGET_ASM_SELECT_SECTION.
5043
5044 Return the section into which EXP should be placed. */
5045
5046static section *
5047gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5048{
5049 if (TREE_TYPE (exp) != error_mark_node
5050 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5051 {
5052 if (!DECL_P (exp))
5053 return get_section (".lds_bss",
5054 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5055 NULL);
5056
5057 return get_named_section (exp, ".lds_bss", reloc);
5058 }
5059
5060 return default_elf_select_section (exp, reloc, align);
5061}
5062
5063/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5064
5065 Emits custom text into the assembler file at the head of each function. */
5066
5067static void
5068gcn_target_asm_function_prologue (FILE *file)
5069{
5070 machine_function *offsets = gcn_compute_frame_offsets ();
5071
5072 asm_fprintf (file, "\t; using %s addressing in function\n",
5073 offsets->use_flat_addressing ? "flat" : "global");
5074
5075 if (offsets->normal_function)
5076 {
5077 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5078 offsets->need_frame_pointer ? "true" : "false");
5079 asm_fprintf (file, "\t; lr needs saving: %s\n",
5080 offsets->lr_needs_saving ? "true" : "false");
5081 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5082 offsets->outgoing_args_size);
5083 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5084 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5085 asm_fprintf (file, "\t; callee save size: %wd\n",
5086 offsets->callee_saves);
5087 }
5088 else
5089 {
5090 asm_fprintf (file, "\t; HSA kernel entry point\n");
5091 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5092 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5093 offsets->outgoing_args_size);
5094
5095 /* Enable denorms. */
5096 asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
5097 " input and output denorms\n");
5098 asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
5099 }
5100}
5101
5102/* Helper function for print_operand and print_operand_address.
5103
5104 Print a register as the assembler requires, according to mode and name. */
5105
5106static void
5107print_reg (FILE *file, rtx x)
5108{
5109 machine_mode mode = GET_MODE (x);
5110 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5111 || mode == HFmode || mode == SFmode
5112 || mode == V64SFmode || mode == V64SImode
5113 || mode == V64QImode || mode == V64HImode)
5114 fprintf (file, "%s", reg_names[REGNO (x)]);
5115 else if (mode == DImode || mode == V64DImode
5116 || mode == DFmode || mode == V64DFmode)
5117 {
5118 if (SGPR_REGNO_P (REGNO (x)))
5119 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5120 REGNO (x) - FIRST_SGPR_REG + 1);
5121 else if (VGPR_REGNO_P (REGNO (x)))
5122 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5123 REGNO (x) - FIRST_VGPR_REG + 1);
5124 else if (REGNO (x) == FLAT_SCRATCH_REG)
5125 fprintf (file, "flat_scratch");
5126 else if (REGNO (x) == EXEC_REG)
5127 fprintf (file, "exec");
5128 else if (REGNO (x) == VCC_LO_REG)
5129 fprintf (file, "vcc");
5130 else
5131 fprintf (file, "[%s:%s]",
5132 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5133 }
5134 else if (mode == TImode)
5135 {
5136 if (SGPR_REGNO_P (REGNO (x)))
5137 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5138 REGNO (x) - FIRST_SGPR_REG + 3);
5139 else if (VGPR_REGNO_P (REGNO (x)))
5140 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5141 REGNO (x) - FIRST_VGPR_REG + 3);
5142 else
5143 gcc_unreachable ();
5144 }
5145 else
5146 gcc_unreachable ();
5147}
5148
5149/* Implement TARGET_SECTION_TYPE_FLAGS.
5150
5151 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5152
5153static unsigned int
5154gcn_section_type_flags (tree decl, const char *name, int reloc)
5155{
5156 if (strcmp (name, ".lds_bss") == 0)
5157 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5158
5159 return default_section_type_flags (decl, name, reloc);
5160}
5161
5162/* Helper function for gcn_asm_output_symbol_ref.
5163
5164 FIXME: If we want to have propagation blocks allocated separately and
5165 statically like this, it would be better done via symbol refs and the
5166 assembler/linker. This is a temporary hack. */
5167
5168static void
5169gcn_print_lds_decl (FILE *f, tree var)
5170{
5171 int *offset;
5172 machine_function *machfun = cfun->machine;
5173
5174 if ((offset = machfun->lds_allocs->get (var)))
5175 fprintf (f, "%u", (unsigned) *offset);
5176 else
5177 {
5178 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5179 tree type = TREE_TYPE (var);
5180 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5181 if (size > align && size > 4 && align < 8)
5182 align = 8;
5183
5184 machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
5185 & ~(align - 1));
5186
5187 machfun->lds_allocs->put (var, machfun->lds_allocated);
5188 fprintf (f, "%u", machfun->lds_allocated);
5189 machfun->lds_allocated += size;
5190 if (machfun->lds_allocated > LDS_SIZE)
5191 error ("local data-share memory exhausted");
5192 }
5193}
5194
5195/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5196
5197void
5198gcn_asm_output_symbol_ref (FILE *file, rtx x)
5199{
5200 tree decl;
5201 if ((decl = SYMBOL_REF_DECL (x)) != 0
5202 && TREE_CODE (decl) == VAR_DECL
5203 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5204 {
5205 /* LDS symbols (emitted using this hook) are only used at present
5206 to propagate worker values from an active thread to neutered
5207 threads. Use the same offset for each such block, but don't
5208 use zero because null pointers are used to identify the active
5209 thread in GOACC_single_copy_start calls. */
5210 gcn_print_lds_decl (file, decl);
5211 }
5212 else
5213 {
5214 assemble_name (file, XSTR (x, 0));
5215 /* FIXME: See above -- this condition is unreachable. */
5216 if ((decl = SYMBOL_REF_DECL (x)) != 0
5217 && TREE_CODE (decl) == VAR_DECL
5218 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5219 fputs ("@abs32", file);
5220 }
5221}
5222
5223/* Implement TARGET_CONSTANT_ALIGNMENT.
5224
5225 Returns the alignment in bits of a constant that is being placed in memory.
5226 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5227 would ordinarily have. */
5228
5229static HOST_WIDE_INT
5230gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5231 HOST_WIDE_INT basic_align)
5232{
5233 return basic_align > 128 ? basic_align : 128;
5234}
5235
5236/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5237
5238void
5239print_operand_address (FILE *file, rtx mem)
5240{
5241 gcc_assert (MEM_P (mem));
5242
5243 rtx reg;
5244 rtx offset;
5245 addr_space_t as = MEM_ADDR_SPACE (mem);
5246 rtx addr = XEXP (mem, 0);
5247 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5248
5249 if (AS_SCRATCH_P (as))
5250 switch (GET_CODE (addr))
5251 {
5252 case REG:
5253 print_reg (file, addr);
5254 break;
5255
5256 case PLUS:
5257 reg = XEXP (addr, 0);
5258 offset = XEXP (addr, 1);
5259 print_reg (file, reg);
5260 if (GET_CODE (offset) == CONST_INT)
5261 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5262 else
5263 abort ();
5264 break;
5265
5266 default:
5267 debug_rtx (addr);
5268 abort ();
5269 }
5270 else if (AS_ANY_FLAT_P (as))
5271 {
5272 if (GET_CODE (addr) == REG)
5273 print_reg (file, addr);
5274 else
5275 {
5276 gcc_assert (TARGET_GCN5_PLUS);
5277 print_reg (file, XEXP (addr, 0));
5278 }
5279 }
5280 else if (AS_GLOBAL_P (as))
5281 {
5282 gcc_assert (TARGET_GCN5_PLUS);
5283
5284 rtx base = addr;
5285 rtx vgpr_offset = NULL_RTX;
5286
5287 if (GET_CODE (addr) == PLUS)
5288 {
5289 base = XEXP (addr, 0);
5290
5291 if (GET_CODE (base) == PLUS)
5292 {
5293 /* (SGPR + VGPR) + CONST */
5294 vgpr_offset = XEXP (base, 1);
5295 base = XEXP (base, 0);
5296 }
5297 else
5298 {
5299 rtx offset = XEXP (addr, 1);
5300
5301 if (REG_P (offset))
5302 /* SGPR + VGPR */
5303 vgpr_offset = offset;
5304 else if (CONST_INT_P (offset))
5305 /* VGPR + CONST or SGPR + CONST */
5306 ;
5307 else
5308 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5309 }
5310 }
5311
5312 if (REG_P (base))
5313 {
5314 if (VGPR_REGNO_P (REGNO (base)))
5315 print_reg (file, base);
5316 else if (SGPR_REGNO_P (REGNO (base)))
5317 {
5318 /* The assembler requires a 64-bit VGPR pair here, even though
5319 the offset should be only 32-bit. */
5320 if (vgpr_offset == NULL_RTX)
5321 /* In this case, the vector offset is zero, so we use the first
5322 lane of v1, which is initialized to zero. */
5323 fprintf (file, "v[1:2]");
5324 else if (REG_P (vgpr_offset)
5325 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5326 {
5327 fprintf (file, "v[%d:%d]",
5328 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5329 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5330 }
5331 else
5332 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5333 }
5334 }
5335 else
5336 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5337 }
5338 else if (AS_ANY_DS_P (as))
5339 switch (GET_CODE (addr))
5340 {
5341 case REG:
5342 print_reg (file, addr);
5343 break;
5344
5345 case PLUS:
5346 reg = XEXP (addr, 0);
5347 print_reg (file, reg);
5348 break;
5349
5350 default:
5351 debug_rtx (addr);
5352 abort ();
5353 }
5354 else
5355 switch (GET_CODE (addr))
5356 {
5357 case REG:
5358 print_reg (file, addr);
5359 fprintf (file, ", 0");
5360 break;
5361
5362 case PLUS:
5363 reg = XEXP (addr, 0);
5364 offset = XEXP (addr, 1);
5365 print_reg (file, reg);
5366 fprintf (file, ", ");
5367 if (GET_CODE (offset) == REG)
5368 print_reg (file, reg);
5369 else if (GET_CODE (offset) == CONST_INT)
5370 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5371 else
5372 abort ();
5373 break;
5374
5375 default:
5376 debug_rtx (addr);
5377 abort ();
5378 }
5379}
5380
5381/* Implement PRINT_OPERAND via gcn.h.
5382
5383 b - print operand size as untyped operand (b8/b16/b32/b64)
5384 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5385 i - print operand size as untyped operand (i16/i32/i64)
5386 u - print operand size as untyped operand (u16/u32/u64)
5387 o - print operand size as memory access size for loads
5388 (ubyte/ushort/dword/dwordx2/wordx3/dwordx4)
5389 s - print operand size as memory access size for stores
5390 (byte/short/dword/dwordx2/wordx3/dwordx4)
5391 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5392 c - print inverse conditional code for s_cbranch
5393 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5394 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5395 A - print address in formatting suitable for given address space.
5396 O - print offset:n for data share operations.
5397 ^ - print "_co" suffix for GCN5 mnemonics
5398 g - print "glc", if appropriate for given MEM
5399 */
5400
5401void
5402print_operand (FILE *file, rtx x, int code)
5403{
5404 int xcode = x ? GET_CODE (x) : 0;
5405 bool invert = false;
5406 switch (code)
5407 {
5408 /* Instructions have the following suffixes.
5409 If there are two suffixes, the first is the destination type,
5410 and the second is the source type.
5411
5412 B32 Bitfield (untyped data) 32-bit
5413 B64 Bitfield (untyped data) 64-bit
5414 F16 floating-point 16-bit
5415 F32 floating-point 32-bit (IEEE 754 single-precision float)
5416 F64 floating-point 64-bit (IEEE 754 double-precision float)
5417 I16 signed 16-bit integer
5418 I32 signed 32-bit integer
5419 I64 signed 64-bit integer
5420 U16 unsigned 16-bit integer
5421 U32 unsigned 32-bit integer
5422 U64 unsigned 64-bit integer */
5423
5424 /* Print operand size as untyped suffix. */
5425 case 'b':
5426 {
5427 const char *s = "";
5428 machine_mode mode = GET_MODE (x);
5429 if (VECTOR_MODE_P (mode))
5430 mode = GET_MODE_INNER (mode);
5431 switch (GET_MODE_SIZE (mode))
5432 {
5433 case 1:
5434 s = "_b8";
5435 break;
5436 case 2:
5437 s = "_b16";
5438 break;
5439 case 4:
5440 s = "_b32";
5441 break;
5442 case 8:
5443 s = "_b64";
5444 break;
5445 default:
5446 output_operand_lossage ("invalid operand %%xn code");
5447 return;
5448 }
5449 fputs (s, file);
5450 }
5451 return;
5452 case 'B':
5453 {
5454 const char *s = "";
5455 machine_mode mode = GET_MODE (x);
5456 if (VECTOR_MODE_P (mode))
5457 mode = GET_MODE_INNER (mode);
5458 switch (GET_MODE_SIZE (mode))
5459 {
5460 case 1:
5461 case 2:
5462 case 4:
5463 s = "_b32";
5464 break;
5465 case 8:
5466 s = "_b64";
5467 break;
5468 default:
5469 output_operand_lossage ("invalid operand %%xn code");
5470 return;
5471 }
5472 fputs (s, file);
5473 }
5474 return;
5475 case 'e':
5476 fputs ("sext(", file);
5477 print_operand (file, x, 0);
5478 fputs (")", file);
5479 return;
5480 case 'i':
5481 case 'u':
5482 {
5483 bool signed_p = code == 'i';
5484 const char *s = "";
5485 machine_mode mode = GET_MODE (x);
5486 if (VECTOR_MODE_P (mode))
5487 mode = GET_MODE_INNER (mode);
5488 if (mode == VOIDmode)
5489 switch (GET_CODE (x))
5490 {
5491 case CONST_INT:
5492 s = signed_p ? "_i32" : "_u32";
5493 break;
5494 case CONST_DOUBLE:
5495 s = "_f64";
5496 break;
5497 default:
5498 output_operand_lossage ("invalid operand %%xn code");
5499 return;
5500 }
5501 else if (FLOAT_MODE_P (mode))
5502 switch (GET_MODE_SIZE (mode))
5503 {
5504 case 2:
5505 s = "_f16";
5506 break;
5507 case 4:
5508 s = "_f32";
5509 break;
5510 case 8:
5511 s = "_f64";
5512 break;
5513 default:
5514 output_operand_lossage ("invalid operand %%xn code");
5515 return;
5516 }
5517 else
5518 switch (GET_MODE_SIZE (mode))
5519 {
5520 case 1:
5521 s = signed_p ? "_i8" : "_u8";
5522 break;
5523 case 2:
5524 s = signed_p ? "_i16" : "_u16";
5525 break;
5526 case 4:
5527 s = signed_p ? "_i32" : "_u32";
5528 break;
5529 case 8:
5530 s = signed_p ? "_i64" : "_u64";
5531 break;
5532 default:
5533 output_operand_lossage ("invalid operand %%xn code");
5534 return;
5535 }
5536 fputs (s, file);
5537 }
5538 return;
5539 /* Print operand size as untyped suffix. */
5540 case 'o':
5541 {
5542 const char *s = 0;
5543 switch (GET_MODE_SIZE (GET_MODE (x)))
5544 {
5545 case 1:
5546 s = "_ubyte";
5547 break;
5548 case 2:
5549 s = "_ushort";
5550 break;
5551 /* The following are full-vector variants. */
5552 case 64:
5553 s = "_ubyte";
5554 break;
5555 case 128:
5556 s = "_ushort";
5557 break;
5558 }
5559
5560 if (s)
5561 {
5562 fputs (s, file);
5563 return;
5564 }
5565
5566 /* Fall-through - the other cases for 'o' are the same as for 's'. */
5567 gcc_fallthrough();
5568 }
5569 case 's':
5570 {
5571 const char *s = "";
5572 switch (GET_MODE_SIZE (GET_MODE (x)))
5573 {
5574 case 1:
5575 s = "_byte";
5576 break;
5577 case 2:
5578 s = "_short";
5579 break;
5580 case 4:
5581 s = "_dword";
5582 break;
5583 case 8:
5584 s = "_dwordx2";
5585 break;
5586 case 12:
5587 s = "_dwordx3";
5588 break;
5589 case 16:
5590 s = "_dwordx4";
5591 break;
5592 case 32:
5593 s = "_dwordx8";
5594 break;
5595 case 64:
5596 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
5597 break;
5598 /* The following are full-vector variants. */
5599 case 128:
5600 s = "_short";
5601 break;
5602 case 256:
5603 s = "_dword";
5604 break;
5605 case 512:
5606 s = "_dwordx2";
5607 break;
5608 default:
5609 output_operand_lossage ("invalid operand %%xn code");
5610 return;
5611 }
5612 fputs (s, file);
5613 }
5614 return;
5615 case 'A':
5616 if (xcode != MEM)
5617 {
5618 output_operand_lossage ("invalid %%xn code");
5619 return;
5620 }
5621 print_operand_address (file, x);
5622 return;
5623 case 'O':
5624 {
5625 if (xcode != MEM)
5626 {
5627 output_operand_lossage ("invalid %%xn code");
5628 return;
5629 }
5630 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
5631 fprintf (file, " gds");
5632
5633 rtx x0 = XEXP (x, 0);
5634 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
5635 {
5636 gcc_assert (TARGET_GCN5_PLUS);
5637
5638 fprintf (file, ", ");
5639
5640 rtx base = x0;
5641 rtx const_offset = NULL_RTX;
5642
5643 if (GET_CODE (base) == PLUS)
5644 {
5645 rtx offset = XEXP (x0, 1);
5646 base = XEXP (x0, 0);
5647
5648 if (GET_CODE (base) == PLUS)
5649 /* (SGPR + VGPR) + CONST */
5650 /* Ignore the VGPR offset for this operand. */
5651 base = XEXP (base, 0);
5652
5653 if (CONST_INT_P (offset))
5654 const_offset = XEXP (x0, 1);
5655 else if (REG_P (offset))
5656 /* SGPR + VGPR */
5657 /* Ignore the VGPR offset for this operand. */
5658 ;
5659 else
5660 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5661 }
5662
5663 if (REG_P (base))
5664 {
5665 if (VGPR_REGNO_P (REGNO (base)))
5666 /* The VGPR address is specified in the %A operand. */
5667 fprintf (file, "off");
5668 else if (SGPR_REGNO_P (REGNO (base)))
5669 print_reg (file, base);
5670 else
5671 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5672 }
5673 else
5674 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5675
5676 if (const_offset != NULL_RTX)
5677 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
5678 INTVAL (const_offset));
5679
5680 return;
5681 }
5682
5683 if (GET_CODE (x0) == REG)
5684 return;
5685 if (GET_CODE (x0) != PLUS)
5686 {
5687 output_operand_lossage ("invalid %%xn code");
5688 return;
5689 }
5690 rtx val = XEXP (x0, 1);
5691 if (GET_CODE (val) == CONST_VECTOR)
5692 val = CONST_VECTOR_ELT (val, 0);
5693 if (GET_CODE (val) != CONST_INT)
5694 {
5695 output_operand_lossage ("invalid %%xn code");
5696 return;
5697 }
5698 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
5699
5700 }
5701 return;
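/* In summary, %O appends the memory modifiers: " gds" for the GDS address
   space, the SGPR base (or "off") plus any " offset:N" for
   ADDR_SPACE_GLOBAL, and a bare " offset:N" for the remaining spaces.  */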
5702 case 'c':
5703 invert = true;
5704 /* Fall through. */
5705 case 'C':
5706 {
5707 const char *s;
5708 bool num = false;
5709 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
5710 {
5711 output_operand_lossage ("invalid %%xn code");
5712 return;
5713 }
5714 switch (REGNO (XEXP (x, 0)))
5715 {
5716 case VCC_REG:
5717 case VCCZ_REG:
5718 s = "_vcc";
5719 break;
5720 case SCC_REG:
5721 /* For some reason llvm-mc insists on scc0 instead of sccz. */
5722 num = true;
5723 s = "_scc";
5724 break;
5725 case EXECZ_REG:
5726 s = "_exec";
5727 break;
5728 default:
5729 output_operand_lossage ("invalid %%xn code");
5730 return;
5731 }
5732 fputs (s, file);
5733 if (xcode == (invert ? NE : EQ))
5734 fputc (num ? '0' : 'z', file);
5735 else
5736 fputs (num ? "1" : "nz", file);
5737 return;
5738 }
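/* Example: with %C, an EQ comparison against the VCC register prints
   "_vccz" and the NE form prints "_vccnz", giving branch mnemonics such
   as s_cbranch_vccz; %c emits the opposite sense.  */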
5739 case 'D':
5740 {
5741 const char *s;
5742 bool cmp_signed = false;
5743 switch (xcode)
5744 {
5745 case EQ:
5746 s = "_eq_";
5747 break;
5748 case NE:
5749 s = "_lg_";
5750 break;
5751 case LT:
5752 s = "_lt_";
5753 cmp_signed = true;
5754 break;
5755 case LE:
5756 s = "_le_";
5757 cmp_signed = true;
5758 break;
5759 case GT:
5760 s = "_gt_";
5761 cmp_signed = true;
5762 break;
5763 case GE:
5764 s = "_ge_";
5765 cmp_signed = true;
5766 break;
5767 case LTU:
5768 s = "_lt_";
5769 break;
5770 case LEU:
5771 s = "_le_";
5772 break;
5773 case GTU:
5774 s = "_gt_";
5775 break;
5776 case GEU:
5777 s = "_ge_";
5778 break;
5779 default:
5780 output_operand_lossage ("invalid %%xn code");
5781 return;
5782 }
5783 fputs (s, file);
5784 fputc (cmp_signed ? 'i' : 'u', file);
5785
5786 machine_mode mode = GET_MODE (XEXP (x, 0));
5787
5788 if (mode == VOIDmode)
5789 mode = GET_MODE (XEXP (x, 1));
5790
5791 /* If both sides are constants, then assume the instruction is in
5792 SImode since s_cmp can only do integer compares. */
5793 if (mode == VOIDmode)
5794 mode = SImode;
5795
5796 switch (GET_MODE_SIZE (mode))
5797 {
5798 case 4:
5799 s = "32";
5800 break;
5801 case 8:
5802 s = "64";
5803 break;
5804 default:
5805 output_operand_lossage ("invalid operand %%xn code");
5806 return;
5807 }
5808 fputs (s, file);
5809 return;
5810 }
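/* Example: %D on a signed less-than of SImode operands prints "_lt_i32",
   and on an equality of DImode operands "_eq_u64", completing scalar
   compare mnemonics such as s_cmp_lt_i32.  */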
5811 case 'E':
5812 {
5813 const char *s;
5814 bool cmp_signed = false;
5815 machine_mode mode = GET_MODE (XEXP (x, 0));
5816
5817 if (mode == VOIDmode)
5818 mode = GET_MODE (XEXP (x, 1));
5819
5820 /* If both sides are constants, assume the instruction is in SFmode
5821 if either operand is floating point, otherwise assume SImode. */
5822 if (mode == VOIDmode)
5823 {
5824 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
5825 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
5826 mode = SFmode;
5827 else
5828 mode = SImode;
5829 }
5830
5831 /* Use the same format code for vector comparisons. */
5832 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
5833 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
5834 mode = GET_MODE_INNER (mode);
5835
5836 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
5837
5838 switch (xcode)
5839 {
5840 case EQ:
5841 s = "_eq_";
5842 break;
5843 case NE:
5844 s = float_p ? "_neq_" : "_ne_";
5845 break;
5846 case LT:
5847 s = "_lt_";
5848 cmp_signed = true;
5849 break;
5850 case LE:
5851 s = "_le_";
5852 cmp_signed = true;
5853 break;
5854 case GT:
5855 s = "_gt_";
5856 cmp_signed = true;
5857 break;
5858 case GE:
5859 s = "_ge_";
5860 cmp_signed = true;
5861 break;
5862 case LTU:
5863 s = "_lt_";
5864 break;
5865 case LEU:
5866 s = "_le_";
5867 break;
5868 case GTU:
5869 s = "_gt_";
5870 break;
5871 case GEU:
5872 s = "_ge_";
5873 break;
5874 case ORDERED:
5875 s = "_o_";
5876 break;
5877 case UNORDERED:
5878 s = "_u_";
5879 break;
5880 default:
5881 output_operand_lossage ("invalid %%xn code");
5882 return;
5883 }
5884 fputs (s, file);
5885 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
5886
5887 switch (GET_MODE_SIZE (mode))
5888 {
5889 case 1:
5890 s = "32";
5891 break;
5892 case 2:
5893 s = float_p ? "16" : "32";
5894 break;
5895 case 4:
5896 s = "32";
5897 break;
5898 case 8:
5899 s = "64";
5900 break;
5901 default:
5902 output_operand_lossage ("invalid operand %%xn code");
5903 return;
5904 }
5905 fputs (s, file);
5906 return;
5907 }
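/* Example: %E on a less-than of SFmode operands prints "_lt_f32", and on
   a signed less-than of V64SImode operands "_lt_i32", completing vector
   compare mnemonics such as v_cmp_lt_f32.  */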
5908 case 'L':
5909 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
5910 return;
5911 case 'H':
5912 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
5913 return;
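/* %L and %H print the low and high parts of a multi-register operand,
   e.g. the two SImode halves of a DImode register pair.  */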
5914 case 'R':
5915 /* Print a scalar register number as an integer. Temporary hack. */
5916 gcc_assert (REG_P (x));
5917 fprintf (file, "%u", (int) REGNO (x));
5918 return;
5919 case 'V':
5920 /* Print a vector register number as an integer. Temporary hack. */
5921 gcc_assert (REG_P (x));
5922 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
5923 return;
5924 case 0:
5925 if (xcode == REG)
5926 print_reg (file, x);
5927 else if (xcode == MEM)
5928 output_address (GET_MODE (x), x);
5929 else if (xcode == CONST_INT)
5930 fprintf (file, "%i", (int) INTVAL (x));
5931 else if (xcode == CONST_VECTOR)
5932 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
5933 else if (xcode == CONST_DOUBLE)
5934 {
5935 const char *str;
5936 switch (gcn_inline_fp_constant_p (x, false))
5937 {
5938 case 240:
5939 str = "0.5";
5940 break;
5941 case 241:
5942 str = "-0.5";
5943 break;
5944 case 242:
5945 str = "1.0";
5946 break;
5947 case 243:
5948 str = "-1.0";
5949 break;
5950 case 244:
5951 str = "2.0";
5952 break;
5953 case 245:
5954 str = "-2.0";
5955 break;
5956 case 246:
5957 str = "4.0";
5958 break;
5959 case 247:
5960 str = "-4.0";
5961 break;
5962 case 248:
5963 str = "1/pi";
5964 break;
5965 default:
5966 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
5967 ? DImode : SImode,
5968 x, GET_MODE (x), 0);
5969 if (ix)
5970 print_operand (file, ix, code);
5971 else
5972 output_operand_lossage ("invalid fp constant");
5973 return;
5974 break;
5975 }
5976 fputs (str, file);
5977 return;
5978 }
5979 else
5980 output_addr_const (file, x);
5981 return;
5982 case '^':
5983 if (TARGET_GCN5_PLUS)
5984 fputs ("_co", file);
5985 return;
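/* The "_co" suffix reflects the GCN5 renaming of the carry-out forms: a
   pattern written like "v_add%^_u32" (illustrative) emits v_add_u32 on
   GCN3 but v_add_co_u32 on GCN5.  */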
5986 case 'g':
5987 gcc_assert (xcode == MEM);
5988 if (MEM_VOLATILE_P (x))
5989 fputs (" glc", file);
5990 return;
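/* The glc ("globally coherent") flag asks the hardware to bypass the
   per-CU L1 cache, which is how volatile accesses are honoured here.  */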
5991 default:
5992 output_operand_lossage ("invalid %%xn code");
5993 }
5994 gcc_unreachable ();
5995}
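/* Taken together, these codes let machine-description templates compose
   complete mnemonics.  A hypothetical pattern such as
     "flat_load%o0\t%0, %A1%O1%g1"
   could emit, e.g., "flat_load_ubyte v0, v[2:3] glc" for a volatile
   QImode load (register numbers purely illustrative).  */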
5996
5997/* }}} */
5998/* {{{ TARGET hook overrides. */
5999
6000#undef TARGET_ADDR_SPACE_ADDRESS_MODE
6001#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6002#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6003#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6004 gcn_addr_space_legitimate_address_p
6005#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6006#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6007#undef TARGET_ADDR_SPACE_POINTER_MODE
6008#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6009#undef TARGET_ADDR_SPACE_SUBSET_P
6010#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6011#undef TARGET_ADDR_SPACE_CONVERT
6012#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6013#undef TARGET_ARG_PARTIAL_BYTES
6014#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6015#undef TARGET_ASM_ALIGNED_DI_OP
6016#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6017#undef TARGET_ASM_FILE_START
6018#define TARGET_ASM_FILE_START output_file_start
6019#undef TARGET_ASM_FUNCTION_PROLOGUE
6020#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6021#undef TARGET_ASM_SELECT_SECTION
6022#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6023#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6024#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6025#undef TARGET_ATTRIBUTE_TABLE
6026#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6027#undef TARGET_BUILTIN_DECL
6028#define TARGET_BUILTIN_DECL gcn_builtin_decl
6029#undef TARGET_CAN_CHANGE_MODE_CLASS
6030#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6031#undef TARGET_CAN_ELIMINATE
6032#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6033#undef TARGET_CANNOT_COPY_INSN_P
6034#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6035#undef TARGET_CLASS_LIKELY_SPILLED_P
6036#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6037#undef TARGET_CLASS_MAX_NREGS
6038#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6039#undef TARGET_CONDITIONAL_REGISTER_USAGE
6040#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6041#undef TARGET_CONSTANT_ALIGNMENT
6042#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6043#undef TARGET_DEBUG_UNWIND_INFO
6044#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6045#undef TARGET_EMUTLS_VAR_INIT
6046#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6047#undef TARGET_EXPAND_BUILTIN
6048#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6049#undef TARGET_FUNCTION_ARG
6050#undef TARGET_FUNCTION_ARG_ADVANCE
6051#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6052#define TARGET_FUNCTION_ARG gcn_function_arg
6053#undef TARGET_FUNCTION_VALUE
6054#define TARGET_FUNCTION_VALUE gcn_function_value
6055#undef TARGET_FUNCTION_VALUE_REGNO_P
6056#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6057#undef TARGET_GIMPLIFY_VA_ARG_EXPR
6058#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6059#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6060#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6061#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
6062#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
6063 gcn_goacc_adjust_propagation_record
6064#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
6065#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
6066#undef TARGET_GOACC_FORK_JOIN
6067#define TARGET_GOACC_FORK_JOIN gcn_fork_join
6068#undef TARGET_GOACC_REDUCTION
6069#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6070#undef TARGET_GOACC_VALIDATE_DIMS
6071#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6072#undef TARGET_HARD_REGNO_MODE_OK
6073#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6074#undef TARGET_HARD_REGNO_NREGS
6075#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6076#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6077#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6078#undef TARGET_INIT_BUILTINS
6079#define TARGET_INIT_BUILTINS gcn_init_builtins
6080#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6081#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6082 gcn_ira_change_pseudo_allocno_class
6083#undef TARGET_LEGITIMATE_CONSTANT_P
6084#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6085#undef TARGET_LRA_P
6086#define TARGET_LRA_P hook_bool_void_true
6087#undef TARGET_MACHINE_DEPENDENT_REORG
6088#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6089#undef TARGET_MEMORY_MOVE_COST
6090#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6091#undef TARGET_MODES_TIEABLE_P
6092#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6093#undef TARGET_OPTION_OVERRIDE
6094#define TARGET_OPTION_OVERRIDE gcn_option_override
6095#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6096#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6097 gcn_pretend_outgoing_varargs_named
6098#undef TARGET_PROMOTE_FUNCTION_MODE
6099#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6100#undef TARGET_REGISTER_MOVE_COST
6101#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6102#undef TARGET_RETURN_IN_MEMORY
6103#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6104#undef TARGET_RTX_COSTS
6105#define TARGET_RTX_COSTS gcn_rtx_costs
6106#undef TARGET_SECONDARY_RELOAD
6107#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6108#undef TARGET_SECTION_TYPE_FLAGS
6109#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6110#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6111#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6112 gcn_small_register_classes_for_mode_p
6113#undef TARGET_SPILL_CLASS
6114#define TARGET_SPILL_CLASS gcn_spill_class
6115#undef TARGET_STRICT_ARGUMENT_NAMING
6116#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6117#undef TARGET_TRAMPOLINE_INIT
6118#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6119#undef TARGET_TRULY_NOOP_TRUNCATION
6120#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6121#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6122#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6123#undef TARGET_VECTORIZE_GET_MASK_MODE
6124#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6125#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6126#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6127#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6128#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6129 gcn_preferred_vector_alignment
6130#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6131#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6132 gcn_vectorize_support_vector_misalignment
6133#undef TARGET_VECTORIZE_VEC_PERM_CONST
6134#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6135#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6136#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6137 gcn_vector_alignment_reachable
6138#undef TARGET_VECTOR_MODE_SUPPORTED_P
6139#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6140
6141struct gcc_target targetm = TARGET_INITIALIZER;
6142
6143#include "gt-gcn.h"
6144/* }}} */