1/* Copyright (C) 2016-2019 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with GCC; see the file COPYING3. If not see
15 <http://www.gnu.org/licenses/>. */
16
17/* {{{ Includes. */
18
19/* We want GET_MODE_SIZE et al to return integers, please. */
20#define IN_TARGET_CODE 1
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "memmodel.h"
28#include "rtl.h"
29#include "tree.h"
30#include "df.h"
31#include "tm_p.h"
32#include "stringpool.h"
33#include "optabs.h"
34#include "regs.h"
35#include "emit-rtl.h"
36#include "recog.h"
37#include "diagnostic-core.h"
38#include "insn-attr.h"
39#include "fold-const.h"
40#include "calls.h"
41#include "explow.h"
42#include "expr.h"
43#include "output.h"
44#include "cfgrtl.h"
45#include "langhooks.h"
46#include "builtins.h"
47#include "omp-general.h"
48#include "print-rtl.h"
49#include "attribs.h"
50#include "varasm.h"
51#include "intl.h"
52#include "rtl-iter.h"
53
54/* This file should be included last. */
55#include "target-def.h"
56
57/* }}} */
58/* {{{ Global variables. */
59
60/* Constants used by FP instructions. */
61
62static REAL_VALUE_TYPE dconst4, dconst1over2pi;
63static bool ext_gcn_constants_init = 0;
64
65/* Holds the ISA variant, derived from the command line parameters. */
66
67int gcn_isa = 3; /* Default to GCN3. */
68
69/* Reserve this much space for LDS (for propagating variables from
70 worker-single mode to worker-partitioned mode), per workgroup. Global
71 analysis could calculate an exact bound, but we don't do that yet.
72
73 We reserve the whole LDS, which also prevents any other workgroup
74 sharing the Compute Unit. */
75
76#define LDS_SIZE 65536
77
78/* }}} */
79/* {{{ Initialization and options. */
80
81/* Initialize machine_function. */
82
83static struct machine_function *
84gcn_init_machine_status (void)
85{
86 struct machine_function *f;
87
88 f = ggc_cleared_alloc<machine_function> ();
89
90 /* Set up LDS allocation for broadcasting for this function. */
91 f->lds_allocated = 32;
92 f->lds_allocs = hash_map<tree, int>::create_ggc (64);
93
94 /* And LDS temporary decls for worker reductions. */
95 vec_alloc (f->reduc_decls, 0);
96
97 if (TARGET_GCN3)
98 f->use_flat_addressing = true;
99
100 return f;
101}
102
103/* Implement TARGET_OPTION_OVERRIDE.
104
105 Override option settings where defaults are variable, or we have specific
106 needs to consider. */
107
108static void
109gcn_option_override (void)
110{
111 init_machine_status = gcn_init_machine_status;
112
113 /* The HSA runtime does not respect ELF load addresses, so force PIE. */
114 if (!flag_pie)
115 flag_pie = 2;
116 if (!flag_pic)
117 flag_pic = flag_pie;
118
119 gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
120
121 /* The default stack size needs to be small for offload kernels because
122 there may be many, many threads. Also, a smaller stack gives a
 123 measurable performance boost. But, a small stack is insufficient
 124 for running the testsuite, so we use a larger default for the
 125 stand-alone case. */
126 if (stack_size_opt == -1)
127 {
128 if (flag_openacc || flag_openmp)
129 /* 512 bytes per work item = 32kB total. */
130 stack_size_opt = 512 * 64;
131 else
132 /* 1MB total. */
133 stack_size_opt = 1048576;
134 }
135}
136
137/* }}} */
138/* {{{ Attributes. */
139
140/* This table defines the arguments that are permitted in
141 __attribute__ ((amdgpu_hsa_kernel (...))).
142
143 The names and values correspond to the HSA metadata that is encoded
144 into the assembler file and binary. */
145
146static const struct gcn_kernel_arg_type
147{
148 const char *name;
149 const char *header_pseudo;
150 machine_mode mode;
151
152 /* This should be set to -1 or -2 for a dynamically allocated register
153 number. Use -1 if this argument contributes to the user_sgpr_count,
154 -2 otherwise. */
155 int fixed_regno;
156} gcn_kernel_arg_types[] = {
157 {"exec", NULL, DImode, EXEC_REG},
158#define PRIVATE_SEGMENT_BUFFER_ARG 1
159 {"private_segment_buffer",
160 "enable_sgpr_private_segment_buffer", TImode, -1},
161#define DISPATCH_PTR_ARG 2
162 {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
163#define QUEUE_PTR_ARG 3
164 {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
165#define KERNARG_SEGMENT_PTR_ARG 4
166 {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
167 {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
168#define FLAT_SCRATCH_INIT_ARG 6
169 {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
170#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
171 {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
172 {"grid_workgroup_count_X",
173 "enable_sgpr_grid_workgroup_count_x", SImode, -1},
174 {"grid_workgroup_count_Y",
175 "enable_sgpr_grid_workgroup_count_y", SImode, -1},
176 {"grid_workgroup_count_Z",
177 "enable_sgpr_grid_workgroup_count_z", SImode, -1},
178#define WORKGROUP_ID_X_ARG 11
179 {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
180 {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
181 {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
182 {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
183#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
184 {"private_segment_wave_offset",
185 "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
186#define WORK_ITEM_ID_X_ARG 16
187 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
188#define WORK_ITEM_ID_Y_ARG 17
189 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
190#define WORK_ITEM_ID_Z_ARG 18
191 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
192};
193
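/* For illustration, a (hypothetical) kernel declaration that requests some
   of these registers explicitly; the argument strings must match the "name"
   fields in the table above, and anything else is rejected by
   gcn_parse_amdgpu_hsa_kernel_attribute below:

     extern void example_kernel (void *args)
       __attribute__ ((amdgpu_hsa_kernel ("private_segment_buffer",
					   "dispatch_ptr",
					   "kernarg_segment_ptr")));  */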
194/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
195 This function also sets the default values for some arguments.
196
 197 Return true on error; on success, return false with ARGS populated. */
198
199static bool
200gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
201 tree list)
202{
203 bool err = false;
204 args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
205 | (1 << QUEUE_PTR_ARG)
206 | (1 << KERNARG_SEGMENT_PTR_ARG)
207 | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
208 args->nargs = 0;
209
210 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
211 args->reg[a] = -1;
212
213 for (; list; list = TREE_CHAIN (list))
214 {
215 const char *str;
216 if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
217 {
218 error ("amdgpu_hsa_kernel attribute requires string constant "
219 "arguments");
220 break;
221 }
222 str = TREE_STRING_POINTER (TREE_VALUE (list));
223 int a;
224 for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
225 {
226 if (!strcmp (str, gcn_kernel_arg_types[a].name))
227 break;
228 }
229 if (a == GCN_KERNEL_ARG_TYPES)
230 {
231 error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
232 err = true;
233 break;
234 }
235 if (args->requested & (1 << a))
236 {
237 error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
238 "attribute", str);
239 err = true;
240 break;
241 }
242 args->requested |= (1 << a);
243 args->order[args->nargs++] = a;
244 }
245 args->requested |= (1 << WORKGROUP_ID_X_ARG);
246 args->requested |= (1 << WORK_ITEM_ID_Z_ARG);
247
248 /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
249 WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
250 requesting WORK_ITEM_ID_X_ARG. */
251 if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
252 args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
253 if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
254 args->requested |= (1 << WORK_ITEM_ID_X_ARG);
255
256 /* Always enable this so that kernargs is in a predictable place for
257 gomp_print, etc. */
258 args->requested |= (1 << DISPATCH_PTR_ARG);
259
260 int sgpr_regno = FIRST_SGPR_REG;
261 args->nsgprs = 0;
262 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
263 {
264 if (!(args->requested & (1 << a)))
265 continue;
266
267 if (gcn_kernel_arg_types[a].fixed_regno >= 0)
268 args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
269 else
270 {
271 int reg_count;
272
273 switch (gcn_kernel_arg_types[a].mode)
274 {
275 case E_SImode:
276 reg_count = 1;
277 break;
278 case E_DImode:
279 reg_count = 2;
280 break;
281 case E_TImode:
282 reg_count = 4;
283 break;
284 default:
285 gcc_unreachable ();
286 }
287 args->reg[a] = sgpr_regno;
288 sgpr_regno += reg_count;
289 if (gcn_kernel_arg_types[a].fixed_regno == -1)
290 args->nsgprs += reg_count;
291 }
292 }
293 if (sgpr_regno > FIRST_SGPR_REG + 16)
294 {
295 error ("too many arguments passed in sgpr registers");
296 }
297 return err;
298}
299
300/* Referenced by TARGET_ATTRIBUTE_TABLE.
301
302 Validates target specific attributes. */
303
304static tree
305gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
306 tree args, int, bool *no_add_attrs)
307{
 308 if (!FUNC_OR_METHOD_TYPE_P (*node))
309 {
310 warning (OPT_Wattributes, "%qE attribute only applies to functions",
311 name);
312 *no_add_attrs = true;
313 return NULL_TREE;
314 }
315
 316 /* Check that the argument list is valid for the kernel attribute. */
317 if (is_attribute_p ("gcnhsa_kernel", name))
318 {
319 struct gcn_kernel_args kernelarg;
320
321 if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
322 *no_add_attrs = true;
323
324 return NULL_TREE;
325 }
326
327 return NULL_TREE;
328}
329
330/* Implement TARGET_ATTRIBUTE_TABLE.
331
332 Create target-specific __attribute__ types. */
333
334static const struct attribute_spec gcn_attribute_table[] = {
335 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
336 affects_type_identity } */
337 {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
338 true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
339 /* End element. */
340 {NULL, 0, 0, false, false, false, false, NULL, NULL}
341};
342
343/* }}} */
344/* {{{ Registers and modes. */
345
346/* Implement TARGET_CLASS_MAX_NREGS.
347
348 Return the number of hard registers needed to hold a value of MODE in
349 a register of class RCLASS. */
350
351static unsigned char
352gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
353{
 354 /* Scalar registers are 32-bit, vector registers are in fact tuples of
355 64 lanes. */
356 if (rclass == VGPR_REGS)
357 {
358 if (vgpr_1reg_mode_p (mode))
359 return 1;
360 if (vgpr_2reg_mode_p (mode))
361 return 2;
362 /* TImode is used by DImode compare_and_swap. */
363 if (mode == TImode)
364 return 4;
365 }
366 else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
367 return 2;
368 return CEIL (GET_MODE_SIZE (mode), 4);
369}
370
371/* Implement TARGET_HARD_REGNO_NREGS.
372
373 Return the number of hard registers needed to hold a value of MODE in
374 REGNO. */
375
376unsigned int
377gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
378{
379 return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
380}
381
382/* Implement TARGET_HARD_REGNO_MODE_OK.
383
384 Return true if REGNO can hold value in MODE. */
385
386bool
387gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
388{
389 /* Treat a complex mode as if it were a scalar mode of the same overall
390 size for the purposes of allocating hard registers. */
391 if (COMPLEX_MODE_P (mode))
392 switch (mode)
393 {
394 case E_CQImode:
395 case E_CHImode:
396 mode = SImode;
397 break;
398 case E_CSImode:
399 mode = DImode;
400 break;
401 case E_CDImode:
402 mode = TImode;
403 break;
404 case E_HCmode:
405 mode = SFmode;
406 break;
407 case E_SCmode:
408 mode = DFmode;
409 break;
410 default:
411 /* Not supported. */
412 return false;
413 }
414
415 switch (regno)
416 {
417 case FLAT_SCRATCH_LO_REG:
418 case XNACK_MASK_LO_REG:
419 case TBA_LO_REG:
420 case TMA_LO_REG:
421 return (mode == SImode || mode == DImode);
422 case VCC_LO_REG:
423 case EXEC_LO_REG:
424 return (mode == BImode || mode == SImode || mode == DImode);
425 case M0_REG:
426 case FLAT_SCRATCH_HI_REG:
427 case XNACK_MASK_HI_REG:
428 case TBA_HI_REG:
429 case TMA_HI_REG:
430 return mode == SImode;
431 case VCC_HI_REG:
432 return false;
433 case EXEC_HI_REG:
434 return mode == SImode /*|| mode == V32BImode */ ;
435 case SCC_REG:
436 case VCCZ_REG:
437 case EXECZ_REG:
438 return mode == BImode;
439 }
440 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
441 return true;
442 if (SGPR_REGNO_P (regno))
443 /* We restrict double register values to aligned registers. */
444 return (sgpr_1reg_mode_p (mode)
445 || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
446 || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
447 if (VGPR_REGNO_P (regno))
448 return (vgpr_1reg_mode_p (mode) || vgpr_2reg_mode_p (mode)
449 /* TImode is used by DImode compare_and_swap. */
450 || mode == TImode);
451 return false;
452}
453
454/* Implement REGNO_REG_CLASS via gcn.h.
455
456 Return smallest class containing REGNO. */
457
458enum reg_class
459gcn_regno_reg_class (int regno)
460{
461 switch (regno)
462 {
463 case SCC_REG:
464 return SCC_CONDITIONAL_REG;
465 case VCCZ_REG:
466 return VCCZ_CONDITIONAL_REG;
467 case EXECZ_REG:
468 return EXECZ_CONDITIONAL_REG;
469 case EXEC_LO_REG:
470 case EXEC_HI_REG:
471 return EXEC_MASK_REG;
472 }
473 if (VGPR_REGNO_P (regno))
474 return VGPR_REGS;
475 if (SGPR_REGNO_P (regno))
476 return SGPR_REGS;
477 if (regno < FIRST_VGPR_REG)
478 return GENERAL_REGS;
479 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
480 return AFP_REGS;
481 return ALL_REGS;
482}
483
484/* Implement TARGET_CAN_CHANGE_MODE_CLASS.
485
 486 GCC assumes that the lowpart contains the first part of the value as stored in memory.
487 This is not the case for vector registers. */
488
489bool
490gcn_can_change_mode_class (machine_mode from, machine_mode to,
491 reg_class_t regclass)
492{
493 if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
494 return true;
495 return (gcn_class_max_nregs (regclass, from)
496 == gcn_class_max_nregs (regclass, to));
497}
498
499/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
500
501 When this hook returns true for MODE, the compiler allows
502 registers explicitly used in the rtl to be used as spill registers
503 but prevents the compiler from extending the lifetime of these
504 registers. */
505
506bool
507gcn_small_register_classes_for_mode_p (machine_mode mode)
508{
 509 /* We allocate into exec and vcc regs, which form small register classes. */
510 return mode == DImode || mode == SImode;
511}
512
513/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
514
515 Returns true if pseudos that have been assigned to registers of class RCLASS
516 would likely be spilled because registers of RCLASS are needed for spill
517 registers. */
518
519static bool
520gcn_class_likely_spilled_p (reg_class_t rclass)
521{
522 return (rclass == EXEC_MASK_REG
523 || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
524}
525
526/* Implement TARGET_MODES_TIEABLE_P.
527
528 Returns true if a value of MODE1 is accessible in MODE2 without
529 copying. */
530
531bool
532gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
533{
534 return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
535 && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
536}
537
538/* Implement TARGET_TRULY_NOOP_TRUNCATION.
539
 540 Returns true if it is safe to "convert" a value of INPREC bits to one of
541 OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
542 it as if it had only OUTPREC bits. */
543
544bool
545gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
546{
547 return ((inprec <= 32) && (outprec <= inprec));
548}
549
550/* Return N-th part of value occupying multiple registers. */
551
552rtx
553gcn_operand_part (machine_mode mode, rtx op, int n)
554{
555 if (GET_MODE_SIZE (mode) >= 256)
556 {
557 /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
558
559 if (REG_P (op))
560 {
561 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
562 return gen_rtx_REG (V64SImode, REGNO (op) + n);
563 }
564 if (GET_CODE (op) == CONST_VECTOR)
565 {
566 int units = GET_MODE_NUNITS (mode);
567 rtvec v = rtvec_alloc (units);
568
569 for (int i = 0; i < units; ++i)
570 RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
571 CONST_VECTOR_ELT (op, i), n);
572
573 return gen_rtx_CONST_VECTOR (V64SImode, v);
574 }
575 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
576 return gcn_gen_undef (V64SImode);
577 gcc_unreachable ();
578 }
579 else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
580 {
581 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
582 return gen_rtx_REG (SImode, REGNO (op) + n);
583 }
584 else
585 {
586 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
587 return gcn_gen_undef (SImode);
588
589 /* If it's a constant then let's assume it is of the largest mode
590 available, otherwise simplify_gen_subreg will fail. */
591 if (mode == VOIDmode && CONST_INT_P (op))
592 mode = DImode;
593 return simplify_gen_subreg (SImode, op, mode, n * 4);
594 }
595}
596
 597/* Return the N-th 64-bit part of a value occupying multiple registers. */
598
599rtx
600gcn_operand_doublepart (machine_mode mode, rtx op, int n)
601{
602 return simplify_gen_subreg (DImode, op, mode, n * 8);
603}
604
605/* Return true if OP can be split into subregs or high/low parts.
606 This is always true for scalars, but not normally true for vectors.
607 However, for vectors in hardregs we can use the low and high registers. */
608
609bool
610gcn_can_split_p (machine_mode, rtx op)
611{
612 if (vgpr_vector_mode_p (GET_MODE (op)))
613 {
614 if (GET_CODE (op) == SUBREG)
615 op = SUBREG_REG (op);
616 if (!REG_P (op))
617 return true;
618 return REGNO (op) <= FIRST_PSEUDO_REGISTER;
619 }
620 return true;
621}
622
623/* Implement TARGET_SPILL_CLASS.
624
625 Return class of registers which could be used for pseudo of MODE
626 and of class RCLASS for spilling instead of memory. Return NO_REGS
627 if it is not possible or non-profitable. */
628
629static reg_class_t
630gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
631{
632 if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c))
633 return SGPR_REGS;
634 else
635 return NO_REGS;
636}
637
638/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
639
640 Change allocno class for given pseudo from allocno and best class
641 calculated by IRA. */
642
643static reg_class_t
644gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
645 reg_class_t best_cl)
646{
647 /* Avoid returning classes that contain both vgpr and sgpr registers. */
648 if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
649 return cl;
650 if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
651 && best_cl != ALL_GPR_REGS)
652 return best_cl;
653
654 machine_mode mode = PSEUDO_REGNO_MODE (regno);
655 if (vgpr_vector_mode_p (mode))
656 return VGPR_REGS;
657
658 return GENERAL_REGS;
659}
660
661/* Create a new DImode pseudo reg and emit an instruction to initialize
662 it to VAL. */
663
664static rtx
665get_exec (int64_t val)
666{
667 rtx reg = gen_reg_rtx (DImode);
668 emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
669 return reg;
670}
671
672/* Return value of scalar exec register. */
673
674rtx
675gcn_scalar_exec ()
676{
677 return const1_rtx;
678}
679
680/* Return pseudo holding scalar exec register. */
681
682rtx
683gcn_scalar_exec_reg ()
684{
685 return get_exec (1);
686}
687
688/* Return value of full exec register. */
689
690rtx
691gcn_full_exec ()
692{
693 return constm1_rtx;
694}
695
696/* Return pseudo holding full exec register. */
697
698rtx
699gcn_full_exec_reg ()
700{
701 return get_exec (-1);
702}
703
704/* }}} */
705/* {{{ Immediate constants. */
706
707/* Initialize shared numeric constants. */
708
709static void
710init_ext_gcn_constants (void)
711{
712 real_from_integer (&dconst4, DFmode, 4, SIGNED);
713
714 /* FIXME: this constant probably does not match what hardware really loads.
715 Reality check it eventually. */
716 real_from_string (&dconst1over2pi,
717 "0.1591549430918953357663423455968866839");
718 real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
719
720 ext_gcn_constants_init = 1;
721}
722
723/* Return non-zero if X is a constant that can appear as an inline operand.
 724 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi),
 725 or a vector of those.
726 The value returned should be the encoding of this constant. */
727
728int
729gcn_inline_fp_constant_p (rtx x, bool allow_vector)
730{
731 machine_mode mode = GET_MODE (x);
732
733 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
734 && allow_vector)
735 {
736 int n;
737 if (GET_CODE (x) != CONST_VECTOR)
738 return 0;
739 n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
740 if (!n)
741 return 0;
742 for (int i = 1; i < 64; i++)
743 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
744 return 0;
745 return 1;
746 }
747
748 if (mode != HFmode && mode != SFmode && mode != DFmode)
749 return 0;
750
751 const REAL_VALUE_TYPE *r;
752
753 if (x == CONST0_RTX (mode))
754 return 128;
755 if (x == CONST1_RTX (mode))
756 return 242;
757
758 r = CONST_DOUBLE_REAL_VALUE (x);
759
760 if (real_identical (r, &dconstm1))
761 return 243;
762
763 if (real_identical (r, &dconsthalf))
764 return 240;
765 if (real_identical (r, &dconstm1))
766 return 243;
767 if (real_identical (r, &dconst2))
768 return 244;
769 if (real_identical (r, &dconst4))
770 return 246;
771 if (real_identical (r, &dconst1over2pi))
772 return 248;
773 if (!ext_gcn_constants_init)
774 init_ext_gcn_constants ();
775 real_value_negate (r);
776 if (real_identical (r, &dconsthalf))
777 return 241;
778 if (real_identical (r, &dconst2))
779 return 245;
780 if (real_identical (r, &dconst4))
781 return 247;
782
783 /* FIXME: add 4, -4 and 1/(2*PI). */
784
785 return 0;
786}
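/* For reference, the encodings returned above are: 128 = 0.0, 240 = 0.5,
   241 = -0.5, 242 = 1.0, 243 = -1.0, 244 = 2.0, 245 = -2.0, 246 = 4.0,
   247 = -4.0, and 248 = 1/(2*pi).  */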
787
 788/* Return true if X is a constant that can appear as an immediate operand.
 789 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi),
 790 or a vector of those. Other HFmode and SFmode constants are also
 791 accepted, as they fit a 32-bit immediate encoding. */
792
793bool
794gcn_fp_constant_p (rtx x, bool allow_vector)
795{
796 machine_mode mode = GET_MODE (x);
797
798 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
799 && allow_vector)
800 {
801 int n;
802 if (GET_CODE (x) != CONST_VECTOR)
803 return false;
804 n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
805 if (!n)
806 return false;
807 for (int i = 1; i < 64; i++)
808 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
809 return false;
810 return true;
811 }
812 if (mode != HFmode && mode != SFmode && mode != DFmode)
813 return false;
814
815 if (gcn_inline_fp_constant_p (x, false))
816 return true;
817 /* FIXME: It is not clear how 32bit immediates are interpreted here. */
818 return (mode != DFmode);
819}
820
821/* Return true if X is a constant representable as an inline immediate
822 constant in a 32-bit instruction encoding. */
823
824bool
825gcn_inline_constant_p (rtx x)
826{
827 if (GET_CODE (x) == CONST_INT)
828 return INTVAL (x) >= -16 && INTVAL (x) < 64;
829 if (GET_CODE (x) == CONST_DOUBLE)
830 return gcn_inline_fp_constant_p (x, false);
831 if (GET_CODE (x) == CONST_VECTOR)
832 {
833 int n;
834 if (!vgpr_vector_mode_p (GET_MODE (x)))
835 return false;
836 n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
837 if (!n)
838 return false;
839 for (int i = 1; i < 64; i++)
840 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
841 return false;
842 return 1;
843 }
844 return false;
845}
846
847/* Return true if X is a constant representable as an immediate constant
848 in a 32 or 64-bit instruction encoding. */
849
850bool
851gcn_constant_p (rtx x)
852{
853 switch (GET_CODE (x))
854 {
855 case CONST_INT:
856 return true;
857
858 case CONST_DOUBLE:
859 return gcn_fp_constant_p (x, false);
860
861 case CONST_VECTOR:
862 {
863 int n;
864 if (!vgpr_vector_mode_p (GET_MODE (x)))
865 return false;
866 n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
867 if (!n)
868 return false;
869 for (int i = 1; i < 64; i++)
870 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
871 return false;
872 return true;
873 }
874
875 case SYMBOL_REF:
876 case LABEL_REF:
877 return true;
878
879 default:
880 ;
881 }
882
883 return false;
884}
885
886/* Return true if X is a constant representable as two inline immediate
887 constants in a 64-bit instruction that is split into two 32-bit
888 instructions. */
889
890bool
891gcn_inline_constant64_p (rtx x)
892{
893 if (GET_CODE (x) == CONST_VECTOR)
894 {
895 if (!vgpr_vector_mode_p (GET_MODE (x)))
896 return false;
897 if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0)))
898 return false;
899 for (int i = 1; i < 64; i++)
900 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
901 return false;
902
903 return true;
904 }
905
906 if (GET_CODE (x) != CONST_INT)
907 return false;
908
909 rtx val_lo = gcn_operand_part (DImode, x, 0);
910 rtx val_hi = gcn_operand_part (DImode, x, 1);
911 return gcn_inline_constant_p (val_lo) && gcn_inline_constant_p (val_hi);
912}
913
914/* Return true if X is a constant representable as an immediate constant
915 in a 32 or 64-bit instruction encoding where the hardware will
916 extend the immediate to 64-bits. */
917
918bool
919gcn_constant64_p (rtx x)
920{
921 if (!gcn_constant_p (x))
922 return false;
923
924 if (GET_CODE (x) != CONST_INT)
925 return true;
926
927 /* Negative numbers are only allowed if they can be encoded within src0,
928 because the 32-bit immediates do not get sign-extended.
929 Unsigned numbers must not be encodable as 32-bit -1..-16, because the
930 assembler will use a src0 inline immediate and that will get
931 sign-extended. */
932 HOST_WIDE_INT val = INTVAL (x);
933 return (((val & 0xffffffff) == val /* Positive 32-bit. */
934 && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
935 || gcn_inline_constant_p (x)); /* Src0. */
936}
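/* A few worked examples of the test above: 0x7fffffff is accepted (a
   positive 32-bit value outside the -1..-16 bit pattern), 0xfffffff5 is
   rejected (the assembler would emit it as the inline immediate -11 and the
   hardware would sign-extend it to 64 bits), and -5 is accepted because it
   is itself encodable as a src0 inline immediate.  */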
937
938/* Implement TARGET_LEGITIMATE_CONSTANT_P.
939
940 Returns true if X is a legitimate constant for a MODE immediate operand. */
941
942bool
943gcn_legitimate_constant_p (machine_mode, rtx x)
944{
945 return gcn_constant_p (x);
946}
947
 948/* Return true if X is a CONST_VECTOR with a single repeated constant. */
949
950static bool
951single_cst_vector_p (rtx x)
952{
953 if (GET_CODE (x) != CONST_VECTOR)
954 return false;
955 for (int i = 1; i < 64; i++)
956 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
957 return false;
958 return true;
959}
960
961/* Create a CONST_VECTOR of duplicated value A. */
962
963rtx
964gcn_vec_constant (machine_mode mode, int a)
965{
966 /*if (!a)
967 return CONST0_RTX (mode);
968 if (a == -1)
969 return CONSTM1_RTX (mode);
970 if (a == 1)
971 return CONST1_RTX (mode);
972 if (a == 2)
973 return CONST2_RTX (mode);*/
974
975 int units = GET_MODE_NUNITS (mode);
976 rtx tem = gen_int_mode (a, GET_MODE_INNER (mode));
977 rtvec v = rtvec_alloc (units);
978
979 for (int i = 0; i < units; ++i)
980 RTVEC_ELT (v, i) = tem;
981
982 return gen_rtx_CONST_VECTOR (mode, v);
983}
984
985/* Create a CONST_VECTOR of duplicated value A. */
986
987rtx
988gcn_vec_constant (machine_mode mode, rtx a)
989{
990 int units = GET_MODE_NUNITS (mode);
991 rtvec v = rtvec_alloc (units);
992
993 for (int i = 0; i < units; ++i)
994 RTVEC_ELT (v, i) = a;
995
996 return gen_rtx_CONST_VECTOR (mode, v);
997}
998
999/* Create an undefined vector value, used where an insn operand is
1000 optional. */
1001
1002rtx
1003gcn_gen_undef (machine_mode mode)
1004{
1005 return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1006}
1007
1008/* }}} */
1009/* {{{ Addresses, pointers and moves. */
1010
 1011/* Return true if REG is a valid place to store a pointer,
1012 for instructions that require an SGPR.
1013 FIXME rename. */
1014
1015static bool
1016gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1017{
1018 if (GET_CODE (reg) == SUBREG)
1019 reg = SUBREG_REG (reg);
1020
1021 if (!REG_P (reg))
1022 return false;
1023
1024 if (GET_MODE (reg) != mode)
1025 return false;
1026
1027 int regno = REGNO (reg);
1028
1029 if (regno >= FIRST_PSEUDO_REGISTER)
1030 {
1031 if (!strict)
1032 return true;
1033
1034 if (!reg_renumber)
1035 return false;
1036
1037 regno = reg_renumber[regno];
1038 }
1039
1040 return (SGPR_REGNO_P (regno) || regno == M0_REG
1041 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1042}
1043
 1044/* Return true if REG is a valid place to store a pointer,
1045 for instructions that require a VGPR. */
1046
1047static bool
1048gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1049{
1050 if (GET_CODE (reg) == SUBREG)
1051 reg = SUBREG_REG (reg);
1052
1053 if (!REG_P (reg))
1054 return false;
1055
1056 if (GET_MODE (reg) != mode)
1057 return false;
1058
1059 int regno = REGNO (reg);
1060
1061 if (regno >= FIRST_PSEUDO_REGISTER)
1062 {
1063 if (!strict)
1064 return true;
1065
1066 if (!reg_renumber)
1067 return false;
1068
1069 regno = reg_renumber[regno];
1070 }
1071
1072 return VGPR_REGNO_P (regno);
1073}
1074
1075/* Return true if X would be valid inside a MEM using the Flat address
1076 space. */
1077
1078bool
1079gcn_flat_address_p (rtx x, machine_mode mode)
1080{
1081 bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1082 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1083
1084 if (vec_mode && gcn_address_register_p (x, DImode, false))
1085 return true;
1086
1087 if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1088 return true;
1089
1090 if (TARGET_GCN5_PLUS
1091 && GET_CODE (x) == PLUS
1092 && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1093 && CONST_INT_P (XEXP (x, 1)))
1094 return true;
1095
1096 return false;
1097}
1098
1099/* Return true if X would be valid inside a MEM using the Scalar Flat
1100 address space. */
1101
1102bool
1103gcn_scalar_flat_address_p (rtx x)
1104{
1105 if (gcn_address_register_p (x, DImode, false))
1106 return true;
1107
1108 if (GET_CODE (x) == PLUS
1109 && gcn_address_register_p (XEXP (x, 0), DImode, false)
1110 && CONST_INT_P (XEXP (x, 1)))
1111 return true;
1112
1113 return false;
1114}
1115
1116/* Return true if MEM X would be valid for the Scalar Flat address space. */
1117
1118bool
1119gcn_scalar_flat_mem_p (rtx x)
1120{
1121 if (!MEM_P (x))
1122 return false;
1123
1124 if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1125 return false;
1126
1127 return gcn_scalar_flat_address_p (XEXP (x, 0));
1128}
1129
1130/* Return true if X would be valid inside a MEM using the LDS or GDS
1131 address spaces. */
1132
1133bool
1134gcn_ds_address_p (rtx x)
1135{
1136 if (gcn_vec_address_register_p (x, SImode, false))
1137 return true;
1138
1139 if (GET_CODE (x) == PLUS
1140 && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1141 && CONST_INT_P (XEXP (x, 1)))
1142 return true;
1143
1144 return false;
1145}
1146
1147/* Return true if ADDR would be valid inside a MEM using the Global
1148 address space. */
1149
1150bool
1151gcn_global_address_p (rtx addr)
1152{
1153 if (gcn_address_register_p (addr, DImode, false)
1154 || gcn_vec_address_register_p (addr, DImode, false))
1155 return true;
1156
1157 if (GET_CODE (addr) == PLUS)
1158 {
1159 rtx base = XEXP (addr, 0);
1160 rtx offset = XEXP (addr, 1);
1161 bool immediate_p = (CONST_INT_P (offset)
1162 && INTVAL (offset) >= -(1 << 12)
1163 && INTVAL (offset) < (1 << 12));
1164
1165 if ((gcn_address_register_p (base, DImode, false)
1166 || gcn_vec_address_register_p (base, DImode, false))
1167 && immediate_p)
1168 /* SGPR + CONST or VGPR + CONST */
1169 return true;
1170
1171 if (gcn_address_register_p (base, DImode, false)
1172 && gcn_vgpr_register_operand (offset, SImode))
1173 /* SPGR + VGPR */
1174 return true;
1175
1176 if (GET_CODE (base) == PLUS
1177 && gcn_address_register_p (XEXP (base, 0), DImode, false)
1178 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1179 && immediate_p)
1180 /* (SGPR + VGPR) + CONST */
1181 return true;
1182 }
1183
1184 return false;
1185}
1186
1187/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1188
1189 Recognizes RTL expressions that are valid memory addresses for an
1190 instruction. The MODE argument is the machine mode for the MEM
1191 expression that wants to use this address.
1192
 1193 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
1194 convert common non-canonical forms to canonical form so that they will
1195 be recognized. */
1196
1197static bool
1198gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1199 addr_space_t as)
1200{
1201 /* All vector instructions need to work on addresses in registers. */
1202 if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1203 return false;
1204
1205 if (AS_SCALAR_FLAT_P (as))
1206 {
1207 if (mode == QImode || mode == HImode)
1208 return 0;
1209
1210 switch (GET_CODE (x))
1211 {
1212 case REG:
1213 return gcn_address_register_p (x, DImode, strict);
1214 /* Addresses are in the form BASE+OFFSET
1215 OFFSET is either 20bit unsigned immediate, SGPR or M0.
1216 Writes and atomics do not accept SGPR. */
1217 case PLUS:
1218 {
1219 rtx x0 = XEXP (x, 0);
1220 rtx x1 = XEXP (x, 1);
1221 if (!gcn_address_register_p (x0, DImode, strict))
1222 return false;
1223 /* FIXME: This is disabled because of the mode mismatch between
1224 SImode (for the address or m0 register) and the DImode PLUS.
1225 We'll need a zero_extend or similar.
1226
1227 if (gcn_m0_register_p (x1, SImode, strict)
1228 || gcn_address_register_p (x1, SImode, strict))
1229 return true;
1230 else*/
1231 if (GET_CODE (x1) == CONST_INT)
1232 {
1233 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1234 /* The low bits of the offset are ignored, even when
1235 they're meant to realign the pointer. */
1236 && !(INTVAL (x1) & 0x3))
1237 return true;
1238 }
1239 return false;
1240 }
1241
1242 default:
1243 break;
1244 }
1245 }
1246 else if (AS_SCRATCH_P (as))
1247 return gcn_address_register_p (x, SImode, strict);
1248 else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1249 {
1250 if (TARGET_GCN3 || GET_CODE (x) == REG)
1251 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1252 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1253 ? gcn_address_register_p (x, DImode, strict)
1254 : gcn_vec_address_register_p (x, DImode, strict));
1255 else
1256 {
1257 gcc_assert (TARGET_GCN5_PLUS);
1258
1259 if (GET_CODE (x) == PLUS)
1260 {
1261 rtx x1 = XEXP (x, 1);
1262
1263 if (VECTOR_MODE_P (mode)
1264 ? !gcn_address_register_p (x, DImode, strict)
1265 : !gcn_vec_address_register_p (x, DImode, strict))
1266 return false;
1267
1268 if (GET_CODE (x1) == CONST_INT)
1269 {
1270 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1271 /* The low bits of the offset are ignored, even when
1272 they're meant to realign the pointer. */
1273 && !(INTVAL (x1) & 0x3))
1274 return true;
1275 }
1276 }
1277 return false;
1278 }
1279 }
1280 else if (AS_GLOBAL_P (as))
1281 {
1282 gcc_assert (TARGET_GCN5_PLUS);
1283
1284 if (GET_CODE (x) == REG)
1285 return (gcn_address_register_p (x, DImode, strict)
1286 || (!VECTOR_MODE_P (mode)
1287 && gcn_vec_address_register_p (x, DImode, strict)));
1288 else if (GET_CODE (x) == PLUS)
1289 {
1290 rtx base = XEXP (x, 0);
1291 rtx offset = XEXP (x, 1);
1292
1293 bool immediate_p = (GET_CODE (offset) == CONST_INT
1294 /* Signed 13-bit immediate. */
1295 && INTVAL (offset) >= -(1 << 12)
1296 && INTVAL (offset) < (1 << 12)
1297 /* The low bits of the offset are ignored, even
1298 when they're meant to realign the pointer. */
1299 && !(INTVAL (offset) & 0x3));
1300
1301 if (!VECTOR_MODE_P (mode))
1302 {
1303 if ((gcn_address_register_p (base, DImode, strict)
1304 || gcn_vec_address_register_p (base, DImode, strict))
1305 && immediate_p)
1306 /* SGPR + CONST or VGPR + CONST */
1307 return true;
1308
1309 if (gcn_address_register_p (base, DImode, strict)
1310 && gcn_vgpr_register_operand (offset, SImode))
1311 /* SGPR + VGPR */
1312 return true;
1313
1314 if (GET_CODE (base) == PLUS
1315 && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1316 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1317 && immediate_p)
1318 /* (SGPR + VGPR) + CONST */
1319 return true;
1320 }
1321 else
1322 {
1323 if (gcn_address_register_p (base, DImode, strict)
1324 && immediate_p)
1325 /* SGPR + CONST */
1326 return true;
1327 }
1328 }
1329 else
1330 return false;
1331 }
1332 else if (AS_ANY_DS_P (as))
1333 switch (GET_CODE (x))
1334 {
1335 case REG:
1336 return (VECTOR_MODE_P (mode)
1337 ? gcn_address_register_p (x, SImode, strict)
1338 : gcn_vec_address_register_p (x, SImode, strict));
1339 /* Addresses are in the form BASE+OFFSET
1340 OFFSET is either 20bit unsigned immediate, SGPR or M0.
1341 Writes and atomics do not accept SGPR. */
1342 case PLUS:
1343 {
1344 rtx x0 = XEXP (x, 0);
1345 rtx x1 = XEXP (x, 1);
1346 if (!gcn_vec_address_register_p (x0, DImode, strict))
1347 return false;
1348 if (GET_CODE (x1) == REG)
1349 {
1350 if (GET_CODE (x1) != REG
1351 || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1352 && !gcn_ssrc_register_operand (x1, DImode)))
1353 return false;
1354 }
1355 else if (GET_CODE (x1) == CONST_VECTOR
1356 && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1357 && single_cst_vector_p (x1))
1358 {
1359 x1 = CONST_VECTOR_ELT (x1, 0);
1360 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1361 return true;
1362 }
1363 return false;
1364 }
1365
1366 default:
1367 break;
1368 }
1369 else
1370 gcc_unreachable ();
1371 return false;
1372}
1373
1374/* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1375
1376 Return the appropriate mode for a named address pointer. */
1377
1378static scalar_int_mode
1379gcn_addr_space_pointer_mode (addr_space_t addrspace)
1380{
1381 switch (addrspace)
1382 {
1383 case ADDR_SPACE_SCRATCH:
1384 case ADDR_SPACE_LDS:
1385 case ADDR_SPACE_GDS:
1386 return SImode;
1387 case ADDR_SPACE_DEFAULT:
1388 case ADDR_SPACE_FLAT:
1389 case ADDR_SPACE_FLAT_SCRATCH:
1390 case ADDR_SPACE_SCALAR_FLAT:
1391 return DImode;
1392 default:
1393 gcc_unreachable ();
1394 }
1395}
1396
1397/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1398
1399 Return the appropriate mode for a named address space address. */
1400
1401static scalar_int_mode
1402gcn_addr_space_address_mode (addr_space_t addrspace)
1403{
1404 return gcn_addr_space_pointer_mode (addrspace);
1405}
1406
1407/* Implement TARGET_ADDR_SPACE_SUBSET_P.
1408
1409 Determine if one named address space is a subset of another. */
1410
1411static bool
1412gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1413{
1414 if (subset == superset)
1415 return true;
1416 /* FIXME is this true? */
1417 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1418 return true;
1419 return false;
1420}
1421
1422/* Convert from one address space to another. */
1423
1424static rtx
1425gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1426{
1427 gcc_assert (POINTER_TYPE_P (from_type));
1428 gcc_assert (POINTER_TYPE_P (to_type));
1429
1430 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1431 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1432
1433 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1434 {
1435 rtx queue = gen_rtx_REG (DImode,
1436 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1437 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1438 gen_rtx_PLUS (DImode, queue,
1439 gen_int_mode (64, SImode)));
1440 rtx tmp = gen_reg_rtx (DImode);
1441
1442 emit_move_insn (gen_lowpart (SImode, tmp), op);
1443 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1444 group_seg_aperture_hi);
1445
1446 return tmp;
1447 }
1448 else if (as_from == as_to)
1449 return op;
1450 else
1451 gcc_unreachable ();
1452}
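/* The LDS-to-FLAT case above builds a 64-bit flat pointer from a 32-bit LDS
   offset and the group-segment aperture word read from the HSA queue
   descriptor (at byte offset 64 from the queue pointer).  A self-contained
   sketch of the resulting value, kept here for illustration only:  */
#if 0
static unsigned long long
lds_to_flat_sketch (unsigned int lds_offset, unsigned int aperture_hi)
{
  /* Low half: the LDS offset; high half: the aperture base.  */
  return ((unsigned long long) aperture_hi << 32) | lds_offset;
}
#endif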
1453
1454
1455/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h
1456
 1457 Return true if REGNO is OK for memory addressing. */
1458
1459bool
1460gcn_regno_mode_code_ok_for_base_p (int regno,
1461 machine_mode, addr_space_t as, int, int)
1462{
1463 if (regno >= FIRST_PSEUDO_REGISTER)
1464 {
1465 if (reg_renumber)
1466 regno = reg_renumber[regno];
1467 else
1468 return true;
1469 }
1470 if (AS_FLAT_P (as))
1471 return (VGPR_REGNO_P (regno)
1472 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1473 else if (AS_SCALAR_FLAT_P (as))
1474 return (SGPR_REGNO_P (regno)
1475 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1476 else if (AS_GLOBAL_P (as))
1477 {
1478 return (SGPR_REGNO_P (regno)
1479 || VGPR_REGNO_P (regno)
1480 || regno == ARG_POINTER_REGNUM
1481 || regno == FRAME_POINTER_REGNUM);
1482 }
1483 else
1484 /* For now. */
1485 return false;
1486}
1487
1488/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1489
1490 Return a suitable register class for memory addressing. */
1491
1492reg_class
1493gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1494 int ic)
1495{
1496 switch (as)
1497 {
1498 case ADDR_SPACE_DEFAULT:
1499 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1500 case ADDR_SPACE_SCALAR_FLAT:
1501 case ADDR_SPACE_SCRATCH:
1502 return SGPR_REGS;
1503 break;
1504 case ADDR_SPACE_FLAT:
1505 case ADDR_SPACE_FLAT_SCRATCH:
1506 case ADDR_SPACE_LDS:
1507 case ADDR_SPACE_GDS:
1508 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1509 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1510 ? SGPR_REGS : VGPR_REGS);
1511 case ADDR_SPACE_GLOBAL:
1512 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1513 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1514 ? SGPR_REGS : ALL_GPR_REGS);
1515 }
1516 gcc_unreachable ();
1517}
1518
1519/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1520
1521 Return true if REGNO is OK for index of memory addressing. */
1522
1523bool
1524regno_ok_for_index_p (int regno)
1525{
1526 if (regno >= FIRST_PSEUDO_REGISTER)
1527 {
1528 if (reg_renumber)
1529 regno = reg_renumber[regno];
1530 else
1531 return true;
1532 }
1533 return regno == M0_REG || VGPR_REGNO_P (regno);
1534}
1535
 1536/* Generate a move that uses the exec flags. If EXEC is NULL, then it is
1537 assumed that all lanes normally relevant to the mode of the move are
1538 affected. If PREV is NULL, then a sensible default is supplied for
1539 the inactive lanes. */
1540
1541static rtx
1542gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1543{
1544 machine_mode mode = GET_MODE (op0);
1545
1546 if (vgpr_vector_mode_p (mode))
1547 {
1548 if (exec && exec != CONSTM1_RTX (DImode))
1549 {
1550 if (!prev)
1551 prev = op0;
1552 }
1553 else
1554 {
1555 if (!prev)
1556 prev = gcn_gen_undef (mode);
1557 exec = gcn_full_exec_reg ();
1558 }
1559
1560 rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1561
1562 return gen_rtx_PARALLEL (VOIDmode,
1563 gen_rtvec (2, set,
1564 gen_rtx_CLOBBER (VOIDmode,
1565 gen_rtx_SCRATCH (V64DImode))));
1566 }
1567
1568 return (gen_rtx_PARALLEL
1569 (VOIDmode,
1570 gen_rtvec (2, gen_rtx_SET (op0, op1),
1571 gen_rtx_USE (VOIDmode,
1572 exec ? exec : gcn_scalar_exec ()))));
1573}
1574
1575/* Generate masked move. */
1576
1577static rtx
1578gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1579{
1580 if (exec)
1581 return (gen_rtx_SET (op0,
1582 gen_rtx_VEC_MERGE (GET_MODE (op0),
1583 gen_rtx_VEC_DUPLICATE (GET_MODE
1584 (op0), op1),
1585 op2, exec)));
1586 else
1587 return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1588}
1589
1590/* Expand vector init of OP0 by VEC.
1591 Implements vec_init instruction pattern. */
1592
1593void
1594gcn_expand_vector_init (rtx op0, rtx vec)
1595{
1596 int64_t initialized_mask = 0;
1597 int64_t curr_mask = 1;
1598 machine_mode mode = GET_MODE (op0);
1599
1600 rtx val = XVECEXP (vec, 0, 0);
1601
1602 for (int i = 1; i < 64; i++)
1603 if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1604 curr_mask |= (int64_t) 1 << i;
1605
1606 if (gcn_constant_p (val))
1607 emit_move_insn (op0, gcn_vec_constant (mode, val));
1608 else
1609 {
1610 val = force_reg (GET_MODE_INNER (mode), val);
1611 emit_insn (gen_duplicate_load (op0, val));
1612 }
1613 initialized_mask |= curr_mask;
1614 for (int i = 1; i < 64; i++)
1615 if (!(initialized_mask & ((int64_t) 1 << i)))
1616 {
1617 curr_mask = (int64_t) 1 << i;
1618 rtx val = XVECEXP (vec, 0, i);
1619
1620 for (int j = i + 1; j < 64; j++)
1621 if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1622 curr_mask |= (int64_t) 1 << j;
1623 if (gcn_constant_p (val))
1624 emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1625 get_exec (curr_mask)));
1626 else
1627 {
1628 val = force_reg (GET_MODE_INNER (mode), val);
1629 emit_insn (gen_duplicate_load (op0, val, op0,
1630 get_exec (curr_mask)));
1631 }
1632 initialized_mask |= curr_mask;
1633 }
1634}
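/* The loop above emits one (masked) move per distinct value: lanes holding
   the same rtx are grouped into a single EXEC mask so they are initialized
   together.  A self-contained sketch of that grouping, for illustration
   only (plain C, with ints standing in for rtxes):  */
#if 0
static void
vector_init_masks_sketch (const int vals[64], unsigned long long masks[64])
{
  unsigned long long initialized = 0;
  for (int i = 0; i < 64; i++)
    {
      masks[i] = 0;
      if (initialized & (1ULL << i))
	continue;			/* Lane already covered.  */
      for (int j = i; j < 64; j++)
	if (vals[j] == vals[i])
	  masks[i] |= 1ULL << j;	/* Group equal lanes together.  */
      initialized |= masks[i];
    }
}
#endif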
1635
 1636/* Load a vector constant where the n-th lane contains BASE + n * VAL. */
1637
1638static rtx
1639strided_constant (machine_mode mode, int base, int val)
1640{
1641 rtx x = gen_reg_rtx (mode);
1642 emit_move_insn (x, gcn_vec_constant (mode, base));
1643 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1644 x, get_exec (0xffffffff00000000)));
1645 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1646 x, get_exec (0xffff0000ffff0000)));
1647 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1648 x, get_exec (0xff00ff00ff00ff00)));
1649 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1650 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1651 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1652 x, get_exec (0xcccccccccccccccc)));
1653 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1654 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1655 return x;
1656}
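/* The six masked adds above add VAL*32, VAL*16, ..., VAL*1 to exactly the
   lanes whose index has bit 5, 4, ..., 0 set, respectively; summing the
   selected terms reconstructs the lane index, so lane L ends up holding
   BASE + L * VAL.  A self-contained reference for the final contents,
   kept for illustration only:  */
#if 0
static void
strided_constant_sketch (long long out[64], int base, int val)
{
  for (int lane = 0; lane < 64; lane++)
    out[lane] = base + (long long) lane * val;
}
#endif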
1657
1658/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1659
1660static rtx
1661gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1662 addr_space_t as)
1663{
1664 switch (as)
1665 {
1666 case ADDR_SPACE_DEFAULT:
1667 return gcn_addr_space_legitimize_address (x, old, mode,
1668 DEFAULT_ADDR_SPACE);
1669 case ADDR_SPACE_SCALAR_FLAT:
1670 case ADDR_SPACE_SCRATCH:
1671 /* Instructions working on vectors need the address to be in
1672 a register. */
1673 if (vgpr_vector_mode_p (mode))
1674 return force_reg (GET_MODE (x), x);
1675
1676 return x;
1677 case ADDR_SPACE_FLAT:
1678 case ADDR_SPACE_FLAT_SCRATCH:
1679 case ADDR_SPACE_GLOBAL:
1680 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1681 case ADDR_SPACE_LDS:
1682 case ADDR_SPACE_GDS:
 1683 /* FIXME: LDS supports offsets; handle them! */
1684 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1685 {
1686 rtx addrs = gen_reg_rtx (V64SImode);
1687 rtx base = force_reg (SImode, x);
1688 rtx offsets = strided_constant (V64SImode, 0,
1689 GET_MODE_UNIT_SIZE (mode));
1690
1691 emit_insn (gen_vec_duplicatev64si (addrs, base));
1692 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1693 return addrs;
1694 }
1695 return x;
1696 }
1697 gcc_unreachable ();
1698}
1699
1700/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1701 proper vector of stepped addresses.
1702
1703 MEM will be a DImode address of a vector in an SGPR.
1704 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1705
1706rtx
1707gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1708 rtx tmp)
1709{
1710 gcc_assert (MEM_P (mem));
1711 rtx mem_base = XEXP (mem, 0);
1712 rtx mem_index = NULL_RTX;
1713
1714 if (!TARGET_GCN5_PLUS)
1715 {
1716 /* gcn_addr_space_legitimize_address should have put the address in a
1717 register. If not, it is too late to do anything about it. */
1718 gcc_assert (REG_P (mem_base));
1719 }
1720
1721 if (GET_CODE (mem_base) == PLUS)
1722 {
1723 mem_index = XEXP (mem_base, 1);
1724 mem_base = XEXP (mem_base, 0);
1725 }
1726
 1727 /* RF and RM base registers for vector modes should always be an SGPR. */
1728 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1729 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1730
1731 machine_mode inner = GET_MODE_INNER (mode);
1732 int shift = exact_log2 (GET_MODE_SIZE (inner));
1733 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1734 rtx undef_v64si = gcn_gen_undef (V64SImode);
1735 rtx new_base = NULL_RTX;
1736 addr_space_t as = MEM_ADDR_SPACE (mem);
1737
1738 rtx tmplo = (REG_P (tmp)
1739 ? gcn_operand_part (V64DImode, tmp, 0)
1740 : gen_reg_rtx (V64SImode));
1741
1742 /* tmplo[:] = ramp[:] << shift */
1743 if (exec)
1744 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1745 gen_int_mode (shift, SImode),
1746 undef_v64si, exec));
1747 else
1748 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1749
1750 if (AS_FLAT_P (as))
1751 {
1752 if (REG_P (tmp))
1753 {
1754 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1755 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1756 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1757 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1758
1759 /* tmphi[:] = mem_base_hi */
1760 if (exec)
1761 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1762 undef_v64si, exec));
1763 else
1764 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1765
1766 /* tmp[:] += zext (mem_base) */
1767 if (exec)
1768 {
1769 rtx undef_di = gcn_gen_undef (DImode);
1770 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1771 vcc, undef_v64si, exec));
1772 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1773 vcc, vcc, undef_v64si, exec));
1774 }
1775 else
1776 emit_insn (gen_addv64di3_zext_dup (tmp, mem_base_lo, tmp));
1777 }
1778 else
1779 {
1780 tmp = gen_reg_rtx (V64DImode);
1781 if (exec)
1782 emit_insn (gen_addv64di3_zext_dup2_exec (tmp, tmplo, mem_base,
1783 gcn_gen_undef (V64DImode),
1784 exec));
1785 else
1786 emit_insn (gen_addv64di3_zext_dup2 (tmp, tmplo, mem_base));
1787 }
1788
1789 new_base = tmp;
1790 }
1791 else if (AS_ANY_DS_P (as))
1792 {
1793 if (!exec)
1794 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1795 else
1796 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1797 gcn_gen_undef (V64SImode), exec));
1798 new_base = tmplo;
1799 }
1800 else
1801 {
1802 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1803 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1804 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1805 }
1806
1807 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1808 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1809 (mem_index ? mem_index
1810 : const0_rtx)));
1811}
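/* As a concrete example of the transformation above, a V64SImode access
   through (mem:V64SI (reg:DI sgpr_base)) in the FLAT address space ends up
   using per-lane addresses of the form sgpr_base + lane_id * 4 (+ index,
   when the original address had one): the lane-id ramp is shifted left by
   log2 of the element size and added to the 64-bit base with carry
   propagation into the high half.  */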
1812
1813/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
 1814 suitable for the given address space. This is intended for use in
1815 gather/scatter patterns.
1816
1817 The offsets may be signed or unsigned, according to UNSIGNED_P.
1818 If EXEC is set then _exec patterns will be used, otherwise plain.
1819
1820 Return values.
1821 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1822 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1823
1824rtx
1825gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1826 bool unsigned_p, rtx exec)
1827{
1828 /* Convert the offsets to V64SImode.
1829 TODO: more conversions will be needed when more types are vectorized. */
1830 if (GET_MODE (offsets) == V64DImode)
1831 {
1832 rtx tmp = gen_reg_rtx (V64SImode);
1833 emit_insn (gen_vec_truncatev64div64si (tmp, offsets));
1834 offsets = tmp;
1835 }
1836
1837 rtx tmpsi = gen_reg_rtx (V64SImode);
1838 rtx tmpdi = gen_reg_rtx (V64DImode);
1839 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1840 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1841
1842 if (CONST_INT_P (scale)
1843 && INTVAL (scale) > 0
1844 && exact_log2 (INTVAL (scale)) >= 0)
1845 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1846 GEN_INT (exact_log2 (INTVAL (scale)))));
1847 else
1848 (exec
1849 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1850 exec))
1851 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1852
1853 /* "Global" instructions do not support negative register offsets. */
1854 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1855 {
1856 if (unsigned_p)
1857 (exec
1858 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1859 undefdi, exec))
1860 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1861 else
1862 (exec
1863 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1864 undefdi, exec))
1865 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1866 return tmpdi;
1867 }
1868 else if (as == ADDR_SPACE_GLOBAL)
1869 return tmpsi;
1870
1871 gcc_unreachable ();
1872}
1873
 1874/* Return true if a move from OP0 to OP1 is known to be executed in the
 1875 vector unit. */
1876
1877bool
1878gcn_vgpr_move_p (rtx op0, rtx op1)
1879{
1880 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1881 return true;
1882 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1883 return true;
1884 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1885 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1886 || vgpr_vector_mode_p (GET_MODE (op0)));
1887}
1888
 1889/* Return true if a move from OP0 to OP1 is known to be executed in the
 1890 scalar unit. Used in the machine description. */
1891
1892bool
1893gcn_sgpr_move_p (rtx op0, rtx op1)
1894{
1895 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1896 return true;
1897 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1898 return true;
1899 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1900 || VGPR_REGNO_P (REGNO (op0)))
1901 return false;
1902 if (REG_P (op1)
1903 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1904 && !VGPR_REGNO_P (REGNO (op1)))
1905 return true;
1906 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1907}
1908
1909/* Implement TARGET_SECONDARY_RELOAD.
1910
1911 The address space determines which registers can be used for loads and
1912 stores. */
1913
1914static reg_class_t
1915gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
1916 machine_mode reload_mode, secondary_reload_info *sri)
1917{
1918 reg_class_t result = NO_REGS;
1919 bool spilled_pseudo =
1920 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
1921
1922 if (dump_file && (dump_flags & TDF_DETAILS))
1923 {
1924 fprintf (dump_file, "gcn_secondary_reload: ");
1925 dump_value_slim (dump_file, x, 1);
1926 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
1927 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
1928 if (REG_P (x) || GET_CODE (x) == SUBREG)
1929 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
1930 (true_regnum (x) >= 0
1931 && true_regnum (x) < FIRST_PSEUDO_REGISTER
1932 ? reg_names[true_regnum (x)]
1933 : (spilled_pseudo ? "stack spill" : "??")));
1934 fprintf (dump_file, "\n");
1935 }
1936
1937 /* Some callers don't use or initialize icode. */
1938 sri->icode = CODE_FOR_nothing;
1939
1940 if (MEM_P (x) || spilled_pseudo)
1941 {
1942 addr_space_t as = DEFAULT_ADDR_SPACE;
1943
1944 /* If we have a spilled pseudo, we can't find the address space
1945 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
1946 ADDR_SPACE_GLOBAL for GCN5. */
1947 if (MEM_P (x))
1948 as = MEM_ADDR_SPACE (x);
1949
1950 if (as == ADDR_SPACE_DEFAULT)
1951 as = DEFAULT_ADDR_SPACE;
1952
1953 switch (as)
1954 {
1955 case ADDR_SPACE_SCALAR_FLAT:
1956 result =
1957 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
1958 break;
1959 case ADDR_SPACE_FLAT:
1960 case ADDR_SPACE_FLAT_SCRATCH:
1961 case ADDR_SPACE_GLOBAL:
1962 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
1963 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
1964 {
1965 if (in_p)
1966 switch (reload_mode)
1967 {
1968 case E_V64SImode:
1969 sri->icode = CODE_FOR_reload_inv64si;
1970 break;
1971 case E_V64SFmode:
1972 sri->icode = CODE_FOR_reload_inv64sf;
1973 break;
1974 case E_V64HImode:
1975 sri->icode = CODE_FOR_reload_inv64hi;
1976 break;
1977 case E_V64HFmode:
1978 sri->icode = CODE_FOR_reload_inv64hf;
1979 break;
1980 case E_V64QImode:
1981 sri->icode = CODE_FOR_reload_inv64qi;
1982 break;
1983 case E_V64DImode:
1984 sri->icode = CODE_FOR_reload_inv64di;
1985 break;
1986 case E_V64DFmode:
1987 sri->icode = CODE_FOR_reload_inv64df;
1988 break;
1989 default:
1990 gcc_unreachable ();
1991 }
1992 else
1993 switch (reload_mode)
1994 {
1995 case E_V64SImode:
1996 sri->icode = CODE_FOR_reload_outv64si;
1997 break;
1998 case E_V64SFmode:
1999 sri->icode = CODE_FOR_reload_outv64sf;
2000 break;
2001 case E_V64HImode:
2002 sri->icode = CODE_FOR_reload_outv64hi;
2003 break;
2004 case E_V64HFmode:
2005 sri->icode = CODE_FOR_reload_outv64hf;
2006 break;
2007 case E_V64QImode:
2008 sri->icode = CODE_FOR_reload_outv64qi;
2009 break;
2010 case E_V64DImode:
2011 sri->icode = CODE_FOR_reload_outv64di;
2012 break;
2013 case E_V64DFmode:
2014 sri->icode = CODE_FOR_reload_outv64df;
2015 break;
2016 default:
2017 gcc_unreachable ();
2018 }
2019 break;
2020 }
2021 /* Fallthrough. */
2022 case ADDR_SPACE_LDS:
2023 case ADDR_SPACE_GDS:
2024 case ADDR_SPACE_SCRATCH:
2025 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2026 break;
2027 }
2028 }
2029
2030 if (dump_file && (dump_flags & TDF_DETAILS))
2031 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2032 get_insn_name (sri->icode));
2033
2034 return result;
2035}
2036
2037/* Update register usage after having seen the compiler flags and kernel
2038 attributes. We typically want to fix registers that contain values
2039 set by the HSA runtime. */
2040
2041static void
2042gcn_conditional_register_usage (void)
2043{
2044 int i;
2045
2046 /* FIXME: Do we need to reset fixed_regs? */
2047
2048/* Limit ourselves to 1/16 the register file for maximum-sized workgroups.
2049 There are enough SGPRs not to limit those.
2050 TODO: Adjust this more dynamically. */
2051 for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++)
2052 fixed_regs[i] = 1, call_used_regs[i] = 1;
2053
2054 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2055 {
2056 /* Normal functions can't know what kernel argument registers are
2057 live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs. */
2058 for (i = 0; i < 16; i++)
2059 fixed_regs[FIRST_SGPR_REG + i] = 1;
2060 for (i = 0; i < 3; i++)
2061 fixed_regs[FIRST_VGPR_REG + i] = 1;
2062 return;
2063 }
2064
2065 /* Fix the runtime argument registers containing values that may be
2066 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2067 needed after the prologue so there's no need to fix them. */
2068 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2069 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2070 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2071 {
2072 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2073 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2074 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 2] = 1;
2075 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 3] = 1;
2076 }
2077 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2078 {
2079 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2080 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2081 }
2082 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2083 {
2084 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2085 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2086 }
2087 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2088 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2089 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2090 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2091 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2092 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2093 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2094 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2095
2096 if (TARGET_GCN5_PLUS)
2097 /* v0 is always zero, for global null offsets. */
2098 fixed_regs[VGPR_REGNO (0)] = 1;
2099}
2100
2101/* Determine if a load or store is valid, according to the register classes
2102 and address space. Used primarily by the machine description to decide
2103 when to split a move into two steps. */
2104
2105bool
2106gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2107{
2108 if (!MEM_P (dest) && !MEM_P (src))
2109 return true;
2110
2111 if (MEM_P (dest)
2112 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2113 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2114 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2115 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2116 && gcn_vgpr_register_operand (src, mode))
2117 return true;
2118 else if (MEM_P (src)
2119 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2120 && (gcn_flat_address_p (XEXP (src, 0), mode)
2121 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2122 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2123 && gcn_vgpr_register_operand (dest, mode))
2124 return true;
2125
2126 if (MEM_P (dest)
2127 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2128 && (gcn_global_address_p (XEXP (dest, 0))
2129 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2130 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2131 && gcn_vgpr_register_operand (src, mode))
2132 return true;
2133 else if (MEM_P (src)
2134 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2135 && (gcn_global_address_p (XEXP (src, 0))
2136 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2137 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2138 && gcn_vgpr_register_operand (dest, mode))
2139 return true;
2140
2141 if (MEM_P (dest)
2142 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2143 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2144 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2145 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2146 && gcn_ssrc_register_operand (src, mode))
2147 return true;
2148 else if (MEM_P (src)
2149 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2150 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2151 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2152 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2153 && gcn_sdst_register_operand (dest, mode))
2154 return true;
2155
2156 if (MEM_P (dest)
2157 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2158 && gcn_ds_address_p (XEXP (dest, 0))
2159 && gcn_vgpr_register_operand (src, mode))
2160 return true;
2161 else if (MEM_P (src)
2162 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2163 && gcn_ds_address_p (XEXP (src, 0))
2164 && gcn_vgpr_register_operand (dest, mode))
2165 return true;
2166
2167 return false;
2168}
2169
2170/* }}} */
2171/* {{{ Functions and ABI. */
2172
2173/* Implement TARGET_FUNCTION_VALUE.
2174
2175 Define how to find the value returned by a function.
2176 The register location is always the same, but the mode depends on
2177 VALTYPE. */
2178
2179static rtx
2180gcn_function_value (const_tree valtype, const_tree, bool)
2181{
2182 machine_mode mode = TYPE_MODE (valtype);
2183
2184 if (INTEGRAL_TYPE_P (valtype)
2185 && GET_MODE_CLASS (mode) == MODE_INT
2186 && GET_MODE_SIZE (mode) < 4)
2187 mode = SImode;
2188
2189 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2190}
2191
2192/* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2193
2194 Return true if N is a possible register number for the function return
2195 value. */
2196
2197static bool
2198gcn_function_value_regno_p (const unsigned int n)
2199{
2200 return n == RETURN_VALUE_REG;
2201}
2202
2203/* Calculate the number of registers required to hold function argument
2204 ARG. */
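 /* For illustration: sizes are rounded up to whole words, so assuming
    UNITS_PER_WORD is 4, a 12-byte argument occupies three registers. */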
2205
2206static int
2207num_arg_regs (const function_arg_info &arg)
2208{
2209 if (targetm.calls.must_pass_in_stack (arg))
2210 return 0;
2211
2212 int size = arg.promoted_size_in_bytes ();
2213 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2214}
2215
2216/* Implement TARGET_STRICT_ARGUMENT_NAMING.
2217
2218 Return true if the location where a function argument is passed
2219 depends on whether or not it is a named argument.
2220
2221 For gcn, we know how to handle functions declared as stdarg: by
2222 passing an extra pointer to the unnamed arguments. However, the
2223 Fortran frontend can produce a different situation, where a
2224 function pointer is declared with no arguments, but the actual
2225 function and calls to it take more arguments. In that case, we
2226 want to ensure the call matches the definition of the function. */
2227
2228static bool
2229gcn_strict_argument_naming (cumulative_args_t cum_v)
2230{
2231 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2232
2233 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2234}
2235
2236/* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2237
2238 See comment on gcn_strict_argument_naming. */
2239
2240static bool
2241gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2242{
2243 return !gcn_strict_argument_naming (cum_v);
2244}
2245
2246/* Implement TARGET_FUNCTION_ARG.
2247
2248 Return an RTX indicating whether a function argument is passed in a register
2249 and if so, which register. */
2250
2251static rtx
2252gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2253{
2254 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2255 if (cum->normal_function)
2256 {
2257 if (!arg.named || arg.end_marker_p ())
2258 return 0;
2259
2260 if (targetm.calls.must_pass_in_stack (arg))
2261 return 0;
2262
2263 int reg_num = FIRST_PARM_REG + cum->num;
2264 int num_regs = num_arg_regs (arg);
2265 if (num_regs > 0)
2266 while (reg_num % num_regs != 0)
2267 reg_num++;
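 /* The loop above aligns REG_NUM to a multiple of NUM_REGS; for example,
    a two-register argument starting at an odd register is bumped to the
    next even register. */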
2268 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2269 return gen_rtx_REG (arg.mode, reg_num);
2270 }
2271 else
2272 {
2273 if (cum->num >= cum->args.nargs)
2274 {
2275 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2276 & -(TYPE_ALIGN (arg.type) / 8);
2277 cfun->machine->kernarg_segment_alignment
2278 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2279 TYPE_ALIGN (arg.type) / 8);
2280 rtx addr = gen_rtx_REG (DImode,
2281 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2282 if (cum->offset)
2283 addr = gen_rtx_PLUS (DImode, addr,
2284 gen_int_mode (cum->offset, DImode));
2285 rtx mem = gen_rtx_MEM (arg.mode, addr);
2286 set_mem_attributes (mem, arg.type, 1);
2287 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2288 MEM_READONLY_P (mem) = 1;
2289 return mem;
2290 }
2291
2292 int a = cum->args.order[cum->num];
2293 if (arg.mode != gcn_kernel_arg_types[a].mode)
2294 {
2295 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2296 return 0;
2297 }
2298 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2299 cum->args.reg[a]);
2300 }
2301 return 0;
2302}
2303
2304/* Implement TARGET_FUNCTION_ARG_ADVANCE.
2305
2306 Updates the summarizer variable pointed to by CUM_V to advance past an
2307 argument in the argument list. */
2308
2309static void
2310gcn_function_arg_advance (cumulative_args_t cum_v,
2311 const function_arg_info &arg)
2312{
2313 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2314
2315 if (cum->normal_function)
2316 {
2317 if (!arg.named)
2318 return;
2319
2320 int num_regs = num_arg_regs (arg);
2321 if (num_regs > 0)
2322 while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2323 cum->num++;
2324 cum->num += num_regs;
2325 }
2326 else
2327 {
2328 if (cum->num < cum->args.nargs)
2329 cum->num++;
2330 else
2331 {
2332 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2333 cfun->machine->kernarg_segment_byte_size = cum->offset;
2334 }
2335 }
2336}
2337
2338/* Implement TARGET_ARG_PARTIAL_BYTES.
2339
2340 Returns the number of bytes at the beginning of an argument that must be put
2341 in registers. The value must be zero for arguments that are passed entirely
2342 in registers or that are entirely pushed on the stack. */
2343
2344static int
2345gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2346{
2347 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2348
2349 if (!arg.named)
2350 return 0;
2351
2352 if (targetm.calls.must_pass_in_stack (arg))
2353 return 0;
2354
2355 if (cum->num >= NUM_PARM_REGS)
2356 return 0;
2357
2358 /* If the argument fits entirely in registers, return 0. */
2359 if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2360 return 0;
2361
2362 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2363}
2364
2365/* A normal function which takes a pointer argument (to a scalar) may be
2366 passed a pointer to LDS space (via a high-bits-set aperture), and that only
2367 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
2368 function has an incoming pointer-to-scalar parameter. */
2369
2370static void
2371gcn_detect_incoming_pointer_arg (tree fndecl)
2372{
2373 gcc_assert (cfun && cfun->machine);
2374
2375 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2376 arg;
2377 arg = TREE_CHAIN (arg))
2378 if (POINTER_TYPE_P (TREE_VALUE (arg))
2379 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2380 cfun->machine->use_flat_addressing = true;
2381}
2382
2383/* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2384
2385 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2386 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2387
2388void
2389gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2390 tree fntype /* tree ptr for function decl */ ,
2391 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2392 tree fndecl, int caller)
2393{
2394 memset (cum, 0, sizeof (*cum));
2395 cum->fntype = fntype;
2396 if (libname)
2397 {
2398 gcc_assert (cfun && cfun->machine);
2399 cum->normal_function = true;
2400 if (!caller)
2401 {
2402 cfun->machine->normal_function = true;
2403 gcn_detect_incoming_pointer_arg (fndecl);
2404 }
2405 return;
2406 }
2407 tree attr = NULL;
2408 if (fndecl)
2409 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2410 if (fndecl && !attr)
2411 attr = lookup_attribute ("amdgpu_hsa_kernel",
2412 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2413 if (!attr && fntype)
2414 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2415 /* Handle main () as a kernel, so we can run the testsuite.
2416 Handle OpenACC kernels similarly to main. */
2417 if (!attr && !caller && fndecl
2418 && (MAIN_NAME_P (DECL_NAME (fndecl))
2419 || lookup_attribute ("omp target entrypoint",
2420 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2421 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2422 else
2423 {
2424 if (!attr || caller)
2425 {
2426 gcc_assert (cfun && cfun->machine);
2427 cum->normal_function = true;
2428 if (!caller)
2429 cfun->machine->normal_function = true;
2430 }
2431 gcn_parse_amdgpu_hsa_kernel_attribute
2432 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2433 }
2434 cfun->machine->args = cum->args;
2435 if (!caller && cfun->machine->normal_function)
2436 gcn_detect_incoming_pointer_arg (fndecl);
2437}
2438
2439static bool
2440gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2441{
2442 machine_mode mode = TYPE_MODE (type);
2443 HOST_WIDE_INT size = int_size_in_bytes (type);
2444
2445 if (AGGREGATE_TYPE_P (type))
2446 return true;
2447
2448 if (mode == BLKmode)
2449 return true;
2450
2451 if (size > 2 * UNITS_PER_WORD)
2452 return true;
2453
2454 return false;
2455}
2456
2457/* Implement TARGET_PROMOTE_FUNCTION_MODE.
2458
2459 Return the mode to use for outgoing function arguments. */
2460
2461machine_mode
2462gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2463 int *ARG_UNUSED (punsignedp),
2464 const_tree ARG_UNUSED (funtype),
2465 int ARG_UNUSED (for_return))
2466{
2467 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2468 return SImode;
2469
2470 return mode;
2471}
2472
2473/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2474
2475 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2476 ARGS_GROW_DOWNWARDS. */
2477
2478static tree
2479gcn_gimplify_va_arg_expr (tree valist, tree type,
2480 gimple_seq *ARG_UNUSED (pre_p),
2481 gimple_seq *ARG_UNUSED (post_p))
2482{
2483 tree ptr = build_pointer_type (type);
2484 tree valist_type;
2485 tree t, u;
2486 bool indirect;
2487
2488 indirect = pass_va_arg_by_reference (type);
2489 if (indirect)
2490 {
2491 type = ptr;
2492 ptr = build_pointer_type (type);
2493 }
2494 valist_type = TREE_TYPE (valist);
2495
2496 /* Args grow down. Not handled by generic routines. */
2497
2498 u = fold_convert (sizetype, size_in_bytes (type));
2499 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2500 t = fold_build_pointer_plus (valist, u);
2501
2502 /* Align to 8 byte boundary. */
2503
2504 u = build_int_cst (TREE_TYPE (t), -8);
2505 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
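 /* ANDing with -8 clears the low three bits, rounding the downward-growing
    pointer down to an 8-byte boundary (e.g. an offset of 0x1c becomes
    0x18). */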
2506 t = fold_convert (valist_type, t);
2507
2508 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2509
2510 t = fold_convert (ptr, t);
2511 t = build_va_arg_indirect_ref (t);
2512
2513 if (indirect)
2514 t = build_va_arg_indirect_ref (t);
2515
2516 return t;
2517}
2518
2519/* Calculate stack offsets needed to create prologues and epilogues. */
2520
2521static struct machine_function *
2522gcn_compute_frame_offsets (void)
2523{
2524 machine_function *offsets = cfun->machine;
2525
2526 if (reload_completed)
2527 return offsets;
2528
2529 offsets->need_frame_pointer = frame_pointer_needed;
2530
2531 offsets->outgoing_args_size = crtl->outgoing_args_size;
2532 offsets->pretend_size = crtl->args.pretend_args_size;
2533
2534 offsets->local_vars = get_frame_size ();
2535
2536 offsets->lr_needs_saving = (!leaf_function_p ()
2537 || df_regs_ever_live_p (LR_REGNUM)
2538 || df_regs_ever_live_p (LR_REGNUM + 1));
2539
2540 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2541
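 /* The sizes below are in bytes: saving a VGPR takes one 4-byte slot in
    each of the 64 lanes (256 bytes), while a scalar register takes a
    single 4-byte slot. */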
2542 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2543 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno])
2544 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2545 && frame_pointer_needed))
2546 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2547
2548 /* Round up to 64-bit boundary to maintain stack alignment. */
2549 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2550
2551 return offsets;
2552}
2553
2554/* Insert code into the prologue or epilogue to store or load any
2555 callee-save register to/from the stack.
2556
2557 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2558
2559static void
2560move_callee_saved_registers (rtx sp, machine_function *offsets,
2561 bool prologue)
2562{
2563 int regno, offset, saved_scalars;
2564 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2565 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2566 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2567 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2568 HOST_WIDE_INT exec_set = 0;
2569 int offreg_set = 0;
2570
2571 start_sequence ();
2572
2573 /* Move scalars into two vector registers. */
2574 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2575 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno])
2576 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2577 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2578 && offsets->need_frame_pointer))
2579 {
2580 rtx reg = gen_rtx_REG (SImode, regno);
2581 rtx vreg = gen_rtx_REG (V64SImode,
2582 VGPR_REGNO (6 + (saved_scalars / 64)));
2583 int lane = saved_scalars % 64;
2584
2585 if (prologue)
2586 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2587 else
2588 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2589
2590 saved_scalars++;
2591 }
2592
2593 rtx move_scalars = get_insns ();
2594 end_sequence ();
2595 start_sequence ();
2596
2597 /* Ensure that all vector lanes are moved. */
2598 exec_set = -1;
2599 emit_move_insn (exec, GEN_INT (exec_set));
2600
2601 /* Set up a vector stack pointer. */
2602 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2603 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2604 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2605 gcn_gen_undef (V64SImode), exec));
2606 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2607 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2608 exec));
2609 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2610 gcn_operand_part (V64SImode, vsp, 0),
2611 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2612 exec));
2613 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2614 gcn_operand_part (V64SImode, vsp, 1),
2615 const0_rtx, vcc, vcc,
2616 gcn_gen_undef (V64SImode), exec));
2617
2618 /* Move vectors. */
2619 for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size;
2620 regno < FIRST_PSEUDO_REGISTER; regno++)
2621 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno])
2622 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2623 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2624 {
2625 rtx reg = gen_rtx_REG (V64SImode, regno);
2626 int size = 256;
2627
2628 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2629 size = saved_scalars * 4;
2630 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2631 size = (saved_scalars - 64) * 4;
2632
2633 if (size != 256 || exec_set != -1)
2634 {
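 /* Enable only the low SIZE/4 lanes; for example, a 16-byte partial save
    uses the mask 0xf, so only four lanes are written. */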
2635 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2636 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2637 }
2638
2639 if (prologue)
2640 emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg,
2641 as, const0_rtx, exec));
2642 else
2643 emit_insn (gen_gatherv64si_insn_1offset_exec
2644 (reg, vsp, const0_rtx, as, const0_rtx,
2645 gcn_gen_undef (V64SImode), exec));
2646
2647 /* Move our VSP to the next stack entry. */
2648 if (offreg_set != size)
2649 {
2650 offreg_set = size;
2651 emit_move_insn (offreg, GEN_INT (size));
2652 }
2653 if (exec_set != -1)
2654 {
2655 exec_set = -1;
2656 emit_move_insn (exec, GEN_INT (exec_set));
2657 }
2658 emit_insn (gen_addv64si3_vcc_dup_exec
2659 (gcn_operand_part (V64SImode, vsp, 0),
2660 offreg, gcn_operand_part (V64SImode, vsp, 0),
2661 vcc, gcn_gen_undef (V64SImode), exec));
2662 emit_insn (gen_addcv64si3_exec
2663 (gcn_operand_part (V64SImode, vsp, 1),
2664 gcn_operand_part (V64SImode, vsp, 1),
2665 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2666
2667 offset += size;
2668 }
2669
2670 rtx move_vectors = get_insns ();
2671 end_sequence ();
2672
2673 if (prologue)
2674 {
2675 emit_insn (move_scalars);
2676 emit_insn (move_vectors);
2677 }
2678 else
2679 {
2680 emit_insn (move_vectors);
2681 emit_insn (move_scalars);
2682 }
2683}
2684
2685/* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2686
2687 For a non-kernel function, the stack layout looks like this (interim),
2688 growing *upwards*:
2689
2690 hi | + ...
2691 |__________________| <-- current SP
2692 | outgoing args |
2693 |__________________|
2694 | (alloca space) |
2695 |__________________|
2696 | local vars |
2697 |__________________| <-- FP/hard FP
2698 | callee-save regs |
2699 |__________________| <-- soft arg pointer
2700 | pretend args |
2701 |__________________| <-- incoming SP
2702 | incoming args |
2703 lo |..................|
2704
2705 This implies arguments (beyond the first N in registers) must grow
2706 downwards (as, apparently, PA has them do).
2707
2708 For a kernel function we have the simpler:
2709
2710 hi | + ...
2711 |__________________| <-- current SP
2712 | outgoing args |
2713 |__________________|
2714 | (alloca space) |
2715 |__________________|
2716 | local vars |
2717 lo |__________________| <-- FP/hard FP
2718
2719*/
2720
2721void
2722gcn_expand_prologue ()
2723{
2724 machine_function *offsets = gcn_compute_frame_offsets ();
2725
2726 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2727 {
2728 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2729 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2730
2731 start_sequence ();
2732
2733 if (offsets->pretend_size > 0)
2734 {
2735 /* FIXME: Do the actual saving of register pretend args to the stack.
2736 Register order needs consideration. */
2737 }
2738
2739 /* Save callee-save regs. */
2740 move_callee_saved_registers (sp, offsets, true);
2741
2742 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2743 + offsets->callee_saves
2744 + offsets->local_vars + offsets->outgoing_args_size;
2745 if (sp_adjust > 0)
2746 emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode)));
2747
2748 if (offsets->need_frame_pointer)
2749 emit_insn (gen_adddi3_scc (fp, sp,
2750 gen_int_mode
2751 (-(offsets->local_vars +
2752 offsets->outgoing_args_size),
2753 DImode)));
2754
2755 rtx_insn *seq = get_insns ();
2756 end_sequence ();
2757
2758 /* FIXME: Prologue insns should have this flag set for debug output, etc.
2759 but it causes issues for now.
2760 for (insn = seq; insn; insn = NEXT_INSN (insn))
2761 if (INSN_P (insn))
2762 RTX_FRAME_RELATED_P (insn) = 1;*/
2763
2764 emit_insn (seq);
2765 }
2766 else
2767 {
2768 rtx wave_offset = gen_rtx_REG (SImode,
2769 cfun->machine->args.
2770 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
2771
2772 if (TARGET_GCN5_PLUS)
2773 {
2774 /* v0 is reserved for constant zero so that "global"
2775 memory instructions can have a null offset without
2776 causing reloads. */
2777 emit_insn (gen_vec_duplicatev64si
2778 (gen_rtx_REG (V64SImode, VGPR_REGNO (0)), const0_rtx));
2779 }
2780
2781 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
2782 {
2783 rtx fs_init_lo =
2784 gen_rtx_REG (SImode,
2785 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
2786 rtx fs_init_hi =
2787 gen_rtx_REG (SImode,
2788 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
2789 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
2790 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
2791
2792 /*rtx queue = gen_rtx_REG(DImode,
2793 cfun->machine->args.reg[QUEUE_PTR_ARG]);
2794 rtx aperture = gen_rtx_MEM (SImode,
2795 gen_rtx_PLUS (DImode, queue,
2796 gen_int_mode (68, SImode)));
2797 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
2798
2799 /* Set up flat_scratch. */
2800 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
2801 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
2802 gen_int_mode (8, SImode)));
2803 emit_move_insn (fs_reg_lo, fs_init_hi);
2804 }
2805
2806 /* Set up frame pointer and stack pointer. */
2807 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
2808 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
2809 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
2810 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
2811
2812 HOST_WIDE_INT sp_adjust = (offsets->local_vars
2813 + offsets->outgoing_args_size);
2814
2815 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
2816 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
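 /* The low 16 bits of the second descriptor word presumably hold the upper
    bits of the base address; mask off the other fields. */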
2817 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
2818 gen_int_mode (0xffff, SImode)));
2819 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2820 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
2821 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
2822
2823 if (sp_adjust > 0)
2824 emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode)));
2825 else
2826 emit_move_insn (sp, fp);
2827
2828 /* Make sure the flat scratch reg doesn't get optimised away. */
2829 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
2830 }
2831
2832 /* Ensure that the scheduler doesn't do anything unexpected. */
2833 emit_insn (gen_blockage ());
2834
2835 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
2836 gen_int_mode (LDS_SIZE, SImode));
2837
2838 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
2839 if (TARGET_GCN5_PLUS)
2840 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, VGPR_REGNO (0))));
2841
2842 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
2843 {
2844 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
2845 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2846 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
2847 "gomp_gcn_enter_kernel"));
2848 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2849 }
2850}
2851
2852/* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
2853
2854 See gcn_expand_prologue for stack details. */
2855
2856void
2857gcn_expand_epilogue (void)
2858{
2859 /* Ensure that the scheduler doesn't do anything unexpected. */
2860 emit_insn (gen_blockage ());
2861
2862 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2863 {
2864 machine_function *offsets = gcn_compute_frame_offsets ();
2865 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2866 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2867
2868 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
2869
2870 if (offsets->need_frame_pointer)
2871 {
2872 /* Restore old SP from the frame pointer. */
2873 if (sp_adjust > 0)
2874 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
2875 else
2876 emit_move_insn (sp, fp);
2877 }
2878 else
2879 {
2880 /* Restore old SP from current SP. */
2881 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
2882
2883 if (sp_adjust > 0)
2884 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
2885 }
2886
2887 move_callee_saved_registers (sp, offsets, false);
2888
2889 /* There's no explicit use of the link register on the return insn. Emit
2890 one here instead. */
2891 if (offsets->lr_needs_saving)
2892 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
2893
2894 /* Similar for frame pointer. */
2895 if (offsets->need_frame_pointer)
2896 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
2897 }
2898 else if (flag_openmp)
2899 {
2900 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
2901 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
2902 emit_move_insn (fn_reg,
2903 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
2904 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
2905 }
2906 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
2907 {
2908 /* Assume that an exit value compatible with gcn-run is expected.
2909 That is, the third input parameter is an int*.
2910
2911 We can't allocate any new registers, but the kernarg_reg is
2912 dead after this, so we'll use that. */
2913 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
2914 [KERNARG_SEGMENT_PTR_ARG]);
2915 rtx retptr_mem = gen_rtx_MEM (DImode,
2916 gen_rtx_PLUS (DImode, kernarg_reg,
2917 GEN_INT (16)));
2918 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
2919 emit_move_insn (kernarg_reg, retptr_mem);
2920
2921 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
2922 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
2923 emit_move_insn (retval_mem,
2924 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
2925 }
2926
2927 emit_jump_insn (gen_gcn_return ());
2928}
2929
2930/* Implement TARGET_CAN_ELIMINATE.
2931
2932 Return true if the compiler is allowed to try to replace register number
2933 FROM_REG with register number TO_REG.
2934
2935 FIXME: is the default "true" not enough? Should this be a negative set? */
2936
2937bool
2938gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
2939{
2940 return (to_reg == HARD_FRAME_POINTER_REGNUM
2941 || to_reg == STACK_POINTER_REGNUM);
2942}
2943
2944/* Implement INITIAL_ELIMINATION_OFFSET.
2945
2946 Returns the initial difference between the specified pair of registers, in
2947 terms of stack position. */
2948
2949HOST_WIDE_INT
2950gcn_initial_elimination_offset (int from, int to)
2951{
2952 machine_function *offsets = gcn_compute_frame_offsets ();
2953
2954 switch (from)
2955 {
2956 case ARG_POINTER_REGNUM:
2957 if (to == STACK_POINTER_REGNUM)
2958 return -(offsets->callee_saves + offsets->local_vars
2959 + offsets->outgoing_args_size);
2960 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
2961 return -offsets->callee_saves;
2962 else
2963 gcc_unreachable ();
2964 break;
2965
2966 case FRAME_POINTER_REGNUM:
2967 if (to == STACK_POINTER_REGNUM)
2968 return -(offsets->local_vars + offsets->outgoing_args_size);
2969 else if (to == HARD_FRAME_POINTER_REGNUM)
2970 return 0;
2971 else
2972 gcc_unreachable ();
2973 break;
2974
2975 default:
2976 gcc_unreachable ();
2977 }
2978}
2979
2980/* Implement HARD_REGNO_RENAME_OK.
2981
2982 Return true if it is permissible to rename a hard register from
2983 FROM_REG to TO_REG. */
2984
2985bool
2986gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
2987{
2988 if (from_reg == SCC_REG
2989 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
2990 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
2991 || to_reg == SCC_REG
2992 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
2993 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
2994 return false;
2995
2996 /* Allow the link register to be used if it was saved. */
2997 if ((to_reg & ~1) == LINK_REGNUM)
2998 return !cfun || cfun->machine->lr_needs_saving;
2999
3000 /* Allow the registers used for the static chain to be used if the chain is
3001 not in active use. */
3002 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3003 return !cfun
3004 || !(cfun->static_chain_decl
3005 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3006 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3007
3008 return true;
3009}
3010
3011/* Implement HARD_REGNO_CALLER_SAVE_MODE.
3012
3013 Which mode is required for saving NREGS of a pseudo-register in
3014 call-clobbered hard register REGNO. */
3015
3016machine_mode
3017gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3018 machine_mode regmode)
3019{
3020 machine_mode result = choose_hard_reg_mode (regno, nregs, false);
3021
3022 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3023 result = (nregs == 1 ? SImode : DImode);
3024
3025 return result;
3026}
3027
3028/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3029
3030 Output assembler code for a block containing the constant parts
3031 of a trampoline, leaving space for the variable parts. */
3032
3033static void
3034gcn_asm_trampoline_template (FILE *f)
3035{
3036 /* The source operand of the move instructions must be a 32-bit
3037 constant following the opcode. */
3038 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3039 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3040 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3041 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3042 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3043}
3044
3045/* Implement TARGET_TRAMPOLINE_INIT.
3046
3047 Emit RTL insns to initialize the variable parts of a trampoline.
3048 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3049 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3050 to be passed to the target function. */
3051
3052static void
3053gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3054{
3055 if (TARGET_GCN5_PLUS)
3056 sorry ("nested function trampolines not supported on GCN5 due to"
3057 " non-executable stacks");
3058
3059 emit_block_move (m_tramp, assemble_trampoline_template (),
3060 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3061
3062 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3063 rtx chain_value_reg = copy_to_reg (chain_value);
3064 rtx fnaddr_reg = copy_to_reg (fnaddr);
3065
3066 for (int i = 0; i < 4; i++)
3067 {
3068 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3069 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3070 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3071 }
3072
3073 rtx tramp_addr = XEXP (m_tramp, 0);
3074 emit_insn (gen_clear_icache (tramp_addr,
3075 plus_constant (ptr_mode, tramp_addr,
3076 TRAMPOLINE_SIZE)));
3077}
3078
3079/* }}} */
3080/* {{{ Miscellaneous. */
3081
3082/* Implement TARGET_CANNOT_COPY_INSN_P.
3083
3084 Return true if INSN must not be duplicated. */
3085
3086static bool
3087gcn_cannot_copy_insn_p (rtx_insn *insn)
3088{
3089 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3090 return true;
3091
3092 return false;
3093}
3094
3095/* Implement TARGET_DEBUG_UNWIND_INFO.
3096
3097 Defines the mechanism that will be used for describing frame unwind
3098 information to the debugger. */
3099
3100static enum unwind_info_type
3101gcn_debug_unwind_info ()
3102{
3103 /* No support for debug info, yet. */
3104 return UI_NONE;
3105}
3106
3107/* Determine if there is a suitable hardware conversion instruction.
3108 Used primarily by the machine description. */
3109
3110bool
3111gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3112{
3113 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3114 return false;
3115
3116 if (VECTOR_MODE_P (from))
3117 {
3118 from = GET_MODE_INNER (from);
3119 to = GET_MODE_INNER (to);
3120 }
3121
3122 switch (op)
3123 {
3124 case fix_trunc_cvt:
3125 case fixuns_trunc_cvt:
3126 if (GET_MODE_CLASS (from) != MODE_FLOAT
3127 || GET_MODE_CLASS (to) != MODE_INT)
3128 return false;
3129 break;
3130 case float_cvt:
3131 case floatuns_cvt:
3132 if (GET_MODE_CLASS (from) != MODE_INT
3133 || GET_MODE_CLASS (to) != MODE_FLOAT)
3134 return false;
3135 break;
3136 case extend_cvt:
3137 if (GET_MODE_CLASS (from) != MODE_FLOAT
3138 || GET_MODE_CLASS (to) != MODE_FLOAT
3139 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3140 return false;
3141 break;
3142 case trunc_cvt:
3143 if (GET_MODE_CLASS (from) != MODE_FLOAT
3144 || GET_MODE_CLASS (to) != MODE_FLOAT
3145 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3146 return false;
3147 break;
3148 }
3149
3150 return ((to == HImode && from == HFmode)
3151 || (to == SImode && (from == SFmode || from == DFmode))
3152 || (to == HFmode && (from == HImode || from == SFmode))
3153 || (to == SFmode && (from == SImode || from == HFmode
3154 || from == DFmode))
3155 || (to == DFmode && (from == SImode || from == SFmode)));
3156}
3157
3158/* Implement TARGET_EMUTLS_VAR_INIT.
3159
3160 Disable emutls (gthr-gcn.h does not support it, yet). */
3161
3162tree
3163gcn_emutls_var_init (tree, tree decl, tree)
3164{
3165 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3166}
3167
3168/* }}} */
3169/* {{{ Costs. */
3170
3171/* Implement TARGET_RTX_COSTS.
3172
3173 Compute a (partial) cost for rtx X. Return true if the complete
3174 cost has been computed, and false if subexpressions should be
3175 scanned. In either case, *TOTAL contains the cost result. */
3176
3177static bool
3178gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3179{
3180 enum rtx_code code = GET_CODE (x);
3181 switch (code)
3182 {
3183 case CONST:
3184 case CONST_DOUBLE:
3185 case CONST_VECTOR:
3186 case CONST_INT:
3187 if (gcn_inline_constant_p (x))
3188 *total = 0;
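 /* The +0x8000 test below accepts values in the signed 16-bit range
    [-32768, 32767], i.e. constants that can presumably be encoded as a
    short immediate. */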
3189 else if (code == CONST_INT
3190 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3191 *total = 1;
3192 else if (gcn_constant_p (x))
3193 *total = 2;
3194 else
3195 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3196 return true;
3197
3198 case DIV:
3199 *total = 100;
3200 return false;
3201
3202 default:
3203 *total = 3;
3204 return false;
3205 }
3206}
3207
3208/* Implement TARGET_MEMORY_MOVE_COST.
3209
3210 Return the cost of moving data of mode M between a
3211 register and memory. A value of 2 is the default; this cost is
3212 relative to those in `REGISTER_MOVE_COST'.
3213
3214 This function is used extensively by register_move_cost, which is used to
3215 build tables at startup. Make it inline in this case.
3216 When IN is 2, return the maximum of the in and out move costs.
3217
3218 If moving between registers and memory is more expensive than
3219 between two registers, you should define this macro to express the
3220 relative cost.
3221
3222 Also model the increased cost of moving QImode registers in classes
3223 other than Q_REGS. */
3224
3225#define LOAD_COST 32
3226#define STORE_COST 32
3227static int
3228gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3229{
3230 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
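 /* For illustration: a DImode value occupies two 4-byte registers, so
    loading it into an SGPR class costs 2 * LOAD_COST units. */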
3231 switch (regclass)
3232 {
3233 case SCC_CONDITIONAL_REG:
3234 case VCCZ_CONDITIONAL_REG:
3235 case VCC_CONDITIONAL_REG:
3236 case EXECZ_CONDITIONAL_REG:
3237 case ALL_CONDITIONAL_REGS:
3238 case SGPR_REGS:
3239 case SGPR_EXEC_REGS:
3240 case EXEC_MASK_REG:
3241 case SGPR_VOP_SRC_REGS:
3242 case SGPR_MEM_SRC_REGS:
3243 case SGPR_SRC_REGS:
3244 case SGPR_DST_REGS:
3245 case GENERAL_REGS:
3246 case AFP_REGS:
3247 if (!in)
3248 return (STORE_COST + 2) * nregs;
3249 return LOAD_COST * nregs;
3250 case VGPR_REGS:
3251 if (in)
3252 return (LOAD_COST + 2) * nregs;
3253 return STORE_COST * nregs;
3254 case ALL_REGS:
3255 case ALL_GPR_REGS:
3256 case SRCDST_REGS:
3257 if (in)
3258 return (LOAD_COST + 2) * nregs;
3259 return (STORE_COST + 2) * nregs;
3260 default:
3261 gcc_unreachable ();
3262 }
3263}
3264
3265/* Implement TARGET_REGISTER_MOVE_COST.
3266
3267 Return the cost of moving data from a register in class CLASS1 to
3268 one in class CLASS2. Base value is 2. */
3269
3270static int
3271gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3272{
3273 /* Increase cost of moving from and to vector registers. While this is
3274 fast in hardware (I think), it has the hidden cost of setting up the exec
3275 flags. */
3276 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3277 return 4;
3278 return 2;
3279}
3280
3281/* }}} */
3282/* {{{ Builtins. */
3283
3284/* Type codes used by GCN built-in definitions. */
3285
3286enum gcn_builtin_type_index
3287{
3288 GCN_BTI_END_OF_PARAMS,
3289
3290 GCN_BTI_VOID,
3291 GCN_BTI_BOOL,
3292 GCN_BTI_INT,
3293 GCN_BTI_UINT,
3294 GCN_BTI_SIZE_T,
3295 GCN_BTI_LLINT,
3296 GCN_BTI_LLUINT,
3297 GCN_BTI_EXEC,
3298
3299 GCN_BTI_SF,
3300 GCN_BTI_V64SI,
3301 GCN_BTI_V64SF,
3302 GCN_BTI_V64PTR,
3303 GCN_BTI_SIPTR,
3304 GCN_BTI_SFPTR,
3305 GCN_BTI_VOIDPTR,
3306
3307 GCN_BTI_LDS_VOIDPTR,
3308
3309 GCN_BTI_MAX
3310};
3311
3312static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3313
3314#define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3315#define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3316#define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3317#define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3318#define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3319#define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3320#define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3321#define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3322#define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3323
3324static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3325 struct gcn_builtin_description *);
3326static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3327 struct gcn_builtin_description *);
3328
3329struct gcn_builtin_description;
3330typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3331 struct gcn_builtin_description *);
3332
3333enum gcn_builtin_type
3334{
3335 B_UNIMPLEMENTED, /* Sorry out */
3336 B_INSN, /* Emit a pattern */
3337 B_OVERLOAD /* Placeholder for an overloaded function */
3338};
3339
3340struct gcn_builtin_description
3341{
3342 int fcode;
3343 int icode;
3344 const char *name;
3345 enum gcn_builtin_type type;
3346 /* The first element of parm is always the return type. The rest
3347 are a zero terminated list of parameters. */
3348 int parm[6];
3349 gcn_builtin_expander expander;
3350};
3351
3352/* Read in the GCN builtins from gcn-builtins.def. */
3353
3354extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3355
3356struct gcn_builtin_description gcn_builtins[] = {
3357#define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3358 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3359
3360#define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3361 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3362 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3363 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3364 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3365 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3366 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3367 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3368 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3369
3370#include "gcn-builtins.def"
3371#undef DEF_BUILTIN_BINOP_INT_FP
3372#undef DEF_BUILTIN
3373};
3374
3375static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3376
3377/* Implement TARGET_BUILTIN_DECL.
3378
3379 Return the GCN builtin for CODE. */
3380
3381tree
3382gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3383{
3384 if (code >= GCN_BUILTIN_MAX)
3385 return error_mark_node;
3386
3387 return gcn_builtin_decls[code];
3388}
3389
3390/* Helper function for gcn_init_builtins. */
3391
3392static void
3393gcn_init_builtin_types (void)
3394{
3395 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3396 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3397 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3398 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3399 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3400 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3401 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3402
3403 exec_type_node = unsigned_intDI_type_node;
3404 sf_type_node = float32_type_node;
3405 v64si_type_node = build_vector_type (intSI_type_node, 64);
3406 v64sf_type_node = build_vector_type (float_type_node, 64);
3407 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3408 /*build_pointer_type
3409 (integer_type_node) */
3410 , 64);
3411 tree tmp = build_distinct_type_copy (intSI_type_node);
3412 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3413 siptr_type_node = build_pointer_type (tmp);
3414
3415 tmp = build_distinct_type_copy (float_type_node);
3416 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3417 sfptr_type_node = build_pointer_type (tmp);
3418
3419 tmp = build_distinct_type_copy (void_type_node);
3420 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3421 voidptr_type_node = build_pointer_type (tmp);
3422
3423 tmp = build_distinct_type_copy (void_type_node);
3424 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3425 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3426}
3427
3428/* Implement TARGET_INIT_BUILTINS.
3429
3430 Set up all builtin functions for this target. */
3431
3432static void
3433gcn_init_builtins (void)
3434{
3435 gcn_init_builtin_types ();
3436
3437 struct gcn_builtin_description *d;
3438 unsigned int i;
3439 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3440 {
3441 tree p;
3442 char name[64]; /* build_function will make a copy. */
3443 int parm;
3444
3445 /* FIXME: Is this necessary/useful? */
3446 if (d->name == 0)
3447 continue;
3448
3449 /* Find last parm. */
3450 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3451 ;
3452
3453 p = void_list_node;
3454 while (parm > 1)
3455 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3456
3457 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3458
3459 sprintf (name, "__builtin_gcn_%s", d->name);
3460 gcn_builtin_decls[i]
3461 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3462
3463 /* These builtins don't throw. */
3464 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3465 }
3466
3467/* FIXME: remove the ifdef once OpenACC support is merged upstream. */
3468#ifdef BUILT_IN_GOACC_SINGLE_START
3469 /* These builtins need to take/return an LDS pointer: override the generic
3470 versions here. */
3471
3472 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3473 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3474
3475 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3476 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3477 false);
3478
3479 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3480 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3481 false);
3482
3483 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3484 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3485#endif
3486}
3487
3488/* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3489 not require taking the address of any object, other than the memory
3490 cell being operated on.
3491
3492 Helper function for gcn_expand_builtin_1. */
3493
3494static rtx
3495gcn_expand_cmp_swap (tree exp, rtx target)
3496{
3497 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3498 addr_space_t as
3499 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3500 machine_mode as_mode = gcn_addr_space_address_mode (as);
3501
3502 if (!target)
3503 target = gen_reg_rtx (mode);
3504
3505 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3506 NULL_RTX, as_mode, EXPAND_NORMAL);
3507 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3508 NULL_RTX, mode, EXPAND_NORMAL);
3509 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3510 NULL_RTX, mode, EXPAND_NORMAL);
3511 rtx pat;
3512
3513 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3514 set_mem_addr_space (mem, as);
3515
3516 if (!REG_P (cmp))
3517 cmp = copy_to_mode_reg (mode, cmp);
3518 if (!REG_P (src))
3519 src = copy_to_mode_reg (mode, src);
3520
3521 if (mode == SImode)
3522 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3523 else
3524 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3525
3526 emit_insn (pat);
3527
3528 return target;
3529}
3530
3531/* Expand many different builtins.
3532
3533 Intended for use in gcn-builtins.def. */
3534
3535static rtx
3536gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3537 machine_mode /*mode */ , int ignore,
3538 struct gcn_builtin_description *)
3539{
3540 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3541 switch (DECL_MD_FUNCTION_CODE (fndecl))
3542 {
3543 case GCN_BUILTIN_FLAT_LOAD_INT32:
3544 {
3545 if (ignore)
3546 return target;
3547 /*rtx exec = */
3548 force_reg (DImode,
3549 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3550 EXPAND_NORMAL));
3551 /*rtx ptr = */
3552 force_reg (V64DImode,
3553 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3554 EXPAND_NORMAL));
3555 /*emit_insn (gen_vector_flat_loadv64si
3556 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3557 return target;
3558 }
3559 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3560 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3561 {
3562 if (ignore)
3563 return target;
3564 rtx exec = force_reg (DImode,
3565 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3566 DImode,
3567 EXPAND_NORMAL));
3568 rtx ptr = force_reg (DImode,
3569 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3570 V64DImode,
3571 EXPAND_NORMAL));
3572 rtx offsets = force_reg (V64SImode,
3573 expand_expr (CALL_EXPR_ARG (exp, 2),
3574 NULL_RTX, V64DImode,
3575 EXPAND_NORMAL));
3576 rtx addrs = gen_reg_rtx (V64DImode);
3577 rtx tmp = gen_reg_rtx (V64SImode);
3578 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3579 GEN_INT (2),
3580 gcn_gen_undef (V64SImode), exec));
3581 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3582 gcn_gen_undef (V64DImode),
3583 exec));
3584 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3585 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3586 /* FIXME: set attributes. */
3587 emit_insn (gen_mov_with_exec (target, mem, exec));
3588 return target;
3589 }
3590 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3591 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3592 {
3593 rtx exec = force_reg (DImode,
3594 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3595 DImode,
3596 EXPAND_NORMAL));
3597 rtx ptr = force_reg (DImode,
3598 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3599 V64DImode,
3600 EXPAND_NORMAL));
3601 rtx offsets = force_reg (V64SImode,
3602 expand_expr (CALL_EXPR_ARG (exp, 2),
3603 NULL_RTX, V64DImode,
3604 EXPAND_NORMAL));
3605 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3606 3)));
3607 rtx val = force_reg (vmode,
3608 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3609 vmode,
3610 EXPAND_NORMAL));
3611 rtx addrs = gen_reg_rtx (V64DImode);
3612 rtx tmp = gen_reg_rtx (V64SImode);
3613 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3614 GEN_INT (2),
3615 gcn_gen_undef (V64SImode), exec));
3616 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3617 gcn_gen_undef (V64DImode),
3618 exec));
3619 rtx mem = gen_rtx_MEM (vmode, addrs);
3620 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3621 /* FIXME: set attributes. */
3622 emit_insn (gen_mov_with_exec (mem, val, exec));
3623 return target;
3624 }
3625 case GCN_BUILTIN_SQRTVF:
3626 {
3627 if (ignore)
3628 return target;
3629 rtx exec = gcn_full_exec_reg ();
3630 rtx arg = force_reg (V64SFmode,
3631 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3632 V64SFmode,
3633 EXPAND_NORMAL));
3634 emit_insn (gen_sqrtv64sf2_exec
3635 (target, arg, gcn_gen_undef (V64SFmode), exec));
3636 return target;
3637 }
3638 case GCN_BUILTIN_SQRTF:
3639 {
3640 if (ignore)
3641 return target;
3642 rtx arg = force_reg (SFmode,
3643 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3644 SFmode,
3645 EXPAND_NORMAL));
3646 emit_insn (gen_sqrtsf2 (target, arg));
3647 return target;
3648 }
3649 case GCN_BUILTIN_OMP_DIM_SIZE:
3650 {
3651 if (ignore)
3652 return target;
3653 emit_insn (gen_oacc_dim_size (target,
3654 expand_expr (CALL_EXPR_ARG (exp, 0),
3655 NULL_RTX, SImode,
3656 EXPAND_NORMAL)));
3657 return target;
3658 }
3659 case GCN_BUILTIN_OMP_DIM_POS:
3660 {
3661 if (ignore)
3662 return target;
3663 emit_insn (gen_oacc_dim_pos (target,
3664 expand_expr (CALL_EXPR_ARG (exp, 0),
3665 NULL_RTX, SImode,
3666 EXPAND_NORMAL)));
3667 return target;
3668 }
3669 case GCN_BUILTIN_CMP_SWAP:
3670 case GCN_BUILTIN_CMP_SWAPLL:
3671 return gcn_expand_cmp_swap (exp, target);
3672
3673 case GCN_BUILTIN_ACC_SINGLE_START:
3674 {
3675 if (ignore)
3676 return target;
3677
3678 rtx wavefront = gcn_oacc_dim_pos (1);
3679 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3680 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3681 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3682 return cc;
3683 }
3684
3685 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3686 {
3687 rtx blk = force_reg (SImode,
3688 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3689 SImode, EXPAND_NORMAL));
3690 rtx wavefront = gcn_oacc_dim_pos (1);
3691 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3692 rtx not_zero = gen_label_rtx ();
3693 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3694 emit_move_insn (blk, const0_rtx);
3695 emit_label (not_zero);
3696 return blk;
3697 }
3698
3699 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3700 return target;
3701
3702 case GCN_BUILTIN_ACC_BARRIER:
3703 emit_insn (gen_gcn_wavefront_barrier ());
3704 return target;
3705
3706 default:
3707 gcc_unreachable ();
3708 }
3709}
3710
3711/* Expansion of simple arithmetic and bit binary operation builtins.
3712
3713 Intended for use with gcn_builtins table. */
3714
3715static rtx
3716gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
3717 machine_mode /*mode */ , int ignore,
3718 struct gcn_builtin_description *d)
3719{
3720 int icode = d->icode;
3721 if (ignore)
3722 return target;
3723
3724 rtx exec = force_reg (DImode,
3725 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3726 EXPAND_NORMAL));
3727
3728 machine_mode m1 = insn_data[icode].operand[1].mode;
3729 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
3730 EXPAND_NORMAL);
3731 if (!insn_data[icode].operand[1].predicate (arg1, m1))
3732 arg1 = force_reg (m1, arg1);
3733
3734 machine_mode m2 = insn_data[icode].operand[2].mode;
3735 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
3736 EXPAND_NORMAL);
3737 if (!insn_data[icode].operand[2].predicate (arg2, m2))
3738 arg2 = force_reg (m2, arg2);
3739
3740 rtx arg_prev;
3741 if (call_expr_nargs (exp) == 4)
3742 {
3743 machine_mode m_prev = insn_data[icode].operand[4].mode;
3744 arg_prev = force_reg (m_prev,
3745 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3746 m_prev, EXPAND_NORMAL));
3747 }
3748 else
3749 arg_prev = gcn_gen_undef (GET_MODE (target));
3750
3751 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
3752 emit_insn (pat);
3753 return target;
3754}
3755
3756/* Implement TARGET_EXPAND_BUILTIN.
3757
3758 Expand an expression EXP that calls a built-in function, with result going
3759 to TARGET if that's convenient (and in mode MODE if that's convenient).
3760 SUBTARGET may be used as the target for computing one of EXP's operands.
3761 IGNORE is nonzero if the value is to be ignored. */
3762
3763rtx
3764gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
3765 int ignore)
3766{
3767 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3768 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
3769 struct gcn_builtin_description *d;
3770
3771 gcc_assert (fcode < GCN_BUILTIN_MAX);
3772 d = &gcn_builtins[fcode];
3773
3774 if (d->type == B_UNIMPLEMENTED)
3775 sorry ("Builtin not implemented");
3776
3777 return d->expander (exp, target, subtarget, mode, ignore, d);
3778}
3779
3780/* }}} */
3781/* {{{ Vectorization. */
3782
3783/* Implement TARGET_VECTORIZE_GET_MASK_MODE.
3784
3785 A vector mask is a value that holds one boolean result for every element in
3786 a vector. */
3787
3788opt_machine_mode
3789gcn_vectorize_get_mask_mode (poly_uint64 ARG_UNUSED (nunits),
3790 poly_uint64 ARG_UNUSED (length))
3791{
3792 /* GCN uses a DImode bit-mask. */
3793 return DImode;
3794}
3795
3796/* Return an RTX that references a vector with the i-th lane containing
3797 PERM[i]*4.
3798
3799 Helper function for gcn_vectorize_vec_perm_const. */
3800
3801static rtx
3802gcn_make_vec_perm_address (unsigned int *perm)
3803{
3804 rtx x = gen_reg_rtx (V64SImode);
3805 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
3806
3807 /* Permutation addresses use byte addressing. With each vector lane being
3808 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
3809 so only set those.
3810
3811 The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
3812 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
3813 only take one source vector, the most-significant bit can be ignored
3814 here. Instead, we can use EXEC masking to select the relevant part of
3815 each source vector after they are permuted separately. */
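 /* Illustrative example (not from the original comment): for a
    rotate-by-one permutation, perm[j] == (j + 1) & 63, lane j must hold
    the byte address (j + 1) * 4.  Bit 2 of that address is set exactly
    when j + 1 is odd, so the first iteration below adds 4 under an EXEC
    mask covering lanes 0, 2, 4, ...; the remaining iterations handle
    bits 3..7 the same way, leaving each lane of X equal to perm[j] * 4.  */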
3816 uint64_t bit_mask = 1 << 2;
3817 for (int i = 2; i < 8; i++, bit_mask <<= 1)
3818 {
3819 uint64_t exec_mask = 0;
3820 uint64_t lane_mask = 1;
3821 for (int j = 0; j < 64; j++, lane_mask <<= 1)
3822 if ((perm[j] * 4) & bit_mask)
3823 exec_mask |= lane_mask;
3824
3825 if (exec_mask)
3826 emit_insn (gen_addv64si3_exec (x, x,
3827 gcn_vec_constant (V64SImode,
3828 bit_mask),
3829 x, get_exec (exec_mask)));
3830 }
3831
3832 return x;
3833}
3834
3835/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
3836
3837 Return true if permutation with SEL is possible.
3838
3839 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
3840 permutations. */
3841
3842static bool
3843gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
3844 rtx src0, rtx src1,
3845 const vec_perm_indices & sel)
3846{
3847 unsigned int nelt = GET_MODE_NUNITS (vmode);
3848
3849 gcc_assert (VECTOR_MODE_P (vmode));
3850 gcc_assert (nelt <= 64);
3851 gcc_assert (sel.length () == nelt);
3852
3853 if (!dst)
3854 {
3855 /* All vector permutations are possible on this architecture,
3856 with varying degrees of efficiency depending on the permutation. */
3857 return true;
3858 }
3859
3860 unsigned int perm[64];
3861 for (unsigned int i = 0; i < nelt; ++i)
3862 perm[i] = sel[i] & (2 * nelt - 1);
3863
3864 /* Make life a bit easier by swapping operands if necessary so that
3865 the first element always comes from src0. */
3866 if (perm[0] >= nelt)
3867 {
3868 rtx temp = src0;
3869 src0 = src1;
3870 src1 = temp;
3871
3872 for (unsigned int i = 0; i < nelt; ++i)
3873 if (perm[i] < nelt)
3874 perm[i] += nelt;
3875 else
3876 perm[i] -= nelt;
3877 }
3878
3879 /* TODO: There are more efficient ways to implement certain permutations
3880 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
3881 this more inefficient generic approach is used. */
3882
3883 int64_t src1_lanes = 0;
3884 int64_t lane_bit = 1;
3885
3886 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
3887 {
3888 /* Set the bits for lanes from src1. */
3889 if (perm[i] >= nelt)
3890 src1_lanes |= lane_bit;
3891 }
3892
3893 rtx addr = gcn_make_vec_perm_address (perm);
3894 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
3895
3896 switch (vmode)
3897 {
3898 case E_V64QImode:
3899 ds_bpermute = gen_ds_bpermutev64qi;
3900 break;
3901 case E_V64HImode:
3902 ds_bpermute = gen_ds_bpermutev64hi;
3903 break;
3904 case E_V64SImode:
3905 ds_bpermute = gen_ds_bpermutev64si;
3906 break;
3907 case E_V64HFmode:
3908 ds_bpermute = gen_ds_bpermutev64hf;
3909 break;
3910 case E_V64SFmode:
3911 ds_bpermute = gen_ds_bpermutev64sf;
3912 break;
3913 case E_V64DImode:
3914 ds_bpermute = gen_ds_bpermutev64di;
3915 break;
3916 case E_V64DFmode:
3917 ds_bpermute = gen_ds_bpermutev64df;
3918 break;
3919 default:
3920 gcc_assert (false);
3921 }
3922
3923 /* Load elements from src0 to dst. */
3924 gcc_assert (~src1_lanes);
3925 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
3926
3927 /* Load elements from src1 to dst. */
3928 if (src1_lanes)
3929 {
3930 /* Masking a lane masks both the destination and source lanes for
3931 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
3932 then add an extra masked move to merge the results of permuting
3933 the two source vectors together.
3934 */
3935 rtx tmp = gen_reg_rtx (vmode);
3936 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
3937 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
3938 }
3939
3940 return true;
3941}
3942
3943/* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
3944
3945 Return nonzero if vector MODE is supported with at least move
3946 instructions. */
3947
3948static bool
3949gcn_vector_mode_supported_p (machine_mode mode)
3950{
3951 /* FIXME: Enable V64QImode and V64HImode.
3952 We should support these modes, but vector operations are usually
3953 assumed to automatically truncate types, and GCN does not. We
3954 need to add explicit truncates and/or use SDWA for QI/HI insns. */
3955 return (/* mode == V64QImode || mode == V64HImode
3956 ||*/ mode == V64SImode || mode == V64DImode
3957 || mode == V64SFmode || mode == V64DFmode);
3958}
3959
3960/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
3961
3962 Enables autovectorization for all supported modes. */
3963
3964static machine_mode
3965gcn_vectorize_preferred_simd_mode (scalar_mode mode)
3966{
3967 switch (mode)
3968 {
3969 case E_QImode:
3970 return V64QImode;
3971 case E_HImode:
3972 return V64HImode;
3973 case E_SImode:
3974 return V64SImode;
3975 case E_DImode:
3976 return V64DImode;
3977 case E_SFmode:
3978 return V64SFmode;
3979 case E_DFmode:
3980 return V64DFmode;
3981 default:
3982 return word_mode;
3983 }
3984}
3985
3986/* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
3987
3988 Returns the preferred alignment in bits for accesses to vectors of type TYPE
3989 in vectorized code. This might be less than or greater than the ABI-defined
3990 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
3991 of a single element, in which case the vectorizer will not try to optimize
3992 for alignment. */
3993
3994static poly_uint64
3995gcn_preferred_vector_alignment (const_tree type)
3996{
3997 return TYPE_ALIGN (TREE_TYPE (type));
3998}
3999
4000/* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4001
4002 Return true if the target supports misaligned vector store/load of a
4003 specific factor denoted in the misalignment parameter. */
4004
4005static bool
4006gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4007 const_tree type, int misalignment,
4008 bool is_packed)
4009{
4010 if (is_packed)
4011 return false;
4012
4013 /* If the misalignment is unknown, we should be able to handle the access
4014 so long as it is not to a member of a packed data structure. */
4015 if (misalignment == -1)
4016 return true;
4017
4018 /* Return true if the misalignment is a multiple of the natural alignment
4019 of the vector's element type. This is probably always going to be
4020 true in practice, since we've already established that this isn't a
4021 packed access. */
4022 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4023}
4024
4025/* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4026
4027 Return true if vector alignment is reachable (by peeling N iterations) for
4028 the given scalar type TYPE. */
4029
4030static bool
4031gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4032{
4033 /* Vectors which aren't in packed structures will not be less aligned than
4034 the natural alignment of their element type, so this is safe. */
4035 return !is_packed;
4036}
4037
4038/* Generate DPP instructions used for vector reductions.
4039
4040 The opcode is given by INSN.
4041 The first operand of the operation is shifted right by SHIFT vector lanes.
4042 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4043 broadcast to the next row (thereby acting like a shift of 16 for the end of
4044 each row). If SHIFT is 32, lane 31 is broadcast to all the
4045 following lanes (thereby acting like a shift of 32 for lane 63). */
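 /* Worked example (illustrative, not derived from any specific caller):
    for INSN "v_add_u32", a vector-integer MODE, UNSPEC_PLUS_DPP_SHR and
    SHIFT 4, the returned template is
      "v_add_u32\t%0, %1, %2 row_shr:4 bank_mask:0xe"
    while SHIFT 16 would instead append "row_bcast:15 row_mask:0xa".  */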
4046
4047char *
4048gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4049 int unspec, int shift)
4050{
4051 static char buf[64];
4052 const char *dpp;
4053 const char *vcc_in = "";
4054 const char *vcc_out = "";
4055
4056 /* Add the vcc operand if needed. */
4057 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4058 {
4059 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4060 vcc_in = ", vcc";
4061
4062 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4063 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4064 vcc_out = ", vcc";
4065 }
4066
4067 /* Add the DPP modifiers. */
4068 switch (shift)
4069 {
4070 case 1:
4071 dpp = "row_shr:1 bound_ctrl:0";
4072 break;
4073 case 2:
4074 dpp = "row_shr:2 bound_ctrl:0";
4075 break;
4076 case 4:
4077 dpp = "row_shr:4 bank_mask:0xe";
4078 break;
4079 case 8:
4080 dpp = "row_shr:8 bank_mask:0xc";
4081 break;
4082 case 16:
4083 dpp = "row_bcast:15 row_mask:0xa";
4084 break;
4085 case 32:
4086 dpp = "row_bcast:31 row_mask:0xc";
4087 break;
4088 default:
4089 gcc_unreachable ();
4090 }
4091
4092 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4093
4094 return buf;
4095}
4096
4097/* Generate vector reductions in terms of DPP instructions.
4098
4099 The vector register SRC of mode MODE is reduced using the operation given
4100 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4101 register. */
4102
4103rtx
4104gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4105{
4106 rtx tmp = gen_reg_rtx (mode);
4107 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4108 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4109 && (TARGET_GCN3 || mode == V64DImode);
4110
4111 if (use_plus_carry)
4112 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4113
4114 /* Perform reduction by first performing the reduction operation on every
4115 pair of lanes, then on every pair of results from the previous
4116 iteration (thereby effectively reducing every 4 lanes) and so on until
4117 all lanes are reduced. */
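 /* Illustrative sketch: the six iterations use shifts 1, 2, 4, 8, 16 and
    32.  Each step doubles the number of lanes combined, so after the final
    step lane 63 holds the reduction of all 64 lanes, as described in the
    function comment above.  */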
4118 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4119 {
4120 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4121 rtx insn = gen_rtx_SET (tmp,
4122 gen_rtx_UNSPEC (mode,
4123 gen_rtvec (3,
4124 src, src, shift_val),
4125 unspec));
4126
4127 /* Add clobber for instructions that set the carry flags. */
4128 if (use_plus_carry)
4129 {
4130 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4131 gen_rtx_REG (DImode, VCC_REG));
4132 insn = gen_rtx_PARALLEL (VOIDmode,
4133 gen_rtvec (2, insn, clobber));
4134 }
4135
4136 emit_insn (insn);
4137
4138 /* The source operands for every iteration after the first
4139 should be TMP. */
4140 src = tmp;
4141 }
4142
4143 return tmp;
4144}
4145
4146/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4147
4148int
4149gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4150 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4151{
4152 /* Always vectorize. */
4153 return 1;
4154}
4155
4156/* }}} */
4157/* {{{ md_reorg pass. */
4158
4159/* Identify VMEM instructions from their "type" attribute. */
4160
4161static bool
4162gcn_vmem_insn_p (attr_type type)
4163{
4164 switch (type)
4165 {
4166 case TYPE_MUBUF:
4167 case TYPE_MTBUF:
4168 case TYPE_FLAT:
4169 return true;
4170 case TYPE_UNKNOWN:
4171 case TYPE_SOP1:
4172 case TYPE_SOP2:
4173 case TYPE_SOPK:
4174 case TYPE_SOPC:
4175 case TYPE_SOPP:
4176 case TYPE_SMEM:
4177 case TYPE_DS:
4178 case TYPE_VOP2:
4179 case TYPE_VOP1:
4180 case TYPE_VOPC:
4181 case TYPE_VOP3A:
4182 case TYPE_VOP3B:
4183 case TYPE_VOP_SDWA:
4184 case TYPE_VOP_DPP:
4185 case TYPE_MULT:
4186 case TYPE_VMULT:
4187 return false;
4188 }
4189 gcc_unreachable ();
4190 return false;
4191}
4192
4193/* If INSN sets the EXEC register to a constant value, return the value,
4194 otherwise return zero. */
4195
4196static int64_t
4197gcn_insn_exec_value (rtx_insn *insn)
4198{
4199 if (!NONDEBUG_INSN_P (insn))
4200 return 0;
4201
4202 rtx pattern = PATTERN (insn);
4203
4204 if (GET_CODE (pattern) == SET)
4205 {
4206 rtx dest = XEXP (pattern, 0);
4207 rtx src = XEXP (pattern, 1);
4208
4209 if (GET_MODE (dest) == DImode
4210 && REG_P (dest) && REGNO (dest) == EXEC_REG
4211 && CONST_INT_P (src))
4212 return INTVAL (src);
4213 }
4214
4215 return 0;
4216}
4217
4218/* Sets the EXEC register before INSN to the value that it had after
4219 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4220 known, otherwise it returns zero. */
4221
4222static int64_t
4223gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4224 bool curr_exec_known, bool &last_exec_def_saved)
4225{
4226 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4227 rtx exec;
4228
4229 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4230
4231 if (exec_value)
4232 {
4233 /* If the EXEC value is a constant and it happens to be the same as the
4234 current EXEC value, the restore can be skipped. */
4235 if (curr_exec_known && exec_value == curr_exec)
4236 return exec_value;
4237
4238 exec = GEN_INT (exec_value);
4239 }
4240 else
4241 {
4242 /* If the EXEC value is not a constant, save it in a register after the
4243 point of definition. */
4244 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4245
4246 if (!last_exec_def_saved)
4247 {
4248 start_sequence ();
4249 emit_move_insn (exec_save_reg, exec_reg);
4250 rtx_insn *seq = get_insns ();
4251 end_sequence ();
4252
4253 emit_insn_after (seq, last_exec_def);
4254 if (dump_file && (dump_flags & TDF_DETAILS))
4255 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4256 INSN_UID (last_exec_def));
4257
4258 last_exec_def_saved = true;
4259 }
4260
4261 exec = exec_save_reg;
4262 }
4263
4264 /* Restore EXEC register before the usage. */
4265 start_sequence ();
4266 emit_move_insn (exec_reg, exec);
4267 rtx_insn *seq = get_insns ();
4268 end_sequence ();
4269 emit_insn_before (seq, insn);
4270
4271 if (dump_file && (dump_flags & TDF_DETAILS))
4272 {
4273 if (exec_value)
4274 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4275 exec_value, INSN_UID (insn));
4276 else
4277 fprintf (dump_file,
4278 "Restoring EXEC from saved value before insn %d.\n",
4279 INSN_UID (insn));
4280 }
4281
4282 return exec_value;
4283}
4284
4285/* Implement TARGET_MACHINE_DEPENDENT_REORG.
4286
4287 Ensure that pipeline dependencies and lane masking are set correctly. */
4288
4289static void
4290gcn_md_reorg (void)
4291{
4292 basic_block bb;
4293 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4294 rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG);
4295 rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG);
4296 regset_head live;
4297
4298 INIT_REG_SET (&live);
4299
4300 compute_bb_for_insn ();
4301
4302 if (!optimize)
4303 {
4304 split_all_insns ();
4305 if (dump_file && (dump_flags & TDF_DETAILS))
4306 {
4307 fprintf (dump_file, "After split:\n");
4308 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4309 }
4310
4311 /* Update data-flow information for split instructions. */
4312 df_insn_rescan_all ();
4313 }
4314
4315 df_analyze ();
4316
4317 /* This pass ensures that the EXEC register is set correctly, according
4318 to the "exec" attribute. However, care must be taken so that the
4319 value that reaches explicit uses of the EXEC register remains the
4320 same as before.
4321 */
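 /* Illustrative example: an instruction whose "exec" attribute is
    EXEC_FULL has "exec = -1" (all 64 lanes) emitted before it unless EXEC
    is already known to be -1, whereas an EXEC_SINGLE instruction only
    needs bit 0 of EXEC to be set.  */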
4322
4323 FOR_EACH_BB_FN (bb, cfun)
4324 {
4325 if (dump_file && (dump_flags & TDF_DETAILS))
4326 fprintf (dump_file, "BB %d:\n", bb->index);
4327
4328 rtx_insn *insn, *curr;
4329 rtx_insn *last_exec_def = BB_HEAD (bb);
4330 bool last_exec_def_saved = false;
4331 bool curr_exec_explicit = true;
4332 bool curr_exec_known = true;
4333 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4334 after last_exec_def is executed'. */
4335
4336 FOR_BB_INSNS_SAFE (bb, insn, curr)
4337 {
4338 if (!NONDEBUG_INSN_P (insn))
4339 continue;
4340
4341 if (GET_CODE (PATTERN (insn)) == USE
4342 || GET_CODE (PATTERN (insn)) == CLOBBER)
4343 continue;
4344
4345 HARD_REG_SET defs, uses;
4346 CLEAR_HARD_REG_SET (defs);
4347 CLEAR_HARD_REG_SET (uses);
4348 note_stores (insn, record_hard_reg_sets, &defs);
4349 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4350
4351 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4352 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4353 bool exec_used = (hard_reg_set_intersect_p
4354 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4355 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4356
4357 /* Check the instruction for implicit setting of EXEC via an
4358 attribute. */
4359 attr_exec exec_attr = get_attr_exec (insn);
4360 int64_t new_exec;
4361
4362 switch (exec_attr)
4363 {
4364 case EXEC_NONE:
4365 new_exec = 0;
4366 break;
4367
4368 case EXEC_SINGLE:
4369 /* Instructions that do not involve memory accesses only require
4370 bit 0 of EXEC to be set. */
4371 if (gcn_vmem_insn_p (get_attr_type (insn))
4372 || get_attr_type (insn) == TYPE_DS)
4373 new_exec = 1;
4374 else
4375 new_exec = curr_exec | 1;
4376 break;
4377
4378 case EXEC_FULL:
4379 new_exec = -1;
4380 break;
4381
4382 default: /* Auto-detect what setting is appropriate. */
4383 {
4384 new_exec = 0;
4385
4386 /* If EXEC is referenced explicitly then we don't need to do
4387 anything to set it, so we're done. */
4388 if (exec_used)
4389 break;
4390
4391 /* Scan the insn for VGPRs defs or uses. The mode determines
4392 what kind of exec is needed. */
4393 subrtx_iterator::array_type array;
4394 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4395 {
4396 const_rtx x = *iter;
4397 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4398 {
4399 if (VECTOR_MODE_P (GET_MODE (x)))
4400 {
4401 new_exec = -1;
4402 break;
4403 }
4404 else
4405 new_exec = 1;
4406 }
4407 }
4408 }
4409 break;
4410 }
4411
4412 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4413 {
4414 start_sequence ();
4415 emit_move_insn (exec_reg, GEN_INT (new_exec));
4416 rtx_insn *seq = get_insns ();
4417 end_sequence ();
4418 emit_insn_before (seq, insn);
4419
4420 if (dump_file && (dump_flags & TDF_DETAILS))
4421 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4422 new_exec, INSN_UID (insn));
4423
4424 curr_exec = new_exec;
4425 curr_exec_explicit = false;
4426 curr_exec_known = true;
4427 }
4428 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4429 {
4430 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4431 new_exec, INSN_UID (insn));
4432 }
4433
4434 /* The state of the EXEC register is unknown after a
4435 function call. */
4436 if (CALL_P (insn))
4437 curr_exec_known = false;
4438
4439 /* Handle explicit uses of EXEC. If the instruction is a partial
4440 explicit definition of EXEC, then treat it as an explicit use of
4441 EXEC as well. */
4442 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4443 {
4444 /* An instruction that explicitly uses EXEC should not also
4445 implicitly define it. */
4446 gcc_assert (!exec_used || !new_exec);
4447
4448 if (!curr_exec_known || !curr_exec_explicit)
4449 {
4450 /* Restore the previous explicitly defined value. */
4451 curr_exec = gcn_restore_exec (insn, last_exec_def,
4452 curr_exec, curr_exec_known,
4453 last_exec_def_saved);
4454 curr_exec_explicit = true;
4455 curr_exec_known = true;
4456 }
4457 }
4458
4459 /* Handle explicit definitions of EXEC. */
4460 if (exec_lo_def_p || exec_hi_def_p)
4461 {
4462 last_exec_def = insn;
4463 last_exec_def_saved = false;
4464 curr_exec = gcn_insn_exec_value (insn);
4465 curr_exec_explicit = true;
4466 curr_exec_known = true;
4467
4468 if (dump_file && (dump_flags & TDF_DETAILS))
4469 fprintf (dump_file,
4470 "Found %s definition of EXEC at insn %d.\n",
4471 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4472 INSN_UID (insn));
4473 }
4474 }
4475
4476 COPY_REG_SET (&live, DF_LR_OUT (bb));
4477 df_simulate_initialize_backwards (bb, &live);
4478
4479 /* If EXEC is live after the basic block, restore the value of EXEC
4480 at the end of the block. */
4481 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4482 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4483 && (!curr_exec_known || !curr_exec_explicit))
4484 {
4485 rtx_insn *end_insn = BB_END (bb);
4486
4487 /* If the instruction is not a jump instruction, do the restore
4488 after the last instruction in the basic block. */
4489 if (NONJUMP_INSN_P (end_insn))
4490 end_insn = NEXT_INSN (end_insn);
4491
4492 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4493 curr_exec_known, last_exec_def_saved);
4494 }
4495 }
4496
4497 CLEAR_REG_SET (&live);
4498
4499 /* "Manually Inserted Wait States (NOPs)."
4500
4501 GCN hardware detects most kinds of register dependencies, but there
4502 are some exceptions documented in the ISA manual. This pass
4503 detects the missed cases, and inserts the documented number of NOPs
4504 required for correct execution. */
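 /* For example (illustrative, not part of the original comment): if a
    VALU instruction writes an SGPR and a VMEM instruction reads that SGPR
    with two other instructions in between, the checks below request
    5 - 2 = 3 extra NOPs so that the documented five wait states have
    elapsed before the VMEM access.  */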
4505
4506 const int max_waits = 5;
4507 struct ilist
4508 {
4509 rtx_insn *insn;
4510 attr_unit unit;
4511 attr_delayeduse delayeduse;
4512 HARD_REG_SET writes;
4513 HARD_REG_SET reads;
4514 int age;
4515 } back[max_waits];
4516 int oldest = 0;
4517 for (int i = 0; i < max_waits; i++)
4518 back[i].insn = NULL;
4519
4520 rtx_insn *insn, *last_insn = NULL;
4521 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4522 {
4523 if (!NONDEBUG_INSN_P (insn))
4524 continue;
4525
4526 if (GET_CODE (PATTERN (insn)) == USE
4527 || GET_CODE (PATTERN (insn)) == CLOBBER)
4528 continue;
4529
4530 attr_type itype = get_attr_type (insn);
4531 attr_unit iunit = get_attr_unit (insn);
4532 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4533 HARD_REG_SET ireads, iwrites;
4534 CLEAR_HARD_REG_SET (ireads);
4535 CLEAR_HARD_REG_SET (iwrites);
4536 note_stores (insn, record_hard_reg_sets, &iwrites);
4537 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4538
4539 /* Scan recent previous instructions for dependencies not handled in
4540 hardware. */
4541 int nops_rqd = 0;
4542 for (int i = oldest; i < oldest + max_waits; i++)
4543 {
4544 struct ilist *prev_insn = &back[i % max_waits];
4545
4546 if (!prev_insn->insn)
4547 continue;
4548
4549 /* VALU writes SGPR followed by VMEM reading the same SGPR
4550 requires 5 wait states. */
4551 if ((prev_insn->age + nops_rqd) < 5
4552 && prev_insn->unit == UNIT_VECTOR
4553 && gcn_vmem_insn_p (itype))
4554 {
4555 HARD_REG_SET regs;
4556 COPY_HARD_REG_SET (regs, prev_insn->writes);
4557 AND_HARD_REG_SET (regs, ireads);
4558 if (hard_reg_set_intersect_p
4559 (regs, reg_class_contents[(int) SGPR_REGS]))
4560 nops_rqd = 5 - prev_insn->age;
4561 }
4562
4563 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4564 requires 5 wait states. */
4565 if ((prev_insn->age + nops_rqd) < 5
4566 && prev_insn->unit == UNIT_VECTOR
4567 && iunit == UNIT_VECTOR
4568 && ((hard_reg_set_intersect_p
4569 (prev_insn->writes,
4570 reg_class_contents[(int) EXEC_MASK_REG])
4571 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4572 ||
4573 (hard_reg_set_intersect_p
4574 (prev_insn->writes,
4575 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4576 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4577 nops_rqd = 5 - prev_insn->age;
4578
4579 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4580 SGPR/VCC as lane select requires 4 wait states. */
4581 if ((prev_insn->age + nops_rqd) < 4
4582 && prev_insn->unit == UNIT_VECTOR
4583 && get_attr_laneselect (insn) == LANESELECT_YES)
4584 {
4585 HARD_REG_SET regs;
4586 COPY_HARD_REG_SET (regs, prev_insn->writes);
4587 AND_HARD_REG_SET (regs, ireads);
4588 if (hard_reg_set_intersect_p
4589 (regs, reg_class_contents[(int) SGPR_REGS])
4590 || hard_reg_set_intersect_p
4591 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4592 nops_rqd = 4 - prev_insn->age;
4593 }
4594
4595 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4596 requires 2 wait states. */
4597 if ((prev_insn->age + nops_rqd) < 2
4598 && prev_insn->unit == UNIT_VECTOR
4599 && itype == TYPE_VOP_DPP)
4600 {
4601 HARD_REG_SET regs;
4602 COPY_HARD_REG_SET (regs, prev_insn->writes);
4603 AND_HARD_REG_SET (regs, ireads);
4604 if (hard_reg_set_intersect_p
4605 (regs, reg_class_contents[(int) VGPR_REGS]))
4606 nops_rqd = 2 - prev_insn->age;
4607 }
4608
4609 /* Store that requires its input registers not be overwritten by the
4610 following instruction. */
4611 if ((prev_insn->age + nops_rqd) < 1
4612 && prev_insn->delayeduse == DELAYEDUSE_YES
4613 && ((hard_reg_set_intersect_p
4614 (prev_insn->reads, iwrites))))
4615 nops_rqd = 1 - prev_insn->age;
4616 }
4617
4618 /* Insert the required number of NOPs. */
4619 for (int i = nops_rqd; i > 0; i--)
4620 emit_insn_after (gen_nop (), last_insn);
4621
4622 /* Age the previous instructions. We can also ignore writes to
4623 registers subsequently overwritten. */
4624 HARD_REG_SET written;
4625 CLEAR_HARD_REG_SET (written);
4626 for (int i = oldest + max_waits - 1; i > oldest; i--)
4627 {
4628 struct ilist *prev_insn = &back[i % max_waits];
4629
4630 /* Assume all instructions are equivalent to one "wait", the same
4631 as s_nop. This is probably true for SALU, but not VALU (which
4632 may take longer), so this is not optimal. However, AMD do
4633 not publish the cycle times for instructions. */
4634 prev_insn->age += 1 + nops_rqd;
4635
4636 IOR_HARD_REG_SET (written, iwrites);
4637 AND_COMPL_HARD_REG_SET (prev_insn->writes, written);
4638 }
4639
4640 /* Track the current instruction as a previous instruction. */
4641 back[oldest].insn = insn;
4642 back[oldest].unit = iunit;
4643 back[oldest].delayeduse = idelayeduse;
4644 COPY_HARD_REG_SET (back[oldest].writes, iwrites);
4645 COPY_HARD_REG_SET (back[oldest].reads, ireads);
4646 back[oldest].age = 0;
4647 oldest = (oldest + 1) % max_waits;
4648
4649 last_insn = insn;
4650 }
4651}
4652
4653/* }}} */
4654/* {{{ OpenACC / OpenMP. */
4655
4656#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
4657#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
4658#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
4659
4660/* Implement TARGET_GOACC_VALIDATE_DIMS.
4661
4662 Check the launch dimensions provided for an OpenACC compute
4663 region, or routine. */
4664
4665static bool
4666gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
4667 unsigned /*used*/)
4668{
4669 bool changed = false;
4670
4671 /* FIXME: remove -facc-experimental-workers when they're ready. */
4672 int max_workers = flag_worker_partitioning ? 16 : 1;
4673
4674 /* The vector size must appear to be 64, to the user, unless this is a
4675 SEQ routine. The real, internal value is always 1, which means use
4676 autovectorization, but the user should not see that. */
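 /* Illustrative example: a user-supplied vector_length (128) triggers the
    "using vector_length (64), ignoring 128" warning below, and the
    dimension is then replaced by the internal value 1.  */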
4677 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4678 && dims[GOMP_DIM_VECTOR] >= 0)
4679 {
4680 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
4681 && dims[GOMP_DIM_VECTOR] != 64)
4682 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4683 OPT_Wopenacc_dims,
4684 (dims[GOMP_DIM_VECTOR]
4685 ? G_("using vector_length (64), ignoring %d")
4686 : G_("using vector_length (64), "
4687 "ignoring runtime setting")),
4688 dims[GOMP_DIM_VECTOR]);
4689 dims[GOMP_DIM_VECTOR] = 1;
4690 changed = true;
4691 }
4692
4693 /* Check the num workers is not too large. */
4694 if (dims[GOMP_DIM_WORKER] > max_workers)
4695 {
4696 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
4697 OPT_Wopenacc_dims,
4698 "using num_workers (%d), ignoring %d",
4699 max_workers, dims[GOMP_DIM_WORKER]);
4700 dims[GOMP_DIM_WORKER] = max_workers;
4701 changed = true;
4702 }
4703
4704 /* Set global defaults. */
4705 if (!decl)
4706 {
4707 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
4708 if (dims[GOMP_DIM_WORKER] < 0)
4709 dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
4710 ? GCN_DEFAULT_WORKERS : 1);
4711 if (dims[GOMP_DIM_GANG] < 0)
4712 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
4713 changed = true;
4714 }
4715
4716 return changed;
4717}
4718
4719/* Helper function for oacc_dim_size instruction.
4720 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
4721
4722rtx
4723gcn_oacc_dim_size (int dim)
4724{
4725 if (dim < 0 || dim > 2)
4726 error ("offload dimension out of range (%d)", dim);
4727
4728 /* Vectors are a special case. */
4729 if (dim == 2)
4730 return const1_rtx; /* Think of this as 1 times 64. */
4731
4732 static int offset[] = {
4733 /* Offsets into dispatch packet. */
4734 12, /* X dim = Gang / Team / Work-group. */
4735 20, /* Z dim = Worker / Thread / Wavefront. */
4736 16 /* Y dim = Vector / SIMD / Work-item. */
4737 };
4738 rtx addr = gen_rtx_PLUS (DImode,
4739 gen_rtx_REG (DImode,
4740 cfun->machine->args.
4741 reg[DISPATCH_PTR_ARG]),
4742 GEN_INT (offset[dim]));
4743 return gen_rtx_MEM (SImode, addr);
4744}
4745
4746/* Helper function for oacc_dim_pos instruction.
4747 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
4748
4749rtx
4750gcn_oacc_dim_pos (int dim)
4751{
4752 if (dim < 0 || dim > 2)
4753 error ("offload dimension out of range (%d)", dim);
4754
4755 static const int reg[] = {
4756 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
4757 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
4758 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
4759 };
4760
4761 int reg_num = cfun->machine->args.reg[reg[dim]];
4762
4763 /* The information must have been requested by the kernel. */
4764 gcc_assert (reg_num >= 0);
4765
4766 return gen_rtx_REG (SImode, reg_num);
4767}
4768
4769/* Implement TARGET_GOACC_FORK_JOIN. */
4770
4771static bool
4772gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
4773 bool ARG_UNUSED (is_fork))
4774{
4775 /* GCN does not use the fork/join concept invented for NVPTX.
4776 Instead we use standard autovectorization. */
4777 return false;
4778}
4779
4780/* Implement ???????
4781 FIXME make this a real hook.
4782
4783 Adjust FNDECL such that options inherited from the host compiler
4784 are made appropriate for the accelerator compiler. */
4785
4786void
4787gcn_fixup_accel_lto_options (tree fndecl)
4788{
4789 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4790 if (!func_optimize)
4791 return;
4792
4793 tree old_optimize = build_optimization_node (&global_options);
4794 tree new_optimize;
4795
4796 /* If the function changed the optimization levels as well as
4797 setting target options, start with the optimizations
4798 specified. */
4799 if (func_optimize != old_optimize)
4800 cl_optimization_restore (&global_options,
4801 TREE_OPTIMIZATION (func_optimize));
4802
4803 gcn_option_override ();
4804
4805 /* The target attributes may also change some optimization flags,
4806 so update the optimization options if necessary. */
4807 new_optimize = build_optimization_node (&global_options);
4808
4809 if (old_optimize != new_optimize)
4810 {
4811 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4812 cl_optimization_restore (&global_options,
4813 TREE_OPTIMIZATION (old_optimize));
4814 }
4815}
4816
4817/* }}} */
4818/* {{{ ASM Output. */
4819
4820/* Implement TARGET_ASM_FILE_START.
4821
4822 Print assembler file header text. */
4823
4824static void
4825output_file_start (void)
4826{
4827 fprintf (asm_out_file, "\t.text\n");
4828 fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
4829 fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
4830 fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
4831 fprintf (asm_out_file, "\t.text\n");
4832}
4833
4834/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
4835
4836 Print the initial definition of a function name.
4837
4838 For GCN kernel entry points this includes all the HSA meta-data, special
4839 alignment constraints that don't apply to regular functions, and magic
4840 comments that pass information to mkoffload. */
4841
4842void
4843gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
4844{
4845 int sgpr, vgpr;
4846 bool xnack_enabled = false;
4847 int extra_regs = 0;
4848
4849 if (cfun && cfun->machine && cfun->machine->normal_function)
4850 {
4851 fputs ("\t.type\t", file);
4852 assemble_name (file, name);
4853 fputs (",@function\n", file);
4854 assemble_name (file, name);
4855 fputs (":\n", file);
4856 return;
4857 }
4858
4859 /* Determine count of sgpr/vgpr registers by looking for last
4860 one used. */
4861 for (sgpr = 101; sgpr >= 0; sgpr--)
4862 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
4863 break;
4864 sgpr++;
4865 for (vgpr = 255; vgpr >= 0; vgpr--)
4866 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
4867 break;
4868 vgpr++;
4869
4870 if (xnack_enabled)
4871 extra_regs = 6;
4872 if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
4873 || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
4874 extra_regs = 4;
4875 else if (df_regs_ever_live_p (VCC_LO_REG)
4876 || df_regs_ever_live_p (VCC_HI_REG))
4877 extra_regs = 2;
4878
4879 if (!leaf_function_p ())
4880 {
4881 /* We can't know how many registers function calls might use. */
4882 if (vgpr < 64)
4883 vgpr = 64;
4884 if (sgpr + extra_regs < 102)
4885 sgpr = 102 - extra_regs;
4886 }
4887
4888 fputs ("\t.align\t256\n", file);
4889 fputs ("\t.type\t", file);
4890 assemble_name (file, name);
4891 fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
4892 assemble_name (file, name);
4893 fputs ("\n", file);
4894 assemble_name (file, name);
4895 fputs (":\n", file);
4896 fprintf (file, "\t.amd_kernel_code_t\n"
4897 "\t\tkernel_code_version_major = 1\n"
4898 "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
4899 /* "\t\tmachine_version_major = 8\n"
4900 "\t\tmachine_version_minor = 0\n"
4901 "\t\tmachine_version_stepping = 1\n" */
4902 "\t\tkernel_code_entry_byte_offset = 256\n"
4903 "\t\tkernel_code_prefetch_byte_size = 0\n"
4904 "\t\tmax_scratch_backing_memory_byte_size = 0\n"
4905 "\t\tcompute_pgm_rsrc1_vgprs = %i\n"
4906 "\t\tcompute_pgm_rsrc1_sgprs = %i\n"
4907 "\t\tcompute_pgm_rsrc1_priority = 0\n"
4908 "\t\tcompute_pgm_rsrc1_float_mode = 192\n"
4909 "\t\tcompute_pgm_rsrc1_priv = 0\n"
4910 "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
4911 "\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
4912 "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
4913 /* We enable scratch memory. */
4914 "\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
4915 "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
4916 "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
4917 "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
4918 "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
4919 "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
4920 "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
4921 "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
4922 "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
4923 "\t\tcompute_pgm_rsrc2_excp_en = 0\n",
4924 (vgpr - 1) / 4,
4925 /* Must match wavefront_sgpr_count */
4926 (sgpr + extra_regs + 7) / 8 - 1,
4927 /* The total number of SGPR user data registers requested. This
4928 number must match the number of user data registers enabled. */
4929 cfun->machine->args.nsgprs);
4930 int reg = FIRST_SGPR_REG;
4931 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
4932 {
4933 int reg_first = -1;
4934 int reg_last;
4935 if ((cfun->machine->args.requested & (1 << a))
4936 && (gcn_kernel_arg_types[a].fixed_regno < 0))
4937 {
4938 reg_first = reg;
4939 reg_last = (reg_first
4940 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
4941 / UNITS_PER_WORD) - 1);
4942 reg = reg_last + 1;
4943 }
4944
4945 if (gcn_kernel_arg_types[a].header_pseudo)
4946 {
4947 fprintf (file, "\t\t%s = %i",
4948 gcn_kernel_arg_types[a].header_pseudo,
4949 (cfun->machine->args.requested & (1 << a)) != 0);
4950 if (reg_first != -1)
4951 {
4952 fprintf (file, " ; (");
4953 for (int i = reg_first; i <= reg_last; ++i)
4954 {
4955 if (i != reg_first)
4956 fprintf (file, ", ");
4957 fprintf (file, "%s", reg_names[i]);
4958 }
4959 fprintf (file, ")");
4960 }
4961 fprintf (file, "\n");
4962 }
4963 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
4964 && cfun->machine->args.requested & (1 << a))
4965 fprintf (file, "\t\t; %s = %i (%s)\n",
4966 gcn_kernel_arg_types[a].name,
4967 (cfun->machine->args.requested & (1 << a)) != 0,
4968 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
4969 }
4970 fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
4971 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
4972 ? 2
4973 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
4974 ? 1 : 0);
4975 fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
4976 "\t\tprivate_element_size = 1\n"
4977 "\t\tis_ptr64 = 1\n"
4978 "\t\tis_dynamic_callstack = 0\n"
4979 "\t\tis_debug_enabled = 0\n"
4980 "\t\tis_xnack_enabled = %i\n"
4981 "\t\tworkitem_private_segment_byte_size = %i\n"
4982 "\t\tworkgroup_group_segment_byte_size = %u\n"
4983 "\t\tgds_segment_byte_size = 0\n"
4984 "\t\tkernarg_segment_byte_size = %i\n"
4985 "\t\tworkgroup_fbarrier_count = 0\n"
4986 "\t\twavefront_sgpr_count = %i\n"
4987 "\t\tworkitem_vgpr_count = %i\n"
4988 "\t\treserved_vgpr_first = 0\n"
4989 "\t\treserved_vgpr_count = 0\n"
4990 "\t\treserved_sgpr_first = 0\n"
4991 "\t\treserved_sgpr_count = 0\n"
4992 "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
4993 "\t\tdebug_private_segment_buffer_sgpr = 0\n"
4994 "\t\tkernarg_segment_alignment = %i\n"
4995 "\t\tgroup_segment_alignment = 4\n"
4996 "\t\tprivate_segment_alignment = %i\n"
4997 "\t\twavefront_size = 6\n"
4998 "\t\tcall_convention = 0\n"
4999 "\t\truntime_loader_kernel_symbol = 0\n"
5000 "\t.end_amd_kernel_code_t\n", xnack_enabled,
5001 /* workitem_private_segment_bytes_size needs to be
5002 one 64th the wave-front stack size. */
5003 stack_size_opt / 64,
5004 LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
5005 /* Number of scalar registers used by a wavefront. This
5006 includes the special SGPRs for VCC, Flat Scratch (Base,
5007 Size) and XNACK (for GFX8 (VI)+). It does not include the
5008 16 SGPR added if a trap handler is enabled. Must match
5009 compute_pgm_rsrc1.sgprs. */
5010 sgpr + extra_regs, vgpr,
5011 cfun->machine->kernarg_segment_alignment,
5012 crtl->stack_alignment_needed / 8);
5013
5014 /* This comment is read by mkoffload. */
5015 if (flag_openacc)
5016 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5017 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5018 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5019 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5020}
5021
5022/* Implement TARGET_ASM_SELECT_SECTION.
5023
5024 Return the section into which EXP should be placed. */
5025
5026static section *
5027gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5028{
5029 if (TREE_TYPE (exp) != error_mark_node
5030 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5031 {
5032 if (!DECL_P (exp))
5033 return get_section (".lds_bss",
5034 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5035 NULL);
5036
5037 return get_named_section (exp, ".lds_bss", reloc);
5038 }
5039
5040 return default_elf_select_section (exp, reloc, align);
5041}
5042
5043/* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5044
5045 Emits custom text into the assembler file at the head of each function. */
5046
5047static void
5048gcn_target_asm_function_prologue (FILE *file)
5049{
5050 machine_function *offsets = gcn_compute_frame_offsets ();
5051
5052 asm_fprintf (file, "\t; using %s addressing in function\n",
5053 offsets->use_flat_addressing ? "flat" : "global");
5054
5055 if (offsets->normal_function)
5056 {
5057 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5058 offsets->need_frame_pointer ? "true" : "false");
5059 asm_fprintf (file, "\t; lr needs saving: %s\n",
5060 offsets->lr_needs_saving ? "true" : "false");
5061 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5062 offsets->outgoing_args_size);
5063 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5064 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5065 asm_fprintf (file, "\t; callee save size: %wd\n",
5066 offsets->callee_saves);
5067 }
5068 else
5069 {
5070 asm_fprintf (file, "\t; HSA kernel entry point\n");
5071 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5072 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5073 offsets->outgoing_args_size);
5074
5075 /* Enable denorms. */
5076 asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
5077 " input and output denorms\n");
5078 asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
5079 }
5080}
5081
5082/* Helper function for print_operand and print_operand_address.
5083
5084 Print a register as the assembler requires, according to mode and name. */
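/* Illustrative examples (not from the original comment): a DImode value in
   SGPR s10 is printed as "s[10:11]", a V64DFmode value in VGPR v4 as
   "v[4:5]", and a TImode value in s16 as "s[16:19]".  */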
5085
5086static void
5087print_reg (FILE *file, rtx x)
5088{
5089 machine_mode mode = GET_MODE (x);
5090 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5091 || mode == HFmode || mode == SFmode
5092 || mode == V64SFmode || mode == V64SImode
5093 || mode == V64QImode || mode == V64HImode)
5094 fprintf (file, "%s", reg_names[REGNO (x)]);
5095 else if (mode == DImode || mode == V64DImode
5096 || mode == DFmode || mode == V64DFmode)
5097 {
5098 if (SGPR_REGNO_P (REGNO (x)))
5099 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5100 REGNO (x) - FIRST_SGPR_REG + 1);
5101 else if (VGPR_REGNO_P (REGNO (x)))
5102 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5103 REGNO (x) - FIRST_VGPR_REG + 1);
5104 else if (REGNO (x) == FLAT_SCRATCH_REG)
5105 fprintf (file, "flat_scratch");
5106 else if (REGNO (x) == EXEC_REG)
5107 fprintf (file, "exec");
5108 else if (REGNO (x) == VCC_LO_REG)
5109 fprintf (file, "vcc");
5110 else
5111 fprintf (file, "[%s:%s]",
5112 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5113 }
5114 else if (mode == TImode)
5115 {
5116 if (SGPR_REGNO_P (REGNO (x)))
5117 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5118 REGNO (x) - FIRST_SGPR_REG + 3);
5119 else if (VGPR_REGNO_P (REGNO (x)))
5120 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5121 REGNO (x) - FIRST_VGPR_REG + 3);
5122 else
5123 gcc_unreachable ();
5124 }
5125 else
5126 gcc_unreachable ();
5127}
5128
5129/* Implement TARGET_SECTION_TYPE_FLAGS.
5130
5131 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5132
5133static unsigned int
5134gcn_section_type_flags (tree decl, const char *name, int reloc)
5135{
5136 if (strcmp (name, ".lds_bss") == 0)
5137 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5138
5139 return default_section_type_flags (decl, name, reloc);
5140}
5141
5142/* Helper function for gcn_asm_output_symbol_ref.
5143
5144 FIXME: If we want to have propagation blocks allocated separately and
5145 statically like this, it would be better done via symbol refs and the
5146 assembler/linker. This is a temporary hack. */
5147
5148static void
5149gcn_print_lds_decl (FILE *f, tree var)
5150{
5151 int *offset;
5152 machine_function *machfun = cfun->machine;
5153
5154 if ((offset = machfun->lds_allocs->get (var)))
5155 fprintf (f, "%u", (unsigned) *offset);
5156 else
5157 {
5158 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5159 tree type = TREE_TYPE (var);
5160 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5161 if (size > align && size > 4 && align < 8)
5162 align = 8;
5163
5164 machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
5165 & ~(align - 1));
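 /* Illustrative example: with lds_allocated == 13 and align == 8, the
    expression above rounds the allocation pointer up to 16, which then
    becomes the new variable's offset.  */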
5166
5167 machfun->lds_allocs->put (var, machfun->lds_allocated);
5168 fprintf (f, "%u", machfun->lds_allocated);
5169 machfun->lds_allocated += size;
5170 if (machfun->lds_allocated > LDS_SIZE)
5171 error ("local data-share memory exhausted");
5172 }
5173}
5174
5175/* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5176
5177void
5178gcn_asm_output_symbol_ref (FILE *file, rtx x)
5179{
5180 tree decl;
5181 if ((decl = SYMBOL_REF_DECL (x)) != 0
5182 && TREE_CODE (decl) == VAR_DECL
5183 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5184 {
5185 /* LDS symbols (emitted using this hook) are only used at present
5186 to propagate worker values from an active thread to neutered
5187 threads. Use the same offset for each such block, but don't
5188 use zero because null pointers are used to identify the active
5189 thread in GOACC_single_copy_start calls. */
5190 gcn_print_lds_decl (file, decl);
5191 }
5192 else
5193 {
5194 assemble_name (file, XSTR (x, 0));
5195 /* FIXME: See above -- this condition is unreachable. */
5196 if ((decl = SYMBOL_REF_DECL (x)) != 0
5197 && TREE_CODE (decl) == VAR_DECL
5198 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5199 fputs ("@abs32", file);
5200 }
5201}
5202
5203/* Implement TARGET_CONSTANT_ALIGNMENT.
5204
5205 Returns the alignment in bits of a constant that is being placed in memory.
5206 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5207 would ordinarily have. */
5208
5209static HOST_WIDE_INT
5210gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5211 HOST_WIDE_INT basic_align)
5212{
5213 return basic_align > 128 ? basic_align : 128;
5214}
5215
5216/* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5217
5218void
5219print_operand_address (FILE *file, rtx mem)
5220{
5221 gcc_assert (MEM_P (mem));
5222
5223 rtx reg;
5224 rtx offset;
5225 addr_space_t as = MEM_ADDR_SPACE (mem);
5226 rtx addr = XEXP (mem, 0);
5227 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5228
5229 if (AS_SCRATCH_P (as))
5230 switch (GET_CODE (addr))
5231 {
5232 case REG:
5233 print_reg (file, addr);
5234 break;
5235
5236 case PLUS:
5237 reg = XEXP (addr, 0);
5238 offset = XEXP (addr, 1);
5239 print_reg (file, reg);
5240 if (GET_CODE (offset) == CONST_INT)
5241 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5242 else
5243 abort ();
5244 break;
5245
5246 default:
5247 debug_rtx (addr);
5248 abort ();
5249 }
5250 else if (AS_ANY_FLAT_P (as))
5251 {
5252 if (GET_CODE (addr) == REG)
5253 print_reg (file, addr);
5254 else
5255 {
5256 gcc_assert (TARGET_GCN5_PLUS);
5257 print_reg (file, XEXP (addr, 0));
5258 }
5259 }
5260 else if (AS_GLOBAL_P (as))
5261 {
5262 gcc_assert (TARGET_GCN5_PLUS);
5263
5264 rtx base = addr;
5265 rtx vgpr_offset = NULL_RTX;
5266
5267 if (GET_CODE (addr) == PLUS)
5268 {
5269 base = XEXP (addr, 0);
5270
5271 if (GET_CODE (base) == PLUS)
5272 {
5273 /* (SGPR + VGPR) + CONST */
5274 vgpr_offset = XEXP (base, 1);
5275 base = XEXP (base, 0);
5276 }
5277 else
5278 {
5279 rtx offset = XEXP (addr, 1);
5280
5281 if (REG_P (offset))
5282 /* SGPR + VGPR */
5283 vgpr_offset = offset;
5284 else if (CONST_INT_P (offset))
5285 /* VGPR + CONST or SGPR + CONST */
5286 ;
5287 else
5288 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5289 }
5290 }
5291
5292 if (REG_P (base))
5293 {
5294 if (VGPR_REGNO_P (REGNO (base)))
5295 print_reg (file, base);
5296 else if (SGPR_REGNO_P (REGNO (base)))
5297 {
5298 /* The assembler requires a 64-bit VGPR pair here, even though
5299 the offset should be only 32-bit. */
5300 if (vgpr_offset == NULL_RTX)
5301 /* In this case, the vector offset is zero, so we use v0,
5302 which is initialized by the kernel prologue to zero. */
5303 fprintf (file, "v[0:1]");
5304 else if (REG_P (vgpr_offset)
5305 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5306 {
5307 fprintf (file, "v[%d:%d]",
5308 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5309 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5310 }
5311 else
5312 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5313 }
5314 }
5315 else
5316 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5317 }
5318 else if (AS_ANY_DS_P (as))
5319 switch (GET_CODE (addr))
5320 {
5321 case REG:
5322 print_reg (file, addr);
5323 break;
5324
5325 case PLUS:
5326 reg = XEXP (addr, 0);
5327 print_reg (file, reg);
5328 break;
5329
5330 default:
5331 debug_rtx (addr);
5332 abort ();
5333 }
5334 else
5335 switch (GET_CODE (addr))
5336 {
5337 case REG:
5338 print_reg (file, addr);
5339 fprintf (file, ", 0");
5340 break;
5341
5342 case PLUS:
5343 reg = XEXP (addr, 0);
5344 offset = XEXP (addr, 1);
5345 print_reg (file, reg);
5346 fprintf (file, ", ");
5347 if (GET_CODE (offset) == REG)
5348 print_reg (file, offset);
5349 else if (GET_CODE (offset) == CONST_INT)
5350 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5351 else
5352 abort ();
5353 break;
5354
5355 default:
5356 debug_rtx (addr);
5357 abort ();
5358 }
5359}
5360
5361/* Implement PRINT_OPERAND via gcn.h.
5362
5363 b - print operand size as untyped operand (b8/b16/b32/b64)
5364 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5365 i - print operand size as signed integer suffix (i16/i32/i64)
5366 u - print operand size as unsigned integer suffix (u16/u32/u64)
5367 o - print operand size as memory access size for loads
5368 (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
5369 s - print operand size as memory access size for stores
5370 (byte/short/dword/dwordx2/dwordx3/dwordx4)
5371 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5372 c - print inverse conditional code for s_cbranch
5373 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5374 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5375 A - print address in formatting suitable for given address space.
5376 O - print offset:n for data share operations.
5377 ^ - print "_co" suffix for GCN5 mnemonics
5378 g - print "glc", if appropriate for given MEM
5379 */
5380
5381void
5382print_operand (FILE *file, rtx x, int code)
5383{
5384 int xcode = x ? GET_CODE (x) : 0;
5385 bool invert = false;
5386 switch (code)
5387 {
5388 /* Instructions have the following suffixes.
5389 If there are two suffixes, the first is the destination type,
5390 and the second is the source type.
5391
5392 B32 Bitfield (untyped data) 32-bit
5393 B64 Bitfield (untyped data) 64-bit
5394 F16 floating-point 16-bit
5395 F32 floating-point 32-bit (IEEE 754 single-precision float)
5396 F64 floating-point 64-bit (IEEE 754 double-precision float)
5397 I16 signed 16-bit integer
5398 I32 signed 32-bit integer
5399 I64 signed 64-bit integer
5400 U16 unsigned 16-bit integer
5401 U32 unsigned 32-bit integer
5402 U64 unsigned 64-bit integer */
5403
5404 /* Print operand size as untyped suffix. */
5405 case 'b':
5406 {
5407 const char *s = "";
5408 machine_mode mode = GET_MODE (x);
5409 if (VECTOR_MODE_P (mode))
5410 mode = GET_MODE_INNER (mode);
5411 switch (GET_MODE_SIZE (mode))
5412 {
5413 case 1:
5414 s = "_b8";
5415 break;
5416 case 2:
5417 s = "_b16";
5418 break;
5419 case 4:
5420 s = "_b32";
5421 break;
5422 case 8:
5423 s = "_b64";
5424 break;
5425 default:
5426 output_operand_lossage ("invalid operand %%xn code");
5427 return;
5428 }
5429 fputs (s, file);
5430 }
5431 return;
5432 case 'B':
5433 {
5434 const char *s = "";
5435 machine_mode mode = GET_MODE (x);
5436 if (VECTOR_MODE_P (mode))
5437 mode = GET_MODE_INNER (mode);
5438 switch (GET_MODE_SIZE (mode))
5439 {
5440 case 1:
5441 case 2:
5442 case 4:
5443 s = "_b32";
5444 break;
5445 case 8:
5446 s = "_b64";
5447 break;
5448 default:
5449 output_operand_lossage ("invalid operand %%xn code");
5450 return;
5451 }
5452 fputs (s, file);
5453 }
5454 return;
5455 case 'e':
5456 fputs ("sext(", file);
5457 print_operand (file, x, 0);
5458 fputs (")", file);
5459 return;
5460 case 'i':
5461 case 'u':
5462 {
5463 bool signed_p = code == 'i';
5464 const char *s = "";
5465 machine_mode mode = GET_MODE (x);
5466 if (VECTOR_MODE_P (mode))
5467 mode = GET_MODE_INNER (mode);
5468 if (mode == VOIDmode)
5469 switch (GET_CODE (x))
5470 {
5471 case CONST_INT:
5472 s = signed_p ? "_i32" : "_u32";
5473 break;
5474 case CONST_DOUBLE:
5475 s = "_f64";
5476 break;
5477 default:
5478 output_operand_lossage ("invalid operand %%xn code");
5479 return;
5480 }
5481 else if (FLOAT_MODE_P (mode))
5482 switch (GET_MODE_SIZE (mode))
5483 {
5484 case 2:
5485 s = "_f16";
5486 break;
5487 case 4:
5488 s = "_f32";
5489 break;
5490 case 8:
5491 s = "_f64";
5492 break;
5493 default:
5494 output_operand_lossage ("invalid operand %%xn code");
5495 return;
5496 }
5497 else
5498 switch (GET_MODE_SIZE (mode))
5499 {
5500 case 1:
5501 s = signed_p ? "_i8" : "_u8";
5502 break;
5503 case 2:
5504 s = signed_p ? "_i16" : "_u16";
5505 break;
5506 case 4:
5507 s = signed_p ? "_i32" : "_u32";
5508 break;
5509 case 8:
5510 s = signed_p ? "_i64" : "_u64";
5511 break;
5512 default:
5513 output_operand_lossage ("invalid operand %%xn code");
5514 return;
5515 }
5516 fputs (s, file);
5517 }
5518 return;
5519 /* Print operand size as memory access size suffix (loads). */
5520 case 'o':
5521 {
5522 const char *s = 0;
5523 switch (GET_MODE_SIZE (GET_MODE (x)))
5524 {
5525 case 1:
5526 s = "_ubyte";
5527 break;
5528 case 2:
5529 s = "_ushort";
5530 break;
5531 /* The following are full-vector variants. */
5532 case 64:
5533 s = "_ubyte";
5534 break;
5535 case 128:
5536 s = "_ushort";
5537 break;
5538 }
5539
5540 if (s)
5541 {
5542 fputs (s, file);
5543 return;
5544 }
5545
5546 /* Fall-through - the other cases for 'o' are the same as for 's'. */
5547 gcc_fallthrough();
5548 }
5549 case 's':
5550 {
5551 const char *s = "";
5552 switch (GET_MODE_SIZE (GET_MODE (x)))
5553 {
5554 case 1:
5555 s = "_byte";
5556 break;
5557 case 2:
5558 s = "_short";
5559 break;
5560 case 4:
5561 s = "_dword";
5562 break;
5563 case 8:
5564 s = "_dwordx2";
5565 break;
5566 case 12:
5567 s = "_dwordx3";
5568 break;
5569 case 16:
5570 s = "_dwordx4";
5571 break;
5572 case 32:
5573 s = "_dwordx8";
5574 break;
5575 case 64:
5576 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
5577 break;
5578 /* The following are full-vector variants. */
5579 case 128:
5580 s = "_short";
5581 break;
5582 case 256:
5583 s = "_dword";
5584 break;
5585 case 512:
5586 s = "_dwordx2";
5587 break;
5588 default:
5589 output_operand_lossage ("invalid operand %%xn code");
5590 return;
5591 }
5592 fputs (s, file);
5593 }
5594 return;
5595 case 'A':
5596 if (xcode != MEM)
5597 {
5598 output_operand_lossage ("invalid %%xn code");
5599 return;
5600 }
5601 print_operand_address (file, x);
5602 return;
5603 case 'O':
5604 {
5605 if (xcode != MEM)
5606 {
5607 output_operand_lossage ("invalid %%xn code");
5608 return;
5609 }
5610 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
5611 fprintf (file, " gds");
5612
5613 rtx x0 = XEXP (x, 0);
5614 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
5615 {
5616 gcc_assert (TARGET_GCN5_PLUS);
5617
5618 fprintf (file, ", ");
5619
5620 rtx base = x0;
5621 rtx const_offset = NULL_RTX;
5622
5623 if (GET_CODE (base) == PLUS)
5624 {
5625 rtx offset = XEXP (x0, 1);
5626 base = XEXP (x0, 0);
5627
5628 if (GET_CODE (base) == PLUS)
5629 /* (SGPR + VGPR) + CONST */
5630 /* Ignore the VGPR offset for this operand. */
5631 base = XEXP (base, 0);
5632
5633 if (CONST_INT_P (offset))
5634 const_offset = XEXP (x0, 1);
5635 else if (REG_P (offset))
5636 /* SGPR + VGPR */
5637 /* Ignore the VGPR offset for this operand. */
5638 ;
5639 else
5640 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5641 }
5642
5643 if (REG_P (base))
5644 {
5645 if (VGPR_REGNO_P (REGNO (base)))
5646 /* The VGPR address is specified in the %A operand. */
5647 fprintf (file, "off");
5648 else if (SGPR_REGNO_P (REGNO (base)))
5649 print_reg (file, base);
5650 else
5651 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5652 }
5653 else
5654 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5655
5656 if (const_offset != NULL_RTX)
5657 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
5658 INTVAL (const_offset));
5659
5660 return;
5661 }
5662
5663 if (GET_CODE (x0) == REG)
5664 return;
5665 if (GET_CODE (x0) != PLUS)
5666 {
5667 output_operand_lossage ("invalid %%xn code");
5668 return;
5669 }
5670 rtx val = XEXP (x0, 1);
5671 if (GET_CODE (val) == CONST_VECTOR)
5672 val = CONST_VECTOR_ELT (val, 0);
5673 if (GET_CODE (val) != CONST_INT)
5674 {
5675 output_operand_lossage ("invalid %%xn code");
5676 return;
5677 }
5678 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
5679
5680 }
5681 return;
5682 case 'c':
5683 invert = true;
5684 /* Fall through. */
5685 case 'C':
5686 {
5687 const char *s;
5688 bool num = false;
5689 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
5690 {
5691 output_operand_lossage ("invalid %%xn code");
5692 return;
5693 }
5694 switch (REGNO (XEXP (x, 0)))
5695 {
5696 case VCC_REG:
5697 case VCCZ_REG:
5698 s = "_vcc";
5699 break;
5700 case SCC_REG:
5701 /* For some reason llvm-mc insists on scc0 instead of sccz. */
5702 num = true;
5703 s = "_scc";
5704 break;
5705 case EXECZ_REG:
5706 s = "_exec";
5707 break;
5708 default:
5709 output_operand_lossage ("invalid %%xn code");
5710 return;
5711 }
5712 fputs (s, file);
5713 if (xcode == (invert ? NE : EQ))
5714 fputc (num ? '0' : 'z', file);
5715 else
5716 fputs (num ? "1" : "nz", file);
5717 return;
5718 }
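	    /* 'D': print a scalar comparison suffix such as _lt_i32 or
	       _ge_u64 (s_cmp can only do integer compares).  */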
5719 case 'D':
5720 {
5721 const char *s;
5722 bool cmp_signed = false;
5723 switch (xcode)
5724 {
5725 case EQ:
5726 s = "_eq_";
5727 break;
5728 case NE:
5729 s = "_lg_";
5730 break;
5731 case LT:
5732 s = "_lt_";
5733 cmp_signed = true;
5734 break;
5735 case LE:
5736 s = "_le_";
5737 cmp_signed = true;
5738 break;
5739 case GT:
5740 s = "_gt_";
5741 cmp_signed = true;
5742 break;
5743 case GE:
5744 s = "_ge_";
5745 cmp_signed = true;
5746 break;
5747 case LTU:
5748 s = "_lt_";
5749 break;
5750 case LEU:
5751 s = "_le_";
5752 break;
5753 case GTU:
5754 s = "_gt_";
5755 break;
5756 case GEU:
5757 s = "_ge_";
5758 break;
5759 default:
5760 output_operand_lossage ("invalid %%xn code");
5761 return;
5762 }
5763 fputs (s, file);
5764 fputc (cmp_signed ? 'i' : 'u', file);
5765
5766 machine_mode mode = GET_MODE (XEXP (x, 0));
5767
5768 if (mode == VOIDmode)
5769 mode = GET_MODE (XEXP (x, 1));
5770
5771 /* If both sides are constants, then assume the instruction is in
5772 SImode since s_cmp can only do integer compares. */
5773 if (mode == VOIDmode)
5774 mode = SImode;
5775
5776 switch (GET_MODE_SIZE (mode))
5777 {
5778 case 4:
5779 s = "32";
5780 break;
5781 case 8:
5782 s = "64";
5783 break;
5784 default:
5785 output_operand_lossage ("invalid operand %%xn code");
5786 return;
5787 }
5788 fputs (s, file);
5789 return;
5790 }
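	    /* 'E': print a comparison suffix that also handles floating-point
	       and vector modes, e.g. _lt_f32 or _neq_f64.  */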
5791 case 'E':
5792 {
5793 const char *s;
5794 bool cmp_signed = false;
5795 machine_mode mode = GET_MODE (XEXP (x, 0));
5796
5797 if (mode == VOIDmode)
5798 mode = GET_MODE (XEXP (x, 1));
5799
5800 /* If both sides are constants, assume the instruction is in SFmode
5801 if either operand is floating point, otherwise assume SImode. */
5802 if (mode == VOIDmode)
5803 {
5804 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
5805 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
5806 mode = SFmode;
5807 else
5808 mode = SImode;
5809 }
5810
5811 /* Use the same format code for vector comparisons. */
5812 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
5813 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
5814 mode = GET_MODE_INNER (mode);
5815
5816 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
5817
5818 switch (xcode)
5819 {
5820 case EQ:
5821 s = "_eq_";
5822 break;
5823 case NE:
5824 s = float_p ? "_neq_" : "_ne_";
5825 break;
5826 case LT:
5827 s = "_lt_";
5828 cmp_signed = true;
5829 break;
5830 case LE:
5831 s = "_le_";
5832 cmp_signed = true;
5833 break;
5834 case GT:
5835 s = "_gt_";
5836 cmp_signed = true;
5837 break;
5838 case GE:
5839 s = "_ge_";
5840 cmp_signed = true;
5841 break;
5842 case LTU:
5843 s = "_lt_";
5844 break;
5845 case LEU:
5846 s = "_le_";
5847 break;
5848 case GTU:
5849 s = "_gt_";
5850 break;
5851 case GEU:
5852 s = "_ge_";
5853 break;
5854 case ORDERED:
5855 s = "_o_";
5856 break;
5857 case UNORDERED:
5858 s = "_u_";
5859 break;
5860 default:
5861 output_operand_lossage ("invalid %%xn code");
5862 return;
5863 }
5864 fputs (s, file);
5865 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
5866
5867 switch (GET_MODE_SIZE (mode))
5868 {
5869 case 1:
5870 s = "32";
5871 break;
5872 case 2:
5873 s = float_p ? "16" : "32";
5874 break;
5875 case 4:
5876 s = "32";
5877 break;
5878 case 8:
5879 s = "64";
5880 break;
5881 default:
5882 output_operand_lossage ("invalid operand %%xn code");
5883 return;
5884 }
5885 fputs (s, file);
5886 return;
5887 }
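	    /* 'L'/'H': print the low or high part of a multi-register
	       operand.  */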
5888 case 'L':
5889 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
5890 return;
5891 case 'H':
5892 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
5893 return;
5894 case 'R':
5895 /* Print a scalar register number as an integer. Temporary hack. */
5896 gcc_assert (REG_P (x));
5897 fprintf (file, "%u", (int) REGNO (x));
5898 return;
5899 case 'V':
5900 /* Print a vector register number as an integer. Temporary hack. */
5901 gcc_assert (REG_P (x));
5902 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG);
5903 return;
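	    /* No code letter: print the operand in the default way for its
	       RTL class.  */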
5904 case 0:
5905 if (xcode == REG)
5906 print_reg (file, x);
5907 else if (xcode == MEM)
5908 output_address (GET_MODE (x), x);
5909 else if (xcode == CONST_INT)
5910 fprintf (file, "%i", (int) INTVAL (x));
5911 else if (xcode == CONST_VECTOR)
5912 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
5913 else if (xcode == CONST_DOUBLE)
5914 {
5915 const char *str;
5916 switch (gcn_inline_fp_constant_p (x, false))
5917 {
5918 case 240:
5919 str = "0.5";
5920 break;
5921 case 241:
5922 str = "-0.5";
5923 break;
5924 case 242:
5925 str = "1.0";
5926 break;
5927 case 243:
5928 str = "-1.0";
5929 break;
5930 case 244:
5931 str = "2.0";
5932 break;
5933 case 245:
5934 str = "-2.0";
5935 break;
5936 case 246:
5937 str = "4.0";
5938 break;
5939 case 247:
5940 str = "-4.0";
5941 break;
5942 case 248:
5943 str = "1/pi";
5944 break;
5945 default:
5946 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
5947 ? DImode : SImode,
5948 x, GET_MODE (x), 0);
5949		if (ix)
5950 print_operand (file, ix, code);
5951 else
5952		  output_operand_lossage ("invalid fp constant");
5953 return;
5954 break;
5955 }
5956	    fputs (str, file);
5957 return;
5958 }
5959 else
5960 output_addr_const (file, x);
5961 return;
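	    /* '^': append the "_co" (carry-out) mnemonic suffix used on
	       GCN5+.  */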
5962 case '^':
5963 if (TARGET_GCN5_PLUS)
5964 fputs ("_co", file);
5965 return;
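	    /* 'g': emit the " glc" cache-control modifier for volatile memory
	       accesses.  */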
5966 case 'g':
5967 gcc_assert (xcode == MEM);
5968 if (MEM_VOLATILE_P (x))
5969 fputs (" glc", file);
5970 return;
5971 default:
5972 output_operand_lossage ("invalid %%xn code");
5973 }
5974 gcc_unreachable ();
5975}
5976
5977/* }}} */
5978/* {{{ TARGET hook overrides. */
5979
5980#undef TARGET_ADDR_SPACE_ADDRESS_MODE
5981#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
5982#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
5983#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
5984 gcn_addr_space_legitimate_address_p
5985#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
5986#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
5987#undef TARGET_ADDR_SPACE_POINTER_MODE
5988#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
5989#undef TARGET_ADDR_SPACE_SUBSET_P
5990#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
5991#undef TARGET_ADDR_SPACE_CONVERT
5992#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
5993#undef TARGET_ARG_PARTIAL_BYTES
5994#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
5995#undef TARGET_ASM_ALIGNED_DI_OP
5996#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
5997#undef TARGET_ASM_FILE_START
5998#define TARGET_ASM_FILE_START output_file_start
5999#undef TARGET_ASM_FUNCTION_PROLOGUE
6000#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6001#undef TARGET_ASM_SELECT_SECTION
6002#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6003#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6004#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6005#undef TARGET_ATTRIBUTE_TABLE
6006#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6007#undef TARGET_BUILTIN_DECL
6008#define TARGET_BUILTIN_DECL gcn_builtin_decl
6009#undef TARGET_CAN_CHANGE_MODE_CLASS
6010#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6011#undef TARGET_CAN_ELIMINATE
6012#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6013#undef TARGET_CANNOT_COPY_INSN_P
6014#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6015#undef TARGET_CLASS_LIKELY_SPILLED_P
6016#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6017#undef TARGET_CLASS_MAX_NREGS
6018#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6019#undef TARGET_CONDITIONAL_REGISTER_USAGE
6020#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6021#undef TARGET_CONSTANT_ALIGNMENT
6022#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6023#undef TARGET_DEBUG_UNWIND_INFO
6024#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6025#undef TARGET_EMUTLS_VAR_INIT
6026#define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6027#undef TARGET_EXPAND_BUILTIN
6028#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6029#undef TARGET_FUNCTION_ARG
6030#define TARGET_FUNCTION_ARG gcn_function_arg
6031#undef TARGET_FUNCTION_ARG_ADVANCE
6032#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6033#undef TARGET_FUNCTION_VALUE
6034#define TARGET_FUNCTION_VALUE gcn_function_value
6035#undef TARGET_FUNCTION_VALUE_REGNO_P
6036#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6037#undef TARGET_GIMPLIFY_VA_ARG_EXPR
6038#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6039#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
6040#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
6041 gcn_goacc_adjust_propagation_record
6042#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
6043#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
6044#undef TARGET_GOACC_FORK_JOIN
6045#define TARGET_GOACC_FORK_JOIN gcn_fork_join
6046#undef TARGET_GOACC_REDUCTION
6047#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6048#undef TARGET_GOACC_VALIDATE_DIMS
6049#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6050#undef TARGET_GOACC_WORKER_PARTITIONING
6051#define TARGET_GOACC_WORKER_PARTITIONING true
6052#undef TARGET_HARD_REGNO_MODE_OK
6053#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6054#undef TARGET_HARD_REGNO_NREGS
6055#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6056#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6057#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6058#undef TARGET_INIT_BUILTINS
6059#define TARGET_INIT_BUILTINS gcn_init_builtins
6060#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6061#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6062 gcn_ira_change_pseudo_allocno_class
6063#undef TARGET_LEGITIMATE_CONSTANT_P
6064#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6065#undef TARGET_LRA_P
6066#define TARGET_LRA_P hook_bool_void_true
6067#undef TARGET_MACHINE_DEPENDENT_REORG
6068#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6069#undef TARGET_MEMORY_MOVE_COST
6070#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6071#undef TARGET_MODES_TIEABLE_P
6072#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6073#undef TARGET_OPTION_OVERRIDE
6074#define TARGET_OPTION_OVERRIDE gcn_option_override
6075#undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6076#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6077 gcn_pretend_outgoing_varargs_named
6078#undef TARGET_PROMOTE_FUNCTION_MODE
6079#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6080#undef TARGET_REGISTER_MOVE_COST
6081#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6082#undef TARGET_RETURN_IN_MEMORY
6083#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6084#undef TARGET_RTX_COSTS
6085#define TARGET_RTX_COSTS gcn_rtx_costs
6086#undef TARGET_SECONDARY_RELOAD
6087#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6088#undef TARGET_SECTION_TYPE_FLAGS
6089#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6090#undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6091#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6092 gcn_small_register_classes_for_mode_p
6093#undef TARGET_SPILL_CLASS
6094#define TARGET_SPILL_CLASS gcn_spill_class
6095#undef TARGET_STRICT_ARGUMENT_NAMING
6096#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6097#undef TARGET_TRAMPOLINE_INIT
6098#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6099#undef TARGET_TRULY_NOOP_TRUNCATION
6100#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6101#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6102#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6103#undef TARGET_VECTORIZE_GET_MASK_MODE
6104#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6105#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6106#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6107#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6108#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6109 gcn_preferred_vector_alignment
6110#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6111#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6112 gcn_vectorize_support_vector_misalignment
6113#undef TARGET_VECTORIZE_VEC_PERM_CONST
6114#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6115#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6116#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6117 gcn_vector_alignment_reachable
6118#undef TARGET_VECTOR_MODE_SUPPORTED_P
6119#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6120
6121struct gcc_target targetm = TARGET_INITIALIZER;
6122
6123#include "gt-gcn.h"
6124/* }}} */