1 /* Copyright (C) 2016-2022 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it under
4 the terms of the GNU General Public License as published by the Free
5 Software Foundation; either version 3 of the License, or (at your option)
6 any later version.
7
8 This file is distributed in the hope that it will be useful, but WITHOUT
9 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
11 for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with GCC; see the file COPYING3. If not see
15 <http://www.gnu.org/licenses/>. */
16
17 /* {{{ Includes. */
18
19 /* We want GET_MODE_SIZE et al to return integers, please. */
20 #define IN_TARGET_CODE 1
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "memmodel.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic-core.h"
38 #include "insn-attr.h"
39 #include "fold-const.h"
40 #include "calls.h"
41 #include "explow.h"
42 #include "expr.h"
43 #include "output.h"
44 #include "cfgrtl.h"
45 #include "langhooks.h"
46 #include "builtins.h"
47 #include "omp-general.h"
48 #include "print-rtl.h"
49 #include "attribs.h"
50 #include "varasm.h"
51 #include "intl.h"
52 #include "rtl-iter.h"
53 #include "dwarf2.h"
54 #include "gimple.h"
55
56 /* This file should be included last. */
57 #include "target-def.h"
58
59 /* }}} */
60 /* {{{ Global variables. */
61
62 /* Constants used by FP instructions. */
63
64 static REAL_VALUE_TYPE dconst4, dconst1over2pi;
65 static bool ext_gcn_constants_init = 0;
66
67 /* Holds the ISA variant, derived from the command line parameters. */
68
69 int gcn_isa = 3; /* Default to GCN3. */
70
71 /* Reserve this much space for LDS (for propagating variables from
72 worker-single mode to worker-partitioned mode), per workgroup. Global
73 analysis could calculate an exact bound, but we don't do that yet.
74
75 We want to permit full occupancy, so size accordingly. */
76
77 /* Use this as a default, but allow it to grow if the user requests a large
78 amount of gang-private shared-memory space. */
79 static int acc_lds_size = 0x600;
80
81 #define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
82 #define ACC_LDS_SIZE acc_lds_size
83 #define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
84
85 #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
86 : flag_openmp ? OMP_LDS_SIZE \
87 : OTHER_LDS_SIZE)
88
89 static int gang_private_hwm = 32;
90 static hash_map<tree, int> lds_allocs;
91
92 /* The number of registers usable by normal non-kernel functions.
93 The SGPR count includes any special extra registers such as VCC. */
94
95 #define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
96 #define MAX_NORMAL_VGPR_COUNT 24
97
98 /* }}} */
99 /* {{{ Initialization and options. */
100
101 /* Initialize machine_function. */
102
103 static struct machine_function *
104 gcn_init_machine_status (void)
105 {
106 struct machine_function *f;
107
108 f = ggc_cleared_alloc<machine_function> ();
109
110 if (TARGET_GCN3)
111 f->use_flat_addressing = true;
112
113 return f;
114 }
115
116 /* Implement TARGET_OPTION_OVERRIDE.
117
118 Override option settings where defaults are variable, or we have specific
119 needs to consider. */
120
121 static void
122 gcn_option_override (void)
123 {
124 init_machine_status = gcn_init_machine_status;
125
126 /* The HSA runtime does not respect ELF load addresses, so force PIE. */
127 if (!flag_pie)
128 flag_pie = 2;
129 if (!flag_pic)
130 flag_pic = flag_pie;
131
132 gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
133
134 /* The default stack size needs to be small for offload kernels because
135 there may be many, many threads. Also, a smaller stack gives a
136 measurable performance boost. But a small stack is insufficient
137 for running the testsuite, so we use a larger default for the
138 stand-alone case. */
139 if (stack_size_opt == -1)
140 {
141 if (flag_openacc || flag_openmp)
142 /* 512 bytes per work item = 32kB total. */
143 stack_size_opt = 512 * 64;
144 else
145 /* 1MB total. */
146 stack_size_opt = 1048576;
147 }
148
149 /* Reserve 1KB (somewhat arbitrarily) of LDS space for reduction results and
150 worker broadcasts. */
151 if (gang_private_size_opt == -1)
152 gang_private_size_opt = 512;
153 else if (gang_private_size_opt < gang_private_hwm)
154 gang_private_size_opt = gang_private_hwm;
155 else if (gang_private_size_opt >= acc_lds_size - 1024)
156 {
157 /* We need some space for reductions and worker broadcasting. If the
158 user requests a large amount of gang-private LDS space, we might not
159 have enough left for the former. Increase the LDS allocation in that
160 case, although this may reduce the maximum occupancy on the
161 hardware. */
162 acc_lds_size = gang_private_size_opt + 1024;
163 if (acc_lds_size > 32768)
164 acc_lds_size = 32768;
165 }
166
167 /* The xnack option is a placeholder, for now. */
168 if (flag_xnack)
169 sorry ("XNACK support");
170 }
171
172 /* }}} */
173 /* {{{ Attributes. */
174
175 /* This table defines the arguments that are permitted in
176 __attribute__ ((amdgpu_hsa_kernel (...))).
177
178 The names and values correspond to the HSA metadata that is encoded
179 into the assembler file and binary. */
180
181 static const struct gcn_kernel_arg_type
182 {
183 const char *name;
184 const char *header_pseudo;
185 machine_mode mode;
186
187 /* This should be set to -1 or -2 for a dynamically allocated register
188 number. Use -1 if this argument contributes to the user_sgpr_count,
189 -2 otherwise. */
190 int fixed_regno;
191 } gcn_kernel_arg_types[] = {
192 {"exec", NULL, DImode, EXEC_REG},
193 #define PRIVATE_SEGMENT_BUFFER_ARG 1
194 {"private_segment_buffer",
195 ".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
196 #define DISPATCH_PTR_ARG 2
197 {"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
198 #define QUEUE_PTR_ARG 3
199 {"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
200 #define KERNARG_SEGMENT_PTR_ARG 4
201 {"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
202 {"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
203 #define FLAT_SCRATCH_INIT_ARG 6
204 {"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
205 #define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
206 {"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
207 #define WORKGROUP_ID_X_ARG 8
208 {"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
209 {"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
210 {"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
211 {"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
212 #define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
213 {"private_segment_wave_offset",
214 ".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
215 #define WORK_ITEM_ID_X_ARG 13
216 {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
217 #define WORK_ITEM_ID_Y_ARG 14
218 {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
219 #define WORK_ITEM_ID_Z_ARG 15
220 {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
221 };
222
223 static const long default_requested_args
224 = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
225 | (1 << DISPATCH_PTR_ARG)
226 | (1 << QUEUE_PTR_ARG)
227 | (1 << KERNARG_SEGMENT_PTR_ARG)
228 | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
229 | (1 << WORKGROUP_ID_X_ARG)
230 | (1 << WORK_ITEM_ID_X_ARG)
231 | (1 << WORK_ITEM_ID_Y_ARG)
232 | (1 << WORK_ITEM_ID_Z_ARG);
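/* Illustrative usage sketch (editorial addition, not part of the GCC
   sources): a kernel might request dispatch arguments beyond the defaults
   above roughly like this, where my_kernel is a hypothetical name:

     __attribute__ ((amdgpu_hsa_kernel ("dispatch_id", "workgroup_id_Y")))
     void my_kernel (void);

   Each string must name an entry of gcn_kernel_arg_types[]; unknown names,
   or names already covered by default_requested_args, are diagnosed by
   gcn_parse_amdgpu_hsa_kernel_attribute below. */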
233
234 /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
235 This function also sets the default values for some arguments.
236
237 Return true on success, with ARGS populated. */
238
239 static bool
240 gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
241 tree list)
242 {
243 bool err = false;
244 args->requested = default_requested_args;
245 args->nargs = 0;
246
247 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
248 args->reg[a] = -1;
249
250 for (; list; list = TREE_CHAIN (list))
251 {
252 const char *str;
253 if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
254 {
255 error ("%<amdgpu_hsa_kernel%> attribute requires string constant "
256 "arguments");
257 break;
258 }
259 str = TREE_STRING_POINTER (TREE_VALUE (list));
260 int a;
261 for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
262 {
263 if (!strcmp (str, gcn_kernel_arg_types[a].name))
264 break;
265 }
266 if (a == GCN_KERNEL_ARG_TYPES)
267 {
268 error ("unknown specifier %qs in %<amdgpu_hsa_kernel%> attribute",
269 str);
270 err = true;
271 break;
272 }
273 if (args->requested & (1 << a))
274 {
275 error ("duplicated parameter specifier %qs in %<amdgpu_hsa_kernel%> "
276 "attribute", str);
277 err = true;
278 break;
279 }
280 args->requested |= (1 << a);
281 args->order[args->nargs++] = a;
282 }
283
284 /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
285 WORK_ITEM_ID_Y_ARG. Similarly, requesting WORK_ITEM_ID_Y_ARG implies
286 requesting WORK_ITEM_ID_X_ARG. */
287 if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
288 args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
289 if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
290 args->requested |= (1 << WORK_ITEM_ID_X_ARG);
291
292 int sgpr_regno = FIRST_SGPR_REG;
293 args->nsgprs = 0;
294 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
295 {
296 if (!(args->requested & (1 << a)))
297 continue;
298
299 if (gcn_kernel_arg_types[a].fixed_regno >= 0)
300 args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
301 else
302 {
303 int reg_count;
304
305 switch (gcn_kernel_arg_types[a].mode)
306 {
307 case E_SImode:
308 reg_count = 1;
309 break;
310 case E_DImode:
311 reg_count = 2;
312 break;
313 case E_TImode:
314 reg_count = 4;
315 break;
316 default:
317 gcc_unreachable ();
318 }
319 args->reg[a] = sgpr_regno;
320 sgpr_regno += reg_count;
321 if (gcn_kernel_arg_types[a].fixed_regno == -1)
322 args->nsgprs += reg_count;
323 }
324 }
325 if (sgpr_regno > FIRST_SGPR_REG + 16)
326 {
327 error ("too many arguments passed in sgpr registers");
328 }
329 return err;
330 }
331
332 /* Referenced by TARGET_ATTRIBUTE_TABLE.
333
334 Validates target-specific attributes. */
335
336 static tree
337 gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
338 tree args, int, bool *no_add_attrs)
339 {
340 if (!FUNC_OR_METHOD_TYPE_P (*node))
341 {
342 warning (OPT_Wattributes, "%qE attribute only applies to functions",
343 name);
344 *no_add_attrs = true;
345 return NULL_TREE;
346 }
347
348 /* Validate the arguments of the attribute. */
349 if (is_attribute_p ("gcnhsa_kernel", name))
350 {
351 struct gcn_kernel_args kernelarg;
352
353 if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
354 *no_add_attrs = true;
355
356 return NULL_TREE;
357 }
358
359 return NULL_TREE;
360 }
361
362 /* Implement TARGET_ATTRIBUTE_TABLE.
363
364 Create target-specific __attribute__ types. */
365
366 static const struct attribute_spec gcn_attribute_table[] = {
367 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
368 affects_type_identity } */
369 {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
370 true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
371 /* End element. */
372 {NULL, 0, 0, false, false, false, false, NULL, NULL}
373 };
374
375 /* }}} */
376 /* {{{ Registers and modes. */
377
378 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P. */
379
380 bool
381 gcn_scalar_mode_supported_p (scalar_mode mode)
382 {
383 return (mode == BImode
384 || mode == QImode
385 || mode == HImode /* || mode == HFmode */
386 || mode == SImode || mode == SFmode
387 || mode == DImode || mode == DFmode
388 || mode == TImode);
389 }
390
391 /* Implement TARGET_CLASS_MAX_NREGS.
392
393 Return the number of hard registers needed to hold a value of MODE in
394 a register of class RCLASS. */
395
396 static unsigned char
397 gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
398 {
399 /* Scalar registers are 32-bit; vector registers are in fact tuples of
400 64 lanes. */
401 if (rclass == VGPR_REGS)
402 {
403 if (vgpr_1reg_mode_p (mode))
404 return 1;
405 if (vgpr_2reg_mode_p (mode))
406 return 2;
407 /* TImode is used by DImode compare_and_swap. */
408 if (mode == TImode)
409 return 4;
410 }
411 else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
412 return 2;
413 return CEIL (GET_MODE_SIZE (mode), 4);
414 }
415
416 /* Implement TARGET_HARD_REGNO_NREGS.
417
418 Return the number of hard registers needed to hold a value of MODE in
419 REGNO. */
420
421 unsigned int
422 gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
423 {
424 return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
425 }
426
427 /* Implement TARGET_HARD_REGNO_MODE_OK.
428
429 Return true if REGNO can hold value in MODE. */
430
431 bool
432 gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
433 {
434 /* Treat a complex mode as if it were a scalar mode of the same overall
435 size for the purposes of allocating hard registers. */
436 if (COMPLEX_MODE_P (mode))
437 switch (mode)
438 {
439 case E_CQImode:
440 case E_CHImode:
441 mode = SImode;
442 break;
443 case E_CSImode:
444 mode = DImode;
445 break;
446 case E_CDImode:
447 mode = TImode;
448 break;
449 case E_HCmode:
450 mode = SFmode;
451 break;
452 case E_SCmode:
453 mode = DFmode;
454 break;
455 default:
456 /* Not supported. */
457 return false;
458 }
459
460 switch (regno)
461 {
462 case FLAT_SCRATCH_LO_REG:
463 case XNACK_MASK_LO_REG:
464 case TBA_LO_REG:
465 case TMA_LO_REG:
466 return (mode == SImode || mode == DImode);
467 case VCC_LO_REG:
468 case EXEC_LO_REG:
469 return (mode == BImode || mode == SImode || mode == DImode);
470 case M0_REG:
471 case FLAT_SCRATCH_HI_REG:
472 case XNACK_MASK_HI_REG:
473 case TBA_HI_REG:
474 case TMA_HI_REG:
475 return mode == SImode;
476 case VCC_HI_REG:
477 return false;
478 case EXEC_HI_REG:
479 return mode == SImode /*|| mode == V32BImode */ ;
480 case SCC_REG:
481 case VCCZ_REG:
482 case EXECZ_REG:
483 return mode == BImode;
484 }
485 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
486 return true;
487 if (SGPR_REGNO_P (regno))
488 /* We restrict double register values to aligned registers. */
489 return (sgpr_1reg_mode_p (mode)
490 || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
491 || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
492 if (VGPR_REGNO_P (regno))
493 /* Vector instructions do not care about the alignment of register
494 pairs, but where there is no 64-bit instruction, many of the
495 define_split do not work if the input and output registers partially
496 overlap. We tried to fix this with early clobber and match
497 constraints, but it was bug-prone, added complexity, and conflicted
498 with the 'U0' constraints on vec_merge.
499 Therefore, we restrict ourselves to aligned registers. */
500 return (vgpr_1reg_mode_p (mode)
501 || (!((regno - FIRST_VGPR_REG) & 1) && vgpr_2reg_mode_p (mode))
502 /* TImode is used by DImode compare_and_swap. */
503 || (mode == TImode
504 && !((regno - FIRST_VGPR_REG) & 3)));
505 return false;
506 }
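/* Editorial note, for illustration: the alignment rules above mean that a
   DImode value may live in the even-aligned pair s[0:1] or s[2:3] but never
   in the misaligned pair s[1:2], and a TImode value must start at a
   multiple-of-four register such as s[4:7]. VGPR pairs and quads follow the
   same restriction, per the comment above. */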
507
508 /* Implement REGNO_REG_CLASS via gcn.h.
509
510 Return smallest class containing REGNO. */
511
512 enum reg_class
513 gcn_regno_reg_class (int regno)
514 {
515 switch (regno)
516 {
517 case SCC_REG:
518 return SCC_CONDITIONAL_REG;
519 case VCC_LO_REG:
520 case VCC_HI_REG:
521 return VCC_CONDITIONAL_REG;
522 case VCCZ_REG:
523 return VCCZ_CONDITIONAL_REG;
524 case EXECZ_REG:
525 return EXECZ_CONDITIONAL_REG;
526 case EXEC_LO_REG:
527 case EXEC_HI_REG:
528 return EXEC_MASK_REG;
529 }
530 if (VGPR_REGNO_P (regno))
531 return VGPR_REGS;
532 if (SGPR_REGNO_P (regno))
533 return SGPR_REGS;
534 if (regno < FIRST_VGPR_REG)
535 return GENERAL_REGS;
536 if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
537 return AFP_REGS;
538 return ALL_REGS;
539 }
540
541 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.
542
543 GCC assumes that the lowpart contains the first part of the value as
544 stored in memory. This is not the case for vector registers. */
545
546 bool
547 gcn_can_change_mode_class (machine_mode from, machine_mode to,
548 reg_class_t regclass)
549 {
550 if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
551 return true;
552 return (gcn_class_max_nregs (regclass, from)
553 == gcn_class_max_nregs (regclass, to));
554 }
555
556 /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.
557
558 When this hook returns true for MODE, the compiler allows
559 registers explicitly used in the rtl to be used as spill registers
560 but prevents the compiler from extending the lifetime of these
561 registers. */
562
563 bool
564 gcn_small_register_classes_for_mode_p (machine_mode mode)
565 {
566 /* We allocate into exec and vcc regs. Those form small register classes. */
567 return mode == DImode || mode == SImode;
568 }
569
570 /* Implement TARGET_CLASS_LIKELY_SPILLED_P.
571
572 Returns true if pseudos that have been assigned to registers of class RCLASS
573 would likely be spilled because registers of RCLASS are needed for spill
574 registers. */
575
576 static bool
577 gcn_class_likely_spilled_p (reg_class_t rclass)
578 {
579 return (rclass == EXEC_MASK_REG
580 || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
581 }
582
583 /* Implement TARGET_MODES_TIEABLE_P.
584
585 Returns true if a value of MODE1 is accessible in MODE2 without
586 copying. */
587
588 bool
589 gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
590 {
591 return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
592 && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
593 }
594
595 /* Implement TARGET_TRULY_NOOP_TRUNCATION.
596
597 Returns true if it is safe to “convert” a value of INPREC bits to one of
598 OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
599 it as if it had only OUTPREC bits. */
600
601 bool
602 gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
603 {
604 return ((inprec <= 32) && (outprec <= inprec));
605 }
606
607 /* Return N-th part of value occupying multiple registers. */
608
609 rtx
610 gcn_operand_part (machine_mode mode, rtx op, int n)
611 {
612 if (GET_MODE_SIZE (mode) >= 256)
613 {
614 /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */
615
616 if (REG_P (op))
617 {
618 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
619 return gen_rtx_REG (V64SImode, REGNO (op) + n);
620 }
621 if (GET_CODE (op) == CONST_VECTOR)
622 {
623 int units = GET_MODE_NUNITS (mode);
624 rtvec v = rtvec_alloc (units);
625
626 for (int i = 0; i < units; ++i)
627 RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
628 CONST_VECTOR_ELT (op, i), n);
629
630 return gen_rtx_CONST_VECTOR (V64SImode, v);
631 }
632 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
633 return gcn_gen_undef (V64SImode);
634 gcc_unreachable ();
635 }
636 else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
637 {
638 gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
639 return gen_rtx_REG (SImode, REGNO (op) + n);
640 }
641 else
642 {
643 if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
644 return gcn_gen_undef (SImode);
645
646 /* If it's a constant then let's assume it is of the largest mode
647 available, otherwise simplify_gen_subreg will fail. */
648 if (mode == VOIDmode && CONST_INT_P (op))
649 mode = DImode;
650 return simplify_gen_subreg (SImode, op, mode, n * 4);
651 }
652 }
653
654 /* Return N-th part of value occupying multiple registers. */
655
656 rtx
657 gcn_operand_doublepart (machine_mode mode, rtx op, int n)
658 {
659 return simplify_gen_subreg (DImode, op, mode, n * 8);
660 }
661
662 /* Return true if OP can be split into subregs or high/low parts.
663 This is always true for scalars, but not normally true for vectors.
664 However, for vectors in hardregs we can use the low and high registers. */
665
666 bool
667 gcn_can_split_p (machine_mode, rtx op)
668 {
669 if (vgpr_vector_mode_p (GET_MODE (op)))
670 {
671 if (GET_CODE (op) == SUBREG)
672 op = SUBREG_REG (op);
673 if (!REG_P (op))
674 return true;
675 return REGNO (op) <= FIRST_PSEUDO_REGISTER;
676 }
677 return true;
678 }
679
680 /* Implement TARGET_SPILL_CLASS.
681
682 Return class of registers which could be used for pseudo of MODE
683 and of class RCLASS for spilling instead of memory. Return NO_REGS
684 if it is not possible or non-profitable. */
685
686 static reg_class_t
687 gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
688 {
689 if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c)
690 || c == VCC_CONDITIONAL_REG)
691 return SGPR_REGS;
692 else
693 return NO_REGS;
694 }
695
696 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
697
698 Change the allocno class of the given pseudo, based on the allocno class
699 and best class calculated by IRA. */
700
701 static reg_class_t
702 gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
703 reg_class_t best_cl)
704 {
705 /* Avoid returning classes that contain both vgpr and sgpr registers. */
706 if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
707 return cl;
708 if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
709 && best_cl != ALL_GPR_REGS)
710 return best_cl;
711
712 machine_mode mode = PSEUDO_REGNO_MODE (regno);
713 if (vgpr_vector_mode_p (mode))
714 return VGPR_REGS;
715
716 return GENERAL_REGS;
717 }
718
719 /* Create a new DImode pseudo reg and emit an instruction to initialize
720 it to VAL. */
721
722 static rtx
723 get_exec (int64_t val)
724 {
725 rtx reg = gen_reg_rtx (DImode);
726 emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
727 return reg;
728 }
729
730 /* Return value of scalar exec register. */
731
732 rtx
733 gcn_scalar_exec ()
734 {
735 return const1_rtx;
736 }
737
738 /* Return pseudo holding scalar exec register. */
739
740 rtx
741 gcn_scalar_exec_reg ()
742 {
743 return get_exec (1);
744 }
745
746 /* Return value of full exec register. */
747
748 rtx
749 gcn_full_exec ()
750 {
751 return constm1_rtx;
752 }
753
754 /* Return pseudo holding full exec register. */
755
756 rtx
757 gcn_full_exec_reg ()
758 {
759 return get_exec (-1);
760 }
761
762 /* }}} */
763 /* {{{ Immediate constants. */
764
765 /* Initialize shared numeric constants. */
766
767 static void
768 init_ext_gcn_constants (void)
769 {
770 real_from_integer (&dconst4, DFmode, 4, SIGNED);
771
772 /* FIXME: this constant probably does not match what hardware really loads.
773 Reality check it eventually. */
774 real_from_string (&dconst1over2pi,
775 "0.1591549430918953357663423455968866839");
776 real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);
777
778 ext_gcn_constants_init = 1;
779 }
780
781 /* Return non-zero if X is a constant that can appear as an inline operand.
782 This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, or 1/(2*pi),
783 or a vector of those.
784 The value returned is the encoding of this constant (or 1 for a uniform
     vector of them). */
785
786 int
787 gcn_inline_fp_constant_p (rtx x, bool allow_vector)
788 {
789 machine_mode mode = GET_MODE (x);
790
791 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
792 && allow_vector)
793 {
794 int n;
795 if (GET_CODE (x) != CONST_VECTOR)
796 return 0;
797 n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
798 if (!n)
799 return 0;
800 for (int i = 1; i < 64; i++)
801 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
802 return 0;
803 return 1;
804 }
805
806 if (mode != HFmode && mode != SFmode && mode != DFmode)
807 return 0;
808
809 const REAL_VALUE_TYPE *r;
810
811 if (x == CONST0_RTX (mode))
812 return 128;
813 if (x == CONST1_RTX (mode))
814 return 242;
815
816 r = CONST_DOUBLE_REAL_VALUE (x);
817
818 if (real_identical (r, &dconstm1))
819 return 243;
820
821 if (real_identical (r, &dconsthalf))
822 return 240;
823 if (real_identical (r, &dconstm1))
824 return 243;
825 if (real_identical (r, &dconst2))
826 return 244;
827 if (real_identical (r, &dconst4))
828 return 246;
829 if (real_identical (r, &dconst1over2pi))
830 return 248;
831 if (!ext_gcn_constants_init)
832 init_ext_gcn_constants ();
833 real_value_negate (r);
834 if (real_identical (r, &dconsthalf))
835 return 241;
836 if (real_identical (r, &dconst2))
837 return 245;
838 if (real_identical (r, &dconst4))
839 return 247;
840
841 /* FIXME: add 4, -4 and 1/(2*PI). */
842
843 return 0;
844 }
845
846 /* Return true if X is a constant that can appear as an immediate operand.
847 This is an inline constant (0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)),
848 any other HFmode or SFmode literal constant,
849 or a vector of those. */
850
851 bool
852 gcn_fp_constant_p (rtx x, bool allow_vector)
853 {
854 machine_mode mode = GET_MODE (x);
855
856 if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
857 && allow_vector)
858 {
859 int n;
860 if (GET_CODE (x) != CONST_VECTOR)
861 return false;
862 n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
863 if (!n)
864 return false;
865 for (int i = 1; i < 64; i++)
866 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
867 return false;
868 return true;
869 }
870 if (mode != HFmode && mode != SFmode && mode != DFmode)
871 return false;
872
873 if (gcn_inline_fp_constant_p (x, false))
874 return true;
875 /* FIXME: It is not clear how 32bit immediates are interpreted here. */
876 return (mode != DFmode);
877 }
878
879 /* Return true if X is a constant representable as an inline immediate
880 constant in a 32-bit instruction encoding. */
881
882 bool
883 gcn_inline_constant_p (rtx x)
884 {
885 if (GET_CODE (x) == CONST_INT)
886 return INTVAL (x) >= -16 && INTVAL (x) <= 64;
887 if (GET_CODE (x) == CONST_DOUBLE)
888 return gcn_inline_fp_constant_p (x, false);
889 if (GET_CODE (x) == CONST_VECTOR)
890 {
891 int n;
892 if (!vgpr_vector_mode_p (GET_MODE (x)))
893 return false;
894 n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
895 if (!n)
896 return false;
897 for (int i = 1; i < 64; i++)
898 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
899 return false;
900 return 1;
901 }
902 return false;
903 }
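/* Worked example (editorial): the integer inline range is -16..64
   inclusive, so GEN_INT (64) and GEN_INT (-16) are inline operands, while
   GEN_INT (65) or GEN_INT (-17) must be emitted as separate 32-bit
   literals. */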
904
905 /* Return true if X is a constant representable as an immediate constant
906 in a 32 or 64-bit instruction encoding. */
907
908 bool
909 gcn_constant_p (rtx x)
910 {
911 switch (GET_CODE (x))
912 {
913 case CONST_INT:
914 return true;
915
916 case CONST_DOUBLE:
917 return gcn_fp_constant_p (x, false);
918
919 case CONST_VECTOR:
920 {
921 int n;
922 if (!vgpr_vector_mode_p (GET_MODE (x)))
923 return false;
924 n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
925 if (!n)
926 return false;
927 for (int i = 1; i < 64; i++)
928 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
929 return false;
930 return true;
931 }
932
933 case SYMBOL_REF:
934 case LABEL_REF:
935 return true;
936
937 default:
938 ;
939 }
940
941 return false;
942 }
943
944 /* Return true if X is a constant representable as two inline immediate
945 constants in a 64-bit instruction that is split into two 32-bit
946 instructions.
947 When MIXED is set, the low-part is permitted to use the full 32-bits. */
948
949 bool
950 gcn_inline_constant64_p (rtx x, bool mixed)
951 {
952 if (GET_CODE (x) == CONST_VECTOR)
953 {
954 if (!vgpr_vector_mode_p (GET_MODE (x)))
955 return false;
956 if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0), mixed))
957 return false;
958 for (int i = 1; i < 64; i++)
959 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
960 return false;
961
962 return true;
963 }
964
965 if (GET_CODE (x) != CONST_INT)
966 return false;
967
968 rtx val_lo = gcn_operand_part (DImode, x, 0);
969 rtx val_hi = gcn_operand_part (DImode, x, 1);
970 return ((mixed || gcn_inline_constant_p (val_lo))
971 && gcn_inline_constant_p (val_hi));
972 }
973
974 /* Return true if X is a constant representable as an immediate constant
975 in a 32 or 64-bit instruction encoding where the hardware will
976 extend the immediate to 64-bits. */
977
978 bool
979 gcn_constant64_p (rtx x)
980 {
981 if (!gcn_constant_p (x))
982 return false;
983
984 if (GET_CODE (x) != CONST_INT)
985 return true;
986
987 /* Negative numbers are only allowed if they can be encoded within src0,
988 because the 32-bit immediates do not get sign-extended.
989 Unsigned numbers must not be encodable as 32-bit -1..-16, because the
990 assembler will use a src0 inline immediate and that will get
991 sign-extended. */
992 HOST_WIDE_INT val = INTVAL (x);
993 return (((val & 0xffffffff) == val /* Positive 32-bit. */
994 && (val & 0xfffffff0) != 0xfffffff0) /* Not -1..-16. */
995 || gcn_inline_constant_p (x)); /* Src0. */
996 }
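/* Worked example (editorial): 0x7fffffff passes because it is a positive
   32-bit literal that zero-extends correctly; -5 passes because it fits the
   src0 inline range -16..64 and is sign-extended in hardware; but the
   unsigned value 0xfffffff5 is rejected, since the assembler would encode it
   as the inline constant -11 and sign-extend it to 64 bits. */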
997
998 /* Implement TARGET_LEGITIMATE_CONSTANT_P.
999
1000 Returns true if X is a legitimate constant for a MODE immediate operand. */
1001
1002 bool
1003 gcn_legitimate_constant_p (machine_mode, rtx x)
1004 {
1005 return gcn_constant_p (x);
1006 }
1007
1008 /* Return true if X is a CONST_VECTOR of a single, duplicated constant. */
1009
1010 static bool
1011 single_cst_vector_p (rtx x)
1012 {
1013 if (GET_CODE (x) != CONST_VECTOR)
1014 return false;
1015 for (int i = 1; i < 64; i++)
1016 if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
1017 return false;
1018 return true;
1019 }
1020
1021 /* Create a CONST_VECTOR of duplicated value A. */
1022
1023 rtx
1024 gcn_vec_constant (machine_mode mode, int a)
1025 {
1026 /*if (!a)
1027 return CONST0_RTX (mode);
1028 if (a == -1)
1029 return CONSTM1_RTX (mode);
1030 if (a == 1)
1031 return CONST1_RTX (mode);
1032 if (a == 2)
1033 return CONST2_RTX (mode);*/
1034
1035 int units = GET_MODE_NUNITS (mode);
1036 machine_mode innermode = GET_MODE_INNER (mode);
1037
1038 rtx tem;
1039 if (FLOAT_MODE_P (innermode))
1040 {
1041 REAL_VALUE_TYPE rv;
1042 real_from_integer (&rv, NULL, a, SIGNED);
1043 tem = const_double_from_real_value (rv, innermode);
1044 }
1045 else
1046 tem = gen_int_mode (a, innermode);
1047
1048 rtvec v = rtvec_alloc (units);
1049 for (int i = 0; i < units; ++i)
1050 RTVEC_ELT (v, i) = tem;
1051
1052 return gen_rtx_CONST_VECTOR (mode, v);
1053 }
1054
1055 /* Create a CONST_VECTOR of duplicated value A. */
1056
1057 rtx
1058 gcn_vec_constant (machine_mode mode, rtx a)
1059 {
1060 int units = GET_MODE_NUNITS (mode);
1061 rtvec v = rtvec_alloc (units);
1062
1063 for (int i = 0; i < units; ++i)
1064 RTVEC_ELT (v, i) = a;
1065
1066 return gen_rtx_CONST_VECTOR (mode, v);
1067 }
1068
1069 /* Create an undefined vector value, used where an insn operand is
1070 optional. */
1071
1072 rtx
1073 gcn_gen_undef (machine_mode mode)
1074 {
1075 return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
1076 }
1077
1078 /* }}} */
1079 /* {{{ Addresses, pointers and moves. */
1080
1081 /* Return true if REG is a valid place to store a pointer,
1082 for instructions that require an SGPR.
1083 FIXME rename. */
1084
1085 static bool
1086 gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
1087 {
1088 if (GET_CODE (reg) == SUBREG)
1089 reg = SUBREG_REG (reg);
1090
1091 if (!REG_P (reg))
1092 return false;
1093
1094 if (GET_MODE (reg) != mode)
1095 return false;
1096
1097 int regno = REGNO (reg);
1098
1099 if (regno >= FIRST_PSEUDO_REGISTER)
1100 {
1101 if (!strict)
1102 return true;
1103
1104 if (!reg_renumber)
1105 return false;
1106
1107 regno = reg_renumber[regno];
1108 }
1109
1110 return (SGPR_REGNO_P (regno) || regno == M0_REG
1111 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1112 }
1113
1114 /* Return true if REG is a valid place to store a pointer,
1115 for instructions that require a VGPR. */
1116
1117 static bool
1118 gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
1119 {
1120 if (GET_CODE (reg) == SUBREG)
1121 reg = SUBREG_REG (reg);
1122
1123 if (!REG_P (reg))
1124 return false;
1125
1126 if (GET_MODE (reg) != mode)
1127 return false;
1128
1129 int regno = REGNO (reg);
1130
1131 if (regno >= FIRST_PSEUDO_REGISTER)
1132 {
1133 if (!strict)
1134 return true;
1135
1136 if (!reg_renumber)
1137 return false;
1138
1139 regno = reg_renumber[regno];
1140 }
1141
1142 return VGPR_REGNO_P (regno);
1143 }
1144
1145 /* Return true if X would be valid inside a MEM using the Flat address
1146 space. */
1147
1148 bool
1149 gcn_flat_address_p (rtx x, machine_mode mode)
1150 {
1151 bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1152 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1153
1154 if (vec_mode && gcn_address_register_p (x, DImode, false))
1155 return true;
1156
1157 if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
1158 return true;
1159
1160 if (TARGET_GCN5_PLUS
1161 && GET_CODE (x) == PLUS
1162 && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
1163 && CONST_INT_P (XEXP (x, 1)))
1164 return true;
1165
1166 return false;
1167 }
1168
1169 /* Return true if X would be valid inside a MEM using the Scalar Flat
1170 address space. */
1171
1172 bool
1173 gcn_scalar_flat_address_p (rtx x)
1174 {
1175 if (gcn_address_register_p (x, DImode, false))
1176 return true;
1177
1178 if (GET_CODE (x) == PLUS
1179 && gcn_address_register_p (XEXP (x, 0), DImode, false)
1180 && CONST_INT_P (XEXP (x, 1)))
1181 return true;
1182
1183 return false;
1184 }
1185
1186 /* Return true if MEM X would be valid for the Scalar Flat address space. */
1187
1188 bool
1189 gcn_scalar_flat_mem_p (rtx x)
1190 {
1191 if (!MEM_P (x))
1192 return false;
1193
1194 if (GET_MODE_SIZE (GET_MODE (x)) < 4)
1195 return false;
1196
1197 return gcn_scalar_flat_address_p (XEXP (x, 0));
1198 }
1199
1200 /* Return true if X would be valid inside a MEM using the LDS or GDS
1201 address spaces. */
1202
1203 bool
1204 gcn_ds_address_p (rtx x)
1205 {
1206 if (gcn_vec_address_register_p (x, SImode, false))
1207 return true;
1208
1209 if (GET_CODE (x) == PLUS
1210 && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
1211 && CONST_INT_P (XEXP (x, 1)))
1212 return true;
1213
1214 return false;
1215 }
1216
1217 /* Return true if ADDR would be valid inside a MEM using the Global
1218 address space. */
1219
1220 bool
1221 gcn_global_address_p (rtx addr)
1222 {
1223 if (gcn_address_register_p (addr, DImode, false)
1224 || gcn_vec_address_register_p (addr, DImode, false))
1225 return true;
1226
1227 if (GET_CODE (addr) == PLUS)
1228 {
1229 rtx base = XEXP (addr, 0);
1230 rtx offset = XEXP (addr, 1);
1231 bool immediate_p = (CONST_INT_P (offset)
1232 && INTVAL (offset) >= -(1 << 12)
1233 && INTVAL (offset) < (1 << 12));
1234
1235 if ((gcn_address_register_p (base, DImode, false)
1236 || gcn_vec_address_register_p (base, DImode, false))
1237 && immediate_p)
1238 /* SGPR + CONST or VGPR + CONST */
1239 return true;
1240
1241 if (gcn_address_register_p (base, DImode, false)
1242 && gcn_vgpr_register_operand (offset, SImode))
1243 /* SPGR + VGPR */
1244 return true;
1245
1246 if (GET_CODE (base) == PLUS
1247 && gcn_address_register_p (XEXP (base, 0), DImode, false)
1248 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1249 && immediate_p)
1250 /* (SGPR + VGPR) + CONST */
1251 return true;
1252 }
1253
1254 return false;
1255 }
1256
1257 /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.
1258
1259 Recognizes RTL expressions that are valid memory addresses for an
1260 instruction. The MODE argument is the machine mode for the MEM
1261 expression that wants to use this address.
1262
1263 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
1264 convert common non-canonical forms to canonical form so that they will
1265 be recognized. */
1266
1267 static bool
1268 gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
1269 addr_space_t as)
1270 {
1271 /* All vector instructions need to work on addresses in registers. */
1272 if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
1273 return false;
1274
1275 if (AS_SCALAR_FLAT_P (as))
1276 {
1277 if (mode == QImode || mode == HImode)
1278 return 0;
1279
1280 switch (GET_CODE (x))
1281 {
1282 case REG:
1283 return gcn_address_register_p (x, DImode, strict);
1284 /* Addresses are in the form BASE+OFFSET.
1285 OFFSET is either a 20-bit unsigned immediate, an SGPR, or M0.
1286 Writes and atomics do not accept SGPR. */
1287 case PLUS:
1288 {
1289 rtx x0 = XEXP (x, 0);
1290 rtx x1 = XEXP (x, 1);
1291 if (!gcn_address_register_p (x0, DImode, strict))
1292 return false;
1293 /* FIXME: This is disabled because of the mode mismatch between
1294 SImode (for the address or m0 register) and the DImode PLUS.
1295 We'll need a zero_extend or similar.
1296
1297 if (gcn_m0_register_p (x1, SImode, strict)
1298 || gcn_address_register_p (x1, SImode, strict))
1299 return true;
1300 else*/
1301 if (GET_CODE (x1) == CONST_INT)
1302 {
1303 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
1304 /* The low bits of the offset are ignored, even when
1305 they're meant to realign the pointer. */
1306 && !(INTVAL (x1) & 0x3))
1307 return true;
1308 }
1309 return false;
1310 }
1311
1312 default:
1313 break;
1314 }
1315 }
1316 else if (AS_SCRATCH_P (as))
1317 return gcn_address_register_p (x, SImode, strict);
1318 else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
1319 {
1320 if (TARGET_GCN3 || GET_CODE (x) == REG)
1321 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1322 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1323 ? gcn_address_register_p (x, DImode, strict)
1324 : gcn_vec_address_register_p (x, DImode, strict));
1325 else
1326 {
1327 gcc_assert (TARGET_GCN5_PLUS);
1328
1329 if (GET_CODE (x) == PLUS)
1330 {
1331 rtx x1 = XEXP (x, 1);
1332
1333 if (VECTOR_MODE_P (mode)
1334 ? !gcn_address_register_p (x, DImode, strict)
1335 : !gcn_vec_address_register_p (x, DImode, strict))
1336 return false;
1337
1338 if (GET_CODE (x1) == CONST_INT)
1339 {
1340 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
1341 /* The low bits of the offset are ignored, even when
1342 they're meant to realign the pointer. */
1343 && !(INTVAL (x1) & 0x3))
1344 return true;
1345 }
1346 }
1347 return false;
1348 }
1349 }
1350 else if (AS_GLOBAL_P (as))
1351 {
1352 gcc_assert (TARGET_GCN5_PLUS);
1353
1354 if (GET_CODE (x) == REG)
1355 return (gcn_address_register_p (x, DImode, strict)
1356 || (!VECTOR_MODE_P (mode)
1357 && gcn_vec_address_register_p (x, DImode, strict)));
1358 else if (GET_CODE (x) == PLUS)
1359 {
1360 rtx base = XEXP (x, 0);
1361 rtx offset = XEXP (x, 1);
1362
1363 bool immediate_p = (GET_CODE (offset) == CONST_INT
1364 /* Signed 13-bit immediate. */
1365 && INTVAL (offset) >= -(1 << 12)
1366 && INTVAL (offset) < (1 << 12)
1367 /* The low bits of the offset are ignored, even
1368 when they're meant to realign the pointer. */
1369 && !(INTVAL (offset) & 0x3));
1370
1371 if (!VECTOR_MODE_P (mode))
1372 {
1373 if ((gcn_address_register_p (base, DImode, strict)
1374 || gcn_vec_address_register_p (base, DImode, strict))
1375 && immediate_p)
1376 /* SGPR + CONST or VGPR + CONST */
1377 return true;
1378
1379 if (gcn_address_register_p (base, DImode, strict)
1380 && gcn_vgpr_register_operand (offset, SImode))
1381 /* SGPR + VGPR */
1382 return true;
1383
1384 if (GET_CODE (base) == PLUS
1385 && gcn_address_register_p (XEXP (base, 0), DImode, strict)
1386 && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
1387 && immediate_p)
1388 /* (SGPR + VGPR) + CONST */
1389 return true;
1390 }
1391 else
1392 {
1393 if (gcn_address_register_p (base, DImode, strict)
1394 && immediate_p)
1395 /* SGPR + CONST */
1396 return true;
1397 }
1398 }
1399 else
1400 return false;
1401 }
1402 else if (AS_ANY_DS_P (as))
1403 switch (GET_CODE (x))
1404 {
1405 case REG:
1406 return (VECTOR_MODE_P (mode)
1407 ? gcn_address_register_p (x, SImode, strict)
1408 : gcn_vec_address_register_p (x, SImode, strict));
1409 /* Addresses are in the form BASE+OFFSET.
1410 OFFSET is either a 20-bit unsigned immediate, an SGPR, or M0.
1411 Writes and atomics do not accept SGPR. */
1412 case PLUS:
1413 {
1414 rtx x0 = XEXP (x, 0);
1415 rtx x1 = XEXP (x, 1);
1416 if (!gcn_vec_address_register_p (x0, DImode, strict))
1417 return false;
1418 if (GET_CODE (x1) == REG)
1419 {
1420 if (GET_CODE (x1) != REG
1421 || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
1422 && !gcn_ssrc_register_operand (x1, DImode)))
1423 return false;
1424 }
1425 else if (GET_CODE (x1) == CONST_VECTOR
1426 && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
1427 && single_cst_vector_p (x1))
1428 {
1429 x1 = CONST_VECTOR_ELT (x1, 0);
1430 if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
1431 return true;
1432 }
1433 return false;
1434 }
1435
1436 default:
1437 break;
1438 }
1439 else
1440 gcc_unreachable ();
1441 return false;
1442 }
1443
1444 /* Implement TARGET_ADDR_SPACE_POINTER_MODE.
1445
1446 Return the appropriate mode for a named address pointer. */
1447
1448 static scalar_int_mode
1449 gcn_addr_space_pointer_mode (addr_space_t addrspace)
1450 {
1451 switch (addrspace)
1452 {
1453 case ADDR_SPACE_SCRATCH:
1454 case ADDR_SPACE_LDS:
1455 case ADDR_SPACE_GDS:
1456 return SImode;
1457 case ADDR_SPACE_DEFAULT:
1458 case ADDR_SPACE_FLAT:
1459 case ADDR_SPACE_FLAT_SCRATCH:
1460 case ADDR_SPACE_SCALAR_FLAT:
1461 return DImode;
1462 default:
1463 gcc_unreachable ();
1464 }
1465 }
1466
1467 /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.
1468
1469 Return the appropriate mode for a named address space address. */
1470
1471 static scalar_int_mode
1472 gcn_addr_space_address_mode (addr_space_t addrspace)
1473 {
1474 return gcn_addr_space_pointer_mode (addrspace);
1475 }
1476
1477 /* Implement TARGET_ADDR_SPACE_SUBSET_P.
1478
1479 Determine if one named address space is a subset of another. */
1480
1481 static bool
1482 gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
1483 {
1484 if (subset == superset)
1485 return true;
1486 /* FIXME is this true? */
1487 if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
1488 return true;
1489 return false;
1490 }
1491
1492 /* Convert from one address space to another. */
1493
1494 static rtx
1495 gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
1496 {
1497 gcc_assert (POINTER_TYPE_P (from_type));
1498 gcc_assert (POINTER_TYPE_P (to_type));
1499
1500 addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
1501 addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
1502
1503 if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
1504 {
1505 rtx queue = gen_rtx_REG (DImode,
1506 cfun->machine->args.reg[QUEUE_PTR_ARG]);
1507 rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
1508 gen_rtx_PLUS (DImode, queue,
1509 gen_int_mode (64, SImode)));
1510 rtx tmp = gen_reg_rtx (DImode);
1511
1512 emit_move_insn (gen_lowpart (SImode, tmp), op);
1513 emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
1514 group_seg_aperture_hi);
1515
1516 return tmp;
1517 }
1518 else if (as_from == as_to)
1519 return op;
1520 else
1521 gcc_unreachable ();
1522 }
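/* Sketch of the conversion above (editorial): an LDS offset OP becomes a
   flat pointer by pairing it with the group-segment aperture that the
   runtime stores at byte offset 64 of the HSA queue object, roughly

     flat = ((uint64_t) aperture_hi << 32) | lds_offset;

   where the queue pointer itself arrives in the SGPRs reserved for
   QUEUE_PTR_ARG. */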
1523
1524 /* Implement TARGET_ADDR_SPACE_DEBUG.
1525
1526 Return the dwarf address space class for each hardware address space. */
1527
1528 static int
1529 gcn_addr_space_debug (addr_space_t as)
1530 {
1531 switch (as)
1532 {
1533 case ADDR_SPACE_DEFAULT:
1534 case ADDR_SPACE_FLAT:
1535 case ADDR_SPACE_SCALAR_FLAT:
1536 case ADDR_SPACE_FLAT_SCRATCH:
1537 return DW_ADDR_none;
1538 case ADDR_SPACE_GLOBAL:
1539 return 1; // DW_ADDR_LLVM_global
1540 case ADDR_SPACE_LDS:
1541 return 3; // DW_ADDR_LLVM_group
1542 case ADDR_SPACE_SCRATCH:
1543 return 4; // DW_ADDR_LLVM_private
1544 case ADDR_SPACE_GDS:
1545 return 0x8000; // DW_ADDR_AMDGPU_region
1546 }
1547 gcc_unreachable ();
1548 }
1549
1550
1551 /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h.
1552
1553 Return true if REGNO is OK for memory addressing. */
1554
1555 bool
1556 gcn_regno_mode_code_ok_for_base_p (int regno,
1557 machine_mode, addr_space_t as, int, int)
1558 {
1559 if (regno >= FIRST_PSEUDO_REGISTER)
1560 {
1561 if (reg_renumber)
1562 regno = reg_renumber[regno];
1563 else
1564 return true;
1565 }
1566 if (AS_FLAT_P (as))
1567 return (VGPR_REGNO_P (regno)
1568 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1569 else if (AS_SCALAR_FLAT_P (as))
1570 return (SGPR_REGNO_P (regno)
1571 || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
1572 else if (AS_GLOBAL_P (as))
1573 {
1574 return (SGPR_REGNO_P (regno)
1575 || VGPR_REGNO_P (regno)
1576 || regno == ARG_POINTER_REGNUM
1577 || regno == FRAME_POINTER_REGNUM);
1578 }
1579 else
1580 /* For now. */
1581 return false;
1582 }
1583
1584 /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.
1585
1586 Return a suitable register class for memory addressing. */
1587
1588 reg_class
1589 gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
1590 int ic)
1591 {
1592 switch (as)
1593 {
1594 case ADDR_SPACE_DEFAULT:
1595 return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
1596 case ADDR_SPACE_SCALAR_FLAT:
1597 case ADDR_SPACE_SCRATCH:
1598 return SGPR_REGS;
1599 break;
1600 case ADDR_SPACE_FLAT:
1601 case ADDR_SPACE_FLAT_SCRATCH:
1602 case ADDR_SPACE_LDS:
1603 case ADDR_SPACE_GDS:
1604 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1605 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1606 ? SGPR_REGS : VGPR_REGS);
1607 case ADDR_SPACE_GLOBAL:
1608 return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1609 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
1610 ? SGPR_REGS : ALL_GPR_REGS);
1611 }
1612 gcc_unreachable ();
1613 }
1614
1615 /* Implement REGNO_OK_FOR_INDEX_P via gcn.h.
1616
1617 Return true if REGNO is OK as an index register for memory addressing. */
1618
1619 bool
1620 regno_ok_for_index_p (int regno)
1621 {
1622 if (regno >= FIRST_PSEUDO_REGISTER)
1623 {
1624 if (reg_renumber)
1625 regno = reg_renumber[regno];
1626 else
1627 return true;
1628 }
1629 return regno == M0_REG || VGPR_REGNO_P (regno);
1630 }
1631
1632 /* Generate move which uses the exec flags. If EXEC is NULL, then it is
1633 assumed that all lanes normally relevant to the mode of the move are
1634 affected. If PREV is NULL, then a sensible default is supplied for
1635 the inactive lanes. */
1636
1637 static rtx
1638 gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
1639 {
1640 machine_mode mode = GET_MODE (op0);
1641
1642 if (vgpr_vector_mode_p (mode))
1643 {
1644 if (exec && exec != CONSTM1_RTX (DImode))
1645 {
1646 if (!prev)
1647 prev = op0;
1648 }
1649 else
1650 {
1651 if (!prev)
1652 prev = gcn_gen_undef (mode);
1653 exec = gcn_full_exec_reg ();
1654 }
1655
1656 rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));
1657
1658 return gen_rtx_PARALLEL (VOIDmode,
1659 gen_rtvec (2, set,
1660 gen_rtx_CLOBBER (VOIDmode,
1661 gen_rtx_SCRATCH (V64DImode))));
1662 }
1663
1664 return (gen_rtx_PARALLEL
1665 (VOIDmode,
1666 gen_rtvec (2, gen_rtx_SET (op0, op1),
1667 gen_rtx_USE (VOIDmode,
1668 exec ? exec : gcn_scalar_exec ()))));
1669 }
1670
1671 /* Generate masked move. */
1672
1673 static rtx
1674 gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
1675 {
1676 if (exec)
1677 return (gen_rtx_SET (op0,
1678 gen_rtx_VEC_MERGE (GET_MODE (op0),
1679 gen_rtx_VEC_DUPLICATE (GET_MODE
1680 (op0), op1),
1681 op2, exec)));
1682 else
1683 return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
1684 }
1685
1686 /* Expand vector init of OP0 by VEC.
1687 Implements vec_init instruction pattern. */
1688
1689 void
1690 gcn_expand_vector_init (rtx op0, rtx vec)
1691 {
1692 int64_t initialized_mask = 0;
1693 int64_t curr_mask = 1;
1694 machine_mode mode = GET_MODE (op0);
1695
1696 rtx val = XVECEXP (vec, 0, 0);
1697
1698 for (int i = 1; i < 64; i++)
1699 if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
1700 curr_mask |= (int64_t) 1 << i;
1701
1702 if (gcn_constant_p (val))
1703 emit_move_insn (op0, gcn_vec_constant (mode, val));
1704 else
1705 {
1706 val = force_reg (GET_MODE_INNER (mode), val);
1707 emit_insn (gen_duplicate_load (op0, val));
1708 }
1709 initialized_mask |= curr_mask;
1710 for (int i = 1; i < 64; i++)
1711 if (!(initialized_mask & ((int64_t) 1 << i)))
1712 {
1713 curr_mask = (int64_t) 1 << i;
1714 rtx val = XVECEXP (vec, 0, i);
1715
1716 for (int j = i + 1; j < 64; j++)
1717 if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
1718 curr_mask |= (int64_t) 1 << j;
1719 if (gcn_constant_p (val))
1720 emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
1721 get_exec (curr_mask)));
1722 else
1723 {
1724 val = force_reg (GET_MODE_INNER (mode), val);
1725 emit_insn (gen_duplicate_load (op0, val, op0,
1726 get_exec (curr_mask)));
1727 }
1728 initialized_mask |= curr_mask;
1729 }
1730 }
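/* Worked example (editorial): initializing a vector such as
   {7, 7, 3, 7, ...} first broadcasts 7 to all 64 lanes, then re-runs the
   move for each remaining group of equal values under a lane mask, e.g.
   EXEC = 0x4 to write 3 into lane 2 alone, so each distinct value costs one
   masked move rather than one move per lane. */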
1731
1732 /* Load vector constant where n-th lane contains BASE+n*VAL. */
1733
1734 static rtx
1735 strided_constant (machine_mode mode, int base, int val)
1736 {
1737 rtx x = gen_reg_rtx (mode);
1738 emit_move_insn (x, gcn_vec_constant (mode, base));
1739 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
1740 x, get_exec (0xffffffff00000000)));
1741 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
1742 x, get_exec (0xffff0000ffff0000)));
1743 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
1744 x, get_exec (0xff00ff00ff00ff00)));
1745 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
1746 x, get_exec (0xf0f0f0f0f0f0f0f0)));
1747 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
1748 x, get_exec (0xcccccccccccccccc)));
1749 emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
1750 x, get_exec (0xaaaaaaaaaaaaaaaa)));
1751 return x;
1752 }
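/* Worked example (editorial): strided_constant (V64SImode, 0, 1) produces
   the lane-id vector {0, 1, 2, ..., 63} in just six masked additions: +32
   for lanes 32-63, +16 for the upper half of each 32-lane group, and so on
   down to +1 under the 0xaaaa... mask, a log2(64)-step construction instead
   of 64 separate writes. */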
1753
1754 /* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS. */
1755
1756 static rtx
1757 gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
1758 addr_space_t as)
1759 {
1760 switch (as)
1761 {
1762 case ADDR_SPACE_DEFAULT:
1763 return gcn_addr_space_legitimize_address (x, old, mode,
1764 DEFAULT_ADDR_SPACE);
1765 case ADDR_SPACE_SCALAR_FLAT:
1766 case ADDR_SPACE_SCRATCH:
1767 /* Instructions working on vectors need the address to be in
1768 a register. */
1769 if (vgpr_vector_mode_p (mode))
1770 return force_reg (GET_MODE (x), x);
1771
1772 return x;
1773 case ADDR_SPACE_FLAT:
1774 case ADDR_SPACE_FLAT_SCRATCH:
1775 case ADDR_SPACE_GLOBAL:
1776 return TARGET_GCN3 ? force_reg (DImode, x) : x;
1777 case ADDR_SPACE_LDS:
1778 case ADDR_SPACE_GDS:
1779 /* FIXME: LDS supports offsets; handle them! */
1780 if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
1781 {
1782 rtx addrs = gen_reg_rtx (V64SImode);
1783 rtx base = force_reg (SImode, x);
1784 rtx offsets = strided_constant (V64SImode, 0,
1785 GET_MODE_UNIT_SIZE (mode));
1786
1787 emit_insn (gen_vec_duplicatev64si (addrs, base));
1788 emit_insn (gen_addv64si3 (addrs, offsets, addrs));
1789 return addrs;
1790 }
1791 return x;
1792 }
1793 gcc_unreachable ();
1794 }
1795
1796 /* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
1797 proper vector of stepped addresses.
1798
1799 MEM will be a DImode address of a vector in an SGPR.
1800 TMP will be a V64DImode VGPR pair or (scratch:V64DI). */
1801
1802 rtx
1803 gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
1804 rtx tmp)
1805 {
1806 gcc_assert (MEM_P (mem));
1807 rtx mem_base = XEXP (mem, 0);
1808 rtx mem_index = NULL_RTX;
1809
1810 if (!TARGET_GCN5_PLUS)
1811 {
1812 /* gcn_addr_space_legitimize_address should have put the address in a
1813 register. If not, it is too late to do anything about it. */
1814 gcc_assert (REG_P (mem_base));
1815 }
1816
1817 if (GET_CODE (mem_base) == PLUS)
1818 {
1819 mem_index = XEXP (mem_base, 1);
1820 mem_base = XEXP (mem_base, 0);
1821 }
1822
1823 /* RF and RM base registers for vector modes should always be an SGPR. */
1824 gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
1825 || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);
1826
1827 machine_mode inner = GET_MODE_INNER (mode);
1828 int shift = exact_log2 (GET_MODE_SIZE (inner));
1829 rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
1830 rtx undef_v64si = gcn_gen_undef (V64SImode);
1831 rtx new_base = NULL_RTX;
1832 addr_space_t as = MEM_ADDR_SPACE (mem);
1833
1834 rtx tmplo = (REG_P (tmp)
1835 ? gcn_operand_part (V64DImode, tmp, 0)
1836 : gen_reg_rtx (V64SImode));
1837
1838 /* tmplo[:] = ramp[:] << shift */
1839 if (exec)
1840 emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
1841 gen_int_mode (shift, SImode),
1842 undef_v64si, exec));
1843 else
1844 emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));
1845
1846 if (AS_FLAT_P (as))
1847 {
1848 rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
1849
1850 if (REG_P (tmp))
1851 {
1852 rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
1853 rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
1854 rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);
1855
1856 /* tmphi[:] = mem_base_hi */
1857 if (exec)
1858 emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
1859 undef_v64si, exec));
1860 else
1861 emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));
1862
1863 /* tmp[:] += zext (mem_base) */
1864 if (exec)
1865 {
1866 emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
1867 vcc, undef_v64si, exec));
1868 emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
1869 vcc, vcc, undef_v64si, exec));
1870 }
1871 else
1872 emit_insn (gen_addv64di3_vcc_zext_dup (tmp, mem_base_lo, tmp, vcc));
1873 }
1874 else
1875 {
1876 tmp = gen_reg_rtx (V64DImode);
1877 if (exec)
1878 emit_insn (gen_addv64di3_vcc_zext_dup2_exec
1879 (tmp, tmplo, mem_base, vcc, gcn_gen_undef (V64DImode),
1880 exec));
1881 else
1882 emit_insn (gen_addv64di3_vcc_zext_dup2 (tmp, tmplo, mem_base, vcc));
1883 }
1884
1885 new_base = tmp;
1886 }
1887 else if (AS_ANY_DS_P (as))
1888 {
1889 if (!exec)
1890 emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
1891 else
1892 emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
1893 gcn_gen_undef (V64SImode), exec));
1894 new_base = tmplo;
1895 }
1896 else
1897 {
1898 mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
1899 new_base = gen_rtx_PLUS (V64DImode, mem_base,
1900 gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
1901 }
1902
1903 return gen_rtx_PLUS (GET_MODE (new_base), new_base,
1904 gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
1905 (mem_index ? mem_index
1906 : const0_rtx)));
1907 }
1908
1909 /* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
1910 suitable for the given address space. This is intended for use in
1911 gather/scatter patterns.
1912
1913 The offsets may be signed or unsigned, according to UNSIGNED_P.
1914 If EXEC is set then _exec patterns will be used, otherwise plain.
1915
1916 Return values:
1917 ADDR_SPACE_FLAT - return V64DImode vector of absolute addresses.
1918 ADDR_SPACE_GLOBAL - return V64SImode vector of offsets. */
1919
1920 rtx
1921 gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
1922 bool unsigned_p, rtx exec)
1923 {
1924 rtx tmpsi = gen_reg_rtx (V64SImode);
1925 rtx tmpdi = gen_reg_rtx (V64DImode);
1926 rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
1927 rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;
1928
1929 if (CONST_INT_P (scale)
1930 && INTVAL (scale) > 0
1931 && exact_log2 (INTVAL (scale)) >= 0)
1932 emit_insn (gen_ashlv64si3 (tmpsi, offsets,
1933 GEN_INT (exact_log2 (INTVAL (scale)))));
1934 else
1935 (exec
1936 ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
1937 exec))
1938 : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));
1939
1940 /* "Global" instructions do not support negative register offsets. */
1941 if (as == ADDR_SPACE_FLAT || !unsigned_p)
1942 {
1943 if (unsigned_p)
1944 (exec
1945 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
1946 undefdi, exec))
1947 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
1948 else
1949 (exec
1950 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
1951 undefdi, exec))
1952 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
1953 return tmpdi;
1954 }
1955 else if (as == ADDR_SPACE_GLOBAL)
1956 return tmpsi;
1957
1958 gcc_unreachable ();
1959 }
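/* Worked example (editorial): for a gather from ADDR_SPACE_FLAT with
   SCALE = 4 (an "int" element, say), each lane ends up with roughly

     addr[l] = base + (int64_t or uint64_t) (offsets[l] << 2)

   returned as a V64DImode vector, whereas for ADDR_SPACE_GLOBAL with
   unsigned offsets only the scaled V64SImode offsets are returned and BASE
   is left to the addressing mode of the global load/store instruction. */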
1960
1961 /* Return true if a move from OP0 to OP1 is known to be executed in the
1962 vector unit. */
1963
1964 bool
1965 gcn_vgpr_move_p (rtx op0, rtx op1)
1966 {
1967 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1968 return true;
1969 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1970 return true;
1971 return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
1972 || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
1973 || vgpr_vector_mode_p (GET_MODE (op0)));
1974 }
1975
1976 /* Return true if a move from OP0 to OP1 is known to be executed in the
1977 scalar unit. Used in the machine description. */
1978
1979 bool
1980 gcn_sgpr_move_p (rtx op0, rtx op1)
1981 {
1982 if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
1983 return true;
1984 if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
1985 return true;
1986 if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
1987 || VGPR_REGNO_P (REGNO (op0)))
1988 return false;
1989 if (REG_P (op1)
1990 && REGNO (op1) < FIRST_PSEUDO_REGISTER
1991 && !VGPR_REGNO_P (REGNO (op1)))
1992 return true;
1993 return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
1994 }
1995
1996 /* Implement TARGET_SECONDARY_RELOAD.
1997
1998 The address space determines which registers can be used for loads and
1999 stores. */
2000
2001 static reg_class_t
2002 gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
2003 machine_mode reload_mode, secondary_reload_info *sri)
2004 {
2005 reg_class_t result = NO_REGS;
2006 bool spilled_pseudo =
2007 (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;
2008
2009 if (dump_file && (dump_flags & TDF_DETAILS))
2010 {
2011 fprintf (dump_file, "gcn_secondary_reload: ");
2012 dump_value_slim (dump_file, x, 1);
2013 fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
2014 reg_class_names[rclass], GET_MODE_NAME (reload_mode));
2015 if (REG_P (x) || GET_CODE (x) == SUBREG)
2016 fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
2017 (true_regnum (x) >= 0
2018 && true_regnum (x) < FIRST_PSEUDO_REGISTER
2019 ? reg_names[true_regnum (x)]
2020 : (spilled_pseudo ? "stack spill" : "??")));
2021 fprintf (dump_file, "\n");
2022 }
2023
2024 /* Some callers don't use or initialize icode. */
2025 sri->icode = CODE_FOR_nothing;
2026
2027 if (MEM_P (x) || spilled_pseudo)
2028 {
2029 addr_space_t as = DEFAULT_ADDR_SPACE;
2030
2031 /* If we have a spilled pseudo, we can't find the address space
2032 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
2033 ADDR_SPACE_GLOBAL for GCN5. */
2034 if (MEM_P (x))
2035 as = MEM_ADDR_SPACE (x);
2036
2037 if (as == ADDR_SPACE_DEFAULT)
2038 as = DEFAULT_ADDR_SPACE;
2039
2040 switch (as)
2041 {
2042 case ADDR_SPACE_SCALAR_FLAT:
2043 result =
2044 ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
2045 break;
2046 case ADDR_SPACE_FLAT:
2047 case ADDR_SPACE_FLAT_SCRATCH:
2048 case ADDR_SPACE_GLOBAL:
2049 if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
2050 || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
2051 {
2052 if (in_p)
2053 switch (reload_mode)
2054 {
2055 case E_V64SImode:
2056 sri->icode = CODE_FOR_reload_inv64si;
2057 break;
2058 case E_V64SFmode:
2059 sri->icode = CODE_FOR_reload_inv64sf;
2060 break;
2061 case E_V64HImode:
2062 sri->icode = CODE_FOR_reload_inv64hi;
2063 break;
2064 case E_V64HFmode:
2065 sri->icode = CODE_FOR_reload_inv64hf;
2066 break;
2067 case E_V64QImode:
2068 sri->icode = CODE_FOR_reload_inv64qi;
2069 break;
2070 case E_V64DImode:
2071 sri->icode = CODE_FOR_reload_inv64di;
2072 break;
2073 case E_V64DFmode:
2074 sri->icode = CODE_FOR_reload_inv64df;
2075 break;
2076 default:
2077 gcc_unreachable ();
2078 }
2079 else
2080 switch (reload_mode)
2081 {
2082 case E_V64SImode:
2083 sri->icode = CODE_FOR_reload_outv64si;
2084 break;
2085 case E_V64SFmode:
2086 sri->icode = CODE_FOR_reload_outv64sf;
2087 break;
2088 case E_V64HImode:
2089 sri->icode = CODE_FOR_reload_outv64hi;
2090 break;
2091 case E_V64HFmode:
2092 sri->icode = CODE_FOR_reload_outv64hf;
2093 break;
2094 case E_V64QImode:
2095 sri->icode = CODE_FOR_reload_outv64qi;
2096 break;
2097 case E_V64DImode:
2098 sri->icode = CODE_FOR_reload_outv64di;
2099 break;
2100 case E_V64DFmode:
2101 sri->icode = CODE_FOR_reload_outv64df;
2102 break;
2103 default:
2104 gcc_unreachable ();
2105 }
2106 break;
2107 }
2108 /* Fallthrough. */
2109 case ADDR_SPACE_LDS:
2110 case ADDR_SPACE_GDS:
2111 case ADDR_SPACE_SCRATCH:
2112 result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
2113 break;
2114 }
2115 }
2116
2117 if (dump_file && (dump_flags & TDF_DETAILS))
2118 fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
2119 get_insn_name (sri->icode));
2120
2121 return result;
2122 }
2123
2124 /* Update register usage after having seen the compiler flags and kernel
2125 attributes. We typically want to fix registers that contain values
2126 set by the HSA runtime. */
2127
2128 static void
2129 gcn_conditional_register_usage (void)
2130 {
2131 if (!cfun || !cfun->machine)
2132 return;
2133
2134 if (cfun->machine->normal_function)
2135 {
2136 /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
2137 for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
2138 i <= LAST_SGPR_REG; i++)
2139 fixed_regs[i] = 1, call_used_regs[i] = 1;
2140
2141 for (int i = VGPR_REGNO (MAX_NORMAL_VGPR_COUNT);
2142 i <= LAST_VGPR_REG; i++)
2143 fixed_regs[i] = 1, call_used_regs[i] = 1;
2144
2145 return;
2146 }
2147
2148 /* If the set of requested args is the default set, nothing more needs to
2149 be done. */
2150 if (cfun->machine->args.requested == default_requested_args)
2151 return;
2152
2153 /* Requesting a set of args different from the default violates the ABI. */
2154 if (!leaf_function_p ())
2155 warning (0, "A non-default set of initial values has been requested, "
2156 "which violates the ABI");
2157
2158 for (int i = SGPR_REGNO (0); i < SGPR_REGNO (14); i++)
2159 fixed_regs[i] = 0;
2160
2161 /* Fix the runtime argument registers containing values that may be
2162 needed later. DISPATCH_PTR_ARG and FLAT_SCRATCH_* should not be
2163 needed after the prologue so there's no need to fix them. */
2164 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
2165 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1;
2166 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
2167 {
2168 /* The upper 32-bits of the 64-bit descriptor are not used, so allow
2169 the containing registers to be used for other purposes. */
2170 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1;
2171 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1;
2172 }
2173 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
2174 {
2175 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1;
2176 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1;
2177 }
2178 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
2179 {
2180 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
2181 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
2182 }
2183 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
2184 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
2185 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
2186 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1;
2187 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0)
2188 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1;
2189 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0)
2190 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
2191 }
2192
2193 /* Determine if a load or store is valid, according to the register classes
2194 and address space. Used primarily by the machine description to decide
2195 when to split a move into two steps. */
2196
2197 bool
2198 gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
2199 {
2200 if (!MEM_P (dest) && !MEM_P (src))
2201 return true;
2202
2203 if (MEM_P (dest)
2204 && AS_FLAT_P (MEM_ADDR_SPACE (dest))
2205 && (gcn_flat_address_p (XEXP (dest, 0), mode)
2206 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2207 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2208 && gcn_vgpr_register_operand (src, mode))
2209 return true;
2210 else if (MEM_P (src)
2211 && AS_FLAT_P (MEM_ADDR_SPACE (src))
2212 && (gcn_flat_address_p (XEXP (src, 0), mode)
2213 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2214 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2215 && gcn_vgpr_register_operand (dest, mode))
2216 return true;
2217
2218 if (MEM_P (dest)
2219 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest))
2220 && (gcn_global_address_p (XEXP (dest, 0))
2221 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2222 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2223 && gcn_vgpr_register_operand (src, mode))
2224 return true;
2225 else if (MEM_P (src)
2226 && AS_GLOBAL_P (MEM_ADDR_SPACE (src))
2227 && (gcn_global_address_p (XEXP (src, 0))
2228 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2229 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2230 && gcn_vgpr_register_operand (dest, mode))
2231 return true;
2232
2233 if (MEM_P (dest)
2234 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT
2235 && (gcn_scalar_flat_address_p (XEXP (dest, 0))
2236 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
2237 || GET_CODE (XEXP (dest, 0)) == LABEL_REF)
2238 && gcn_ssrc_register_operand (src, mode))
2239 return true;
2240 else if (MEM_P (src)
2241 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT
2242 && (gcn_scalar_flat_address_p (XEXP (src, 0))
2243 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF
2244 || GET_CODE (XEXP (src, 0)) == LABEL_REF)
2245 && gcn_sdst_register_operand (dest, mode))
2246 return true;
2247
2248 if (MEM_P (dest)
2249 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
2250 && gcn_ds_address_p (XEXP (dest, 0))
2251 && gcn_vgpr_register_operand (src, mode))
2252 return true;
2253 else if (MEM_P (src)
2254 && AS_ANY_DS_P (MEM_ADDR_SPACE (src))
2255 && gcn_ds_address_p (XEXP (src, 0))
2256 && gcn_vgpr_register_operand (dest, mode))
2257 return true;
2258
2259 return false;
2260 }
2261
2262 /* }}} */
2263 /* {{{ Functions and ABI. */
2264
2265 /* Implement TARGET_FUNCTION_VALUE.
2266
2267 Define how to find the value returned by a function.
2268 The register location is always the same, but the mode depends on
2269 VALTYPE. */
2270
2271 static rtx
2272 gcn_function_value (const_tree valtype, const_tree, bool)
2273 {
2274 machine_mode mode = TYPE_MODE (valtype);
2275
2276 if (INTEGRAL_TYPE_P (valtype)
2277 && GET_MODE_CLASS (mode) == MODE_INT
2278 && GET_MODE_SIZE (mode) < 4)
2279 mode = SImode;
2280
2281 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG));
2282 }
2283
2284 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
2285
2286 Return true if N is a possible register number for the function return
2287 value. */
2288
2289 static bool
2290 gcn_function_value_regno_p (const unsigned int n)
2291 {
2292 return n == RETURN_VALUE_REG;
2293 }
2294
2295 /* Calculate the number of registers required to hold function argument
2296 ARG. */
2297
2298 static int
2299 num_arg_regs (const function_arg_info &arg)
2300 {
2301 if (targetm.calls.must_pass_in_stack (arg))
2302 return 0;
2303
2304 int size = arg.promoted_size_in_bytes ();
2305 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
2306 }
2307
2308 /* Implement TARGET_STRICT_ARGUMENT_NAMING.
2309
2310 Return true if the location where a function argument is passed
2311 depends on whether or not it is a named argument.
2312
2313 For gcn, we know how to handle functions declared as stdarg: by
2314 passing an extra pointer to the unnamed arguments. However, the
2315 Fortran frontend can produce a different situation, where a
2316 function pointer is declared with no arguments, but the actual
2317 function and calls to it take more arguments. In that case, we
2318 want to ensure the call matches the definition of the function. */
2319
2320 static bool
2321 gcn_strict_argument_naming (cumulative_args_t cum_v)
2322 {
2323 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2324
2325 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
2326 }
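
/* Illustrative user-level code for the two situations described above
   (hypothetical declarations, not part of the backend):

     int sum (int n, ...);     // proper stdarg: unnamed arguments are
                               // reached through the extra pointer
     extern void (*fp) ();     // declared with no arguments, but called
     ... fp (1, 2, 3); ...     // with several (as Fortran can produce)

   Treating the extra arguments in the second case as named keeps the
   call compatible with the function's actual definition.  */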
2327
2328 /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED.
2329
2330 See comment on gcn_strict_argument_naming. */
2331
2332 static bool
2333 gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v)
2334 {
2335 return !gcn_strict_argument_naming (cum_v);
2336 }
2337
2338 /* Implement TARGET_FUNCTION_ARG.
2339
2340 Return an RTX indicating whether a function argument is passed in a register
2341 and if so, which register. */
2342
2343 static rtx
2344 gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg)
2345 {
2346 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2347 if (cum->normal_function)
2348 {
2349 if (!arg.named || arg.end_marker_p ())
2350 return 0;
2351
2352 if (targetm.calls.must_pass_in_stack (arg))
2353 return 0;
2354
2355 /* Vector parameters are not supported yet. */
2356 if (VECTOR_MODE_P (arg.mode))
2357 return 0;
2358
2359 int reg_num = FIRST_PARM_REG + cum->num;
2360 int num_regs = num_arg_regs (arg);
2361 if (num_regs > 0)
2362 while (reg_num % num_regs != 0)
2363 reg_num++;
2364 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS)
2365 return gen_rtx_REG (arg.mode, reg_num);
2366 }
2367 else
2368 {
2369 if (cum->num >= cum->args.nargs)
2370 {
2371 cum->offset = (cum->offset + TYPE_ALIGN (arg.type) / 8 - 1)
2372 & -(TYPE_ALIGN (arg.type) / 8);
2373 cfun->machine->kernarg_segment_alignment
2374 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment,
2375 TYPE_ALIGN (arg.type) / 8);
2376 rtx addr = gen_rtx_REG (DImode,
2377 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]);
2378 if (cum->offset)
2379 addr = gen_rtx_PLUS (DImode, addr,
2380 gen_int_mode (cum->offset, DImode));
2381 rtx mem = gen_rtx_MEM (arg.mode, addr);
2382 set_mem_attributes (mem, arg.type, 1);
2383 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT);
2384 MEM_READONLY_P (mem) = 1;
2385 return mem;
2386 }
2387
2388 int a = cum->args.order[cum->num];
2389 if (arg.mode != gcn_kernel_arg_types[a].mode)
2390 {
2391 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name);
2392 return 0;
2393 }
2394 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode,
2395 cum->args.reg[a]);
2396 }
2397 return 0;
2398 }
2399
2400 /* Implement TARGET_FUNCTION_ARG_ADVANCE.
2401
2402 Updates the summarizer variable pointed to by CUM_V to advance past an
2403 argument in the argument list. */
2404
2405 static void
2406 gcn_function_arg_advance (cumulative_args_t cum_v,
2407 const function_arg_info &arg)
2408 {
2409 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2410
2411 if (cum->normal_function)
2412 {
2413 if (!arg.named)
2414 return;
2415
2416 int num_regs = num_arg_regs (arg);
2417 if (num_regs > 0)
2418 while ((FIRST_PARM_REG + cum->num) % num_regs != 0)
2419 cum->num++;
2420 cum->num += num_regs;
2421 }
2422 else
2423 {
2424 if (cum->num < cum->args.nargs)
2425 cum->num++;
2426 else
2427 {
2428 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (arg.type));
2429 cfun->machine->kernarg_segment_byte_size = cum->offset;
2430 }
2431 }
2432 }
2433
2434 /* Implement TARGET_ARG_PARTIAL_BYTES.
2435
2436 Returns the number of bytes at the beginning of an argument that must be put
2437 in registers. The value must be zero for arguments that are passed entirely
2438 in registers or that are entirely pushed on the stack. */
2439
2440 static int
2441 gcn_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg)
2442 {
2443 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
2444
2445 if (!arg.named)
2446 return 0;
2447
2448 if (targetm.calls.must_pass_in_stack (arg))
2449 return 0;
2450
2451 if (cum->num >= NUM_PARM_REGS)
2452 return 0;
2453
2454 /* If the argument fits entirely in registers, return 0. */
2455 if (cum->num + num_arg_regs (arg) <= NUM_PARM_REGS)
2456 return 0;
2457
2458 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD;
2459 }
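
/* Worked example (illustrative): if two parameter registers remain free
   and ARG needs four, the first 2 * UNITS_PER_WORD bytes of ARG are
   passed in registers and that value is returned here; the remainder of
   ARG is pushed on the stack.  */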
2460
2461 /* A normal function which takes a pointer argument (to a scalar) may be
2462 passed a pointer to LDS space (via a high-bits-set aperture), and that only
2463 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the
2464 function has an incoming pointer-to-scalar parameter. */
2465
2466 static void
2467 gcn_detect_incoming_pointer_arg (tree fndecl)
2468 {
2469 gcc_assert (cfun && cfun->machine);
2470
2471 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl));
2472 arg;
2473 arg = TREE_CHAIN (arg))
2474 if (POINTER_TYPE_P (TREE_VALUE (arg))
2475 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg))))
2476 cfun->machine->use_flat_addressing = true;
2477 }
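
/* Illustrative (hypothetical user code): a pointer-to-scalar parameter
   such as

     void scale (float *x, float f) { *x *= f; }

   forces FLAT addressing for the whole function, since the caller might
   hand it an LDS address via the high-bits aperture; a pointer to an
   aggregate (e.g. 'struct s *') does not trigger this.  */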
2478
2479 /* Implement INIT_CUMULATIVE_ARGS, via gcn.h.
2480
2481 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function
2482 whose data type is FNTYPE. For a library call, FNTYPE is 0. */
2483
2484 void
2485 gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ ,
2486 tree fntype /* tree ptr for function decl */ ,
2487 rtx libname /* SYMBOL_REF of library name or 0 */ ,
2488 tree fndecl, int caller)
2489 {
2490 memset (cum, 0, sizeof (*cum));
2491 cum->fntype = fntype;
2492 if (libname)
2493 {
2494 gcc_assert (cfun && cfun->machine);
2495 cum->normal_function = true;
2496 if (!caller)
2497 {
2498 cfun->machine->normal_function = true;
2499 gcn_detect_incoming_pointer_arg (fndecl);
2500 }
2501 return;
2502 }
2503 tree attr = NULL;
2504 if (fndecl)
2505 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl));
2506 if (fndecl && !attr)
2507 attr = lookup_attribute ("amdgpu_hsa_kernel",
2508 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)));
2509 if (!attr && fntype)
2510 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype));
2511 /* Handle main () as a kernel, so we can run the testsuite.
2512 Handle OpenACC kernels similarly to main. */
2513 if (!attr && !caller && fndecl
2514 && (MAIN_NAME_P (DECL_NAME (fndecl))
2515 || lookup_attribute ("omp target entrypoint",
2516 DECL_ATTRIBUTES (fndecl)) != NULL_TREE))
2517 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE);
2518 else
2519 {
2520 if (!attr || caller)
2521 {
2522 gcc_assert (cfun && cfun->machine);
2523 cum->normal_function = true;
2524 if (!caller)
2525 cfun->machine->normal_function = true;
2526 }
2527 gcn_parse_amdgpu_hsa_kernel_attribute
2528 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE);
2529 }
2530 cfun->machine->args = cum->args;
2531 if (!caller && cfun->machine->normal_function)
2532 gcn_detect_incoming_pointer_arg (fndecl);
2533
2534 reinit_regs ();
2535 }
2536
2537 static bool
2538 gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype))
2539 {
2540 machine_mode mode = TYPE_MODE (type);
2541 HOST_WIDE_INT size = int_size_in_bytes (type);
2542
2543 if (AGGREGATE_TYPE_P (type))
2544 return true;
2545
2546 /* Vector return values are not supported yet. */
2547 if (VECTOR_TYPE_P (type))
2548 return true;
2549
2550 if (mode == BLKmode)
2551 return true;
2552
2553 if (size > 2 * UNITS_PER_WORD)
2554 return true;
2555
2556 return false;
2557 }
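
/* Examples of how the rules above classify return values (illustrative
   user-level declarations; 'v8si' is a hypothetical vector_size typedef):

     int f1 (void);                        // small scalar: in a register
     struct s { int a, b, c; } f2 (void);  // aggregate: in memory
     v8si f3 (void);                       // vector type: in memory  */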
2558
2559 /* Implement TARGET_PROMOTE_FUNCTION_MODE.
2560
2561 Return the mode to use for outgoing function arguments. */
2562
2563 machine_mode
2564 gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode,
2565 int *ARG_UNUSED (punsignedp),
2566 const_tree ARG_UNUSED (funtype),
2567 int ARG_UNUSED (for_return))
2568 {
2569 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4)
2570 return SImode;
2571
2572 return mode;
2573 }
2574
2575 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.
2576
2577 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle
2578 ARGS_GROW_DOWNWARDS. */
2579
2580 static tree
2581 gcn_gimplify_va_arg_expr (tree valist, tree type,
2582 gimple_seq *ARG_UNUSED (pre_p),
2583 gimple_seq *ARG_UNUSED (post_p))
2584 {
2585 tree ptr = build_pointer_type (type);
2586 tree valist_type;
2587 tree t, u;
2588 bool indirect;
2589
2590 indirect = pass_va_arg_by_reference (type);
2591 if (indirect)
2592 {
2593 type = ptr;
2594 ptr = build_pointer_type (type);
2595 }
2596 valist_type = TREE_TYPE (valist);
2597
2598 /* Args grow down. Not handled by generic routines. */
2599
2600 u = fold_convert (sizetype, size_in_bytes (type));
2601 u = fold_build1 (NEGATE_EXPR, sizetype, u);
2602 t = fold_build_pointer_plus (valist, u);
2603
2604 /* Align to an 8-byte boundary. */
2605
2606 u = build_int_cst (TREE_TYPE (t), -8);
2607 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u);
2608 t = fold_convert (valist_type, t);
2609
2610 t = build2 (MODIFY_EXPR, valist_type, valist, t);
2611
2612 t = fold_convert (ptr, t);
2613 t = build_va_arg_indirect_ref (t);
2614
2615 if (indirect)
2616 t = build_va_arg_indirect_ref (t);
2617
2618 return t;
2619 }
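
/* Worked example of the arithmetic above (illustrative): fetching a
   12-byte argument when VALIST currently holds 0x1000 computes

     t = 0x1000 - 12;      // args grow down, step past the argument
     t = t & -8;           // 0xff4 rounded down to 0xff0
     valist = t;
     result = *(type *) t;

   so the pointer is decremented and aligned before the value is read
   through it (read twice indirectly when the argument is passed by
   reference).  */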
2620
2621 /* Return 1 if TRAIT NAME is present in the OpenMP context's
2622 device trait set, return 0 if not present in any OpenMP context in the
2623 whole translation unit, or -1 if not present in the current OpenMP context
2624 but might be present in another OpenMP context in the same TU. */
2625
2626 int
2627 gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
2628 const char *name)
2629 {
2630 switch (trait)
2631 {
2632 case omp_device_kind:
2633 return strcmp (name, "gpu") == 0;
2634 case omp_device_arch:
2635 return strcmp (name, "gcn") == 0;
2636 case omp_device_isa:
2637 if (strcmp (name, "fiji") == 0)
2638 return gcn_arch == PROCESSOR_FIJI;
2639 if (strcmp (name, "gfx900") == 0)
2640 return gcn_arch == PROCESSOR_VEGA10;
2641 if (strcmp (name, "gfx906") == 0)
2642 return gcn_arch == PROCESSOR_VEGA20;
2643 if (strcmp (name, "gfx908") == 0)
2644 return gcn_arch == PROCESSOR_GFX908;
2645 return 0;
2646 default:
2647 gcc_unreachable ();
2648 }
2649 }
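
/* Illustrative OpenMP usage of these traits (hypothetical user code;
   gfx906_impl and base_impl are invented names):

     #pragma omp declare variant (gfx906_impl) \
             match (device={arch("gcn"), isa("gfx906")})
     void base_impl (void);

   When compiling for gfx906 the arch and isa selectors above both return
   1 and the variant is chosen; for other ISAs the isa selector returns 0
   and the base version is used.  */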
2650
2651 /* Calculate stack offsets needed to create prologues and epilogues. */
2652
2653 static struct machine_function *
2654 gcn_compute_frame_offsets (void)
2655 {
2656 machine_function *offsets = cfun->machine;
2657
2658 if (reload_completed)
2659 return offsets;
2660
2661 offsets->need_frame_pointer = frame_pointer_needed;
2662
2663 offsets->outgoing_args_size = crtl->outgoing_args_size;
2664 offsets->pretend_size = crtl->args.pretend_args_size;
2665
2666 offsets->local_vars = get_frame_size ();
2667
2668 offsets->lr_needs_saving = (!leaf_function_p ()
2669 || df_regs_ever_live_p (LR_REGNUM)
2670 || df_regs_ever_live_p (LR_REGNUM + 1));
2671
2672 offsets->callee_saves = offsets->lr_needs_saving ? 8 : 0;
2673
2674 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2675 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2676 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2677 && frame_pointer_needed))
2678 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
2679
2680 /* Round up to a 64-bit boundary to maintain stack alignment. */
2681 offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
2682
2683 return offsets;
2684 }
2685
2686 /* Insert code into the prologue or epilogue to store or load any
2687 callee-save register to/from the stack.
2688
2689 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */
2690
2691 static void
2692 move_callee_saved_registers (rtx sp, machine_function *offsets,
2693 bool prologue)
2694 {
2695 int regno, offset, saved_scalars;
2696 rtx exec = gen_rtx_REG (DImode, EXEC_REG);
2697 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG);
2698 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22));
2699 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE);
2700 HOST_WIDE_INT exec_set = 0;
2701 int offreg_set = 0;
2702 auto_vec<int> saved_sgprs;
2703
2704 start_sequence ();
2705
2706 /* Move scalars into two vector registers. */
2707 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++)
2708 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2709 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving)
2710 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
2711 && offsets->need_frame_pointer))
2712 {
2713 rtx reg = gen_rtx_REG (SImode, regno);
2714 rtx vreg = gen_rtx_REG (V64SImode,
2715 VGPR_REGNO (6 + (saved_scalars / 64)));
2716 int lane = saved_scalars % 64;
2717
2718 if (prologue)
2719 {
2720 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane)));
2721 saved_sgprs.safe_push (regno);
2722 }
2723 else
2724 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane)));
2725
2726 saved_scalars++;
2727 }
2728
2729 rtx move_scalars = get_insns ();
2730 end_sequence ();
2731 start_sequence ();
2732
2733 /* Ensure that all vector lanes are moved. */
2734 exec_set = -1;
2735 emit_move_insn (exec, GEN_INT (exec_set));
2736
2737 /* Set up a vector stack pointer. */
2738 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
2739 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3));
2740 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2),
2741 gcn_gen_undef (V64SImode), exec));
2742 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4));
2743 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode),
2744 exec));
2745 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0),
2746 gcn_operand_part (V64SImode, vsp, 0),
2747 _0_4_8_12, vcc, gcn_gen_undef (V64SImode),
2748 exec));
2749 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1),
2750 gcn_operand_part (V64SImode, vsp, 1),
2751 const0_rtx, vcc, vcc,
2752 gcn_gen_undef (V64SImode), exec));
2753
2754 /* Move vectors. */
2755 for (regno = FIRST_VGPR_REG, offset = 0;
2756 regno < FIRST_PSEUDO_REGISTER; regno++)
2757 if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
2758 || (regno == VGPR_REGNO (6) && saved_scalars > 0)
2759 || (regno == VGPR_REGNO (7) && saved_scalars > 63))
2760 {
2761 rtx reg = gen_rtx_REG (V64SImode, regno);
2762 int size = 256;
2763
2764 if (regno == VGPR_REGNO (6) && saved_scalars < 64)
2765 size = saved_scalars * 4;
2766 else if (regno == VGPR_REGNO (7) && saved_scalars < 128)
2767 size = (saved_scalars - 64) * 4;
2768
2769 if (size != 256 || exec_set != -1)
2770 {
2771 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1;
2772 emit_move_insn (exec, gen_int_mode (exec_set, DImode));
2773 }
2774
2775 if (prologue)
2776 {
2777 rtx insn = emit_insn (gen_scatterv64si_insn_1offset_exec
2778 (vsp, const0_rtx, reg, as, const0_rtx,
2779 exec));
2780
2781 /* Add CFI metadata. */
2782 rtx note;
2783 if (regno == VGPR_REGNO (6) || regno == VGPR_REGNO (7))
2784 {
2785 int start = (regno == VGPR_REGNO (7) ? 64 : 0);
2786 int count = MIN (saved_scalars - start, 64);
2787 int add_lr = (regno == VGPR_REGNO (6)
2788 && offsets->lr_needs_saving);
2789 int lrdest = -1;
2790 rtvec seq = rtvec_alloc (count + add_lr);
2791
2792 /* Add a REG_FRAME_RELATED_EXPR entry for each scalar
2793 register that was saved in this batch. */
2794 for (int idx = 0; idx < count; idx++)
2795 {
2796 int stackaddr = offset + idx * 4;
2797 rtx dest = gen_rtx_MEM (SImode,
2798 gen_rtx_PLUS
2799 (DImode, sp,
2800 GEN_INT (stackaddr)));
2801 rtx src = gen_rtx_REG (SImode, saved_sgprs[start + idx]);
2802 rtx set = gen_rtx_SET (dest, src);
2803 RTX_FRAME_RELATED_P (set) = 1;
2804 RTVEC_ELT (seq, idx) = set;
2805
2806 if (saved_sgprs[start + idx] == LINK_REGNUM)
2807 lrdest = stackaddr;
2808 }
2809
2810 /* Add an additional expression for DWARF_LINK_REGISTER if
2811 LINK_REGNUM was saved. */
2812 if (lrdest != -1)
2813 {
2814 rtx dest = gen_rtx_MEM (DImode,
2815 gen_rtx_PLUS
2816 (DImode, sp,
2817 GEN_INT (lrdest)));
2818 rtx src = gen_rtx_REG (DImode, DWARF_LINK_REGISTER);
2819 rtx set = gen_rtx_SET (dest, src);
2820 RTX_FRAME_RELATED_P (set) = 1;
2821 RTVEC_ELT (seq, count) = set;
2822 }
2823
2824 note = gen_rtx_SEQUENCE (VOIDmode, seq);
2825 }
2826 else
2827 {
2828 rtx dest = gen_rtx_MEM (V64SImode,
2829 gen_rtx_PLUS (DImode, sp,
2830 GEN_INT (offset)));
2831 rtx src = gen_rtx_REG (V64SImode, regno);
2832 note = gen_rtx_SET (dest, src);
2833 }
2834 RTX_FRAME_RELATED_P (insn) = 1;
2835 add_reg_note (insn, REG_FRAME_RELATED_EXPR, note);
2836 }
2837 else
2838 emit_insn (gen_gatherv64si_insn_1offset_exec
2839 (reg, vsp, const0_rtx, as, const0_rtx,
2840 gcn_gen_undef (V64SImode), exec));
2841
2842 /* Move our VSP to the next stack entry. */
2843 if (offreg_set != size)
2844 {
2845 offreg_set = size;
2846 emit_move_insn (offreg, GEN_INT (size));
2847 }
2848 if (exec_set != -1)
2849 {
2850 exec_set = -1;
2851 emit_move_insn (exec, GEN_INT (exec_set));
2852 }
2853 emit_insn (gen_addv64si3_vcc_dup_exec
2854 (gcn_operand_part (V64SImode, vsp, 0),
2855 offreg, gcn_operand_part (V64SImode, vsp, 0),
2856 vcc, gcn_gen_undef (V64SImode), exec));
2857 emit_insn (gen_addcv64si3_exec
2858 (gcn_operand_part (V64SImode, vsp, 1),
2859 gcn_operand_part (V64SImode, vsp, 1),
2860 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec));
2861
2862 offset += size;
2863 }
2864
2865 rtx move_vectors = get_insns ();
2866 end_sequence ();
2867
2868 if (prologue)
2869 {
2870 emit_insn (move_scalars);
2871 emit_insn (move_vectors);
2872 }
2873 else
2874 {
2875 emit_insn (move_vectors);
2876 emit_insn (move_scalars);
2877 }
2878 }
2879
2880 /* Generate prologue. Called from gen_prologue during pro_and_epilogue pass.
2881
2882 For a non-kernel function, the stack layout looks like this (interim),
2883 growing *upwards*:
2884
2885 hi | + ...
2886 |__________________| <-- current SP
2887 | outgoing args |
2888 |__________________|
2889 | (alloca space) |
2890 |__________________|
2891 | local vars |
2892 |__________________| <-- FP/hard FP
2893 | callee-save regs |
2894 |__________________| <-- soft arg pointer
2895 | pretend args |
2896 |__________________| <-- incoming SP
2897 | incoming args |
2898 lo |..................|
2899
2900 This implies arguments (beyond the first N in registers) must grow
2901 downwards (as, apparently, PA has them do).
2902
2903 For a kernel function we have the simpler:
2904
2905 hi | + ...
2906 |__________________| <-- current SP
2907 | outgoing args |
2908 |__________________|
2909 | (alloca space) |
2910 |__________________|
2911 | local vars |
2912 lo |__________________| <-- FP/hard FP
2913
2914 */
2915
2916 void
2917 gcn_expand_prologue ()
2918 {
2919 machine_function *offsets = gcn_compute_frame_offsets ();
2920
2921 if (!cfun || !cfun->machine || cfun->machine->normal_function)
2922 {
2923 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
2924 rtx sp_hi = gcn_operand_part (Pmode, sp, 1);
2925 rtx sp_lo = gcn_operand_part (Pmode, sp, 0);
2926 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
2927 rtx fp_hi = gcn_operand_part (Pmode, fp, 1);
2928 rtx fp_lo = gcn_operand_part (Pmode, fp, 0);
2929
2930 start_sequence ();
2931
2932 if (offsets->pretend_size > 0)
2933 {
2934 /* FIXME: Do the actual saving of register pretend args to the stack.
2935 Register order needs consideration. */
2936 }
2937
2938 /* Save callee-save regs. */
2939 move_callee_saved_registers (sp, offsets, true);
2940
2941 HOST_WIDE_INT sp_adjust = offsets->pretend_size
2942 + offsets->callee_saves
2943 + offsets->local_vars + offsets->outgoing_args_size;
2944 if (sp_adjust > 0)
2945 {
2946 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2947 we use a split add explicitly, and specify the DImode add in
2948 the note. */
2949 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2950 rtx adjustment = gen_int_mode (sp_adjust, SImode);
2951 rtx insn = emit_insn (gen_addsi3_scalar_carry (sp_lo, sp_lo,
2952 adjustment, scc));
2953 if (!offsets->need_frame_pointer)
2954 {
2955 RTX_FRAME_RELATED_P (insn) = 1;
2956 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
2957 gen_rtx_SET (sp,
2958 gen_rtx_PLUS (DImode, sp,
2959 adjustment)));
2960 }
2961 emit_insn (gen_addcsi3_scalar_zero (sp_hi, sp_hi, scc));
2962 }
2963
2964 if (offsets->need_frame_pointer)
2965 {
2966 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so
2967 we use a split add explicitly, and specify the DImode add in
2968 the note. */
2969 rtx scc = gen_rtx_REG (BImode, SCC_REG);
2970 int fp_adjust = -(offsets->local_vars + offsets->outgoing_args_size);
2971 rtx adjustment = gen_int_mode (fp_adjust, SImode);
2972 rtx insn = emit_insn (gen_addsi3_scalar_carry(fp_lo, sp_lo,
2973 adjustment, scc));
2974 emit_insn (gen_addcsi3_scalar (fp_hi, sp_hi,
2975 (fp_adjust < 0 ? GEN_INT (-1)
2976 : const0_rtx),
2977 scc, scc));
2978
2979 /* Set the CFA to the entry stack address, as an offset from the
2980 frame pointer. This is preferred because the frame pointer is
2981 saved in each frame, whereas the stack pointer is not. */
2982 RTX_FRAME_RELATED_P (insn) = 1;
2983 add_reg_note (insn, REG_CFA_DEF_CFA,
2984 gen_rtx_PLUS (DImode, fp,
2985 GEN_INT (-(offsets->pretend_size
2986 + offsets->callee_saves))));
2987 }
2988
2989 rtx_insn *seq = get_insns ();
2990 end_sequence ();
2991
2992 emit_insn (seq);
2993 }
2994 else
2995 {
2996 rtx wave_offset = gen_rtx_REG (SImode,
2997 cfun->machine->args.
2998 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
2999
3000 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
3001 {
3002 rtx fs_init_lo =
3003 gen_rtx_REG (SImode,
3004 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]);
3005 rtx fs_init_hi =
3006 gen_rtx_REG (SImode,
3007 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1);
3008 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG);
3009 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1);
3010
3011 /*rtx queue = gen_rtx_REG(DImode,
3012 cfun->machine->args.reg[QUEUE_PTR_ARG]);
3013 rtx aperture = gen_rtx_MEM (SImode,
3014 gen_rtx_PLUS (DImode, queue,
3015 gen_int_mode (68, SImode)));
3016 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/
3017
3018 /* Set up flat_scratch. */
3019 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset));
3020 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi,
3021 gen_int_mode (8, SImode)));
3022 emit_move_insn (fs_reg_lo, fs_init_hi);
3023 }
3024
3025 /* Set up frame pointer and stack pointer. */
3026 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM);
3027 rtx sp_hi = simplify_gen_subreg (SImode, sp, DImode, 4);
3028 rtx sp_lo = simplify_gen_subreg (SImode, sp, DImode, 0);
3029 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM);
3030 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4);
3031 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0);
3032
3033 HOST_WIDE_INT sp_adjust = (offsets->local_vars
3034 + offsets->outgoing_args_size);
3035
3036 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */
3037 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
3038 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
3039 gen_int_mode (0xffff, SImode)));
3040 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3041 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
3042 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
3043
3044 /* Adding RTX_FRAME_RELATED_P effectively disables splitting, so we use
3045 a split add explicitly, and specify the DImode add in the note.
3046 The DWARF info expects that the callee-save data is in the frame,
3047 even though it isn't (because this is the entry point), so we
3048 make a notional adjustment to the DWARF frame offset here. */
3049 rtx dbg_adjustment = gen_int_mode (sp_adjust + offsets->callee_saves,
3050 DImode);
3051 rtx insn;
3052 if (sp_adjust > 0)
3053 {
3054 rtx scc = gen_rtx_REG (BImode, SCC_REG);
3055 rtx adjustment = gen_int_mode (sp_adjust, DImode);
3056 insn = emit_insn (gen_addsi3_scalar_carry(sp_lo, fp_lo, adjustment,
3057 scc));
3058 emit_insn (gen_addcsi3_scalar_zero (sp_hi, fp_hi, scc));
3059 }
3060 else
3061 insn = emit_move_insn (sp, fp);
3062 RTX_FRAME_RELATED_P (insn) = 1;
3063 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
3064 gen_rtx_SET (sp, gen_rtx_PLUS (DImode, sp,
3065 dbg_adjustment)));
3066
3067 if (offsets->need_frame_pointer)
3068 {
3069 /* Set the CFA to the entry stack address, as an offset from the
3070 frame pointer. This is necessary when alloca is used, and
3071 harmless otherwise. */
3072 rtx neg_adjust = gen_int_mode (-offsets->callee_saves, DImode);
3073 add_reg_note (insn, REG_CFA_DEF_CFA,
3074 gen_rtx_PLUS (DImode, fp, neg_adjust));
3075 }
3076
3077 /* Make sure the flat scratch reg doesn't get optimised away. */
3078 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG)));
3079 }
3080
3081 /* Ensure that the scheduler doesn't do anything unexpected. */
3082 emit_insn (gen_blockage ());
3083
3084 /* m0 is initialized for the usual LDS DS and FLAT memory case.
3085 The low-part is the address of the topmost addressable byte, which is
3086 size-1. The high-part is an offset and should be zero. */
3087 emit_move_insn (gen_rtx_REG (SImode, M0_REG),
3088 gen_int_mode (LDS_SIZE, SImode));
3089
3090 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
3091
3092 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
3093 {
3094 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */
3095 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3096 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode,
3097 "gomp_gcn_enter_kernel"));
3098 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3099 }
3100 }
3101
3102 /* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass.
3103
3104 See gcn_expand_prologue for stack details. */
3105
3106 void
3107 gcn_expand_epilogue (void)
3108 {
3109 /* Ensure that the scheduler doesn't do anything unexpected. */
3110 emit_insn (gen_blockage ());
3111
3112 if (!cfun || !cfun->machine || cfun->machine->normal_function)
3113 {
3114 machine_function *offsets = gcn_compute_frame_offsets ();
3115 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
3116 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
3117
3118 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size;
3119
3120 if (offsets->need_frame_pointer)
3121 {
3122 /* Restore old SP from the frame pointer. */
3123 if (sp_adjust > 0)
3124 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode)));
3125 else
3126 emit_move_insn (sp, fp);
3127 }
3128 else
3129 {
3130 /* Restore old SP from current SP. */
3131 sp_adjust += offsets->outgoing_args_size + offsets->local_vars;
3132
3133 if (sp_adjust > 0)
3134 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode)));
3135 }
3136
3137 move_callee_saved_registers (sp, offsets, false);
3138
3139 /* There's no explicit use of the link register on the return insn. Emit
3140 one here instead. */
3141 if (offsets->lr_needs_saving)
3142 emit_use (gen_rtx_REG (DImode, LINK_REGNUM));
3143
3144 /* Similar for frame pointer. */
3145 if (offsets->need_frame_pointer)
3146 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM));
3147 }
3148 else if (flag_openmp)
3149 {
3150 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */
3151 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG);
3152 emit_move_insn (fn_reg,
3153 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel"));
3154 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx));
3155 }
3156 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE)
3157 {
3158 /* Assume that an exit value compatible with gcn-run is expected.
3159 That is, the third input parameter is an int*.
3160
3161 We can't allocate any new registers, but the kernarg_reg is
3162 dead after this, so we'll use that. */
3163 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
3164 [KERNARG_SEGMENT_PTR_ARG]);
3165 rtx retptr_mem = gen_rtx_MEM (DImode,
3166 gen_rtx_PLUS (DImode, kernarg_reg,
3167 GEN_INT (16)));
3168 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
3169 emit_move_insn (kernarg_reg, retptr_mem);
3170
3171 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg);
3172 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT);
3173 emit_move_insn (retval_mem,
3174 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG)));
3175 }
3176
3177 emit_jump_insn (gen_gcn_return ());
3178 }
3179
3180 /* Implement TARGET_FRAME_POINTER_REQUIRED.
3181
3182 Return true if the frame pointer should not be eliminated. */
3183
3184 bool
3185 gcn_frame_pointer_rqd (void)
3186 {
3187 /* GDB needs the frame pointer in order to unwind properly,
3188 but that's not important for the entry point, unless alloca is used.
3189 It's not important for code execution, so we should respect the
3190 -fomit-frame-pointer flag. */
3191 return (!flag_omit_frame_pointer
3192 && cfun
3193 && (cfun->calls_alloca
3194 || (cfun->machine && cfun->machine->normal_function)));
3195 }
3196
3197 /* Implement TARGET_CAN_ELIMINATE.
3198
3199 Return true if the compiler is allowed to try to replace register number
3200 FROM_REG with register number TO_REG.
3201
3202 FIXME: is the default "true" not enough? Should this be a negative set? */
3203
3204 bool
3205 gcn_can_eliminate_p (int /*from_reg */ , int to_reg)
3206 {
3207 return (to_reg == HARD_FRAME_POINTER_REGNUM
3208 || to_reg == STACK_POINTER_REGNUM);
3209 }
3210
3211 /* Implement INITIAL_ELIMINATION_OFFSET.
3212
3213 Returns the initial difference between the specified pair of registers, in
3214 terms of stack position. */
3215
3216 HOST_WIDE_INT
3217 gcn_initial_elimination_offset (int from, int to)
3218 {
3219 machine_function *offsets = gcn_compute_frame_offsets ();
3220
3221 switch (from)
3222 {
3223 case ARG_POINTER_REGNUM:
3224 if (to == STACK_POINTER_REGNUM)
3225 return -(offsets->callee_saves + offsets->local_vars
3226 + offsets->outgoing_args_size);
3227 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM)
3228 return -offsets->callee_saves;
3229 else
3230 gcc_unreachable ();
3231 break;
3232
3233 case FRAME_POINTER_REGNUM:
3234 if (to == STACK_POINTER_REGNUM)
3235 return -(offsets->local_vars + offsets->outgoing_args_size);
3236 else if (to == HARD_FRAME_POINTER_REGNUM)
3237 return 0;
3238 else
3239 gcc_unreachable ();
3240 break;
3241
3242 default:
3243 gcc_unreachable ();
3244 }
3245 }
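
/* Worked example (illustrative): with 8 bytes of callee saves, 16 bytes
   of local variables and 32 bytes of outgoing args, the offsets are

     ARG_POINTER -> STACK_POINTER:          -(8 + 16 + 32) = -56
     ARG_POINTER -> (HARD_)FRAME_POINTER:   -8
     FRAME_POINTER -> STACK_POINTER:        -(16 + 32) = -48

   which matches the upward-growing layout pictured before
   gcn_expand_prologue.  */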
3246
3247 /* Implement HARD_REGNO_RENAME_OK.
3248
3249 Return true if it is permissible to rename a hard register from
3250 FROM_REG to TO_REG. */
3251
3252 bool
3253 gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg)
3254 {
3255 if (from_reg == SCC_REG
3256 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG
3257 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG
3258 || to_reg == SCC_REG
3259 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG
3260 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG)
3261 return false;
3262
3263 /* Allow the link register to be used if it was saved. */
3264 if ((to_reg & ~1) == LINK_REGNUM)
3265 return !cfun || cfun->machine->lr_needs_saving;
3266
3267 /* Allow the registers used for the static chain to be used if the chain is
3268 not in active use. */
3269 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM)
3270 return !cfun
3271 || !(cfun->static_chain_decl
3272 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM)
3273 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1));
3274
3275 return true;
3276 }
3277
3278 /* Implement HARD_REGNO_CALLER_SAVE_MODE.
3279
3280 Which mode is required for saving NREGS of a pseudo-register in
3281 call-clobbered hard register REGNO. */
3282
3283 machine_mode
3284 gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs,
3285 machine_mode regmode)
3286 {
3287 machine_mode result = choose_hard_reg_mode (regno, nregs, NULL);
3288
3289 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode))
3290 result = (nregs == 1 ? SImode : DImode);
3291
3292 return result;
3293 }
3294
3295 /* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.
3296
3297 Output assembler code for a block containing the constant parts
3298 of a trampoline, leaving space for the variable parts. */
3299
3300 static void
3301 gcn_asm_trampoline_template (FILE *f)
3302 {
3303 /* The source operand of the move instructions must be a 32-bit
3304 constant following the opcode. */
3305 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM);
3306 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1);
3307 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG);
3308 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1);
3309 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1);
3310 }
3311
3312 /* Implement TARGET_TRAMPOLINE_INIT.
3313
3314 Emit RTL insns to initialize the variable parts of a trampoline.
3315 FNDECL is the decl of the target address, M_TRAMP is a MEM for
3316 the trampoline, and CHAIN_VALUE is an RTX for the static chain
3317 to be passed to the target function. */
3318
3319 static void
3320 gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
3321 {
3322 if (TARGET_GCN5_PLUS)
3323 sorry ("nested function trampolines not supported on GCN5 due to"
3324 " non-executable stacks");
3325
3326 emit_block_move (m_tramp, assemble_trampoline_template (),
3327 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
3328
3329 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
3330 rtx chain_value_reg = copy_to_reg (chain_value);
3331 rtx fnaddr_reg = copy_to_reg (fnaddr);
3332
3333 for (int i = 0; i < 4; i++)
3334 {
3335 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4);
3336 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg;
3337 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4));
3338 }
3339
3340 rtx tramp_addr = XEXP (m_tramp, 0);
3341 emit_insn (gen_clear_icache (tramp_addr,
3342 plus_constant (ptr_mode, tramp_addr,
3343 TRAMPOLINE_SIZE)));
3344 }
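
/* Illustrative sketch of the trampoline once gcn_trampoline_init has
   patched the 0xffff placeholders (register numbers shown symbolically):

     s_mov_b32   s<chain>,   <static chain, low 32 bits>
     s_mov_b32   s<chain+1>, <static chain, high 32 bits>
     s_mov_b32   s<cc>,      <nested function address, low 32 bits>
     s_mov_b32   s<cc+1>,    <nested function address, high 32 bits>
     s_setpc_b64 s[<cc>:<cc+1>]

   i.e. the static chain lands in STATIC_CHAIN_REGNUM and control jumps
   to the nested function through CC_SAVE_REG.  */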
3345
3346 /* }}} */
3347 /* {{{ Miscellaneous. */
3348
3349 /* Implement TARGET_CANNOT_COPY_INSN_P.
3350
3351 Return true if INSN must not be duplicated. */
3352
3353 static bool
3354 gcn_cannot_copy_insn_p (rtx_insn *insn)
3355 {
3356 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier)
3357 return true;
3358
3359 return false;
3360 }
3361
3362 /* Implement TARGET_DEBUG_UNWIND_INFO.
3363
3364 Defines the mechanism that will be used for describing frame unwind
3365 information to the debugger. */
3366
3367 static enum unwind_info_type
3368 gcn_debug_unwind_info ()
3369 {
3370 return UI_DWARF2;
3371 }
3372
3373 /* Determine if there is a suitable hardware conversion instruction.
3374 Used primarily by the machine description. */
3375
3376 bool
3377 gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op)
3378 {
3379 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to))
3380 return false;
3381
3382 if (VECTOR_MODE_P (from))
3383 {
3384 from = GET_MODE_INNER (from);
3385 to = GET_MODE_INNER (to);
3386 }
3387
3388 switch (op)
3389 {
3390 case fix_trunc_cvt:
3391 case fixuns_trunc_cvt:
3392 if (GET_MODE_CLASS (from) != MODE_FLOAT
3393 || GET_MODE_CLASS (to) != MODE_INT)
3394 return false;
3395 break;
3396 case float_cvt:
3397 case floatuns_cvt:
3398 if (GET_MODE_CLASS (from) != MODE_INT
3399 || GET_MODE_CLASS (to) != MODE_FLOAT)
3400 return false;
3401 break;
3402 case extend_cvt:
3403 if (GET_MODE_CLASS (from) != MODE_FLOAT
3404 || GET_MODE_CLASS (to) != MODE_FLOAT
3405 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to))
3406 return false;
3407 break;
3408 case trunc_cvt:
3409 if (GET_MODE_CLASS (from) != MODE_FLOAT
3410 || GET_MODE_CLASS (to) != MODE_FLOAT
3411 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to))
3412 return false;
3413 break;
3414 }
3415
3416 return ((to == HImode && from == HFmode)
3417 || (to == SImode && (from == SFmode || from == DFmode))
3418 || (to == HFmode && (from == HImode || from == SFmode))
3419 || (to == SFmode && (from == SImode || from == HFmode
3420 || from == DFmode))
3421 || (to == DFmode && (from == SImode || from == SFmode)));
3422 }
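
/* Examples of the classification above (illustrative):

     gcn_valid_cvt_p (SFmode, SImode, fix_trunc_cvt)  -> true
     gcn_valid_cvt_p (SImode, DFmode, float_cvt)      -> true
     gcn_valid_cvt_p (DFmode, HFmode, trunc_cvt)      -> false

   the last because DFmode -> HFmode is not in the supported list; the
   same answers hold for the corresponding V64 vector mode pairs.  */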
3423
3424 /* Implement TARGET_EMUTLS_VAR_INIT.
3425
3426 Disable emutls (gthr-gcn.h does not support it, yet). */
3427
3428 tree
3429 gcn_emutls_var_init (tree, tree decl, tree)
3430 {
3431 sorry_at (DECL_SOURCE_LOCATION (decl), "TLS is not implemented for GCN.");
3432 return NULL_TREE;
3433 }
3434
3435 /* }}} */
3436 /* {{{ Costs. */
3437
3438 /* Implement TARGET_RTX_COSTS.
3439
3440 Compute a (partial) cost for rtx X. Return true if the complete
3441 cost has been computed, and false if subexpressions should be
3442 scanned. In either case, *TOTAL contains the cost result. */
3443
3444 static bool
3445 gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool)
3446 {
3447 enum rtx_code code = GET_CODE (x);
3448 switch (code)
3449 {
3450 case CONST:
3451 case CONST_DOUBLE:
3452 case CONST_VECTOR:
3453 case CONST_INT:
3454 if (gcn_inline_constant_p (x))
3455 *total = 0;
3456 else if (code == CONST_INT
3457 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000)
3458 *total = 1;
3459 else if (gcn_constant_p (x))
3460 *total = 2;
3461 else
3462 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4;
3463 return true;
3464
3465 case DIV:
3466 *total = 100;
3467 return false;
3468
3469 default:
3470 *total = 3;
3471 return false;
3472 }
3473 }
3474
3475 /* Implement TARGET_MEMORY_MOVE_COST.
3476
3477 Return the cost of moving data of mode M between a
3478 register and memory. A value of 2 is the default; this cost is
3479 relative to those in `REGISTER_MOVE_COST'.
3480
3481 This function is used extensively by register_move_cost, which is used to
3482 build tables at startup. Make it inline in this case.
3483 When IN is 2, return the maximum of the in and out move costs.
3484
3485 If moving between registers and memory is more expensive than
3486 between two registers, you should define this macro to express the
3487 relative cost.
3488
3489 Also model the increased cost of moving QImode registers in non-Q_REGS
3490 classes. */
3491
3492 #define LOAD_COST 32
3493 #define STORE_COST 32
3494 static int
3495 gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
3496 {
3497 int nregs = CEIL (GET_MODE_SIZE (mode), 4);
3498 switch (regclass)
3499 {
3500 case SCC_CONDITIONAL_REG:
3501 case VCCZ_CONDITIONAL_REG:
3502 case VCC_CONDITIONAL_REG:
3503 case EXECZ_CONDITIONAL_REG:
3504 case ALL_CONDITIONAL_REGS:
3505 case SGPR_REGS:
3506 case SGPR_EXEC_REGS:
3507 case EXEC_MASK_REG:
3508 case SGPR_VOP_SRC_REGS:
3509 case SGPR_MEM_SRC_REGS:
3510 case SGPR_SRC_REGS:
3511 case SGPR_DST_REGS:
3512 case GENERAL_REGS:
3513 case AFP_REGS:
3514 if (!in)
3515 return (STORE_COST + 2) * nregs;
3516 return LOAD_COST * nregs;
3517 case VGPR_REGS:
3518 if (in)
3519 return (LOAD_COST + 2) * nregs;
3520 return STORE_COST * nregs;
3521 case ALL_REGS:
3522 case ALL_GPR_REGS:
3523 case SRCDST_REGS:
3524 if (in)
3525 return (LOAD_COST + 2) * nregs;
3526 return (STORE_COST + 2) * nregs;
3527 default:
3528 gcc_unreachable ();
3529 }
3530 }
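
/* Worked example (illustrative): loading a V64SImode value (256 bytes,
   so nregs = 64) into VGPR_REGS costs (LOAD_COST + 2) * 64 = 2176,
   whereas loading an SImode value into SGPR_REGS costs LOAD_COST = 32,
   reflecting how much more data a vector spill or reload moves.  */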
3531
3532 /* Implement TARGET_REGISTER_MOVE_COST.
3533
3534 Return the cost of moving data from a register in class CLASS1 to
3535 one in class CLASS2. Base value is 2. */
3536
3537 static int
3538 gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
3539 {
3540 /* Increase the cost of moving to and from vector registers. While this is
3541 fast in hardware (I think), it has a hidden cost of setting up the exec
3542 flags. */
3543 if ((src < VGPR_REGS) != (dst < VGPR_REGS))
3544 return 4;
3545 return 2;
3546 }
3547
3548 /* }}} */
3549 /* {{{ Builtins. */
3550
3551 /* Type codes used by GCN built-in definitions. */
3552
3553 enum gcn_builtin_type_index
3554 {
3555 GCN_BTI_END_OF_PARAMS,
3556
3557 GCN_BTI_VOID,
3558 GCN_BTI_BOOL,
3559 GCN_BTI_INT,
3560 GCN_BTI_UINT,
3561 GCN_BTI_SIZE_T,
3562 GCN_BTI_LLINT,
3563 GCN_BTI_LLUINT,
3564 GCN_BTI_EXEC,
3565
3566 GCN_BTI_SF,
3567 GCN_BTI_V64SI,
3568 GCN_BTI_V64SF,
3569 GCN_BTI_V64PTR,
3570 GCN_BTI_SIPTR,
3571 GCN_BTI_SFPTR,
3572 GCN_BTI_VOIDPTR,
3573
3574 GCN_BTI_LDS_VOIDPTR,
3575
3576 GCN_BTI_MAX
3577 };
3578
3579 static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX];
3580
3581 #define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC])
3582 #define sf_type_node (gcn_builtin_types[GCN_BTI_SF])
3583 #define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI])
3584 #define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF])
3585 #define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR])
3586 #define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR])
3587 #define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR])
3588 #define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR])
3589 #define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T])
3590
3591 static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int,
3592 struct gcn_builtin_description *);
3593 static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int,
3594 struct gcn_builtin_description *);
3595
3596 struct gcn_builtin_description;
3597 typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int,
3598 struct gcn_builtin_description *);
3599
3600 enum gcn_builtin_type
3601 {
3602 B_UNIMPLEMENTED, /* Sorry out */
3603 B_INSN, /* Emit a pattern */
3604 B_OVERLOAD /* Placeholder for an overloaded function */
3605 };
3606
3607 struct gcn_builtin_description
3608 {
3609 int fcode;
3610 int icode;
3611 const char *name;
3612 enum gcn_builtin_type type;
3613 /* The first element of parm is always the return type. The rest
3614 are a zero-terminated list of parameters. */
3615 int parm[6];
3616 gcn_builtin_expander expander;
3617 };
3618
3619 /* Read in the GCN builtins from gcn-builtins.def. */
3620
3621 extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX];
3622
3623 struct gcn_builtin_description gcn_builtins[] = {
3624 #define DEF_BUILTIN(fcode, icode, name, type, params, expander) \
3625 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander},
3626
3627 #define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \
3628 {GCN_BUILTIN_ ## fcode ## _V64SI, \
3629 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \
3630 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3631 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \
3632 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \
3633 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \
3634 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \
3635 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop},
3636
3637 #include "gcn-builtins.def"
3638 #undef DEF_BUILTIN_BINOP_INT_FP
3639 #undef DEF_BUILTIN
3640 };
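
/* For illustration, a hypothetical line in gcn-builtins.def such as

     DEF_BUILTIN_BINOP_INT_FP (FOO, foo, "foo")

   would expand (via the macro above) into two B_INSN entries that both
   use CODE_FOR_foov64si3_exec: "foo_v64int", whose parameters are the
   exec mask plus three V64SI operands, and "foo_v64int_unspec", which
   omits the final V64SI operand; both are expanded through
   gcn_expand_builtin_binop.  */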
3641
3642 static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX];
3643
3644 /* Implement TARGET_BUILTIN_DECL.
3645
3646 Return the GCN builtin for CODE. */
3647
3648 tree
3649 gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3650 {
3651 if (code >= GCN_BUILTIN_MAX)
3652 return error_mark_node;
3653
3654 return gcn_builtin_decls[code];
3655 }
3656
3657 /* Helper function for gcn_init_builtins. */
3658
3659 static void
3660 gcn_init_builtin_types (void)
3661 {
3662 gcn_builtin_types[GCN_BTI_VOID] = void_type_node;
3663 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node;
3664 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node;
3665 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node);
3666 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node;
3667 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node;
3668 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node);
3669
3670 exec_type_node = unsigned_intDI_type_node;
3671 sf_type_node = float32_type_node;
3672 v64si_type_node = build_vector_type (intSI_type_node, 64);
3673 v64sf_type_node = build_vector_type (float_type_node, 64);
3674 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node
3675 /*build_pointer_type
3676 (integer_type_node) */
3677 , 64);
3678 tree tmp = build_distinct_type_copy (intSI_type_node);
3679 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3680 siptr_type_node = build_pointer_type (tmp);
3681
3682 tmp = build_distinct_type_copy (float_type_node);
3683 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3684 sfptr_type_node = build_pointer_type (tmp);
3685
3686 tmp = build_distinct_type_copy (void_type_node);
3687 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT;
3688 voidptr_type_node = build_pointer_type (tmp);
3689
3690 tmp = build_distinct_type_copy (void_type_node);
3691 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS;
3692 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp);
3693 }
3694
3695 /* Implement TARGET_INIT_BUILTINS.
3696
3697 Set up all builtin functions for this target. */
3698
3699 static void
3700 gcn_init_builtins (void)
3701 {
3702 gcn_init_builtin_types ();
3703
3704 struct gcn_builtin_description *d;
3705 unsigned int i;
3706 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++)
3707 {
3708 tree p;
3709 char name[64]; /* build_function will make a copy. */
3710 int parm;
3711
3712 /* FIXME: Is this necessary/useful? */
3713 if (d->name == 0)
3714 continue;
3715
3716 /* Find last parm. */
3717 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++)
3718 ;
3719
3720 p = void_list_node;
3721 while (parm > 1)
3722 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p);
3723
3724 p = build_function_type (gcn_builtin_types[d->parm[0]], p);
3725
3726 sprintf (name, "__builtin_gcn_%s", d->name);
3727 gcn_builtin_decls[i]
3728 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
3729
3730 /* These builtins don't throw. */
3731 TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
3732 }
3733
3734 /* These builtins need to take/return an LDS pointer: override the generic
3735 versions here. */
3736
3737 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START,
3738 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false);
3739
3740 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START,
3741 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START],
3742 false);
3743
3744 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END,
3745 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END],
3746 false);
3747
3748 set_builtin_decl (BUILT_IN_GOACC_BARRIER,
3749 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
3750 }
3751
3752 /* Implement TARGET_INIT_LIBFUNCS. */
3753
3754 static void
3755 gcn_init_libfuncs (void)
3756 {
3757 /* BITS_PER_WORD * 2 is 64 bits, which causes
3758 optabs-libfuncs.cc:gen_int_libfunc to omit the TImode (i.e. 128-bit)
3759 libcalls that we need to support operations on that type. Initialise
3760 them here instead. */
3761 set_optab_libfunc (udiv_optab, TImode, "__udivti3");
3762 set_optab_libfunc (umod_optab, TImode, "__umodti3");
3763 set_optab_libfunc (sdiv_optab, TImode, "__divti3");
3764 set_optab_libfunc (smod_optab, TImode, "__modti3");
3765 set_optab_libfunc (smul_optab, TImode, "__multi3");
3766 set_optab_libfunc (addv_optab, TImode, "__addvti3");
3767 set_optab_libfunc (subv_optab, TImode, "__subvti3");
3768 set_optab_libfunc (negv_optab, TImode, "__negvti2");
3769 set_optab_libfunc (absv_optab, TImode, "__absvti2");
3770 set_optab_libfunc (smulv_optab, TImode, "__mulvti3");
3771 set_optab_libfunc (ffs_optab, TImode, "__ffsti2");
3772 set_optab_libfunc (clz_optab, TImode, "__clzti2");
3773 set_optab_libfunc (ctz_optab, TImode, "__ctzti2");
3774 set_optab_libfunc (clrsb_optab, TImode, "__clrsbti2");
3775 set_optab_libfunc (popcount_optab, TImode, "__popcountti2");
3776 set_optab_libfunc (parity_optab, TImode, "__parityti2");
3777 set_optab_libfunc (bswap_optab, TImode, "__bswapti2");
3778 }
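/* Illustrative sketch only: with the TImode libfuncs registered above, a
   128-bit integer operation in user code is expected to lower to a libgcc
   call rather than inline code.  For example (assuming the compiler
   provides __int128 for this target):

     __int128 quot (__int128 a, __int128 b)
     {
       return a / b;   /* Expands to a call to __divti3.  */
     }
*/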
3779
3780 /* Expand the CMP_SWAP GCN builtins. We have our own versions that do
3781 not require taking the address of any object, other than the memory
3782 cell being operated on.
3783
3784 Helper function for gcn_expand_builtin_1. */
3785
3786 static rtx
3787 gcn_expand_cmp_swap (tree exp, rtx target)
3788 {
3789 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3790 addr_space_t as
3791 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0))));
3792 machine_mode as_mode = gcn_addr_space_address_mode (as);
3793
3794 if (!target)
3795 target = gen_reg_rtx (mode);
3796
3797 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0),
3798 NULL_RTX, as_mode, EXPAND_NORMAL);
3799 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3800 NULL_RTX, mode, EXPAND_NORMAL);
3801 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3802 NULL_RTX, mode, EXPAND_NORMAL);
3803 rtx pat;
3804
3805 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr));
3806 set_mem_addr_space (mem, as);
3807
3808 if (!REG_P (cmp))
3809 cmp = copy_to_mode_reg (mode, cmp);
3810 if (!REG_P (src))
3811 src = copy_to_mode_reg (mode, src);
3812
3813 if (mode == SImode)
3814 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src);
3815 else
3816 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src);
3817
3818 emit_insn (pat);
3819
3820 return target;
3821 }
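/* Illustrative sketch only: based on the argument order expanded above
   (address, compare value, new value), a use of the MD builtin from C
   might look like the following; the builtin name is taken from the
   "__builtin_gcn_" prefix applied to the entries in gcn-builtins.def and
   is assumed here.

     int old = __builtin_gcn_cmp_swap (&cell, expected, desired);

   The previous contents of the memory cell are returned, much as with
   __sync_val_compare_and_swap.  */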
3822
3823 /* Expand many different builtins.
3824
3825 Intended for use in gcn-builtins.def. */
3826
3827 static rtx
3828 gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
3829 machine_mode /*mode */ , int ignore,
3830 struct gcn_builtin_description *)
3831 {
3832 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3833 switch (DECL_MD_FUNCTION_CODE (fndecl))
3834 {
3835 case GCN_BUILTIN_FLAT_LOAD_INT32:
3836 {
3837 if (ignore)
3838 return target;
3839 /*rtx exec = */
3840 force_reg (DImode,
3841 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
3842 EXPAND_NORMAL));
3843 /*rtx ptr = */
3844 force_reg (V64DImode,
3845 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode,
3846 EXPAND_NORMAL));
3847 /*emit_insn (gen_vector_flat_loadv64si
3848 (target, gcn_gen_undef (V64SImode), ptr, exec)); */
3849 return target;
3850 }
3851 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32:
3852 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT:
3853 {
3854 if (ignore)
3855 return target;
3856 rtx exec = force_reg (DImode,
3857 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3858 DImode,
3859 EXPAND_NORMAL));
3860 rtx ptr = force_reg (DImode,
3861 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3862 V64DImode,
3863 EXPAND_NORMAL));
3864 rtx offsets = force_reg (V64SImode,
3865 expand_expr (CALL_EXPR_ARG (exp, 2),
3866 NULL_RTX, V64DImode,
3867 EXPAND_NORMAL));
3868 rtx addrs = gen_reg_rtx (V64DImode);
3869 rtx tmp = gen_reg_rtx (V64SImode);
3870 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3871 GEN_INT (2),
3872 gcn_gen_undef (V64SImode), exec));
3873 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3874 gcn_gen_undef (V64DImode),
3875 exec));
3876 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs);
3877 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3878 /* FIXME: set attributes. */
3879 emit_insn (gen_mov_with_exec (target, mem, exec));
3880 return target;
3881 }
3882 case GCN_BUILTIN_FLAT_STORE_PTR_INT32:
3883 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT:
3884 {
3885 rtx exec = force_reg (DImode,
3886 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3887 DImode,
3888 EXPAND_NORMAL));
3889 rtx ptr = force_reg (DImode,
3890 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX,
3891 V64DImode,
3892 EXPAND_NORMAL));
3893 rtx offsets = force_reg (V64SImode,
3894 expand_expr (CALL_EXPR_ARG (exp, 2),
3895 NULL_RTX, V64DImode,
3896 EXPAND_NORMAL));
3897 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp,
3898 3)));
3899 rtx val = force_reg (vmode,
3900 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
3901 vmode,
3902 EXPAND_NORMAL));
3903 rtx addrs = gen_reg_rtx (V64DImode);
3904 rtx tmp = gen_reg_rtx (V64SImode);
3905 emit_insn (gen_ashlv64si3_exec (tmp, offsets,
3906 GEN_INT (2),
3907 gcn_gen_undef (V64SImode), exec));
3908 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr,
3909 gcn_gen_undef (V64DImode),
3910 exec));
3911 rtx mem = gen_rtx_MEM (vmode, addrs);
3912 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */
3913 /* FIXME: set attributes. */
3914 emit_insn (gen_mov_with_exec (mem, val, exec));
3915 return target;
3916 }
3917 case GCN_BUILTIN_SQRTVF:
3918 {
3919 if (ignore)
3920 return target;
3921 rtx exec = gcn_full_exec_reg ();
3922 rtx arg = force_reg (V64SFmode,
3923 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3924 V64SFmode,
3925 EXPAND_NORMAL));
3926 emit_insn (gen_sqrtv64sf2_exec
3927 (target, arg, gcn_gen_undef (V64SFmode), exec));
3928 return target;
3929 }
3930 case GCN_BUILTIN_SQRTF:
3931 {
3932 if (ignore)
3933 return target;
3934 rtx arg = force_reg (SFmode,
3935 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3936 SFmode,
3937 EXPAND_NORMAL));
3938 emit_insn (gen_sqrtsf2 (target, arg));
3939 return target;
3940 }
3941 case GCN_BUILTIN_OMP_DIM_SIZE:
3942 {
3943 if (ignore)
3944 return target;
3945 emit_insn (gen_oacc_dim_size (target,
3946 expand_expr (CALL_EXPR_ARG (exp, 0),
3947 NULL_RTX, SImode,
3948 EXPAND_NORMAL)));
3949 return target;
3950 }
3951 case GCN_BUILTIN_OMP_DIM_POS:
3952 {
3953 if (ignore)
3954 return target;
3955 emit_insn (gen_oacc_dim_pos (target,
3956 expand_expr (CALL_EXPR_ARG (exp, 0),
3957 NULL_RTX, SImode,
3958 EXPAND_NORMAL)));
3959 return target;
3960 }
3961 case GCN_BUILTIN_CMP_SWAP:
3962 case GCN_BUILTIN_CMP_SWAPLL:
3963 return gcn_expand_cmp_swap (exp, target);
3964
3965 case GCN_BUILTIN_ACC_SINGLE_START:
3966 {
3967 if (ignore)
3968 return target;
3969
3970 rtx wavefront = gcn_oacc_dim_pos (1);
3971 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx);
3972 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode);
3973 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx));
3974 return cc;
3975 }
3976
3977 case GCN_BUILTIN_ACC_SINGLE_COPY_START:
3978 {
3979 rtx blk = force_reg (SImode,
3980 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX,
3981 SImode, EXPAND_NORMAL));
3982 rtx wavefront = gcn_oacc_dim_pos (1);
3983 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx);
3984 rtx not_zero = gen_label_rtx ();
3985 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero));
3986 emit_move_insn (blk, const0_rtx);
3987 emit_label (not_zero);
3988 return blk;
3989 }
3990
3991 case GCN_BUILTIN_ACC_SINGLE_COPY_END:
3992 return target;
3993
3994 case GCN_BUILTIN_ACC_BARRIER:
3995 emit_insn (gen_gcn_wavefront_barrier ());
3996 return target;
3997
3998 default:
3999 gcc_unreachable ();
4000 }
4001 }
4002
4003 /* Expansion of simple arithmetic and bit binary operation builtins.
4004
4005 Intended for use with gcn_builtins table. */
4006
4007 static rtx
4008 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ ,
4009 machine_mode /*mode */ , int ignore,
4010 struct gcn_builtin_description *d)
4011 {
4012 int icode = d->icode;
4013 if (ignore)
4014 return target;
4015
4016 rtx exec = force_reg (DImode,
4017 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode,
4018 EXPAND_NORMAL));
4019
4020 machine_mode m1 = insn_data[icode].operand[1].mode;
4021 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1,
4022 EXPAND_NORMAL);
4023 if (!insn_data[icode].operand[1].predicate (arg1, m1))
4024 arg1 = force_reg (m1, arg1);
4025
4026 machine_mode m2 = insn_data[icode].operand[2].mode;
4027 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2,
4028 EXPAND_NORMAL);
4029 if (!insn_data[icode].operand[2].predicate (arg2, m2))
4030 arg2 = force_reg (m2, arg2);
4031
4032 rtx arg_prev;
4033 if (call_expr_nargs (exp) == 4)
4034 {
4035 machine_mode m_prev = insn_data[icode].operand[4].mode;
4036 arg_prev = force_reg (m_prev,
4037 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX,
4038 m_prev, EXPAND_NORMAL));
4039 }
4040 else
4041 arg_prev = gcn_gen_undef (GET_MODE (target));
4042
4043 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev);
4044 emit_insn (pat);
4045 return target;
4046 }
4047
4048 /* Implement TARGET_EXPAND_BUILTIN.
4049
4050 Expand an expression EXP that calls a built-in function, with result going
4051 to TARGET if that's convenient (and in mode MODE if that's convenient).
4052 SUBTARGET may be used as the target for computing one of EXP's operands.
4053 IGNORE is nonzero if the value is to be ignored. */
4054
4055 rtx
4056 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
4057 int ignore)
4058 {
4059 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4060 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
4061 struct gcn_builtin_description *d;
4062
4063 gcc_assert (fcode < GCN_BUILTIN_MAX);
4064 d = &gcn_builtins[fcode];
4065
4066 if (d->type == B_UNIMPLEMENTED)
4067 sorry ("Builtin not implemented");
4068
4069 return d->expander (exp, target, subtarget, mode, ignore, d);
4070 }
4071
4072 /* }}} */
4073 /* {{{ Vectorization. */
4074
4075 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.
4076
4077 A vector mask is a value that holds one boolean result for every element in
4078 a vector. */
4079
4080 opt_machine_mode
4081 gcn_vectorize_get_mask_mode (machine_mode)
4082 {
4083 /* GCN uses a DImode bit-mask. */
4084 return DImode;
4085 }
4086
4087 /* Return an RTX that references a vector with the i-th lane containing
4088 PERM[i]*4.
4089
4090 Helper function for gcn_vectorize_vec_perm_const. */
4091
4092 static rtx
4093 gcn_make_vec_perm_address (unsigned int *perm)
4094 {
4095 rtx x = gen_reg_rtx (V64SImode);
4096 emit_move_insn (x, gcn_vec_constant (V64SImode, 0));
4097
4098 /* Permutation addresses use byte addressing. With each vector lane being
4099 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant,
4100 so only set those.
4101
4102 The permutation given to the vec_perm* patterns ranges from 0 to 2N-1 to
4103 select between lanes in two vectors, but as the DS_BPERMUTE* instructions
4104 only take one source vector, the most-significant bit can be ignored
4105 here. Instead, we can use EXEC masking to select the relevant part of
4106 each source vector after they are permuted separately. */
4107 uint64_t bit_mask = 1 << 2;
4108 for (int i = 2; i < 8; i++, bit_mask <<= 1)
4109 {
4110 uint64_t exec_mask = 0;
4111 uint64_t lane_mask = 1;
4112 for (int j = 0; j < 64; j++, lane_mask <<= 1)
4113 if ((perm[j] * 4) & bit_mask)
4114 exec_mask |= lane_mask;
4115
4116 if (exec_mask)
4117 emit_insn (gen_addv64si3_exec (x, x,
4118 gcn_vec_constant (V64SImode,
4119 bit_mask),
4120 x, get_exec (exec_mask)));
4121 }
4122
4123 return x;
4124 }
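/* Worked example (illustrative): for a permutation that swaps adjacent
   lanes, perm[j] = j ^ 1, the byte address for lane j is 4 * (j ^ 1),
   giving {4, 0, 12, 8, 20, 16, ...}.  In the first iteration above
   (bit_mask == 4) the EXEC mask selects the even lanes, since only their
   target addresses have bit 2 set; the remaining iterations fill in
   bits 3..7 in the same way.  */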
4125
4126 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
4127
4128 Return true if permutation with SEL is possible.
4129
4130 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the
4131 permutations. */
4132
4133 static bool
4134 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
4135 rtx src0, rtx src1,
4136 const vec_perm_indices & sel)
4137 {
4138 unsigned int nelt = GET_MODE_NUNITS (vmode);
4139
4140 gcc_assert (VECTOR_MODE_P (vmode));
4141 gcc_assert (nelt <= 64);
4142 gcc_assert (sel.length () == nelt);
4143
4144 if (!dst)
4145 {
4146 /* All vector permutations are possible on this architecture,
4147 with varying degrees of efficiency depending on the permutation. */
4148 return true;
4149 }
4150
4151 unsigned int perm[64];
4152 for (unsigned int i = 0; i < nelt; ++i)
4153 perm[i] = sel[i] & (2 * nelt - 1);
4154 for (unsigned int i = nelt; i < 64; ++i)
4155 perm[i] = 0;
4156
4157 src0 = force_reg (vmode, src0);
4158 src1 = force_reg (vmode, src1);
4159
4160 /* Make life a bit easier by swapping operands if necessary so that
4161 the first element always comes from src0. */
4162 if (perm[0] >= nelt)
4163 {
4164 std::swap (src0, src1);
4165
4166 for (unsigned int i = 0; i < nelt; ++i)
4167 if (perm[i] < nelt)
4168 perm[i] += nelt;
4169 else
4170 perm[i] -= nelt;
4171 }
4172
4173 /* TODO: There are more efficient ways to implement certain permutations
4174 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before
4175 this more inefficient generic approach is used. */
4176
4177 int64_t src1_lanes = 0;
4178 int64_t lane_bit = 1;
4179
4180 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1)
4181 {
4182 /* Set the bits for lanes from src1. */
4183 if (perm[i] >= nelt)
4184 src1_lanes |= lane_bit;
4185 }
4186
4187 rtx addr = gcn_make_vec_perm_address (perm);
4188 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx);
4189
4190 switch (vmode)
4191 {
4192 case E_V64QImode:
4193 ds_bpermute = gen_ds_bpermutev64qi;
4194 break;
4195 case E_V64HImode:
4196 ds_bpermute = gen_ds_bpermutev64hi;
4197 break;
4198 case E_V64SImode:
4199 ds_bpermute = gen_ds_bpermutev64si;
4200 break;
4201 case E_V64HFmode:
4202 ds_bpermute = gen_ds_bpermutev64hf;
4203 break;
4204 case E_V64SFmode:
4205 ds_bpermute = gen_ds_bpermutev64sf;
4206 break;
4207 case E_V64DImode:
4208 ds_bpermute = gen_ds_bpermutev64di;
4209 break;
4210 case E_V64DFmode:
4211 ds_bpermute = gen_ds_bpermutev64df;
4212 break;
4213 default:
4214 gcc_assert (false);
4215 }
4216
4217 /* Load elements from src0 to dst. */
4218 gcc_assert (~src1_lanes);
4219 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ()));
4220
4221 /* Load elements from src1 to dst. */
4222 if (src1_lanes)
4223 {
4224 /* Masking a lane masks both the destination and source lanes for
4225 DS_BPERMUTE, so we need to have all lanes enabled for the permute,
4226 then add an extra masked move to merge the results of permuting
4227 the two source vectors together.
4228 */
4229 rtx tmp = gen_reg_rtx (vmode);
4230 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ()));
4231 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes)));
4232 }
4233
4234 return true;
4235 }
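/* Illustrative example: interleaving the low halves of two V64SImode
   vectors uses sel = {0, 64, 1, 65, ...}.  Both sources are permuted with
   ds_bpermute under a full EXEC mask, and then the lanes whose selector
   came from src1 (here the odd lanes, so src1_lanes == 0xaaaa...aa) are
   merged in with a single masked move.  */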
4236
4237 /* Implements TARGET_VECTOR_MODE_SUPPORTED_P.
4238
4239 Return nonzero if vector MODE is supported with at least move
4240 instructions. */
4241
4242 static bool
4243 gcn_vector_mode_supported_p (machine_mode mode)
4244 {
4245 return (mode == V64QImode || mode == V64HImode
4246 || mode == V64SImode || mode == V64DImode
4247 || mode == V64SFmode || mode == V64DFmode);
4248 }
4249
4250 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
4251
4252 Enables autovectorization for all supported modes. */
4253
4254 static machine_mode
4255 gcn_vectorize_preferred_simd_mode (scalar_mode mode)
4256 {
4257 switch (mode)
4258 {
4259 case E_QImode:
4260 return V64QImode;
4261 case E_HImode:
4262 return V64HImode;
4263 case E_SImode:
4264 return V64SImode;
4265 case E_DImode:
4266 return V64DImode;
4267 case E_SFmode:
4268 return V64SFmode;
4269 case E_DFmode:
4270 return V64DFmode;
4271 default:
4272 return word_mode;
4273 }
4274 }
4275
4276 /* Implement TARGET_VECTORIZE_RELATED_MODE.
4277
4278 All GCN vectors are 64-lane, so this is simpler than other architectures.
4279 In particular, we do *not* want to match vector bit-size. */
4280
4281 static opt_machine_mode
4282 gcn_related_vector_mode (machine_mode ARG_UNUSED (vector_mode),
4283 scalar_mode element_mode, poly_uint64 nunits)
4284 {
4285 if (known_ne (nunits, 0U) && known_ne (nunits, 64U))
4286 return VOIDmode;
4287
4288 machine_mode pref_mode = gcn_vectorize_preferred_simd_mode (element_mode);
4289 if (!VECTOR_MODE_P (pref_mode))
4290 return VOIDmode;
4291
4292 return pref_mode;
4293 }
4294
4295 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.
4296
4297 Returns the preferred alignment in bits for accesses to vectors of type type
4298 in vectorized code. This might be less than or greater than the ABI-defined
4299 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment
4300 of a single element, in which case the vectorizer will not try to optimize
4301 for alignment. */
4302
4303 static poly_uint64
4304 gcn_preferred_vector_alignment (const_tree type)
4305 {
4306 return TYPE_ALIGN (TREE_TYPE (type));
4307 }
4308
4309 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT.
4310
4311 Return true if the target supports misaligned vector store/load of a
4312 specific factor denoted in the misalignment parameter. */
4313
4314 static bool
4315 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode),
4316 const_tree type, int misalignment,
4317 bool is_packed)
4318 {
4319 if (is_packed)
4320 return false;
4321
4322 /* If the misalignment is unknown, we should be able to handle the access
4323 so long as it is not to a member of a packed data structure. */
4324 if (misalignment == -1)
4325 return true;
4326
4327 /* Return true if the misalignment is a multiple of the natural alignment
4328 of the vector's element type. This is probably always going to be
4329 true in practice, since we've already established that this isn't a
4330 packed access. */
4331 return misalignment % TYPE_ALIGN_UNIT (type) == 0;
4332 }
4333
4334 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.
4335
4336 Return true if vector alignment is reachable (by peeling N iterations) for
4337 the given scalar type TYPE. */
4338
4339 static bool
4340 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
4341 {
4342 /* Vectors which aren't in packed structures will not be less aligned than
4343 the natural alignment of their element type, so this is safe. */
4344 return !is_packed;
4345 }
4346
4347 /* Generate DPP instructions used for vector reductions.
4348
4349 The opcode is given by INSN.
4350 The first operand of the operation is shifted right by SHIFT vector lanes.
4351 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is
4352 broadcast to the next row (thereby acting like a shift of 16 for the end
4353 of each row). If SHIFT is 32, lane 31 is broadcast to all the
4354 following lanes (thereby acting like a shift of 32 for lane 63). */
4355
4356 char *
4357 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
4358 int unspec, int shift)
4359 {
4360 static char buf[128];
4361 const char *dpp;
4362 const char *vcc_in = "";
4363 const char *vcc_out = "";
4364
4365 /* Add the vcc operand if needed. */
4366 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4367 {
4368 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4369 vcc_in = ", vcc";
4370
4371 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR
4372 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR)
4373 vcc_out = ", vcc";
4374 }
4375
4376 /* Add the DPP modifiers. */
4377 switch (shift)
4378 {
4379 case 1:
4380 dpp = "row_shr:1 bound_ctrl:0";
4381 break;
4382 case 2:
4383 dpp = "row_shr:2 bound_ctrl:0";
4384 break;
4385 case 4:
4386 dpp = "row_shr:4 bank_mask:0xe";
4387 break;
4388 case 8:
4389 dpp = "row_shr:8 bank_mask:0xc";
4390 break;
4391 case 16:
4392 dpp = "row_bcast:15 row_mask:0xa";
4393 break;
4394 case 32:
4395 dpp = "row_bcast:31 row_mask:0xc";
4396 break;
4397 default:
4398 gcc_unreachable ();
4399 }
4400
4401 if (unspec == UNSPEC_MOV_DPP_SHR && vgpr_2reg_mode_p (mode))
4402 sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
4403 insn, dpp, insn, dpp);
4404 else if (unspec == UNSPEC_MOV_DPP_SHR)
4405 sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
4406 else
4407 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp);
4408
4409 return buf;
4410 }
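/* For illustration (assuming a caller passes the "v_add_co_u32" mnemonic
   with UNSPEC_PLUS_CARRY_DPP_SHR and SHIFT == 8), the returned template
   would be:

     v_add_co_u32	%0, vcc, %1, %2 row_shr:8 bank_mask:0xc

   i.e. each lane adds in the value held 8 lanes below it, with the
   carry-out written to VCC.  */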
4411
4412 /* Generate vector reductions in terms of DPP instructions.
4413
4414 The vector register SRC of mode MODE is reduced using the operation given
4415 by UNSPEC, and the scalar result is returned in lane 63 of a vector
4416 register. */
4417
4418 rtx
4419 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
4420 {
4421 machine_mode orig_mode = mode;
4422 bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
4423 || unspec == UNSPEC_SMAX_DPP_SHR
4424 || unspec == UNSPEC_UMIN_DPP_SHR
4425 || unspec == UNSPEC_UMAX_DPP_SHR)
4426 && (mode == V64DImode
4427 || mode == V64DFmode))
4428 || (unspec == UNSPEC_PLUS_DPP_SHR
4429 && mode == V64DFmode));
4430 rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
4431 : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
4432 : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
4433 : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
4434 : unspec == UNSPEC_PLUS_DPP_SHR ? PLUS
4435 : UNKNOWN);
4436 bool use_extends = ((unspec == UNSPEC_SMIN_DPP_SHR
4437 || unspec == UNSPEC_SMAX_DPP_SHR
4438 || unspec == UNSPEC_UMIN_DPP_SHR
4439 || unspec == UNSPEC_UMAX_DPP_SHR)
4440 && (mode == V64QImode
4441 || mode == V64HImode));
4442 bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
4443 || unspec == UNSPEC_UMAX_DPP_SHR);
4444 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
4445 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4446 && (TARGET_GCN3 || mode == V64DImode);
4447
4448 if (use_plus_carry)
4449 unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
4450
4451 if (use_extends)
4452 {
4453 rtx tmp = gen_reg_rtx (V64SImode);
4454 convert_move (tmp, src, unsignedp);
4455 src = tmp;
4456 mode = V64SImode;
4457 }
4458
4459 /* Perform reduction by first performing the reduction operation on every
4460 pair of lanes, then on every pair of results from the previous
4461 iteration (thereby effectively reducing every 4 lanes) and so on until
4462 all lanes are reduced. */
4463 rtx in, out = force_reg (mode, src);
4464 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
4465 {
4466 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
4467 in = out;
4468 out = gen_reg_rtx (mode);
4469
4470 if (use_moves)
4471 {
4472 rtx tmp = gen_reg_rtx (mode);
4473 emit_insn (gen_dpp_move (mode, tmp, in, shift_val));
4474 emit_insn (gen_rtx_SET (out, gen_rtx_fmt_ee (code, mode, tmp, in)));
4475 }
4476 else
4477 {
4478 rtx insn = gen_rtx_SET (out,
4479 gen_rtx_UNSPEC (mode,
4480 gen_rtvec (3, in, in,
4481 shift_val),
4482 unspec));
4483
4484 /* Add clobber for instructions that set the carry flags. */
4485 if (use_plus_carry)
4486 {
4487 rtx clobber = gen_rtx_CLOBBER (VOIDmode,
4488 gen_rtx_REG (DImode, VCC_REG));
4489 insn = gen_rtx_PARALLEL (VOIDmode,
4490 gen_rtvec (2, insn, clobber));
4491 }
4492
4493 emit_insn (insn);
4494 }
4495 }
4496
4497 if (use_extends)
4498 {
4499 rtx tmp = gen_reg_rtx (orig_mode);
4500 convert_move (tmp, out, unsignedp);
4501 out = tmp;
4502 }
4503
4504 return out;
4505 }
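/* Illustrative trace: for a 64-lane integer sum the loop above performs
   six steps with shifts 1, 2, 4, 8, 16 and 32.  After the first step each
   lane holds the sum of itself and its lower neighbour (2 lanes reduced),
   after the second step the sum of 4 lanes, and so on; after the final
   step lane 63 holds the sum of all 64 lanes, which the caller then
   extracts.  */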
4506
4507 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */
4508
4509 int
4510 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost),
4511 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign))
4512 {
4513 /* Always vectorize. */
4514 return 1;
4515 }
4516
4517 /* }}} */
4518 /* {{{ md_reorg pass. */
4519
4520 /* Identify VMEM instructions from their "type" attribute. */
4521
4522 static bool
4523 gcn_vmem_insn_p (attr_type type)
4524 {
4525 switch (type)
4526 {
4527 case TYPE_MUBUF:
4528 case TYPE_MTBUF:
4529 case TYPE_FLAT:
4530 return true;
4531 case TYPE_UNKNOWN:
4532 case TYPE_SOP1:
4533 case TYPE_SOP2:
4534 case TYPE_SOPK:
4535 case TYPE_SOPC:
4536 case TYPE_SOPP:
4537 case TYPE_SMEM:
4538 case TYPE_DS:
4539 case TYPE_VOP2:
4540 case TYPE_VOP1:
4541 case TYPE_VOPC:
4542 case TYPE_VOP3A:
4543 case TYPE_VOP3B:
4544 case TYPE_VOP_SDWA:
4545 case TYPE_VOP_DPP:
4546 case TYPE_MULT:
4547 case TYPE_VMULT:
4548 return false;
4549 }
4550 gcc_unreachable ();
4551 return false;
4552 }
4553
4554 /* If INSN sets the EXEC register to a constant value, return the value,
4555 otherwise return zero. */
4556
4557 static int64_t
4558 gcn_insn_exec_value (rtx_insn *insn)
4559 {
4560 if (!NONDEBUG_INSN_P (insn))
4561 return 0;
4562
4563 rtx pattern = PATTERN (insn);
4564
4565 if (GET_CODE (pattern) == SET)
4566 {
4567 rtx dest = XEXP (pattern, 0);
4568 rtx src = XEXP (pattern, 1);
4569
4570 if (GET_MODE (dest) == DImode
4571 && REG_P (dest) && REGNO (dest) == EXEC_REG
4572 && CONST_INT_P (src))
4573 return INTVAL (src);
4574 }
4575
4576 return 0;
4577 }
4578
4579 /* Sets the EXEC register before INSN to the value that it had after
4580 LAST_EXEC_DEF. The constant value of the EXEC register is returned if
4581 known, otherwise it returns zero. */
4582
4583 static int64_t
4584 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec,
4585 bool curr_exec_known, bool &last_exec_def_saved)
4586 {
4587 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4588 rtx exec;
4589
4590 int64_t exec_value = gcn_insn_exec_value (last_exec_def);
4591
4592 if (exec_value)
4593 {
4594 /* If the EXEC value is a constant and it happens to be the same as the
4595 current EXEC value, the restore can be skipped. */
4596 if (curr_exec_known && exec_value == curr_exec)
4597 return exec_value;
4598
4599 exec = GEN_INT (exec_value);
4600 }
4601 else
4602 {
4603 /* If the EXEC value is not a constant, save it in a register after the
4604 point of definition. */
4605 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG);
4606
4607 if (!last_exec_def_saved)
4608 {
4609 start_sequence ();
4610 emit_move_insn (exec_save_reg, exec_reg);
4611 rtx_insn *seq = get_insns ();
4612 end_sequence ();
4613
4614 emit_insn_after (seq, last_exec_def);
4615 if (dump_file && (dump_flags & TDF_DETAILS))
4616 fprintf (dump_file, "Saving EXEC after insn %d.\n",
4617 INSN_UID (last_exec_def));
4618
4619 last_exec_def_saved = true;
4620 }
4621
4622 exec = exec_save_reg;
4623 }
4624
4625 /* Restore EXEC register before the usage. */
4626 start_sequence ();
4627 emit_move_insn (exec_reg, exec);
4628 rtx_insn *seq = get_insns ();
4629 end_sequence ();
4630 emit_insn_before (seq, insn);
4631
4632 if (dump_file && (dump_flags & TDF_DETAILS))
4633 {
4634 if (exec_value)
4635 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n",
4636 exec_value, INSN_UID (insn));
4637 else
4638 fprintf (dump_file,
4639 "Restoring EXEC from saved value before insn %d.\n",
4640 INSN_UID (insn));
4641 }
4642
4643 return exec_value;
4644 }
4645
4646 /* Implement TARGET_MACHINE_DEPENDENT_REORG.
4647
4648 Ensure that pipeline dependencies and lane masking are set correctly. */
4649
4650 static void
4651 gcn_md_reorg (void)
4652 {
4653 basic_block bb;
4654 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG);
4655 regset_head live;
4656
4657 INIT_REG_SET (&live);
4658
4659 compute_bb_for_insn ();
4660
4661 if (!optimize)
4662 {
4663 split_all_insns ();
4664 if (dump_file && (dump_flags & TDF_DETAILS))
4665 {
4666 fprintf (dump_file, "After split:\n");
4667 print_rtl_with_bb (dump_file, get_insns (), dump_flags);
4668 }
4669
4670 /* Update data-flow information for split instructions. */
4671 df_insn_rescan_all ();
4672 }
4673
4674 df_live_add_problem ();
4675 df_live_set_all_dirty ();
4676 df_analyze ();
4677
4678 /* This pass ensures that the EXEC register is set correctly, according
4679 to the "exec" attribute. However, care must be taken so that the
4680 value that reaches explicit uses of the EXEC register remains the
4681 same as before.
4682 */
4683
4684 FOR_EACH_BB_FN (bb, cfun)
4685 {
4686 if (dump_file && (dump_flags & TDF_DETAILS))
4687 fprintf (dump_file, "BB %d:\n", bb->index);
4688
4689 rtx_insn *insn, *curr;
4690 rtx_insn *last_exec_def = BB_HEAD (bb);
4691 bool last_exec_def_saved = false;
4692 bool curr_exec_explicit = true;
4693 bool curr_exec_known = true;
4694 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC
4695 after last_exec_def is executed'. */
4696
4697 bitmap live_in = DF_LR_IN (bb);
4698 bool exec_live_on_entry = false;
4699 if (bitmap_bit_p (live_in, EXEC_LO_REG)
4700 || bitmap_bit_p (live_in, EXEC_HI_REG))
4701 {
4702 if (dump_file)
4703 fprintf (dump_file, "EXEC reg is live on entry to block %d\n",
4704 (int) bb->index);
4705 exec_live_on_entry = true;
4706 }
4707
4708 FOR_BB_INSNS_SAFE (bb, insn, curr)
4709 {
4710 if (!NONDEBUG_INSN_P (insn))
4711 continue;
4712
4713 if (GET_CODE (PATTERN (insn)) == USE
4714 || GET_CODE (PATTERN (insn)) == CLOBBER)
4715 continue;
4716
4717 HARD_REG_SET defs, uses;
4718 CLEAR_HARD_REG_SET (defs);
4719 CLEAR_HARD_REG_SET (uses);
4720 note_stores (insn, record_hard_reg_sets, &defs);
4721 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses);
4722
4723 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG);
4724 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG);
4725 bool exec_used = (hard_reg_set_intersect_p
4726 (uses, reg_class_contents[(int) EXEC_MASK_REG])
4727 || TEST_HARD_REG_BIT (uses, EXECZ_REG));
4728
4729 /* Check the instruction for implicit setting of EXEC via an
4730 attribute. */
4731 attr_exec exec_attr = get_attr_exec (insn);
4732 int64_t new_exec;
4733
4734 switch (exec_attr)
4735 {
4736 case EXEC_NONE:
4737 new_exec = 0;
4738 break;
4739
4740 case EXEC_SINGLE:
4741 /* Instructions that do not involve memory accesses only require
4742 bit 0 of EXEC to be set. */
4743 if (gcn_vmem_insn_p (get_attr_type (insn))
4744 || get_attr_type (insn) == TYPE_DS)
4745 new_exec = 1;
4746 else
4747 new_exec = curr_exec | 1;
4748 break;
4749
4750 case EXEC_FULL:
4751 new_exec = -1;
4752 break;
4753
4754 default: /* Auto-detect what setting is appropriate. */
4755 {
4756 new_exec = 0;
4757
4758 /* If EXEC is referenced explicitly then we don't need to do
4759 anything to set it, so we're done. */
4760 if (exec_used)
4761 break;
4762
4763 /* Scan the insn for VGPRs defs or uses. The mode determines
4764 what kind of exec is needed. */
4765 subrtx_iterator::array_type array;
4766 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
4767 {
4768 const_rtx x = *iter;
4769 if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
4770 {
4771 if (VECTOR_MODE_P (GET_MODE (x)))
4772 {
4773 new_exec = -1;
4774 break;
4775 }
4776 else
4777 new_exec = 1;
4778 }
4779 }
4780 }
4781 break;
4782 }
4783
4784 if (new_exec && (!curr_exec_known || new_exec != curr_exec))
4785 {
4786 start_sequence ();
4787 emit_move_insn (exec_reg, GEN_INT (new_exec));
4788 rtx_insn *seq = get_insns ();
4789 end_sequence ();
4790 emit_insn_before (seq, insn);
4791
4792 if (dump_file && (dump_flags & TDF_DETAILS))
4793 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n",
4794 new_exec, INSN_UID (insn));
4795
4796 curr_exec = new_exec;
4797 curr_exec_explicit = false;
4798 curr_exec_known = true;
4799 }
4800 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS))
4801 {
4802 fprintf (dump_file, "Exec already is %ld before insn %d.\n",
4803 new_exec, INSN_UID (insn));
4804 }
4805
4806 /* The state of the EXEC register is unknown after a
4807 function call. */
4808 if (CALL_P (insn))
4809 curr_exec_known = false;
4810
4811 /* Handle explicit uses of EXEC. If the instruction is a partial
4812 explicit definition of EXEC, then treat it as an explicit use of
4813 EXEC as well. */
4814 if (exec_used || exec_lo_def_p != exec_hi_def_p)
4815 {
4816 /* An instruction that explicitly uses EXEC should not also
4817 implicitly define it. */
4818 gcc_assert (!exec_used || !new_exec);
4819
4820 if (!curr_exec_known || !curr_exec_explicit)
4821 {
4822 /* Restore the previous explicitly defined value. */
4823 curr_exec = gcn_restore_exec (insn, last_exec_def,
4824 curr_exec, curr_exec_known,
4825 last_exec_def_saved);
4826 curr_exec_explicit = true;
4827 curr_exec_known = true;
4828 }
4829 }
4830
4831 /* Handle explicit definitions of EXEC. */
4832 if (exec_lo_def_p || exec_hi_def_p)
4833 {
4834 last_exec_def = insn;
4835 last_exec_def_saved = false;
4836 curr_exec = gcn_insn_exec_value (insn);
4837 curr_exec_explicit = true;
4838 curr_exec_known = true;
4839
4840 if (dump_file && (dump_flags & TDF_DETAILS))
4841 fprintf (dump_file,
4842 "Found %s definition of EXEC at insn %d.\n",
4843 exec_lo_def_p == exec_hi_def_p ? "full" : "partial",
4844 INSN_UID (insn));
4845 }
4846
4847 exec_live_on_entry = false;
4848 }
4849
4850 COPY_REG_SET (&live, DF_LR_OUT (bb));
4851 df_simulate_initialize_backwards (bb, &live);
4852
4853 /* If EXEC is live after the basic block, restore the value of EXEC
4854 at the end of the block. */
4855 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG)
4856 || REGNO_REG_SET_P (&live, EXEC_HI_REG))
4857 && (!curr_exec_known || !curr_exec_explicit || exec_live_on_entry))
4858 {
4859 rtx_insn *end_insn = BB_END (bb);
4860
4861 /* If the instruction is not a jump instruction, do the restore
4862 after the last instruction in the basic block. */
4863 if (NONJUMP_INSN_P (end_insn))
4864 end_insn = NEXT_INSN (end_insn);
4865
4866 gcn_restore_exec (end_insn, last_exec_def, curr_exec,
4867 curr_exec_known, last_exec_def_saved);
4868 }
4869 }
4870
4871 CLEAR_REG_SET (&live);
4872
4873 /* "Manually Inserted Wait States (NOPs)."
4874
4875 GCN hardware detects most kinds of register dependencies, but there
4876 are some exceptions documented in the ISA manual. This pass
4877 detects the missed cases, and inserts the documented number of NOPs
4878 required for correct execution. */
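
  /* Illustrative example of the checks below: a VALU instruction that
     writes an SGPR, immediately followed by a VMEM (flat/buffer) access
     reading that SGPR, needs 5 wait states; if other instructions already
     separate the pair, only the remaining count is emitted
     (nops_rqd = 5 - prev_insn->age).  */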
4879
4880 const int max_waits = 5;
4881 struct ilist
4882 {
4883 rtx_insn *insn;
4884 attr_unit unit;
4885 attr_delayeduse delayeduse;
4886 HARD_REG_SET writes;
4887 HARD_REG_SET reads;
4888 int age;
4889 } back[max_waits];
4890 int oldest = 0;
4891 for (int i = 0; i < max_waits; i++)
4892 back[i].insn = NULL;
4893
4894 rtx_insn *insn, *last_insn = NULL;
4895 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
4896 {
4897 if (!NONDEBUG_INSN_P (insn))
4898 continue;
4899
4900 if (GET_CODE (PATTERN (insn)) == USE
4901 || GET_CODE (PATTERN (insn)) == CLOBBER)
4902 continue;
4903
4904 attr_type itype = get_attr_type (insn);
4905 attr_unit iunit = get_attr_unit (insn);
4906 attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
4907 HARD_REG_SET ireads, iwrites;
4908 CLEAR_HARD_REG_SET (ireads);
4909 CLEAR_HARD_REG_SET (iwrites);
4910 note_stores (insn, record_hard_reg_sets, &iwrites);
4911 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
4912
4913 /* Scan recent previous instructions for dependencies not handled in
4914 hardware. */
4915 int nops_rqd = 0;
4916 for (int i = oldest; i < oldest + max_waits; i++)
4917 {
4918 struct ilist *prev_insn = &back[i % max_waits];
4919
4920 if (!prev_insn->insn)
4921 continue;
4922
4923 /* VALU writes SGPR followed by VMEM reading the same SGPR
4924 requires 5 wait states. */
4925 if ((prev_insn->age + nops_rqd) < 5
4926 && prev_insn->unit == UNIT_VECTOR
4927 && gcn_vmem_insn_p (itype))
4928 {
4929 HARD_REG_SET regs = prev_insn->writes & ireads;
4930 if (hard_reg_set_intersect_p
4931 (regs, reg_class_contents[(int) SGPR_REGS]))
4932 nops_rqd = 5 - prev_insn->age;
4933 }
4934
4935 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
4936 requires 5 wait states. */
4937 if ((prev_insn->age + nops_rqd) < 5
4938 && prev_insn->unit == UNIT_VECTOR
4939 && iunit == UNIT_VECTOR
4940 && ((hard_reg_set_intersect_p
4941 (prev_insn->writes,
4942 reg_class_contents[(int) EXEC_MASK_REG])
4943 && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
4944 ||
4945 (hard_reg_set_intersect_p
4946 (prev_insn->writes,
4947 reg_class_contents[(int) VCC_CONDITIONAL_REG])
4948 && TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
4949 nops_rqd = 5 - prev_insn->age;
4950
4951 /* VALU writes SGPR/VCC followed by v_{read,write}lane using
4952 SGPR/VCC as lane select requires 4 wait states. */
4953 if ((prev_insn->age + nops_rqd) < 4
4954 && prev_insn->unit == UNIT_VECTOR
4955 && get_attr_laneselect (insn) == LANESELECT_YES)
4956 {
4957 HARD_REG_SET regs = prev_insn->writes & ireads;
4958 if (hard_reg_set_intersect_p
4959 (regs, reg_class_contents[(int) SGPR_REGS])
4960 || hard_reg_set_intersect_p
4961 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
4962 nops_rqd = 4 - prev_insn->age;
4963 }
4964
4965 /* VALU writes VGPR followed by VALU_DPP reading that VGPR
4966 requires 2 wait states. */
4967 if ((prev_insn->age + nops_rqd) < 2
4968 && prev_insn->unit == UNIT_VECTOR
4969 && itype == TYPE_VOP_DPP)
4970 {
4971 HARD_REG_SET regs = prev_insn->writes & ireads;
4972 if (hard_reg_set_intersect_p
4973 (regs, reg_class_contents[(int) VGPR_REGS]))
4974 nops_rqd = 2 - prev_insn->age;
4975 }
4976
4977 /* A store whose input registers must not be overwritten by the
4978 following instruction requires one wait state. */
4979 if ((prev_insn->age + nops_rqd) < 1
4980 && prev_insn->delayeduse == DELAYEDUSE_YES
4981 && ((hard_reg_set_intersect_p
4982 (prev_insn->reads, iwrites))))
4983 nops_rqd = 1 - prev_insn->age;
4984 }
4985
4986 /* Insert the required number of NOPs. */
4987 for (int i = nops_rqd; i > 0; i--)
4988 emit_insn_after (gen_nop (), last_insn);
4989
4990 /* Age the previous instructions. We can also ignore writes to
4991 registers subsequently overwritten. */
4992 HARD_REG_SET written;
4993 CLEAR_HARD_REG_SET (written);
4994 for (int i = oldest + max_waits - 1; i > oldest; i--)
4995 {
4996 struct ilist *prev_insn = &back[i % max_waits];
4997
4998 /* Assume all instructions are equivalent to one "wait", the same
4999 as s_nop. This is probably true for SALU, but not VALU (which
5000 may take longer), so this is not optimal. However, AMD do
5001 not publish the cycle times for instructions. */
5002 prev_insn->age += 1 + nops_rqd;
5003
5004 written |= iwrites;
5005 prev_insn->writes &= ~written;
5006 }
5007
5008 /* Track the current instruction as a previous instruction. */
5009 back[oldest].insn = insn;
5010 back[oldest].unit = iunit;
5011 back[oldest].delayeduse = idelayeduse;
5012 back[oldest].writes = iwrites;
5013 back[oldest].reads = ireads;
5014 back[oldest].age = 0;
5015 oldest = (oldest + 1) % max_waits;
5016
5017 last_insn = insn;
5018 }
5019 }
5020
5021 /* }}} */
5022 /* {{{ OpenACC / OpenMP. */
5023
5024 #define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
5025 #define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
5026 #define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
5027
5028 /* Implement TARGET_GOACC_VALIDATE_DIMS.
5029
5030 Check the launch dimensions provided for an OpenACC compute
5031 region, or routine. */
5032
5033 static bool
5034 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level,
5035 unsigned /*used*/)
5036 {
5037 bool changed = false;
5038 const int max_workers = 16;
5039
5040 /* The vector size must appear to be 64, to the user, unless this is a
5041 SEQ routine. The real, internal value is always 1, which means use
5042 autovectorization, but the user should not see that. */
5043 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5044 && dims[GOMP_DIM_VECTOR] >= 0)
5045 {
5046 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0
5047 && dims[GOMP_DIM_VECTOR] != 64)
5048 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5049 OPT_Wopenacc_dims,
5050 (dims[GOMP_DIM_VECTOR]
5051 ? G_("using %<vector_length (64)%>, ignoring %d")
5052 : G_("using %<vector_length (64)%>, "
5053 "ignoring runtime setting")),
5054 dims[GOMP_DIM_VECTOR]);
5055 dims[GOMP_DIM_VECTOR] = 1;
5056 changed = true;
5057 }
5058
5059 /* Check the num workers is not too large. */
5060 if (dims[GOMP_DIM_WORKER] > max_workers)
5061 {
5062 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION,
5063 OPT_Wopenacc_dims,
5064 "using %<num_workers (%d)%>, ignoring %d",
5065 max_workers, dims[GOMP_DIM_WORKER]);
5066 dims[GOMP_DIM_WORKER] = max_workers;
5067 changed = true;
5068 }
5069
5070 /* Set global defaults. */
5071 if (!decl)
5072 {
5073 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
5074 if (dims[GOMP_DIM_WORKER] < 0)
5075 dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
5076 if (dims[GOMP_DIM_GANG] < 0)
5077 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
5078 changed = true;
5079 }
5080
5081 return changed;
5082 }
5083
5084 /* Helper function for oacc_dim_size instruction.
5085 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */
5086
5087 rtx
5088 gcn_oacc_dim_size (int dim)
5089 {
5090 if (dim < 0 || dim > 2)
5091 error ("offload dimension out of range (%d)", dim);
5092
5093 /* Vectors are a special case. */
5094 if (dim == 2)
5095 return const1_rtx; /* Think of this as 1 times 64. */
5096
5097 static int offset[] = {
5098 /* Offsets into dispatch packet. */
5099 12, /* X dim = Gang / Team / Work-group. */
5100 20, /* Z dim = Worker / Thread / Wavefront. */
5101 16 /* Y dim = Vector / SIMD / Work-item. */
5102 };
5103 rtx addr = gen_rtx_PLUS (DImode,
5104 gen_rtx_REG (DImode,
5105 cfun->machine->args.
5106 reg[DISPATCH_PTR_ARG]),
5107 GEN_INT (offset[dim]));
5108 return gen_rtx_MEM (SImode, addr);
5109 }
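/* Note (assumed from the HSA dispatch packet layout): byte offsets 12, 16
   and 20 correspond to the grid_size_x, grid_size_y and grid_size_z
   fields of hsa_kernel_dispatch_packet_t, so the loads above read the
   launch dimensions chosen by the runtime.  */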
5110
5111 /* Helper function for oacc_dim_pos instruction.
5112 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */
5113
5114 rtx
5115 gcn_oacc_dim_pos (int dim)
5116 {
5117 if (dim < 0 || dim > 2)
5118 error ("offload dimension out of range (%d)", dim);
5119
5120 static const int reg[] = {
5121 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */
5122 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */
5123 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */
5124 };
5125
5126 int reg_num = cfun->machine->args.reg[reg[dim]];
5127
5128 /* The information must have been requested by the kernel. */
5129 gcc_assert (reg_num >= 0);
5130
5131 return gen_rtx_REG (SImode, reg_num);
5132 }
5133
5134 /* Implement TARGET_GOACC_FORK_JOIN. */
5135
5136 static bool
5137 gcn_fork_join (gcall *call, const int dims[], bool is_fork)
5138 {
5139 tree arg = gimple_call_arg (call, 2);
5140 unsigned axis = TREE_INT_CST_LOW (arg);
5141
5142 if (!is_fork && axis == GOMP_DIM_WORKER && dims[axis] != 1)
5143 return true;
5144
5145 return false;
5146 }
5147
5148 /* Implement ???????
5149 FIXME make this a real hook.
5150
5151 Adjust FNDECL such that options inherited from the host compiler
5152 are made appropriate for the accelerator compiler. */
5153
5154 void
5155 gcn_fixup_accel_lto_options (tree fndecl)
5156 {
5157 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5158 if (!func_optimize)
5159 return;
5160
5161 tree old_optimize
5162 = build_optimization_node (&global_options, &global_options_set);
5163 tree new_optimize;
5164
5165 /* If the function changed the optimization levels as well as
5166 setting target options, start with the optimizations
5167 specified. */
5168 if (func_optimize != old_optimize)
5169 cl_optimization_restore (&global_options, &global_options_set,
5170 TREE_OPTIMIZATION (func_optimize));
5171
5172 gcn_option_override ();
5173
5174 /* The target attributes may also change some optimization flags,
5175 so update the optimization options if necessary. */
5176 new_optimize = build_optimization_node (&global_options,
5177 &global_options_set);
5178
5179 if (old_optimize != new_optimize)
5180 {
5181 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5182 cl_optimization_restore (&global_options, &global_options_set,
5183 TREE_OPTIMIZATION (old_optimize));
5184 }
5185 }
5186
5187 /* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
5188
5189 static void
5190 gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
5191 unsigned HOST_WIDE_INT *hi,
5192 int ARG_UNUSED (dims[GOMP_DIM_MAX]),
5193 unsigned HOST_WIDE_INT
5194 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
5195 unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
5196 {
5197 *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5198 /* !!! We can maybe use dims[] to estimate the maximum number of work
5199 groups/wavefronts/etc. we will launch, and therefore tune the maximum
5200 amount of LDS we should use. For now, use a minimal amount to try to
5201 maximise occupancy. */
5202 *hi = acc_lds_size;
5203 machine_function *machfun = cfun->machine;
5204 machfun->reduction_base = gang_private_size_opt;
5205 machfun->reduction_limit
5206 = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
5207 }
5208
5209 /* }}} */
5210 /* {{{ ASM Output. */
5211
5212 /* Implement TARGET_ASM_FILE_START.
5213
5214 Print assembler file header text. */
5215
5216 static void
5217 output_file_start (void)
5218 {
5219 const char *cpu;
5220 bool use_xnack_attr = true;
5221 bool use_sram_attr = true;
5222 switch (gcn_arch)
5223 {
5224 case PROCESSOR_FIJI:
5225 cpu = "gfx803";
5226 #ifndef HAVE_GCN_XNACK_FIJI
5227 use_xnack_attr = false;
5228 #endif
5229 use_sram_attr = false;
5230 break;
5231 case PROCESSOR_VEGA10:
5232 cpu = "gfx900";
5233 #ifndef HAVE_GCN_XNACK_GFX900
5234 use_xnack_attr = false;
5235 #endif
5236 use_sram_attr = false;
5237 break;
5238 case PROCESSOR_VEGA20:
5239 cpu = "gfx906";
5240 #ifndef HAVE_GCN_XNACK_GFX906
5241 use_xnack_attr = false;
5242 #endif
5243 use_sram_attr = false;
5244 break;
5245 case PROCESSOR_GFX908:
5246 cpu = "gfx908";
5247 #ifndef HAVE_GCN_XNACK_GFX908
5248 use_xnack_attr = false;
5249 #endif
5250 #ifndef HAVE_GCN_SRAM_ECC_GFX908
5251 use_sram_attr = false;
5252 #endif
5253 break;
5254 default: gcc_unreachable ();
5255 }
5256
5257 #if HAVE_GCN_ASM_V3_SYNTAX
5258 const char *xnack = (flag_xnack ? "+xnack" : "");
5259 const char *sram_ecc = (flag_sram_ecc ? "+sram-ecc" : "");
5260 #endif
5261 #if HAVE_GCN_ASM_V4_SYNTAX
5262 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
5263 configuration. In GCC binaries, this is true for SRAM ECC, but not
5264 XNACK. */
5265 const char *xnack = (flag_xnack ? ":xnack+" : ":xnack-");
5266 const char *sram_ecc = (flag_sram_ecc == SRAM_ECC_ON ? ":sramecc+"
5267 : flag_sram_ecc == SRAM_ECC_OFF ? ":sramecc-"
5268 : "");
5269 #endif
5270 if (!use_xnack_attr)
5271 xnack = "";
5272 if (!use_sram_attr)
5273 sram_ecc = "";
5274
5275 fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n",
5276 cpu,
5277 #if HAVE_GCN_ASM_V3_SYNTAX
5278 xnack, sram_ecc
5279 #endif
5280 #ifdef HAVE_GCN_ASM_V4_SYNTAX
5281 sram_ecc, xnack
5282 #endif
5283 );
5284 }
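/* For example (illustrative, HSACOv4 assembler syntax, assuming the
   assembler supports both attributes for this target), compiling for
   gfx908 with SRAM ECC enabled and XNACK disabled would emit:

     .amdgcn_target "amdgcn-unknown-amdhsa--gfx908:sramecc+:xnack-"
*/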
5285
5286 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
5287
5288 Print the initial definition of a function name.
5289
5290 For GCN kernel entry points this includes all the HSA meta-data, special
5291 alignment constraints that don't apply to regular functions, and magic
5292 comments that pass information to mkoffload. */
5293
5294 void
5295 gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
5296 {
5297 int sgpr, vgpr;
5298 bool xnack_enabled = false;
5299
5300 fputs ("\n\n", file);
5301
5302 if (cfun && cfun->machine && cfun->machine->normal_function)
5303 {
5304 fputs ("\t.type\t", file);
5305 assemble_name (file, name);
5306 fputs (",@function\n", file);
5307 assemble_name (file, name);
5308 fputs (":\n", file);
5309 return;
5310 }
5311
5312 /* Determine count of sgpr/vgpr registers by looking for last
5313 one used. */
5314 for (sgpr = 101; sgpr >= 0; sgpr--)
5315 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr))
5316 break;
5317 sgpr++;
5318 for (vgpr = 255; vgpr >= 0; vgpr--)
5319 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
5320 break;
5321 vgpr++;
5322
5323 if (!leaf_function_p ())
5324 {
5325 /* We can't know how many registers function calls might use. */
5326 if (vgpr < MAX_NORMAL_VGPR_COUNT)
5327 vgpr = MAX_NORMAL_VGPR_COUNT;
5328 if (sgpr < MAX_NORMAL_SGPR_COUNT)
5329 sgpr = MAX_NORMAL_SGPR_COUNT;
5330 }
5331
5332 fputs ("\t.rodata\n"
5333 "\t.p2align\t6\n"
5334 "\t.amdhsa_kernel\t", file);
5335 assemble_name (file, name);
5336 fputs ("\n", file);
5337 int reg = FIRST_SGPR_REG;
5338 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
5339 {
5340 int reg_first = -1;
5341 int reg_last;
5342 if ((cfun->machine->args.requested & (1 << a))
5343 && (gcn_kernel_arg_types[a].fixed_regno < 0))
5344 {
5345 reg_first = reg;
5346 reg_last = (reg_first
5347 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode)
5348 / UNITS_PER_WORD) - 1);
5349 reg = reg_last + 1;
5350 }
5351
5352 if (gcn_kernel_arg_types[a].header_pseudo)
5353 {
5354 fprintf (file, "\t %s%s\t%i",
5355 (cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
5356 gcn_kernel_arg_types[a].header_pseudo,
5357 (cfun->machine->args.requested & (1 << a)) != 0);
5358 if (reg_first != -1)
5359 {
5360 fprintf (file, " ; (");
5361 for (int i = reg_first; i <= reg_last; ++i)
5362 {
5363 if (i != reg_first)
5364 fprintf (file, ", ");
5365 fprintf (file, "%s", reg_names[i]);
5366 }
5367 fprintf (file, ")");
5368 }
5369 fprintf (file, "\n");
5370 }
5371 else if (gcn_kernel_arg_types[a].fixed_regno >= 0
5372 && cfun->machine->args.requested & (1 << a))
5373 fprintf (file, "\t ; %s\t%i (%s)\n",
5374 gcn_kernel_arg_types[a].name,
5375 (cfun->machine->args.requested & (1 << a)) != 0,
5376 reg_names[gcn_kernel_arg_types[a].fixed_regno]);
5377 }
5378 fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
5379 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
5380 ? 2
5381 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
5382 ? 1 : 0);
5383 fprintf (file,
5384 "\t .amdhsa_next_free_vgpr\t%i\n"
5385 "\t .amdhsa_next_free_sgpr\t%i\n"
5386 "\t .amdhsa_reserve_vcc\t1\n"
5387 "\t .amdhsa_reserve_flat_scratch\t0\n"
5388 "\t .amdhsa_reserve_xnack_mask\t%i\n"
5389 "\t .amdhsa_private_segment_fixed_size\t%i\n"
5390 "\t .amdhsa_group_segment_fixed_size\t%u\n"
5391 "\t .amdhsa_float_denorm_mode_32\t3\n"
5392 "\t .amdhsa_float_denorm_mode_16_64\t3\n",
5393 vgpr,
5394 sgpr,
5395 xnack_enabled,
5396 /* workitem_private_segment_bytes_size needs to be
5397 one 64th the wave-front stack size. */
5398 stack_size_opt / 64,
5399 LDS_SIZE);
5400 fputs ("\t.end_amdhsa_kernel\n", file);
5401
5402 #if 1
5403 /* The following is YAML embedded in assembler; tabs are not allowed. */
5404 fputs (" .amdgpu_metadata\n"
5405 " amdhsa.version:\n"
5406 " - 1\n"
5407 " - 0\n"
5408 " amdhsa.kernels:\n"
5409 " - .name: ", file);
5410 assemble_name (file, name);
5411 fputs ("\n .symbol: ", file);
5412 assemble_name (file, name);
5413 fprintf (file,
5414 ".kd\n"
5415 " .kernarg_segment_size: %i\n"
5416 " .kernarg_segment_align: %i\n"
5417 " .group_segment_fixed_size: %u\n"
5418 " .private_segment_fixed_size: %i\n"
5419 " .wavefront_size: 64\n"
5420 " .sgpr_count: %i\n"
5421 " .vgpr_count: %i\n"
5422 " .max_flat_workgroup_size: 1024\n",
5423 cfun->machine->kernarg_segment_byte_size,
5424 cfun->machine->kernarg_segment_alignment,
5425 LDS_SIZE,
5426 stack_size_opt / 64,
5427 sgpr, vgpr);
5428 fputs (" .end_amdgpu_metadata\n", file);
5429 #endif
5430
5431 fputs ("\t.text\n", file);
5432 fputs ("\t.align\t256\n", file);
5433 fputs ("\t.type\t", file);
5434 assemble_name (file, name);
5435 fputs (",@function\n", file);
5436 assemble_name (file, name);
5437 fputs (":\n", file);
5438
5439 /* This comment is read by mkoffload. */
5440 if (flag_openacc)
5441 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n",
5442 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG),
5443 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER),
5444 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name);
5445 }
5446
5447 /* Implement TARGET_ASM_SELECT_SECTION.
5448
5449 Return the section into which EXP should be placed. */
5450
5451 static section *
5452 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align)
5453 {
5454 if (TREE_TYPE (exp) != error_mark_node
5455 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS)
5456 {
5457 if (!DECL_P (exp))
5458 return get_section (".lds_bss",
5459 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG,
5460 NULL);
5461
5462 return get_named_section (exp, ".lds_bss", reloc);
5463 }
5464
5465 return default_elf_select_section (exp, reloc, align);
5466 }
5467
5468 /* Implement TARGET_ASM_FUNCTION_PROLOGUE.
5469
5470 Emits custom text into the assembler file at the head of each function. */
5471
5472 static void
5473 gcn_target_asm_function_prologue (FILE *file)
5474 {
5475 machine_function *offsets = gcn_compute_frame_offsets ();
5476
5477 asm_fprintf (file, "\t; using %s addressing in function\n",
5478 offsets->use_flat_addressing ? "flat" : "global");
5479
5480 if (offsets->normal_function)
5481 {
5482 asm_fprintf (file, "\t; frame pointer needed: %s\n",
5483 offsets->need_frame_pointer ? "true" : "false");
5484 asm_fprintf (file, "\t; lr needs saving: %s\n",
5485 offsets->lr_needs_saving ? "true" : "false");
5486 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5487 offsets->outgoing_args_size);
5488 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size);
5489 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5490 asm_fprintf (file, "\t; callee save size: %wd\n",
5491 offsets->callee_saves);
5492 }
5493 else
5494 {
5495 asm_fprintf (file, "\t; HSA kernel entry point\n");
5496 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
5497 asm_fprintf (file, "\t; outgoing args size: %wd\n",
5498 offsets->outgoing_args_size);
5499 }
5500 }
5501
5502 /* Helper function for print_operand and print_operand_address.
5503
5504 Print a register as the assembler requires, according to mode and name. */
5505
5506 static void
5507 print_reg (FILE *file, rtx x)
5508 {
5509 machine_mode mode = GET_MODE (x);
5510 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode
5511 || mode == HFmode || mode == SFmode
5512 || mode == V64SFmode || mode == V64SImode
5513 || mode == V64QImode || mode == V64HImode)
5514 fprintf (file, "%s", reg_names[REGNO (x)]);
5515 else if (mode == DImode || mode == V64DImode
5516 || mode == DFmode || mode == V64DFmode)
5517 {
5518 if (SGPR_REGNO_P (REGNO (x)))
5519 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5520 REGNO (x) - FIRST_SGPR_REG + 1);
5521 else if (VGPR_REGNO_P (REGNO (x)))
5522 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5523 REGNO (x) - FIRST_VGPR_REG + 1);
5524 else if (REGNO (x) == FLAT_SCRATCH_REG)
5525 fprintf (file, "flat_scratch");
5526 else if (REGNO (x) == EXEC_REG)
5527 fprintf (file, "exec");
5528 else if (REGNO (x) == VCC_LO_REG)
5529 fprintf (file, "vcc");
5530 else
5531 fprintf (file, "[%s:%s]",
5532 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]);
5533 }
5534 else if (mode == TImode)
5535 {
5536 if (SGPR_REGNO_P (REGNO (x)))
5537 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG,
5538 REGNO (x) - FIRST_SGPR_REG + 3);
5539 else if (VGPR_REGNO_P (REGNO (x)))
5540 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
5541 REGNO (x) - FIRST_VGPR_REG + 3);
5542 else
5543 gcc_unreachable ();
5544 }
5545 else
5546 gcc_unreachable ();
5547 }
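/* Examples of the output above (illustrative): an SImode value in s4
   prints as "s4"; a DImode value starting at s4 prints as "s[4:5]"; a
   TImode value starting at v8 prints as "v[8:11]"; the DImode EXEC and
   VCC registers print as "exec" and "vcc".  */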
5548
5549 /* Implement TARGET_SECTION_TYPE_FLAGS.
5550
5551 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */
5552
5553 static unsigned int
5554 gcn_section_type_flags (tree decl, const char *name, int reloc)
5555 {
5556 if (strcmp (name, ".lds_bss") == 0)
5557 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG;
5558
5559 return default_section_type_flags (decl, name, reloc);
5560 }
5561
5562 /* Helper function for gcn_asm_output_symbol_ref.
5563
5564 FIXME: This function is used to lay out gang-private variables in LDS
5565 on a per-CU basis.
5566 There may be cases in which gang-private variables in different compilation
5567 units could clobber each other. In that case we should be relying on the
5568 linker to lay out gang-private LDS space, but that doesn't appear to be
5569 possible at present. */
5570
5571 static void
5572 gcn_print_lds_decl (FILE *f, tree var)
5573 {
5574 int *offset;
5575 if ((offset = lds_allocs.get (var)))
5576 fprintf (f, "%u", (unsigned) *offset);
5577 else
5578 {
5579 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var);
5580 tree type = TREE_TYPE (var);
5581 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5582 if (size > align && size > 4 && align < 8)
5583 align = 8;
5584
5585 gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
5586
5587 lds_allocs.put (var, gang_private_hwm);
5588 fprintf (f, "%u", gang_private_hwm);
5589 gang_private_hwm += size;
5590 if (gang_private_hwm > gang_private_size_opt)
5591 error ("%d bytes of gang-private data-share memory exhausted"
5592 " (increase with %<-mgang-private-size=%d%>, for example)",
5593 gang_private_size_opt, gang_private_hwm);
5594 }
5595 }
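/* Worked example (hypothetical sizes): assuming the high-water mark
   currently stands at 32, an 8-byte gang-private variable is printed with
   offset 32 and the mark advances to 40, so a following 4-byte variable
   gets offset 40.  Once the mark exceeds the -mgang-private-size limit,
   the error above reports both the configured limit and the total that
   would have been required.  */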
5596
5597 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */
5598
5599 void
5600 gcn_asm_output_symbol_ref (FILE *file, rtx x)
5601 {
5602 tree decl;
5603 if (cfun
5604 && (decl = SYMBOL_REF_DECL (x)) != 0
5605 && TREE_CODE (decl) == VAR_DECL
5606 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5607 {
5608 /* LDS symbols (emitted using this hook) are only used at present
5609 to propagate worker values from an active thread to neutered
5610 threads. Use the same offset for each such block, but don't
5611 use zero because null pointers are used to identify the active
5612 thread in GOACC_single_copy_start calls. */
5613 gcn_print_lds_decl (file, decl);
5614 }
5615 else
5616 {
5617 assemble_name (file, XSTR (x, 0));
5618 /* FIXME: See above -- this condition is unreachable. */
5619 if (cfun
5620 && (decl = SYMBOL_REF_DECL (x)) != 0
5621 && TREE_CODE (decl) == VAR_DECL
5622 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl))))
5623 fputs ("@abs32", file);
5624 }
5625 }
5626
5627 /* Implement TARGET_CONSTANT_ALIGNMENT.
5628
5629 Returns the alignment in bits of a constant that is being placed in memory.
5630 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object
5631 would ordinarily have. */
5632
5633 static HOST_WIDE_INT
5634 gcn_constant_alignment (const_tree ARG_UNUSED (constant),
5635 HOST_WIDE_INT basic_align)
5636 {
5637 return basic_align > 128 ? basic_align : 128;
5638 }
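/* For example, a constant-pool entry that would ordinarily be 32-bit
   aligned is promoted to 128-bit alignment here, while a 256-bit vector
   constant keeps its larger natural alignment.  */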
5639
5640 /* Implement PRINT_OPERAND_ADDRESS via gcn.h. */
5641
5642 void
5643 print_operand_address (FILE *file, rtx mem)
5644 {
5645 gcc_assert (MEM_P (mem));
5646
5647 rtx reg;
5648 rtx offset;
5649 addr_space_t as = MEM_ADDR_SPACE (mem);
5650 rtx addr = XEXP (mem, 0);
5651 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS);
5652
5653 if (AS_SCRATCH_P (as))
5654 switch (GET_CODE (addr))
5655 {
5656 case REG:
5657 print_reg (file, addr);
5658 break;
5659
5660 case PLUS:
5661 reg = XEXP (addr, 0);
5662 offset = XEXP (addr, 1);
5663 print_reg (file, reg);
5664 if (GET_CODE (offset) == CONST_INT)
5665 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5666 else
5667 abort ();
5668 break;
5669
5670 default:
5671 debug_rtx (addr);
5672 abort ();
5673 }
5674 else if (AS_ANY_FLAT_P (as))
5675 {
5676 if (GET_CODE (addr) == REG)
5677 print_reg (file, addr);
5678 else
5679 {
5680 gcc_assert (TARGET_GCN5_PLUS);
5681 print_reg (file, XEXP (addr, 0));
5682 }
5683 }
5684 else if (AS_GLOBAL_P (as))
5685 {
5686 gcc_assert (TARGET_GCN5_PLUS);
5687
5688 rtx base = addr;
5689 rtx vgpr_offset = NULL_RTX;
5690
5691 if (GET_CODE (addr) == PLUS)
5692 {
5693 base = XEXP (addr, 0);
5694
5695 if (GET_CODE (base) == PLUS)
5696 {
5697 /* (SGPR + VGPR) + CONST */
5698 vgpr_offset = XEXP (base, 1);
5699 base = XEXP (base, 0);
5700 }
5701 else
5702 {
5703 rtx offset = XEXP (addr, 1);
5704
5705 if (REG_P (offset))
5706 /* SGPR + VGPR */
5707 vgpr_offset = offset;
5708 else if (CONST_INT_P (offset))
5709 /* VGPR + CONST or SGPR + CONST */
5710 ;
5711 else
5712 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5713 }
5714 }
5715
5716 if (REG_P (base))
5717 {
5718 if (VGPR_REGNO_P (REGNO (base)))
5719 print_reg (file, base);
5720 else if (SGPR_REGNO_P (REGNO (base)))
5721 {
5722 /* The assembler requires a 64-bit VGPR pair here, even though
5723 the offset should be only 32-bit. */
5724 if (vgpr_offset == NULL_RTX)
5725 /* In this case, the vector offset is zero, so we use the first
5726 lane of v1, which is initialized to zero. */
5727 {
5728 if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5729 fprintf (file, "v1");
5730 else
5731 fprintf (file, "v[1:2]");
5732 }
5733 else if (REG_P (vgpr_offset)
5734 && VGPR_REGNO_P (REGNO (vgpr_offset)))
5735 {
5736 if (HAVE_GCN_ASM_GLOBAL_LOAD_FIXED)
5737 fprintf (file, "v%d",
5738 REGNO (vgpr_offset) - FIRST_VGPR_REG);
5739 else
5740 fprintf (file, "v[%d:%d]",
5741 REGNO (vgpr_offset) - FIRST_VGPR_REG,
5742 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1);
5743 }
5744 else
5745 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5746 }
5747 }
5748 else
5749 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
5750 }
5751 else if (AS_ANY_DS_P (as))
5752 switch (GET_CODE (addr))
5753 {
5754 case REG:
5755 print_reg (file, addr);
5756 break;
5757
5758 case PLUS:
5759 reg = XEXP (addr, 0);
5760 print_reg (file, reg);
5761 break;
5762
5763 default:
5764 debug_rtx (addr);
5765 abort ();
5766 }
5767 else
5768 switch (GET_CODE (addr))
5769 {
5770 case REG:
5771 print_reg (file, addr);
5772 fprintf (file, ", 0");
5773 break;
5774
5775 case PLUS:
5776 reg = XEXP (addr, 0);
5777 offset = XEXP (addr, 1);
5778 print_reg (file, reg);
5779 fprintf (file, ", ");
5780 if (GET_CODE (offset) == REG)
5781 print_reg (file, offset);
5782 else if (GET_CODE (offset) == CONST_INT)
5783 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
5784 else
5785 abort ();
5786 break;
5787
5788 default:
5789 debug_rtx (addr);
5790 abort ();
5791 }
5792 }
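/* Illustrative (assumed) output of the above: a scratch address of the form
   (plus (reg) (const_int 16)) prints as "<reg> offset:16"; a GCN5 global
   address with an SGPR base prints only the VGPR offset here (or v1, which
   is initialized to zero, when there is none), the SGPR base and any
   constant offset being emitted separately by the %O operand code.  */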
5793
5794 /* Implement PRINT_OPERAND via gcn.h.
5795
5796 b - print operand size as untyped operand (b8/b16/b32/b64)
5797 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64)
5798 i - print operand size as signed integer operand (i8/i16/i32/i64, or f16/f32/f64 for FP modes)
5799 I - print operand size as SI/DI signed integer operand (i32/i64)
5800 u - print operand size as unsigned integer operand (u8/u16/u32/u64)
5801 U - print operand size as SI/DI unsigned integer operand (u32/u64)
5802 o - print operand size as memory access size for loads
5803 (ubyte/ushort/dword/dwordx2/dwordx3/dwordx4)
5804 s - print operand size as memory access size for stores
5805 (byte/short/dword/dwordx2/dwordx3/dwordx4)
5806 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...)
5807 c - print inverse conditional code for s_cbranch
5808 D - print conditional code for s_cmp (eq_u64/lg_u64...)
5809 E - print conditional code for v_cmp (eq_u64/ne_u64...)
5810 A - print address in formatting suitable for given address space.
5811 O - print offset:n for data share operations.
5812 ^ - print "_co" suffix for GCN5 mnemonics
5813 g - print "glc", if appropriate for given MEM
5814 */
5815
5816 void
5817 print_operand (FILE *file, rtx x, int code)
5818 {
5819 int xcode = x ? GET_CODE (x) : 0;
5820 bool invert = false;
5821 switch (code)
5822 {
5823 /* Instructions have the following suffixes.
5824 If there are two suffixes, the first is the destination type,
5825 and the second is the source type.
5826
5827 B32 Bitfield (untyped data) 32-bit
5828 B64 Bitfield (untyped data) 64-bit
5829 F16 floating-point 16-bit
5830 F32 floating-point 32-bit (IEEE 754 single-precision float)
5831 F64 floating-point 64-bit (IEEE 754 double-precision float)
5832 I16 signed 16-bit integer
5833 I32 signed 32-bit integer
5834 I64 signed 64-bit integer
5835 U16 unsigned 16-bit integer
5836 U32 unsigned 32-bit integer
5837 U64 unsigned 64-bit integer */
5838
5839 /* Print operand size as untyped suffix. */
5840 case 'b':
5841 {
5842 const char *s = "";
5843 machine_mode mode = GET_MODE (x);
5844 if (VECTOR_MODE_P (mode))
5845 mode = GET_MODE_INNER (mode);
5846 switch (GET_MODE_SIZE (mode))
5847 {
5848 case 1:
5849 s = "_b8";
5850 break;
5851 case 2:
5852 s = "_b16";
5853 break;
5854 case 4:
5855 s = "_b32";
5856 break;
5857 case 8:
5858 s = "_b64";
5859 break;
5860 default:
5861 output_operand_lossage ("invalid operand %%xn code");
5862 return;
5863 }
5864 fputs (s, file);
5865 }
5866 return;
5867 case 'B':
5868 {
5869 const char *s = "";
5870 machine_mode mode = GET_MODE (x);
5871 if (VECTOR_MODE_P (mode))
5872 mode = GET_MODE_INNER (mode);
5873 switch (GET_MODE_SIZE (mode))
5874 {
5875 case 1:
5876 case 2:
5877 case 4:
5878 s = "_b32";
5879 break;
5880 case 8:
5881 s = "_b64";
5882 break;
5883 default:
5884 output_operand_lossage ("invalid operand %%xn code");
5885 return;
5886 }
5887 fputs (s, file);
5888 }
5889 return;
5890 case 'e':
5891 fputs ("sext(", file);
5892 print_operand (file, x, 0);
5893 fputs (")", file);
5894 return;
5895 case 'i':
5896 case 'I':
5897 case 'u':
5898 case 'U':
5899 {
5900 bool signed_p = code == 'i';
5901 bool min32_p = code == 'I' || code == 'U';
5902 const char *s = "";
5903 machine_mode mode = GET_MODE (x);
5904 if (VECTOR_MODE_P (mode))
5905 mode = GET_MODE_INNER (mode);
5906 if (mode == VOIDmode)
5907 switch (GET_CODE (x))
5908 {
5909 case CONST_INT:
5910 s = signed_p ? "_i32" : "_u32";
5911 break;
5912 case CONST_DOUBLE:
5913 s = "_f64";
5914 break;
5915 default:
5916 output_operand_lossage ("invalid operand %%xn code");
5917 return;
5918 }
5919 else if (FLOAT_MODE_P (mode))
5920 switch (GET_MODE_SIZE (mode))
5921 {
5922 case 2:
5923 s = "_f16";
5924 break;
5925 case 4:
5926 s = "_f32";
5927 break;
5928 case 8:
5929 s = "_f64";
5930 break;
5931 default:
5932 output_operand_lossage ("invalid operand %%xn code");
5933 return;
5934 }
5935 else if (min32_p)
5936 switch (GET_MODE_SIZE (mode))
5937 {
5938 case 1:
5939 case 2:
5940 case 4:
5941 s = signed_p ? "_i32" : "_u32";
5942 break;
5943 case 8:
5944 s = signed_p ? "_i64" : "_u64";
5945 break;
5946 default:
5947 output_operand_lossage ("invalid operand %%xn code");
5948 return;
5949 }
5950 else
5951 switch (GET_MODE_SIZE (mode))
5952 {
5953 case 1:
5954 s = signed_p ? "_i8" : "_u8";
5955 break;
5956 case 2:
5957 s = signed_p ? "_i16" : "_u16";
5958 break;
5959 case 4:
5960 s = signed_p ? "_i32" : "_u32";
5961 break;
5962 case 8:
5963 s = signed_p ? "_i64" : "_u64";
5964 break;
5965 default:
5966 output_operand_lossage ("invalid operand %%xn code");
5967 return;
5968 }
5969 fputs (s, file);
5970 }
5971 return;
5972 /* Print operand size as a memory load-width suffix. */
5973 case 'o':
5974 {
5975 const char *s = 0;
5976 switch (GET_MODE_SIZE (GET_MODE (x)))
5977 {
5978 case 1:
5979 s = "_ubyte";
5980 break;
5981 case 2:
5982 s = "_ushort";
5983 break;
5984 /* The following are full-vector variants. */
5985 case 64:
5986 s = "_ubyte";
5987 break;
5988 case 128:
5989 s = "_ushort";
5990 break;
5991 }
5992
5993 if (s)
5994 {
5995 fputs (s, file);
5996 return;
5997 }
5998
5999 /* Fall-through - the other cases for 'o' are the same as for 's'. */
6000 gcc_fallthrough ();
6001 }
6002 case 's':
6003 {
6004 const char *s = "";
6005 switch (GET_MODE_SIZE (GET_MODE (x)))
6006 {
6007 case 1:
6008 s = "_byte";
6009 break;
6010 case 2:
6011 s = "_short";
6012 break;
6013 case 4:
6014 s = "_dword";
6015 break;
6016 case 8:
6017 s = "_dwordx2";
6018 break;
6019 case 12:
6020 s = "_dwordx3";
6021 break;
6022 case 16:
6023 s = "_dwordx4";
6024 break;
6025 case 32:
6026 s = "_dwordx8";
6027 break;
6028 case 64:
6029 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16";
6030 break;
6031 /* The following are full-vector variants. */
6032 case 128:
6033 s = "_short";
6034 break;
6035 case 256:
6036 s = "_dword";
6037 break;
6038 case 512:
6039 s = "_dwordx2";
6040 break;
6041 default:
6042 output_operand_lossage ("invalid operand %%xn code");
6043 return;
6044 }
6045 fputs (s, file);
6046 }
6047 return;
6048 case 'A':
6049 if (xcode != MEM)
6050 {
6051 output_operand_lossage ("invalid %%xn code");
6052 return;
6053 }
6054 print_operand_address (file, x);
6055 return;
6056 case 'O':
6057 {
6058 if (xcode != MEM)
6059 {
6060 output_operand_lossage ("invalid %%xn code");
6061 return;
6062 }
6063 if (AS_GDS_P (MEM_ADDR_SPACE (x)))
6064 fprintf (file, " gds");
6065
6066 rtx x0 = XEXP (x, 0);
6067 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x)))
6068 {
6069 gcc_assert (TARGET_GCN5_PLUS);
6070
6071 fprintf (file, ", ");
6072
6073 rtx base = x0;
6074 rtx const_offset = NULL_RTX;
6075
6076 if (GET_CODE (base) == PLUS)
6077 {
6078 rtx offset = XEXP (x0, 1);
6079 base = XEXP (x0, 0);
6080
6081 if (GET_CODE (base) == PLUS)
6082 /* (SGPR + VGPR) + CONST */
6083 /* Ignore the VGPR offset for this operand. */
6084 base = XEXP (base, 0);
6085
6086 if (CONST_INT_P (offset))
6087 const_offset = XEXP (x0, 1);
6088 else if (REG_P (offset))
6089 /* SGPR + VGPR */
6090 /* Ignore the VGPR offset for this operand. */
6091 ;
6092 else
6093 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6094 }
6095
6096 if (REG_P (base))
6097 {
6098 if (VGPR_REGNO_P (REGNO (base)))
6099 /* The VGPR address is specified in the %A operand. */
6100 fprintf (file, "off");
6101 else if (SGPR_REGNO_P (REGNO (base)))
6102 print_reg (file, base);
6103 else
6104 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6105 }
6106 else
6107 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address");
6108
6109 if (const_offset != NULL_RTX)
6110 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC,
6111 INTVAL (const_offset));
6112
6113 return;
6114 }
6115
6116 if (GET_CODE (x0) == REG)
6117 return;
6118 if (GET_CODE (x0) != PLUS)
6119 {
6120 output_operand_lossage ("invalid %%xn code");
6121 return;
6122 }
6123 rtx val = XEXP (x0, 1);
6124 if (GET_CODE (val) == CONST_VECTOR)
6125 val = CONST_VECTOR_ELT (val, 0);
6126 if (GET_CODE (val) != CONST_INT)
6127 {
6128 output_operand_lossage ("invalid %%xn code");
6129 return;
6130 }
6131 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val));
6132
6133 }
6134 return;
6135 case 'c':
6136 invert = true;
6137 /* Fall through. */
6138 case 'C':
6139 {
6140 const char *s;
6141 bool num = false;
6142 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0)))
6143 {
6144 output_operand_lossage ("invalid %%xn code");
6145 return;
6146 }
6147 switch (REGNO (XEXP (x, 0)))
6148 {
6149 case VCC_REG:
6150 case VCCZ_REG:
6151 s = "_vcc";
6152 break;
6153 case SCC_REG:
6154 /* For some reason llvm-mc insists on scc0 instead of sccz. */
6155 num = true;
6156 s = "_scc";
6157 break;
6158 case EXECZ_REG:
6159 s = "_exec";
6160 break;
6161 default:
6162 output_operand_lossage ("invalid %%xn code");
6163 return;
6164 }
6165 fputs (s, file);
6166 if (xcode == (invert ? NE : EQ))
6167 fputc (num ? '0' : 'z', file);
6168 else
6169 fputs (num ? "1" : "nz", file);
6170 return;
6171 }
6172 case 'D':
6173 {
6174 const char *s;
6175 bool cmp_signed = false;
6176 switch (xcode)
6177 {
6178 case EQ:
6179 s = "_eq_";
6180 break;
6181 case NE:
6182 s = "_lg_";
6183 break;
6184 case LT:
6185 s = "_lt_";
6186 cmp_signed = true;
6187 break;
6188 case LE:
6189 s = "_le_";
6190 cmp_signed = true;
6191 break;
6192 case GT:
6193 s = "_gt_";
6194 cmp_signed = true;
6195 break;
6196 case GE:
6197 s = "_ge_";
6198 cmp_signed = true;
6199 break;
6200 case LTU:
6201 s = "_lt_";
6202 break;
6203 case LEU:
6204 s = "_le_";
6205 break;
6206 case GTU:
6207 s = "_gt_";
6208 break;
6209 case GEU:
6210 s = "_ge_";
6211 break;
6212 default:
6213 output_operand_lossage ("invalid %%xn code");
6214 return;
6215 }
6216 fputs (s, file);
6217 fputc (cmp_signed ? 'i' : 'u', file);
6218
6219 machine_mode mode = GET_MODE (XEXP (x, 0));
6220
6221 if (mode == VOIDmode)
6222 mode = GET_MODE (XEXP (x, 1));
6223
6224 /* If both sides are constants, then assume the instruction is in
6225 SImode since s_cmp can only do integer compares. */
6226 if (mode == VOIDmode)
6227 mode = SImode;
6228
6229 switch (GET_MODE_SIZE (mode))
6230 {
6231 case 4:
6232 s = "32";
6233 break;
6234 case 8:
6235 s = "64";
6236 break;
6237 default:
6238 output_operand_lossage ("invalid operand %%xn code");
6239 return;
6240 }
6241 fputs (s, file);
6242 return;
6243 }
6244 case 'E':
6245 {
6246 const char *s;
6247 bool cmp_signed = false;
6248 machine_mode mode = GET_MODE (XEXP (x, 0));
6249
6250 if (mode == VOIDmode)
6251 mode = GET_MODE (XEXP (x, 1));
6252
6253 /* If both sides are constants, assume the instruction is in SFmode
6254 if either operand is floating point, otherwise assume SImode. */
6255 if (mode == VOIDmode)
6256 {
6257 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
6258 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE)
6259 mode = SFmode;
6260 else
6261 mode = SImode;
6262 }
6263
6264 /* Use the same format code for vector comparisons. */
6265 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
6266 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
6267 mode = GET_MODE_INNER (mode);
6268
6269 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT;
6270
6271 switch (xcode)
6272 {
6273 case EQ:
6274 s = "_eq_";
6275 break;
6276 case NE:
6277 s = float_p ? "_neq_" : "_ne_";
6278 break;
6279 case LT:
6280 s = "_lt_";
6281 cmp_signed = true;
6282 break;
6283 case LE:
6284 s = "_le_";
6285 cmp_signed = true;
6286 break;
6287 case GT:
6288 s = "_gt_";
6289 cmp_signed = true;
6290 break;
6291 case GE:
6292 s = "_ge_";
6293 cmp_signed = true;
6294 break;
6295 case LTU:
6296 s = "_lt_";
6297 break;
6298 case LEU:
6299 s = "_le_";
6300 break;
6301 case GTU:
6302 s = "_gt_";
6303 break;
6304 case GEU:
6305 s = "_ge_";
6306 break;
6307 case ORDERED:
6308 s = "_o_";
6309 break;
6310 case UNORDERED:
6311 s = "_u_";
6312 break;
6313 case UNEQ:
6314 s = "_nlg_";
6315 break;
6316 case UNGE:
6317 s = "_nlt_";
6318 break;
6319 case UNGT:
6320 s = "_nle_";
6321 break;
6322 case UNLE:
6323 s = "_ngt_";
6324 break;
6325 case UNLT:
6326 s = "_nge_";
6327 break;
6328 case LTGT:
6329 s = "_lg_";
6330 break;
6331 default:
6332 output_operand_lossage ("invalid %%xn code");
6333 return;
6334 }
6335 fputs (s, file);
6336 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file);
6337
6338 switch (GET_MODE_SIZE (mode))
6339 {
6340 case 1:
6341 output_operand_lossage ("operand %%xn code invalid for QImode");
6342 return;
6343 case 2:
6344 s = "16";
6345 break;
6346 case 4:
6347 s = "32";
6348 break;
6349 case 8:
6350 s = "64";
6351 break;
6352 default:
6353 output_operand_lossage ("invalid operand %%xn code");
6354 return;
6355 }
6356 fputs (s, file);
6357 return;
6358 }
6359 case 'L':
6360 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0);
6361 return;
6362 case 'H':
6363 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0);
6364 return;
6365 case 'R':
6366 /* Print a scalar register number as an integer. Temporary hack. */
6367 gcc_assert (REG_P (x));
6368 fprintf (file, "%d", (int) REGNO (x));
6369 return;
6370 case 'V':
6371 /* Print a vector register number as an integer. Temporary hack. */
6372 gcc_assert (REG_P (x));
6373 fprintf (file, "%d", (int) REGNO (x) - FIRST_VGPR_REG);
6374 return;
6375 case 0:
6376 if (xcode == REG)
6377 print_reg (file, x);
6378 else if (xcode == MEM)
6379 output_address (GET_MODE (x), x);
6380 else if (xcode == CONST_INT)
6381 fprintf (file, "%i", (int) INTVAL (x));
6382 else if (xcode == CONST_VECTOR)
6383 print_operand (file, CONST_VECTOR_ELT (x, 0), code);
6384 else if (xcode == CONST_DOUBLE)
6385 {
6386 const char *str;
6387 switch (gcn_inline_fp_constant_p (x, false))
6388 {
6389 case 240:
6390 str = "0.5";
6391 break;
6392 case 241:
6393 str = "-0.5";
6394 break;
6395 case 242:
6396 str = "1.0";
6397 break;
6398 case 243:
6399 str = "-1.0";
6400 break;
6401 case 244:
6402 str = "2.0";
6403 break;
6404 case 245:
6405 str = "-2.0";
6406 break;
6407 case 246:
6408 str = "4.0";
6409 break;
6410 case 247:
6411 str = "-4.0";
6412 break;
6413 case 248:
6414 str = "1/pi";
6415 break;
6416 default:
6417 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode
6418 ? DImode : SImode,
6419 x, GET_MODE (x), 0);
6420 if (ix)
6421 print_operand (file, ix, code);
6422 else
6423 output_operand_lossage ("invalid fp constant");
6424 return;
6425 break;
6426 }
6427 fputs (str, file);
6428 return;
6429 }
6430 else
6431 output_addr_const (file, x);
6432 return;
6433 case '^':
6434 if (TARGET_GCN5_PLUS)
6435 fputs ("_co", file);
6436 return;
6437 case 'g':
6438 gcc_assert (xcode == MEM);
6439 if (MEM_VOLATILE_P (x))
6440 fputs (" glc", file);
6441 return;
6442 default:
6443 output_operand_lossage ("invalid %%xn code");
6444 }
6445 gcc_unreachable ();
6446 }
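/* A few illustrative expansions of the codes above (hypothetical operands,
   not from the machine description): %D on (geu:DI ...) prints "_ge_u64",
   %E on (ne:SF ...) prints "_neq_f32", %C on (ne (reg vcc) (const_int 0))
   prints "_vccnz", %o on a QImode MEM prints "_ubyte", and %g on a
   volatile MEM prints " glc".  */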
6447
6448 /* Implement DBX_REGISTER_NUMBER macro.
6449
6450 Return the DWARF register number that corresponds to the GCC internal
6451 REGNO. */
6452
6453 unsigned int
6454 gcn_dwarf_register_number (unsigned int regno)
6455 {
6456 /* Registers defined in DWARF. */
6457 if (regno == EXEC_LO_REG)
6458 return 17;
6459 /* We need to use a more complex DWARF expression for this
6460 else if (regno == EXEC_HI_REG)
6461 return 17; */
6462 else if (regno == VCC_LO_REG)
6463 return 768;
6464 /* We need to use a more complex DWARF expression for this
6465 else if (regno == VCC_HI_REG)
6466 return 768; */
6467 else if (regno == SCC_REG)
6468 return 128;
6469 else if (regno == DWARF_LINK_REGISTER)
6470 return 16;
6471 else if (SGPR_REGNO_P (regno))
6472 {
6473 if (regno - FIRST_SGPR_REG < 64)
6474 return (regno - FIRST_SGPR_REG + 32);
6475 else
6476 return (regno - FIRST_SGPR_REG + 1024);
6477 }
6478 else if (VGPR_REGNO_P (regno))
6479 return (regno - FIRST_VGPR_REG + 2560);
6480
6481 /* Otherwise, there's nothing sensible to do. */
6482 return regno + 100000;
6483 }
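/* For example (following the mapping above): the first scalar register maps
   to DWARF register 32, scalar registers beyond the first 64 map to their
   SGPR index plus 1024, the first vector register maps to 2560, and the
   link register uses 16.  */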
6484
6485 /* Implement TARGET_DWARF_REGISTER_SPAN.
6486
6487 DImode and Vector DImode require additional registers. */
6488
6489 static rtx
6490 gcn_dwarf_register_span (rtx rtl)
6491 {
6492 machine_mode mode = GET_MODE (rtl);
6493
6494 if (VECTOR_MODE_P (mode))
6495 mode = GET_MODE_INNER (mode);
6496
6497 if (GET_MODE_SIZE (mode) != 8)
6498 return NULL_RTX;
6499
6500 unsigned regno = REGNO (rtl);
6501
6502 if (regno == DWARF_LINK_REGISTER)
6503 return NULL_RTX;
6504
6505 rtx p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
6506 XVECEXP (p, 0, 0) = gen_rtx_REG (SImode, regno);
6507 XVECEXP (p, 0, 1) = gen_rtx_REG (SImode, regno + 1);
6508
6509 return p;
6510 }
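/* For instance, a DImode value held in an SGPR pair is described to DWARF
   as a PARALLEL of the two component SImode registers, roughly
   (parallel [(reg:SI sN) (reg:SI sN+1)]); modes narrower than 8 bytes and
   the link register need no span.  */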
6511
6512 /* }}} */
6513 /* {{{ TARGET hook overrides. */
6514
6515 #undef TARGET_ADDR_SPACE_ADDRESS_MODE
6516 #define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
6517 #undef TARGET_ADDR_SPACE_DEBUG
6518 #define TARGET_ADDR_SPACE_DEBUG gcn_addr_space_debug
6519 #undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
6520 #define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
6521 gcn_addr_space_legitimate_address_p
6522 #undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
6523 #define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
6524 #undef TARGET_ADDR_SPACE_POINTER_MODE
6525 #define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
6526 #undef TARGET_ADDR_SPACE_SUBSET_P
6527 #define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
6528 #undef TARGET_ADDR_SPACE_CONVERT
6529 #define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
6530 #undef TARGET_ARG_PARTIAL_BYTES
6531 #define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
6532 #undef TARGET_ASM_ALIGNED_DI_OP
6533 #define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
6534 #undef TARGET_ASM_FILE_START
6535 #define TARGET_ASM_FILE_START output_file_start
6536 #undef TARGET_ASM_FUNCTION_PROLOGUE
6537 #define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
6538 #undef TARGET_ASM_SELECT_SECTION
6539 #define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
6540 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
6541 #define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
6542 #undef TARGET_ATTRIBUTE_TABLE
6543 #define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
6544 #undef TARGET_BUILTIN_DECL
6545 #define TARGET_BUILTIN_DECL gcn_builtin_decl
6546 #undef TARGET_CAN_CHANGE_MODE_CLASS
6547 #define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
6548 #undef TARGET_CAN_ELIMINATE
6549 #define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
6550 #undef TARGET_CANNOT_COPY_INSN_P
6551 #define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
6552 #undef TARGET_CLASS_LIKELY_SPILLED_P
6553 #define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
6554 #undef TARGET_CLASS_MAX_NREGS
6555 #define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
6556 #undef TARGET_CONDITIONAL_REGISTER_USAGE
6557 #define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
6558 #undef TARGET_CONSTANT_ALIGNMENT
6559 #define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
6560 #undef TARGET_DEBUG_UNWIND_INFO
6561 #define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
6562 #undef TARGET_DWARF_REGISTER_SPAN
6563 #define TARGET_DWARF_REGISTER_SPAN gcn_dwarf_register_span
6564 #undef TARGET_EMUTLS_VAR_INIT
6565 #define TARGET_EMUTLS_VAR_INIT gcn_emutls_var_init
6566 #undef TARGET_EXPAND_BUILTIN
6567 #define TARGET_EXPAND_BUILTIN gcn_expand_builtin
6568 #undef TARGET_FRAME_POINTER_REQUIRED
6569 #define TARGET_FRAME_POINTER_REQUIRED gcn_frame_pointer_rqd
6570 #undef TARGET_FUNCTION_ARG
6571 #define TARGET_FUNCTION_ARG gcn_function_arg
6572 #undef TARGET_FUNCTION_ARG_ADVANCE
6573 #define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
6574 #undef TARGET_FUNCTION_VALUE
6575 #define TARGET_FUNCTION_VALUE gcn_function_value
6576 #undef TARGET_FUNCTION_VALUE_REGNO_P
6577 #define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
6578 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
6579 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
6580 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6581 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
6582 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
6583 #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
6584 #undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
6585 #define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
6586 gcn_goacc_create_worker_broadcast_record
6587 #undef TARGET_GOACC_FORK_JOIN
6588 #define TARGET_GOACC_FORK_JOIN gcn_fork_join
6589 #undef TARGET_GOACC_REDUCTION
6590 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
6591 #undef TARGET_GOACC_VALIDATE_DIMS
6592 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
6593 #undef TARGET_GOACC_SHARED_MEM_LAYOUT
6594 #define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
6595 #undef TARGET_HARD_REGNO_MODE_OK
6596 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
6597 #undef TARGET_HARD_REGNO_NREGS
6598 #define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
6599 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6600 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6601 #undef TARGET_INIT_BUILTINS
6602 #define TARGET_INIT_BUILTINS gcn_init_builtins
6603 #undef TARGET_INIT_LIBFUNCS
6604 #define TARGET_INIT_LIBFUNCS gcn_init_libfuncs
6605 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
6606 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
6607 gcn_ira_change_pseudo_allocno_class
6608 #undef TARGET_LEGITIMATE_CONSTANT_P
6609 #define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
6610 #undef TARGET_LRA_P
6611 #define TARGET_LRA_P hook_bool_void_true
6612 #undef TARGET_MACHINE_DEPENDENT_REORG
6613 #define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
6614 #undef TARGET_MEMORY_MOVE_COST
6615 #define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
6616 #undef TARGET_MODES_TIEABLE_P
6617 #define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
6618 #undef TARGET_OPTION_OVERRIDE
6619 #define TARGET_OPTION_OVERRIDE gcn_option_override
6620 #undef TARGET_PRETEND_OUTGOING_VARARGS_NAMED
6621 #define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
6622 gcn_pretend_outgoing_varargs_named
6623 #undef TARGET_PROMOTE_FUNCTION_MODE
6624 #define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
6625 #undef TARGET_REGISTER_MOVE_COST
6626 #define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
6627 #undef TARGET_RETURN_IN_MEMORY
6628 #define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
6629 #undef TARGET_RTX_COSTS
6630 #define TARGET_RTX_COSTS gcn_rtx_costs
6631 #undef TARGET_SECONDARY_RELOAD
6632 #define TARGET_SECONDARY_RELOAD gcn_secondary_reload
6633 #undef TARGET_SECTION_TYPE_FLAGS
6634 #define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
6635 #undef TARGET_SCALAR_MODE_SUPPORTED_P
6636 #define TARGET_SCALAR_MODE_SUPPORTED_P gcn_scalar_mode_supported_p
6637 #undef TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
6638 #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
6639 gcn_small_register_classes_for_mode_p
6640 #undef TARGET_SPILL_CLASS
6641 #define TARGET_SPILL_CLASS gcn_spill_class
6642 #undef TARGET_STRICT_ARGUMENT_NAMING
6643 #define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
6644 #undef TARGET_TRAMPOLINE_INIT
6645 #define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
6646 #undef TARGET_TRULY_NOOP_TRUNCATION
6647 #define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
6648 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
6649 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
6650 #undef TARGET_VECTORIZE_GET_MASK_MODE
6651 #define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
6652 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6653 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
6654 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
6655 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
6656 gcn_preferred_vector_alignment
6657 #undef TARGET_VECTORIZE_RELATED_MODE
6658 #define TARGET_VECTORIZE_RELATED_MODE gcn_related_vector_mode
6659 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
6660 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
6661 gcn_vectorize_support_vector_misalignment
6662 #undef TARGET_VECTORIZE_VEC_PERM_CONST
6663 #define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
6664 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
6665 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
6666 gcn_vector_alignment_reachable
6667 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6668 #define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p
6669
6670 struct gcc_target targetm = TARGET_INITIALIZER;
6671
6672 #include "gt-gcn.h"
6673 /* }}} */