1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "gomp-constants.h"
63 #include "dumpfile.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
67 #include "attribs.h"
68 #include "tree-vrp.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
71 #include "gimplify.h"
72 #include "tree-phinodes.h"
73 #include "cfgloop.h"
74 #include "fold-const.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 #define WORKAROUND_PTXJIT_BUG 1
81 #define WORKAROUND_PTXJIT_BUG_2 1
82 #define WORKAROUND_PTXJIT_BUG_3 1
83
84 /* The various PTX memory areas an object might reside in. */
85 enum nvptx_data_area
86 {
87 DATA_AREA_GENERIC,
88 DATA_AREA_GLOBAL,
89 DATA_AREA_SHARED,
90 DATA_AREA_LOCAL,
91 DATA_AREA_CONST,
92 DATA_AREA_PARAM,
93 DATA_AREA_MAX
94 };
95
96 /* We record the data area in the target symbol flags. */
97 #define SYMBOL_DATA_AREA(SYM) \
98 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
99 & 7)
100 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
101 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
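/* Illustrative usage (a sketch, mirroring what nvptx_option_override does
   below for the worker broadcast buffer): the data area is recorded when a
   symbol is created and queried later when choosing the PTX address space:

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);  */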
102
103 /* Record the function decls we've written, and the libfuncs and function
104 decls corresponding to them. */
105 static std::stringstream func_decls;
106
107 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
108 {
109 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
110 static bool equal (rtx a, rtx b) { return a == b; }
111 };
112
113 static GTY((cache))
114 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
115
116 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
117 {
118 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
119 static bool equal (tree a, tree b) { return a == b; }
120 };
121
122 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
123 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
124
125 /* Buffer needed to broadcast across workers. This is used for both
126 worker-neutering and worker broadcasting. It is shared by all
127 functions emitted. The buffer is placed in shared memory. It'd be
128 nice if PTX supported common blocks, because then this could be
129 shared across TUs (taking the largest size). */
130 static unsigned worker_bcast_size;
131 static unsigned worker_bcast_align;
132 static GTY(()) rtx worker_bcast_sym;
133
134 /* Buffer needed for worker reductions. This has to be distinct from
135 the worker broadcast array, as both may be live concurrently. */
136 static unsigned worker_red_size;
137 static unsigned worker_red_align;
138 static GTY(()) rtx worker_red_sym;
139
140 /* Global lock variable, needed for 128bit worker & gang reductions. */
141 static GTY(()) tree global_lock_var;
142
143 /* True if any function references __nvptx_stacks. */
144 static bool need_softstack_decl;
145
146 /* True if any function references __nvptx_uni. */
147 static bool need_unisimt_decl;
148
149 /* Allocate a new, cleared machine_function structure. */
150
151 static struct machine_function *
152 nvptx_init_machine_status (void)
153 {
154 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
155 p->return_mode = VOIDmode;
156 return p;
157 }
158
159 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
160 and -fopenacc is also enabled. */
161
162 static void
163 diagnose_openacc_conflict (bool optval, const char *optname)
164 {
165 if (flag_openacc && optval)
166 error ("option %s is not supported together with -fopenacc", optname);
167 }
168
169 /* Implement TARGET_OPTION_OVERRIDE. */
170
171 static void
172 nvptx_option_override (void)
173 {
174 init_machine_status = nvptx_init_machine_status;
175
176 /* Set toplevel_reorder, unless explicitly disabled. We need
177 reordering so that we emit necessary assembler decls of
178 undeclared variables. */
179 if (!global_options_set.x_flag_toplevel_reorder)
180 flag_toplevel_reorder = 1;
181
182 debug_nonbind_markers_p = 0;
183
184 /* Set flag_no_common, unless explicitly disabled. We fake common
185 using .weak, and that's not entirely accurate, so avoid it
186 unless forced. */
187 if (!global_options_set.x_flag_no_common)
188 flag_no_common = 1;
189
190 /* The patch area requires nops, which we don't have. */
191 if (function_entry_patch_area_size > 0)
192 sorry ("not generating patch area, nops not supported");
193
194 /* Assumes that it will see only hard registers. */
195 flag_var_tracking = 0;
196
197 if (nvptx_optimize < 0)
198 nvptx_optimize = optimize > 0;
199
200 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
201 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
202 declared_libfuncs_htab
203 = hash_table<declared_libfunc_hasher>::create_ggc (17);
204
205 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
206 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
207 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
208
209 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
210 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
211 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
212
213 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
214 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
215 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
216
217 if (TARGET_GOMP)
218 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
219 }
220
221 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
222 deal with ptx idiosyncrasies. */
223
224 const char *
225 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
226 {
227 switch (mode)
228 {
229 case E_BLKmode:
230 return ".b8";
231 case E_BImode:
232 return ".pred";
233 case E_QImode:
234 if (promote)
235 return ".u32";
236 else
237 return ".u8";
238 case E_HImode:
239 return ".u16";
240 case E_SImode:
241 return ".u32";
242 case E_DImode:
243 return ".u64";
244
245 case E_SFmode:
246 return ".f32";
247 case E_DFmode:
248 return ".f64";
249
250 case E_V2SImode:
251 return ".v2.u32";
252 case E_V2DImode:
253 return ".v2.u64";
254
255 default:
256 gcc_unreachable ();
257 }
258 }
259
260 /* Encode the PTX data area that DECL (which might not actually be a
261 _DECL) should reside in. */
262
263 static void
264 nvptx_encode_section_info (tree decl, rtx rtl, int first)
265 {
266 default_encode_section_info (decl, rtl, first);
267 if (first && MEM_P (rtl))
268 {
269 nvptx_data_area area = DATA_AREA_GENERIC;
270
271 if (TREE_CONSTANT (decl))
272 area = DATA_AREA_CONST;
273 else if (TREE_CODE (decl) == VAR_DECL)
274 {
275 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
276 {
277 area = DATA_AREA_SHARED;
278 if (DECL_INITIAL (decl))
279 error ("static initialization of variable %q+D in %<.shared%>"
280 " memory is not supported", decl);
281 }
282 else
283 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
284 }
285
286 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
287 }
288 }
289
290 /* Return the PTX name of the data area in which SYM should be
291 placed. The symbol must have already been processed by
292 nvptx_encode_section_info, or equivalent. */
293
294 static const char *
295 section_for_sym (rtx sym)
296 {
297 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
298 /* Same order as nvptx_data_area enum. */
299 static char const *const areas[] =
300 {"", ".global", ".shared", ".local", ".const", ".param"};
301
302 return areas[area];
303 }
304
305 /* Similarly for a decl. */
306
307 static const char *
308 section_for_decl (const_tree decl)
309 {
310 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
311 }
312
313 /* Check NAME for special function names and redirect them by returning a
314 replacement. This applies to malloc, free and realloc, for which we
315 want to use libgcc wrappers, and call, which triggers a bug in
316 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
317 not active in an offload compiler -- the names are all set by the
318 host-side compiler. */
319
320 static const char *
321 nvptx_name_replacement (const char *name)
322 {
323 if (strcmp (name, "call") == 0)
324 return "__nvptx_call";
325 if (strcmp (name, "malloc") == 0)
326 return "__nvptx_malloc";
327 if (strcmp (name, "free") == 0)
328 return "__nvptx_free";
329 if (strcmp (name, "realloc") == 0)
330 return "__nvptx_realloc";
331 return name;
332 }
333
334 /* If MODE should be treated as two registers of an inner mode, return
335 that inner mode. Otherwise return VOIDmode. */
336
337 static machine_mode
338 maybe_split_mode (machine_mode mode)
339 {
340 if (COMPLEX_MODE_P (mode))
341 return GET_MODE_INNER (mode);
342
343 if (mode == TImode)
344 return DImode;
345
346 return VOIDmode;
347 }
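/* For example, maybe_split_mode (TImode) yields DImode and
   maybe_split_mode (DCmode) yields DFmode, while SImode is not split and
   yields VOIDmode.  */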
348
349 /* Return true if mode should be treated as two registers. */
350
351 static bool
352 split_mode_p (machine_mode mode)
353 {
354 return maybe_split_mode (mode) != VOIDmode;
355 }
356
357 /* Output a register, subreg, or register pair (with optional
358 enclosing braces). */
359
360 static void
361 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
362 int subreg_offset = -1)
363 {
364 if (inner_mode == VOIDmode)
365 {
366 if (HARD_REGISTER_NUM_P (regno))
367 fprintf (file, "%s", reg_names[regno]);
368 else
369 fprintf (file, "%%r%d", regno);
370 }
371 else if (subreg_offset >= 0)
372 {
373 output_reg (file, regno, VOIDmode);
374 fprintf (file, "$%d", subreg_offset);
375 }
376 else
377 {
378 if (subreg_offset == -1)
379 fprintf (file, "{");
380 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
381 fprintf (file, ",");
382 output_reg (file, regno, inner_mode, 0);
383 if (subreg_offset == -1)
384 fprintf (file, "}");
385 }
386 }
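/* For illustration (register number 23 is hypothetical): a pseudo that has
   been split into two DImode halves is printed, with the default
   SUBREG_OFFSET of -1, as the brace-enclosed pair "{%r23$8,%r23$0}" (the
   halves at byte offsets 8 and 0), whereas an unsplit pseudo is printed
   simply as "%r23".  */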
387
388 /* Emit forking instructions for MASK. */
389
390 static void
391 nvptx_emit_forking (unsigned mask, bool is_call)
392 {
393 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
394 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
395 if (mask)
396 {
397 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
398
399 /* Emit fork at all levels. This helps form SESE regions, as
400 it creates a block with a single successor before entering a
401 partitioned region. That is a good candidate for the end of
402 an SESE region. */
403 emit_insn (gen_nvptx_fork (op));
404 emit_insn (gen_nvptx_forked (op));
405 }
406 }
407
408 /* Emit joining instructions for MASK. */
409
410 static void
411 nvptx_emit_joining (unsigned mask, bool is_call)
412 {
413 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
414 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
415 if (mask)
416 {
417 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
418
419 /* Emit joining for all non-call pars to ensure there's a single
420 predecessor for the block the join insn ends up in. This is
421 needed for skipping entire loops. */
422 emit_insn (gen_nvptx_joining (op));
423 emit_insn (gen_nvptx_join (op));
424 }
425 }
426
427 \f
428 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
429 returned in memory. Integer and floating types supported by the
430 machine are passed in registers, everything else is passed in
431 memory. Complex types are split. */
432
433 static bool
434 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
435 {
436 if (type)
437 {
438 if (AGGREGATE_TYPE_P (type))
439 return true;
440 if (TREE_CODE (type) == VECTOR_TYPE)
441 return true;
442 }
443
444 if (!for_return && COMPLEX_MODE_P (mode))
445 /* Complex types are passed as two underlying args. */
446 mode = GET_MODE_INNER (mode);
447
448 if (GET_MODE_CLASS (mode) != MODE_INT
449 && GET_MODE_CLASS (mode) != MODE_FLOAT)
450 return true;
451
452 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
453 return true;
454
455 return false;
456 }
457
458 /* A non-memory argument of mode MODE is being passed; determine the mode it
459 should be promoted to. This is also used for determining return
460 type promotion. */
461
462 static machine_mode
463 promote_arg (machine_mode mode, bool prototyped)
464 {
465 if (!prototyped && mode == SFmode)
466 /* K&R float promotion for unprototyped functions. */
467 mode = DFmode;
468 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
469 mode = SImode;
470
471 return mode;
472 }
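/* For example, promote_arg (QImode, true) and promote_arg (HImode, true)
   both yield SImode, while promote_arg (SFmode, false) yields DFmode
   (K&R float promotion for an unprototyped call); a prototyped SImode or
   DFmode argument is returned unchanged.  */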
473
474 /* A non-memory return type of MODE is being returned. Determine the
475 mode it should be promoted to. */
476
477 static machine_mode
478 promote_return (machine_mode mode)
479 {
480 return promote_arg (mode, true);
481 }
482
483 /* Implement TARGET_FUNCTION_ARG. */
484
485 static rtx
486 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
487 const_tree, bool named)
488 {
489 if (mode == VOIDmode || !named)
490 return NULL_RTX;
491
492 return gen_reg_rtx (mode);
493 }
494
495 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
496
497 static rtx
498 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
499 const_tree, bool named)
500 {
501 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
502
503 if (mode == VOIDmode || !named)
504 return NULL_RTX;
505
506 /* No need to deal with split modes here; the only case that can
507 happen is complex modes and those are dealt with by
508 TARGET_SPLIT_COMPLEX_ARG. */
509 return gen_rtx_UNSPEC (mode,
510 gen_rtvec (1, GEN_INT (cum->count)),
511 UNSPEC_ARG_REG);
512 }
513
514 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
515
516 static void
517 nvptx_function_arg_advance (cumulative_args_t cum_v,
518 machine_mode ARG_UNUSED (mode),
519 const_tree ARG_UNUSED (type),
520 bool ARG_UNUSED (named))
521 {
522 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
523
524 cum->count++;
525 }
526
527 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
528
529 For nvptx, this is only used for variadic args. The type has already
530 been promoted and/or converted to invisible reference. */
531
532 static unsigned
533 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
534 {
535 return GET_MODE_ALIGNMENT (mode);
536 }
537
538 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
539
540 For nvptx, we know how to handle functions declared as stdarg: by
541 passing an extra pointer to the unnamed arguments. However, the
542 Fortran frontend can produce a different situation, where a
543 function pointer is declared with no arguments, but the actual
544 function and calls to it take more arguments. In that case, we
545 want to ensure the call matches the definition of the function. */
546
547 static bool
548 nvptx_strict_argument_naming (cumulative_args_t cum_v)
549 {
550 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
551
552 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
553 }
554
555 /* Implement TARGET_LIBCALL_VALUE. */
556
557 static rtx
558 nvptx_libcall_value (machine_mode mode, const_rtx)
559 {
560 if (!cfun || !cfun->machine->doing_call)
561 /* Pretend to return in a hard reg for early uses before pseudos can be
562 generated. */
563 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
564
565 return gen_reg_rtx (mode);
566 }
567
568 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
569 where function FUNC returns or receives a value of data type TYPE. */
570
571 static rtx
572 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
573 bool outgoing)
574 {
575 machine_mode mode = promote_return (TYPE_MODE (type));
576
577 if (outgoing)
578 {
579 gcc_assert (cfun);
580 cfun->machine->return_mode = mode;
581 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
582 }
583
584 return nvptx_libcall_value (mode, NULL_RTX);
585 }
586
587 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
588
589 static bool
590 nvptx_function_value_regno_p (const unsigned int regno)
591 {
592 return regno == NVPTX_RETURN_REGNUM;
593 }
594
595 /* Types with a mode other than those supported by the machine are passed by
596 reference in memory. */
597
598 static bool
599 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
600 machine_mode mode, const_tree type,
601 bool ARG_UNUSED (named))
602 {
603 return pass_in_memory (mode, type, false);
604 }
605
606 /* Implement TARGET_RETURN_IN_MEMORY. */
607
608 static bool
609 nvptx_return_in_memory (const_tree type, const_tree)
610 {
611 return pass_in_memory (TYPE_MODE (type), type, true);
612 }
613
614 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
615
616 static machine_mode
617 nvptx_promote_function_mode (const_tree type, machine_mode mode,
618 int *ARG_UNUSED (punsignedp),
619 const_tree funtype, int for_return)
620 {
621 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
622 }
623
624 /* Helper for write_arg. Emit a single PTX argument of MODE, either
625 in a prototype, or as a copy in a function prologue. ARGNO is the
626 index of this argument in the PTX function. FOR_REG is negative
627 if we're emitting the PTX prototype. It is zero if we're copying
628 to an argument register and it is greater than zero if we're
629 copying to a specific hard register. */
630
631 static int
632 write_arg_mode (std::stringstream &s, int for_reg, int argno,
633 machine_mode mode)
634 {
635 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
636
637 if (for_reg < 0)
638 {
639 /* Writing PTX prototype. */
640 s << (argno ? ", " : " (");
641 s << ".param" << ptx_type << " %in_ar" << argno;
642 }
643 else
644 {
645 s << "\t.reg" << ptx_type << " ";
646 if (for_reg)
647 s << reg_names[for_reg];
648 else
649 s << "%ar" << argno;
650 s << ";\n";
651 if (argno >= 0)
652 {
653 s << "\tld.param" << ptx_type << " ";
654 if (for_reg)
655 s << reg_names[for_reg];
656 else
657 s << "%ar" << argno;
658 s << ", [%in_ar" << argno << "];\n";
659 }
660 }
661 return argno + 1;
662 }
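/* Sketch of the emitted text for an SImode argument 0: in a prototype
   (FOR_REG < 0) this appends " (.param.u32 %in_ar0"; in a prologue copy
   (FOR_REG == 0) it appends
       .reg.u32 %ar0;
       ld.param.u32 %ar0, [%in_ar0];
   The closing ")" of a prototype is added by the caller once all the
   arguments have been written.  */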
663
664 /* Process function parameter TYPE to emit one or more PTX
665 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
666 is true if this is a prototyped function, rather than an old-style
667 C declaration. Returns the next argument number to use.
668
669 The promotion behavior here must match the regular GCC function
670 parameter marshalling machinery. */
671
672 static int
673 write_arg_type (std::stringstream &s, int for_reg, int argno,
674 tree type, bool prototyped)
675 {
676 machine_mode mode = TYPE_MODE (type);
677
678 if (mode == VOIDmode)
679 return argno;
680
681 if (pass_in_memory (mode, type, false))
682 mode = Pmode;
683 else
684 {
685 bool split = TREE_CODE (type) == COMPLEX_TYPE;
686
687 if (split)
688 {
689 /* Complex types are sent as two separate args. */
690 type = TREE_TYPE (type);
691 mode = TYPE_MODE (type);
692 prototyped = true;
693 }
694
695 mode = promote_arg (mode, prototyped);
696 if (split)
697 argno = write_arg_mode (s, for_reg, argno, mode);
698 }
699
700 return write_arg_mode (s, for_reg, argno, mode);
701 }
702
703 /* Emit a PTX return as a prototype or function prologue declaration
704 for MODE. */
705
706 static void
707 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
708 {
709 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
710 const char *pfx = "\t.reg";
711 const char *sfx = ";\n";
712
713 if (for_proto)
714 pfx = "(.param", sfx = "_out) ";
715
716 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
717 }
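/* Sketch of the emitted text (assuming reg_names[NVPTX_RETURN_REGNUM] is
   "%value"): for a DImode return value a prototype gets
   "(.param.u64 %value_out) " and a function prologue gets the declaration
   ".reg.u64 %value;".  */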
718
719 /* Process a function return TYPE to emit a PTX return as a prototype
720 or function prologue declaration. Returns true if return is via an
721 additional pointer parameter. The promotion behavior here must
722 match the regular GCC function return marshalling. */
723
724 static bool
725 write_return_type (std::stringstream &s, bool for_proto, tree type)
726 {
727 machine_mode mode = TYPE_MODE (type);
728
729 if (mode == VOIDmode)
730 return false;
731
732 bool return_in_mem = pass_in_memory (mode, type, true);
733
734 if (return_in_mem)
735 {
736 if (for_proto)
737 return return_in_mem;
738
739 /* Named return values can cause us to return a pointer as well
740 as expect an argument for the return location. This is
741 optimization-level specific, so no caller can make use of
742 this data, but more importantly for us, we must ensure it
743 doesn't change the PTX prototype. */
744 mode = (machine_mode) cfun->machine->return_mode;
745
746 if (mode == VOIDmode)
747 return return_in_mem;
748
749 /* Clear return_mode to inhibit copy of retval to non-existent
750 retval parameter. */
751 cfun->machine->return_mode = VOIDmode;
752 }
753 else
754 mode = promote_return (mode);
755
756 write_return_mode (s, for_proto, mode);
757
758 return return_in_mem;
759 }
760
761 /* Look for attributes in ATTRS that would indicate we must write a function
762 as a .entry kernel rather than a .func. Return true if one is found. */
763
764 static bool
765 write_as_kernel (tree attrs)
766 {
767 return (lookup_attribute ("kernel", attrs) != NULL_TREE
768 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
769 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
770 /* For OpenMP target regions, the corresponding kernel entry is emitted from
771 write_omp_entry as a separate function. */
772 }
773
774 /* Emit a linker marker for a function decl or defn. */
775
776 static void
777 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
778 const char *name)
779 {
780 s << "\n// BEGIN";
781 if (globalize)
782 s << " GLOBAL";
783 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
784 s << name << "\n";
785 }
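/* For example, write_fn_marker (s, true, true, "foo") emits the line
   "// BEGIN GLOBAL FUNCTION DEF: foo", which serves as the linker marker
   described above.  */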
786
787 /* Emit a linker marker for a variable decl or defn. */
788
789 static void
790 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
791 {
792 fprintf (file, "\n// BEGIN%s VAR %s: ",
793 globalize ? " GLOBAL" : "",
794 is_defn ? "DEF" : "DECL");
795 assemble_name_raw (file, name);
796 fputs ("\n", file);
797 }
798
799 /* Write a .func or .kernel declaration or definition along with
800 a helper comment for use by ld. S is the stream to write to, DECL
801 the decl for the function with name NAME. For definitions, emit
802 a declaration too. */
803
804 static const char *
805 write_fn_proto (std::stringstream &s, bool is_defn,
806 const char *name, const_tree decl)
807 {
808 if (is_defn)
809 /* Emit a declaration. The PTX assembler gets upset without it. */
810 name = write_fn_proto (s, false, name, decl);
811 else
812 {
813 /* Avoid repeating the name replacement. */
814 name = nvptx_name_replacement (name);
815 if (name[0] == '*')
816 name++;
817 }
818
819 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
820
821 /* PTX declaration. */
822 if (DECL_EXTERNAL (decl))
823 s << ".extern ";
824 else if (TREE_PUBLIC (decl))
825 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
826 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
827
828 tree fntype = TREE_TYPE (decl);
829 tree result_type = TREE_TYPE (fntype);
830
831 /* atomic_compare_exchange_$n builtins have an exceptional calling
832 convention. */
833 int not_atomic_weak_arg = -1;
834 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
835 switch (DECL_FUNCTION_CODE (decl))
836 {
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
841 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
842 /* These atomics skip the 'weak' parm in an actual library
843 call. We must skip it in the prototype too. */
844 not_atomic_weak_arg = 3;
845 break;
846
847 default:
848 break;
849 }
850
851 /* Declare the result. */
852 bool return_in_mem = write_return_type (s, true, result_type);
853
854 s << name;
855
856 int argno = 0;
857
858 /* Emit argument list. */
859 if (return_in_mem)
860 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
861
862 /* We get:
863 NULL in TYPE_ARG_TYPES, for old-style functions
864 NULL in DECL_ARGUMENTS, for builtin functions without another
865 declaration.
866 So we have to pick the best one we have. */
867 tree args = TYPE_ARG_TYPES (fntype);
868 bool prototyped = true;
869 if (!args)
870 {
871 args = DECL_ARGUMENTS (decl);
872 prototyped = false;
873 }
874
875 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
876 {
877 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
878
879 if (not_atomic_weak_arg)
880 argno = write_arg_type (s, -1, argno, type, prototyped);
881 else
882 gcc_assert (type == boolean_type_node);
883 }
884
885 if (stdarg_p (fntype))
886 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
887
888 if (DECL_STATIC_CHAIN (decl))
889 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
890
891 if (!argno && strcmp (name, "main") == 0)
892 {
893 argno = write_arg_type (s, -1, argno, integer_type_node, true);
894 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
895 }
896
897 if (argno)
898 s << ")";
899
900 s << (is_defn ? "\n" : ";\n");
901
902 return name;
903 }
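/* Putting the above together, a sketch of the declaration emitted for a
   hypothetical "extern int foo (int, float);" would be roughly (assuming
   the return register is named "%value"):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .extern .func (.param.u32 %value_out) foo (.param.u32 %in_ar0, .param.f32 %in_ar1);
*/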
904
905 /* Construct a function declaration from a call insn. This can be
906 necessary for two reasons - either we have an indirect call which
907 requires a .callprototype declaration, or we have a libcall
908 generated by emit_library_call for which no decl exists. */
909
910 static void
911 write_fn_proto_from_insn (std::stringstream &s, const char *name,
912 rtx result, rtx pat)
913 {
914 if (!name)
915 {
916 s << "\t.callprototype ";
917 name = "_";
918 }
919 else
920 {
921 name = nvptx_name_replacement (name);
922 write_fn_marker (s, false, true, name);
923 s << "\t.extern .func ";
924 }
925
926 if (result != NULL_RTX)
927 write_return_mode (s, true, GET_MODE (result));
928
929 s << name;
930
931 int arg_end = XVECLEN (pat, 0);
932 for (int i = 1; i < arg_end; i++)
933 {
934 /* We don't have to deal with mode splitting & promotion here,
935 as that was already done when generating the call
936 sequence. */
937 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
938
939 write_arg_mode (s, -1, i - 1, mode);
940 }
941 if (arg_end != 1)
942 s << ")";
943 s << ";\n";
944 }
945
946 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
947 table and write a ptx prototype. These are emitted at end of
948 compilation. */
949
950 static void
951 nvptx_record_fndecl (tree decl)
952 {
953 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
954 if (*slot == NULL)
955 {
956 *slot = decl;
957 const char *name = get_fnname_from_decl (decl);
958 write_fn_proto (func_decls, false, name, decl);
959 }
960 }
961
962 /* Record a libcall or unprototyped external function. CALLEE is the
963 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
964 declaration for it. */
965
966 static void
967 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
968 {
969 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
970 if (*slot == NULL)
971 {
972 *slot = callee;
973
974 const char *name = XSTR (callee, 0);
975 write_fn_proto_from_insn (func_decls, name, retval, pat);
976 }
977 }
978
979 /* DECL is an external FUNCTION_DECL that we're referencing. If it
980 is prototyped, record it now. Otherwise record it as needed at end
981 of compilation, when we might have more information about it. */
982
983 void
984 nvptx_record_needed_fndecl (tree decl)
985 {
986 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
987 {
988 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
989 if (*slot == NULL)
990 *slot = decl;
991 }
992 else
993 nvptx_record_fndecl (decl);
994 }
995
996 /* SYM is a SYMBOL_REF. If it refers to an external function, record
997 it as needed. */
998
999 static void
1000 nvptx_maybe_record_fnsym (rtx sym)
1001 {
1002 tree decl = SYMBOL_REF_DECL (sym);
1003
1004 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1005 nvptx_record_needed_fndecl (decl);
1006 }
1007
1008 /* Emit a local array to hold some part of a conventional stack frame
1009 and initialize REGNO to point to it. If the size is zero, it'll
1010 never be valid to dereference, so we can simply initialize to
1011 zero. */
1012
1013 static void
1014 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1015 {
1016 if (size)
1017 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1018 align, reg_names[regno], size);
1019 fprintf (file, "\t.reg.u%d %s;\n",
1020 POINTER_SIZE, reg_names[regno]);
1021 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1022 : "\tmov.u%d %s, 0;\n"),
1023 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1024 }
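/* Sketch of the output for a 16-byte frame with 8-byte alignment on a
   64-bit target, assuming the frame-pointer register is named "%frame":

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   When SIZE is zero, no array is declared and the cvta is replaced by
   "mov.u64 %frame, 0;".  */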
1025
1026 /* Emit soft stack frame setup sequence. */
1027
1028 static void
1029 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1030 {
1031 /* Maintain 64-bit stack alignment. */
1032 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1033 size = ROUND_UP (size, keep_align);
1034 int bits = POINTER_SIZE;
1035 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1036 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1037 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1038 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1039 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1040 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1041 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1042 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1043 fprintf (file, "\t{\n");
1044 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1045 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1046 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1047 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1048 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1049 bits == 64 ? ".wide" : ".lo", bits / 8);
1050 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1051
1052 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1053 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1054
1055 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1056 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1057 bits, reg_sspprev, reg_sspslot);
1058
1059 /* Initialize %frame = %sspprev - size. */
1060 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1061 bits, reg_frame, reg_sspprev, size);
1062
1063 /* Apply alignment, if larger than 64. */
1064 if (alignment > keep_align)
1065 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1066 bits, reg_frame, reg_frame, -alignment);
1067
1068 size = crtl->outgoing_args_size;
1069 gcc_assert (size % keep_align == 0);
1070
1071 /* Initialize %stack. */
1072 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1073 bits, reg_stack, reg_frame, size);
1074
1075 if (!crtl->is_leaf)
1076 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1077 bits, reg_sspslot, reg_stack);
1078 fprintf (file, "\t}\n");
1079 cfun->machine->has_softstack = true;
1080 need_softstack_decl = true;
1081 }
1082
1083 /* Emit code to initialize the REGNO predicate register to indicate
1084 whether we are not lane zero on the NAME axis. */
1085
1086 static void
1087 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1088 {
1089 fprintf (file, "\t{\n");
1090 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1091 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1092 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1093 fprintf (file, "\t}\n");
1094 }
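/* For illustration, with REGNO 42 (hypothetical) and NAME "y" this emits:

     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r42, %y, 0;
     }

   so %r42 ends up true in every thread that is not lane zero on the y
   axis.  */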
1095
1096 /* Emit code to initialize predicate and master lane index registers for
1097 -muniform-simt code generation variant. */
1098
1099 static void
1100 nvptx_init_unisimt_predicate (FILE *file)
1101 {
1102 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1103 int loc = REGNO (cfun->machine->unisimt_location);
1104 int bits = POINTER_SIZE;
1105 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1106 fprintf (file, "\t{\n");
1107 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1108 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1109 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1110 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1111 bits == 64 ? ".wide" : ".lo");
1112 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1113 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1114 if (cfun->machine->unisimt_predicate)
1115 {
1116 int master = REGNO (cfun->machine->unisimt_master);
1117 int pred = REGNO (cfun->machine->unisimt_predicate);
1118 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1119 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1120 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1121 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1122 /* Compute predicate as 'tid.x == master'. */
1123 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1124 }
1125 fprintf (file, "\t}\n");
1126 need_unisimt_decl = true;
1127 }
1128
1129 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1130
1131 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1132 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1133 {
1134 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1135 __nvptx_uni[tid.y] = 0;
1136 gomp_nvptx_main (ORIG, arg);
1137 }
1138 ORIG itself should not be emitted as a PTX .entry function. */
1139
1140 static void
1141 write_omp_entry (FILE *file, const char *name, const char *orig)
1142 {
1143 static bool gomp_nvptx_main_declared;
1144 if (!gomp_nvptx_main_declared)
1145 {
1146 gomp_nvptx_main_declared = true;
1147 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1148 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1149 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1150 }
1151 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1152 #define NTID_Y "%ntid.y"
1153 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1154 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1155 {\n\
1156 .reg.u32 %r<3>;\n\
1157 .reg.u" PS " %R<4>;\n\
1158 mov.u32 %r0, %tid.y;\n\
1159 mov.u32 %r1, " NTID_Y ";\n\
1160 mov.u32 %r2, %ctaid.x;\n\
1161 cvt.u" PS ".u32 %R1, %r0;\n\
1162 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1163 mov.u" PS " %R0, __nvptx_stacks;\n\
1164 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1165 ld.param.u" PS " %R2, [%stack];\n\
1166 ld.param.u" PS " %R3, [%sz];\n\
1167 add.u" PS " %R2, %R2, %R3;\n\
1168 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1169 st.shared.u" PS " [%R0], %R2;\n\
1170 mov.u" PS " %R0, __nvptx_uni;\n\
1171 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1172 mov.u32 %r0, 0;\n\
1173 st.shared.u32 [%R0], %r0;\n\
1174 mov.u" PS " %R0, \0;\n\
1175 ld.param.u" PS " %R1, [%arg];\n\
1176 {\n\
1177 .param.u" PS " %P<2>;\n\
1178 st.param.u" PS " [%P0], %R0;\n\
1179 st.param.u" PS " [%P1], %R1;\n\
1180 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1181 }\n\
1182 ret.uni;\n\
1183 }\n"
1184 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1185 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1186 #undef ENTRY_TEMPLATE
1187 #undef NTID_Y
1188 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1189 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1190 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1191 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1192 need_softstack_decl = need_unisimt_decl = true;
1193 }
1194
1195 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1196 function, including local var decls and copies from the arguments to
1197 local regs. */
1198
1199 void
1200 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1201 {
1202 tree fntype = TREE_TYPE (decl);
1203 tree result_type = TREE_TYPE (fntype);
1204 int argno = 0;
1205
1206 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1207 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1208 {
1209 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1210 sprintf (buf, "%s$impl", name);
1211 write_omp_entry (file, name, buf);
1212 name = buf;
1213 }
1214 /* We construct the initial part of the function into a string
1215 stream, in order to share the prototype writing code. */
1216 std::stringstream s;
1217 write_fn_proto (s, true, name, decl);
1218 s << "{\n";
1219
1220 bool return_in_mem = write_return_type (s, false, result_type);
1221 if (return_in_mem)
1222 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1223
1224 /* Declare and initialize incoming arguments. */
1225 tree args = TYPE_ARG_TYPES (fntype);
1226 bool prototyped = true;
1227 if (!args)
1228 {
1229 args = DECL_ARGUMENTS (decl);
1230 prototyped = false;
1231 }
1232
1233 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1234 {
1235 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1236
1237 argno = write_arg_type (s, 0, argno, type, prototyped);
1238 }
1239
1240 if (stdarg_p (fntype))
1241 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1242 true);
1243
1244 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1245 write_arg_type (s, STATIC_CHAIN_REGNUM,
1246 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1247 true);
1248
1249 fprintf (file, "%s", s.str().c_str());
1250
1251 /* Usually 'crtl->is_leaf' is computed during register allocator
1252 initialization (which is not done on NVPTX) or for pressure-sensitive
1253 optimizations. Initialize it here, except if already set. */
1254 if (!crtl->is_leaf)
1255 crtl->is_leaf = leaf_function_p ();
1256
1257 HOST_WIDE_INT sz = get_frame_size ();
1258 bool need_frameptr = sz || cfun->machine->has_chain;
1259 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1260 if (!TARGET_SOFT_STACK)
1261 {
1262 /* Declare a local var for outgoing varargs. */
1263 if (cfun->machine->has_varadic)
1264 init_frame (file, STACK_POINTER_REGNUM,
1265 UNITS_PER_WORD, crtl->outgoing_args_size);
1266
1267 /* Declare a local variable for the frame. Force its size to be
1268 DImode-compatible. */
1269 if (need_frameptr)
1270 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1271 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1272 }
1273 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1274 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1275 init_softstack_frame (file, alignment, sz);
1276
1277 if (cfun->machine->has_simtreg)
1278 {
1279 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1280 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1281 align = MAX (align, GET_MODE_SIZE (DImode));
1282 if (!crtl->is_leaf || cfun->calls_alloca)
1283 simtsz = HOST_WIDE_INT_M1U;
1284 if (simtsz == HOST_WIDE_INT_M1U)
1285 simtsz = nvptx_softstack_size;
1286 if (cfun->machine->has_softstack)
1287 simtsz += POINTER_SIZE / 8;
1288 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1289 if (align > GET_MODE_SIZE (DImode))
1290 simtsz += align - GET_MODE_SIZE (DImode);
1291 if (simtsz)
1292 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1293 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1294 }
1295 /* Declare the pseudos we have as ptx registers. */
1296 int maxregs = max_reg_num ();
1297 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1298 {
1299 if (regno_reg_rtx[i] != const0_rtx)
1300 {
1301 machine_mode mode = PSEUDO_REGNO_MODE (i);
1302 machine_mode split = maybe_split_mode (mode);
1303
1304 if (split_mode_p (mode))
1305 mode = split;
1306 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1307 output_reg (file, i, split, -2);
1308 fprintf (file, ";\n");
1309 }
1310 }
1311
1312 /* Emit axis predicates. */
1313 if (cfun->machine->axis_predicate[0])
1314 nvptx_init_axis_predicate (file,
1315 REGNO (cfun->machine->axis_predicate[0]), "y");
1316 if (cfun->machine->axis_predicate[1])
1317 nvptx_init_axis_predicate (file,
1318 REGNO (cfun->machine->axis_predicate[1]), "x");
1319 if (cfun->machine->unisimt_predicate
1320 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1321 nvptx_init_unisimt_predicate (file);
1322 }
1323
1324 /* Output code for switching uniform-simt state. ENTERING indicates whether
1325 we are entering or leaving a non-uniform execution region. */
1326
1327 static void
1328 nvptx_output_unisimt_switch (FILE *file, bool entering)
1329 {
1330 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1331 return;
1332 fprintf (file, "\t{\n");
1333 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1334 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1335 if (!crtl->is_leaf)
1336 {
1337 int loc = REGNO (cfun->machine->unisimt_location);
1338 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1339 }
1340 if (cfun->machine->unisimt_predicate)
1341 {
1342 int master = REGNO (cfun->machine->unisimt_master);
1343 int pred = REGNO (cfun->machine->unisimt_predicate);
1344 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1345 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1346 master, entering ? "%ustmp2" : "0");
1347 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1348 }
1349 fprintf (file, "\t}\n");
1350 }
1351
1352 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1353 ENTERING indicates whether we are entering or leaving non-uniform execution.
1354 PTR is the register pointing to allocated storage; it is assigned to on
1355 entering and used to restore state on leaving. SIZE and ALIGN are used only
1356 on entering. */
1357
1358 static void
1359 nvptx_output_softstack_switch (FILE *file, bool entering,
1360 rtx ptr, rtx size, rtx align)
1361 {
1362 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1363 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1364 return;
1365 int bits = POINTER_SIZE, regno = REGNO (ptr);
1366 fprintf (file, "\t{\n");
1367 if (entering)
1368 {
1369 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1370 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1371 cfun->machine->simt_stack_size);
1372 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1373 if (CONST_INT_P (size))
1374 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1375 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1376 else
1377 output_reg (file, REGNO (size), VOIDmode);
1378 fputs (";\n", file);
1379 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1380 fprintf (file,
1381 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1382 bits, regno, regno, UINTVAL (align));
1383 }
1384 if (cfun->machine->has_softstack)
1385 {
1386 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1387 if (entering)
1388 {
1389 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1390 bits, regno, bits / 8, reg_stack);
1391 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1392 bits, reg_stack, regno, bits / 8);
1393 }
1394 else
1395 {
1396 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1397 bits, reg_stack, regno, bits / 8);
1398 }
1399 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1400 }
1401 fprintf (file, "\t}\n");
1402 }
1403
1404 /* Output code to enter non-uniform execution region. DEST is a register
1405 to hold a per-lane allocation given by SIZE and ALIGN. */
1406
1407 const char *
1408 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1409 {
1410 nvptx_output_unisimt_switch (asm_out_file, true);
1411 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1412 return "";
1413 }
1414
1415 /* Output code to leave non-uniform execution region. SRC is the register
1416 holding per-lane storage previously allocated by omp_simt_enter insn. */
1417
1418 const char *
1419 nvptx_output_simt_exit (rtx src)
1420 {
1421 nvptx_output_unisimt_switch (asm_out_file, false);
1422 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1423 return "";
1424 }
1425
1426 /* Output instruction that sets soft stack pointer in shared memory to the
1427 value in register given by SRC_REGNO. */
1428
1429 const char *
1430 nvptx_output_set_softstack (unsigned src_regno)
1431 {
1432 if (cfun->machine->has_softstack && !crtl->is_leaf)
1433 {
1434 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1435 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1436 output_reg (asm_out_file, src_regno, VOIDmode);
1437 fprintf (asm_out_file, ";\n");
1438 }
1439 return "";
1440 }
1441 /* Output a return instruction. Also copy the return value to its outgoing
1442 location. */
1443
1444 const char *
1445 nvptx_output_return (void)
1446 {
1447 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1448
1449 if (mode != VOIDmode)
1450 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1451 nvptx_ptx_type_from_mode (mode, false),
1452 reg_names[NVPTX_RETURN_REGNUM],
1453 reg_names[NVPTX_RETURN_REGNUM]);
1454
1455 return "ret;";
1456 }
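/* For a function returning an SImode value (and assuming the return
   register is named "%value"), the sequence printed here is

     st.param.u32 [%value_out], %value;
     ret;

   For a VOIDmode return only "ret;" is emitted.  */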
1457
1458 /* Terminate a function by writing a closing brace to FILE. */
1459
1460 void
1461 nvptx_function_end (FILE *file)
1462 {
1463 fprintf (file, "}\n");
1464 }
1465 \f
1466 /* Decide whether we can make a sibling call to a function. For ptx, we
1467 can't. */
1468
1469 static bool
1470 nvptx_function_ok_for_sibcall (tree, tree)
1471 {
1472 return false;
1473 }
1474
1475 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1476
1477 static rtx
1478 nvptx_get_drap_rtx (void)
1479 {
1480 if (TARGET_SOFT_STACK && stack_realign_drap)
1481 return arg_pointer_rtx;
1482 return NULL_RTX;
1483 }
1484
1485 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1486 argument to the next call. */
1487
1488 static void
1489 nvptx_call_args (rtx arg, tree fntype)
1490 {
1491 if (!cfun->machine->doing_call)
1492 {
1493 cfun->machine->doing_call = true;
1494 cfun->machine->is_varadic = false;
1495 cfun->machine->num_args = 0;
1496
1497 if (fntype && stdarg_p (fntype))
1498 {
1499 cfun->machine->is_varadic = true;
1500 cfun->machine->has_varadic = true;
1501 cfun->machine->num_args++;
1502 }
1503 }
1504
1505 if (REG_P (arg) && arg != pc_rtx)
1506 {
1507 cfun->machine->num_args++;
1508 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1509 cfun->machine->call_args);
1510 }
1511 }
1512
1513 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1514 information we recorded. */
1515
1516 static void
1517 nvptx_end_call_args (void)
1518 {
1519 cfun->machine->doing_call = false;
1520 free_EXPR_LIST_list (&cfun->machine->call_args);
1521 }
1522
1523 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1524 track of whether calls involving static chains or varargs were seen
1525 in the current function.
1526 For libcalls, maintain a hash table of decls we have seen, and
1527 record a function decl for later when encountering a new one. */
1528
1529 void
1530 nvptx_expand_call (rtx retval, rtx address)
1531 {
1532 rtx callee = XEXP (address, 0);
1533 rtx varargs = NULL_RTX;
1534 unsigned parallel = 0;
1535
1536 if (!call_insn_operand (callee, Pmode))
1537 {
1538 callee = force_reg (Pmode, callee);
1539 address = change_address (address, QImode, callee);
1540 }
1541
1542 if (GET_CODE (callee) == SYMBOL_REF)
1543 {
1544 tree decl = SYMBOL_REF_DECL (callee);
1545 if (decl != NULL_TREE)
1546 {
1547 if (DECL_STATIC_CHAIN (decl))
1548 cfun->machine->has_chain = true;
1549
1550 tree attr = oacc_get_fn_attrib (decl);
1551 if (attr)
1552 {
1553 tree dims = TREE_VALUE (attr);
1554
1555 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1556 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1557 {
1558 if (TREE_PURPOSE (dims)
1559 && !integer_zerop (TREE_PURPOSE (dims)))
1560 break;
1561 /* Not on this axis. */
1562 parallel ^= GOMP_DIM_MASK (ix);
1563 dims = TREE_CHAIN (dims);
1564 }
1565 }
1566 }
1567 }
1568
1569 unsigned nargs = cfun->machine->num_args;
1570 if (cfun->machine->is_varadic)
1571 {
1572 varargs = gen_reg_rtx (Pmode);
1573 emit_move_insn (varargs, stack_pointer_rtx);
1574 }
1575
1576 rtvec vec = rtvec_alloc (nargs + 1);
1577 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1578 int vec_pos = 0;
1579
1580 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1581 rtx tmp_retval = retval;
1582 if (retval)
1583 {
1584 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1585 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1586 call = gen_rtx_SET (tmp_retval, call);
1587 }
1588 XVECEXP (pat, 0, vec_pos++) = call;
1589
1590 /* Construct the call insn, including a USE for each argument pseudo
1591 register. These will be used when printing the insn. */
1592 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1593 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1594
1595 if (varargs)
1596 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1597
1598 gcc_assert (vec_pos == XVECLEN (pat, 0));
1599
1600 nvptx_emit_forking (parallel, true);
1601 emit_call_insn (pat);
1602 nvptx_emit_joining (parallel, true);
1603
1604 if (tmp_retval != retval)
1605 emit_move_insn (retval, tmp_retval);
1606 }
1607
1608 /* Emit a comparison COMPARE, and return the new test to be used in the
1609 jump. */
1610
1611 rtx
1612 nvptx_expand_compare (rtx compare)
1613 {
1614 rtx pred = gen_reg_rtx (BImode);
1615 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1616 XEXP (compare, 0), XEXP (compare, 1));
1617 emit_insn (gen_rtx_SET (pred, cmp));
1618 return gen_rtx_NE (BImode, pred, const0_rtx);
1619 }
1620
1621 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1622
1623 void
1624 nvptx_expand_oacc_fork (unsigned mode)
1625 {
1626 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1627 }
1628
1629 void
1630 nvptx_expand_oacc_join (unsigned mode)
1631 {
1632 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1633 }
1634
1635 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1636 objects. */
1637
1638 static rtx
1639 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1640 {
1641 rtx res;
1642
1643 switch (GET_MODE (src))
1644 {
1645 case E_DImode:
1646 res = gen_unpackdisi2 (dst0, dst1, src);
1647 break;
1648 case E_DFmode:
1649 res = gen_unpackdfsi2 (dst0, dst1, src);
1650 break;
1651 default: gcc_unreachable ();
1652 }
1653 return res;
1654 }
1655
1656 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1657 object. */
1658
1659 static rtx
1660 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1661 {
1662 rtx res;
1663
1664 switch (GET_MODE (dst))
1665 {
1666 case E_DImode:
1667 res = gen_packsidi2 (dst, src0, src1);
1668 break;
1669 case E_DFmode:
1670 res = gen_packsidf2 (dst, src0, src1);
1671 break;
1672 default: gcc_unreachable ();
1673 }
1674 return res;
1675 }
1676
1677 /* Generate an instruction or sequence to shuffle register SRC into DST
1678 across the lanes of a single warp, per shuffle kind KIND and index IDX. */
1679
1680 rtx
1681 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1682 {
1683 rtx res;
1684
1685 switch (GET_MODE (dst))
1686 {
1687 case E_SImode:
1688 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1689 break;
1690 case E_SFmode:
1691 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1692 break;
1693 case E_DImode:
1694 case E_DFmode:
1695 {
1696 rtx tmp0 = gen_reg_rtx (SImode);
1697 rtx tmp1 = gen_reg_rtx (SImode);
1698
1699 start_sequence ();
1700 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1701 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1702 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1703 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1704 res = get_insns ();
1705 end_sequence ();
1706 }
1707 break;
1708 case E_BImode:
1709 {
1710 rtx tmp = gen_reg_rtx (SImode);
1711
1712 start_sequence ();
1713 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1714 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1715 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1716 res = get_insns ();
1717 end_sequence ();
1718 }
1719 break;
1720 case E_QImode:
1721 case E_HImode:
1722 {
1723 rtx tmp = gen_reg_rtx (SImode);
1724
1725 start_sequence ();
1726 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1727 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1728 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1729 tmp)));
1730 res = get_insns ();
1731 end_sequence ();
1732 }
1733 break;
1734
1735 default:
1736 gcc_unreachable ();
1737 }
1738 return res;
1739 }
1740
1741 /* Generate an instruction or sequence to broadcast register REG
1742 across the vectors of a single warp. */
1743
1744 static rtx
1745 nvptx_gen_vcast (rtx reg)
1746 {
1747 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1748 }
1749
1750 /* Structure used when generating a worker-level spill or fill. */
1751
1752 struct wcast_data_t
1753 {
1754 rtx base; /* Register holding base addr of buffer. */
1755 rtx ptr; /* Iteration var, if needed. */
1756 unsigned offset; /* Offset into worker buffer. */
1757 };
1758
1759 /* Direction of the spill/fill and looping setup/teardown indicator. */
1760
1761 enum propagate_mask
1762 {
1763 PM_read = 1 << 0,
1764 PM_write = 1 << 1,
1765 PM_loop_begin = 1 << 2,
1766 PM_loop_end = 1 << 3,
1767
1768 PM_read_write = PM_read | PM_write
1769 };
1770
1771 /* Generate instruction(s) to spill or fill register REG to/from the
1772 worker broadcast array. PM indicates what is to be done, REP
1773 how many loop iterations will be executed (0 for not a loop). */
1774
1775 static rtx
1776 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1777 {
1778 rtx res;
1779 machine_mode mode = GET_MODE (reg);
1780
1781 switch (mode)
1782 {
1783 case E_BImode:
1784 {
1785 rtx tmp = gen_reg_rtx (SImode);
1786
1787 start_sequence ();
1788 if (pm & PM_read)
1789 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1790 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1791 if (pm & PM_write)
1792 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1793 res = get_insns ();
1794 end_sequence ();
1795 }
1796 break;
1797
1798 default:
1799 {
1800 rtx addr = data->ptr;
1801
1802 if (!addr)
1803 {
1804 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1805
1806 if (align > worker_bcast_align)
1807 worker_bcast_align = align;
1808 data->offset = (data->offset + align - 1) & ~(align - 1);
1809 addr = data->base;
1810 if (data->offset)
1811 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1812 }
1813
1814 addr = gen_rtx_MEM (mode, addr);
1815 if (pm == PM_read)
1816 res = gen_rtx_SET (addr, reg);
1817 else if (pm == PM_write)
1818 res = gen_rtx_SET (reg, addr);
1819 else
1820 gcc_unreachable ();
1821
1822 if (data->ptr)
1823 {
1824 /* We're using a ptr, increment it. */
1825 start_sequence ();
1826
1827 emit_insn (res);
1828 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1829 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1830 res = get_insns ();
1831 end_sequence ();
1832 }
1833 else
1834 rep = 1;
1835 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1836 }
1837 break;
1838 }
1839 return res;
1840 }
1841 \f
1842 /* Returns true if X is a valid address for use in a memory reference. */
1843
1844 static bool
1845 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1846 {
1847 enum rtx_code code = GET_CODE (x);
1848
1849 switch (code)
1850 {
1851 case REG:
1852 return true;
1853
1854 case PLUS:
1855 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1856 return true;
1857 return false;
1858
1859 case CONST:
1860 case SYMBOL_REF:
1861 case LABEL_REF:
1862 return true;
1863
1864 default:
1865 return false;
1866 }
1867 }
1868 \f
1869 /* Machinery to output constant initializers. When beginning an
1870 initializer, we decide on a fragment size (which is visible in ptx
1871 in the type used), and then all initializer data is buffered until
1872 a fragment is filled and ready to be written out. */
1873
1874 static struct
1875 {
1876 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1877 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1878 unsigned HOST_WIDE_INT remaining; /* Remaining fragments to be
1879 written out. */
1880 unsigned size; /* Fragment size to accumulate. */
1881 unsigned offset; /* Offset within current fragment. */
1882 bool started; /* Whether we've output any initializer. */
1883 } init_frag;
1884
1885 /* The current fragment is full, write it out. SYM may provide a
1886 symbolic reference we should output, in which case the fragment
1887 value is the addend. */
1888
1889 static void
1890 output_init_frag (rtx sym)
1891 {
1892 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1893 unsigned HOST_WIDE_INT val = init_frag.val;
1894
1895 init_frag.started = true;
1896 init_frag.val = 0;
1897 init_frag.offset = 0;
1898 init_frag.remaining--;
1899
1900 if (sym)
1901 {
1902 bool function = (SYMBOL_REF_DECL (sym)
1903 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1904 if (!function)
1905 fprintf (asm_out_file, "generic(");
1906 output_address (VOIDmode, sym);
1907 if (!function)
1908 fprintf (asm_out_file, ")");
1909 if (val)
1910 fprintf (asm_out_file, " + ");
1911 }
1912
1913 if (!sym || val)
1914 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1915 }
1916
1917 /* Add value VAL of size SIZE to the data we're emitting, and keep
1918 writing out chunks as they fill up. */
1919
1920 static void
1921 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1922 {
1923 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1924
1925 for (unsigned part = 0; size; size -= part)
1926 {
1927 val >>= part * BITS_PER_UNIT;
1928 part = init_frag.size - init_frag.offset;
1929 if (part > size)
1930 part = size;
1931
1932 unsigned HOST_WIDE_INT partial
1933 = val << (init_frag.offset * BITS_PER_UNIT);
1934 init_frag.val |= partial & init_frag.mask;
1935 init_frag.offset += part;
1936
1937 if (init_frag.offset == init_frag.size)
1938 output_init_frag (NULL);
1939 }
1940 }
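/* Worked example (illustrative, not from the original source): with a
   4-byte fragment (init_frag.size == 4, mask == 0xffffffff), emitting
   the single bytes 0x11, 0x22, 0x33, 0x44 in that order accumulates
   init_frag.val as 0x11, 0x2211, 0x332211 and finally 0x44332211; at
   that point offset == size and the fragment is flushed as one .u32
   initializer element by output_init_frag.  */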
1941
1942 /* Target hook for assembling integer object X of size SIZE. */
1943
1944 static bool
1945 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1946 {
1947 HOST_WIDE_INT val = 0;
1948
1949 switch (GET_CODE (x))
1950 {
1951 default:
1952 /* Let the generic machinery figure it out, usually for a
1953 CONST_WIDE_INT. */
1954 return false;
1955
1956 case CONST_INT:
1957 nvptx_assemble_value (INTVAL (x), size);
1958 break;
1959
1960 case CONST:
1961 x = XEXP (x, 0);
1962 gcc_assert (GET_CODE (x) == PLUS);
1963 val = INTVAL (XEXP (x, 1));
1964 x = XEXP (x, 0);
1965 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1966 /* FALLTHROUGH */
1967
1968 case SYMBOL_REF:
1969 gcc_assert (size == init_frag.size);
1970 if (init_frag.offset)
1971 sorry ("cannot emit unaligned pointers in ptx assembly");
1972
1973 nvptx_maybe_record_fnsym (x);
1974 init_frag.val = val;
1975 output_init_frag (x);
1976 break;
1977 }
1978
1979 return true;
1980 }
1981
1982 /* Output SIZE zero bytes. We ignore the FILE argument since the
1983 functions we're calling to perform the output just use
1984 asm_out_file. */
1985
1986 void
1987 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1988 {
1989 /* Finish the current fragment, if it's started. */
1990 if (init_frag.offset)
1991 {
1992 unsigned part = init_frag.size - init_frag.offset;
1993 if (part > size)
1994 part = (unsigned) size;
1995 size -= part;
1996 nvptx_assemble_value (0, part);
1997 }
1998
1999 /* If this skip doesn't terminate the initializer, write as many
2000 remaining pieces as possible directly. */
2001 if (size < init_frag.remaining * init_frag.size)
2002 {
2003 while (size >= init_frag.size)
2004 {
2005 size -= init_frag.size;
2006 output_init_frag (NULL_RTX);
2007 }
2008 if (size)
2009 nvptx_assemble_value (0, size);
2010 }
2011 }
2012
2013 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2014 ignore the FILE arg. */
2015
2016 void
2017 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2018 {
2019 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2020 nvptx_assemble_value (str[i], 1);
2021 }
2022
2023 /* Return true if TYPE is a record type whose last field is an array without a
2024 given dimension. */
2025
2026 static bool
2027 flexible_array_member_type_p (const_tree type)
2028 {
2029 if (TREE_CODE (type) != RECORD_TYPE)
2030 return false;
2031
2032 const_tree last_field = NULL_TREE;
2033 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2034 last_field = f;
2035
2036 if (!last_field)
2037 return false;
2038
2039 const_tree last_field_type = TREE_TYPE (last_field);
2040 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2041 return false;
2042
2043 return (! TYPE_DOMAIN (last_field_type)
2044 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2045 }
2046
2047 /* Emit a PTX variable decl and prepare for emission of its
2048 initializer. NAME is the symbol name and SECTION the PTX data
2049 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2050 The caller has already emitted any indentation and linkage
2051 specifier. It is responsible for any initializer, terminating ;
2052 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2053 this is the opposite way round from how PTX wants them! */
2054
2055 static void
2056 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2057 const_tree type, HOST_WIDE_INT size, unsigned align,
2058 bool undefined = false)
2059 {
2060 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2061 && (TYPE_DOMAIN (type) == NULL_TREE);
2062
2063 if (undefined && flexible_array_member_type_p (type))
2064 {
2065 size = 0;
2066 atype = true;
2067 }
2068
2069 while (TREE_CODE (type) == ARRAY_TYPE)
2070 type = TREE_TYPE (type);
2071
2072 if (TREE_CODE (type) == VECTOR_TYPE
2073 || TREE_CODE (type) == COMPLEX_TYPE)
2074 /* Neither vector nor complex types can contain the other. */
2075 type = TREE_TYPE (type);
2076
2077 unsigned elt_size = int_size_in_bytes (type);
2078
2079 /* Largest mode we're prepared to accept. For BLKmode types we
2080 don't know if it'll contain pointer constants, so have to choose
2081 pointer size, otherwise we can choose DImode. */
2082 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2083
2084 elt_size |= GET_MODE_SIZE (elt_mode);
2085 elt_size &= -elt_size; /* Extract LSB set. */
2086
2087 init_frag.size = elt_size;
2088 /* Avoid undefined shift behavior by using '2'. */
2089 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2090 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2091 init_frag.val = 0;
2092 init_frag.offset = 0;
2093 init_frag.started = false;
2094 /* Size might not be a multiple of elt size, if there's an
2095 initialized trailing struct array with smaller type than
2096 elt_size. */
2097 init_frag.remaining = (size + elt_size - 1) / elt_size;
2098
2099 fprintf (file, "%s .align %d .u%d ",
2100 section, align / BITS_PER_UNIT,
2101 elt_size * BITS_PER_UNIT);
2102 assemble_name (file, name);
2103
2104 if (size)
2105 /* We make everything an array, to simplify any initialization
2106 emission. */
2107 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2108 else if (atype)
2109 fprintf (file, "[]");
2110 }
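/* Illustrative example, not from the original source: for a file-scope
   "int arr[4]" placed in the global data area with 4-byte alignment,
   the code above computes elt_size == 4 and remaining == 4, so the
   emitted declaration begins

       .global .align 4 .u32 arr[4]

   with any initializer appended later as " = { ... };" by
   output_init_frag and nvptx_assemble_decl_end.  The symbol name and
   data area here are just placeholders.  */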
2111
2112 /* Called when the initializer for a decl has been completely output through
2113 combinations of the three functions above. */
2114
2115 static void
2116 nvptx_assemble_decl_end (void)
2117 {
2118 if (init_frag.offset)
2119 /* This can happen with a packed struct with trailing array member. */
2120 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2121 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2122 }
2123
2124 /* Output an uninitialized common or file-scope variable. */
2125
2126 void
2127 nvptx_output_aligned_decl (FILE *file, const char *name,
2128 const_tree decl, HOST_WIDE_INT size, unsigned align)
2129 {
2130 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2131
2132 /* If this is public, it is common. The nearest thing we have to
2133 common is weak. */
2134 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2135
2136 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2137 TREE_TYPE (decl), size, align);
2138 nvptx_assemble_decl_end ();
2139 }
2140
2141 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2142 writing a constant variable EXP with NAME and SIZE and its
2143 initializer to FILE. */
2144
2145 static void
2146 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2147 const_tree exp, HOST_WIDE_INT obj_size)
2148 {
2149 write_var_marker (file, true, false, name);
2150
2151 fprintf (file, "\t");
2152
2153 tree type = TREE_TYPE (exp);
2154 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2155 TYPE_ALIGN (type));
2156 }
2157
2158 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2159 a variable DECL with NAME to FILE. */
2160
2161 void
2162 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2163 {
2164 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2165
2166 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2167 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2168
2169 tree type = TREE_TYPE (decl);
2170 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2171 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2172 type, obj_size, DECL_ALIGN (decl));
2173 }
2174
2175 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2176
2177 static void
2178 nvptx_globalize_label (FILE *, const char *)
2179 {
2180 }
2181
2182 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2183 declaration only for variable DECL with NAME to FILE. */
2184
2185 static void
2186 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2187 {
2188 /* The middle end can place constant pool decls into the varpool as
2189 undefined. Until that is fixed, catch the problem here. */
2190 if (DECL_IN_CONSTANT_POOL (decl))
2191 return;
2192
2193 /* We support weak definitions, and hence have the right
2194 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2195 if (DECL_WEAK (decl))
2196 error_at (DECL_SOURCE_LOCATION (decl),
2197 "PTX does not support weak declarations"
2198 " (only weak definitions)");
2199 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2200
2201 fprintf (file, "\t.extern ");
2202 tree size = DECL_SIZE_UNIT (decl);
2203 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2204 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2205 DECL_ALIGN (decl), true);
2206 nvptx_assemble_decl_end ();
2207 }
2208
2209 /* Output a pattern for a move instruction. */
2210
2211 const char *
2212 nvptx_output_mov_insn (rtx dst, rtx src)
2213 {
2214 machine_mode dst_mode = GET_MODE (dst);
2215 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2216 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2217 machine_mode src_inner = (GET_CODE (src) == SUBREG
2218 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2219
2220 rtx sym = src;
2221 if (GET_CODE (sym) == CONST)
2222 sym = XEXP (XEXP (sym, 0), 0);
2223 if (SYMBOL_REF_P (sym))
2224 {
2225 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2226 return "%.\tcvta%D1%t0\t%0, %1;";
2227 nvptx_maybe_record_fnsym (sym);
2228 }
2229
2230 if (src_inner == dst_inner)
2231 return "%.\tmov%t0\t%0, %1;";
2232
2233 if (CONSTANT_P (src))
2234 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2235 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2236 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2237
2238 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2239 {
2240 if (GET_MODE_BITSIZE (dst_mode) == 128
2241 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2242 {
2243 /* mov.b128 is not supported. */
2244 if (dst_inner == V2DImode && src_inner == TImode)
2245 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2246 else if (dst_inner == TImode && src_inner == V2DImode)
2247 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2248
2249 gcc_unreachable ();
2250 }
2251 return "%.\tmov.b%T0\t%0, %1;";
2252 }
2253
2254 return "%.\tcvt%t0%t1\t%0, %1;";
2255 }
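/* Illustrative note, not from the original source: a plain SImode
   register-to-register move takes the first template and expands to
   something like "mov.u32 %r24, %r23;", while a widening move between
   differently sized integer registers falls through to the final
   template and becomes e.g. "cvt.u64.u32 %r25, %r23;".  The register
   names are invented; they only show the shape of the output.  */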
2256
2257 static void nvptx_print_operand (FILE *, rtx, int);
2258
2259 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2260 involves writing .param declarations and in/out copies into them. For
2261 indirect calls, also write the .callprototype. */
2262
2263 const char *
2264 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2265 {
2266 char buf[16];
2267 static int labelno;
2268 bool needs_tgt = register_operand (callee, Pmode);
2269 rtx pat = PATTERN (insn);
2270 if (GET_CODE (pat) == COND_EXEC)
2271 pat = COND_EXEC_CODE (pat);
2272 int arg_end = XVECLEN (pat, 0);
2273 tree decl = NULL_TREE;
2274
2275 fprintf (asm_out_file, "\t{\n");
2276 if (result != NULL)
2277 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2278 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2279 reg_names[NVPTX_RETURN_REGNUM]);
2280
2281 /* Ensure we have a ptx declaration in the output if necessary. */
2282 if (GET_CODE (callee) == SYMBOL_REF)
2283 {
2284 decl = SYMBOL_REF_DECL (callee);
2285 if (!decl
2286 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2287 nvptx_record_libfunc (callee, result, pat);
2288 else if (DECL_EXTERNAL (decl))
2289 nvptx_record_fndecl (decl);
2290 }
2291
2292 if (needs_tgt)
2293 {
2294 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2295 labelno++;
2296 ASM_OUTPUT_LABEL (asm_out_file, buf);
2297 std::stringstream s;
2298 write_fn_proto_from_insn (s, NULL, result, pat);
2299 fputs (s.str().c_str(), asm_out_file);
2300 }
2301
2302 for (int argno = 1; argno < arg_end; argno++)
2303 {
2304 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2305 machine_mode mode = GET_MODE (t);
2306 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2307
2308 /* Mode splitting has already been done. */
2309 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2310 "\t\tst.param%s [%%out_arg%d], ",
2311 ptx_type, argno, ptx_type, argno);
2312 output_reg (asm_out_file, REGNO (t), VOIDmode);
2313 fprintf (asm_out_file, ";\n");
2314 }
2315
2316 /* The '.' stands for the call's predicate, if any. */
2317 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2318 fprintf (asm_out_file, "\t\tcall ");
2319 if (result != NULL_RTX)
2320 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2321
2322 if (decl)
2323 {
2324 const char *name = get_fnname_from_decl (decl);
2325 name = nvptx_name_replacement (name);
2326 assemble_name (asm_out_file, name);
2327 }
2328 else
2329 output_address (VOIDmode, callee);
2330
2331 const char *open = "(";
2332 for (int argno = 1; argno < arg_end; argno++)
2333 {
2334 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2335 open = "";
2336 }
2337 if (decl && DECL_STATIC_CHAIN (decl))
2338 {
2339 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2340 open = "";
2341 }
2342 if (!open[0])
2343 fprintf (asm_out_file, ")");
2344
2345 if (needs_tgt)
2346 {
2347 fprintf (asm_out_file, ", ");
2348 assemble_name (asm_out_file, buf);
2349 }
2350 fprintf (asm_out_file, ";\n");
2351
2352 if (find_reg_note (insn, REG_NORETURN, NULL))
2353 {
2354 /* No-return functions confuse the PTX JIT, as it doesn't realize
2355 the flow control barrier they imply. It can seg fault if it
2356 encounters what looks like an unexitable loop. Emit a trailing
2357 trap and exit, which it does grok. */
2358 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2359 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2360 }
2361
2362 if (result)
2363 {
2364 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2365
2366 if (!rval[0])
2367 /* We must escape the '%' that starts RETURN_REGNUM. */
2368 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2369 reg_names[NVPTX_RETURN_REGNUM]);
2370 return rval;
2371 }
2372
2373 return "}";
2374 }
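/* Illustrative sketch, not from the original source, of the text this
   produces for a direct call to "int foo (int)" -- the register and
   parameter names below are invented placeholders:

       {
               .param.u32 %value_in;
               .param.u32 %out_arg1;
               st.param.u32 [%out_arg1], %r23;
               call (%value_in), foo, (%out_arg1);
               ld.param.u32 %r24, [%value_in];
       }

   Indirect calls additionally emit a local ".callprototype"
   declaration and pass its label as the final call operand.  */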
2375
2376 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2377
2378 static bool
2379 nvptx_print_operand_punct_valid_p (unsigned char c)
2380 {
2381 return c == '.' || c == '#';
2382 }
2383
2384 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2385
2386 static void
2387 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2388 {
2389 rtx off;
2390 if (GET_CODE (x) == CONST)
2391 x = XEXP (x, 0);
2392 switch (GET_CODE (x))
2393 {
2394 case PLUS:
2395 off = XEXP (x, 1);
2396 output_address (VOIDmode, XEXP (x, 0));
2397 fprintf (file, "+");
2398 output_address (VOIDmode, off);
2399 break;
2400
2401 case SYMBOL_REF:
2402 case LABEL_REF:
2403 output_addr_const (file, x);
2404 break;
2405
2406 default:
2407 gcc_assert (GET_CODE (x) != MEM);
2408 nvptx_print_operand (file, x, 0);
2409 break;
2410 }
2411 }
2412
2413 /* Write assembly language output for the address ADDR to FILE. */
2414
2415 static void
2416 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2417 {
2418 nvptx_print_address_operand (file, addr, mode);
2419 }
2420
2421 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2422
2423 Meaning of CODE:
2424 . -- print the predicate for the instruction or an empty string for an
2425 unconditional one.
2426 # -- print a rounding mode for the instruction
2427
2428 A -- print a data area for a MEM
2429 c -- print an opcode suffix for a comparison operator, including a type code
2430 D -- print a data area for a MEM operand
2431 S -- print a shuffle kind specified by CONST_INT
2432 t -- print a type opcode suffix, promoting QImode to 32 bits
2433 T -- print a type size in bits
2434 u -- print a type opcode suffix without promotions. */
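/* Illustrative examples, not from the original source: in an output
   template such as "%.\tadd%t0\t%0, %1, %2;", "%." expands to the
   insn's predicate (e.g. "@%r10" or "@!%r10") or to nothing for an
   unconditional insn, "%t0" on an SImode operand prints ".u32" (QImode
   operands are promoted to 32 bits as well), "%u0" prints the
   unpromoted suffix, and "%T0" prints the bit size, e.g. "32".  */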
2435
2436 static void
2437 nvptx_print_operand (FILE *file, rtx x, int code)
2438 {
2439 if (code == '.')
2440 {
2441 x = current_insn_predicate;
2442 if (x)
2443 {
2444 fputs ("@", file);
2445 if (GET_CODE (x) == EQ)
2446 fputs ("!", file);
2447 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2448 }
2449 return;
2450 }
2451 else if (code == '#')
2452 {
2453 fputs (".rn", file);
2454 return;
2455 }
2456
2457 enum rtx_code x_code = GET_CODE (x);
2458 machine_mode mode = GET_MODE (x);
2459
2460 switch (code)
2461 {
2462 case 'A':
2463 x = XEXP (x, 0);
2464 /* FALLTHROUGH. */
2465
2466 case 'D':
2467 if (GET_CODE (x) == CONST)
2468 x = XEXP (x, 0);
2469 if (GET_CODE (x) == PLUS)
2470 x = XEXP (x, 0);
2471
2472 if (GET_CODE (x) == SYMBOL_REF)
2473 fputs (section_for_sym (x), file);
2474 break;
2475
2476 case 't':
2477 case 'u':
2478 if (x_code == SUBREG)
2479 {
2480 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2481 if (VECTOR_MODE_P (inner_mode)
2482 && (GET_MODE_SIZE (mode)
2483 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2484 mode = GET_MODE_INNER (inner_mode);
2485 else if (split_mode_p (inner_mode))
2486 mode = maybe_split_mode (inner_mode);
2487 else
2488 mode = inner_mode;
2489 }
2490 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2491 break;
2492
2493 case 'H':
2494 case 'L':
2495 {
2496 rtx inner_x = SUBREG_REG (x);
2497 machine_mode inner_mode = GET_MODE (inner_x);
2498 machine_mode split = maybe_split_mode (inner_mode);
2499
2500 output_reg (file, REGNO (inner_x), split,
2501 (code == 'H'
2502 ? GET_MODE_SIZE (inner_mode) / 2
2503 : 0));
2504 }
2505 break;
2506
2507 case 'S':
2508 {
2509 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2510 /* Same order as nvptx_shuffle_kind. */
2511 static const char *const kinds[] =
2512 {".up", ".down", ".bfly", ".idx"};
2513 fputs (kinds[kind], file);
2514 }
2515 break;
2516
2517 case 'T':
2518 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2519 break;
2520
2521 case 'j':
2522 fprintf (file, "@");
2523 goto common;
2524
2525 case 'J':
2526 fprintf (file, "@!");
2527 goto common;
2528
2529 case 'c':
2530 mode = GET_MODE (XEXP (x, 0));
2531 switch (x_code)
2532 {
2533 case EQ:
2534 fputs (".eq", file);
2535 break;
2536 case NE:
2537 if (FLOAT_MODE_P (mode))
2538 fputs (".neu", file);
2539 else
2540 fputs (".ne", file);
2541 break;
2542 case LE:
2543 case LEU:
2544 fputs (".le", file);
2545 break;
2546 case GE:
2547 case GEU:
2548 fputs (".ge", file);
2549 break;
2550 case LT:
2551 case LTU:
2552 fputs (".lt", file);
2553 break;
2554 case GT:
2555 case GTU:
2556 fputs (".gt", file);
2557 break;
2558 case LTGT:
2559 fputs (".ne", file);
2560 break;
2561 case UNEQ:
2562 fputs (".equ", file);
2563 break;
2564 case UNLE:
2565 fputs (".leu", file);
2566 break;
2567 case UNGE:
2568 fputs (".geu", file);
2569 break;
2570 case UNLT:
2571 fputs (".ltu", file);
2572 break;
2573 case UNGT:
2574 fputs (".gtu", file);
2575 break;
2576 case UNORDERED:
2577 fputs (".nan", file);
2578 break;
2579 case ORDERED:
2580 fputs (".num", file);
2581 break;
2582 default:
2583 gcc_unreachable ();
2584 }
2585 if (FLOAT_MODE_P (mode)
2586 || x_code == EQ || x_code == NE
2587 || x_code == GEU || x_code == GTU
2588 || x_code == LEU || x_code == LTU)
2589 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2590 else
2591 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2592 break;
2593 default:
2594 common:
2595 switch (x_code)
2596 {
2597 case SUBREG:
2598 {
2599 rtx inner_x = SUBREG_REG (x);
2600 machine_mode inner_mode = GET_MODE (inner_x);
2601 machine_mode split = maybe_split_mode (inner_mode);
2602
2603 if (VECTOR_MODE_P (inner_mode)
2604 && (GET_MODE_SIZE (mode)
2605 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2606 {
2607 output_reg (file, REGNO (inner_x), VOIDmode);
2608 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2609 }
2610 else if (split_mode_p (inner_mode)
2611 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2612 output_reg (file, REGNO (inner_x), split);
2613 else
2614 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2615 }
2616 break;
2617
2618 case REG:
2619 output_reg (file, REGNO (x), maybe_split_mode (mode));
2620 break;
2621
2622 case MEM:
2623 fputc ('[', file);
2624 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2625 fputc (']', file);
2626 break;
2627
2628 case CONST_INT:
2629 output_addr_const (file, x);
2630 break;
2631
2632 case CONST:
2633 case SYMBOL_REF:
2634 case LABEL_REF:
2635 /* We could use output_addr_const, but that can print things like
2636 "x-8", which breaks ptxas. Need to ensure it is output as
2637 "x+-8". */
2638 nvptx_print_address_operand (file, x, VOIDmode);
2639 break;
2640
2641 case CONST_DOUBLE:
2642 long vals[2];
2643 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2644 vals[0] &= 0xffffffff;
2645 vals[1] &= 0xffffffff;
2646 if (mode == SFmode)
2647 fprintf (file, "0f%08lx", vals[0]);
2648 else
2649 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2650 break;
2651
2652 case CONST_VECTOR:
2653 {
2654 unsigned n = CONST_VECTOR_NUNITS (x);
2655 fprintf (file, "{ ");
2656 for (unsigned i = 0; i < n; ++i)
2657 {
2658 if (i != 0)
2659 fprintf (file, ", ");
2660
2661 rtx elem = CONST_VECTOR_ELT (x, i);
2662 output_addr_const (file, elem);
2663 }
2664 fprintf (file, " }");
2665 }
2666 break;
2667
2668 default:
2669 output_addr_const (file, x);
2670 }
2671 }
2672 }
2673 \f
2674 /* Record replacement regs used to deal with subreg operands. */
2675 struct reg_replace
2676 {
2677 rtx replacement[MAX_RECOG_OPERANDS];
2678 machine_mode mode;
2679 int n_allocated;
2680 int n_in_use;
2681 };
2682
2683 /* Allocate or reuse a replacement in R and return the rtx. */
2684
2685 static rtx
2686 get_replacement (struct reg_replace *r)
2687 {
2688 if (r->n_allocated == r->n_in_use)
2689 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2690 return r->replacement[r->n_in_use++];
2691 }
2692
2693 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2694 the presence of subregs would break the rules for most instructions.
2695 Replace them with a suitable new register of the right size, plus
2696 conversion copyin/copyout instructions. */
2697
2698 static void
2699 nvptx_reorg_subreg (void)
2700 {
2701 struct reg_replace qiregs, hiregs, siregs, diregs;
2702 rtx_insn *insn, *next;
2703
2704 qiregs.n_allocated = 0;
2705 hiregs.n_allocated = 0;
2706 siregs.n_allocated = 0;
2707 diregs.n_allocated = 0;
2708 qiregs.mode = QImode;
2709 hiregs.mode = HImode;
2710 siregs.mode = SImode;
2711 diregs.mode = DImode;
2712
2713 for (insn = get_insns (); insn; insn = next)
2714 {
2715 next = NEXT_INSN (insn);
2716 if (!NONDEBUG_INSN_P (insn)
2717 || asm_noperands (PATTERN (insn)) >= 0
2718 || GET_CODE (PATTERN (insn)) == USE
2719 || GET_CODE (PATTERN (insn)) == CLOBBER)
2720 continue;
2721
2722 qiregs.n_in_use = 0;
2723 hiregs.n_in_use = 0;
2724 siregs.n_in_use = 0;
2725 diregs.n_in_use = 0;
2726 extract_insn (insn);
2727 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2728
2729 for (int i = 0; i < recog_data.n_operands; i++)
2730 {
2731 rtx op = recog_data.operand[i];
2732 if (GET_CODE (op) != SUBREG)
2733 continue;
2734
2735 rtx inner = SUBREG_REG (op);
2736
2737 machine_mode outer_mode = GET_MODE (op);
2738 machine_mode inner_mode = GET_MODE (inner);
2739 gcc_assert (s_ok);
2740 if (s_ok
2741 && (GET_MODE_PRECISION (inner_mode)
2742 >= GET_MODE_PRECISION (outer_mode)))
2743 continue;
2744 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2745 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2746 : outer_mode == HImode ? &hiregs
2747 : outer_mode == SImode ? &siregs
2748 : &diregs);
2749 rtx new_reg = get_replacement (r);
2750
2751 if (recog_data.operand_type[i] != OP_OUT)
2752 {
2753 enum rtx_code code;
2754 if (GET_MODE_PRECISION (inner_mode)
2755 < GET_MODE_PRECISION (outer_mode))
2756 code = ZERO_EXTEND;
2757 else
2758 code = TRUNCATE;
2759
2760 rtx pat = gen_rtx_SET (new_reg,
2761 gen_rtx_fmt_e (code, outer_mode, inner));
2762 emit_insn_before (pat, insn);
2763 }
2764
2765 if (recog_data.operand_type[i] != OP_IN)
2766 {
2767 enum rtx_code code;
2768 if (GET_MODE_PRECISION (inner_mode)
2769 < GET_MODE_PRECISION (outer_mode))
2770 code = TRUNCATE;
2771 else
2772 code = ZERO_EXTEND;
2773
2774 rtx pat = gen_rtx_SET (inner,
2775 gen_rtx_fmt_e (code, inner_mode, new_reg));
2776 emit_insn_after (pat, insn);
2777 }
2778 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2779 }
2780 }
2781 }
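/* Illustrative example, not from the original source: with the loop
   above, an input operand such as (subreg:SI (reg:QI 23) 0) -- a
   paradoxical subreg whose inner register is narrower than the operand
   mode -- is rewritten to use a fresh SImode register, with

       (set (reg:SI new) (zero_extend:SI (reg:QI 23)))

   emitted before the insn; for an output operand the mirror-image
   truncate copy back into the inner register is emitted after the
   insn.  The register numbers are invented.  */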
2782
2783 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2784 first use. */
2785
2786 static rtx
2787 nvptx_get_unisimt_master ()
2788 {
2789 rtx &master = cfun->machine->unisimt_master;
2790 return master ? master : master = gen_reg_rtx (SImode);
2791 }
2792
2793 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2794
2795 static rtx
2796 nvptx_get_unisimt_predicate ()
2797 {
2798 rtx &pred = cfun->machine->unisimt_predicate;
2799 return pred ? pred : pred = gen_reg_rtx (BImode);
2800 }
2801
2802 /* Return true if given call insn references one of the functions provided by
2803 the CUDA runtime: malloc, free, vprintf. */
2804
2805 static bool
2806 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2807 {
2808 rtx pat = PATTERN (insn);
2809 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2810 pat = XVECEXP (pat, 0, 0);
2811 if (GET_CODE (pat) == SET)
2812 pat = SET_SRC (pat);
2813 gcc_checking_assert (GET_CODE (pat) == CALL
2814 && GET_CODE (XEXP (pat, 0)) == MEM);
2815 rtx addr = XEXP (XEXP (pat, 0), 0);
2816 if (GET_CODE (addr) != SYMBOL_REF)
2817 return false;
2818 const char *name = XSTR (addr, 0);
2819 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2820 references with forced assembler name refer to PTX syscalls. For vprintf,
2821 accept both normal and forced-assembler-name references. */
2822 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2823 || !strcmp (name, "*malloc")
2824 || !strcmp (name, "*free"));
2825 }
2826
2827 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2828 propagate its value from lane MASTER to current lane. */
2829
2830 static void
2831 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2832 {
2833 rtx reg;
2834 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2835 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2836 }
2837
2838 /* Adjust code for uniform-simt code generation variant by making atomics and
2839 "syscalls" conditionally executed, and inserting shuffle-based propagation
2840 for registers being set. */
2841
2842 static void
2843 nvptx_reorg_uniform_simt ()
2844 {
2845 rtx_insn *insn, *next;
2846
2847 for (insn = get_insns (); insn; insn = next)
2848 {
2849 next = NEXT_INSN (insn);
2850 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2851 && !(NONJUMP_INSN_P (insn)
2852 && GET_CODE (PATTERN (insn)) == PARALLEL
2853 && get_attr_atomic (insn)))
2854 continue;
2855 rtx pat = PATTERN (insn);
2856 rtx master = nvptx_get_unisimt_master ();
2857 for (int i = 0; i < XVECLEN (pat, 0); i++)
2858 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2859 rtx pred = nvptx_get_unisimt_predicate ();
2860 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2861 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2862 validate_change (insn, &PATTERN (insn), pat, false);
2863 }
2864 }
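/* Illustrative example, not from the original source: in the
   uniform-simt variant an atomic insn whose pattern contains

       (set (reg:SI 30) (...atomic operation...))

   is rewritten by the loop above into a COND_EXEC guarded by the
   uniform-simt predicate, followed by a SHUFFLE_IDX insn copying
   reg 30 from the master lane to the other lanes -- so only one lane
   performs the side effect while every lane observes its result.  The
   register number is invented.  */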
2865
2866 /* Loop structure of the function. The entire function is described as
2867 a NULL loop. */
2868
2869 struct parallel
2870 {
2871 /* Parent parallel. */
2872 parallel *parent;
2873
2874 /* Next sibling parallel. */
2875 parallel *next;
2876
2877 /* First child parallel. */
2878 parallel *inner;
2879
2880 /* Partitioning mask of the parallel. */
2881 unsigned mask;
2882
2883 /* Partitioning used within inner parallels. */
2884 unsigned inner_mask;
2885
2886 /* Location of parallel forked and join. The forked is the first
2887 block in the parallel and the join is the first block after
2888 the partition. */
2889 basic_block forked_block;
2890 basic_block join_block;
2891
2892 rtx_insn *forked_insn;
2893 rtx_insn *join_insn;
2894
2895 rtx_insn *fork_insn;
2896 rtx_insn *joining_insn;
2897
2898 /* Basic blocks in this parallel, but not in child parallels. The
2899 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2900 blocks are not. */
2901 auto_vec<basic_block> blocks;
2902
2903 public:
2904 parallel (parallel *parent, unsigned mode);
2905 ~parallel ();
2906 };
2907
2908 /* Constructor links the new parallel into its parent's chain of
2909 children. */
2910
2911 parallel::parallel (parallel *parent_, unsigned mask_)
2912 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2913 {
2914 forked_block = join_block = 0;
2915 forked_insn = join_insn = 0;
2916 fork_insn = joining_insn = 0;
2917
2918 if (parent)
2919 {
2920 next = parent->inner;
2921 parent->inner = this;
2922 }
2923 }
2924
2925 parallel::~parallel ()
2926 {
2927 delete inner;
2928 delete next;
2929 }
2930
2931 /* Map of basic blocks to insns */
2932 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2933
2934 /* A tuple of an insn of interest and the BB in which it resides. */
2935 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2936 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2937
2938 /* Split basic blocks such that the forked and join unspecs are at
2939 the start of their basic blocks. Thus afterwards each block will
2940 have a single partitioning mode. We also do the same for return
2941 insns, as they are executed by every thread. Populate MAP with
2942 head and tail blocks. We also clear the BB visited flag,
2943 which is
2944 used when finding partitions. */
2945
2946 static void
2947 nvptx_split_blocks (bb_insn_map_t *map)
2948 {
2949 insn_bb_vec_t worklist;
2950 basic_block block;
2951 rtx_insn *insn;
2952
2953 /* Locate all the reorg instructions of interest. */
2954 FOR_ALL_BB_FN (block, cfun)
2955 {
2956 bool seen_insn = false;
2957
2958 /* Clear visited flag, for use by parallel locator */
2959 block->flags &= ~BB_VISITED;
2960
2961 FOR_BB_INSNS (block, insn)
2962 {
2963 if (!INSN_P (insn))
2964 continue;
2965 switch (recog_memoized (insn))
2966 {
2967 default:
2968 seen_insn = true;
2969 continue;
2970 case CODE_FOR_nvptx_forked:
2971 case CODE_FOR_nvptx_join:
2972 break;
2973
2974 case CODE_FOR_return:
2975 /* We also need to split just before return insns, as
2976 that insn needs executing by all threads, but the
2977 block it is in probably does not. */
2978 break;
2979 }
2980
2981 if (seen_insn)
2982 /* We've found an instruction that must be at the start of
2983 a block, but isn't. Add it to the worklist. */
2984 worklist.safe_push (insn_bb_t (insn, block));
2985 else
2986 /* It was already the first instruction. Just add it to
2987 the map. */
2988 map->get_or_insert (block) = insn;
2989 seen_insn = true;
2990 }
2991 }
2992
2993 /* Split blocks on the worklist. */
2994 unsigned ix;
2995 insn_bb_t *elt;
2996 basic_block remap = 0;
2997 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2998 {
2999 if (remap != elt->second)
3000 {
3001 block = elt->second;
3002 remap = block;
3003 }
3004
3005 /* Split block before insn. The insn is in the new block */
3006 edge e = split_block (block, PREV_INSN (elt->first));
3007
3008 block = e->dest;
3009 map->get_or_insert (block) = elt->first;
3010 }
3011 }
3012
3013 /* BLOCK is a basic block containing a head or tail instruction.
3014 Locate the associated prehead or pretail instruction, which must be
3015 in the single predecessor block. */
3016
3017 static rtx_insn *
3018 nvptx_discover_pre (basic_block block, int expected)
3019 {
3020 gcc_assert (block->preds->length () == 1);
3021 basic_block pre_block = (*block->preds)[0]->src;
3022 rtx_insn *pre_insn;
3023
3024 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3025 pre_insn = PREV_INSN (pre_insn))
3026 gcc_assert (pre_insn != BB_HEAD (pre_block));
3027
3028 gcc_assert (recog_memoized (pre_insn) == expected);
3029 return pre_insn;
3030 }
3031
3032 /* Dump this parallel and all its inner parallels. */
3033
3034 static void
3035 nvptx_dump_pars (parallel *par, unsigned depth)
3036 {
3037 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3038 depth, par->mask,
3039 par->forked_block ? par->forked_block->index : -1,
3040 par->join_block ? par->join_block->index : -1);
3041
3042 fprintf (dump_file, " blocks:");
3043
3044 basic_block block;
3045 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3046 fprintf (dump_file, " %d", block->index);
3047 fprintf (dump_file, "\n");
3048 if (par->inner)
3049 nvptx_dump_pars (par->inner, depth + 1);
3050
3051 if (par->next)
3052 nvptx_dump_pars (par->next, depth);
3053 }
3054
3055 /* If BLOCK contains a fork/join marker, process it to create or
3056 terminate a loop structure. Add this block to the current loop,
3057 and then walk successor blocks. */
3058
3059 static parallel *
3060 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3061 {
3062 if (block->flags & BB_VISITED)
3063 return par;
3064 block->flags |= BB_VISITED;
3065
3066 if (rtx_insn **endp = map->get (block))
3067 {
3068 rtx_insn *end = *endp;
3069
3070 /* This is a block head or tail, or return instruction. */
3071 switch (recog_memoized (end))
3072 {
3073 case CODE_FOR_return:
3074 /* Return instructions are in their own block, and we
3075 don't need to do anything more. */
3076 return par;
3077
3078 case CODE_FOR_nvptx_forked:
3079 /* Loop head, create a new inner loop and add it into
3080 our parent's child list. */
3081 {
3082 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3083
3084 gcc_assert (mask);
3085 par = new parallel (par, mask);
3086 par->forked_block = block;
3087 par->forked_insn = end;
3088 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3089 par->fork_insn
3090 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3091 }
3092 break;
3093
3094 case CODE_FOR_nvptx_join:
3095 /* A loop tail. Finish the current loop and return to
3096 parent. */
3097 {
3098 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3099
3100 gcc_assert (par->mask == mask);
3101 par->join_block = block;
3102 par->join_insn = end;
3103 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3104 par->joining_insn
3105 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3106 par = par->parent;
3107 }
3108 break;
3109
3110 default:
3111 gcc_unreachable ();
3112 }
3113 }
3114
3115 if (par)
3116 /* Add this block onto the current loop's list of blocks. */
3117 par->blocks.safe_push (block);
3118 else
3119 /* This must be the entry block. Create a NULL parallel. */
3120 par = new parallel (0, 0);
3121
3122 /* Walk successor blocks. */
3123 edge e;
3124 edge_iterator ei;
3125
3126 FOR_EACH_EDGE (e, ei, block->succs)
3127 nvptx_find_par (map, par, e->dest);
3128
3129 return par;
3130 }
3131
3132 /* DFS walk the CFG looking for fork & join markers. Construct
3133 loop structures as we go. MAP is a mapping of basic blocks
3134 to head & tail markers, discovered when splitting blocks. This
3135 speeds up the discovery. We rely on the BB visited flag having
3136 been cleared when splitting blocks. */
3137
3138 static parallel *
3139 nvptx_discover_pars (bb_insn_map_t *map)
3140 {
3141 basic_block block;
3142
3143 /* Mark exit blocks as visited. */
3144 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3145 block->flags |= BB_VISITED;
3146
3147 /* And entry block as not. */
3148 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3149 block->flags &= ~BB_VISITED;
3150
3151 parallel *par = nvptx_find_par (map, 0, block);
3152
3153 if (dump_file)
3154 {
3155 fprintf (dump_file, "\nLoops\n");
3156 nvptx_dump_pars (par, 0);
3157 fprintf (dump_file, "\n");
3158 }
3159
3160 return par;
3161 }
3162
3163 /* Analyse a group of BBs within a partitioned region and create N
3164 Single-Entry-Single-Exit regions. Some of those regions will be
3165 trivial ones consisting of a single BB. The blocks of a
3166 partitioned region might form a set of disjoint graphs -- because
3167 the region encloses a differently partitioned sub-region.
3168
3169 We use the linear time algorithm described in 'Finding Regions Fast:
3170 Single Entry Single Exit and control Regions in Linear Time'
3171 Johnson, Pearson & Pingali. That algorithm deals with complete
3172 CFGs, where a back edge is inserted from END to START, and thus the
3173 problem becomes one of finding equivalent loops.
3174
3175 In this case we have a partial CFG. We complete it by redirecting
3176 any incoming edge to the graph to be from an arbitrary external BB,
3177 and similarly redirecting any outgoing edge to be to that BB.
3178 Thus we end up with a closed graph.
3179
3180 The algorithm works by building a spanning tree of an undirected
3181 graph and keeping track of back edges from nodes further from the
3182 root in the tree to nodes nearer to the root in the tree. In the
3183 description below, the root is up and the tree grows downwards.
3184
3185 We avoid having to deal with degenerate back-edges to the same
3186 block, by splitting each BB into 3 -- one for input edges, one for
3187 the node itself and one for the output edges. Such back edges are
3188 referred to as 'Brackets'. Cycle equivalent nodes will have the
3189 same set of brackets.
3190
3191 Determining bracket equivalency is done by maintaining a list of
3192 brackets in such a manner that the list length and final bracket
3193 uniquely identify the set.
3194
3195 We use coloring to mark all BBs with cycle equivalency with the
3196 same color. This is the output of the 'Finding Regions Fast'
3197 algorithm. Notice it doesn't actually find the set of nodes within
3198 a particular region, just unordered sets of nodes that are the
3199 entries and exits of SESE regions.
3200
3201 After determining cycle equivalency, we need to find the minimal
3202 set of SESE regions. Do this with a DFS coloring walk of the
3203 complete graph. We're either 'looking' or 'coloring'. When
3204 looking, and we're in the subgraph, we start coloring the color of
3205 the current node, and remember that node as the start of the
3206 current color's SESE region. Every time we go to a new node, we
3207 decrement the count of nodes with that color. If it reaches zero,
3208 we remember that node as the end of the current color's SESE region
3209 and return to 'looking'. Otherwise we color the node the current
3210 color.
3211
3212 This way we end up with coloring the inside of non-trivial SESE
3213 regions with the color of that region. */
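/* Illustrative example, not from the original source: in a diamond CFG
   A -> {B, C} -> D, completed with a back edge from the exit to the
   entry, every cycle through A also passes through D, so A and D
   receive the same color and delimit one non-trivial SESE region
   {A..D}, while B and C each end up in a color class of their own and
   form trivial single-block regions.  */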
3214
3215 /* A pair of BBs. We use this to represent SESE regions. */
3216 typedef std::pair<basic_block, basic_block> bb_pair_t;
3217 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3218
3219 /* A node in the undirected CFG. The discriminator SECOND indicates just
3220 above or just below the BB indicated by FIRST. */
3221 typedef std::pair<basic_block, int> pseudo_node_t;
3222
3223 /* A bracket indicates an edge towards the root of the spanning tree of the
3224 undirected graph. Each bracket has a color, determined
3225 from the current set of brackets. */
3226 struct bracket
3227 {
3228 pseudo_node_t back; /* Back target */
3229
3230 /* Current color and size of set. */
3231 unsigned color;
3232 unsigned size;
3233
3234 bracket (pseudo_node_t back_)
3235 : back (back_), color (~0u), size (~0u)
3236 {
3237 }
3238
3239 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3240 {
3241 if (length != size)
3242 {
3243 size = length;
3244 color = color_counts.length ();
3245 color_counts.quick_push (0);
3246 }
3247 color_counts[color]++;
3248 return color;
3249 }
3250 };
3251
3252 typedef auto_vec<bracket> bracket_vec_t;
3253
3254 /* Basic block info for finding SESE regions. */
3255
3256 struct bb_sese
3257 {
3258 int node; /* Node number in spanning tree. */
3259 int parent; /* Parent node number. */
3260
3261 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3262 edges arrive at pseudo-node Ai and the outgoing edges leave at
3263 pseudo-node Ao. We have to remember which way we arrived at a
3264 particular node when generating the spanning tree. dir > 0 means
3265 we arrived at Ai, dir < 0 means we arrived at Ao. */
3266 int dir;
3267
3268 /* Lowest numbered pseudo-node reached via a backedge from this
3269 node, or any descendant. */
3270 pseudo_node_t high;
3271
3272 int color; /* Cycle-equivalence color */
3273
3274 /* Stack of brackets for this node. */
3275 bracket_vec_t brackets;
3276
3277 bb_sese (unsigned node_, unsigned p, int dir_)
3278 :node (node_), parent (p), dir (dir_)
3279 {
3280 }
3281 ~bb_sese ();
3282
3283 /* Push a bracket ending at BACK. */
3284 void push (const pseudo_node_t &back)
3285 {
3286 if (dump_file)
3287 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3288 back.first ? back.first->index : 0, back.second);
3289 brackets.safe_push (bracket (back));
3290 }
3291
3292 void append (bb_sese *child);
3293 void remove (const pseudo_node_t &);
3294
3295 /* Set node's color. */
3296 void set_color (auto_vec<unsigned> &color_counts)
3297 {
3298 color = brackets.last ().get_color (color_counts, brackets.length ());
3299 }
3300 };
3301
3302 bb_sese::~bb_sese ()
3303 {
3304 }
3305
3306 /* Destructively append CHILD's brackets. */
3307
3308 void
3309 bb_sese::append (bb_sese *child)
3310 {
3311 if (int len = child->brackets.length ())
3312 {
3313 int ix;
3314
3315 if (dump_file)
3316 {
3317 for (ix = 0; ix < len; ix++)
3318 {
3319 const pseudo_node_t &pseudo = child->brackets[ix].back;
3320 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3321 child->node, pseudo.first ? pseudo.first->index : 0,
3322 pseudo.second);
3323 }
3324 }
3325 if (!brackets.length ())
3326 std::swap (brackets, child->brackets);
3327 else
3328 {
3329 brackets.reserve (len);
3330 for (ix = 0; ix < len; ix++)
3331 brackets.quick_push (child->brackets[ix]);
3332 }
3333 }
3334 }
3335
3336 /* Remove brackets that terminate at PSEUDO. */
3337
3338 void
3339 bb_sese::remove (const pseudo_node_t &pseudo)
3340 {
3341 unsigned removed = 0;
3342 int len = brackets.length ();
3343
3344 for (int ix = 0; ix < len; ix++)
3345 {
3346 if (brackets[ix].back == pseudo)
3347 {
3348 if (dump_file)
3349 fprintf (dump_file, "Removing backedge %d:%+d\n",
3350 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3351 removed++;
3352 }
3353 else if (removed)
3354 brackets[ix-removed] = brackets[ix];
3355 }
3356 while (removed--)
3357 brackets.pop ();
3358 }
3359
3360 /* Accessors for BB's aux pointer. */
3361 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3362 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3363
3364 /* DFS walk creating SESE data structures. Only cover nodes with
3365 BB_VISITED set. Append discovered blocks to LIST. We number in
3366 increments of 3 so that the above and below pseudo nodes can be
3367 implicitly numbered too. */
3368
3369 static int
3370 nvptx_sese_number (int n, int p, int dir, basic_block b,
3371 auto_vec<basic_block> *list)
3372 {
3373 if (BB_GET_SESE (b))
3374 return n;
3375
3376 if (dump_file)
3377 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3378 b->index, n, p, dir);
3379
3380 BB_SET_SESE (b, new bb_sese (n, p, dir));
3381 p = n;
3382
3383 n += 3;
3384 list->quick_push (b);
3385
3386 /* First walk the nodes on the 'other side' of this node, then walk
3387 the nodes on the same side. */
3388 for (unsigned ix = 2; ix; ix--)
3389 {
3390 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3391 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3392 : offsetof (edge_def, src));
3393 edge e;
3394 edge_iterator ei;
3395
3396 FOR_EACH_EDGE (e, ei, edges)
3397 {
3398 basic_block target = *(basic_block *)((char *)e + offset);
3399
3400 if (target->flags & BB_VISITED)
3401 n = nvptx_sese_number (n, p, dir, target, list);
3402 }
3403 dir = -dir;
3404 }
3405 return n;
3406 }
3407
3408 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3409 EDGES are the outgoing edges and OFFSET is the offset to the src
3410 or dst block on the edges. */
3411
3412 static void
3413 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3414 vec<edge, va_gc> *edges, size_t offset)
3415 {
3416 edge e;
3417 edge_iterator ei;
3418 int hi_back = depth;
3419 pseudo_node_t node_back (0, depth);
3420 int hi_child = depth;
3421 pseudo_node_t node_child (0, depth);
3422 basic_block child = NULL;
3423 unsigned num_children = 0;
3424 int usd = -dir * sese->dir;
3425
3426 if (dump_file)
3427 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3428 me->index, sese->node, dir);
3429
3430 if (dir < 0)
3431 {
3432 /* This is the above pseudo-child. It has the BB itself as an
3433 additional child node. */
3434 node_child = sese->high;
3435 hi_child = node_child.second;
3436 if (node_child.first)
3437 hi_child += BB_GET_SESE (node_child.first)->node;
3438 num_children++;
3439 }
3440
3441 /* Examine each edge.
3442 - if it is a child (a) append its bracket list and (b) record
3443 whether it is the child with the highest reaching bracket.
3444 - if it is an edge to ancestor, record whether it's the highest
3445 reaching backlink. */
3446 FOR_EACH_EDGE (e, ei, edges)
3447 {
3448 basic_block target = *(basic_block *)((char *)e + offset);
3449
3450 if (bb_sese *t_sese = BB_GET_SESE (target))
3451 {
3452 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3453 {
3454 /* Child node. Append its bracket list. */
3455 num_children++;
3456 sese->append (t_sese);
3457
3458 /* Compare its hi value. */
3459 int t_hi = t_sese->high.second;
3460
3461 if (basic_block child_hi_block = t_sese->high.first)
3462 t_hi += BB_GET_SESE (child_hi_block)->node;
3463
3464 if (hi_child > t_hi)
3465 {
3466 hi_child = t_hi;
3467 node_child = t_sese->high;
3468 child = target;
3469 }
3470 }
3471 else if (t_sese->node < sese->node + dir
3472 && !(dir < 0 && sese->parent == t_sese->node))
3473 {
3474 /* Non-parental ancestor node -- a backlink. */
3475 int d = usd * t_sese->dir;
3476 int back = t_sese->node + d;
3477
3478 if (hi_back > back)
3479 {
3480 hi_back = back;
3481 node_back = pseudo_node_t (target, d);
3482 }
3483 }
3484 }
3485 else
3486 { /* Fallen off graph, backlink to entry node. */
3487 hi_back = 0;
3488 node_back = pseudo_node_t (0, 0);
3489 }
3490 }
3491
3492 /* Remove any brackets that terminate at this pseudo node. */
3493 sese->remove (pseudo_node_t (me, dir));
3494
3495 /* Now push any backlinks from this pseudo node. */
3496 FOR_EACH_EDGE (e, ei, edges)
3497 {
3498 basic_block target = *(basic_block *)((char *)e + offset);
3499 if (bb_sese *t_sese = BB_GET_SESE (target))
3500 {
3501 if (t_sese->node < sese->node + dir
3502 && !(dir < 0 && sese->parent == t_sese->node))
3503 /* Non-parental ancestor node - backedge from me. */
3504 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3505 }
3506 else
3507 {
3508 /* back edge to entry node */
3509 sese->push (pseudo_node_t (0, 0));
3510 }
3511 }
3512
3513 /* If this node leads directly or indirectly to a no-return region of
3514 the graph, then fake a backedge to entry node. */
3515 if (!sese->brackets.length () || !edges || !edges->length ())
3516 {
3517 hi_back = 0;
3518 node_back = pseudo_node_t (0, 0);
3519 sese->push (node_back);
3520 }
3521
3522 /* Record the highest reaching backedge from us or a descendant. */
3523 sese->high = hi_back < hi_child ? node_back : node_child;
3524
3525 if (num_children > 1)
3526 {
3527 /* There is more than one child -- this is a Y shaped piece of
3528 spanning tree. We have to insert a fake backedge from this
3529 node to the highest ancestor reached by not-the-highest
3530 reaching child. Note that there may be multiple children
3531 with backedges to the same highest node. That's ok and we
3532 insert the edge to that highest node. */
3533 hi_child = depth;
3534 if (dir < 0 && child)
3535 {
3536 node_child = sese->high;
3537 hi_child = node_child.second;
3538 if (node_child.first)
3539 hi_child += BB_GET_SESE (node_child.first)->node;
3540 }
3541
3542 FOR_EACH_EDGE (e, ei, edges)
3543 {
3544 basic_block target = *(basic_block *)((char *)e + offset);
3545
3546 if (target == child)
3547 /* Ignore the highest child. */
3548 continue;
3549
3550 bb_sese *t_sese = BB_GET_SESE (target);
3551 if (!t_sese)
3552 continue;
3553 if (t_sese->parent != sese->node)
3554 /* Not a child. */
3555 continue;
3556
3557 /* Compare its hi value. */
3558 int t_hi = t_sese->high.second;
3559
3560 if (basic_block child_hi_block = t_sese->high.first)
3561 t_hi += BB_GET_SESE (child_hi_block)->node;
3562
3563 if (hi_child > t_hi)
3564 {
3565 hi_child = t_hi;
3566 node_child = t_sese->high;
3567 }
3568 }
3569
3570 sese->push (node_child);
3571 }
3572 }
3573
3574
3575 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3576 proceed to successors. Set SESE entry and exit nodes of
3577 REGIONS. */
3578
3579 static void
3580 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3581 basic_block block, int coloring)
3582 {
3583 bb_sese *sese = BB_GET_SESE (block);
3584
3585 if (block->flags & BB_VISITED)
3586 {
3587 /* If we've already encountered this block, either we must not
3588 be coloring, or it must have been colored the current color. */
3589 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3590 return;
3591 }
3592
3593 block->flags |= BB_VISITED;
3594
3595 if (sese)
3596 {
3597 if (coloring < 0)
3598 {
3599 /* Start coloring a region. */
3600 regions[sese->color].first = block;
3601 coloring = sese->color;
3602 }
3603
3604 if (!--color_counts[sese->color] && sese->color == coloring)
3605 {
3606 /* Found final block of SESE region. */
3607 regions[sese->color].second = block;
3608 coloring = -1;
3609 }
3610 else
3611 /* Color the node, so we can assert on revisiting the node
3612 that the graph is indeed SESE. */
3613 sese->color = coloring;
3614 }
3615 else
3616 /* Fallen off the subgraph, we cannot be coloring. */
3617 gcc_assert (coloring < 0);
3618
3619 /* Walk each successor block. */
3620 if (block->succs && block->succs->length ())
3621 {
3622 edge e;
3623 edge_iterator ei;
3624
3625 FOR_EACH_EDGE (e, ei, block->succs)
3626 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3627 }
3628 else
3629 gcc_assert (coloring < 0);
3630 }
3631
3632 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3633 end up with NULL entries in it. */
3634
3635 static void
3636 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3637 {
3638 basic_block block;
3639 int ix;
3640
3641 /* First clear each BB of the whole function. */
3642 FOR_ALL_BB_FN (block, cfun)
3643 {
3644 block->flags &= ~BB_VISITED;
3645 BB_SET_SESE (block, 0);
3646 }
3647
3648 /* Mark blocks in the function that are in this graph. */
3649 for (ix = 0; blocks.iterate (ix, &block); ix++)
3650 block->flags |= BB_VISITED;
3651
3652 /* Counts of nodes assigned to each color. There cannot be more
3653 colors than blocks (and hopefully there will be fewer). */
3654 auto_vec<unsigned> color_counts;
3655 color_counts.reserve (blocks.length ());
3656
3657 /* Worklist of nodes in the spanning tree. Again, there cannot be
3658 more nodes in the tree than blocks (there will be fewer if the
3659 CFG of blocks is disjoint). */
3660 auto_vec<basic_block> spanlist;
3661 spanlist.reserve (blocks.length ());
3662
3663 /* Make sure every block has its cycle class determined. */
3664 for (ix = 0; blocks.iterate (ix, &block); ix++)
3665 {
3666 if (BB_GET_SESE (block))
3667 /* We already met this block in an earlier graph solve. */
3668 continue;
3669
3670 if (dump_file)
3671 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3672
3673 /* Number the nodes reachable from block in initial DFS order. */
3674 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3675
3676 /* Now walk in reverse DFS order to find cycle equivalents. */
3677 while (spanlist.length ())
3678 {
3679 block = spanlist.pop ();
3680 bb_sese *sese = BB_GET_SESE (block);
3681
3682 /* Do the pseudo node below. */
3683 nvptx_sese_pseudo (block, sese, depth, +1,
3684 sese->dir > 0 ? block->succs : block->preds,
3685 (sese->dir > 0 ? offsetof (edge_def, dest)
3686 : offsetof (edge_def, src)));
3687 sese->set_color (color_counts);
3688 /* Do the pseudo node above. */
3689 nvptx_sese_pseudo (block, sese, depth, -1,
3690 sese->dir < 0 ? block->succs : block->preds,
3691 (sese->dir < 0 ? offsetof (edge_def, dest)
3692 : offsetof (edge_def, src)));
3693 }
3694 if (dump_file)
3695 fprintf (dump_file, "\n");
3696 }
3697
3698 if (dump_file)
3699 {
3700 unsigned count;
3701 const char *comma = "";
3702
3703 fprintf (dump_file, "Found %d cycle equivalents\n",
3704 color_counts.length ());
3705 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3706 {
3707 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3708
3709 comma = "";
3710 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3711 if (BB_GET_SESE (block)->color == ix)
3712 {
3713 block->flags |= BB_VISITED;
3714 fprintf (dump_file, "%s%d", comma, block->index);
3715 comma=",";
3716 }
3717 fprintf (dump_file, "}");
3718 comma = ", ";
3719 }
3720 fprintf (dump_file, "\n");
3721 }
3722
3723 /* Now we've colored every block in the subgraph. We now need to
3724 determine the minimal set of SESE regions that cover that
3725 subgraph. Do this with a DFS walk of the complete function.
3726 During the walk we're either 'looking' or 'coloring'. When we
3727 reach the last node of a particular color, we stop coloring and
3728 return to looking. */
3729
3730 /* There cannot be more SESE regions than colors. */
3731 regions.reserve (color_counts.length ());
3732 for (ix = color_counts.length (); ix--;)
3733 regions.quick_push (bb_pair_t (0, 0));
3734
3735 for (ix = 0; blocks.iterate (ix, &block); ix++)
3736 block->flags &= ~BB_VISITED;
3737
3738 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3739
3740 if (dump_file)
3741 {
3742 const char *comma = "";
3743 int len = regions.length ();
3744
3745 fprintf (dump_file, "SESE regions:");
3746 for (ix = 0; ix != len; ix++)
3747 {
3748 basic_block from = regions[ix].first;
3749 basic_block to = regions[ix].second;
3750
3751 if (from)
3752 {
3753 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3754 if (to != from)
3755 fprintf (dump_file, "->%d", to->index);
3756
3757 int color = BB_GET_SESE (from)->color;
3758
3759 /* Print the blocks within the region (excluding ends). */
3760 FOR_EACH_BB_FN (block, cfun)
3761 {
3762 bb_sese *sese = BB_GET_SESE (block);
3763
3764 if (sese && sese->color == color
3765 && block != from && block != to)
3766 fprintf (dump_file, ".%d", block->index);
3767 }
3768 fprintf (dump_file, "}");
3769 }
3770 comma = ",";
3771 }
3772 fprintf (dump_file, "\n\n");
3773 }
3774
3775 for (ix = 0; blocks.iterate (ix, &block); ix++)
3776 delete BB_GET_SESE (block);
3777 }
3778
3779 #undef BB_SET_SESE
3780 #undef BB_GET_SESE
3781
3782 /* Propagate live state at the start of a partitioned region. IS_CALL
3783 indicates whether the propagation is for a (partitioned) call
3784 instruction. BLOCK provides the live register information, and
3785 might not contain INSN. Propagation is inserted just after INSN. RW
3786 indicates whether we are reading and/or writing state. This
3787 separation is needed for worker-level propagation where we
3788 essentially do a spill & fill. FN is the underlying worker
3789 function to generate the propagation instructions for a single
3790 register. DATA is user data.
3791
3792 Returns true if we didn't emit any instructions.
3793
3794 We propagate the live register set only for non-calls, and the entire
3795 frame in both cases. We could do better by (a)
3796 propagating just the live set that is used within the partitioned
3797 regions and (b) only propagating stack entries that are used. The
3798 latter might be quite hard to determine. */
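
/* As an aid to reading nvptx_propagate below, here is a rough C-level sketch
   (not the emitted RTL itself) of the frame-copy loop it builds when the
   frame is larger than one DImode word; the variable names are illustrative
   only:

       ptr = frame_pointer;
       idx = frame_size_in_DImode_words;
       fn (tmp, PM_loop_begin, fs, data);   // worker hook: set up (e.g. a pointer)
     loop:
       idx -= 1;
       if (rw & PM_read)  tmp = *ptr;       // spill one word
       fn (tmp, rw, fs, data);              // broadcast / copy the word
       if (rw & PM_write) *ptr = tmp;       // fill one word
       ptr += sizeof (DImode word);
       if (idx != 0) goto loop;
       fn (tmp, PM_loop_end, fs, data);     // worker hook: clean up

   A single-word frame skips the loop and does one read/fn/write.  */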
3799
3800 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3801
3802 static bool
3803 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3804 propagate_mask rw, propagator_fn fn, void *data)
3805 {
3806 bitmap live = DF_LIVE_IN (block);
3807 bitmap_iterator iterator;
3808 unsigned ix;
3809 bool empty = true;
3810
3811 /* Copy the frame array. */
3812 HOST_WIDE_INT fs = get_frame_size ();
3813 if (fs)
3814 {
3815 rtx tmp = gen_reg_rtx (DImode);
3816 rtx idx = NULL_RTX;
3817 rtx ptr = gen_reg_rtx (Pmode);
3818 rtx pred = NULL_RTX;
3819 rtx_code_label *label = NULL;
3820
3821 empty = false;
3822 /* The frame size might not be DImode compatible, but the frame
3823 array's declaration will be. So it's ok to round up here. */
3824 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3825 /* Detect single iteration loop. */
3826 if (fs == 1)
3827 fs = 0;
3828
3829 start_sequence ();
3830 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3831 if (fs)
3832 {
3833 idx = gen_reg_rtx (SImode);
3834 pred = gen_reg_rtx (BImode);
3835 label = gen_label_rtx ();
3836
3837 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3838 /* Allow worker function to initialize anything needed. */
3839 rtx init = fn (tmp, PM_loop_begin, fs, data);
3840 if (init)
3841 emit_insn (init);
3842 emit_label (label);
3843 LABEL_NUSES (label)++;
3844 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3845 }
3846 if (rw & PM_read)
3847 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3848 emit_insn (fn (tmp, rw, fs, data));
3849 if (rw & PM_write)
3850 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3851 if (fs)
3852 {
3853 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3854 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3855 emit_insn (gen_br_true_uni (pred, label));
3856 rtx fini = fn (tmp, PM_loop_end, fs, data);
3857 if (fini)
3858 emit_insn (fini);
3859 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3860 }
3861 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3862 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3863 rtx cpy = get_insns ();
3864 end_sequence ();
3865 insn = emit_insn_after (cpy, insn);
3866 }
3867
3868 if (!is_call)
3869 /* Copy live registers. */
3870 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3871 {
3872 rtx reg = regno_reg_rtx[ix];
3873
3874 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3875 {
3876 rtx bcast = fn (reg, rw, 0, data);
3877
3878 insn = emit_insn_after (bcast, insn);
3879 empty = false;
3880 }
3881 }
3882 return empty;
3883 }
3884
3885 /* Worker for nvptx_vpropagate. */
3886
3887 static rtx
3888 vprop_gen (rtx reg, propagate_mask pm,
3889 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3890 {
3891 if (!(pm & PM_read_write))
3892 return 0;
3893
3894 return nvptx_gen_vcast (reg);
3895 }
3896
3897 /* Propagate state that is live at start of BLOCK across the vectors
3898 of a single warp. Propagation is inserted just after INSN.
3899 IS_CALL and return as for nvptx_propagate. */
3900
3901 static bool
3902 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
3903 {
3904 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
3905 }
3906
3907 /* Worker for nvptx_wpropagate. */
3908
3909 static rtx
3910 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3911 {
3912 wcast_data_t *data = (wcast_data_t *)data_;
3913
3914 if (pm & PM_loop_begin)
3915 {
3916 /* Starting a loop, initialize pointer. */
3917 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3918
3919 if (align > worker_bcast_align)
3920 worker_bcast_align = align;
3921 data->offset = (data->offset + align - 1) & ~(align - 1);
3922
3923 data->ptr = gen_reg_rtx (Pmode);
3924
3925 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3926 }
3927 else if (pm & PM_loop_end)
3928 {
3929 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3930 data->ptr = NULL_RTX;
3931 return clobber;
3932 }
3933 else
3934 return nvptx_gen_wcast (reg, pm, rep, data);
3935 }
3936
3937 /* Spill or fill the state that is live at the start of BLOCK. PRE_P
3938 indicates if this is just before partitioned mode (do spill), or
3939 just after it starts (do fill). Sequence is inserted just after
3940 INSN. IS_CALL and return as for nvptx_propagate. */
3941
3942 static bool
3943 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
3944 {
3945 wcast_data_t data;
3946
3947 data.base = gen_reg_rtx (Pmode);
3948 data.offset = 0;
3949 data.ptr = NULL_RTX;
3950
3951 bool empty = nvptx_propagate (is_call, block, insn,
3952 pre_p ? PM_read : PM_write, wprop_gen, &data);
3953 gcc_assert (empty == !data.offset);
3954 if (data.offset)
3955 {
3956 /* Stuff was emitted, initialize the base pointer now. */
3957 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3958 emit_insn_after (init, insn);
3959
3960 if (worker_bcast_size < data.offset)
3961 worker_bcast_size = data.offset;
3962 }
3963 return empty;
3964 }
3965
3966 /* Emit a worker-level synchronization barrier. We use different
3967 markers for before and after synchronizations. */
3968
3969 static rtx
3970 nvptx_wsync (bool after)
3971 {
3972 return gen_nvptx_barsync (GEN_INT (after));
3973 }
3974
3975 #if WORKAROUND_PTXJIT_BUG
3976 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3977 real insns. */
3978
3979 static rtx_insn *
3980 bb_first_real_insn (basic_block bb)
3981 {
3982 rtx_insn *insn;
3983
3984 /* Find the first real insn in BB. */
3985 FOR_BB_INSNS (bb, insn)
3986 if (INSN_P (insn))
3987 return insn;
3988
3989 return 0;
3990 }
3991 #endif
3992
3993 /* Return true if INSN needs neutering. */
3994
3995 static bool
3996 needs_neutering_p (rtx_insn *insn)
3997 {
3998 if (!INSN_P (insn))
3999 return false;
4000
4001 switch (recog_memoized (insn))
4002 {
4003 case CODE_FOR_nvptx_fork:
4004 case CODE_FOR_nvptx_forked:
4005 case CODE_FOR_nvptx_joining:
4006 case CODE_FOR_nvptx_join:
4007 case CODE_FOR_nvptx_barsync:
4008 return false;
4009 default:
4010 return true;
4011 }
4012 }
4013
4014 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4015
4016 static bool
4017 verify_neutering_jumps (basic_block from,
4018 rtx_insn *vector_jump, rtx_insn *worker_jump,
4019 rtx_insn *vector_label, rtx_insn *worker_label)
4020 {
4021 basic_block bb = from;
4022 rtx_insn *insn = BB_HEAD (bb);
4023 bool seen_worker_jump = false;
4024 bool seen_vector_jump = false;
4025 bool seen_worker_label = false;
4026 bool seen_vector_label = false;
4027 bool worker_neutered = false;
4028 bool vector_neutered = false;
4029 while (true)
4030 {
4031 if (insn == worker_jump)
4032 {
4033 seen_worker_jump = true;
4034 worker_neutered = true;
4035 gcc_assert (!vector_neutered);
4036 }
4037 else if (insn == vector_jump)
4038 {
4039 seen_vector_jump = true;
4040 vector_neutered = true;
4041 }
4042 else if (insn == worker_label)
4043 {
4044 seen_worker_label = true;
4045 gcc_assert (worker_neutered);
4046 worker_neutered = false;
4047 }
4048 else if (insn == vector_label)
4049 {
4050 seen_vector_label = true;
4051 gcc_assert (vector_neutered);
4052 vector_neutered = false;
4053 }
4054 else if (INSN_P (insn))
4055 switch (recog_memoized (insn))
4056 {
4057 case CODE_FOR_nvptx_barsync:
4058 gcc_assert (!vector_neutered && !worker_neutered);
4059 break;
4060 default:
4061 break;
4062 }
4063
4064 if (insn != BB_END (bb))
4065 insn = NEXT_INSN (insn);
4066 else if (JUMP_P (insn) && single_succ_p (bb)
4067 && !seen_vector_jump && !seen_worker_jump)
4068 {
4069 bb = single_succ (bb);
4070 insn = BB_HEAD (bb);
4071 }
4072 else
4073 break;
4074 }
4075
4076 gcc_assert (!(vector_jump && !seen_vector_jump));
4077 gcc_assert (!(worker_jump && !seen_worker_jump));
4078
4079 if (seen_vector_label || seen_worker_label)
4080 {
4081 gcc_assert (!(vector_label && !seen_vector_label));
4082 gcc_assert (!(worker_label && !seen_worker_label));
4083
4084 return true;
4085 }
4086
4087 return false;
4088 }
4089
4090 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4091
4092 static void
4093 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4094 rtx_insn *worker_label)
4095 {
4096 basic_block bb = to;
4097 rtx_insn *insn = BB_END (bb);
4098 bool seen_worker_label = false;
4099 bool seen_vector_label = false;
4100 while (true)
4101 {
4102 if (insn == worker_label)
4103 {
4104 seen_worker_label = true;
4105 gcc_assert (!seen_vector_label);
4106 }
4107 else if (insn == vector_label)
4108 seen_vector_label = true;
4109 else if (INSN_P (insn))
4110 switch (recog_memoized (insn))
4111 {
4112 case CODE_FOR_nvptx_barsync:
4113 gcc_assert (!seen_vector_label && !seen_worker_label);
4114 break;
4115 }
4116
4117 if (insn != BB_HEAD (bb))
4118 insn = PREV_INSN (insn);
4119 else
4120 break;
4121 }
4122
4123 gcc_assert (!(vector_label && !seen_vector_label));
4124 gcc_assert (!(worker_label && !seen_worker_label));
4125 }
4126
4127 /* Single neutering according to MASK. FROM is the incoming block and
4128 TO is the outgoing block. These may be the same block. Insert at
4129 start of FROM:
4130
4131 if (tid.<axis>) goto end.
4132
4133 and insert before ending branch of TO (if there is such an insn):
4134
4135 end:
4136 <possibly-broadcast-cond>
4137 <branch>
4138
4139 We currently only use different FROM and TO when skipping an entire
4140 loop. We could do more if we detected superblocks. */
4141
4142 static void
4143 nvptx_single (unsigned mask, basic_block from, basic_block to)
4144 {
4145 rtx_insn *head = BB_HEAD (from);
4146 rtx_insn *tail = BB_END (to);
4147 unsigned skip_mask = mask;
4148
4149 while (true)
4150 {
4151 /* Find first insn of from block. */
4152 while (head != BB_END (from) && !needs_neutering_p (head))
4153 head = NEXT_INSN (head);
4154
4155 if (from == to)
4156 break;
4157
4158 if (!(JUMP_P (head) && single_succ_p (from)))
4159 break;
4160
4161 basic_block jump_target = single_succ (from);
4162 if (!single_pred_p (jump_target))
4163 break;
4164
4165 from = jump_target;
4166 head = BB_HEAD (from);
4167 }
4168
4169 /* Find the last insn of the to block. */
4170 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4171 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4172 tail = PREV_INSN (tail);
4173
4174 /* Detect if tail is a branch. */
4175 rtx tail_branch = NULL_RTX;
4176 rtx cond_branch = NULL_RTX;
4177 if (tail && INSN_P (tail))
4178 {
4179 tail_branch = PATTERN (tail);
4180 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4181 tail_branch = NULL_RTX;
4182 else
4183 {
4184 cond_branch = SET_SRC (tail_branch);
4185 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4186 cond_branch = NULL_RTX;
4187 }
4188 }
4189
4190 if (tail == head)
4191 {
4192 /* If this is empty, do nothing. */
4193 if (!head || !needs_neutering_p (head))
4194 return;
4195
4196 if (cond_branch)
4197 {
4198 /* If we're only doing vector single, there's no need to
4199 emit skip code because we'll not insert anything. */
4200 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4201 skip_mask = 0;
4202 }
4203 else if (tail_branch)
4204 /* Block with only unconditional branch. Nothing to do. */
4205 return;
4206 }
4207
4208 /* Insert the vector test inside the worker test. */
4209 unsigned mode;
4210 rtx_insn *before = tail;
4211 rtx_insn *neuter_start = NULL;
4212 rtx_insn *worker_label = NULL, *vector_label = NULL;
4213 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4214 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4215 if (GOMP_DIM_MASK (mode) & skip_mask)
4216 {
4217 rtx_code_label *label = gen_label_rtx ();
4218 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4219 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4220 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4221
4222 if (!pred)
4223 {
4224 pred = gen_reg_rtx (BImode);
4225 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4226 }
4227
4228 rtx br;
4229 if (mode == GOMP_DIM_VECTOR)
4230 br = gen_br_true (pred, label);
4231 else
4232 br = gen_br_true_uni (pred, label);
4233 if (neuter_start)
4234 neuter_start = emit_insn_after (br, neuter_start);
4235 else
4236 neuter_start = emit_insn_before (br, head);
4237 *mode_jump = neuter_start;
4238
4239 LABEL_NUSES (label)++;
4240 rtx_insn *label_insn;
4241 if (tail_branch)
4242 {
4243 label_insn = emit_label_before (label, before);
4244 before = label_insn;
4245 }
4246 else
4247 {
4248 label_insn = emit_label_after (label, tail);
4249 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4250 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4251 emit_insn_after (gen_exit (), label_insn);
4252 }
4253
4254 if (mode == GOMP_DIM_VECTOR)
4255 vector_label = label_insn;
4256 else
4257 worker_label = label_insn;
4258 }
4259
4260 /* Now deal with propagating the branch condition. */
4261 if (cond_branch)
4262 {
4263 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4264
4265 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4266 {
4267 /* Vector mode only, do a shuffle. */
4268 #if WORKAROUND_PTXJIT_BUG
4269 /* The branch condition %rcond is propagated like this:
4270
4271 {
4272 .reg .u32 %x;
4273 mov.u32 %x,%tid.x;
4274 setp.ne.u32 %rnotvzero,%x,0;
4275 }
4276
4277 @%rnotvzero bra Lskip;
4278 setp.<op>.<type> %rcond,op1,op2;
4279 Lskip:
4280 selp.u32 %rcondu32,1,0,%rcond;
4281 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4282 setp.ne.u32 %rcond,%rcondu32,0;
4283
4284 There seems to be a bug in the ptx JIT compiler (observed at driver
4285 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4286 unless %rcond is initialized to something before 'bra Lskip'. The
4287 bug is not observed with ptxas from cuda 8.0.61.
4288
4289 It is true that the code is non-trivial: at Lskip, %rcond is
4290 uninitialized in threads 1-31, and after the selp the same holds
4291 for %rcondu32. But shfl propagates the defined value in thread 0
4292 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4293 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4294
4295 There is nothing in the PTX spec to suggest that this is wrong, or
4296 to explain why the extra initialization is needed. So, we classify
4297 it as a JIT bug, and the extra initialization as a workaround:
4298
4299 {
4300 .reg .u32 %x;
4301 mov.u32 %x,%tid.x;
4302 setp.ne.u32 %rnotvzero,%x,0;
4303 }
4304
4305 +.reg .pred %rcond2;
4306 +setp.eq.u32 %rcond2, 1, 0;
4307
4308 @%rnotvzero bra Lskip;
4309 setp.<op>.<type> %rcond,op1,op2;
4310 +mov.pred %rcond2, %rcond;
4311 Lskip:
4312 +mov.pred %rcond, %rcond2;
4313 selp.u32 %rcondu32,1,0,%rcond;
4314 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4315 setp.ne.u32 %rcond,%rcondu32,0;
4316 */
4317 rtx_insn *label = PREV_INSN (tail);
4318 gcc_assert (label && LABEL_P (label));
4319 rtx tmp = gen_reg_rtx (BImode);
4320 emit_insn_before (gen_movbi (tmp, const0_rtx),
4321 bb_first_real_insn (from));
4322 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4323 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4324 #endif
4325 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4326 }
4327 else
4328 {
4329 /* Includes worker mode, do spill & fill. By construction
4330 we should never have worker mode only. */
4331 wcast_data_t data;
4332
4333 data.base = worker_bcast_sym;
4334 data.ptr = 0;
4335
4336 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4337 worker_bcast_size = GET_MODE_SIZE (SImode);
4338
4339 data.offset = 0;
4340 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4341 before);
4342 /* Barrier so other workers can see the write. */
4343 emit_insn_before (nvptx_wsync (false), tail);
4344 data.offset = 0;
4345 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4346 /* This barrier is needed to avoid worker zero clobbering
4347 the broadcast buffer before all the other workers have
4348 had a chance to read this instance of it. */
4349 emit_insn_before (nvptx_wsync (true), tail);
4350 }
4351
4352 extract_insn (tail);
4353 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4354 UNSPEC_BR_UNIFIED);
4355 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4356 }
4357
4358 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4359 vector_label, worker_label);
4360 if (!seen_label)
4361 verify_neutering_labels (to, vector_label, worker_label);
4362 }
4363
4364 /* PAR is a parallel that is being skipped in its entirety according to
4365 MASK. Treat this as skipping a superblock starting at forked
4366 and ending at joining. */
4367
4368 static void
4369 nvptx_skip_par (unsigned mask, parallel *par)
4370 {
4371 basic_block tail = par->join_block;
4372 gcc_assert (tail->preds->length () == 1);
4373
4374 basic_block pre_tail = (*tail->preds)[0]->src;
4375 gcc_assert (pre_tail->succs->length () == 1);
4376
4377 nvptx_single (mask, par->forked_block, pre_tail);
4378 }
4379
4380 /* If PAR has a single inner parallel and PAR itself only contains
4381 empty entry and exit blocks, swallow the inner PAR. */
4382
4383 static void
4384 nvptx_optimize_inner (parallel *par)
4385 {
4386 parallel *inner = par->inner;
4387
4388 /* We mustn't be the outer dummy par. */
4389 if (!par->mask)
4390 return;
4391
4392 /* We must have a single inner par. */
4393 if (!inner || inner->next)
4394 return;
4395
4396 /* We must only contain 2 blocks ourselves -- the head and tail of
4397 the inner par. */
4398 if (par->blocks.length () != 2)
4399 return;
4400
4401 /* Our partitioning must be disjoint from the inner par's. As we only
4402 have vector and worker partitioning, this is sufficient to guarantee
4403 the pars have adjacent partitioning. */
4404 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4405 /* This indicates malformed code generation. */
4406 return;
4407
4408 /* The outer forked insn should be immediately followed by the inner
4409 fork insn. */
4410 rtx_insn *forked = par->forked_insn;
4411 rtx_insn *fork = BB_END (par->forked_block);
4412
4413 if (NEXT_INSN (forked) != fork)
4414 return;
4415 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4416
4417 /* The outer joining insn must immediately follow the inner join
4418 insn. */
4419 rtx_insn *joining = par->joining_insn;
4420 rtx_insn *join = inner->join_insn;
4421 if (NEXT_INSN (join) != joining)
4422 return;
4423
4424 /* Preconditions met. Swallow the inner par. */
4425 if (dump_file)
4426 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4427 inner->mask, inner->forked_block->index,
4428 inner->join_block->index,
4429 par->mask, par->forked_block->index, par->join_block->index);
4430
4431 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4432
4433 par->blocks.reserve (inner->blocks.length ());
4434 while (inner->blocks.length ())
4435 par->blocks.quick_push (inner->blocks.pop ());
4436
4437 par->inner = inner->inner;
4438 inner->inner = NULL;
4439
4440 delete inner;
4441 }
4442
4443 /* Process the parallel PAR and all its contained
4444 parallels. We do everything but the neutering. Return mask of
4445 partitioned modes used within this parallel. */
4446
4447 static unsigned
4448 nvptx_process_pars (parallel *par)
4449 {
4450 if (nvptx_optimize)
4451 nvptx_optimize_inner (par);
4452
4453 unsigned inner_mask = par->mask;
4454
4455 /* Do the inner parallels first. */
4456 if (par->inner)
4457 {
4458 par->inner_mask = nvptx_process_pars (par->inner);
4459 inner_mask |= par->inner_mask;
4460 }
4461
4462 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4463
4464 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4465 {
4466 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4467 bool empty = nvptx_wpropagate (true, is_call,
4468 par->forked_block, par->fork_insn);
4469
4470 if (!empty || !is_call)
4471 {
4472 /* Insert begin and end synchronizations. */
4473 emit_insn_before (nvptx_wsync (false), par->forked_insn);
4474 emit_insn_before (nvptx_wsync (true), par->join_insn);
4475 }
4476 }
4477 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4478 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
4479
4480 /* Now do siblings. */
4481 if (par->next)
4482 inner_mask |= nvptx_process_pars (par->next);
4483 return inner_mask;
4484 }
4485
4486 /* Neuter the parallel described by PAR. We recurse in depth-first
4487 order. MODES are the partitioning of the execution and OUTER is
4488 the partitioning of the parallels we are contained in. */
4489
4490 static void
4491 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4492 {
4493 unsigned me = (par->mask
4494 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4495 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4496 unsigned skip_mask = 0, neuter_mask = 0;
4497
4498 if (par->inner)
4499 nvptx_neuter_pars (par->inner, modes, outer | me);
4500
4501 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4502 {
4503 if ((outer | me) & GOMP_DIM_MASK (mode))
4504 {} /* Mode is partitioned: no neutering. */
4505 else if (!(modes & GOMP_DIM_MASK (mode)))
4506 {} /* Mode is not used: nothing to do. */
4507 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4508 || !par->forked_insn)
4509 /* Partitioned in inner parallels, or we're not a partitioned
4510 parallel at all: neuter individual blocks. */
4511 neuter_mask |= GOMP_DIM_MASK (mode);
4512 else if (!par->parent || !par->parent->forked_insn
4513 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4514 /* Parent isn't a partitioned parallel, or already contains this
4515 partitioning: skip the parallel at this level. */
4516 skip_mask |= GOMP_DIM_MASK (mode);
4517 else
4518 {} /* Parent will skip this parallel itself. */
4519 }
4520
4521 if (neuter_mask)
4522 {
4523 int ix, len;
4524
4525 if (nvptx_optimize)
4526 {
4527 /* Neuter whole SESE regions. */
4528 bb_pair_vec_t regions;
4529
4530 nvptx_find_sese (par->blocks, regions);
4531 len = regions.length ();
4532 for (ix = 0; ix != len; ix++)
4533 {
4534 basic_block from = regions[ix].first;
4535 basic_block to = regions[ix].second;
4536
4537 if (from)
4538 nvptx_single (neuter_mask, from, to);
4539 else
4540 gcc_assert (!to);
4541 }
4542 }
4543 else
4544 {
4545 /* Neuter each BB individually. */
4546 len = par->blocks.length ();
4547 for (ix = 0; ix != len; ix++)
4548 {
4549 basic_block block = par->blocks[ix];
4550
4551 nvptx_single (neuter_mask, block, block);
4552 }
4553 }
4554 }
4555
4556 if (skip_mask)
4557 nvptx_skip_par (skip_mask, par);
4558
4559 if (par->next)
4560 nvptx_neuter_pars (par->next, modes, outer);
4561 }
4562
4563 #if WORKAROUND_PTXJIT_BUG_2
4564 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4565 is needed in the nvptx target because the branches generated for
4566 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4567
4568 static rtx
4569 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4570 {
4571 rtx pat;
4572 if ((strict && !JUMP_P (insn))
4573 || (!strict && !INSN_P (insn)))
4574 return NULL_RTX;
4575 pat = PATTERN (insn);
4576
4577 /* The set is allowed to appear either as the insn pattern or
4578 the first set in a PARALLEL. */
4579 if (GET_CODE (pat) == PARALLEL)
4580 pat = XVECEXP (pat, 0, 0);
4581 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4582 return pat;
4583
4584 return NULL_RTX;
4585 }
4586
4587 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4588
4589 static rtx
4590 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4591 {
4592 rtx x = nvptx_pc_set (insn, strict);
4593
4594 if (!x)
4595 return NULL_RTX;
4596 x = SET_SRC (x);
4597 if (GET_CODE (x) == LABEL_REF)
4598 return x;
4599 if (GET_CODE (x) != IF_THEN_ELSE)
4600 return NULL_RTX;
4601 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4602 return XEXP (x, 1);
4603 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4604 return XEXP (x, 2);
4605 return NULL_RTX;
4606 }
4607
4608 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4609 insn in between the branch and the label. This works around a JIT bug
4610 observed at driver version 384.111, at -O0 for sm_50. */
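
/* Sketch of the shape this pass guards against (illustrative PTX, using the
   label style of the comments earlier in this file):

       @%rcond bra Lend;
     Lend:                   // nothing but the label follows the branch

   After the pass a dummy insn separates the branch and its target:

       @%rcond bra Lend;
       <fake nop>
     Lend:
*/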
4611
4612 static void
4613 prevent_branch_around_nothing (void)
4614 {
4615 rtx_insn *seen_label = NULL;
4616 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4617 {
4618 if (INSN_P (insn) && condjump_p (insn))
4619 {
4620 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4621 continue;
4622 }
4623
4624 if (seen_label == NULL)
4625 continue;
4626
4627 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4628 continue;
4629
4630 if (INSN_P (insn))
4631 switch (recog_memoized (insn))
4632 {
4633 case CODE_FOR_nvptx_fork:
4634 case CODE_FOR_nvptx_forked:
4635 case CODE_FOR_nvptx_joining:
4636 case CODE_FOR_nvptx_join:
4637 continue;
4638 default:
4639 seen_label = NULL;
4640 continue;
4641 }
4642
4643 if (LABEL_P (insn) && insn == seen_label)
4644 emit_insn_before (gen_fake_nop (), insn);
4645
4646 seen_label = NULL;
4647 }
4648 }
4649 #endif
4650
4651 #ifdef WORKAROUND_PTXJIT_BUG_3
4652 /* Insert two membar.cta insns in between two consecutive bar.sync insns. This
4653 works around a hang observed at driver version 390.48 for sm_50. */
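
/* Illustrative PTX sketch of the transformation, assuming bar.sync is what
   the nvptx_barsync pattern emits:

       bar.sync 0;              bar.sync 0;
       bar.sync 0;     ==>      membar.cta;
                                membar.cta;
                                bar.sync 0;
*/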
4654
4655 static void
4656 workaround_barsyncs (void)
4657 {
4658 bool seen_barsync = false;
4659 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4660 {
4661 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4662 {
4663 if (seen_barsync)
4664 {
4665 emit_insn_before (gen_nvptx_membar_cta (), insn);
4666 emit_insn_before (gen_nvptx_membar_cta (), insn);
4667 }
4668
4669 seen_barsync = true;
4670 continue;
4671 }
4672
4673 if (!seen_barsync)
4674 continue;
4675
4676 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4677 continue;
4678 else if (INSN_P (insn))
4679 switch (recog_memoized (insn))
4680 {
4681 case CODE_FOR_nvptx_fork:
4682 case CODE_FOR_nvptx_forked:
4683 case CODE_FOR_nvptx_joining:
4684 case CODE_FOR_nvptx_join:
4685 continue;
4686 default:
4687 break;
4688 }
4689
4690 seen_barsync = false;
4691 }
4692 }
4693 #endif
4694
4695 /* PTX-specific reorganization
4696 - Split blocks at fork and join instructions
4697 - Compute live registers
4698 - Mark now-unused registers, so function begin doesn't declare
4699 unused registers.
4700 - Insert state propagation when entering partitioned mode
4701 - Insert neutering instructions when in single mode
4702 - Replace subregs with suitable sequences.
4703 */
4704
4705 static void
4706 nvptx_reorg (void)
4707 {
4708 /* We are freeing block_for_insn in the toplev to keep compatibility
4709 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4710 compute_bb_for_insn ();
4711
4712 thread_prologue_and_epilogue_insns ();
4713
4714 /* Split blocks and record interesting unspecs. */
4715 bb_insn_map_t bb_insn_map;
4716
4717 nvptx_split_blocks (&bb_insn_map);
4718
4719 /* Compute live regs */
4720 df_clear_flags (DF_LR_RUN_DCE);
4721 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4722 df_live_add_problem ();
4723 df_live_set_all_dirty ();
4724 df_analyze ();
4725 regstat_init_n_sets_and_refs ();
4726
4727 if (dump_file)
4728 df_dump (dump_file);
4729
4730 /* Mark unused regs as unused. */
4731 int max_regs = max_reg_num ();
4732 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4733 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4734 regno_reg_rtx[i] = const0_rtx;
4735
4736 /* Determine launch dimensions of the function. If it is not an
4737 offloaded function (i.e. this is a regular compiler), the
4738 function has no neutering. */
4739 tree attr = oacc_get_fn_attrib (current_function_decl);
4740 if (attr)
4741 {
4742 /* If we determined this mask before RTL expansion, we could
4743 elide emission of some levels of forks and joins. */
4744 unsigned mask = 0;
4745 tree dims = TREE_VALUE (attr);
4746 unsigned ix;
4747
4748 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4749 {
4750 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4751 tree allowed = TREE_PURPOSE (dims);
4752
4753 if (size != 1 && !(allowed && integer_zerop (allowed)))
4754 mask |= GOMP_DIM_MASK (ix);
4755 }
4756 /* If there is worker neutering, there must be vector
4757 neutering. Otherwise the hardware will fail. */
4758 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4759 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4760
4761 /* Discover & process partitioned regions. */
4762 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4763 nvptx_process_pars (pars);
4764 nvptx_neuter_pars (pars, mask, 0);
4765 delete pars;
4766 }
4767
4768 /* Replace subregs. */
4769 nvptx_reorg_subreg ();
4770
4771 if (TARGET_UNIFORM_SIMT)
4772 nvptx_reorg_uniform_simt ();
4773
4774 #if WORKAROUND_PTXJIT_BUG_2
4775 prevent_branch_around_nothing ();
4776 #endif
4777
4778 #ifdef WORKAROUND_PTXJIT_BUG_3
4779 workaround_barsyncs ();
4780 #endif
4781
4782 regstat_free_n_sets_and_refs ();
4783
4784 df_finish_pass (true);
4785 }
4786 \f
4787 /* Handle a "kernel" attribute; arguments as in
4788 struct attribute_spec.handler. */
4789
4790 static tree
4791 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4792 int ARG_UNUSED (flags), bool *no_add_attrs)
4793 {
4794 tree decl = *node;
4795
4796 if (TREE_CODE (decl) != FUNCTION_DECL)
4797 {
4798 error ("%qE attribute only applies to functions", name);
4799 *no_add_attrs = true;
4800 }
4801 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4802 {
4803 error ("%qE attribute requires a void return type", name);
4804 *no_add_attrs = true;
4805 }
4806
4807 return NULL_TREE;
4808 }
4809
4810 /* Handle a "shared" attribute; arguments as in
4811 struct attribute_spec.handler. */
4812
4813 static tree
4814 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4815 int ARG_UNUSED (flags), bool *no_add_attrs)
4816 {
4817 tree decl = *node;
4818
4819 if (TREE_CODE (decl) != VAR_DECL)
4820 {
4821 error ("%qE attribute only applies to variables", name);
4822 *no_add_attrs = true;
4823 }
4824 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4825 {
4826 error ("%qE attribute not allowed with auto storage class", name);
4827 *no_add_attrs = true;
4828 }
4829
4830 return NULL_TREE;
4831 }
4832
4833 /* Table of valid machine attributes. */
4834 static const struct attribute_spec nvptx_attribute_table[] =
4835 {
4836 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4837 affects_type_identity, handler, exclude } */
4838 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4839 NULL },
4840 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4841 NULL },
4842 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4843 };
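
/* For reference, a sketch of how these attributes might appear in source
   code fed to this backend (identifiers are illustrative only):

     void entry_point (void) __attribute__((kernel));   // void return required
     static int scratch[32] __attribute__((shared));    // static storage required
*/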
4844 \f
4845 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4846
4847 static HOST_WIDE_INT
4848 nvptx_vector_alignment (const_tree type)
4849 {
4850 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4851
4852 return MIN (align, BIGGEST_ALIGNMENT);
4853 }
4854
4855 /* Indicate that INSN cannot be duplicated. */
4856
4857 static bool
4858 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4859 {
4860 switch (recog_memoized (insn))
4861 {
4862 case CODE_FOR_nvptx_shufflesi:
4863 case CODE_FOR_nvptx_shufflesf:
4864 case CODE_FOR_nvptx_barsync:
4865 case CODE_FOR_nvptx_fork:
4866 case CODE_FOR_nvptx_forked:
4867 case CODE_FOR_nvptx_joining:
4868 case CODE_FOR_nvptx_join:
4869 return true;
4870 default:
4871 return false;
4872 }
4873 }
4874
4875 /* Section anchors do not work. Initialization for flag_section_anchor
4876 probes the existence of the anchoring target hooks and prevents
4877 anchoring if they don't exist. However, we may be being used with
4878 a host-side compiler that does support anchoring, and hence see
4879 the anchor flag set (as it's not recalculated). So provide an
4880 implementation denying anchoring. */
4881
4882 static bool
4883 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4884 {
4885 return false;
4886 }
4887 \f
4888 /* Record a symbol for mkoffload to enter into the mapping table. */
4889
4890 static void
4891 nvptx_record_offload_symbol (tree decl)
4892 {
4893 switch (TREE_CODE (decl))
4894 {
4895 case VAR_DECL:
4896 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4897 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4898 break;
4899
4900 case FUNCTION_DECL:
4901 {
4902 tree attr = oacc_get_fn_attrib (decl);
4903 /* OpenMP offloading does not set this attribute. */
4904 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4905
4906 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4907 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4908
4909 for (; dims; dims = TREE_CHAIN (dims))
4910 {
4911 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4912
4913 gcc_assert (!TREE_PURPOSE (dims));
4914 fprintf (asm_out_file, ", %#x", size);
4915 }
4916
4917 fprintf (asm_out_file, "\n");
4918 }
4919 break;
4920
4921 default:
4922 gcc_unreachable ();
4923 }
4924 }
4925
4926 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4927 at the start of a file. */
4928
4929 static void
4930 nvptx_file_start (void)
4931 {
4932 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4933 fputs ("\t.version\t3.1\n", asm_out_file);
4934 if (TARGET_SM35)
4935 fputs ("\t.target\tsm_35\n", asm_out_file);
4936 else
4937 fputs ("\t.target\tsm_30\n", asm_out_file);
4938 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4939 fputs ("// END PREAMBLE\n", asm_out_file);
4940 }
4941
4942 /* Emit a declaration for a worker-level buffer in .shared memory. */
4943
4944 static void
4945 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4946 {
4947 const char *name = XSTR (sym, 0);
4948
4949 write_var_marker (file, true, false, name);
4950 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4951 align, name, size);
4952 }
4953
4954 /* Write out the function declarations we've collected and declare storage
4955 for the broadcast buffer. */
4956
4957 static void
4958 nvptx_file_end (void)
4959 {
4960 hash_table<tree_hasher>::iterator iter;
4961 tree decl;
4962 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4963 nvptx_record_fndecl (decl);
4964 fputs (func_decls.str().c_str(), asm_out_file);
4965
4966 if (worker_bcast_size)
4967 write_worker_buffer (asm_out_file, worker_bcast_sym,
4968 worker_bcast_align, worker_bcast_size);
4969
4970 if (worker_red_size)
4971 write_worker_buffer (asm_out_file, worker_red_sym,
4972 worker_red_align, worker_red_size);
4973
4974 if (need_softstack_decl)
4975 {
4976 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4977 /* 32 is the maximum number of warps in a block. Even though it's an
4978 external declaration, emit the array size explicitly; otherwise, it
4979 may fail at PTX JIT time if the definition is later in link order. */
4980 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4981 POINTER_SIZE);
4982 }
4983 if (need_unisimt_decl)
4984 {
4985 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4986 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4987 }
4988 }
4989
4990 /* Expander for the shuffle builtins. */
4991
4992 static rtx
4993 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4994 {
4995 if (ignore)
4996 return target;
4997
4998 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4999 NULL_RTX, mode, EXPAND_NORMAL);
5000 if (!REG_P (src))
5001 src = copy_to_mode_reg (mode, src);
5002
5003 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5004 NULL_RTX, SImode, EXPAND_NORMAL);
5005 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5006 NULL_RTX, SImode, EXPAND_NORMAL);
5007
5008 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5009 idx = copy_to_mode_reg (SImode, idx);
5010
5011 rtx pat = nvptx_gen_shuffle (target, src, idx,
5012 (nvptx_shuffle_kind) INTVAL (op));
5013 if (pat)
5014 emit_insn (pat);
5015
5016 return target;
5017 }
5018
5019 /* Worker reduction address expander. */
5020
5021 static rtx
5022 nvptx_expand_worker_addr (tree exp, rtx target,
5023 machine_mode ARG_UNUSED (mode), int ignore)
5024 {
5025 if (ignore)
5026 return target;
5027
5028 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5029 if (align > worker_red_align)
5030 worker_red_align = align;
5031
5032 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5033 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5034 if (size + offset > worker_red_size)
5035 worker_red_size = size + offset;
5036
5037 rtx addr = worker_red_sym;
5038 if (offset)
5039 {
5040 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5041 addr = gen_rtx_CONST (Pmode, addr);
5042 }
5043
5044 emit_move_insn (target, addr);
5045
5046 return target;
5047 }
5048
5049 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5050 not require taking the address of any object, other than the memory
5051 cell being operated on. */
5052
5053 static rtx
5054 nvptx_expand_cmp_swap (tree exp, rtx target,
5055 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5056 {
5057 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5058
5059 if (!target)
5060 target = gen_reg_rtx (mode);
5061
5062 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5063 NULL_RTX, Pmode, EXPAND_NORMAL);
5064 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5065 NULL_RTX, mode, EXPAND_NORMAL);
5066 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5067 NULL_RTX, mode, EXPAND_NORMAL);
5068 rtx pat;
5069
5070 mem = gen_rtx_MEM (mode, mem);
5071 if (!REG_P (cmp))
5072 cmp = copy_to_mode_reg (mode, cmp);
5073 if (!REG_P (src))
5074 src = copy_to_mode_reg (mode, src);
5075
5076 if (mode == SImode)
5077 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5078 else
5079 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5080
5081 emit_insn (pat);
5082
5083 return target;
5084 }
5085
5086
5087 /* Codes for all the NVPTX builtins. */
5088 enum nvptx_builtins
5089 {
5090 NVPTX_BUILTIN_SHUFFLE,
5091 NVPTX_BUILTIN_SHUFFLELL,
5092 NVPTX_BUILTIN_WORKER_ADDR,
5093 NVPTX_BUILTIN_CMP_SWAP,
5094 NVPTX_BUILTIN_CMP_SWAPLL,
5095 NVPTX_BUILTIN_MAX
5096 };
5097
5098 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5099
5100 /* Return the NVPTX builtin for CODE. */
5101
5102 static tree
5103 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5104 {
5105 if (code >= NVPTX_BUILTIN_MAX)
5106 return error_mark_node;
5107
5108 return nvptx_builtin_decls[code];
5109 }
5110
5111 /* Set up all builtin functions for this target. */
5112
5113 static void
5114 nvptx_init_builtins (void)
5115 {
5116 #define DEF(ID, NAME, T) \
5117 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5118 = add_builtin_function ("__builtin_nvptx_" NAME, \
5119 build_function_type_list T, \
5120 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5121 #define ST sizetype
5122 #define UINT unsigned_type_node
5123 #define LLUINT long_long_unsigned_type_node
5124 #define PTRVOID ptr_type_node
5125
5126 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5127 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5128 DEF (WORKER_ADDR, "worker_addr",
5129 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5130 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5131 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5132
5133 #undef DEF
5134 #undef ST
5135 #undef UINT
5136 #undef LLUINT
5137 #undef PTRVOID
5138 }
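
/* A sketch of how the builtins declared above might be invoked; the argument
   values are illustrative, and SHUFFLE_DOWN is the nvptx_shuffle_kind also
   used by the reduction code later in this file:

     unsigned v   = __builtin_nvptx_shuffle (x, 1, SHUFFLE_DOWN);
     void *red    = __builtin_nvptx_worker_addr (offset, size, align);
     unsigned old = __builtin_nvptx_cmp_swap (ptr, expected, desired);
*/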
5139
5140 /* Expand an expression EXP that calls a built-in function,
5141 with result going to TARGET if that's convenient
5142 (and in mode MODE if that's convenient).
5143 SUBTARGET may be used as the target for computing one of EXP's operands.
5144 IGNORE is nonzero if the value is to be ignored. */
5145
5146 static rtx
5147 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5148 machine_mode mode, int ignore)
5149 {
5150 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5151 switch (DECL_FUNCTION_CODE (fndecl))
5152 {
5153 case NVPTX_BUILTIN_SHUFFLE:
5154 case NVPTX_BUILTIN_SHUFFLELL:
5155 return nvptx_expand_shuffle (exp, target, mode, ignore);
5156
5157 case NVPTX_BUILTIN_WORKER_ADDR:
5158 return nvptx_expand_worker_addr (exp, target, mode, ignore);
5159
5160 case NVPTX_BUILTIN_CMP_SWAP:
5161 case NVPTX_BUILTIN_CMP_SWAPLL:
5162 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5163
5164 default: gcc_unreachable ();
5165 }
5166 }
5167 \f
5168 /* Define dimension sizes for known hardware. */
5169 #define PTX_VECTOR_LENGTH 32
5170 #define PTX_WORKER_LENGTH 32
5171 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5172
5173 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5174
5175 static int
5176 nvptx_simt_vf ()
5177 {
5178 return PTX_VECTOR_LENGTH;
5179 }
5180
5181 /* Validate compute dimensions of an OpenACC offload or routine, fill
5182 in non-unity defaults. FN_LEVEL indicates the level at which a
5183 routine might spawn a loop. It is negative for non-routines. If
5184 DECL is null, we are validating the default dimensions. */
5185
5186 static bool
5187 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5188 {
5189 bool changed = false;
5190 bool oacc_default_dims_p = false;
5191 bool oacc_min_dims_p = false;
5192 bool offload_region_p = false;
5193 bool routine_p = false;
5194 bool routine_seq_p = false;
5195
5196 if (decl == NULL_TREE)
5197 {
5198 if (fn_level == -1)
5199 oacc_default_dims_p = true;
5200 else if (fn_level == -2)
5201 oacc_min_dims_p = true;
5202 else
5203 gcc_unreachable ();
5204 }
5205 else if (fn_level == -1)
5206 offload_region_p = true;
5207 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5208 {
5209 routine_p = true;
5210 routine_seq_p = fn_level == GOMP_DIM_MAX;
5211 }
5212 else
5213 gcc_unreachable ();
5214
5215 /* The vector size must be 32, unless this is a SEQ routine. */
5216 if ((offload_region_p || oacc_default_dims_p
5217 || (routine_p && !routine_seq_p))
5218 && dims[GOMP_DIM_VECTOR] >= 0
5219 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5220 {
5221 if ((offload_region_p || oacc_default_dims_p)
5222 && dims[GOMP_DIM_VECTOR] >= 0)
5223 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5224 dims[GOMP_DIM_VECTOR]
5225 ? G_("using vector_length (%d), ignoring %d")
5226 : G_("using vector_length (%d), ignoring runtime setting"),
5227 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5228 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5229 changed = true;
5230 }
5231
5232 /* Check that num_workers is not too large. */
5233 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5234 {
5235 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5236 "using num_workers (%d), ignoring %d",
5237 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5238 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5239 changed = true;
5240 }
5241
5242 if (oacc_default_dims_p || oacc_min_dims_p)
5243 {
5244 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5245 if (dims[GOMP_DIM_WORKER] < 0)
5246 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5247 if (dims[GOMP_DIM_GANG] < 0)
5248 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5249 changed = true;
5250 }
5251
5252 return changed;
5253 }
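
/* For example (a behavioral sketch of the checks above): an offload region
   requesting vector_length (64) is diagnosed and forced back to
   PTX_VECTOR_LENGTH (32), and num_workers (64) is likewise clamped to
   PTX_WORKER_LENGTH (32). When validating the default dimensions, vector is
   pinned to 32 and unspecified worker/gang dimensions fall back to
   PTX_DEFAULT_RUNTIME_DIM, i.e. they are deferred to the runtime.  */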
5254
5255 /* Return maximum dimension size, or zero for unbounded. */
5256
5257 static int
5258 nvptx_dim_limit (int axis)
5259 {
5260 switch (axis)
5261 {
5262 case GOMP_DIM_VECTOR:
5263 return PTX_VECTOR_LENGTH;
5264
5265 default:
5266 break;
5267 }
5268 return 0;
5269 }
5270
5271 /* Determine whether fork & joins are needed. */
5272
5273 static bool
5274 nvptx_goacc_fork_join (gcall *call, const int dims[],
5275 bool ARG_UNUSED (is_fork))
5276 {
5277 tree arg = gimple_call_arg (call, 2);
5278 unsigned axis = TREE_INT_CST_LOW (arg);
5279
5280 /* We only care about worker and vector partitioning. */
5281 if (axis < GOMP_DIM_WORKER)
5282 return false;
5283
5284 /* If the size is 1, there's no partitioning. */
5285 if (dims[axis] == 1)
5286 return false;
5287
5288 return true;
5289 }
5290
5291 /* Generate a PTX builtin function call that returns the address in
5292 the worker reduction buffer at OFFSET. TYPE is the type of the
5293 data at that location. */
5294
5295 static tree
5296 nvptx_get_worker_red_addr (tree type, tree offset)
5297 {
5298 machine_mode mode = TYPE_MODE (type);
5299 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5300 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5301 tree align = build_int_cst (unsigned_type_node,
5302 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5303 tree call = build_call_expr (fndecl, 3, offset, size, align);
5304
5305 return fold_convert (build_pointer_type (type), call);
5306 }
5307
5308 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR. This function
5309 will cast the variable if necessary. */
5310
5311 static void
5312 nvptx_generate_vector_shuffle (location_t loc,
5313 tree dest_var, tree var, unsigned shift,
5314 gimple_seq *seq)
5315 {
5316 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5317 tree_code code = NOP_EXPR;
5318 tree arg_type = unsigned_type_node;
5319 tree var_type = TREE_TYPE (var);
5320 tree dest_type = var_type;
5321
5322 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5323 var_type = TREE_TYPE (var_type);
5324
5325 if (TREE_CODE (var_type) == REAL_TYPE)
5326 code = VIEW_CONVERT_EXPR;
5327
5328 if (TYPE_SIZE (var_type)
5329 == TYPE_SIZE (long_long_unsigned_type_node))
5330 {
5331 fn = NVPTX_BUILTIN_SHUFFLELL;
5332 arg_type = long_long_unsigned_type_node;
5333 }
5334
5335 tree call = nvptx_builtin_decl (fn, true);
5336 tree bits = build_int_cst (unsigned_type_node, shift);
5337 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5338 tree expr;
5339
5340 if (var_type != dest_type)
5341 {
5342 /* Do real and imaginary parts separately. */
5343 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5344 real = fold_build1 (code, arg_type, real);
5345 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5346 real = fold_build1 (code, var_type, real);
5347
5348 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5349 imag = fold_build1 (code, arg_type, imag);
5350 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5351 imag = fold_build1 (code, var_type, imag);
5352
5353 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5354 }
5355 else
5356 {
5357 expr = fold_build1 (code, arg_type, var);
5358 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5359 expr = fold_build1 (code, dest_type, expr);
5360 }
5361
5362 gimplify_assign (dest_var, expr, seq);
5363 }
5364
5365 /* Lazily generate the global lock var decl and return its address. */
5366
5367 static tree
5368 nvptx_global_lock_addr ()
5369 {
5370 tree v = global_lock_var;
5371
5372 if (!v)
5373 {
5374 tree name = get_identifier ("__reduction_lock");
5375 tree type = build_qualified_type (unsigned_type_node,
5376 TYPE_QUAL_VOLATILE);
5377 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5378 global_lock_var = v;
5379 DECL_ARTIFICIAL (v) = 1;
5380 DECL_EXTERNAL (v) = 1;
5381 TREE_STATIC (v) = 1;
5382 TREE_PUBLIC (v) = 1;
5383 TREE_USED (v) = 1;
5384 mark_addressable (v);
5385 mark_decl_referenced (v);
5386 }
5387
5388 return build_fold_addr_expr (v);
5389 }
5390
5391 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5392 GSI. We use a lockless scheme for nearly all cases, which looks
5393 like:
5394 actual = initval(OP);
5395 do {
5396 guess = actual;
5397 write = guess OP myval;
5398 actual = cmp&swap (ptr, guess, write)
5399 } while (actual bit-different-to guess);
5400 return write;
5401
5402 This relies on a cmp&swap instruction, which is available for 32-
5403 and 64-bit types. Larger types must use a locking scheme. */
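
/* As a concrete (hedged) instance of the scheme above, a float PLUS
   reduction ends up roughly as follows, where "bitcast" stands for the
   VIEW_CONVERT_EXPR handling done below and is purely illustrative:

     unsigned expect, write, actual = bitcast (0.0f);    // initval for PLUS
     do
       {
         expect = actual;
         write  = bitcast (bitcast_to_float (expect) + var);
         actual = __builtin_nvptx_cmp_swap (ptr, expect, write);
       }
     while (actual != expect);
     return bitcast_to_float (write);
*/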
5404
5405 static tree
5406 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5407 tree ptr, tree var, tree_code op)
5408 {
5409 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5410 tree_code code = NOP_EXPR;
5411 tree arg_type = unsigned_type_node;
5412 tree var_type = TREE_TYPE (var);
5413
5414 if (TREE_CODE (var_type) == COMPLEX_TYPE
5415 || TREE_CODE (var_type) == REAL_TYPE)
5416 code = VIEW_CONVERT_EXPR;
5417
5418 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5419 {
5420 arg_type = long_long_unsigned_type_node;
5421 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5422 }
5423
5424 tree swap_fn = nvptx_builtin_decl (fn, true);
5425
5426 gimple_seq init_seq = NULL;
5427 tree init_var = make_ssa_name (arg_type);
5428 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5429 init_expr = fold_build1 (code, arg_type, init_expr);
5430 gimplify_assign (init_var, init_expr, &init_seq);
5431 gimple *init_end = gimple_seq_last (init_seq);
5432
5433 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5434
5435 /* Split the block just after the init stmts. */
5436 basic_block pre_bb = gsi_bb (*gsi);
5437 edge pre_edge = split_block (pre_bb, init_end);
5438 basic_block loop_bb = pre_edge->dest;
5439 pre_bb = pre_edge->src;
5440 /* Reset the iterator. */
5441 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5442
5443 tree expect_var = make_ssa_name (arg_type);
5444 tree actual_var = make_ssa_name (arg_type);
5445 tree write_var = make_ssa_name (arg_type);
5446
5447 /* Build and insert the reduction calculation. */
5448 gimple_seq red_seq = NULL;
5449 tree write_expr = fold_build1 (code, var_type, expect_var);
5450 write_expr = fold_build2 (op, var_type, write_expr, var);
5451 write_expr = fold_build1 (code, arg_type, write_expr);
5452 gimplify_assign (write_var, write_expr, &red_seq);
5453
5454 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5455
5456 /* Build & insert the cmp&swap sequence. */
5457 gimple_seq latch_seq = NULL;
5458 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5459 ptr, expect_var, write_var);
5460 gimplify_assign (actual_var, swap_expr, &latch_seq);
5461
5462 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5463 NULL_TREE, NULL_TREE);
5464 gimple_seq_add_stmt (&latch_seq, cond);
5465
5466 gimple *latch_end = gimple_seq_last (latch_seq);
5467 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5468
5469 /* Split the block just after the latch stmts. */
5470 edge post_edge = split_block (loop_bb, latch_end);
5471 basic_block post_bb = post_edge->dest;
5472 loop_bb = post_edge->src;
5473 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5474
5475 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5476 post_edge->probability = profile_probability::even ();
5477 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5478 loop_edge->probability = profile_probability::even ();
5479 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5480 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5481
5482 gphi *phi = create_phi_node (expect_var, loop_bb);
5483 add_phi_arg (phi, init_var, pre_edge, loc);
5484 add_phi_arg (phi, actual_var, loop_edge, loc);
5485
5486 loop *loop = alloc_loop ();
5487 loop->header = loop_bb;
5488 loop->latch = loop_bb;
5489 add_loop (loop, loop_bb->loop_father);
5490
5491 return fold_build1 (code, var_type, write_var);
5492 }
5493
5494 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5495 GSI. This is necessary for types larger than 64 bits, where there
5496 is no cmp&swap instruction to implement a lockless scheme. We use
5497 a lock variable in global memory.
5498
5499 while (cmp&swap (&lock_var, 0, 1))
5500 continue;
5501 T accum = *ptr;
5502 accum = accum OP var;
5503 *ptr = accum;
5504 cmp&swap (&lock_var, 1, 0);
5505 return accum;
5506
5507 A lock in global memory is necessary to force execution engine
5508 descheduling and avoid resource starvation that can occur if the
5509 lock is in .shared memory. */
5510
5511 static tree
5512 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5513 tree ptr, tree var, tree_code op)
5514 {
5515 tree var_type = TREE_TYPE (var);
5516 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5517 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5518 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5519
5520 /* Split the block just before the gsi. Insert a gimple nop to make
5521 this easier. */
5522 gimple *nop = gimple_build_nop ();
5523 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5524 basic_block entry_bb = gsi_bb (*gsi);
5525 edge entry_edge = split_block (entry_bb, nop);
5526 basic_block lock_bb = entry_edge->dest;
5527 /* Reset the iterator. */
5528 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5529
5530 /* Build and insert the locking sequence. */
5531 gimple_seq lock_seq = NULL;
5532 tree lock_var = make_ssa_name (unsigned_type_node);
5533 tree lock_expr = nvptx_global_lock_addr ();
5534 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5535 uns_unlocked, uns_locked);
5536 gimplify_assign (lock_var, lock_expr, &lock_seq);
5537 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5538 NULL_TREE, NULL_TREE);
5539 gimple_seq_add_stmt (&lock_seq, cond);
5540 gimple *lock_end = gimple_seq_last (lock_seq);
5541 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5542
5543 /* Split the block just after the lock sequence. */
5544 edge locked_edge = split_block (lock_bb, lock_end);
5545 basic_block update_bb = locked_edge->dest;
5546 lock_bb = locked_edge->src;
5547 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5548
5549 /* Create the lock loop ... */
5550 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5551 locked_edge->probability = profile_probability::even ();
5552 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5553 loop_edge->probability = profile_probability::even ();
5554 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5555 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5556
5557 /* ... and the loop structure. */
5558 loop *lock_loop = alloc_loop ();
5559 lock_loop->header = lock_bb;
5560 lock_loop->latch = lock_bb;
5561 lock_loop->nb_iterations_estimate = 1;
5562 lock_loop->any_estimate = true;
5563 add_loop (lock_loop, entry_bb->loop_father);
5564
5565 /* Build and insert the reduction calculation. */
5566 gimple_seq red_seq = NULL;
5567 tree acc_in = make_ssa_name (var_type);
5568 tree ref_in = build_simple_mem_ref (ptr);
5569 TREE_THIS_VOLATILE (ref_in) = 1;
5570 gimplify_assign (acc_in, ref_in, &red_seq);
5571
5572 tree acc_out = make_ssa_name (var_type);
5573 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5574 gimplify_assign (acc_out, update_expr, &red_seq);
5575
5576 tree ref_out = build_simple_mem_ref (ptr);
5577 TREE_THIS_VOLATILE (ref_out) = 1;
5578 gimplify_assign (ref_out, acc_out, &red_seq);
5579
5580 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5581
5582 /* Build & insert the unlock sequence. */
5583 gimple_seq unlock_seq = NULL;
5584 tree unlock_expr = nvptx_global_lock_addr ();
5585 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5586 uns_locked, uns_unlocked);
5587 gimplify_and_add (unlock_expr, &unlock_seq);
5588 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5589
5590 return acc_out;
5591 }
5592
5593 /* Emit a sequence to update a reduction accumulator at *PTR with the
5594 value held in VAR using operator OP. Return the updated value.
5595
5596 TODO: optimize for atomic ops and independent complex ops. */
5597
5598 static tree
5599 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5600 tree ptr, tree var, tree_code op)
5601 {
5602 tree type = TREE_TYPE (var);
5603 tree size = TYPE_SIZE (type);
5604
5605 if (size == TYPE_SIZE (unsigned_type_node)
5606 || size == TYPE_SIZE (long_long_unsigned_type_node))
5607 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5608 else
5609 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5610 }
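/* Editorial note: the dispatch in nvptx_reduction_update is purely
   size-based.  For the type sizes usual on nvptx this means, for example
   (illustrative; assumes 32-bit int/float and 64-bit long long/double):

     int, unsigned, float   (32 bits)   -> nvptx_lockless_update
     long long, double      (64 bits)   -> nvptx_lockless_update
     _Complex double        (128 bits)  -> nvptx_lockfull_update

   i.e. anything that fits a 32- or 64-bit compare-and-swap is updated
   locklessly, and wider types fall back to the global spin lock.  */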
5611
5612 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5613
5614 static void
5615 nvptx_goacc_reduction_setup (gcall *call)
5616 {
5617 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5618 tree lhs = gimple_call_lhs (call);
5619 tree var = gimple_call_arg (call, 2);
5620 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5621 gimple_seq seq = NULL;
5622
5623 push_gimplify_context (true);
5624
5625 if (level != GOMP_DIM_GANG)
5626 {
5627 /* Copy the receiver object. */
5628 tree ref_to_res = gimple_call_arg (call, 1);
5629
5630 if (!integer_zerop (ref_to_res))
5631 var = build_simple_mem_ref (ref_to_res);
5632 }
5633
5634 if (level == GOMP_DIM_WORKER)
5635 {
5636 /* Store incoming value to worker reduction buffer. */
5637 tree offset = gimple_call_arg (call, 5);
5638 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5639 tree ptr = make_ssa_name (TREE_TYPE (call));
5640
5641 gimplify_assign (ptr, call, &seq);
5642 tree ref = build_simple_mem_ref (ptr);
5643 TREE_THIS_VOLATILE (ref) = 1;
5644 gimplify_assign (ref, var, &seq);
5645 }
5646
5647 if (lhs)
5648 gimplify_assign (lhs, var, &seq);
5649
5650 pop_gimplify_context (NULL);
5651 gsi_replace_with_seq (&gsi, seq, true);
5652 }
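/* Editorial sketch, not part of the original source: for a worker-level
   reduction with a receiver object, the SETUP expansion above replaces the
   internal call with GIMPLE roughly like the following (SSA names are
   invented; <red-addr> stands for the tree built by
   nvptx_get_worker_red_addr):

     var_1 = *ref_to_res;     // copy the receiver object
     ptr_2 = <red-addr>;      // this variable's slot in the worker buffer
     *ptr_2 = var_1;          // volatile store of the incoming value
     lhs   = var_1;           // SETUP's result is the value itself
*/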
5653
5654 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5655
5656 static void
5657 nvptx_goacc_reduction_init (gcall *call)
5658 {
5659 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5660 tree lhs = gimple_call_lhs (call);
5661 tree var = gimple_call_arg (call, 2);
5662 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5663 enum tree_code rcode
5664 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5665 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5666 TREE_TYPE (var));
5667 gimple_seq seq = NULL;
5668
5669 push_gimplify_context (true);
5670
5671 if (level == GOMP_DIM_VECTOR)
5672 {
5673 /* Initialize the vector lanes with non-zero position to INIT_VAL (OP). */
5674 tree tid = make_ssa_name (integer_type_node);
5675 tree dim_vector = gimple_call_arg (call, 3);
5676 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5677 dim_vector);
5678 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5679 NULL_TREE, NULL_TREE);
5680
5681 gimple_call_set_lhs (tid_call, tid);
5682 gimple_seq_add_stmt (&seq, tid_call);
5683 gimple_seq_add_stmt (&seq, cond_stmt);
5684
5685 /* Split the block just after the call. */
5686 edge init_edge = split_block (gsi_bb (gsi), call);
5687 basic_block init_bb = init_edge->dest;
5688 basic_block call_bb = init_edge->src;
5689
5690 /* Fixup flags from call_bb to init_bb. */
5691 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5692 init_edge->probability = profile_probability::even ();
5693
5694 /* Set the initialization stmts. */
5695 gimple_seq init_seq = NULL;
5696 tree init_var = make_ssa_name (TREE_TYPE (var));
5697 gimplify_assign (init_var, init, &init_seq);
5698 gsi = gsi_start_bb (init_bb);
5699 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5700
5701 /* Split block just after the init stmt. */
5702 gsi_prev (&gsi);
5703 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5704 basic_block dst_bb = inited_edge->dest;
5705
5706 /* Create false edge from call_bb to dst_bb. */
5707 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5708 nop_edge->probability = profile_probability::even ();
5709
5710 /* Create phi node in dst block. */
5711 gphi *phi = create_phi_node (lhs, dst_bb);
5712 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5713 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5714
5715 /* Reset dominator of dst bb. */
5716 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5717
5718 /* Reset the gsi. */
5719 gsi = gsi_for_stmt (call);
5720 }
5721 else
5722 {
5723 if (level == GOMP_DIM_GANG)
5724 {
5725 /* If there's no receiver object, propagate the incoming VAR. */
5726 tree ref_to_res = gimple_call_arg (call, 1);
5727 if (integer_zerop (ref_to_res))
5728 init = var;
5729 }
5730
5731 gimplify_assign (lhs, init, &seq);
5732 }
5733
5734 pop_gimplify_context (NULL);
5735 gsi_replace_with_seq (&gsi, seq, true);
5736 }
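/* Editorial sketch, not part of the original source: for a vector-level
   reduction the INIT expansion above produces a CFG roughly like this
   (block and SSA names are invented):

     call_bb:
       tid_1 = GOACC_DIM_POS (GOMP_DIM_VECTOR);
       if (tid_1 != 0) goto init_bb; else goto dst_bb;

     init_bb:
       init_2 = <neutral element of the reduction operator>;

     dst_bb:
       lhs = PHI <init_2 (init_bb), var (call_bb)>;

   Only the lane at vector position 0 keeps the incoming value VAR; every
   other lane starts from the operator's neutral element (0 for PLUS_EXPR,
   1 for MULT_EXPR, and so on), so the shuffle reduction in the FINI
   expansion below combines the lanes correctly.  */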
5737
5738 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5739
5740 static void
5741 nvptx_goacc_reduction_fini (gcall *call)
5742 {
5743 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5744 tree lhs = gimple_call_lhs (call);
5745 tree ref_to_res = gimple_call_arg (call, 1);
5746 tree var = gimple_call_arg (call, 2);
5747 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5748 enum tree_code op
5749 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5750 gimple_seq seq = NULL;
5751 tree r = NULL_TREE;
5752
5753 push_gimplify_context (true);
5754
5755 if (level == GOMP_DIM_VECTOR)
5756 {
5757 /* Emit binary shuffle tree. TODO: Emit this as an actual loop,
5758 but that requires a method of emitting a unified jump at the
5759 gimple level. */
5760 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5761 {
5762 tree other_var = make_ssa_name (TREE_TYPE (var));
5763 nvptx_generate_vector_shuffle (gimple_location (call),
5764 other_var, var, shfl, &seq);
5765
5766 r = make_ssa_name (TREE_TYPE (var));
5767 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5768 var, other_var), &seq);
5769 var = r;
5770 }
5771 }
5772 else
5773 {
5774 tree accum = NULL_TREE;
5775
5776 if (level == GOMP_DIM_WORKER)
5777 {
5778 /* Get reduction buffer address. */
5779 tree offset = gimple_call_arg (call, 5);
5780 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5781 tree ptr = make_ssa_name (TREE_TYPE (call));
5782
5783 gimplify_assign (ptr, call, &seq);
5784 accum = ptr;
5785 }
5786 else if (integer_zerop (ref_to_res))
5787 r = var;
5788 else
5789 accum = ref_to_res;
5790
5791 if (accum)
5792 {
5793 /* UPDATE the accumulator. */
5794 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5795 seq = NULL;
5796 r = nvptx_reduction_update (gimple_location (call), &gsi,
5797 accum, var, op);
5798 }
5799 }
5800
5801 if (lhs)
5802 gimplify_assign (lhs, r, &seq);
5803 pop_gimplify_context (NULL);
5804
5805 gsi_replace_with_seq (&gsi, seq, true);
5806 }
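/* Editorial sketch, not part of the original source: the vector-level FINI
   expansion above unrolls a shuffle-based tree reduction over the lanes of
   a warp.  Written as the loop the TODO above asks for, and assuming
   PTX_VECTOR_LENGTH is 32 (the warp size) and that
   nvptx_generate_vector_shuffle emits a warp shuffle-down, it is roughly:

     for (shfl = 16; shfl > 0; shfl >>= 1)
       {
         other = shuffle_down (var, shfl);  // value held by lane id + shfl
         var = var OP other;
       }
     r = var;

   After the five steps lane 0 holds the reduction over all lanes.  */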
5807
5808 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5809
5810 static void
5811 nvptx_goacc_reduction_teardown (gcall *call)
5812 {
5813 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5814 tree lhs = gimple_call_lhs (call);
5815 tree var = gimple_call_arg (call, 2);
5816 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5817 gimple_seq seq = NULL;
5818
5819 push_gimplify_context (true);
5820 if (level == GOMP_DIM_WORKER)
5821 {
5822 /* Read the worker reduction buffer. */
5823 tree offset = gimple_call_arg (call, 5);
5824 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5825 tree ptr = make_ssa_name (TREE_TYPE (call));
5826
5827 gimplify_assign (ptr, call, &seq);
5828 var = build_simple_mem_ref (ptr);
5829 TREE_THIS_VOLATILE (var) = 1;
5830 }
5831
5832 if (level != GOMP_DIM_GANG)
5833 {
5834 /* Write to the receiver object. */
5835 tree ref_to_res = gimple_call_arg (call, 1);
5836
5837 if (!integer_zerop (ref_to_res))
5838 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5839 }
5840
5841 if (lhs)
5842 gimplify_assign (lhs, var, &seq);
5843
5844 pop_gimplify_context (NULL);
5845
5846 gsi_replace_with_seq (&gsi, seq, true);
5847 }
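/* Editorial sketch, not part of the original source: for a worker-level
   reduction with a receiver object, the TEARDOWN expansion above emits
   GIMPLE roughly like the following (SSA names are invented; <red-addr>
   stands for the tree built by nvptx_get_worker_red_addr):

     ptr_1 = <red-addr>;      // this variable's slot in the worker buffer
     tmp_2 = *ptr_1;          // volatile read of the reduced value
     *ref_to_res = tmp_2;     // write it back to the receiver object
     lhs = tmp_2;             // TEARDOWN's result

   At gang level neither the buffer read nor the receiver store is emitted;
   the incoming value is simply forwarded to LHS.  */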
5848
5849 /* NVPTX reduction expander. */
5850
5851 static void
5852 nvptx_goacc_reduction (gcall *call)
5853 {
5854 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5855
5856 switch (code)
5857 {
5858 case IFN_GOACC_REDUCTION_SETUP:
5859 nvptx_goacc_reduction_setup (call);
5860 break;
5861
5862 case IFN_GOACC_REDUCTION_INIT:
5863 nvptx_goacc_reduction_init (call);
5864 break;
5865
5866 case IFN_GOACC_REDUCTION_FINI:
5867 nvptx_goacc_reduction_fini (call);
5868 break;
5869
5870 case IFN_GOACC_REDUCTION_TEARDOWN:
5871 nvptx_goacc_reduction_teardown (call);
5872 break;
5873
5874 default:
5875 gcc_unreachable ();
5876 }
5877 }
5878
5879 static bool
5880 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5881 rtx x ATTRIBUTE_UNUSED)
5882 {
5883 return true;
5884 }
5885
5886 static bool
5887 nvptx_vector_mode_supported (machine_mode mode)
5888 {
5889 return (mode == V2SImode
5890 || mode == V2DImode);
5891 }
5892
5893 /* Return the preferred mode for vectorizing scalar MODE. */
5894
5895 static machine_mode
5896 nvptx_preferred_simd_mode (scalar_mode mode)
5897 {
5898 switch (mode)
5899 {
5900 case E_DImode:
5901 return V2DImode;
5902 case E_SImode:
5903 return V2SImode;
5904
5905 default:
5906 return default_preferred_simd_mode (mode);
5907 }
5908 }
5909
5910 unsigned int
5911 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5912 {
5913 if (TREE_CODE (type) == INTEGER_TYPE)
5914 {
5915 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5916 if (size == GET_MODE_SIZE (TImode))
5917 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5918 }
5919
5920 return basic_align;
5921 }
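/* Editorial note: the only case nvptx_data_alignment changes is an
   INTEGER_TYPE whose size equals that of TImode.  Illustrative example
   (assumes maybe_split_mode splits TImode into DImode, matching the
   register-splitting done elsewhere in this file):

     __int128 x;   // TYPE_SIZE_UNIT (type) == GET_MODE_SIZE (TImode)
                   // -> aligned to GET_MODE_BITSIZE (DImode), i.e. 64 bits

   so TImode-sized integers get the alignment of the mode they are split
   into rather than a full 128-bit alignment.  */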
5922
5923 /* Implement TARGET_MODES_TIEABLE_P. */
5924
5925 static bool
5926 nvptx_modes_tieable_p (machine_mode, machine_mode)
5927 {
5928 return false;
5929 }
5930
5931 /* Implement TARGET_HARD_REGNO_NREGS. */
5932
5933 static unsigned int
5934 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5935 {
5936 return 1;
5937 }
5938
5939 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5940
5941 static bool
5942 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5943 {
5944 return false;
5945 }
5946
5947 #undef TARGET_OPTION_OVERRIDE
5948 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5949
5950 #undef TARGET_ATTRIBUTE_TABLE
5951 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5952
5953 #undef TARGET_LRA_P
5954 #define TARGET_LRA_P hook_bool_void_false
5955
5956 #undef TARGET_LEGITIMATE_ADDRESS_P
5957 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5958
5959 #undef TARGET_PROMOTE_FUNCTION_MODE
5960 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5961
5962 #undef TARGET_FUNCTION_ARG
5963 #define TARGET_FUNCTION_ARG nvptx_function_arg
5964 #undef TARGET_FUNCTION_INCOMING_ARG
5965 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5966 #undef TARGET_FUNCTION_ARG_ADVANCE
5967 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5968 #undef TARGET_FUNCTION_ARG_BOUNDARY
5969 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5970 #undef TARGET_PASS_BY_REFERENCE
5971 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5972 #undef TARGET_FUNCTION_VALUE_REGNO_P
5973 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5974 #undef TARGET_FUNCTION_VALUE
5975 #define TARGET_FUNCTION_VALUE nvptx_function_value
5976 #undef TARGET_LIBCALL_VALUE
5977 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5978 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5979 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5980 #undef TARGET_GET_DRAP_RTX
5981 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5982 #undef TARGET_SPLIT_COMPLEX_ARG
5983 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5984 #undef TARGET_RETURN_IN_MEMORY
5985 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5986 #undef TARGET_OMIT_STRUCT_RETURN_REG
5987 #define TARGET_OMIT_STRUCT_RETURN_REG true
5988 #undef TARGET_STRICT_ARGUMENT_NAMING
5989 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5990 #undef TARGET_CALL_ARGS
5991 #define TARGET_CALL_ARGS nvptx_call_args
5992 #undef TARGET_END_CALL_ARGS
5993 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5994
5995 #undef TARGET_ASM_FILE_START
5996 #define TARGET_ASM_FILE_START nvptx_file_start
5997 #undef TARGET_ASM_FILE_END
5998 #define TARGET_ASM_FILE_END nvptx_file_end
5999 #undef TARGET_ASM_GLOBALIZE_LABEL
6000 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6001 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6002 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6003 #undef TARGET_PRINT_OPERAND
6004 #define TARGET_PRINT_OPERAND nvptx_print_operand
6005 #undef TARGET_PRINT_OPERAND_ADDRESS
6006 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6007 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6008 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6009 #undef TARGET_ASM_INTEGER
6010 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6011 #undef TARGET_ASM_DECL_END
6012 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6013 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6014 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6015 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6016 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6017 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6018 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6019
6020 #undef TARGET_MACHINE_DEPENDENT_REORG
6021 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6022 #undef TARGET_NO_REGISTER_ALLOCATION
6023 #define TARGET_NO_REGISTER_ALLOCATION true
6024
6025 #undef TARGET_ENCODE_SECTION_INFO
6026 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6027 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6028 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6029
6030 #undef TARGET_VECTOR_ALIGNMENT
6031 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6032
6033 #undef TARGET_CANNOT_COPY_INSN_P
6034 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6035
6036 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6037 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6038
6039 #undef TARGET_INIT_BUILTINS
6040 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6041 #undef TARGET_EXPAND_BUILTIN
6042 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6043 #undef TARGET_BUILTIN_DECL
6044 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6045
6046 #undef TARGET_SIMT_VF
6047 #define TARGET_SIMT_VF nvptx_simt_vf
6048
6049 #undef TARGET_GOACC_VALIDATE_DIMS
6050 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6051
6052 #undef TARGET_GOACC_DIM_LIMIT
6053 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6054
6055 #undef TARGET_GOACC_FORK_JOIN
6056 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6057
6058 #undef TARGET_GOACC_REDUCTION
6059 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6060
6061 #undef TARGET_CANNOT_FORCE_CONST_MEM
6062 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6063
6064 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6065 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6066
6067 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6068 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6069 nvptx_preferred_simd_mode
6070
6071 #undef TARGET_MODES_TIEABLE_P
6072 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6073
6074 #undef TARGET_HARD_REGNO_NREGS
6075 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6076
6077 #undef TARGET_CAN_CHANGE_MODE_CLASS
6078 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6079
6080 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6081 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6082
6083 struct gcc_target targetm = TARGET_INITIALIZER;
6084
6085 #include "gt-nvptx.h"