1 /* Target code for NVPTX.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "alias.h"
40 #include "insn-flags.h"
41 #include "output.h"
42 #include "insn-attr.h"
43 #include "flags.h"
44 #include "dojump.h"
45 #include "explow.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "stmt.h"
49 #include "expr.h"
50 #include "tm-preds.h"
51 #include "tm-constrs.h"
52 #include "langhooks.h"
53 #include "dbxout.h"
54 #include "cfgrtl.h"
55 #include "gimple.h"
56 #include "stor-layout.h"
57 #include "builtins.h"
58 #include "omp-general.h"
59 #include "omp-low.h"
60 #include "gomp-constants.h"
61 #include "dumpfile.h"
62 #include "internal-fn.h"
63 #include "gimple-iterator.h"
64 #include "stringpool.h"
65 #include "attribs.h"
66 #include "tree-vrp.h"
67 #include "tree-ssa-operands.h"
68 #include "tree-ssanames.h"
69 #include "gimplify.h"
70 #include "tree-phinodes.h"
71 #include "cfgloop.h"
72 #include "fold-const.h"
73 #include "intl.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 #define WORKAROUND_PTXJIT_BUG 1
79
80 /* The various PTX memory areas an object might reside in. */
81 enum nvptx_data_area
82 {
83 DATA_AREA_GENERIC,
84 DATA_AREA_GLOBAL,
85 DATA_AREA_SHARED,
86 DATA_AREA_LOCAL,
87 DATA_AREA_CONST,
88 DATA_AREA_PARAM,
89 DATA_AREA_MAX
90 };
91
92 /* We record the data area in the target symbol flags. */
93 #define SYMBOL_DATA_AREA(SYM) \
94 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
95 & 7)
96 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
97 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
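/* Usage sketch (illustrative): after
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
   a later SYMBOL_DATA_AREA (sym) yields DATA_AREA_SHARED, and
   section_for_sym below maps that back to the ".shared" string.  */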
98
99 /* Record the function decls we've written, and the libfuncs and function
100 decls corresponding to them. */
101 static std::stringstream func_decls;
102
103 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
104 {
105 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
106 static bool equal (rtx a, rtx b) { return a == b; }
107 };
108
109 static GTY((cache))
110 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
111
112 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
113 {
114 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
115 static bool equal (tree a, tree b) { return a == b; }
116 };
117
118 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
119 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
120
121 /* Buffer needed to broadcast across workers. This is used for both
122 worker-neutering and worker broadcasting. It is shared by all
123 functions emitted. The buffer is placed in shared memory. It'd be
124 nice if PTX supported common blocks, because then this could be
125 shared across TUs (taking the largest size). */
126 static unsigned worker_bcast_size;
127 static unsigned worker_bcast_align;
128 static GTY(()) rtx worker_bcast_sym;
129
130 /* Buffer needed for worker reductions. This has to be distinct from
131 the worker broadcast array, as both may be live concurrently. */
132 static unsigned worker_red_size;
133 static unsigned worker_red_align;
134 static GTY(()) rtx worker_red_sym;
135
136 /* Global lock variable, needed for 128-bit worker & gang reductions. */
137 static GTY(()) tree global_lock_var;
138
139 /* True if any function references __nvptx_stacks. */
140 static bool need_softstack_decl;
141
142 /* True if any function references __nvptx_uni. */
143 static bool need_unisimt_decl;
144
145 /* Allocate a new, cleared machine_function structure. */
146
147 static struct machine_function *
148 nvptx_init_machine_status (void)
149 {
150 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
151 p->return_mode = VOIDmode;
152 return p;
153 }
154
155 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
156 and -fopenacc is also enabled. */
157
158 static void
159 diagnose_openacc_conflict (bool optval, const char *optname)
160 {
161 if (flag_openacc && optval)
162 error ("option %s is not supported together with -fopenacc", optname);
163 }
164
165 /* Implement TARGET_OPTION_OVERRIDE. */
166
167 static void
168 nvptx_option_override (void)
169 {
170 init_machine_status = nvptx_init_machine_status;
171
172 /* Set toplevel_reorder, unless explicitly disabled. We need
173 reordering so that we emit necessary assembler decls of
174 undeclared variables. */
175 if (!global_options_set.x_flag_toplevel_reorder)
176 flag_toplevel_reorder = 1;
177
178 /* Set flag_no_common, unless explicitly disabled. We fake common
179 using .weak, and that's not entirely accurate, so avoid it
180 unless forced. */
181 if (!global_options_set.x_flag_no_common)
182 flag_no_common = 1;
183
184 /* The patch area requires nops, which we don't have. */
185 if (function_entry_patch_area_size > 0)
186 sorry ("not generating patch area, nops not supported");
187
188 /* The var-tracking pass assumes it will see only hard registers. */
189 flag_var_tracking = 0;
190
191 if (nvptx_optimize < 0)
192 nvptx_optimize = optimize > 0;
193
194 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
195 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
196 declared_libfuncs_htab
197 = hash_table<declared_libfunc_hasher>::create_ggc (17);
198
199 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
200 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
201 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
202
203 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
204 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
205 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
206
207 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
208 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
209 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
210
211 if (TARGET_GOMP)
212 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
213 }
214
215 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
216 deal with ptx idiosyncrasies. */
217
218 const char *
219 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
220 {
221 switch (mode)
222 {
223 case E_BLKmode:
224 return ".b8";
225 case E_BImode:
226 return ".pred";
227 case E_QImode:
228 if (promote)
229 return ".u32";
230 else
231 return ".u8";
232 case E_HImode:
233 return ".u16";
234 case E_SImode:
235 return ".u32";
236 case E_DImode:
237 return ".u64";
238
239 case E_SFmode:
240 return ".f32";
241 case E_DFmode:
242 return ".f64";
243
244 case E_V2SImode:
245 return ".v2.u32";
246 case E_V2DImode:
247 return ".v2.u64";
248
249 default:
250 gcc_unreachable ();
251 }
252 }
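/* For example, nvptx_ptx_type_from_mode (SImode, false) returns ".u32",
   and (QImode, true) also returns ".u32", so that promoted sub-word
   values occupy a full 32-bit register.  */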
253
254 /* Encode the PTX data area that DECL (which might not actually be a
255 _DECL) should reside in. */
256
257 static void
258 nvptx_encode_section_info (tree decl, rtx rtl, int first)
259 {
260 default_encode_section_info (decl, rtl, first);
261 if (first && MEM_P (rtl))
262 {
263 nvptx_data_area area = DATA_AREA_GENERIC;
264
265 if (TREE_CONSTANT (decl))
266 area = DATA_AREA_CONST;
267 else if (TREE_CODE (decl) == VAR_DECL)
268 {
269 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
270 {
271 area = DATA_AREA_SHARED;
272 if (DECL_INITIAL (decl))
273 error ("static initialization of variable %q+D in %<.shared%>"
274 " memory is not supported", decl);
275 }
276 else
277 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
278 }
279
280 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
281 }
282 }
283
284 /* Return the PTX name of the data area in which SYM should be
285 placed. The symbol must have already been processed by
286 nvptx_encode_section_info, or equivalent. */
287
288 static const char *
289 section_for_sym (rtx sym)
290 {
291 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
292 /* Same order as nvptx_data_area enum. */
293 static char const *const areas[] =
294 {"", ".global", ".shared", ".local", ".const", ".param"};
295
296 return areas[area];
297 }
298
299 /* Similarly for a decl. */
300
301 static const char *
302 section_for_decl (const_tree decl)
303 {
304 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
305 }
306
307 /* Check NAME for special function names and redirect them by returning a
308 replacement. This applies to malloc, free and realloc, for which we
309 want to use libgcc wrappers, and call, which triggers a bug in
310 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
311 not active in an offload compiler -- the names are all set by the
312 host-side compiler. */
313
314 static const char *
315 nvptx_name_replacement (const char *name)
316 {
317 if (strcmp (name, "call") == 0)
318 return "__nvptx_call";
319 if (strcmp (name, "malloc") == 0)
320 return "__nvptx_malloc";
321 if (strcmp (name, "free") == 0)
322 return "__nvptx_free";
323 if (strcmp (name, "realloc") == 0)
324 return "__nvptx_realloc";
325 return name;
326 }
327
328 /* If MODE should be treated as two registers of an inner mode, return
329 that inner mode. Otherwise return VOIDmode. */
330
331 static machine_mode
332 maybe_split_mode (machine_mode mode)
333 {
334 if (COMPLEX_MODE_P (mode))
335 return GET_MODE_INNER (mode);
336
337 if (mode == TImode)
338 return DImode;
339
340 return VOIDmode;
341 }
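/* For example, TImode is split into two DImode registers and DCmode
   (complex double) into two DFmode registers, while SImode is not split
   and yields VOIDmode here.  */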
342
343 /* Return true if mode should be treated as two registers. */
344
345 static bool
346 split_mode_p (machine_mode mode)
347 {
348 return maybe_split_mode (mode) != VOIDmode;
349 }
350
351 /* Output a register, subreg, or register pair (with optional
352 enclosing braces). */
353
354 static void
355 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
356 int subreg_offset = -1)
357 {
358 if (inner_mode == VOIDmode)
359 {
360 if (HARD_REGISTER_NUM_P (regno))
361 fprintf (file, "%s", reg_names[regno]);
362 else
363 fprintf (file, "%%r%d", regno);
364 }
365 else if (subreg_offset >= 0)
366 {
367 output_reg (file, regno, VOIDmode);
368 fprintf (file, "$%d", subreg_offset);
369 }
370 else
371 {
372 if (subreg_offset == -1)
373 fprintf (file, "{");
374 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
375 fprintf (file, ",");
376 output_reg (file, regno, inner_mode, 0);
377 if (subreg_offset == -1)
378 fprintf (file, "}");
379 }
380 }
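/* Illustration: for a TImode pseudo 23 the default SUBREG_OFFSET of -1
   prints the brace-enclosed pair "{%r23$8,%r23$0}", whereas the register
   declarations in nvptx_declare_function_name pass -2 to get the same
   pair without braces.  */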
381
382 /* Emit forking instructions for MASK. */
383
384 static void
385 nvptx_emit_forking (unsigned mask, bool is_call)
386 {
387 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
388 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
389 if (mask)
390 {
391 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
392
393 /* Emit fork at all levels. This helps form SESE regions, as
394 it creates a block with a single successor before entering a
395 partitioned region. That is a good candidate for the end of
396 an SESE region. */
397 if (!is_call)
398 emit_insn (gen_nvptx_fork (op));
399 emit_insn (gen_nvptx_forked (op));
400 }
401 }
402
403 /* Emit joining instructions for MASK. */
404
405 static void
406 nvptx_emit_joining (unsigned mask, bool is_call)
407 {
408 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
409 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
410 if (mask)
411 {
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
413
414 /* Emit joining for all non-call pars to ensure there's a single
415 predecessor for the block the join insn ends up in. This is
416 needed for skipping entire loops. */
417 if (!is_call)
418 emit_insn (gen_nvptx_joining (op));
419 emit_insn (gen_nvptx_join (op));
420 }
421 }
422
423 \f
424 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
425 returned in memory. Integer and floating types supported by the
426 machine are passed in registers, everything else is passed in
427 memory. Complex types are split. */
428
429 static bool
430 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
431 {
432 if (type)
433 {
434 if (AGGREGATE_TYPE_P (type))
435 return true;
436 if (TREE_CODE (type) == VECTOR_TYPE)
437 return true;
438 }
439
440 if (!for_return && COMPLEX_MODE_P (mode))
441 /* Complex types are passed as two underlying args. */
442 mode = GET_MODE_INNER (mode);
443
444 if (GET_MODE_CLASS (mode) != MODE_INT
445 && GET_MODE_CLASS (mode) != MODE_FLOAT)
446 return true;
447
448 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
449 return true;
450
451 return false;
452 }
453
454 /* A non-memory argument of mode MODE is being passed; determine the mode it
455 should be promoted to. This is also used for determining return
456 type promotion. */
457
458 static machine_mode
459 promote_arg (machine_mode mode, bool prototyped)
460 {
461 if (!prototyped && mode == SFmode)
462 /* K&R float promotion for unprototyped functions. */
463 mode = DFmode;
464 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
465 mode = SImode;
466
467 return mode;
468 }
469
470 /* A non-memory return type of MODE is being returned. Determine the
471 mode it should be promoted to. */
472
473 static machine_mode
474 promote_return (machine_mode mode)
475 {
476 return promote_arg (mode, true);
477 }
478
479 /* Implement TARGET_FUNCTION_ARG. */
480
481 static rtx
482 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
483 const_tree, bool named)
484 {
485 if (mode == VOIDmode || !named)
486 return NULL_RTX;
487
488 return gen_reg_rtx (mode);
489 }
490
491 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
492
493 static rtx
494 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
495 const_tree, bool named)
496 {
497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
498
499 if (mode == VOIDmode || !named)
500 return NULL_RTX;
501
502 /* No need to deal with split modes here, the only case that can
503 happen is complex modes and those are dealt with by
504 TARGET_SPLIT_COMPLEX_ARG. */
505 return gen_rtx_UNSPEC (mode,
506 gen_rtvec (1, GEN_INT (cum->count)),
507 UNSPEC_ARG_REG);
508 }
509
510 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
511
512 static void
513 nvptx_function_arg_advance (cumulative_args_t cum_v,
514 machine_mode ARG_UNUSED (mode),
515 const_tree ARG_UNUSED (type),
516 bool ARG_UNUSED (named))
517 {
518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
519
520 cum->count++;
521 }
522
523 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
524
525 For nvptx, this is only used for variadic args. The type has already
526 been promoted and/or converted to invisible reference. */
527
528 static unsigned
529 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
530 {
531 return GET_MODE_ALIGNMENT (mode);
532 }
533
534 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
535
536 For nvptx, we know how to handle functions declared as stdarg: by
537 passing an extra pointer to the unnamed arguments. However, the
538 Fortran frontend can produce a different situation, where a
539 function pointer is declared with no arguments, but the actual
540 function and calls to it take more arguments. In that case, we
541 want to ensure the call matches the definition of the function. */
542
543 static bool
544 nvptx_strict_argument_naming (cumulative_args_t cum_v)
545 {
546 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
547
548 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
549 }
550
551 /* Implement TARGET_LIBCALL_VALUE. */
552
553 static rtx
554 nvptx_libcall_value (machine_mode mode, const_rtx)
555 {
556 if (!cfun || !cfun->machine->doing_call)
557 /* Pretend to return in a hard reg for early uses before pseudos can be
558 generated. */
559 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
560
561 return gen_reg_rtx (mode);
562 }
563
564 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
565 where function FUNC returns or receives a value of data type TYPE. */
566
567 static rtx
568 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
569 bool outgoing)
570 {
571 machine_mode mode = promote_return (TYPE_MODE (type));
572
573 if (outgoing)
574 {
575 gcc_assert (cfun);
576 cfun->machine->return_mode = mode;
577 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
578 }
579
580 return nvptx_libcall_value (mode, NULL_RTX);
581 }
582
583 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
584
585 static bool
586 nvptx_function_value_regno_p (const unsigned int regno)
587 {
588 return regno == NVPTX_RETURN_REGNUM;
589 }
590
591 /* Types with a mode other than those supported by the machine are passed by
592 reference in memory. */
593
594 static bool
595 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
596 machine_mode mode, const_tree type,
597 bool ARG_UNUSED (named))
598 {
599 return pass_in_memory (mode, type, false);
600 }
601
602 /* Implement TARGET_RETURN_IN_MEMORY. */
603
604 static bool
605 nvptx_return_in_memory (const_tree type, const_tree)
606 {
607 return pass_in_memory (TYPE_MODE (type), type, true);
608 }
609
610 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
611
612 static machine_mode
613 nvptx_promote_function_mode (const_tree type, machine_mode mode,
614 int *ARG_UNUSED (punsignedp),
615 const_tree funtype, int for_return)
616 {
617 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
618 }
619
620 /* Helper for write_arg. Emit a single PTX argument of MODE, either
621 in a prototype, or as copy in a function prologue. ARGNO is the
622 index of this argument in the PTX function. FOR_REG is negative
623 if we're emitting the PTX prototype. It is zero if we're copying
624 to an argument register and it is greater than zero if we're
625 copying to a specific hard register. */
626
627 static int
628 write_arg_mode (std::stringstream &s, int for_reg, int argno,
629 machine_mode mode)
630 {
631 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
632
633 if (for_reg < 0)
634 {
635 /* Writing PTX prototype. */
636 s << (argno ? ", " : " (");
637 s << ".param" << ptx_type << " %in_ar" << argno;
638 }
639 else
640 {
641 s << "\t.reg" << ptx_type << " ";
642 if (for_reg)
643 s << reg_names[for_reg];
644 else
645 s << "%ar" << argno;
646 s << ";\n";
647 if (argno >= 0)
648 {
649 s << "\tld.param" << ptx_type << " ";
650 if (for_reg)
651 s << reg_names[for_reg];
652 else
653 s << "%ar" << argno;
654 s << ", [%in_ar" << argno << "];\n";
655 }
656 }
657 return argno + 1;
658 }
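/* Illustration (SImode, argument 0): the prototype case appends
   " (.param.u32 %in_ar0", while the prologue-copy case emits
   "\t.reg.u32 %ar0;" followed by "\tld.param.u32 %ar0, [%in_ar0];".  */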
659
660 /* Process function parameter TYPE to emit one or more PTX
661 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
662 is true if this is a prototyped function, rather than an old-style
663 C declaration. Returns the next argument number to use.
664
665 The promotion behavior here must match the regular GCC function
666 parameter marshalling machinery. */
667
668 static int
669 write_arg_type (std::stringstream &s, int for_reg, int argno,
670 tree type, bool prototyped)
671 {
672 machine_mode mode = TYPE_MODE (type);
673
674 if (mode == VOIDmode)
675 return argno;
676
677 if (pass_in_memory (mode, type, false))
678 mode = Pmode;
679 else
680 {
681 bool split = TREE_CODE (type) == COMPLEX_TYPE;
682
683 if (split)
684 {
685 /* Complex types are sent as two separate args. */
686 type = TREE_TYPE (type);
687 mode = TYPE_MODE (type);
688 prototyped = true;
689 }
690
691 mode = promote_arg (mode, prototyped);
692 if (split)
693 argno = write_arg_mode (s, for_reg, argno, mode);
694 }
695
696 return write_arg_mode (s, for_reg, argno, mode);
697 }
698
699 /* Emit a PTX return as a prototype or function prologue declaration
700 for MODE. */
701
702 static void
703 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
704 {
705 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
706 const char *pfx = "\t.reg";
707 const char *sfx = ";\n";
708
709 if (for_proto)
710 pfx = "(.param", sfx = "_out) ";
711
712 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
713 }
714
715 /* Process a function return TYPE to emit a PTX return as a prototype
716 or function prologue declaration. Returns true if return is via an
717 additional pointer parameter. The promotion behavior here must
718 match the regular GCC function return marshalling. */
719
720 static bool
721 write_return_type (std::stringstream &s, bool for_proto, tree type)
722 {
723 machine_mode mode = TYPE_MODE (type);
724
725 if (mode == VOIDmode)
726 return false;
727
728 bool return_in_mem = pass_in_memory (mode, type, true);
729
730 if (return_in_mem)
731 {
732 if (for_proto)
733 return return_in_mem;
734
735 /* Named return values can cause us to return a pointer as well
736 as expect an argument for the return location. This is
737 optimization-level specific, so no caller can make use of
738 this data, but more importantly for us, we must ensure it
739 doesn't change the PTX prototype. */
740 mode = (machine_mode) cfun->machine->return_mode;
741
742 if (mode == VOIDmode)
743 return return_in_mem;
744
745 /* Clear return_mode to inhibit copy of retval to non-existent
746 retval parameter. */
747 cfun->machine->return_mode = VOIDmode;
748 }
749 else
750 mode = promote_return (mode);
751
752 write_return_mode (s, for_proto, mode);
753
754 return return_in_mem;
755 }
756
757 /* Look for attributes in ATTRS that would indicate we must write a function
758 as a .entry kernel rather than a .func. Return true if one is found. */
759
760 static bool
761 write_as_kernel (tree attrs)
762 {
763 return (lookup_attribute ("kernel", attrs) != NULL_TREE
764 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
765 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
766 /* For OpenMP target regions, the corresponding kernel entry is emitted from
767 write_omp_entry as a separate function. */
768 }
769
770 /* Emit a linker marker for a function decl or defn. */
771
772 static void
773 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
774 const char *name)
775 {
776 s << "\n// BEGIN";
777 if (globalize)
778 s << " GLOBAL";
779 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
780 s << name << "\n";
781 }
782
783 /* Emit a linker marker for a variable decl or defn. */
784
785 static void
786 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
787 {
788 fprintf (file, "\n// BEGIN%s VAR %s: ",
789 globalize ? " GLOBAL" : "",
790 is_defn ? "DEF" : "DECL");
791 assemble_name_raw (file, name);
792 fputs ("\n", file);
793 }
794
795 /* Write a .func or .entry declaration or definition along with
796 a helper comment for use by ld. S is the stream to write to, DECL
797 the decl for the function with name NAME. For definitions, emit
798 a declaration too. */
799
800 static const char *
801 write_fn_proto (std::stringstream &s, bool is_defn,
802 const char *name, const_tree decl)
803 {
804 if (is_defn)
805 /* Emit a declaration. The PTX assembler gets upset without it. */
806 name = write_fn_proto (s, false, name, decl);
807 else
808 {
809 /* Avoid repeating the name replacement. */
810 name = nvptx_name_replacement (name);
811 if (name[0] == '*')
812 name++;
813 }
814
815 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
816
817 /* PTX declaration. */
818 if (DECL_EXTERNAL (decl))
819 s << ".extern ";
820 else if (TREE_PUBLIC (decl))
821 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
822 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
823
824 tree fntype = TREE_TYPE (decl);
825 tree result_type = TREE_TYPE (fntype);
826
827 /* atomic_compare_exchange_$n builtins have an exceptional calling
828 convention. */
829 int not_atomic_weak_arg = -1;
830 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
831 switch (DECL_FUNCTION_CODE (decl))
832 {
833 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
834 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
835 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
836 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
838 /* These atomics skip the 'weak' parm in an actual library
839 call. We must skip it in the prototype too. */
840 not_atomic_weak_arg = 3;
841 break;
842
843 default:
844 break;
845 }
846
847 /* Declare the result. */
848 bool return_in_mem = write_return_type (s, true, result_type);
849
850 s << name;
851
852 int argno = 0;
853
854 /* Emit argument list. */
855 if (return_in_mem)
856 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
857
858 /* We get:
859 NULL in TYPE_ARG_TYPES, for old-style functions
860 NULL in DECL_ARGUMENTS, for builtin functions without another
861 declaration.
862 So we have to pick the best one we have. */
863 tree args = TYPE_ARG_TYPES (fntype);
864 bool prototyped = true;
865 if (!args)
866 {
867 args = DECL_ARGUMENTS (decl);
868 prototyped = false;
869 }
870
871 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
872 {
873 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
874
875 if (not_atomic_weak_arg)
876 argno = write_arg_type (s, -1, argno, type, prototyped);
877 else
878 gcc_assert (type == boolean_type_node);
879 }
880
881 if (stdarg_p (fntype))
882 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
883
884 if (DECL_STATIC_CHAIN (decl))
885 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
886
887 if (!argno && strcmp (name, "main") == 0)
888 {
889 argno = write_arg_type (s, -1, argno, integer_type_node, true);
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
891 }
892
893 if (argno)
894 s << ")";
895
896 s << (is_defn ? "\n" : ";\n");
897
898 return name;
899 }
900
901 /* Construct a function declaration from a call insn. This can be
902 necessary for two reasons - either we have an indirect call which
903 requires a .callprototype declaration, or we have a libcall
904 generated by emit_library_call for which no decl exists. */
905
906 static void
907 write_fn_proto_from_insn (std::stringstream &s, const char *name,
908 rtx result, rtx pat)
909 {
910 if (!name)
911 {
912 s << "\t.callprototype ";
913 name = "_";
914 }
915 else
916 {
917 name = nvptx_name_replacement (name);
918 write_fn_marker (s, false, true, name);
919 s << "\t.extern .func ";
920 }
921
922 if (result != NULL_RTX)
923 write_return_mode (s, true, GET_MODE (result));
924
925 s << name;
926
927 int arg_end = XVECLEN (pat, 0);
928 for (int i = 1; i < arg_end; i++)
929 {
930 /* We don't have to deal with mode splitting & promotion here,
931 as that was already done when generating the call
932 sequence. */
933 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
934
935 write_arg_mode (s, -1, i - 1, mode);
936 }
937 if (arg_end != 1)
938 s << ")";
939 s << ";\n";
940 }
941
942 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
943 table and write a ptx prototype. These are emitted at the end of
944 compilation. */
945
946 static void
947 nvptx_record_fndecl (tree decl)
948 {
949 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
950 if (*slot == NULL)
951 {
952 *slot = decl;
953 const char *name = get_fnname_from_decl (decl);
954 write_fn_proto (func_decls, false, name, decl);
955 }
956 }
957
958 /* Record a libcall or unprototyped external function. CALLEE is the
959 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
960 declaration for it. */
961
962 static void
963 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
964 {
965 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
966 if (*slot == NULL)
967 {
968 *slot = callee;
969
970 const char *name = XSTR (callee, 0);
971 write_fn_proto_from_insn (func_decls, name, retval, pat);
972 }
973 }
974
975 /* DECL is an external FUNCTION_DECL that we're referencing. If it
976 is prototyped, record it now. Otherwise record it as needed at end
977 of compilation, when we might have more information about it. */
978
979 void
980 nvptx_record_needed_fndecl (tree decl)
981 {
982 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
983 {
984 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
985 if (*slot == NULL)
986 *slot = decl;
987 }
988 else
989 nvptx_record_fndecl (decl);
990 }
991
992 /* SYM is a SYMBOL_REF. If it refers to an external function, record
993 it as needed. */
994
995 static void
996 nvptx_maybe_record_fnsym (rtx sym)
997 {
998 tree decl = SYMBOL_REF_DECL (sym);
999
1000 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1001 nvptx_record_needed_fndecl (decl);
1002 }
1003
1004 /* Emit a local array to hold some part of a conventional stack frame
1005 and initialize REGNO to point to it. If the size is zero, it'll
1006 never be valid to dereference, so we can simply initialize to
1007 zero. */
1008
1009 static void
1010 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1011 {
1012 if (size)
1013 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1014 align, reg_names[regno], size);
1015 fprintf (file, "\t.reg.u%d %s;\n",
1016 POINTER_SIZE, reg_names[regno]);
1017 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1018 : "\tmov.u%d %s, 0;\n"),
1019 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1020 }
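/* Rough example for a 64-bit ABI, a 16-byte frame and 8-byte alignment on
   FRAME_POINTER_REGNUM (printed as "%frame"):
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;
   A zero-sized frame instead ends with "mov.u64 %frame, 0;".  */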
1021
1022 /* Emit soft stack frame setup sequence. */
1023
1024 static void
1025 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1026 {
1027 /* Maintain 64-bit stack alignment. */
1028 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1029 size = ROUND_UP (size, keep_align);
1030 int bits = POINTER_SIZE;
1031 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1032 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1033 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1034 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1035 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1036 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1037 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1038 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1039 fprintf (file, "\t{\n");
1040 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1041 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1042 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1043 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1044 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1045 bits == 64 ? ".wide" : ".lo", bits / 8);
1046 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1047
1048 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1049 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1050
1051 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1052 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1053 bits, reg_sspprev, reg_sspslot);
1054
1055 /* Initialize %frame = %sspprev - size. */
1056 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1057 bits, reg_frame, reg_sspprev, size);
1058
1059 /* Apply alignment, if larger than 64. */
1060 if (alignment > keep_align)
1061 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1062 bits, reg_frame, reg_frame, -alignment);
1063
1064 size = crtl->outgoing_args_size;
1065 gcc_assert (size % keep_align == 0);
1066
1067 /* Initialize %stack. */
1068 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1069 bits, reg_stack, reg_frame, size);
1070
1071 if (!crtl->is_leaf)
1072 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1073 bits, reg_sspslot, reg_stack);
1074 fprintf (file, "\t}\n");
1075 cfun->machine->has_softstack = true;
1076 need_softstack_decl = true;
1077 }
1078
1079 /* Emit code to initialize the REGNO predicate register to indicate
1080 whether we are not lane zero on the NAME axis. */
1081
1082 static void
1083 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1084 {
1085 fprintf (file, "\t{\n");
1086 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1087 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1088 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1089 fprintf (file, "\t}\n");
1090 }
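/* For the "y" axis and an (illustrative) predicate register 42 this emits:
       {
               .reg.u32 %y;
               mov.u32 %y, %tid.y;
               setp.ne.u32 %r42, %y, 0;
       }
   leaving %r42 true for every thread whose %tid.y is nonzero.  */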
1091
1092 /* Emit code to initialize predicate and master lane index registers for
1093 -muniform-simt code generation variant. */
1094
1095 static void
1096 nvptx_init_unisimt_predicate (FILE *file)
1097 {
1098 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1099 int loc = REGNO (cfun->machine->unisimt_location);
1100 int bits = POINTER_SIZE;
1101 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1102 fprintf (file, "\t{\n");
1103 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1104 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1105 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1106 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1107 bits == 64 ? ".wide" : ".lo");
1108 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1109 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1110 if (cfun->machine->unisimt_predicate)
1111 {
1112 int master = REGNO (cfun->machine->unisimt_master);
1113 int pred = REGNO (cfun->machine->unisimt_predicate);
1114 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1115 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1116 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1117 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1118 /* Compute predicate as 'tid.x == master'. */
1119 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1120 }
1121 fprintf (file, "\t}\n");
1122 need_unisimt_decl = true;
1123 }
1124
1125 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1126
1127 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1128 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1129 {
1130 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1131 __nvptx_uni[tid.y] = 0;
1132 gomp_nvptx_main (ORIG, arg);
1133 }
1134 ORIG itself should not be emitted as a PTX .entry function. */
1135
1136 static void
1137 write_omp_entry (FILE *file, const char *name, const char *orig)
1138 {
1139 static bool gomp_nvptx_main_declared;
1140 if (!gomp_nvptx_main_declared)
1141 {
1142 gomp_nvptx_main_declared = true;
1143 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1144 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1145 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1146 }
1147 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1148 #define NTID_Y "%ntid.y"
1149 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1150 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1151 {\n\
1152 .reg.u32 %r<3>;\n\
1153 .reg.u" PS " %R<4>;\n\
1154 mov.u32 %r0, %tid.y;\n\
1155 mov.u32 %r1, " NTID_Y ";\n\
1156 mov.u32 %r2, %ctaid.x;\n\
1157 cvt.u" PS ".u32 %R1, %r0;\n\
1158 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1159 mov.u" PS " %R0, __nvptx_stacks;\n\
1160 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1161 ld.param.u" PS " %R2, [%stack];\n\
1162 ld.param.u" PS " %R3, [%sz];\n\
1163 add.u" PS " %R2, %R2, %R3;\n\
1164 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1165 st.shared.u" PS " [%R0], %R2;\n\
1166 mov.u" PS " %R0, __nvptx_uni;\n\
1167 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1168 mov.u32 %r0, 0;\n\
1169 st.shared.u32 [%R0], %r0;\n\
1170 mov.u" PS " %R0, \0;\n\
1171 ld.param.u" PS " %R1, [%arg];\n\
1172 {\n\
1173 .param.u" PS " %P<2>;\n\
1174 st.param.u" PS " [%P0], %R0;\n\
1175 st.param.u" PS " [%P1], %R1;\n\
1176 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1177 }\n\
1178 ret.uni;\n\
1179 }\n"
1180 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1181 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1182 #undef ENTRY_TEMPLATE
1183 #undef NTID_Y
1184 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1185 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1186 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1187 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1188 need_softstack_decl = need_unisimt_decl = true;
1189 }
1190
1191 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1192 function, including local var decls and copies from the arguments to
1193 local regs. */
1194
1195 void
1196 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1197 {
1198 tree fntype = TREE_TYPE (decl);
1199 tree result_type = TREE_TYPE (fntype);
1200 int argno = 0;
1201
1202 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1203 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1204 {
1205 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1206 sprintf (buf, "%s$impl", name);
1207 write_omp_entry (file, name, buf);
1208 name = buf;
1209 }
1210 /* We construct the initial part of the function into a string
1211 stream, in order to share the prototype writing code. */
1212 std::stringstream s;
1213 write_fn_proto (s, true, name, decl);
1214 s << "{\n";
1215
1216 bool return_in_mem = write_return_type (s, false, result_type);
1217 if (return_in_mem)
1218 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1219
1220 /* Declare and initialize incoming arguments. */
1221 tree args = TYPE_ARG_TYPES (fntype);
1222 bool prototyped = true;
1223 if (!args)
1224 {
1225 args = DECL_ARGUMENTS (decl);
1226 prototyped = false;
1227 }
1228
1229 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1230 {
1231 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1232
1233 argno = write_arg_type (s, 0, argno, type, prototyped);
1234 }
1235
1236 if (stdarg_p (fntype))
1237 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1238 true);
1239
1240 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1241 write_arg_type (s, STATIC_CHAIN_REGNUM,
1242 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1243 true);
1244
1245 fprintf (file, "%s", s.str().c_str());
1246
1247 /* Usually 'crtl->is_leaf' is computed during register allocator
1248 initialization (which is not done on NVPTX) or for pressure-sensitive
1249 optimizations. Initialize it here, except if already set. */
1250 if (!crtl->is_leaf)
1251 crtl->is_leaf = leaf_function_p ();
1252
1253 HOST_WIDE_INT sz = get_frame_size ();
1254 bool need_frameptr = sz || cfun->machine->has_chain;
1255 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1256 if (!TARGET_SOFT_STACK)
1257 {
1258 /* Declare a local var for outgoing varargs. */
1259 if (cfun->machine->has_varadic)
1260 init_frame (file, STACK_POINTER_REGNUM,
1261 UNITS_PER_WORD, crtl->outgoing_args_size);
1262
1263 /* Declare a local variable for the frame. Force its size to be
1264 DImode-compatible. */
1265 if (need_frameptr)
1266 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1267 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1268 }
1269 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1270 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1271 init_softstack_frame (file, alignment, sz);
1272
1273 if (cfun->machine->has_simtreg)
1274 {
1275 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1276 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1277 align = MAX (align, GET_MODE_SIZE (DImode));
1278 if (!crtl->is_leaf || cfun->calls_alloca)
1279 simtsz = HOST_WIDE_INT_M1U;
1280 if (simtsz == HOST_WIDE_INT_M1U)
1281 simtsz = nvptx_softstack_size;
1282 if (cfun->machine->has_softstack)
1283 simtsz += POINTER_SIZE / 8;
1284 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1285 if (align > GET_MODE_SIZE (DImode))
1286 simtsz += align - GET_MODE_SIZE (DImode);
1287 if (simtsz)
1288 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1289 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1290 }
1291 /* Declare the pseudos we have as ptx registers. */
1292 int maxregs = max_reg_num ();
1293 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1294 {
1295 if (regno_reg_rtx[i] != const0_rtx)
1296 {
1297 machine_mode mode = PSEUDO_REGNO_MODE (i);
1298 machine_mode split = maybe_split_mode (mode);
1299
1300 if (split_mode_p (mode))
1301 mode = split;
1302 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1303 output_reg (file, i, split, -2);
1304 fprintf (file, ";\n");
1305 }
1306 }
1307
1308 /* Emit axis predicates. */
1309 if (cfun->machine->axis_predicate[0])
1310 nvptx_init_axis_predicate (file,
1311 REGNO (cfun->machine->axis_predicate[0]), "y");
1312 if (cfun->machine->axis_predicate[1])
1313 nvptx_init_axis_predicate (file,
1314 REGNO (cfun->machine->axis_predicate[1]), "x");
1315 if (cfun->machine->unisimt_predicate
1316 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1317 nvptx_init_unisimt_predicate (file);
1318 }
1319
1320 /* Output code for switching uniform-simt state. ENTERING indicates whether
1321 we are entering or leaving a non-uniform execution region. */
1322
1323 static void
1324 nvptx_output_unisimt_switch (FILE *file, bool entering)
1325 {
1326 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1327 return;
1328 fprintf (file, "\t{\n");
1329 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1330 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1331 if (!crtl->is_leaf)
1332 {
1333 int loc = REGNO (cfun->machine->unisimt_location);
1334 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1335 }
1336 if (cfun->machine->unisimt_predicate)
1337 {
1338 int master = REGNO (cfun->machine->unisimt_master);
1339 int pred = REGNO (cfun->machine->unisimt_predicate);
1340 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1341 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1342 master, entering ? "%ustmp2" : "0");
1343 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1344 }
1345 fprintf (file, "\t}\n");
1346 }
1347
1348 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1349 ENTERING indicates whether we are entering or leaving non-uniform execution.
1350 PTR is the register pointing to allocated storage, it is assigned to on
1351 entering and used to restore state on leaving. SIZE and ALIGN are used only
1352 on entering. */
1353
1354 static void
1355 nvptx_output_softstack_switch (FILE *file, bool entering,
1356 rtx ptr, rtx size, rtx align)
1357 {
1358 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1359 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1360 return;
1361 int bits = POINTER_SIZE, regno = REGNO (ptr);
1362 fprintf (file, "\t{\n");
1363 if (entering)
1364 {
1365 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1366 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1367 cfun->machine->simt_stack_size);
1368 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1369 if (CONST_INT_P (size))
1370 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1371 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1372 else
1373 output_reg (file, REGNO (size), VOIDmode);
1374 fputs (";\n", file);
1375 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1376 fprintf (file,
1377 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1378 bits, regno, regno, UINTVAL (align));
1379 }
1380 if (cfun->machine->has_softstack)
1381 {
1382 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1383 if (entering)
1384 {
1385 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1386 bits, regno, bits / 8, reg_stack);
1387 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1388 bits, reg_stack, regno, bits / 8);
1389 }
1390 else
1391 {
1392 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1393 bits, reg_stack, regno, bits / 8);
1394 }
1395 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1396 }
1397 fprintf (file, "\t}\n");
1398 }
1399
1400 /* Output code to enter non-uniform execution region. DEST is a register
1401 to hold a per-lane allocation given by SIZE and ALIGN. */
1402
1403 const char *
1404 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1405 {
1406 nvptx_output_unisimt_switch (asm_out_file, true);
1407 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1408 return "";
1409 }
1410
1411 /* Output code to leave non-uniform execution region. SRC is the register
1412 holding per-lane storage previously allocated by omp_simt_enter insn. */
1413
1414 const char *
1415 nvptx_output_simt_exit (rtx src)
1416 {
1417 nvptx_output_unisimt_switch (asm_out_file, false);
1418 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1419 return "";
1420 }
1421
1422 /* Output instruction that sets soft stack pointer in shared memory to the
1423 value in register given by SRC_REGNO. */
1424
1425 const char *
1426 nvptx_output_set_softstack (unsigned src_regno)
1427 {
1428 if (cfun->machine->has_softstack && !crtl->is_leaf)
1429 {
1430 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1431 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1432 output_reg (asm_out_file, src_regno, VOIDmode);
1433 fprintf (asm_out_file, ";\n");
1434 }
1435 return "";
1436 }
1437 /* Output a return instruction. Also copy the return value to its outgoing
1438 location. */
1439
1440 const char *
1441 nvptx_output_return (void)
1442 {
1443 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1444
1445 if (mode != VOIDmode)
1446 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1447 nvptx_ptx_type_from_mode (mode, false),
1448 reg_names[NVPTX_RETURN_REGNUM],
1449 reg_names[NVPTX_RETURN_REGNUM]);
1450
1451 return "ret;";
1452 }
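/* For instance, a function returning SImode prints
       st.param.u32 [%value_out], %value;
   (assuming the usual "%value" name for NVPTX_RETURN_REGNUM) before the
   final "ret;".  */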
1453
1454 /* Terminate a function by writing a closing brace to FILE. */
1455
1456 void
1457 nvptx_function_end (FILE *file)
1458 {
1459 fprintf (file, "}\n");
1460 }
1461 \f
1462 /* Decide whether we can make a sibling call to a function. For ptx, we
1463 can't. */
1464
1465 static bool
1466 nvptx_function_ok_for_sibcall (tree, tree)
1467 {
1468 return false;
1469 }
1470
1471 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1472
1473 static rtx
1474 nvptx_get_drap_rtx (void)
1475 {
1476 if (TARGET_SOFT_STACK && stack_realign_drap)
1477 return arg_pointer_rtx;
1478 return NULL_RTX;
1479 }
1480
1481 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1482 argument to the next call. */
1483
1484 static void
1485 nvptx_call_args (rtx arg, tree fntype)
1486 {
1487 if (!cfun->machine->doing_call)
1488 {
1489 cfun->machine->doing_call = true;
1490 cfun->machine->is_varadic = false;
1491 cfun->machine->num_args = 0;
1492
1493 if (fntype && stdarg_p (fntype))
1494 {
1495 cfun->machine->is_varadic = true;
1496 cfun->machine->has_varadic = true;
1497 cfun->machine->num_args++;
1498 }
1499 }
1500
1501 if (REG_P (arg) && arg != pc_rtx)
1502 {
1503 cfun->machine->num_args++;
1504 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1505 cfun->machine->call_args);
1506 }
1507 }
1508
1509 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1510 information we recorded. */
1511
1512 static void
1513 nvptx_end_call_args (void)
1514 {
1515 cfun->machine->doing_call = false;
1516 free_EXPR_LIST_list (&cfun->machine->call_args);
1517 }
1518
1519 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1520 track of whether calls involving static chains or varargs were seen
1521 in the current function.
1522 For libcalls, maintain a hash table of decls we have seen, and
1523 record a function decl for later when encountering a new one. */
1524
1525 void
1526 nvptx_expand_call (rtx retval, rtx address)
1527 {
1528 rtx callee = XEXP (address, 0);
1529 rtx varargs = NULL_RTX;
1530 unsigned parallel = 0;
1531
1532 if (!call_insn_operand (callee, Pmode))
1533 {
1534 callee = force_reg (Pmode, callee);
1535 address = change_address (address, QImode, callee);
1536 }
1537
1538 if (GET_CODE (callee) == SYMBOL_REF)
1539 {
1540 tree decl = SYMBOL_REF_DECL (callee);
1541 if (decl != NULL_TREE)
1542 {
1543 if (DECL_STATIC_CHAIN (decl))
1544 cfun->machine->has_chain = true;
1545
1546 tree attr = oacc_get_fn_attrib (decl);
1547 if (attr)
1548 {
1549 tree dims = TREE_VALUE (attr);
1550
1551 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1552 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1553 {
1554 if (TREE_PURPOSE (dims)
1555 && !integer_zerop (TREE_PURPOSE (dims)))
1556 break;
1557 /* Not on this axis. */
1558 parallel ^= GOMP_DIM_MASK (ix);
1559 dims = TREE_CHAIN (dims);
1560 }
1561 }
1562 }
1563 }
1564
1565 unsigned nargs = cfun->machine->num_args;
1566 if (cfun->machine->is_varadic)
1567 {
1568 varargs = gen_reg_rtx (Pmode);
1569 emit_move_insn (varargs, stack_pointer_rtx);
1570 }
1571
1572 rtvec vec = rtvec_alloc (nargs + 1);
1573 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1574 int vec_pos = 0;
1575
1576 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1577 rtx tmp_retval = retval;
1578 if (retval)
1579 {
1580 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1581 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1582 call = gen_rtx_SET (tmp_retval, call);
1583 }
1584 XVECEXP (pat, 0, vec_pos++) = call;
1585
1586 /* Construct the call insn, including a USE for each argument pseudo
1587 register. These will be used when printing the insn. */
1588 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1589 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1590
1591 if (varargs)
1592 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1593
1594 gcc_assert (vec_pos == XVECLEN (pat, 0));
1595
1596 nvptx_emit_forking (parallel, true);
1597 emit_call_insn (pat);
1598 nvptx_emit_joining (parallel, true);
1599
1600 if (tmp_retval != retval)
1601 emit_move_insn (retval, tmp_retval);
1602 }
1603
1604 /* Emit a comparison COMPARE, and return the new test to be used in the
1605 jump. */
1606
1607 rtx
1608 nvptx_expand_compare (rtx compare)
1609 {
1610 rtx pred = gen_reg_rtx (BImode);
1611 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1612 XEXP (compare, 0), XEXP (compare, 1));
1613 emit_insn (gen_rtx_SET (pred, cmp));
1614 return gen_rtx_NE (BImode, pred, const0_rtx);
1615 }
1616
1617 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1618
1619 void
1620 nvptx_expand_oacc_fork (unsigned mode)
1621 {
1622 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1623 }
1624
1625 void
1626 nvptx_expand_oacc_join (unsigned mode)
1627 {
1628 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1629 }
1630
1631 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1632 objects. */
1633
1634 static rtx
1635 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1636 {
1637 rtx res;
1638
1639 switch (GET_MODE (src))
1640 {
1641 case E_DImode:
1642 res = gen_unpackdisi2 (dst0, dst1, src);
1643 break;
1644 case E_DFmode:
1645 res = gen_unpackdfsi2 (dst0, dst1, src);
1646 break;
1647 default: gcc_unreachable ();
1648 }
1649 return res;
1650 }
1651
1652 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1653 object. */
1654
1655 static rtx
1656 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1657 {
1658 rtx res;
1659
1660 switch (GET_MODE (dst))
1661 {
1662 case E_DImode:
1663 res = gen_packsidi2 (dst, src0, src1);
1664 break;
1665 case E_DFmode:
1666 res = gen_packsidf2 (dst, src0, src1);
1667 break;
1668 default: gcc_unreachable ();
1669 }
1670 return res;
1671 }
1672
1673 /* Generate an instruction or sequence to shuffle SRC into DST across
1674 the lanes of a warp, using lane index IDX and shuffle kind KIND. */
1675
1676 rtx
1677 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1678 {
1679 rtx res;
1680
1681 switch (GET_MODE (dst))
1682 {
1683 case E_SImode:
1684 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1685 break;
1686 case E_SFmode:
1687 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1688 break;
1689 case E_DImode:
1690 case E_DFmode:
1691 {
1692 rtx tmp0 = gen_reg_rtx (SImode);
1693 rtx tmp1 = gen_reg_rtx (SImode);
1694
1695 start_sequence ();
1696 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1697 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1698 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1699 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1700 res = get_insns ();
1701 end_sequence ();
1702 }
1703 break;
1704 case E_BImode:
1705 {
1706 rtx tmp = gen_reg_rtx (SImode);
1707
1708 start_sequence ();
1709 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1710 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1711 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1712 res = get_insns ();
1713 end_sequence ();
1714 }
1715 break;
1716 case E_QImode:
1717 case E_HImode:
1718 {
1719 rtx tmp = gen_reg_rtx (SImode);
1720
1721 start_sequence ();
1722 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1723 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1724 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1725 tmp)));
1726 res = get_insns ();
1727 end_sequence ();
1728 }
1729 break;
1730
1731 default:
1732 gcc_unreachable ();
1733 }
1734 return res;
1735 }
1736
1737 /* Generate an instruction or sequence to broadcast register REG
1738 across the vectors of a single warp. */
1739
1740 static rtx
1741 nvptx_gen_vcast (rtx reg)
1742 {
1743 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1744 }
1745
1746 /* Structure used when generating a worker-level spill or fill. */
1747
1748 struct wcast_data_t
1749 {
1750 rtx base; /* Register holding base addr of buffer. */
1751 rtx ptr; /* Iteration var, if needed. */
1752 unsigned offset; /* Offset into worker buffer. */
1753 };
1754
1755 /* Direction of the spill/fill and looping setup/teardown indicator. */
1756
1757 enum propagate_mask
1758 {
1759 PM_read = 1 << 0,
1760 PM_write = 1 << 1,
1761 PM_loop_begin = 1 << 2,
1762 PM_loop_end = 1 << 3,
1763
1764 PM_read_write = PM_read | PM_write
1765 };
1766
1767 /* Generate instruction(s) to spill or fill register REG to/from the
1768 worker broadcast array. PM indicates what is to be done, REP
1769 how many loop iterations will be executed (0 for not a loop). */
1770
1771 static rtx
1772 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1773 {
1774 rtx res;
1775 machine_mode mode = GET_MODE (reg);
1776
1777 switch (mode)
1778 {
1779 case E_BImode:
1780 {
1781 rtx tmp = gen_reg_rtx (SImode);
1782
1783 start_sequence ();
1784 if (pm & PM_read)
1785 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1786 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1787 if (pm & PM_write)
1788 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1789 res = get_insns ();
1790 end_sequence ();
1791 }
1792 break;
1793
1794 default:
1795 {
1796 rtx addr = data->ptr;
1797
1798 if (!addr)
1799 {
1800 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1801
1802 if (align > worker_bcast_align)
1803 worker_bcast_align = align;
1804 data->offset = (data->offset + align - 1) & ~(align - 1);
1805 addr = data->base;
1806 if (data->offset)
1807 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1808 }
1809
1810 addr = gen_rtx_MEM (mode, addr);
1811 if (pm == PM_read)
1812 res = gen_rtx_SET (addr, reg);
1813 else if (pm == PM_write)
1814 res = gen_rtx_SET (reg, addr);
1815 else
1816 gcc_unreachable ();
1817
1818 if (data->ptr)
1819 {
1820 /* We're using a ptr, increment it. */
1821 start_sequence ();
1822
1823 emit_insn (res);
1824 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1825 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1826 res = get_insns ();
1827 end_sequence ();
1828 }
1829 else
1830 rep = 1;
1831 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1832 }
1833 break;
1834 }
1835 return res;
1836 }
1837 \f
1838 /* Returns true if X is a valid address for use in a memory reference. */
1839
1840 static bool
1841 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1842 {
1843 enum rtx_code code = GET_CODE (x);
1844
1845 switch (code)
1846 {
1847 case REG:
1848 return true;
1849
1850 case PLUS:
1851 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1852 return true;
1853 return false;
1854
1855 case CONST:
1856 case SYMBOL_REF:
1857 case LABEL_REF:
1858 return true;
1859
1860 default:
1861 return false;
1862 }
1863 }
1864 \f
1865 /* Machinery to output constant initializers. When beginning an
1866 initializer, we decide on a fragment size (which is visible in ptx
1867 in the type used), and then all initializer data is buffered until
1868 a fragment is filled and ready to be written out. */
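/* An illustrative sketch (values hypothetical): with a 4-byte fragment
   size, an initializer for "int x[3] = { 1, 2, 3 }" fills one fragment
   per element and is printed as "= { 1, 2, 3 }", whereas pieces smaller
   than the fragment size (for instance char members of a struct) are
   packed into the current fragment and only written out once the
   fragment is full.  */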
1869
1870 static struct
1871 {
1872 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1873 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1874 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1875 out. */
1876 unsigned size; /* Fragment size to accumulate. */
1877 unsigned offset; /* Offset within current fragment. */
1878 bool started; /* Whether we've output any initializer. */
1879 } init_frag;
1880
1881 /* The current fragment is full; write it out. SYM may provide a
1882 symbolic reference we should output, in which case the fragment
1883 value is the addend. */
1884
1885 static void
1886 output_init_frag (rtx sym)
1887 {
1888 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1889 unsigned HOST_WIDE_INT val = init_frag.val;
1890
1891 init_frag.started = true;
1892 init_frag.val = 0;
1893 init_frag.offset = 0;
1894 init_frag.remaining--;
1895
1896 if (sym)
1897 {
1898 fprintf (asm_out_file, "generic(");
1899 output_address (VOIDmode, sym);
1900 fprintf (asm_out_file, val ? ") + " : ")");
1901 }
1902
1903 if (!sym || val)
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1905 }
1906
1907 /* Add value VAL of size SIZE to the data we're emitting, and keep
1908 writing out chunks as they fill up. */
1909
1910 static void
1911 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1912 {
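  /* Mask VAL down to SIZE bytes.  The mask is written as 2 << (bits - 1)
     rather than 1 << bits so the shift stays defined when SIZE is the
     full width of HOST_WIDE_INT (the same trick as init_frag.mask).  */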
1913 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1914
1915 for (unsigned part = 0; size; size -= part)
1916 {
1917 val >>= part * BITS_PER_UNIT;
1918 part = init_frag.size - init_frag.offset;
1919 if (part > size)
1920 part = size;
1921
1922 unsigned HOST_WIDE_INT partial
1923 = val << (init_frag.offset * BITS_PER_UNIT);
1924 init_frag.val |= partial & init_frag.mask;
1925 init_frag.offset += part;
1926
1927 if (init_frag.offset == init_frag.size)
1928 output_init_frag (NULL);
1929 }
1930 }
1931
1932 /* Target hook for assembling integer object X of size SIZE. */
1933
1934 static bool
1935 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1936 {
1937 HOST_WIDE_INT val = 0;
1938
1939 switch (GET_CODE (x))
1940 {
1941 default:
1942 /* Let the generic machinery figure it out, usually for a
1943 CONST_WIDE_INT. */
1944 return false;
1945
1946 case CONST_INT:
1947 nvptx_assemble_value (INTVAL (x), size);
1948 break;
1949
1950 case CONST:
1951 x = XEXP (x, 0);
1952 gcc_assert (GET_CODE (x) == PLUS);
1953 val = INTVAL (XEXP (x, 1));
1954 x = XEXP (x, 0);
1955 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1956 /* FALLTHROUGH */
1957
1958 case SYMBOL_REF:
1959 gcc_assert (size == init_frag.size);
1960 if (init_frag.offset)
1961 sorry ("cannot emit unaligned pointers in ptx assembly");
1962
1963 nvptx_maybe_record_fnsym (x);
1964 init_frag.val = val;
1965 output_init_frag (x);
1966 break;
1967 }
1968
1969 return true;
1970 }
1971
1972 /* Output SIZE zero bytes. We ignore the FILE argument since the
1973 functions we're calling to perform the output just use
1974 asm_out_file. */
1975
1976 void
1977 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1978 {
1979 /* Finish the current fragment, if it's started. */
1980 if (init_frag.offset)
1981 {
1982 unsigned part = init_frag.size - init_frag.offset;
1983 if (part > size)
1984 part = (unsigned) size;
1985 size -= part;
1986 nvptx_assemble_value (0, part);
1987 }
1988
1989 /* If this skip doesn't terminate the initializer, write as many
1990 remaining pieces as possible directly. */
1991 if (size < init_frag.remaining * init_frag.size)
1992 {
1993 while (size >= init_frag.size)
1994 {
1995 size -= init_frag.size;
1996 output_init_frag (NULL_RTX);
1997 }
1998 if (size)
1999 nvptx_assemble_value (0, size);
2000 }
2001 }
2002
2003 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2004 ignore the FILE arg. */
2005
2006 void
2007 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2008 {
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2010 nvptx_assemble_value (str[i], 1);
2011 }
2012
2013 /* Emit a PTX variable decl and prepare for emission of its
2014 initializer. NAME is the symbol name and SECTION the PTX data
2015 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2016 The caller has already emitted any indentation and linkage
2017 specifier. It is responsible for any initializer, terminating ;
2018 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2019 this is the opposite way round from how PTX wants them! */
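/* For instance (illustrative only -- the linkage specifier and the
   initializer itself come from our callers), a file-scope
   "int x[3] = { 1, 2, 3 }" in the global data area comes out roughly as

     .visible .global .align 4 .u32 x[3] = { 1, 2, 3 };

   where the element count is SIZE divided by the chosen fragment size,
   not necessarily the source-level array length.  */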
2020
2021 static void
2022 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2023 const_tree type, HOST_WIDE_INT size, unsigned align)
2024 {
2025 while (TREE_CODE (type) == ARRAY_TYPE)
2026 type = TREE_TYPE (type);
2027
2028 if (TREE_CODE (type) == VECTOR_TYPE
2029 || TREE_CODE (type) == COMPLEX_TYPE)
2030 /* Neither vector nor complex types can contain the other. */
2031 type = TREE_TYPE (type);
2032
2033 unsigned elt_size = int_size_in_bytes (type);
2034
2035 /* Largest mode we're prepared to accept. For BLKmode types we
2036 don't know if it'll contain pointer constants, so we have to choose
2037 pointer size, otherwise we can choose DImode. */
2038 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2039
2040 elt_size |= GET_MODE_SIZE (elt_mode);
2041 elt_size &= -elt_size; /* Extract LSB set. */
2042
2043 init_frag.size = elt_size;
2044 /* Avoid undefined shift behavior by using '2'. */
2045 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2046 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2047 init_frag.val = 0;
2048 init_frag.offset = 0;
2049 init_frag.started = false;
2050 /* Size might not be a multiple of elt size, if there's an
2051 initialized trailing struct array with smaller type than
2052 elt_size. */
2053 init_frag.remaining = (size + elt_size - 1) / elt_size;
2054
2055 fprintf (file, "%s .align %d .u%d ",
2056 section, align / BITS_PER_UNIT,
2057 elt_size * BITS_PER_UNIT);
2058 assemble_name (file, name);
2059
2060 if (size)
2061 /* We make everything an array, to simplify any initialization
2062 emission. */
2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2064 }
2065
2066 /* Called when the initializer for a decl has been completely output through
2067 combinations of the three functions above. */
2068
2069 static void
2070 nvptx_assemble_decl_end (void)
2071 {
2072 if (init_frag.offset)
2073 /* This can happen with a packed struct with trailing array member. */
2074 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2075 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2076 }
2077
2078 /* Output an uninitialized common or file-scope variable. */
2079
2080 void
2081 nvptx_output_aligned_decl (FILE *file, const char *name,
2082 const_tree decl, HOST_WIDE_INT size, unsigned align)
2083 {
2084 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2085
2086 /* If this is public, it is common. The nearest thing we have to
2087 common is weak. */
2088 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2089
2090 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2091 TREE_TYPE (decl), size, align);
2092 nvptx_assemble_decl_end ();
2093 }
2094
2095 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2096 writing a constant variable EXP with NAME and SIZE and its
2097 initializer to FILE. */
2098
2099 static void
2100 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2101 const_tree exp, HOST_WIDE_INT obj_size)
2102 {
2103 write_var_marker (file, true, false, name);
2104
2105 fprintf (file, "\t");
2106
2107 tree type = TREE_TYPE (exp);
2108 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2109 TYPE_ALIGN (type));
2110 }
2111
2112 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2113 a variable DECL with NAME to FILE. */
2114
2115 void
2116 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2117 {
2118 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2119
2120 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2121 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2122
2123 tree type = TREE_TYPE (decl);
2124 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2125 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2126 type, obj_size, DECL_ALIGN (decl));
2127 }
2128
2129 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2130
2131 static void
2132 nvptx_globalize_label (FILE *, const char *)
2133 {
2134 }
2135
2136 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2137 declaration only for variable DECL with NAME to FILE. */
2138
2139 static void
2140 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2141 {
2142 /* The middle end can place constant pool decls into the varpool as
2143 undefined. Until that is fixed, catch the problem here. */
2144 if (DECL_IN_CONSTANT_POOL (decl))
2145 return;
2146
2147 /* We support weak definitions, and hence have the right
2148 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2149 if (DECL_WEAK (decl))
2150 error_at (DECL_SOURCE_LOCATION (decl),
2151 "PTX does not support weak declarations"
2152 " (only weak definitions)");
2153 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2154
2155 fprintf (file, "\t.extern ");
2156 tree size = DECL_SIZE_UNIT (decl);
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2159 DECL_ALIGN (decl));
2160 nvptx_assemble_decl_end ();
2161 }
2162
2163 /* Output a pattern for a move instruction. */
2164
2165 const char *
2166 nvptx_output_mov_insn (rtx dst, rtx src)
2167 {
2168 machine_mode dst_mode = GET_MODE (dst);
2169 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2170 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2171 machine_mode src_inner = (GET_CODE (src) == SUBREG
2172 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2173
2174 rtx sym = src;
2175 if (GET_CODE (sym) == CONST)
2176 sym = XEXP (XEXP (sym, 0), 0);
2177 if (SYMBOL_REF_P (sym))
2178 {
2179 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2180 return "%.\tcvta%D1%t0\t%0, %1;";
2181 nvptx_maybe_record_fnsym (sym);
2182 }
2183
2184 if (src_inner == dst_inner)
2185 return "%.\tmov%t0\t%0, %1;";
2186
2187 if (CONSTANT_P (src))
2188 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2189 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2190 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2191
2192 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2193 {
2194 if (GET_MODE_BITSIZE (dst_mode) == 128
2195 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2196 {
2197 /* mov.b128 is not supported. */
2198 if (dst_inner == V2DImode && src_inner == TImode)
2199 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2200 else if (dst_inner == TImode && src_inner == V2DImode)
2201 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2202
2203 gcc_unreachable ();
2204 }
2205 return "%.\tmov.b%T0\t%0, %1;";
2206 }
2207
2208 return "%.\tcvt%t0%t1\t%0, %1;";
2209 }
2210
2211 static void nvptx_print_operand (FILE *, rtx, int);
2212
2213 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2214 involves writing .param declarations and in/out copies into them. For
2215 indirect calls, also write the .callprototype. */
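/* By way of illustration (register numbers and the function name are
   hypothetical, and the return-value param is written "%value_in" on the
   assumption that reg_names[NVPTX_RETURN_REGNUM] is "%value"), a direct
   call to "int foo (int)" comes out roughly as:

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
 */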
2216
2217 const char *
2218 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2219 {
2220 char buf[16];
2221 static int labelno;
2222 bool needs_tgt = register_operand (callee, Pmode);
2223 rtx pat = PATTERN (insn);
2224 if (GET_CODE (pat) == COND_EXEC)
2225 pat = COND_EXEC_CODE (pat);
2226 int arg_end = XVECLEN (pat, 0);
2227 tree decl = NULL_TREE;
2228
2229 fprintf (asm_out_file, "\t{\n");
2230 if (result != NULL)
2231 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2232 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2233 reg_names[NVPTX_RETURN_REGNUM]);
2234
2235 /* Ensure we have a ptx declaration in the output if necessary. */
2236 if (GET_CODE (callee) == SYMBOL_REF)
2237 {
2238 decl = SYMBOL_REF_DECL (callee);
2239 if (!decl
2240 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2241 nvptx_record_libfunc (callee, result, pat);
2242 else if (DECL_EXTERNAL (decl))
2243 nvptx_record_fndecl (decl);
2244 }
2245
2246 if (needs_tgt)
2247 {
2248 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2249 labelno++;
2250 ASM_OUTPUT_LABEL (asm_out_file, buf);
2251 std::stringstream s;
2252 write_fn_proto_from_insn (s, NULL, result, pat);
2253 fputs (s.str().c_str(), asm_out_file);
2254 }
2255
2256 for (int argno = 1; argno < arg_end; argno++)
2257 {
2258 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2259 machine_mode mode = GET_MODE (t);
2260 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2261
2262 /* Mode splitting has already been done. */
2263 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2264 "\t\tst.param%s [%%out_arg%d], ",
2265 ptx_type, argno, ptx_type, argno);
2266 output_reg (asm_out_file, REGNO (t), VOIDmode);
2267 fprintf (asm_out_file, ";\n");
2268 }
2269
2270 /* The '.' stands for the call's predicate, if any. */
2271 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2272 fprintf (asm_out_file, "\t\tcall ");
2273 if (result != NULL_RTX)
2274 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2275
2276 if (decl)
2277 {
2278 const char *name = get_fnname_from_decl (decl);
2279 name = nvptx_name_replacement (name);
2280 assemble_name (asm_out_file, name);
2281 }
2282 else
2283 output_address (VOIDmode, callee);
2284
2285 const char *open = "(";
2286 for (int argno = 1; argno < arg_end; argno++)
2287 {
2288 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2289 open = "";
2290 }
2291 if (decl && DECL_STATIC_CHAIN (decl))
2292 {
2293 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2294 open = "";
2295 }
2296 if (!open[0])
2297 fprintf (asm_out_file, ")");
2298
2299 if (needs_tgt)
2300 {
2301 fprintf (asm_out_file, ", ");
2302 assemble_name (asm_out_file, buf);
2303 }
2304 fprintf (asm_out_file, ";\n");
2305
2306 if (find_reg_note (insn, REG_NORETURN, NULL))
2307 /* No-return functions confuse the PTX JIT, as it doesn't realize
2308 the flow control barrier they imply. It can seg fault if it
2309 encounters what looks like an unexitable loop. Emit a trailing
2310 trap, which it does grok. */
2311 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2312
2313 if (result)
2314 {
2315 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2316
2317 if (!rval[0])
2318 /* We must escape the '%' that starts RETURN_REGNUM. */
2319 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2320 reg_names[NVPTX_RETURN_REGNUM]);
2321 return rval;
2322 }
2323
2324 return "}";
2325 }
2326
2327 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2328
2329 static bool
2330 nvptx_print_operand_punct_valid_p (unsigned char c)
2331 {
2332 return c == '.' || c== '#';
2333 }
2334
2335 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2336
2337 static void
2338 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2339 {
2340 rtx off;
2341 if (GET_CODE (x) == CONST)
2342 x = XEXP (x, 0);
2343 switch (GET_CODE (x))
2344 {
2345 case PLUS:
2346 off = XEXP (x, 1);
2347 output_address (VOIDmode, XEXP (x, 0));
2348 fprintf (file, "+");
2349 output_address (VOIDmode, off);
2350 break;
2351
2352 case SYMBOL_REF:
2353 case LABEL_REF:
2354 output_addr_const (file, x);
2355 break;
2356
2357 default:
2358 gcc_assert (GET_CODE (x) != MEM);
2359 nvptx_print_operand (file, x, 0);
2360 break;
2361 }
2362 }
2363
2364 /* Write assembly language output for the address ADDR to FILE. */
2365
2366 static void
2367 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2368 {
2369 nvptx_print_address_operand (file, addr, mode);
2370 }
2371
2372 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2373
2374 Meaning of CODE:
2375 . -- print the predicate for the instruction or an empty string for an
2376 unconditional one.
2377 # -- print a rounding mode for the instruction
2378
2379 A -- print a data area for a MEM
2380 c -- print an opcode suffix for a comparison operator, including a type code
2381 D -- print a data area for a MEM operand
2382 S -- print a shuffle kind specified by CONST_INT
2383 t -- print a type opcode suffix, promoting QImode to 32 bits
2384 T -- print a type size in bits
2385 u -- print a type opcode suffix without promotions. */
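/* The 'j' and 'J' codes handled below additionally print the "@" and "@!"
   predicate prefixes before falling through to the default handling.

   As an illustrative example, the template "%.\tmov%t0\t%0, %1;" applied
   to two SImode registers prints as "mov.u32 %r1, %r2;" when the insn is
   unpredicated, or as "@%r99 mov.u32 %r1, %r2;" under a predicate
   (register numbers hypothetical).  */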
2386
2387 static void
2388 nvptx_print_operand (FILE *file, rtx x, int code)
2389 {
2390 if (code == '.')
2391 {
2392 x = current_insn_predicate;
2393 if (x)
2394 {
2395 fputs ("@", file);
2396 if (GET_CODE (x) == EQ)
2397 fputs ("!", file);
2398 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2399 }
2400 return;
2401 }
2402 else if (code == '#')
2403 {
2404 fputs (".rn", file);
2405 return;
2406 }
2407
2408 enum rtx_code x_code = GET_CODE (x);
2409 machine_mode mode = GET_MODE (x);
2410
2411 switch (code)
2412 {
2413 case 'A':
2414 x = XEXP (x, 0);
2415 /* FALLTHROUGH. */
2416
2417 case 'D':
2418 if (GET_CODE (x) == CONST)
2419 x = XEXP (x, 0);
2420 if (GET_CODE (x) == PLUS)
2421 x = XEXP (x, 0);
2422
2423 if (GET_CODE (x) == SYMBOL_REF)
2424 fputs (section_for_sym (x), file);
2425 break;
2426
2427 case 't':
2428 case 'u':
2429 if (x_code == SUBREG)
2430 {
2431 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2432 if (VECTOR_MODE_P (inner_mode)
2433 && (GET_MODE_SIZE (mode)
2434 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2435 mode = GET_MODE_INNER (inner_mode);
2436 else if (split_mode_p (inner_mode))
2437 mode = maybe_split_mode (inner_mode);
2438 else
2439 mode = inner_mode;
2440 }
2441 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2442 break;
2443
2444 case 'H':
2445 case 'L':
2446 {
2447 rtx inner_x = SUBREG_REG (x);
2448 machine_mode inner_mode = GET_MODE (inner_x);
2449 machine_mode split = maybe_split_mode (inner_mode);
2450
2451 output_reg (file, REGNO (inner_x), split,
2452 (code == 'H'
2453 ? GET_MODE_SIZE (inner_mode) / 2
2454 : 0));
2455 }
2456 break;
2457
2458 case 'S':
2459 {
2460 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2461 /* Same order as nvptx_shuffle_kind. */
2462 static const char *const kinds[] =
2463 {".up", ".down", ".bfly", ".idx"};
2464 fputs (kinds[kind], file);
2465 }
2466 break;
2467
2468 case 'T':
2469 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2470 break;
2471
2472 case 'j':
2473 fprintf (file, "@");
2474 goto common;
2475
2476 case 'J':
2477 fprintf (file, "@!");
2478 goto common;
2479
2480 case 'c':
2481 mode = GET_MODE (XEXP (x, 0));
2482 switch (x_code)
2483 {
2484 case EQ:
2485 fputs (".eq", file);
2486 break;
2487 case NE:
2488 if (FLOAT_MODE_P (mode))
2489 fputs (".neu", file);
2490 else
2491 fputs (".ne", file);
2492 break;
2493 case LE:
2494 case LEU:
2495 fputs (".le", file);
2496 break;
2497 case GE:
2498 case GEU:
2499 fputs (".ge", file);
2500 break;
2501 case LT:
2502 case LTU:
2503 fputs (".lt", file);
2504 break;
2505 case GT:
2506 case GTU:
2507 fputs (".gt", file);
2508 break;
2509 case LTGT:
2510 fputs (".ne", file);
2511 break;
2512 case UNEQ:
2513 fputs (".equ", file);
2514 break;
2515 case UNLE:
2516 fputs (".leu", file);
2517 break;
2518 case UNGE:
2519 fputs (".geu", file);
2520 break;
2521 case UNLT:
2522 fputs (".ltu", file);
2523 break;
2524 case UNGT:
2525 fputs (".gtu", file);
2526 break;
2527 case UNORDERED:
2528 fputs (".nan", file);
2529 break;
2530 case ORDERED:
2531 fputs (".num", file);
2532 break;
2533 default:
2534 gcc_unreachable ();
2535 }
2536 if (FLOAT_MODE_P (mode)
2537 || x_code == EQ || x_code == NE
2538 || x_code == GEU || x_code == GTU
2539 || x_code == LEU || x_code == LTU)
2540 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2541 else
2542 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2543 break;
2544 default:
2545 common:
2546 switch (x_code)
2547 {
2548 case SUBREG:
2549 {
2550 rtx inner_x = SUBREG_REG (x);
2551 machine_mode inner_mode = GET_MODE (inner_x);
2552 machine_mode split = maybe_split_mode (inner_mode);
2553
2554 if (VECTOR_MODE_P (inner_mode)
2555 && (GET_MODE_SIZE (mode)
2556 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2557 {
2558 output_reg (file, REGNO (inner_x), VOIDmode);
2559 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2560 }
2561 else if (split_mode_p (inner_mode)
2562 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2563 output_reg (file, REGNO (inner_x), split);
2564 else
2565 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2566 }
2567 break;
2568
2569 case REG:
2570 output_reg (file, REGNO (x), maybe_split_mode (mode));
2571 break;
2572
2573 case MEM:
2574 fputc ('[', file);
2575 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2576 fputc (']', file);
2577 break;
2578
2579 case CONST_INT:
2580 output_addr_const (file, x);
2581 break;
2582
2583 case CONST:
2584 case SYMBOL_REF:
2585 case LABEL_REF:
2586 /* We could use output_addr_const, but that can print things like
2587 "x-8", which breaks ptxas. Need to ensure it is output as
2588 "x+-8". */
2589 nvptx_print_address_operand (file, x, VOIDmode);
2590 break;
2591
2592 case CONST_DOUBLE:
2593 long vals[2];
2594 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2595 vals[0] &= 0xffffffff;
2596 vals[1] &= 0xffffffff;
2597 if (mode == SFmode)
2598 fprintf (file, "0f%08lx", vals[0]);
2599 else
2600 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2601 break;
2602
2603 case CONST_VECTOR:
2604 {
2605 unsigned n = CONST_VECTOR_NUNITS (x);
2606 fprintf (file, "{ ");
2607 for (unsigned i = 0; i < n; ++i)
2608 {
2609 if (i != 0)
2610 fprintf (file, ", ");
2611
2612 rtx elem = CONST_VECTOR_ELT (x, i);
2613 output_addr_const (file, elem);
2614 }
2615 fprintf (file, " }");
2616 }
2617 break;
2618
2619 default:
2620 output_addr_const (file, x);
2621 }
2622 }
2623 }
2624 \f
2625 /* Record replacement regs used to deal with subreg operands. */
2626 struct reg_replace
2627 {
2628 rtx replacement[MAX_RECOG_OPERANDS];
2629 machine_mode mode;
2630 int n_allocated;
2631 int n_in_use;
2632 };
2633
2634 /* Allocate or reuse a replacement in R and return the rtx. */
2635
2636 static rtx
2637 get_replacement (struct reg_replace *r)
2638 {
2639 if (r->n_allocated == r->n_in_use)
2640 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2641 return r->replacement[r->n_in_use++];
2642 }
2643
2644 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2645 the presence of subregs would break the rules for most instructions.
2646 Replace them with a suitable new register of the right size, plus
2647 conversion copyin/copyout instructions. */
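/* As a rough illustration: a paradoxical input operand such as
   (subreg:SI (reg:QI 23) 0) is rewritten to use a fresh SImode register,
   with a copy-in

     (set (reg:SI new) (zero_extend:SI (reg:QI 23)))

   emitted before the insn; an output operand of the same shape instead
   gets a truncating copy-out emitted after the insn.  */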
2648
2649 static void
2650 nvptx_reorg_subreg (void)
2651 {
2652 struct reg_replace qiregs, hiregs, siregs, diregs;
2653 rtx_insn *insn, *next;
2654
2655 qiregs.n_allocated = 0;
2656 hiregs.n_allocated = 0;
2657 siregs.n_allocated = 0;
2658 diregs.n_allocated = 0;
2659 qiregs.mode = QImode;
2660 hiregs.mode = HImode;
2661 siregs.mode = SImode;
2662 diregs.mode = DImode;
2663
2664 for (insn = get_insns (); insn; insn = next)
2665 {
2666 next = NEXT_INSN (insn);
2667 if (!NONDEBUG_INSN_P (insn)
2668 || asm_noperands (PATTERN (insn)) >= 0
2669 || GET_CODE (PATTERN (insn)) == USE
2670 || GET_CODE (PATTERN (insn)) == CLOBBER)
2671 continue;
2672
2673 qiregs.n_in_use = 0;
2674 hiregs.n_in_use = 0;
2675 siregs.n_in_use = 0;
2676 diregs.n_in_use = 0;
2677 extract_insn (insn);
2678 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2679
2680 for (int i = 0; i < recog_data.n_operands; i++)
2681 {
2682 rtx op = recog_data.operand[i];
2683 if (GET_CODE (op) != SUBREG)
2684 continue;
2685
2686 rtx inner = SUBREG_REG (op);
2687
2688 machine_mode outer_mode = GET_MODE (op);
2689 machine_mode inner_mode = GET_MODE (inner);
2690 gcc_assert (s_ok);
2691 if (s_ok
2692 && (GET_MODE_PRECISION (inner_mode)
2693 >= GET_MODE_PRECISION (outer_mode)))
2694 continue;
2695 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2696 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2697 : outer_mode == HImode ? &hiregs
2698 : outer_mode == SImode ? &siregs
2699 : &diregs);
2700 rtx new_reg = get_replacement (r);
2701
2702 if (recog_data.operand_type[i] != OP_OUT)
2703 {
2704 enum rtx_code code;
2705 if (GET_MODE_PRECISION (inner_mode)
2706 < GET_MODE_PRECISION (outer_mode))
2707 code = ZERO_EXTEND;
2708 else
2709 code = TRUNCATE;
2710
2711 rtx pat = gen_rtx_SET (new_reg,
2712 gen_rtx_fmt_e (code, outer_mode, inner));
2713 emit_insn_before (pat, insn);
2714 }
2715
2716 if (recog_data.operand_type[i] != OP_IN)
2717 {
2718 enum rtx_code code;
2719 if (GET_MODE_PRECISION (inner_mode)
2720 < GET_MODE_PRECISION (outer_mode))
2721 code = TRUNCATE;
2722 else
2723 code = ZERO_EXTEND;
2724
2725 rtx pat = gen_rtx_SET (inner,
2726 gen_rtx_fmt_e (code, inner_mode, new_reg));
2727 emit_insn_after (pat, insn);
2728 }
2729 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2730 }
2731 }
2732 }
2733
2734 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2735 first use. */
2736
2737 static rtx
2738 nvptx_get_unisimt_master ()
2739 {
2740 rtx &master = cfun->machine->unisimt_master;
2741 return master ? master : master = gen_reg_rtx (SImode);
2742 }
2743
2744 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2745
2746 static rtx
2747 nvptx_get_unisimt_predicate ()
2748 {
2749 rtx &pred = cfun->machine->unisimt_predicate;
2750 return pred ? pred : pred = gen_reg_rtx (BImode);
2751 }
2752
2753 /* Return true if given call insn references one of the functions provided by
2754 the CUDA runtime: malloc, free, vprintf. */
2755
2756 static bool
2757 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2758 {
2759 rtx pat = PATTERN (insn);
2760 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2761 pat = XVECEXP (pat, 0, 0);
2762 if (GET_CODE (pat) == SET)
2763 pat = SET_SRC (pat);
2764 gcc_checking_assert (GET_CODE (pat) == CALL
2765 && GET_CODE (XEXP (pat, 0)) == MEM);
2766 rtx addr = XEXP (XEXP (pat, 0), 0);
2767 if (GET_CODE (addr) != SYMBOL_REF)
2768 return false;
2769 const char *name = XSTR (addr, 0);
2770 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2771 references with forced assembler name refer to PTX syscalls. For vprintf,
2772 accept both normal and forced-assembler-name references. */
2773 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2774 || !strcmp (name, "*malloc")
2775 || !strcmp (name, "*free"));
2776 }
2777
2778 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2779 propagate its value from lane MASTER to current lane. */
2780
2781 static void
2782 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2783 {
2784 rtx reg;
2785 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2786 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2787 }
2788
2789 /* Adjust code for uniform-simt code generation variant by making atomics and
2790 "syscalls" conditionally executed, and inserting shuffle-based propagation
2791 for registers being set. */
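/* Illustrative sketch of the transformation: an atomic or "syscall" insn
   whose pattern sets (reg:SI 30) becomes

     (cond_exec (ne unisimt_pred (const_int 0)) (parallel [...]))

   so that only the master lane executes it, and a warp shuffle copying
   (reg:SI 30) from the master lane is emitted just after the insn so
   every lane ends up with the same value.  */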
2792
2793 static void
2794 nvptx_reorg_uniform_simt ()
2795 {
2796 rtx_insn *insn, *next;
2797
2798 for (insn = get_insns (); insn; insn = next)
2799 {
2800 next = NEXT_INSN (insn);
2801 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2802 && !(NONJUMP_INSN_P (insn)
2803 && GET_CODE (PATTERN (insn)) == PARALLEL
2804 && get_attr_atomic (insn)))
2805 continue;
2806 rtx pat = PATTERN (insn);
2807 rtx master = nvptx_get_unisimt_master ();
2808 for (int i = 0; i < XVECLEN (pat, 0); i++)
2809 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2810 rtx pred = nvptx_get_unisimt_predicate ();
2811 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2812 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2813 validate_change (insn, &PATTERN (insn), pat, false);
2814 }
2815 }
2816
2817 /* Loop structure of the function. The entire function is described as
2818 a NULL loop. */
2819
2820 struct parallel
2821 {
2822 /* Parent parallel. */
2823 parallel *parent;
2824
2825 /* Next sibling parallel. */
2826 parallel *next;
2827
2828 /* First child parallel. */
2829 parallel *inner;
2830
2831 /* Partitioning mask of the parallel. */
2832 unsigned mask;
2833
2834 /* Partitioning used within inner parallels. */
2835 unsigned inner_mask;
2836
2837 /* Location of parallel forked and join. The forked is the first
2838 block in the parallel and the join is the first block after
2839 the partition. */
2840 basic_block forked_block;
2841 basic_block join_block;
2842
2843 rtx_insn *forked_insn;
2844 rtx_insn *join_insn;
2845
2846 rtx_insn *fork_insn;
2847 rtx_insn *joining_insn;
2848
2849 /* Basic blocks in this parallel, but not in child parallels. The
2850 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2851 blocks are not. */
2852 auto_vec<basic_block> blocks;
2853
2854 public:
2855 parallel (parallel *parent, unsigned mode);
2856 ~parallel ();
2857 };
2858
2859 /* Constructor links the new parallel into its parent's chain of
2860 children. */
2861
2862 parallel::parallel (parallel *parent_, unsigned mask_)
2863 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2864 {
2865 forked_block = join_block = 0;
2866 forked_insn = join_insn = 0;
2867 fork_insn = joining_insn = 0;
2868
2869 if (parent)
2870 {
2871 next = parent->inner;
2872 parent->inner = this;
2873 }
2874 }
2875
2876 parallel::~parallel ()
2877 {
2878 delete inner;
2879 delete next;
2880 }
2881
2882 /* Map of basic blocks to insns */
2883 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2884
2885 /* A tuple of an insn of interest and the BB in which it resides. */
2886 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2887 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2888
2889 /* Split basic blocks so that each forked and join unspec is at
2890 the start of its basic block. Thus afterwards each block will
2891 have a single partitioning mode. We also do the same for return
2892 insns, as they are executed by every thread. Return the
2893 partitioning mode of the function as a whole. Populate MAP with
2894 head and tail blocks. We also clear the BB visited flag, which is
2895 used when finding partitions. */
2896
2897 static void
2898 nvptx_split_blocks (bb_insn_map_t *map)
2899 {
2900 insn_bb_vec_t worklist;
2901 basic_block block;
2902 rtx_insn *insn;
2903
2904 /* Locate all the reorg instructions of interest. */
2905 FOR_ALL_BB_FN (block, cfun)
2906 {
2907 bool seen_insn = false;
2908
2909 /* Clear visited flag, for use by parallel locator */
2910 block->flags &= ~BB_VISITED;
2911
2912 FOR_BB_INSNS (block, insn)
2913 {
2914 if (!INSN_P (insn))
2915 continue;
2916 switch (recog_memoized (insn))
2917 {
2918 default:
2919 seen_insn = true;
2920 continue;
2921 case CODE_FOR_nvptx_forked:
2922 case CODE_FOR_nvptx_join:
2923 break;
2924
2925 case CODE_FOR_return:
2926 /* We also need to split just before return insns, as
2927 that insn needs executing by all threads, but the
2928 block it is in probably does not. */
2929 break;
2930 }
2931
2932 if (seen_insn)
2933 /* We've found an instruction that must be at the start of
2934 a block, but isn't. Add it to the worklist. */
2935 worklist.safe_push (insn_bb_t (insn, block));
2936 else
2937 /* It was already the first instruction. Just add it to
2938 the map. */
2939 map->get_or_insert (block) = insn;
2940 seen_insn = true;
2941 }
2942 }
2943
2944 /* Split blocks on the worklist. */
2945 unsigned ix;
2946 insn_bb_t *elt;
2947 basic_block remap = 0;
2948 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2949 {
2950 if (remap != elt->second)
2951 {
2952 block = elt->second;
2953 remap = block;
2954 }
2955
2956 /* Split block before insn. The insn is in the new block. */
2957 edge e = split_block (block, PREV_INSN (elt->first));
2958
2959 block = e->dest;
2960 map->get_or_insert (block) = elt->first;
2961 }
2962 }
2963
2964 /* BLOCK is a basic block containing a head or tail instruction.
2965 Locate the associated prehead or pretail instruction, which must be
2966 in the single predecessor block. */
2967
2968 static rtx_insn *
2969 nvptx_discover_pre (basic_block block, int expected)
2970 {
2971 gcc_assert (block->preds->length () == 1);
2972 basic_block pre_block = (*block->preds)[0]->src;
2973 rtx_insn *pre_insn;
2974
2975 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2976 pre_insn = PREV_INSN (pre_insn))
2977 gcc_assert (pre_insn != BB_HEAD (pre_block));
2978
2979 gcc_assert (recog_memoized (pre_insn) == expected);
2980 return pre_insn;
2981 }
2982
2983 /* Dump this parallel and all its inner parallels. */
2984
2985 static void
2986 nvptx_dump_pars (parallel *par, unsigned depth)
2987 {
2988 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2989 depth, par->mask,
2990 par->forked_block ? par->forked_block->index : -1,
2991 par->join_block ? par->join_block->index : -1);
2992
2993 fprintf (dump_file, " blocks:");
2994
2995 basic_block block;
2996 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2997 fprintf (dump_file, " %d", block->index);
2998 fprintf (dump_file, "\n");
2999 if (par->inner)
3000 nvptx_dump_pars (par->inner, depth + 1);
3001
3002 if (par->next)
3003 nvptx_dump_pars (par->next, depth);
3004 }
3005
3006 /* If BLOCK contains a fork/join marker, process it to create or
3007 terminate a loop structure. Add this block to the current loop,
3008 and then walk successor blocks. */
3009
3010 static parallel *
3011 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3012 {
3013 if (block->flags & BB_VISITED)
3014 return par;
3015 block->flags |= BB_VISITED;
3016
3017 if (rtx_insn **endp = map->get (block))
3018 {
3019 rtx_insn *end = *endp;
3020
3021 /* This is a block head or tail, or return instruction. */
3022 switch (recog_memoized (end))
3023 {
3024 case CODE_FOR_return:
3025 /* Return instructions are in their own block, and we
3026 don't need to do anything more. */
3027 return par;
3028
3029 case CODE_FOR_nvptx_forked:
3030 /* Loop head, create a new inner loop and add it into
3031 our parent's child list. */
3032 {
3033 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3034
3035 gcc_assert (mask);
3036 par = new parallel (par, mask);
3037 par->forked_block = block;
3038 par->forked_insn = end;
3039 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3040 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3041 par->fork_insn
3042 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3043 }
3044 break;
3045
3046 case CODE_FOR_nvptx_join:
3047 /* A loop tail. Finish the current loop and return to
3048 parent. */
3049 {
3050 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3051
3052 gcc_assert (par->mask == mask);
3053 par->join_block = block;
3054 par->join_insn = end;
3055 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3056 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3057 par->joining_insn
3058 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3059 par = par->parent;
3060 }
3061 break;
3062
3063 default:
3064 gcc_unreachable ();
3065 }
3066 }
3067
3068 if (par)
3069 /* Add this block onto the current loop's list of blocks. */
3070 par->blocks.safe_push (block);
3071 else
3072 /* This must be the entry block. Create a NULL parallel. */
3073 par = new parallel (0, 0);
3074
3075 /* Walk successor blocks. */
3076 edge e;
3077 edge_iterator ei;
3078
3079 FOR_EACH_EDGE (e, ei, block->succs)
3080 nvptx_find_par (map, par, e->dest);
3081
3082 return par;
3083 }
3084
3085 /* DFS walk the CFG looking for fork & join markers. Construct
3086 loop structures as we go. MAP is a mapping of basic blocks
3087 to head & tail markers, discovered when splitting blocks. This
3088 speeds up the discovery. We rely on the BB visited flag having
3089 been cleared when splitting blocks. */
3090
3091 static parallel *
3092 nvptx_discover_pars (bb_insn_map_t *map)
3093 {
3094 basic_block block;
3095
3096 /* Mark exit blocks as visited. */
3097 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3098 block->flags |= BB_VISITED;
3099
3100 /* And entry block as not. */
3101 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3102 block->flags &= ~BB_VISITED;
3103
3104 parallel *par = nvptx_find_par (map, 0, block);
3105
3106 if (dump_file)
3107 {
3108 fprintf (dump_file, "\nLoops\n");
3109 nvptx_dump_pars (par, 0);
3110 fprintf (dump_file, "\n");
3111 }
3112
3113 return par;
3114 }
3115
3116 /* Analyse a group of BBs within a partitioned region and create N
3117 Single-Entry-Single-Exit regions. Some of those regions will be
3118 trivial ones consisting of a single BB. The blocks of a
3119 partitioned region might form a set of disjoint graphs -- because
3120 the region encloses a differently partitioned sub region.
3121
3122 We use the linear time algorithm described in 'Finding Regions Fast:
3123 Single Entry Single Exit and Control Regions in Linear Time'
3124 Johnson, Pearson & Pingali. That algorithm deals with complete
3125 CFGs, where a back edge is inserted from END to START, and thus the
3126 problem becomes one of finding equivalent loops.
3127
3128 In this case we have a partial CFG. We complete it by redirecting
3129 any incoming edge to the graph to be from an arbitrary external BB,
3130 and similarly redirecting any outgoing edge to be to that BB.
3131 Thus we end up with a closed graph.
3132
3133 The algorithm works by building a spanning tree of an undirected
3134 graph and keeping track of back edges from nodes further from the
3135 root in the tree to nodes nearer to the root in the tree. In the
3136 description below, the root is up and the tree grows downwards.
3137
3138 We avoid having to deal with degenerate back-edges to the same
3139 block, by splitting each BB into 3 -- one for input edges, one for
3140 the node itself and one for the output edges. Such back edges are
3141 referred to as 'Brackets'. Cycle equivalent nodes will have the
3142 same set of brackets.
3143
3144 Determining bracket equivalency is done by maintaining a list of
3145 brackets in such a manner that the list length and final bracket
3146 uniquely identify the set.
3147
3148 We use coloring to mark all BBs with cycle equivalency with the
3149 same color. This is the output of the 'Finding Regions Fast'
3150 algorithm. Notice it doesn't actually find the set of nodes within
3151 a particular region, just unordered sets of nodes that are the
3152 entries and exits of SESE regions.
3153
3154 After determining cycle equivalency, we need to find the minimal
3155 set of SESE regions. Do this with a DFS coloring walk of the
3156 complete graph. We're either 'looking' or 'coloring'. When
3157 looking, and we're in the subgraph, we start coloring the color of
3158 the current node, and remember that node as the start of the
3159 current color's SESE region. Every time we go to a new node, we
3160 decrement the count of nodes with that color. If it reaches zero,
3161 we remember that node as the end of the current color's SESE region
3162 and return to 'looking'. Otherwise we color the node the current
3163 color.
3164
3165 This way we end up with coloring the inside of non-trivial SESE
3166 regions with the color of that region. */
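/* A small, hypothetical example: for a diamond A->B, A->C, B->D, C->D
   (closed by the implicit back edge from exit to entry), A and D end up
   cycle equivalent and delimit one non-trivial SESE region containing B
   and C, while B and C each get their own color and form trivial
   single-block regions.  */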
3167
3168 /* A pair of BBs. We use this to represent SESE regions. */
3169 typedef std::pair<basic_block, basic_block> bb_pair_t;
3170 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3171
3172 /* A node in the undirected CFG. The discriminator SECOND indicates just
3173 above or just below the BB indicated by FIRST. */
3174 typedef std::pair<basic_block, int> pseudo_node_t;
3175
3176 /* A bracket indicates an edge towards the root of the spanning tree of the
3177 undirected graph. Each bracket has a color, determined
3178 from the current set of brackets. */
3179 struct bracket
3180 {
3181 pseudo_node_t back; /* Back target */
3182
3183 /* Current color and size of set. */
3184 unsigned color;
3185 unsigned size;
3186
3187 bracket (pseudo_node_t back_)
3188 : back (back_), color (~0u), size (~0u)
3189 {
3190 }
3191
3192 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3193 {
3194 if (length != size)
3195 {
3196 size = length;
3197 color = color_counts.length ();
3198 color_counts.quick_push (0);
3199 }
3200 color_counts[color]++;
3201 return color;
3202 }
3203 };
3204
3205 typedef auto_vec<bracket> bracket_vec_t;
3206
3207 /* Basic block info for finding SESE regions. */
3208
3209 struct bb_sese
3210 {
3211 int node; /* Node number in spanning tree. */
3212 int parent; /* Parent node number. */
3213
3214 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3215 edges arrive at pseudo-node Ai and the outgoing edges leave at
3216 pseudo-node Ao. We have to remember which way we arrived at a
3217 particular node when generating the spanning tree. dir > 0 means
3218 we arrived at Ai, dir < 0 means we arrived at Ao. */
3219 int dir;
3220
3221 /* Lowest numbered pseudo-node reached via a backedge from this
3222 node, or any descendant. */
3223 pseudo_node_t high;
3224
3225 int color; /* Cycle-equivalence color */
3226
3227 /* Stack of brackets for this node. */
3228 bracket_vec_t brackets;
3229
3230 bb_sese (unsigned node_, unsigned p, int dir_)
3231 :node (node_), parent (p), dir (dir_)
3232 {
3233 }
3234 ~bb_sese ();
3235
3236 /* Push a bracket ending at BACK. */
3237 void push (const pseudo_node_t &back)
3238 {
3239 if (dump_file)
3240 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3241 back.first ? back.first->index : 0, back.second);
3242 brackets.safe_push (bracket (back));
3243 }
3244
3245 void append (bb_sese *child);
3246 void remove (const pseudo_node_t &);
3247
3248 /* Set node's color. */
3249 void set_color (auto_vec<unsigned> &color_counts)
3250 {
3251 color = brackets.last ().get_color (color_counts, brackets.length ());
3252 }
3253 };
3254
3255 bb_sese::~bb_sese ()
3256 {
3257 }
3258
3259 /* Destructively append CHILD's brackets. */
3260
3261 void
3262 bb_sese::append (bb_sese *child)
3263 {
3264 if (int len = child->brackets.length ())
3265 {
3266 int ix;
3267
3268 if (dump_file)
3269 {
3270 for (ix = 0; ix < len; ix++)
3271 {
3272 const pseudo_node_t &pseudo = child->brackets[ix].back;
3273 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3274 child->node, pseudo.first ? pseudo.first->index : 0,
3275 pseudo.second);
3276 }
3277 }
3278 if (!brackets.length ())
3279 std::swap (brackets, child->brackets);
3280 else
3281 {
3282 brackets.reserve (len);
3283 for (ix = 0; ix < len; ix++)
3284 brackets.quick_push (child->brackets[ix]);
3285 }
3286 }
3287 }
3288
3289 /* Remove brackets that terminate at PSEUDO. */
3290
3291 void
3292 bb_sese::remove (const pseudo_node_t &pseudo)
3293 {
3294 unsigned removed = 0;
3295 int len = brackets.length ();
3296
3297 for (int ix = 0; ix < len; ix++)
3298 {
3299 if (brackets[ix].back == pseudo)
3300 {
3301 if (dump_file)
3302 fprintf (dump_file, "Removing backedge %d:%+d\n",
3303 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3304 removed++;
3305 }
3306 else if (removed)
3307 brackets[ix-removed] = brackets[ix];
3308 }
3309 while (removed--)
3310 brackets.pop ();
3311 }
3312
3313 /* Accessors for BB's aux pointer. */
3314 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3315 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3316
3317 /* DFS walk creating SESE data structures. Only cover nodes with
3318 BB_VISITED set. Append discovered blocks to LIST. We number in
3319 increments of 3 so that the above and below pseudo nodes can be
3320 implicitly numbered too. */
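/* Numbering in steps of 3 means a block numbered N conceptually reserves
   N-1 and N+1 for its two pseudo-nodes, so pseudo-node numbers never
   collide with another block's number.  */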
3321
3322 static int
3323 nvptx_sese_number (int n, int p, int dir, basic_block b,
3324 auto_vec<basic_block> *list)
3325 {
3326 if (BB_GET_SESE (b))
3327 return n;
3328
3329 if (dump_file)
3330 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3331 b->index, n, p, dir);
3332
3333 BB_SET_SESE (b, new bb_sese (n, p, dir));
3334 p = n;
3335
3336 n += 3;
3337 list->quick_push (b);
3338
3339 /* First walk the nodes on the 'other side' of this node, then walk
3340 the nodes on the same side. */
3341 for (unsigned ix = 2; ix; ix--)
3342 {
3343 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3344 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3345 : offsetof (edge_def, src));
3346 edge e;
3347 edge_iterator (ei);
3348
3349 FOR_EACH_EDGE (e, ei, edges)
3350 {
3351 basic_block target = *(basic_block *)((char *)e + offset);
3352
3353 if (target->flags & BB_VISITED)
3354 n = nvptx_sese_number (n, p, dir, target, list);
3355 }
3356 dir = -dir;
3357 }
3358 return n;
3359 }
3360
3361 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3362 EDGES are the outgoing edges and OFFSET is the offset to the src
3363 or dst block on the edges. */
3364
3365 static void
3366 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3367 vec<edge, va_gc> *edges, size_t offset)
3368 {
3369 edge e;
3370 edge_iterator (ei);
3371 int hi_back = depth;
3372 pseudo_node_t node_back (0, depth);
3373 int hi_child = depth;
3374 pseudo_node_t node_child (0, depth);
3375 basic_block child = NULL;
3376 unsigned num_children = 0;
3377 int usd = -dir * sese->dir;
3378
3379 if (dump_file)
3380 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3381 me->index, sese->node, dir);
3382
3383 if (dir < 0)
3384 {
3385 /* This is the above pseudo-child. It has the BB itself as an
3386 additional child node. */
3387 node_child = sese->high;
3388 hi_child = node_child.second;
3389 if (node_child.first)
3390 hi_child += BB_GET_SESE (node_child.first)->node;
3391 num_children++;
3392 }
3393
3394 /* Examine each edge.
3395 - if it is a child (a) append its bracket list and (b) record
3396 whether it is the child with the highest reaching bracket.
3397 - if it is an edge to ancestor, record whether it's the highest
3398 reaching backlink. */
3399 FOR_EACH_EDGE (e, ei, edges)
3400 {
3401 basic_block target = *(basic_block *)((char *)e + offset);
3402
3403 if (bb_sese *t_sese = BB_GET_SESE (target))
3404 {
3405 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3406 {
3407 /* Child node. Append its bracket list. */
3408 num_children++;
3409 sese->append (t_sese);
3410
3411 /* Compare its hi value. */
3412 int t_hi = t_sese->high.second;
3413
3414 if (basic_block child_hi_block = t_sese->high.first)
3415 t_hi += BB_GET_SESE (child_hi_block)->node;
3416
3417 if (hi_child > t_hi)
3418 {
3419 hi_child = t_hi;
3420 node_child = t_sese->high;
3421 child = target;
3422 }
3423 }
3424 else if (t_sese->node < sese->node + dir
3425 && !(dir < 0 && sese->parent == t_sese->node))
3426 {
3427 /* Non-parental ancestor node -- a backlink. */
3428 int d = usd * t_sese->dir;
3429 int back = t_sese->node + d;
3430
3431 if (hi_back > back)
3432 {
3433 hi_back = back;
3434 node_back = pseudo_node_t (target, d);
3435 }
3436 }
3437 }
3438 else
3439 { /* Fallen off graph, backlink to entry node. */
3440 hi_back = 0;
3441 node_back = pseudo_node_t (0, 0);
3442 }
3443 }
3444
3445 /* Remove any brackets that terminate at this pseudo node. */
3446 sese->remove (pseudo_node_t (me, dir));
3447
3448 /* Now push any backlinks from this pseudo node. */
3449 FOR_EACH_EDGE (e, ei, edges)
3450 {
3451 basic_block target = *(basic_block *)((char *)e + offset);
3452 if (bb_sese *t_sese = BB_GET_SESE (target))
3453 {
3454 if (t_sese->node < sese->node + dir
3455 && !(dir < 0 && sese->parent == t_sese->node))
3456 /* Non-parental ancestor node - backedge from me. */
3457 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3458 }
3459 else
3460 {
3461 /* back edge to entry node */
3462 sese->push (pseudo_node_t (0, 0));
3463 }
3464 }
3465
3466 /* If this node leads directly or indirectly to a no-return region of
3467 the graph, then fake a backedge to entry node. */
3468 if (!sese->brackets.length () || !edges || !edges->length ())
3469 {
3470 hi_back = 0;
3471 node_back = pseudo_node_t (0, 0);
3472 sese->push (node_back);
3473 }
3474
3475 /* Record the highest reaching backedge from us or a descendant. */
3476 sese->high = hi_back < hi_child ? node_back : node_child;
3477
3478 if (num_children > 1)
3479 {
3480 /* There is more than one child -- this is a Y shaped piece of
3481 spanning tree. We have to insert a fake backedge from this
3482 node to the highest ancestor reached by not-the-highest
3483 reaching child. Note that there may be multiple children
3484 with backedges to the same highest node. That's ok and we
3485 insert the edge to that highest node. */
3486 hi_child = depth;
3487 if (dir < 0 && child)
3488 {
3489 node_child = sese->high;
3490 hi_child = node_child.second;
3491 if (node_child.first)
3492 hi_child += BB_GET_SESE (node_child.first)->node;
3493 }
3494
3495 FOR_EACH_EDGE (e, ei, edges)
3496 {
3497 basic_block target = *(basic_block *)((char *)e + offset);
3498
3499 if (target == child)
3500 /* Ignore the highest child. */
3501 continue;
3502
3503 bb_sese *t_sese = BB_GET_SESE (target);
3504 if (!t_sese)
3505 continue;
3506 if (t_sese->parent != sese->node)
3507 /* Not a child. */
3508 continue;
3509
3510 /* Compare its hi value. */
3511 int t_hi = t_sese->high.second;
3512
3513 if (basic_block child_hi_block = t_sese->high.first)
3514 t_hi += BB_GET_SESE (child_hi_block)->node;
3515
3516 if (hi_child > t_hi)
3517 {
3518 hi_child = t_hi;
3519 node_child = t_sese->high;
3520 }
3521 }
3522
3523 sese->push (node_child);
3524 }
3525 }
3526
3527
3528 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3529 proceed to successors. Set SESE entry and exit nodes of
3530 REGIONS. */
3531
3532 static void
3533 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3534 basic_block block, int coloring)
3535 {
3536 bb_sese *sese = BB_GET_SESE (block);
3537
3538 if (block->flags & BB_VISITED)
3539 {
3540 /* If we've already encountered this block, either we must not
3541 be coloring, or it must have been colored the current color. */
3542 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3543 return;
3544 }
3545
3546 block->flags |= BB_VISITED;
3547
3548 if (sese)
3549 {
3550 if (coloring < 0)
3551 {
3552 /* Start coloring a region. */
3553 regions[sese->color].first = block;
3554 coloring = sese->color;
3555 }
3556
3557 if (!--color_counts[sese->color] && sese->color == coloring)
3558 {
3559 /* Found final block of SESE region. */
3560 regions[sese->color].second = block;
3561 coloring = -1;
3562 }
3563 else
3564 /* Color the node, so we can assert on revisiting the node
3565 that the graph is indeed SESE. */
3566 sese->color = coloring;
3567 }
3568 else
3569 /* Fallen off the subgraph, we cannot be coloring. */
3570 gcc_assert (coloring < 0);
3571
3572 /* Walk each successor block. */
3573 if (block->succs && block->succs->length ())
3574 {
3575 edge e;
3576 edge_iterator ei;
3577
3578 FOR_EACH_EDGE (e, ei, block->succs)
3579 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3580 }
3581 else
3582 gcc_assert (coloring < 0);
3583 }
3584
3585 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3586 end up with NULL entries in it. */
3587
3588 static void
3589 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3590 {
3591 basic_block block;
3592 int ix;
3593
3594 /* First clear each BB of the whole function. */
3595 FOR_ALL_BB_FN (block, cfun)
3596 {
3597 block->flags &= ~BB_VISITED;
3598 BB_SET_SESE (block, 0);
3599 }
3600
3601 /* Mark blocks in the function that are in this graph. */
3602 for (ix = 0; blocks.iterate (ix, &block); ix++)
3603 block->flags |= BB_VISITED;
3604
3605 /* Counts of nodes assigned to each color. There cannot be more
3606 colors than blocks (and hopefully there will be fewer). */
3607 auto_vec<unsigned> color_counts;
3608 color_counts.reserve (blocks.length ());
3609
3610 /* Worklist of nodes in the spanning tree. Again, there cannot be
3611 more nodes in the tree than blocks (there will be fewer if the
3612 CFG of blocks is disjoint). */
3613 auto_vec<basic_block> spanlist;
3614 spanlist.reserve (blocks.length ());
3615
3616 /* Make sure every block has its cycle class determined. */
3617 for (ix = 0; blocks.iterate (ix, &block); ix++)
3618 {
3619 if (BB_GET_SESE (block))
3620 /* We already met this block in an earlier graph solve. */
3621 continue;
3622
3623 if (dump_file)
3624 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3625
3626 /* Number the nodes reachable from BLOCK in initial DFS order. */
3627 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3628
3629 /* Now walk in reverse DFS order to find cycle equivalents. */
3630 while (spanlist.length ())
3631 {
3632 block = spanlist.pop ();
3633 bb_sese *sese = BB_GET_SESE (block);
3634
3635 /* Do the pseudo node below. */
3636 nvptx_sese_pseudo (block, sese, depth, +1,
3637 sese->dir > 0 ? block->succs : block->preds,
3638 (sese->dir > 0 ? offsetof (edge_def, dest)
3639 : offsetof (edge_def, src)));
3640 sese->set_color (color_counts);
3641 /* Do the pseudo node above. */
3642 nvptx_sese_pseudo (block, sese, depth, -1,
3643 sese->dir < 0 ? block->succs : block->preds,
3644 (sese->dir < 0 ? offsetof (edge_def, dest)
3645 : offsetof (edge_def, src)));
3646 }
3647 if (dump_file)
3648 fprintf (dump_file, "\n");
3649 }
3650
3651 if (dump_file)
3652 {
3653 unsigned count;
3654 const char *comma = "";
3655
3656 fprintf (dump_file, "Found %d cycle equivalents\n",
3657 color_counts.length ());
3658 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3659 {
3660 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3661
3662 comma = "";
3663 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3664 if (BB_GET_SESE (block)->color == ix)
3665 {
3666 block->flags |= BB_VISITED;
3667 fprintf (dump_file, "%s%d", comma, block->index);
3668 comma=",";
3669 }
3670 fprintf (dump_file, "}");
3671 comma = ", ";
3672 }
3673 fprintf (dump_file, "\n");
3674 }
3675
3676 /* Now we've colored every block in the subgraph. We now need to
3677 determine the minimal set of SESE regions that cover that
3678 subgraph. Do this with a DFS walk of the complete function.
3679 During the walk we're either 'looking' or 'coloring'. When we
3680 reach the last node of a particular color, we stop coloring and
3681 return to looking. */
3682
3683 /* There cannot be more SESE regions than colors. */
3684 regions.reserve (color_counts.length ());
3685 for (ix = color_counts.length (); ix--;)
3686 regions.quick_push (bb_pair_t (0, 0));
3687
3688 for (ix = 0; blocks.iterate (ix, &block); ix++)
3689 block->flags &= ~BB_VISITED;
3690
3691 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3692
3693 if (dump_file)
3694 {
3695 const char *comma = "";
3696 int len = regions.length ();
3697
3698 fprintf (dump_file, "SESE regions:");
3699 for (ix = 0; ix != len; ix++)
3700 {
3701 basic_block from = regions[ix].first;
3702 basic_block to = regions[ix].second;
3703
3704 if (from)
3705 {
3706 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3707 if (to != from)
3708 fprintf (dump_file, "->%d", to->index);
3709
3710 int color = BB_GET_SESE (from)->color;
3711
3712 /* Print the blocks within the region (excluding ends). */
3713 FOR_EACH_BB_FN (block, cfun)
3714 {
3715 bb_sese *sese = BB_GET_SESE (block);
3716
3717 if (sese && sese->color == color
3718 && block != from && block != to)
3719 fprintf (dump_file, ".%d", block->index);
3720 }
3721 fprintf (dump_file, "}");
3722 }
3723 comma = ",";
3724 }
3725 fprintf (dump_file, "\n\n");
3726 }
3727
3728 for (ix = 0; blocks.iterate (ix, &block); ix++)
3729 delete BB_GET_SESE (block);
3730 }
3731
3732 #undef BB_SET_SESE
3733 #undef BB_GET_SESE
3734
3735 /* Propagate live state at the start of a partitioned region. BLOCK
3736 provides the live register information, and might not contain
3737 INSN. Propagation is inserted just after INSN. RW indicates whether
3738 we are reading and/or writing state. This
3739 separation is needed for worker-level propagation where we
3740 essentially do a spill & fill. FN is the underlying worker
3741 function to generate the propagation instructions for a single
3742 register. DATA is user data.
3743
3744 We propagate the live register set and the entire frame. We could
3745 do better by (a) propagating just the live set that is used within
3746 the partitioned regions and (b) only propagating stack entries that
3747 are used. The latter might be quite hard to determine. */
3748
3749 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3750
3751 static void
3752 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3753 propagator_fn fn, void *data)
3754 {
3755 bitmap live = DF_LIVE_IN (block);
3756 bitmap_iterator iterator;
3757 unsigned ix;
3758
3759 /* Copy the frame array. */
3760 HOST_WIDE_INT fs = get_frame_size ();
3761 if (fs)
3762 {
3763 rtx tmp = gen_reg_rtx (DImode);
3764 rtx idx = NULL_RTX;
3765 rtx ptr = gen_reg_rtx (Pmode);
3766 rtx pred = NULL_RTX;
3767 rtx_code_label *label = NULL;
3768
3769 /* The frame size might not be DImode compatible, but the frame
3770 array's declaration will be. So it's ok to round up here. */
3771 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3772 /* Detect single iteration loop. */
3773 if (fs == 1)
3774 fs = 0;
3775
3776 start_sequence ();
3777 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3778 if (fs)
3779 {
3780 idx = gen_reg_rtx (SImode);
3781 pred = gen_reg_rtx (BImode);
3782 label = gen_label_rtx ();
3783
3784 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3785 /* Allow worker function to initialize anything needed. */
3786 rtx init = fn (tmp, PM_loop_begin, fs, data);
3787 if (init)
3788 emit_insn (init);
3789 emit_label (label);
3790 LABEL_NUSES (label)++;
3791 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3792 }
3793 if (rw & PM_read)
3794 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3795 emit_insn (fn (tmp, rw, fs, data));
3796 if (rw & PM_write)
3797 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3798 if (fs)
3799 {
3800 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3801 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3802 emit_insn (gen_br_true_uni (pred, label));
3803 rtx fini = fn (tmp, PM_loop_end, fs, data);
3804 if (fini)
3805 emit_insn (fini);
3806 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3807 }
3808 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3809 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3810 rtx cpy = get_insns ();
3811 end_sequence ();
3812 insn = emit_insn_after (cpy, insn);
3813 }
3814
3815 /* Copy live registers. */
3816 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3817 {
3818 rtx reg = regno_reg_rtx[ix];
3819
3820 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3821 {
3822 rtx bcast = fn (reg, rw, 0, data);
3823
3824 insn = emit_insn_after (bcast, insn);
3825 }
3826 }
3827 }
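
/* Roughly, the sequence built above for a non-trivial frame looks like
   this (illustrative pseudo-RTL only; <fn> stands for whatever the
   propagator_fn emits -- a shuffle, or a spill/fill to the broadcast
   buffer):

       ptr = frame_pointer;
       idx = fs;                  // number of DImode slots
       <fn: PM_loop_begin setup>
     label:
       idx -= 1;
       tmp = *ptr;                // if PM_read
       <fn (tmp, rw, fs, data)>
       *ptr = tmp;                // if PM_write
       pred = (idx != 0);
       ptr += 8;                  // GET_MODE_SIZE (DImode)
       if (pred) goto label;      // uniform branch
       <fn: PM_loop_end teardown>  */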
3828
3829 /* Worker for nvptx_vpropagate. */
3830
3831 static rtx
3832 vprop_gen (rtx reg, propagate_mask pm,
3833 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3834 {
3835 if (!(pm & PM_read_write))
3836 return 0;
3837
3838 return nvptx_gen_vcast (reg);
3839 }
3840
3841 /* Propagate state that is live at start of BLOCK across the vectors
3842 of a single warp. Propagation is inserted just after INSN. */
3843
3844 static void
3845 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3846 {
3847 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3848 }
3849
3850 /* Worker for nvptx_wpropagate. */
3851
3852 static rtx
3853 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3854 {
3855 wcast_data_t *data = (wcast_data_t *)data_;
3856
3857 if (pm & PM_loop_begin)
3858 {
3859 /* Starting a loop, initialize pointer. */
3860 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3861
3862 if (align > worker_bcast_align)
3863 worker_bcast_align = align;
3864 data->offset = (data->offset + align - 1) & ~(align - 1);
3865
3866 data->ptr = gen_reg_rtx (Pmode);
3867
3868 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3869 }
3870 else if (pm & PM_loop_end)
3871 {
3872 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3873 data->ptr = NULL_RTX;
3874 return clobber;
3875 }
3876 else
3877 return nvptx_gen_wcast (reg, pm, rep, data);
3878 }
3879
3880 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3881 indicates if this is just before partitioned mode (do spill), or
3882 just after it starts (do fill). Sequence is inserted just after
3883 INSN. */
3884
3885 static void
3886 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3887 {
3888 wcast_data_t data;
3889
3890 data.base = gen_reg_rtx (Pmode);
3891 data.offset = 0;
3892 data.ptr = NULL_RTX;
3893
3894 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3895 if (data.offset)
3896 {
3897 /* Stuff was emitted, initialize the base pointer now. */
3898 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3899 emit_insn_after (init, insn);
3900
3901 if (worker_bcast_size < data.offset)
3902 worker_bcast_size = data.offset;
3903 }
3904 }
3905
3906 /* Emit a worker-level synchronization barrier. We use different
3907 markers for before and after synchronizations. */
3908
3909 static rtx
3910 nvptx_wsync (bool after)
3911 {
3912 return gen_nvptx_barsync (GEN_INT (after));
3913 }
3914
3915 #if WORKAROUND_PTXJIT_BUG
3916 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3917 real insns. */
3918
3919 static rtx_insn *
3920 bb_first_real_insn (basic_block bb)
3921 {
3922 rtx_insn *insn;
3923
3924 /* Find the first real insn in BB. */
3925 FOR_BB_INSNS (bb, insn)
3926 if (INSN_P (insn))
3927 return insn;
3928
3929 return 0;
3930 }
3931 #endif
3932
3933 /* Single neutering according to MASK. FROM is the incoming block and
3934 TO is the outgoing block. These may be the same block. Insert at
3935 start of FROM:
3936
3937 if (tid.<axis>) goto end.
3938
3939 and insert before ending branch of TO (if there is such an insn):
3940
3941 end:
3942 <possibly-broadcast-cond>
3943 <branch>
3944
3945 We currently only use different FROM and TO when skipping an entire
3946 loop. We could do more if we detected superblocks. */
3947
3948 static void
3949 nvptx_single (unsigned mask, basic_block from, basic_block to)
3950 {
3951 rtx_insn *head = BB_HEAD (from);
3952 rtx_insn *tail = BB_END (to);
3953 unsigned skip_mask = mask;
3954
3955 while (true)
3956 {
3957 /* Find first insn of from block. */
3958 while (head != BB_END (from) && !INSN_P (head))
3959 head = NEXT_INSN (head);
3960
3961 if (from == to)
3962 break;
3963
3964 if (!(JUMP_P (head) && single_succ_p (from)))
3965 break;
3966
3967 basic_block jump_target = single_succ (from);
3968 if (!single_pred_p (jump_target))
3969 break;
3970
3971 from = jump_target;
3972 head = BB_HEAD (from);
3973 }
3974
3975 /* Find the last insn of the TO block. */
3976 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3977 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3978 tail = PREV_INSN (tail);
3979
3980 /* Detect if tail is a branch. */
3981 rtx tail_branch = NULL_RTX;
3982 rtx cond_branch = NULL_RTX;
3983 if (tail && INSN_P (tail))
3984 {
3985 tail_branch = PATTERN (tail);
3986 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3987 tail_branch = NULL_RTX;
3988 else
3989 {
3990 cond_branch = SET_SRC (tail_branch);
3991 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3992 cond_branch = NULL_RTX;
3993 }
3994 }
3995
3996 if (tail == head)
3997 {
3998 /* If this is empty, do nothing. */
3999 if (!head || !INSN_P (head))
4000 return;
4001
4002 /* If this is a dummy insn, do nothing. */
4003 switch (recog_memoized (head))
4004 {
4005 default:
4006 break;
4007 case CODE_FOR_nvptx_fork:
4008 case CODE_FOR_nvptx_forked:
4009 case CODE_FOR_nvptx_joining:
4010 case CODE_FOR_nvptx_join:
4011 return;
4012 }
4013
4014 if (cond_branch)
4015 {
4016 /* If we're only doing vector single, there's no need to
4017 emit skip code because we'll not insert anything. */
4018 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4019 skip_mask = 0;
4020 }
4021 else if (tail_branch)
4022 /* Block with only unconditional branch. Nothing to do. */
4023 return;
4024 }
4025
4026 /* Insert the vector test inside the worker test. */
4027 unsigned mode;
4028 rtx_insn *before = tail;
4029 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4030 if (GOMP_DIM_MASK (mode) & skip_mask)
4031 {
4032 rtx_code_label *label = gen_label_rtx ();
4033 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4034
4035 if (!pred)
4036 {
4037 pred = gen_reg_rtx (BImode);
4038 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4039 }
4040
4041 rtx br;
4042 if (mode == GOMP_DIM_VECTOR)
4043 br = gen_br_true (pred, label);
4044 else
4045 br = gen_br_true_uni (pred, label);
4046 emit_insn_before (br, head);
4047
4048 LABEL_NUSES (label)++;
4049 if (tail_branch)
4050 before = emit_label_before (label, before);
4051 else
4052 emit_label_after (label, tail);
4053 }
4054
4055 /* Now deal with propagating the branch condition. */
4056 if (cond_branch)
4057 {
4058 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4059
4060 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4061 {
4062 /* Vector mode only, do a shuffle. */
4063 #if WORKAROUND_PTXJIT_BUG
4064 /* The branch condition %rcond is propagated like this:
4065
4066 {
4067 .reg .u32 %x;
4068 mov.u32 %x,%tid.x;
4069 setp.ne.u32 %rnotvzero,%x,0;
4070 }
4071
4072 @%rnotvzero bra Lskip;
4073 setp.<op>.<type> %rcond,op1,op2;
4074 Lskip:
4075 selp.u32 %rcondu32,1,0,%rcond;
4076 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4077 setp.ne.u32 %rcond,%rcondu32,0;
4078
4079 There seems to be a bug in the ptx JIT compiler (observed at driver
4080 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4081 unless %rcond is initialized to something before 'bra Lskip'. The
4082 bug is not observed with ptxas from cuda 8.0.61.
4083
4084 It is true that the code is non-trivial: at Lskip, %rcond is
4085 uninitialized in threads 1-31, and after the selp the same holds
4086 for %rcondu32. But shfl propagates the defined value in thread 0
4087 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4088 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4089
4090 There is nothing in the PTX spec to suggest that this is wrong, or
4091 to explain why the extra initialization is needed. So, we classify
4092 it as a JIT bug, and the extra initialization as a workaround. */
4093 emit_insn_before (gen_movbi (pvar, const0_rtx),
4094 bb_first_real_insn (from));
4095 #endif
4096 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4097 }
4098 else
4099 {
4100 /* Includes worker mode, do spill & fill. By construction
4101 we should never have worker mode only. */
4102 wcast_data_t data;
4103
4104 data.base = worker_bcast_sym;
4105 data.ptr = 0;
4106
4107 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4108 worker_bcast_size = GET_MODE_SIZE (SImode);
4109
4110 data.offset = 0;
4111 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4112 before);
4113 /* Barrier so other workers can see the write. */
4114 emit_insn_before (nvptx_wsync (false), tail);
4115 data.offset = 0;
4116 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4117 /* This barrier is needed to avoid worker zero clobbering
4118 the broadcast buffer before all the other workers have
4119 had a chance to read this instance of it. */
4120 emit_insn_before (nvptx_wsync (true), tail);
4121 }
4122
4123 extract_insn (tail);
4124 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4125 UNSPEC_BR_UNIFIED);
4126 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4127 }
4128 }
4129
4130 /* PAR is a parallel that is being skipped in its entirety according to
4131 MASK. Treat this as skipping a superblock starting at forked
4132 and ending at joining. */
4133
4134 static void
4135 nvptx_skip_par (unsigned mask, parallel *par)
4136 {
4137 basic_block tail = par->join_block;
4138 gcc_assert (tail->preds->length () == 1);
4139
4140 basic_block pre_tail = (*tail->preds)[0]->src;
4141 gcc_assert (pre_tail->succs->length () == 1);
4142
4143 nvptx_single (mask, par->forked_block, pre_tail);
4144 }
4145
4146 /* If PAR has a single inner parallel and PAR itself only contains
4147 empty entry and exit blocks, swallow the inner PAR. */
4148
4149 static void
4150 nvptx_optimize_inner (parallel *par)
4151 {
4152 parallel *inner = par->inner;
4153
4154 /* We mustn't be the outer dummy par. */
4155 if (!par->mask)
4156 return;
4157
4158 /* We must have a single inner par. */
4159 if (!inner || inner->next)
4160 return;
4161
4162 /* We must only contain 2 blocks ourselves -- the head and tail of
4163 the inner par. */
4164 if (par->blocks.length () != 2)
4165 return;
4166
4167 /* We must be disjoint partitioning. As we only have vector and
4168 worker partitioning, this is sufficient to guarantee the pars
4169 have adjacent partitioning. */
4170 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4171 /* This indicates malformed code generation. */
4172 return;
4173
4174 /* The outer forked insn should be immediately followed by the inner
4175 fork insn. */
4176 rtx_insn *forked = par->forked_insn;
4177 rtx_insn *fork = BB_END (par->forked_block);
4178
4179 if (NEXT_INSN (forked) != fork)
4180 return;
4181 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4182
4183 /* The outer joining insn must immediately follow the inner join
4184 insn. */
4185 rtx_insn *joining = par->joining_insn;
4186 rtx_insn *join = inner->join_insn;
4187 if (NEXT_INSN (join) != joining)
4188 return;
4189
4190 /* Preconditions met. Swallow the inner par. */
4191 if (dump_file)
4192 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4193 inner->mask, inner->forked_block->index,
4194 inner->join_block->index,
4195 par->mask, par->forked_block->index, par->join_block->index);
4196
4197 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4198
4199 par->blocks.reserve (inner->blocks.length ());
4200 while (inner->blocks.length ())
4201 par->blocks.quick_push (inner->blocks.pop ());
4202
4203 par->inner = inner->inner;
4204 inner->inner = NULL;
4205
4206 delete inner;
4207 }
4208
4209 /* Process the parallel PAR and all its contained
4210 parallels. We do everything but the neutering. Return mask of
4211 partitioned modes used within this parallel. */
4212
4213 static unsigned
4214 nvptx_process_pars (parallel *par)
4215 {
4216 if (nvptx_optimize)
4217 nvptx_optimize_inner (par);
4218
4219 unsigned inner_mask = par->mask;
4220
4221 /* Do the inner parallels first. */
4222 if (par->inner)
4223 {
4224 par->inner_mask = nvptx_process_pars (par->inner);
4225 inner_mask |= par->inner_mask;
4226 }
4227
4228 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4229 /* No propagation needed for a call. */;
4230 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4231 {
4232 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4233 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4234 /* Insert begin and end synchronizations. */
4235 emit_insn_after (nvptx_wsync (false), par->forked_insn);
4236 emit_insn_before (nvptx_wsync (true), par->joining_insn);
4237 }
4238 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4239 nvptx_vpropagate (par->forked_block, par->forked_insn);
4240
4241 /* Now do siblings. */
4242 if (par->next)
4243 inner_mask |= nvptx_process_pars (par->next);
4244 return inner_mask;
4245 }
4246
4247 /* Neuter the parallel described by PAR. We recurse in depth-first
4248 order. MODES are the partitioning of the execution and OUTER is
4249 the partitioning of the parallels we are contained in. */
4250
4251 static void
4252 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4253 {
4254 unsigned me = (par->mask
4255 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4256 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4257 unsigned skip_mask = 0, neuter_mask = 0;
4258
4259 if (par->inner)
4260 nvptx_neuter_pars (par->inner, modes, outer | me);
4261
4262 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4263 {
4264 if ((outer | me) & GOMP_DIM_MASK (mode))
4265 {} /* Mode is partitioned: no neutering. */
4266 else if (!(modes & GOMP_DIM_MASK (mode)))
4267 {} /* Mode is not used: nothing to do. */
4268 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4269 || !par->forked_insn)
4270 /* Partitioned in inner parallels, or we're not partitioned
4271 at all: neuter individual blocks. */
4272 neuter_mask |= GOMP_DIM_MASK (mode);
4273 else if (!par->parent || !par->parent->forked_insn
4274 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4275 /* Parent isn't a partitioned parallel, or already contains this
4276 partitioning: skip the parallel at this level. */
4277 skip_mask |= GOMP_DIM_MASK (mode);
4278 else
4279 {} /* Parent will skip this parallel itself. */
4280 }
4281
4282 if (neuter_mask)
4283 {
4284 int ix, len;
4285
4286 if (nvptx_optimize)
4287 {
4288 /* Neuter whole SESE regions. */
4289 bb_pair_vec_t regions;
4290
4291 nvptx_find_sese (par->blocks, regions);
4292 len = regions.length ();
4293 for (ix = 0; ix != len; ix++)
4294 {
4295 basic_block from = regions[ix].first;
4296 basic_block to = regions[ix].second;
4297
4298 if (from)
4299 nvptx_single (neuter_mask, from, to);
4300 else
4301 gcc_assert (!to);
4302 }
4303 }
4304 else
4305 {
4306 /* Neuter each BB individually. */
4307 len = par->blocks.length ();
4308 for (ix = 0; ix != len; ix++)
4309 {
4310 basic_block block = par->blocks[ix];
4311
4312 nvptx_single (neuter_mask, block, block);
4313 }
4314 }
4315 }
4316
4317 if (skip_mask)
4318 nvptx_skip_par (skip_mask, par);
4319
4320 if (par->next)
4321 nvptx_neuter_pars (par->next, modes, outer);
4322 }
4323
4324 /* PTX-specific reorganization
4325 - Split blocks at fork and join instructions
4326 - Compute live registers
4327 - Mark now-unused registers, so function begin doesn't declare
4328 unused registers.
4329 - Insert state propagation when entering partitioned mode
4330 - Insert neutering instructions when in single mode
4331 - Replace subregs with suitable sequences.
4332 */
4333
4334 static void
4335 nvptx_reorg (void)
4336 {
4337 /* We are freeing block_for_insn in the toplev to keep compatibility
4338 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4339 compute_bb_for_insn ();
4340
4341 thread_prologue_and_epilogue_insns ();
4342
4343 /* Split blocks and record interesting unspecs. */
4344 bb_insn_map_t bb_insn_map;
4345
4346 nvptx_split_blocks (&bb_insn_map);
4347
4348 /* Compute live regs. */
4349 df_clear_flags (DF_LR_RUN_DCE);
4350 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4351 df_live_add_problem ();
4352 df_live_set_all_dirty ();
4353 df_analyze ();
4354 regstat_init_n_sets_and_refs ();
4355
4356 if (dump_file)
4357 df_dump (dump_file);
4358
4359 /* Mark unused regs as unused. */
4360 int max_regs = max_reg_num ();
4361 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4362 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4363 regno_reg_rtx[i] = const0_rtx;
4364
4365 /* Determine launch dimensions of the function. If it is not an
4366 offloaded function (i.e. this is a regular compiler), the
4367 function has no neutering. */
4368 tree attr = oacc_get_fn_attrib (current_function_decl);
4369 if (attr)
4370 {
4371 /* If we determined this mask before RTL expansion, we could
4372 elide emission of some levels of forks and joins. */
4373 unsigned mask = 0;
4374 tree dims = TREE_VALUE (attr);
4375 unsigned ix;
4376
4377 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4378 {
4379 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4380 tree allowed = TREE_PURPOSE (dims);
4381
4382 if (size != 1 && !(allowed && integer_zerop (allowed)))
4383 mask |= GOMP_DIM_MASK (ix);
4384 }
4385 /* If there is worker neutering, there must be vector
4386 neutering. Otherwise the hardware will fail. */
4387 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4388 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4389
4390 /* Discover & process partitioned regions. */
4391 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4392 nvptx_process_pars (pars);
4393 nvptx_neuter_pars (pars, mask, 0);
4394 delete pars;
4395 }
4396
4397 /* Replace subregs. */
4398 nvptx_reorg_subreg ();
4399
4400 if (TARGET_UNIFORM_SIMT)
4401 nvptx_reorg_uniform_simt ();
4402
4403 regstat_free_n_sets_and_refs ();
4404
4405 df_finish_pass (true);
4406 }
4407 \f
4408 /* Handle a "kernel" attribute; arguments as in
4409 struct attribute_spec.handler. */
4410
4411 static tree
4412 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4413 int ARG_UNUSED (flags), bool *no_add_attrs)
4414 {
4415 tree decl = *node;
4416
4417 if (TREE_CODE (decl) != FUNCTION_DECL)
4418 {
4419 error ("%qE attribute only applies to functions", name);
4420 *no_add_attrs = true;
4421 }
4422 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4423 {
4424 error ("%qE attribute requires a void return type", name);
4425 *no_add_attrs = true;
4426 }
4427
4428 return NULL_TREE;
4429 }
4430
4431 /* Handle a "shared" attribute; arguments as in
4432 struct attribute_spec.handler. */
4433
4434 static tree
4435 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4436 int ARG_UNUSED (flags), bool *no_add_attrs)
4437 {
4438 tree decl = *node;
4439
4440 if (TREE_CODE (decl) != VAR_DECL)
4441 {
4442 error ("%qE attribute only applies to variables", name);
4443 *no_add_attrs = true;
4444 }
4445 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4446 {
4447 error ("%qE attribute not allowed with auto storage class", name);
4448 *no_add_attrs = true;
4449 }
4450
4451 return NULL_TREE;
4452 }
4453
4454 /* Table of valid machine attributes. */
4455 static const struct attribute_spec nvptx_attribute_table[] =
4456 {
4457 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
4458 affects_type_identity } */
4459 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
4460 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
4461 { NULL, 0, 0, false, false, false, NULL, false }
4462 };
4463 \f
4464 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4465
4466 static HOST_WIDE_INT
4467 nvptx_vector_alignment (const_tree type)
4468 {
4469 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4470
4471 return MIN (align, BIGGEST_ALIGNMENT);
4472 }
4473
4474 /* Indicate that INSN cannot be duplicated. */
4475
4476 static bool
4477 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4478 {
4479 switch (recog_memoized (insn))
4480 {
4481 case CODE_FOR_nvptx_shufflesi:
4482 case CODE_FOR_nvptx_shufflesf:
4483 case CODE_FOR_nvptx_barsync:
4484 case CODE_FOR_nvptx_fork:
4485 case CODE_FOR_nvptx_forked:
4486 case CODE_FOR_nvptx_joining:
4487 case CODE_FOR_nvptx_join:
4488 return true;
4489 default:
4490 return false;
4491 }
4492 }
4493
4494 /* Section anchors do not work. Initialization for flag_section_anchors
4495 probes the existence of the anchoring target hooks and prevents
4496 anchoring if they don't exist. However, we may be being used with
4497 a host-side compiler that does support anchoring, and hence see
4498 the anchor flag set (as it's not recalculated). So provide an
4499 implementation denying anchoring. */
4500
4501 static bool
4502 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4503 {
4504 return false;
4505 }
4506 \f
4507 /* Record a symbol for mkoffload to enter into the mapping table. */
4508
4509 static void
4510 nvptx_record_offload_symbol (tree decl)
4511 {
4512 switch (TREE_CODE (decl))
4513 {
4514 case VAR_DECL:
4515 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4516 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4517 break;
4518
4519 case FUNCTION_DECL:
4520 {
4521 tree attr = oacc_get_fn_attrib (decl);
4522 /* OpenMP offloading does not set this attribute. */
4523 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4524
4525 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4526 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4527
4528 for (; dims; dims = TREE_CHAIN (dims))
4529 {
4530 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4531
4532 gcc_assert (!TREE_PURPOSE (dims));
4533 fprintf (asm_out_file, ", %#x", size);
4534 }
4535
4536 fprintf (asm_out_file, "\n");
4537 }
4538 break;
4539
4540 default:
4541 gcc_unreachable ();
4542 }
4543 }
4544
4545 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4546 at the start of a file. */
4547
4548 static void
4549 nvptx_file_start (void)
4550 {
4551 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4552 fputs ("\t.version\t3.1\n", asm_out_file);
4553 fputs ("\t.target\tsm_30\n", asm_out_file);
4554 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4555 fputs ("// END PREAMBLE\n", asm_out_file);
4556 }
4557
4558 /* Emit a declaration for a worker-level buffer in .shared memory. */
4559
4560 static void
4561 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4562 {
4563 const char *name = XSTR (sym, 0);
4564
4565 write_var_marker (file, true, false, name);
4566 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4567 align, name, size);
4568 }
4569
4570 /* Write out the function declarations we've collected and declare storage
4571 for the broadcast buffer. */
4572
4573 static void
4574 nvptx_file_end (void)
4575 {
4576 hash_table<tree_hasher>::iterator iter;
4577 tree decl;
4578 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4579 nvptx_record_fndecl (decl);
4580 fputs (func_decls.str().c_str(), asm_out_file);
4581
4582 if (worker_bcast_size)
4583 write_worker_buffer (asm_out_file, worker_bcast_sym,
4584 worker_bcast_align, worker_bcast_size);
4585
4586 if (worker_red_size)
4587 write_worker_buffer (asm_out_file, worker_red_sym,
4588 worker_red_align, worker_red_size);
4589
4590 if (need_softstack_decl)
4591 {
4592 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4593 /* 32 is the maximum number of warps in a block. Even though it's an
4594 external declaration, emit the array size explicitly; otherwise, it
4595 may fail at PTX JIT time if the definition is later in link order. */
4596 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4597 POINTER_SIZE);
4598 }
4599 if (need_unisimt_decl)
4600 {
4601 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4602 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4603 }
4604 }
4605
4606 /* Expander for the shuffle builtins. */
4607
4608 static rtx
4609 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4610 {
4611 if (ignore)
4612 return target;
4613
4614 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4615 NULL_RTX, mode, EXPAND_NORMAL);
4616 if (!REG_P (src))
4617 src = copy_to_mode_reg (mode, src);
4618
4619 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4620 NULL_RTX, SImode, EXPAND_NORMAL);
4621 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4622 NULL_RTX, SImode, EXPAND_NORMAL);
4623
4624 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4625 idx = copy_to_mode_reg (SImode, idx);
4626
4627 rtx pat = nvptx_gen_shuffle (target, src, idx,
4628 (nvptx_shuffle_kind) INTVAL (op));
4629 if (pat)
4630 emit_insn (pat);
4631
4632 return target;
4633 }
4634
4635 /* Worker reduction address expander. */
4636
4637 static rtx
4638 nvptx_expand_worker_addr (tree exp, rtx target,
4639 machine_mode ARG_UNUSED (mode), int ignore)
4640 {
4641 if (ignore)
4642 return target;
4643
4644 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4645 if (align > worker_red_align)
4646 worker_red_align = align;
4647
4648 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4649 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4650 if (size + offset > worker_red_size)
4651 worker_red_size = size + offset;
4652
4653 rtx addr = worker_red_sym;
4654 if (offset)
4655 {
4656 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4657 addr = gen_rtx_CONST (Pmode, addr);
4658 }
4659
4660 emit_move_insn (target, addr);
4661
4662 return target;
4663 }
4664
4665 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4666 not require taking the address of any object, other than the memory
4667 cell being operated on. */
4668
4669 static rtx
4670 nvptx_expand_cmp_swap (tree exp, rtx target,
4671 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4672 {
4673 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4674
4675 if (!target)
4676 target = gen_reg_rtx (mode);
4677
4678 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4679 NULL_RTX, Pmode, EXPAND_NORMAL);
4680 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4681 NULL_RTX, mode, EXPAND_NORMAL);
4682 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4683 NULL_RTX, mode, EXPAND_NORMAL);
4684 rtx pat;
4685
4686 mem = gen_rtx_MEM (mode, mem);
4687 if (!REG_P (cmp))
4688 cmp = copy_to_mode_reg (mode, cmp);
4689 if (!REG_P (src))
4690 src = copy_to_mode_reg (mode, src);
4691
4692 if (mode == SImode)
4693 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4694 else
4695 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4696
4697 emit_insn (pat);
4698
4699 return target;
4700 }
4701
4702
4703 /* Codes for all the NVPTX builtins. */
4704 enum nvptx_builtins
4705 {
4706 NVPTX_BUILTIN_SHUFFLE,
4707 NVPTX_BUILTIN_SHUFFLELL,
4708 NVPTX_BUILTIN_WORKER_ADDR,
4709 NVPTX_BUILTIN_CMP_SWAP,
4710 NVPTX_BUILTIN_CMP_SWAPLL,
4711 NVPTX_BUILTIN_MAX
4712 };
4713
4714 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4715
4716 /* Return the NVPTX builtin for CODE. */
4717
4718 static tree
4719 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4720 {
4721 if (code >= NVPTX_BUILTIN_MAX)
4722 return error_mark_node;
4723
4724 return nvptx_builtin_decls[code];
4725 }
4726
4727 /* Set up all builtin functions for this target. */
4728
4729 static void
4730 nvptx_init_builtins (void)
4731 {
4732 #define DEF(ID, NAME, T) \
4733 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4734 = add_builtin_function ("__builtin_nvptx_" NAME, \
4735 build_function_type_list T, \
4736 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4737 #define ST sizetype
4738 #define UINT unsigned_type_node
4739 #define LLUINT long_long_unsigned_type_node
4740 #define PTRVOID ptr_type_node
4741
4742 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4743 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4744 DEF (WORKER_ADDR, "worker_addr",
4745 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4746 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4747 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4748
4749 #undef DEF
4750 #undef ST
4751 #undef UINT
4752 #undef LLUINT
4753 #undef PTRVOID
4754 }
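
/* For reference, the prototypes created by the DEF invocations above are
   approximately the following (the parameter names are illustrative only;
   the types follow the build_function_type_list calls):

     unsigned __builtin_nvptx_shuffle (unsigned val, unsigned idx, unsigned kind);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long val, unsigned idx, unsigned kind);
     void *__builtin_nvptx_worker_addr (size_t offset, unsigned size, unsigned align);
     unsigned __builtin_nvptx_cmp_swap (void *ptr, unsigned cmp, unsigned src);
     unsigned long long __builtin_nvptx_cmp_swapll (void *ptr, unsigned long long cmp, unsigned long long src);  */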
4755
4756 /* Expand an expression EXP that calls a built-in function,
4757 with result going to TARGET if that's convenient
4758 (and in mode MODE if that's convenient).
4759 SUBTARGET may be used as the target for computing one of EXP's operands.
4760 IGNORE is nonzero if the value is to be ignored. */
4761
4762 static rtx
4763 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4764 machine_mode mode, int ignore)
4765 {
4766 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4767 switch (DECL_FUNCTION_CODE (fndecl))
4768 {
4769 case NVPTX_BUILTIN_SHUFFLE:
4770 case NVPTX_BUILTIN_SHUFFLELL:
4771 return nvptx_expand_shuffle (exp, target, mode, ignore);
4772
4773 case NVPTX_BUILTIN_WORKER_ADDR:
4774 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4775
4776 case NVPTX_BUILTIN_CMP_SWAP:
4777 case NVPTX_BUILTIN_CMP_SWAPLL:
4778 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4779
4780 default: gcc_unreachable ();
4781 }
4782 }
4783 \f
4784 /* Define dimension sizes for known hardware. */
4785 #define PTX_VECTOR_LENGTH 32
4786 #define PTX_WORKER_LENGTH 32
4787 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4788
4789 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4790
4791 static int
4792 nvptx_simt_vf ()
4793 {
4794 return PTX_VECTOR_LENGTH;
4795 }
4796
4797 /* Validate compute dimensions of an OpenACC offload or routine, fill
4798 in non-unity defaults. FN_LEVEL indicates the level at which a
4799 routine might spawn a loop. It is negative for non-routines. If
4800 DECL is null, we are validating the default dimensions. */
4801
4802 static bool
4803 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4804 {
4805 bool changed = false;
4806
4807 /* The vector size must be 32, unless this is a SEQ routine. */
4808 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4809 && dims[GOMP_DIM_VECTOR] >= 0
4810 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4811 {
4812 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4813 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4814 dims[GOMP_DIM_VECTOR]
4815 ? G_("using vector_length (%d), ignoring %d")
4816 : G_("using vector_length (%d), ignoring runtime setting"),
4817 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4818 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4819 changed = true;
4820 }
4821
4822 /* Check the num workers is not too large. */
4823 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4824 {
4825 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4826 "using num_workers (%d), ignoring %d",
4827 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4828 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4829 changed = true;
4830 }
4831
4832 if (!decl)
4833 {
4834 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4835 if (dims[GOMP_DIM_WORKER] < 0)
4836 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4837 if (dims[GOMP_DIM_GANG] < 0)
4838 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4839 changed = true;
4840 }
4841
4842 return changed;
4843 }
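
/* For example, under the rules above: validating the default dimensions
   (DECL == NULL) with dims = {-1, -1, -1} (gang, worker, vector) yields
   {PTX_GANG_DEFAULT, PTX_WORKER_LENGTH, PTX_VECTOR_LENGTH} = {0, 32, 32},
   deferring the gang count to the runtime; an explicit vector_length (64)
   on an offloaded region is diagnosed and clamped back to 32.  */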
4844
4845 /* Return maximum dimension size, or zero for unbounded. */
4846
4847 static int
4848 nvptx_dim_limit (int axis)
4849 {
4850 switch (axis)
4851 {
4852 case GOMP_DIM_WORKER:
4853 return PTX_WORKER_LENGTH;
4854
4855 case GOMP_DIM_VECTOR:
4856 return PTX_VECTOR_LENGTH;
4857
4858 default:
4859 break;
4860 }
4861 return 0;
4862 }
4863
4864 /* Determine whether fork & joins are needed. */
4865
4866 static bool
4867 nvptx_goacc_fork_join (gcall *call, const int dims[],
4868 bool ARG_UNUSED (is_fork))
4869 {
4870 tree arg = gimple_call_arg (call, 2);
4871 unsigned axis = TREE_INT_CST_LOW (arg);
4872
4873 /* We only care about worker and vector partitioning. */
4874 if (axis < GOMP_DIM_WORKER)
4875 return false;
4876
4877 /* If the size is 1, there's no partitioning. */
4878 if (dims[axis] == 1)
4879 return false;
4880
4881 return true;
4882 }
4883
4884 /* Generate a PTX builtin function call that returns the address in
4885 the worker reduction buffer at OFFSET. TYPE is the type of the
4886 data at that location. */
4887
4888 static tree
4889 nvptx_get_worker_red_addr (tree type, tree offset)
4890 {
4891 machine_mode mode = TYPE_MODE (type);
4892 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4893 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4894 tree align = build_int_cst (unsigned_type_node,
4895 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4896 tree call = build_call_expr (fndecl, 3, offset, size, align);
4897
4898 return fold_convert (build_pointer_type (type), call);
4899 }
4900
4901 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR. This function
4902 will cast the variable if necessary. */
4903
4904 static void
4905 nvptx_generate_vector_shuffle (location_t loc,
4906 tree dest_var, tree var, unsigned shift,
4907 gimple_seq *seq)
4908 {
4909 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4910 tree_code code = NOP_EXPR;
4911 tree arg_type = unsigned_type_node;
4912 tree var_type = TREE_TYPE (var);
4913 tree dest_type = var_type;
4914
4915 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4916 var_type = TREE_TYPE (var_type);
4917
4918 if (TREE_CODE (var_type) == REAL_TYPE)
4919 code = VIEW_CONVERT_EXPR;
4920
4921 if (TYPE_SIZE (var_type)
4922 == TYPE_SIZE (long_long_unsigned_type_node))
4923 {
4924 fn = NVPTX_BUILTIN_SHUFFLELL;
4925 arg_type = long_long_unsigned_type_node;
4926 }
4927
4928 tree call = nvptx_builtin_decl (fn, true);
4929 tree bits = build_int_cst (unsigned_type_node, shift);
4930 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4931 tree expr;
4932
4933 if (var_type != dest_type)
4934 {
4935 /* Do real and imaginary parts separately. */
4936 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4937 real = fold_build1 (code, arg_type, real);
4938 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4939 real = fold_build1 (code, var_type, real);
4940
4941 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4942 imag = fold_build1 (code, arg_type, imag);
4943 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4944 imag = fold_build1 (code, var_type, imag);
4945
4946 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4947 }
4948 else
4949 {
4950 expr = fold_build1 (code, arg_type, var);
4951 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4952 expr = fold_build1 (code, dest_type, expr);
4953 }
4954
4955 gimplify_assign (dest_var, expr, seq);
4956 }
4957
4958 /* Lazily generate the global lock var decl and return its address. */
4959
4960 static tree
4961 nvptx_global_lock_addr ()
4962 {
4963 tree v = global_lock_var;
4964
4965 if (!v)
4966 {
4967 tree name = get_identifier ("__reduction_lock");
4968 tree type = build_qualified_type (unsigned_type_node,
4969 TYPE_QUAL_VOLATILE);
4970 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4971 global_lock_var = v;
4972 DECL_ARTIFICIAL (v) = 1;
4973 DECL_EXTERNAL (v) = 1;
4974 TREE_STATIC (v) = 1;
4975 TREE_PUBLIC (v) = 1;
4976 TREE_USED (v) = 1;
4977 mark_addressable (v);
4978 mark_decl_referenced (v);
4979 }
4980
4981 return build_fold_addr_expr (v);
4982 }
4983
4984 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4985 GSI. We use a lockless scheme for nearly all cases, which looks
4986 like:
4987 actual = initval(OP);
4988 do {
4989 guess = actual;
4990 write = guess OP myval;
4991 actual = cmp&swap (ptr, guess, write)
4992 } while (actual bit-different-to guess);
4993 return write;
4994
4995 This relies on a cmp&swap instruction, which is available for 32-
4996 and 64-bit types. Larger types must use a locking scheme. */
4997
4998 static tree
4999 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5000 tree ptr, tree var, tree_code op)
5001 {
5002 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5003 tree_code code = NOP_EXPR;
5004 tree arg_type = unsigned_type_node;
5005 tree var_type = TREE_TYPE (var);
5006
5007 if (TREE_CODE (var_type) == COMPLEX_TYPE
5008 || TREE_CODE (var_type) == REAL_TYPE)
5009 code = VIEW_CONVERT_EXPR;
5010
5011 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5012 {
5013 arg_type = long_long_unsigned_type_node;
5014 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5015 }
5016
5017 tree swap_fn = nvptx_builtin_decl (fn, true);
5018
5019 gimple_seq init_seq = NULL;
5020 tree init_var = make_ssa_name (arg_type);
5021 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5022 init_expr = fold_build1 (code, arg_type, init_expr);
5023 gimplify_assign (init_var, init_expr, &init_seq);
5024 gimple *init_end = gimple_seq_last (init_seq);
5025
5026 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5027
5028 /* Split the block just after the init stmts. */
5029 basic_block pre_bb = gsi_bb (*gsi);
5030 edge pre_edge = split_block (pre_bb, init_end);
5031 basic_block loop_bb = pre_edge->dest;
5032 pre_bb = pre_edge->src;
5033 /* Reset the iterator. */
5034 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5035
5036 tree expect_var = make_ssa_name (arg_type);
5037 tree actual_var = make_ssa_name (arg_type);
5038 tree write_var = make_ssa_name (arg_type);
5039
5040 /* Build and insert the reduction calculation. */
5041 gimple_seq red_seq = NULL;
5042 tree write_expr = fold_build1 (code, var_type, expect_var);
5043 write_expr = fold_build2 (op, var_type, write_expr, var);
5044 write_expr = fold_build1 (code, arg_type, write_expr);
5045 gimplify_assign (write_var, write_expr, &red_seq);
5046
5047 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5048
5049 /* Build & insert the cmp&swap sequence. */
5050 gimple_seq latch_seq = NULL;
5051 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5052 ptr, expect_var, write_var);
5053 gimplify_assign (actual_var, swap_expr, &latch_seq);
5054
5055 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5056 NULL_TREE, NULL_TREE);
5057 gimple_seq_add_stmt (&latch_seq, cond);
5058
5059 gimple *latch_end = gimple_seq_last (latch_seq);
5060 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5061
5062 /* Split the block just after the latch stmts. */
5063 edge post_edge = split_block (loop_bb, latch_end);
5064 basic_block post_bb = post_edge->dest;
5065 loop_bb = post_edge->src;
5066 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5067
5068 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5069 post_edge->probability = profile_probability::even ();
5070 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5071 loop_edge->probability = profile_probability::even ();
5072 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5073 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5074
5075 gphi *phi = create_phi_node (expect_var, loop_bb);
5076 add_phi_arg (phi, init_var, pre_edge, loc);
5077 add_phi_arg (phi, actual_var, loop_edge, loc);
5078
5079 loop *loop = alloc_loop ();
5080 loop->header = loop_bb;
5081 loop->latch = loop_bb;
5082 add_loop (loop, loop_bb->loop_father);
5083
5084 return fold_build1 (code, var_type, write_var);
5085 }
5086
5087 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5088 GSI. This is necessary for types larger than 64 bits, where there
5089 is no cmp&swap instruction to implement a lockless scheme. We use
5090 a lock variable in global memory.
5091
5092 while (cmp&swap (&lock_var, 0, 1))
5093 continue;
5094 T accum = *ptr;
5095 accum = accum OP var;
5096 *ptr = accum;
5097 cmp&swap (&lock_var, 1, 0);
5098 return accum;
5099
5100 A lock in global memory is necessary to force execution engine
5101 descheduling and avoid resource starvation that can occur if the
5102 lock is in .shared memory. */
5103
5104 static tree
5105 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5106 tree ptr, tree var, tree_code op)
5107 {
5108 tree var_type = TREE_TYPE (var);
5109 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5110 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5111 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5112
5113 /* Split the block just before the gsi. Insert a gimple nop to make
5114 this easier. */
5115 gimple *nop = gimple_build_nop ();
5116 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5117 basic_block entry_bb = gsi_bb (*gsi);
5118 edge entry_edge = split_block (entry_bb, nop);
5119 basic_block lock_bb = entry_edge->dest;
5120 /* Reset the iterator. */
5121 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5122
5123 /* Build and insert the locking sequence. */
5124 gimple_seq lock_seq = NULL;
5125 tree lock_var = make_ssa_name (unsigned_type_node);
5126 tree lock_expr = nvptx_global_lock_addr ();
5127 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5128 uns_unlocked, uns_locked);
5129 gimplify_assign (lock_var, lock_expr, &lock_seq);
5130 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5131 NULL_TREE, NULL_TREE);
5132 gimple_seq_add_stmt (&lock_seq, cond);
5133 gimple *lock_end = gimple_seq_last (lock_seq);
5134 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5135
5136 /* Split the block just after the lock sequence. */
5137 edge locked_edge = split_block (lock_bb, lock_end);
5138 basic_block update_bb = locked_edge->dest;
5139 lock_bb = locked_edge->src;
5140 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5141
5142 /* Create the lock loop ... */
5143 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5144 locked_edge->probability = profile_probability::even ();
5145 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5146 loop_edge->probability = profile_probability::even ();
5147 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5148 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5149
5150 /* ... and the loop structure. */
5151 loop *lock_loop = alloc_loop ();
5152 lock_loop->header = lock_bb;
5153 lock_loop->latch = lock_bb;
5154 lock_loop->nb_iterations_estimate = 1;
5155 lock_loop->any_estimate = true;
5156 add_loop (lock_loop, entry_bb->loop_father);
5157
5158 /* Build and insert the reduction calculation. */
5159 gimple_seq red_seq = NULL;
5160 tree acc_in = make_ssa_name (var_type);
5161 tree ref_in = build_simple_mem_ref (ptr);
5162 TREE_THIS_VOLATILE (ref_in) = 1;
5163 gimplify_assign (acc_in, ref_in, &red_seq);
5164
5165 tree acc_out = make_ssa_name (var_type);
5166 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5167 gimplify_assign (acc_out, update_expr, &red_seq);
5168
5169 tree ref_out = build_simple_mem_ref (ptr);
5170 TREE_THIS_VOLATILE (ref_out) = 1;
5171 gimplify_assign (ref_out, acc_out, &red_seq);
5172
5173 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5174
5175 /* Build & insert the unlock sequence. */
5176 gimple_seq unlock_seq = NULL;
5177 tree unlock_expr = nvptx_global_lock_addr ();
5178 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5179 uns_locked, uns_unlocked);
5180 gimplify_and_add (unlock_expr, &unlock_seq);
5181 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5182
5183 return acc_out;
5184 }
5185
5186 /* Emit a sequence to update a reduction accumulator at *PTR with the
5187 value held in VAR using operator OP. Return the updated value.
5188
5189 TODO: optimize for atomic ops and independent complex ops. */
5190
5191 static tree
5192 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5193 tree ptr, tree var, tree_code op)
5194 {
5195 tree type = TREE_TYPE (var);
5196 tree size = TYPE_SIZE (type);
5197
5198 if (size == TYPE_SIZE (unsigned_type_node)
5199 || size == TYPE_SIZE (long_long_unsigned_type_node))
5200 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5201 else
5202 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5203 }
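
/* Illustratively: 32- and 64-bit accumulators such as 'int', 'float',
   'long long' or 'double' take the lockless cmp&swap loop, whereas a
   128-bit accumulator such as '_Complex double' falls back to the
   global-lock path in nvptx_lockfull_update.  */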
5204
5205 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5206
5207 static void
5208 nvptx_goacc_reduction_setup (gcall *call)
5209 {
5210 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5211 tree lhs = gimple_call_lhs (call);
5212 tree var = gimple_call_arg (call, 2);
5213 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5214 gimple_seq seq = NULL;
5215
5216 push_gimplify_context (true);
5217
5218 if (level != GOMP_DIM_GANG)
5219 {
5220 /* Copy the receiver object. */
5221 tree ref_to_res = gimple_call_arg (call, 1);
5222
5223 if (!integer_zerop (ref_to_res))
5224 var = build_simple_mem_ref (ref_to_res);
5225 }
5226
5227 if (level == GOMP_DIM_WORKER)
5228 {
5229 /* Store incoming value to worker reduction buffer. */
5230 tree offset = gimple_call_arg (call, 5);
5231 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5232 tree ptr = make_ssa_name (TREE_TYPE (call));
5233
5234 gimplify_assign (ptr, call, &seq);
5235 tree ref = build_simple_mem_ref (ptr);
5236 TREE_THIS_VOLATILE (ref) = 1;
5237 gimplify_assign (ref, var, &seq);
5238 }
5239
5240 if (lhs)
5241 gimplify_assign (lhs, var, &seq);
5242
5243 pop_gimplify_context (NULL);
5244 gsi_replace_with_seq (&gsi, seq, true);
5245 }
5246
5247 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5248
5249 static void
5250 nvptx_goacc_reduction_init (gcall *call)
5251 {
5252 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5253 tree lhs = gimple_call_lhs (call);
5254 tree var = gimple_call_arg (call, 2);
5255 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5256 enum tree_code rcode
5257 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5258 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5259 TREE_TYPE (var));
5260 gimple_seq seq = NULL;
5261
5262 push_gimplify_context (true);
5263
5264 if (level == GOMP_DIM_VECTOR)
5265 {
5266 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
5267 tree tid = make_ssa_name (integer_type_node);
5268 tree dim_vector = gimple_call_arg (call, 3);
5269 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5270 dim_vector);
5271 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5272 NULL_TREE, NULL_TREE);
5273
5274 gimple_call_set_lhs (tid_call, tid);
5275 gimple_seq_add_stmt (&seq, tid_call);
5276 gimple_seq_add_stmt (&seq, cond_stmt);
5277
5278 /* Split the block just after the call. */
5279 edge init_edge = split_block (gsi_bb (gsi), call);
5280 basic_block init_bb = init_edge->dest;
5281 basic_block call_bb = init_edge->src;
5282
5283 /* Fixup flags from call_bb to init_bb. */
5284 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5285 init_edge->probability = profile_probability::even ();
5286
5287 /* Set the initialization stmts. */
5288 gimple_seq init_seq = NULL;
5289 tree init_var = make_ssa_name (TREE_TYPE (var));
5290 gimplify_assign (init_var, init, &init_seq);
5291 gsi = gsi_start_bb (init_bb);
5292 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5293
5294 /* Split block just after the init stmt. */
5295 gsi_prev (&gsi);
5296 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5297 basic_block dst_bb = inited_edge->dest;
5298
5299 /* Create false edge from call_bb to dst_bb. */
5300 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5301 nop_edge->probability = profile_probability::even ();
5302
5303 /* Create phi node in dst block. */
5304 gphi *phi = create_phi_node (lhs, dst_bb);
5305 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5306 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5307
5308 /* Reset dominator of dst bb. */
5309 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5310
5311 /* Reset the gsi. */
5312 gsi = gsi_for_stmt (call);
5313 }
5314 else
5315 {
5316 if (level == GOMP_DIM_GANG)
5317 {
5318 /* If there's no receiver object, propagate the incoming VAR. */
5319 tree ref_to_res = gimple_call_arg (call, 1);
5320 if (integer_zerop (ref_to_res))
5321 init = var;
5322 }
5323
5324 gimplify_assign (lhs, init, &seq);
5325 }
5326
5327 pop_gimplify_context (NULL);
5328 gsi_replace_with_seq (&gsi, seq, true);
5329 }
5330
5331 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5332
5333 static void
5334 nvptx_goacc_reduction_fini (gcall *call)
5335 {
5336 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5337 tree lhs = gimple_call_lhs (call);
5338 tree ref_to_res = gimple_call_arg (call, 1);
5339 tree var = gimple_call_arg (call, 2);
5340 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5341 enum tree_code op
5342 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5343 gimple_seq seq = NULL;
5344 tree r = NULL_TREE;
5345
5346 push_gimplify_context (true);
5347
5348 if (level == GOMP_DIM_VECTOR)
5349 {
5350 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
5351 but that requires a method of emitting a unified jump at the
5352 gimple level. */
5353 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5354 {
5355 tree other_var = make_ssa_name (TREE_TYPE (var));
5356 nvptx_generate_vector_shuffle (gimple_location (call),
5357 other_var, var, shfl, &seq);
5358
5359 r = make_ssa_name (TREE_TYPE (var));
5360 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5361 var, other_var), &seq);
5362 var = r;
5363 }
5364 }
5365 else
5366 {
5367 tree accum = NULL_TREE;
5368
5369 if (level == GOMP_DIM_WORKER)
5370 {
5371 /* Get reduction buffer address. */
5372 tree offset = gimple_call_arg (call, 5);
5373 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5374 tree ptr = make_ssa_name (TREE_TYPE (call));
5375
5376 gimplify_assign (ptr, call, &seq);
5377 accum = ptr;
5378 }
5379 else if (integer_zerop (ref_to_res))
5380 r = var;
5381 else
5382 accum = ref_to_res;
5383
5384 if (accum)
5385 {
5386 /* UPDATE the accumulator. */
5387 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5388 seq = NULL;
5389 r = nvptx_reduction_update (gimple_location (call), &gsi,
5390 accum, var, op);
5391 }
5392 }
5393
5394 if (lhs)
5395 gimplify_assign (lhs, r, &seq);
5396 pop_gimplify_context (NULL);
5397
5398 gsi_replace_with_seq (&gsi, seq, true);
5399 }
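
/* As a worked example of the vector case above: with PTX_VECTOR_LENGTH
   == 32 the loop emits five shuffle/reduce steps, using shuffle
   distances 16, 8, 4, 2 and 1, after which lane 0 of the warp holds the
   reduction over all 32 lanes.  */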
5400
5401 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5402
5403 static void
5404 nvptx_goacc_reduction_teardown (gcall *call)
5405 {
5406 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5407 tree lhs = gimple_call_lhs (call);
5408 tree var = gimple_call_arg (call, 2);
5409 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5410 gimple_seq seq = NULL;
5411
5412 push_gimplify_context (true);
5413 if (level == GOMP_DIM_WORKER)
5414 {
5415 /* Read the worker reduction buffer. */
5416 tree offset = gimple_call_arg (call, 5);
5417 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5418 tree ptr = make_ssa_name (TREE_TYPE (call));
5419
5420 gimplify_assign (ptr, call, &seq);
5421 var = build_simple_mem_ref (ptr);
5422 TREE_THIS_VOLATILE (var) = 1;
5423 }
5424
5425 if (level != GOMP_DIM_GANG)
5426 {
5427 /* Write to the receiver object. */
5428 tree ref_to_res = gimple_call_arg (call, 1);
5429
5430 if (!integer_zerop (ref_to_res))
5431 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5432 }
5433
5434 if (lhs)
5435 gimplify_assign (lhs, var, &seq);
5436
5437 pop_gimplify_context (NULL);
5438
5439 gsi_replace_with_seq (&gsi, seq, true);
5440 }
5441
5442 /* NVPTX reduction expander. */
5443
5444 static void
5445 nvptx_goacc_reduction (gcall *call)
5446 {
5447 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5448
5449 switch (code)
5450 {
5451 case IFN_GOACC_REDUCTION_SETUP:
5452 nvptx_goacc_reduction_setup (call);
5453 break;
5454
5455 case IFN_GOACC_REDUCTION_INIT:
5456 nvptx_goacc_reduction_init (call);
5457 break;
5458
5459 case IFN_GOACC_REDUCTION_FINI:
5460 nvptx_goacc_reduction_fini (call);
5461 break;
5462
5463 case IFN_GOACC_REDUCTION_TEARDOWN:
5464 nvptx_goacc_reduction_teardown (call);
5465 break;
5466
5467 default:
5468 gcc_unreachable ();
5469 }
5470 }
5471
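/* Do not allow constants to be forced into memory; there is no constant
   pool we could usefully put them in. */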
5472 static bool
5473 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5474 rtx x ATTRIBUTE_UNUSED)
5475 {
5476 return true;
5477 }
5478
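/* Return true for the vector modes we support: V2SImode and V2DImode. */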
5479 static bool
5480 nvptx_vector_mode_supported (machine_mode mode)
5481 {
5482 return (mode == V2SImode
5483 || mode == V2DImode);
5484 }
5485
5486 /* Return the preferred mode for vectorizing scalar MODE. */
5487
5488 static machine_mode
5489 nvptx_preferred_simd_mode (scalar_mode mode)
5490 {
5491 switch (mode)
5492 {
5493 case E_DImode:
5494 return V2DImode;
5495 case E_SImode:
5496 return V2SImode;
5497
5498 default:
5499 return default_preferred_simd_mode (mode);
5500 }
5501 }
5502
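/* Return the alignment, in bits, to use for an object of TYPE when the
   default would be BASIC_ALIGN. 128-bit integers are given the alignment
   of the mode they are split into; everything else keeps BASIC_ALIGN. */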
5503 unsigned int
5504 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5505 {
5506 if (TREE_CODE (type) == INTEGER_TYPE)
5507 {
5508 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5509 if (size == GET_MODE_SIZE (TImode))
5510 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5511 }
5512
5513 return basic_align;
5514 }
5515
5516 /* Implement TARGET_MODES_TIEABLE_P. */
5517
5518 static bool
5519 nvptx_modes_tieable_p (machine_mode, machine_mode)
5520 {
5521 return false;
5522 }
5523
5524 /* Implement TARGET_HARD_REGNO_NREGS. */
5525
5526 static unsigned int
5527 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5528 {
5529 return 1;
5530 }

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"