1 /* Target code for NVPTX.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "alias.h"
40 #include "insn-flags.h"
41 #include "output.h"
42 #include "insn-attr.h"
43 #include "flags.h"
44 #include "dojump.h"
45 #include "explow.h"
46 #include "calls.h"
47 #include "varasm.h"
48 #include "stmt.h"
49 #include "expr.h"
50 #include "tm-preds.h"
51 #include "tm-constrs.h"
52 #include "langhooks.h"
53 #include "dbxout.h"
54 #include "cfgrtl.h"
55 #include "gimple.h"
56 #include "stor-layout.h"
57 #include "builtins.h"
58 #include "omp-general.h"
59 #include "omp-low.h"
60 #include "gomp-constants.h"
61 #include "dumpfile.h"
62 #include "internal-fn.h"
63 #include "gimple-iterator.h"
64 #include "stringpool.h"
65 #include "attribs.h"
66 #include "tree-vrp.h"
67 #include "tree-ssa-operands.h"
68 #include "tree-ssanames.h"
69 #include "gimplify.h"
70 #include "tree-phinodes.h"
71 #include "cfgloop.h"
72 #include "fold-const.h"
73 #include "intl.h"
74
75 /* This file should be included last. */
76 #include "target-def.h"
77
78 #define WORKAROUND_PTXJIT_BUG 1
79
80 /* The various PTX memory areas an object might reside in. */
81 enum nvptx_data_area
82 {
83 DATA_AREA_GENERIC,
84 DATA_AREA_GLOBAL,
85 DATA_AREA_SHARED,
86 DATA_AREA_LOCAL,
87 DATA_AREA_CONST,
88 DATA_AREA_PARAM,
89 DATA_AREA_MAX
90 };
91
92 /* We record the data area in the target symbol flags. */
93 #define SYMBOL_DATA_AREA(SYM) \
94 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
95 & 7)
96 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
97 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
98
99 /* Record the function decls we've written, and the libfuncs and function
100 decls corresponding to them. */
101 static std::stringstream func_decls;
102
103 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
104 {
105 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
106 static bool equal (rtx a, rtx b) { return a == b; }
107 };
108
109 static GTY((cache))
110 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
111
112 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
113 {
114 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
115 static bool equal (tree a, tree b) { return a == b; }
116 };
117
118 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
119 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
120
121 /* Buffer needed to broadcast across workers. This is used for both
122 worker-neutering and worker broadcasting. It is shared by all
123 functions emitted. The buffer is placed in shared memory. It'd be
124 nice if PTX supported common blocks, because then this could be
125 shared across TUs (taking the largest size). */
126 static unsigned worker_bcast_size;
127 static unsigned worker_bcast_align;
128 static GTY(()) rtx worker_bcast_sym;
129
130 /* Buffer needed for worker reductions. This has to be distinct from
131 the worker broadcast array, as both may be live concurrently. */
132 static unsigned worker_red_size;
133 static unsigned worker_red_align;
134 static GTY(()) rtx worker_red_sym;
135
136 /* Global lock variable, needed for 128bit worker & gang reductions. */
137 static GTY(()) tree global_lock_var;
138
139 /* True if any function references __nvptx_stacks. */
140 static bool need_softstack_decl;
141
142 /* True if any function references __nvptx_uni. */
143 static bool need_unisimt_decl;
144
145 /* Allocate a new, cleared machine_function structure. */
146
147 static struct machine_function *
148 nvptx_init_machine_status (void)
149 {
150 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
151 p->return_mode = VOIDmode;
152 return p;
153 }
154
155 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
156 and -fopenacc is also enabled. */
157
158 static void
159 diagnose_openacc_conflict (bool optval, const char *optname)
160 {
161 if (flag_openacc && optval)
162 error ("option %s is not supported together with -fopenacc", optname);
163 }
164
165 /* Implement TARGET_OPTION_OVERRIDE. */
166
167 static void
168 nvptx_option_override (void)
169 {
170 init_machine_status = nvptx_init_machine_status;
171
172 /* Set toplevel_reorder, unless explicitly disabled. We need
173 reordering so that we emit necessary assembler decls of
174 undeclared variables. */
175 if (!global_options_set.x_flag_toplevel_reorder)
176 flag_toplevel_reorder = 1;
177
178 /* Set flag_no_common, unless explicitly disabled. We fake common
179 using .weak, and that's not entirely accurate, so avoid it
180 unless forced. */
181 if (!global_options_set.x_flag_no_common)
182 flag_no_common = 1;
183
184 /* The patch area requires nops, which we don't have. */
185 if (function_entry_patch_area_size > 0)
186 sorry ("not generating patch area, nops not supported");
187
188 /* Assumes that it will see only hard registers. */
189 flag_var_tracking = 0;
190
191 if (nvptx_optimize < 0)
192 nvptx_optimize = optimize > 0;
193
194 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
195 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
196 declared_libfuncs_htab
197 = hash_table<declared_libfunc_hasher>::create_ggc (17);
198
199 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
200 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
201 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
202
203 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
204 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
205 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
206
207 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
208 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
209 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
210
211 if (TARGET_GOMP)
212 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
213 }
214
215 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
216 deal with ptx idiosyncrasies. */
217
218 const char *
219 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
220 {
221 switch (mode)
222 {
223 case E_BLKmode:
224 return ".b8";
225 case E_BImode:
226 return ".pred";
227 case E_QImode:
228 if (promote)
229 return ".u32";
230 else
231 return ".u8";
232 case E_HImode:
233 return ".u16";
234 case E_SImode:
235 return ".u32";
236 case E_DImode:
237 return ".u64";
238
239 case E_SFmode:
240 return ".f32";
241 case E_DFmode:
242 return ".f64";
243
244 case E_V2SImode:
245 return ".v2.u32";
246 case E_V2DImode:
247 return ".v2.u64";
248
249 default:
250 gcc_unreachable ();
251 }
252 }
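/* For illustration: nvptx_ptx_type_from_mode (SImode, false) yields ".u32"
   and (SFmode, false) yields ".f32"; with PROMOTE set, a QImode value is
   emitted as ".u32" rather than ".u8", sidestepping PTX's limited support
   for 8-bit register operations.  */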
253
254 /* Encode the PTX data area that DECL (which might not actually be a
255 _DECL) should reside in. */
256
257 static void
258 nvptx_encode_section_info (tree decl, rtx rtl, int first)
259 {
260 default_encode_section_info (decl, rtl, first);
261 if (first && MEM_P (rtl))
262 {
263 nvptx_data_area area = DATA_AREA_GENERIC;
264
265 if (TREE_CONSTANT (decl))
266 area = DATA_AREA_CONST;
267 else if (TREE_CODE (decl) == VAR_DECL)
268 {
269 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
270 {
271 area = DATA_AREA_SHARED;
272 if (DECL_INITIAL (decl))
273 error ("static initialization of variable %q+D in %<.shared%>"
274 " memory is not supported", decl);
275 }
276 else
277 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
278 }
279
280 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
281 }
282 }
283
284 /* Return the PTX name of the data area in which SYM should be
285 placed. The symbol must have already been processed by
286 nvptx_encode_section_info, or equivalent. */
287
288 static const char *
289 section_for_sym (rtx sym)
290 {
291 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
292 /* Same order as nvptx_data_area enum. */
293 static char const *const areas[] =
294 {"", ".global", ".shared", ".local", ".const", ".param"};
295
296 return areas[area];
297 }
298
299 /* Similarly for a decl. */
300
301 static const char *
302 section_for_decl (const_tree decl)
303 {
304 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
305 }
306
307 /* Check NAME for special function names and redirect them by returning a
308 replacement. This applies to malloc, free and realloc, for which we
309 want to use libgcc wrappers, and call, which triggers a bug in
310 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
311 not active in an offload compiler -- the names are all set by the
312 host-side compiler. */
313
314 static const char *
315 nvptx_name_replacement (const char *name)
316 {
317 if (strcmp (name, "call") == 0)
318 return "__nvptx_call";
319 if (strcmp (name, "malloc") == 0)
320 return "__nvptx_malloc";
321 if (strcmp (name, "free") == 0)
322 return "__nvptx_free";
323 if (strcmp (name, "realloc") == 0)
324 return "__nvptx_realloc";
325 return name;
326 }
327
328 /* If MODE should be treated as two registers of an inner mode, return
329 that inner mode. Otherwise return VOIDmode. */
330
331 static machine_mode
332 maybe_split_mode (machine_mode mode)
333 {
334 if (COMPLEX_MODE_P (mode))
335 return GET_MODE_INNER (mode);
336
337 if (mode == TImode)
338 return DImode;
339
340 return VOIDmode;
341 }
342
343 /* Return true if mode should be treated as two registers. */
344
345 static bool
346 split_mode_p (machine_mode mode)
347 {
348 return maybe_split_mode (mode) != VOIDmode;
349 }
350
351 /* Output a register, subreg, or register pair (with optional
352 enclosing braces). */
353
354 static void
355 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
356 int subreg_offset = -1)
357 {
358 if (inner_mode == VOIDmode)
359 {
360 if (HARD_REGISTER_NUM_P (regno))
361 fprintf (file, "%s", reg_names[regno]);
362 else
363 fprintf (file, "%%r%d", regno);
364 }
365 else if (subreg_offset >= 0)
366 {
367 output_reg (file, regno, VOIDmode);
368 fprintf (file, "$%d", subreg_offset);
369 }
370 else
371 {
372 if (subreg_offset == -1)
373 fprintf (file, "{");
374 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
375 fprintf (file, ",");
376 output_reg (file, regno, inner_mode, 0);
377 if (subreg_offset == -1)
378 fprintf (file, "}");
379 }
380 }
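/* For example, a pseudo register 23 with no inner mode prints as "%r23",
   while a TImode pseudo split into DImode halves prints as the pair
   "{%r23$8,%r23$0}" (high part first) when the default subreg_offset of
   -1 is used.  */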
381
382 /* Emit forking instructions for MASK. */
383
384 static void
385 nvptx_emit_forking (unsigned mask, bool is_call)
386 {
387 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
388 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
389 if (mask)
390 {
391 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
392
393 /* Emit fork at all levels. This helps form SESE regions, as
394 it creates a block with a single successor before entering a
395 partitioned region. That is a good candidate for the end of
396 an SESE region. */
397 if (!is_call)
398 emit_insn (gen_nvptx_fork (op));
399 emit_insn (gen_nvptx_forked (op));
400 }
401 }
402
403 /* Emit joining instructions for MASK. */
404
405 static void
406 nvptx_emit_joining (unsigned mask, bool is_call)
407 {
408 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
409 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
410 if (mask)
411 {
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
413
414 /* Emit joining for all non-call pars to ensure there's a single
415 predecessor for the block the join insn ends up in. This is
416 needed for skipping entire loops. */
417 if (!is_call)
418 emit_insn (gen_nvptx_joining (op));
419 emit_insn (gen_nvptx_join (op));
420 }
421 }
422
423 \f
424 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
425 returned in memory. Integer and floating types supported by the
426 machine are passed in registers, everything else is passed in
427 memory. Complex types are split. */
428
429 static bool
430 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
431 {
432 if (type)
433 {
434 if (AGGREGATE_TYPE_P (type))
435 return true;
436 if (TREE_CODE (type) == VECTOR_TYPE)
437 return true;
438 }
439
440 if (!for_return && COMPLEX_MODE_P (mode))
441 /* Complex types are passed as two underlying args. */
442 mode = GET_MODE_INNER (mode);
443
444 if (GET_MODE_CLASS (mode) != MODE_INT
445 && GET_MODE_CLASS (mode) != MODE_FLOAT)
446 return true;
447
448 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
449 return true;
450
451 return false;
452 }
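/* As a rough illustration: aggregates, vector types and integers wider
   than a machine word (e.g. TImode) are passed in memory, while an int,
   a pointer or a double is passed in a register; a complex argument is
   judged by its component mode, since it is split into two scalar args.  */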
453
454 /* A non-memory argument of mode MODE is being passed; determine the mode it
455 should be promoted to. This is also used for determining return
456 type promotion. */
457
458 static machine_mode
459 promote_arg (machine_mode mode, bool prototyped)
460 {
461 if (!prototyped && mode == SFmode)
462 /* K&R float promotion for unprototyped functions. */
463 mode = DFmode;
464 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
465 mode = SImode;
466
467 return mode;
468 }
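/* For example, a char or short argument (QImode/HImode) is widened to
   SImode here, and a float argument to an unprototyped function is
   promoted to DFmode, mirroring the usual C argument promotions.  */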
469
470 /* A non-memory return type of MODE is being returned. Determine the
471 mode it should be promoted to. */
472
473 static machine_mode
474 promote_return (machine_mode mode)
475 {
476 return promote_arg (mode, true);
477 }
478
479 /* Implement TARGET_FUNCTION_ARG. */
480
481 static rtx
482 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
483 const_tree, bool named)
484 {
485 if (mode == VOIDmode || !named)
486 return NULL_RTX;
487
488 return gen_reg_rtx (mode);
489 }
490
491 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
492
493 static rtx
494 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
495 const_tree, bool named)
496 {
497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
498
499 if (mode == VOIDmode || !named)
500 return NULL_RTX;
501
502 /* No need to deal with split modes here, the only case that can
503 happen is complex modes and those are dealt with by
504 TARGET_SPLIT_COMPLEX_ARG. */
505 return gen_rtx_UNSPEC (mode,
506 gen_rtvec (1, GEN_INT (cum->count)),
507 UNSPEC_ARG_REG);
508 }
509
510 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
511
512 static void
513 nvptx_function_arg_advance (cumulative_args_t cum_v,
514 machine_mode ARG_UNUSED (mode),
515 const_tree ARG_UNUSED (type),
516 bool ARG_UNUSED (named))
517 {
518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
519
520 cum->count++;
521 }
522
523 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
524
525 For nvptx, this is only used for variadic args. The type has already
526 been promoted and/or converted to invisible reference. */
527
528 static unsigned
529 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
530 {
531 return GET_MODE_ALIGNMENT (mode);
532 }
533
534 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
535
536 For nvptx, we know how to handle functions declared as stdarg: by
537 passing an extra pointer to the unnamed arguments. However, the
538 Fortran frontend can produce a different situation, where a
539 function pointer is declared with no arguments, but the actual
540 function and calls to it take more arguments. In that case, we
541 want to ensure the call matches the definition of the function. */
542
543 static bool
544 nvptx_strict_argument_naming (cumulative_args_t cum_v)
545 {
546 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
547
548 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
549 }
550
551 /* Implement TARGET_LIBCALL_VALUE. */
552
553 static rtx
554 nvptx_libcall_value (machine_mode mode, const_rtx)
555 {
556 if (!cfun || !cfun->machine->doing_call)
557 /* Pretend to return in a hard reg for early uses before pseudos can be
558 generated. */
559 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
560
561 return gen_reg_rtx (mode);
562 }
563
564 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
565 where function FUNC returns or receives a value of data type TYPE. */
566
567 static rtx
568 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
569 bool outgoing)
570 {
571 machine_mode mode = promote_return (TYPE_MODE (type));
572
573 if (outgoing)
574 {
575 gcc_assert (cfun);
576 cfun->machine->return_mode = mode;
577 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
578 }
579
580 return nvptx_libcall_value (mode, NULL_RTX);
581 }
582
583 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
584
585 static bool
586 nvptx_function_value_regno_p (const unsigned int regno)
587 {
588 return regno == NVPTX_RETURN_REGNUM;
589 }
590
591 /* Types with a mode other than those supported by the machine are passed by
592 reference in memory. */
593
594 static bool
595 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
596 machine_mode mode, const_tree type,
597 bool ARG_UNUSED (named))
598 {
599 return pass_in_memory (mode, type, false);
600 }
601
602 /* Implement TARGET_RETURN_IN_MEMORY. */
603
604 static bool
605 nvptx_return_in_memory (const_tree type, const_tree)
606 {
607 return pass_in_memory (TYPE_MODE (type), type, true);
608 }
609
610 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
611
612 static machine_mode
613 nvptx_promote_function_mode (const_tree type, machine_mode mode,
614 int *ARG_UNUSED (punsignedp),
615 const_tree funtype, int for_return)
616 {
617 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
618 }
619
620 /* Helper for write_arg_type. Emit a single PTX argument of MODE, either
621 in a prototype, or as a copy in a function prologue. ARGNO is the
622 index of this argument in the PTX function. FOR_REG is negative
623 if we're emitting the PTX prototype, zero if we're copying to an
624 argument register, and greater than zero if we're copying to a
625 specific hard register. */
626
627 static int
628 write_arg_mode (std::stringstream &s, int for_reg, int argno,
629 machine_mode mode)
630 {
631 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
632
633 if (for_reg < 0)
634 {
635 /* Writing PTX prototype. */
636 s << (argno ? ", " : " (");
637 s << ".param" << ptx_type << " %in_ar" << argno;
638 }
639 else
640 {
641 s << "\t.reg" << ptx_type << " ";
642 if (for_reg)
643 s << reg_names[for_reg];
644 else
645 s << "%ar" << argno;
646 s << ";\n";
647 if (argno >= 0)
648 {
649 s << "\tld.param" << ptx_type << " ";
650 if (for_reg)
651 s << reg_names[for_reg];
652 else
653 s << "%ar" << argno;
654 s << ", [%in_ar" << argno << "];\n";
655 }
656 }
657 return argno + 1;
658 }
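/* As an illustration, for an SImode argument 0 this writes
   " (.param.u32 %in_ar0" into a prototype (FOR_REG < 0), whereas in a
   function prologue (FOR_REG == 0) it emits roughly
       .reg.u32 %ar0;
       ld.param.u32 %ar0, [%in_ar0];
   copying the incoming .param into a register.  */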
659
660 /* Process function parameter TYPE to emit one or more PTX
661 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
662 is true if this is a prototyped function, rather than an old-style
663 C declaration. Returns the next argument number to use.
664
665 The promotion behavior here must match the regular GCC function
666 parameter marshalling machinery. */
667
668 static int
669 write_arg_type (std::stringstream &s, int for_reg, int argno,
670 tree type, bool prototyped)
671 {
672 machine_mode mode = TYPE_MODE (type);
673
674 if (mode == VOIDmode)
675 return argno;
676
677 if (pass_in_memory (mode, type, false))
678 mode = Pmode;
679 else
680 {
681 bool split = TREE_CODE (type) == COMPLEX_TYPE;
682
683 if (split)
684 {
685 /* Complex types are sent as two separate args. */
686 type = TREE_TYPE (type);
687 mode = TYPE_MODE (type);
688 prototyped = true;
689 }
690
691 mode = promote_arg (mode, prototyped);
692 if (split)
693 argno = write_arg_mode (s, for_reg, argno, mode);
694 }
695
696 return write_arg_mode (s, for_reg, argno, mode);
697 }
698
699 /* Emit a PTX return as a prototype or function prologue declaration
700 for MODE. */
701
702 static void
703 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
704 {
705 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
706 const char *pfx = "\t.reg";
707 const char *sfx = ";\n";
708
709 if (for_proto)
710 pfx = "(.param", sfx = "_out) ";
711
712 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
713 }
714
715 /* Process a function return TYPE to emit a PTX return as a prototype
716 or function prologue declaration. Returns true if return is via an
717 additional pointer parameter. The promotion behavior here must
718 match the regular GCC function return marshalling. */
719
720 static bool
721 write_return_type (std::stringstream &s, bool for_proto, tree type)
722 {
723 machine_mode mode = TYPE_MODE (type);
724
725 if (mode == VOIDmode)
726 return false;
727
728 bool return_in_mem = pass_in_memory (mode, type, true);
729
730 if (return_in_mem)
731 {
732 if (for_proto)
733 return return_in_mem;
734
735 /* Named return values can cause us to return a pointer as well
736 as expect an argument for the return location. This is
737 optimization-level specific, so no caller can make use of
738 this data, but more importantly for us, we must ensure it
739 doesn't change the PTX prototype. */
740 mode = (machine_mode) cfun->machine->return_mode;
741
742 if (mode == VOIDmode)
743 return return_in_mem;
744
745 /* Clear return_mode to inhibit copy of retval to non-existent
746 retval parameter. */
747 cfun->machine->return_mode = VOIDmode;
748 }
749 else
750 mode = promote_return (mode);
751
752 write_return_mode (s, for_proto, mode);
753
754 return return_in_mem;
755 }
756
757 /* Look for attributes in ATTRS that would indicate we must write a function
758 as a .entry kernel rather than a .func. Return true if one is found. */
759
760 static bool
761 write_as_kernel (tree attrs)
762 {
763 return (lookup_attribute ("kernel", attrs) != NULL_TREE
764 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
765 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
766 /* For OpenMP target regions, the corresponding kernel entry is emitted from
767 write_omp_entry as a separate function. */
768 }
769
770 /* Emit a linker marker for a function decl or defn. */
771
772 static void
773 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
774 const char *name)
775 {
776 s << "\n// BEGIN";
777 if (globalize)
778 s << " GLOBAL";
779 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
780 s << name << "\n";
781 }
782
783 /* Emit a linker marker for a variable decl or defn. */
784
785 static void
786 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
787 {
788 fprintf (file, "\n// BEGIN%s VAR %s: ",
789 globalize ? " GLOBAL" : "",
790 is_defn ? "DEF" : "DECL");
791 assemble_name_raw (file, name);
792 fputs ("\n", file);
793 }
794
795 /* Write a .func or .entry declaration or definition along with
796 a helper comment for use by ld. S is the stream to write to, DECL
797 the decl for the function with name NAME. For definitions, emit
798 a declaration too. */
799
800 static const char *
801 write_fn_proto (std::stringstream &s, bool is_defn,
802 const char *name, const_tree decl)
803 {
804 if (is_defn)
805 /* Emit a declaration. The PTX assembler gets upset without it. */
806 name = write_fn_proto (s, false, name, decl);
807 else
808 {
809 /* Avoid repeating the name replacement. */
810 name = nvptx_name_replacement (name);
811 if (name[0] == '*')
812 name++;
813 }
814
815 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
816
817 /* PTX declaration. */
818 if (DECL_EXTERNAL (decl))
819 s << ".extern ";
820 else if (TREE_PUBLIC (decl))
821 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
822 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
823
824 tree fntype = TREE_TYPE (decl);
825 tree result_type = TREE_TYPE (fntype);
826
827 /* atomic_compare_exchange_$n builtins have an exceptional calling
828 convention. */
829 int not_atomic_weak_arg = -1;
830 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
831 switch (DECL_FUNCTION_CODE (decl))
832 {
833 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
834 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
835 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
836 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
838 /* These atomics skip the 'weak' parm in an actual library
839 call. We must skip it in the prototype too. */
840 not_atomic_weak_arg = 3;
841 break;
842
843 default:
844 break;
845 }
846
847 /* Declare the result. */
848 bool return_in_mem = write_return_type (s, true, result_type);
849
850 s << name;
851
852 int argno = 0;
853
854 /* Emit argument list. */
855 if (return_in_mem)
856 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
857
858 /* We get:
859 NULL in TYPE_ARG_TYPES, for old-style functions
860 NULL in DECL_ARGUMENTS, for builtin functions without another
861 declaration.
862 So we have to pick the best one we have. */
863 tree args = TYPE_ARG_TYPES (fntype);
864 bool prototyped = true;
865 if (!args)
866 {
867 args = DECL_ARGUMENTS (decl);
868 prototyped = false;
869 }
870
871 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
872 {
873 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
874
875 if (not_atomic_weak_arg)
876 argno = write_arg_type (s, -1, argno, type, prototyped);
877 else
878 gcc_assert (type == boolean_type_node);
879 }
880
881 if (stdarg_p (fntype))
882 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
883
884 if (DECL_STATIC_CHAIN (decl))
885 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
886
887 if (!argno && strcmp (name, "main") == 0)
888 {
889 argno = write_arg_type (s, -1, argno, integer_type_node, true);
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
891 }
892
893 if (argno)
894 s << ")";
895
896 s << (is_defn ? "\n" : ";\n");
897
898 return name;
899 }
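/* For a simple public function such as "int foo (int)" the declaration
   written here looks roughly like
       // BEGIN GLOBAL FUNCTION DECL: foo
       .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);
   (illustrative only; the exact register names come from the target's
   REGISTER_NAMES).  */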
900
901 /* Construct a function declaration from a call insn. This can be
902 necessary for two reasons - either we have an indirect call which
903 requires a .callprototype declaration, or we have a libcall
904 generated by emit_library_call for which no decl exists. */
905
906 static void
907 write_fn_proto_from_insn (std::stringstream &s, const char *name,
908 rtx result, rtx pat)
909 {
910 if (!name)
911 {
912 s << "\t.callprototype ";
913 name = "_";
914 }
915 else
916 {
917 name = nvptx_name_replacement (name);
918 write_fn_marker (s, false, true, name);
919 s << "\t.extern .func ";
920 }
921
922 if (result != NULL_RTX)
923 write_return_mode (s, true, GET_MODE (result));
924
925 s << name;
926
927 int arg_end = XVECLEN (pat, 0);
928 for (int i = 1; i < arg_end; i++)
929 {
930 /* We don't have to deal with mode splitting & promotion here,
931 as that was already done when generating the call
932 sequence. */
933 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
934
935 write_arg_mode (s, -1, i - 1, mode);
936 }
937 if (arg_end != 1)
938 s << ")";
939 s << ";\n";
940 }
941
942 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
943 table and write a ptx prototype. These are emitted at the end of
944 compilation. */
945
946 static void
947 nvptx_record_fndecl (tree decl)
948 {
949 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
950 if (*slot == NULL)
951 {
952 *slot = decl;
953 const char *name = get_fnname_from_decl (decl);
954 write_fn_proto (func_decls, false, name, decl);
955 }
956 }
957
958 /* Record a libcall or unprototyped external function. CALLEE is the
959 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
960 declaration for it. */
961
962 static void
963 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
964 {
965 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
966 if (*slot == NULL)
967 {
968 *slot = callee;
969
970 const char *name = XSTR (callee, 0);
971 write_fn_proto_from_insn (func_decls, name, retval, pat);
972 }
973 }
974
975 /* DECL is an external FUNCTION_DECL that we're referencing. If it
976 is prototyped, record it now. Otherwise record it as needed at end
977 of compilation, when we might have more information about it. */
978
979 void
980 nvptx_record_needed_fndecl (tree decl)
981 {
982 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
983 {
984 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
985 if (*slot == NULL)
986 *slot = decl;
987 }
988 else
989 nvptx_record_fndecl (decl);
990 }
991
992 /* SYM is a SYMBOL_REF. If it refers to an external function, record
993 it as needed. */
994
995 static void
996 nvptx_maybe_record_fnsym (rtx sym)
997 {
998 tree decl = SYMBOL_REF_DECL (sym);
999
1000 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1001 nvptx_record_needed_fndecl (decl);
1002 }
1003
1004 /* Emit a local array to hold some part of a conventional stack frame
1005 and initialize REGNO to point to it. If the size is zero, it'll
1006 never be valid to dereference, so we can simply initialize to
1007 zero. */
1008
1009 static void
1010 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1011 {
1012 if (size)
1013 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1014 align, reg_names[regno], size);
1015 fprintf (file, "\t.reg.u%d %s;\n",
1016 POINTER_SIZE, reg_names[regno]);
1017 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1018 : "\tmov.u%d %s, 0;\n"),
1019 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1020 }
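/* For a frame of 16 bytes aligned to 8 under -m64, this emits roughly
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;
   (illustrative; "%frame" stands for whatever name REGNO prints as).  */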
1021
1022 /* Emit soft stack frame setup sequence. */
1023
1024 static void
1025 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1026 {
1027 /* Maintain 64-bit stack alignment. */
1028 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1029 size = ROUND_UP (size, keep_align);
1030 int bits = POINTER_SIZE;
1031 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1032 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1033 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1034 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1035 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1036 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1037 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1038 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1039 fprintf (file, "\t{\n");
1040 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1041 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1042 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1043 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1044 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1045 bits == 64 ? ".wide" : ".lo", bits / 8);
1046 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1047
1048 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1049 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1050
1051 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1052 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1053 bits, reg_sspprev, reg_sspslot);
1054
1055 /* Initialize %frame = %sspprev - size. */
1056 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1057 bits, reg_frame, reg_sspprev, size);
1058
1059 /* Apply alignment, if larger than 64. */
1060 if (alignment > keep_align)
1061 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1062 bits, reg_frame, reg_frame, -alignment);
1063
1064 size = crtl->outgoing_args_size;
1065 gcc_assert (size % keep_align == 0);
1066
1067 /* Initialize %stack. */
1068 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1069 bits, reg_stack, reg_frame, size);
1070
1071 if (!crtl->is_leaf)
1072 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1073 bits, reg_sspslot, reg_stack);
1074 fprintf (file, "\t}\n");
1075 cfun->machine->has_softstack = true;
1076 need_softstack_decl = true;
1077 }
1078
1079 /* Emit code to initialize the REGNO predicate register to indicate
1080 whether we are not lane zero on the NAME axis. */
1081
1082 static void
1083 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1084 {
1085 fprintf (file, "\t{\n");
1086 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1087 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1088 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1089 fprintf (file, "\t}\n");
1090 }
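/* For instance, with REGNO 5 and NAME "y" the emitted block is
       {
           .reg.u32 %y;
           mov.u32 %y, %tid.y;
           setp.ne.u32 %r5, %y, 0;
       }
   leaving %r5 true for every thread that is not lane zero on that axis.  */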
1091
1092 /* Emit code to initialize predicate and master lane index registers for
1093 -muniform-simt code generation variant. */
1094
1095 static void
1096 nvptx_init_unisimt_predicate (FILE *file)
1097 {
1098 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1099 int loc = REGNO (cfun->machine->unisimt_location);
1100 int bits = POINTER_SIZE;
1101 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1102 fprintf (file, "\t{\n");
1103 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1104 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1105 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1106 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1107 bits == 64 ? ".wide" : ".lo");
1108 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1109 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1110 if (cfun->machine->unisimt_predicate)
1111 {
1112 int master = REGNO (cfun->machine->unisimt_master);
1113 int pred = REGNO (cfun->machine->unisimt_predicate);
1114 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1115 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1116 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1117 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1118 /* Compute predicate as 'tid.x == master'. */
1119 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1120 }
1121 fprintf (file, "\t}\n");
1122 need_unisimt_decl = true;
1123 }
1124
1125 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1126
1127 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1128 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1129 {
1130 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1131 __nvptx_uni[tid.y] = 0;
1132 gomp_nvptx_main (ORIG, arg);
1133 }
1134 ORIG itself should not be emitted as a PTX .entry function. */
1135
1136 static void
1137 write_omp_entry (FILE *file, const char *name, const char *orig)
1138 {
1139 static bool gomp_nvptx_main_declared;
1140 if (!gomp_nvptx_main_declared)
1141 {
1142 gomp_nvptx_main_declared = true;
1143 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1144 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1145 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1146 }
1147 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1148 #define NTID_Y "%ntid.y"
1149 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1150 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1151 {\n\
1152 .reg.u32 %r<3>;\n\
1153 .reg.u" PS " %R<4>;\n\
1154 mov.u32 %r0, %tid.y;\n\
1155 mov.u32 %r1, " NTID_Y ";\n\
1156 mov.u32 %r2, %ctaid.x;\n\
1157 cvt.u" PS ".u32 %R1, %r0;\n\
1158 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1159 mov.u" PS " %R0, __nvptx_stacks;\n\
1160 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1161 ld.param.u" PS " %R2, [%stack];\n\
1162 ld.param.u" PS " %R3, [%sz];\n\
1163 add.u" PS " %R2, %R2, %R3;\n\
1164 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1165 st.shared.u" PS " [%R0], %R2;\n\
1166 mov.u" PS " %R0, __nvptx_uni;\n\
1167 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1168 mov.u32 %r0, 0;\n\
1169 st.shared.u32 [%R0], %r0;\n\
1170 mov.u" PS " %R0, \0;\n\
1171 ld.param.u" PS " %R1, [%arg];\n\
1172 {\n\
1173 .param.u" PS " %P<2>;\n\
1174 st.param.u" PS " [%P0], %R0;\n\
1175 st.param.u" PS " [%P1], %R1;\n\
1176 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1177 }\n\
1178 ret.uni;\n\
1179 }\n"
1180 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1181 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1182 #undef ENTRY_TEMPLATE
1183 #undef NTID_Y
1184 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1185 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1186 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1187 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1188 need_softstack_decl = need_unisimt_decl = true;
1189 }
1190
1191 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1192 function, including local var decls and copies from the arguments to
1193 local regs. */
1194
1195 void
1196 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1197 {
1198 tree fntype = TREE_TYPE (decl);
1199 tree result_type = TREE_TYPE (fntype);
1200 int argno = 0;
1201
1202 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1203 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1204 {
1205 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1206 sprintf (buf, "%s$impl", name);
1207 write_omp_entry (file, name, buf);
1208 name = buf;
1209 }
1210 /* We construct the initial part of the function into a string
1211 stream, in order to share the prototype writing code. */
1212 std::stringstream s;
1213 write_fn_proto (s, true, name, decl);
1214 s << "{\n";
1215
1216 bool return_in_mem = write_return_type (s, false, result_type);
1217 if (return_in_mem)
1218 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1219
1220 /* Declare and initialize incoming arguments. */
1221 tree args = TYPE_ARG_TYPES (fntype);
1222 bool prototyped = true;
1223 if (!args)
1224 {
1225 args = DECL_ARGUMENTS (decl);
1226 prototyped = false;
1227 }
1228
1229 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1230 {
1231 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1232
1233 argno = write_arg_type (s, 0, argno, type, prototyped);
1234 }
1235
1236 if (stdarg_p (fntype))
1237 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1238 true);
1239
1240 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1241 write_arg_type (s, STATIC_CHAIN_REGNUM,
1242 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1243 true);
1244
1245 fprintf (file, "%s", s.str().c_str());
1246
1247 /* Usually 'crtl->is_leaf' is computed during register allocator
1248 initialization (which is not done on NVPTX) or for pressure-sensitive
1249 optimizations. Initialize it here, except if already set. */
1250 if (!crtl->is_leaf)
1251 crtl->is_leaf = leaf_function_p ();
1252
1253 HOST_WIDE_INT sz = get_frame_size ();
1254 bool need_frameptr = sz || cfun->machine->has_chain;
1255 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1256 if (!TARGET_SOFT_STACK)
1257 {
1258 /* Declare a local var for outgoing varargs. */
1259 if (cfun->machine->has_varadic)
1260 init_frame (file, STACK_POINTER_REGNUM,
1261 UNITS_PER_WORD, crtl->outgoing_args_size);
1262
1263 /* Declare a local variable for the frame. Force its size to be
1264 DImode-compatible. */
1265 if (need_frameptr)
1266 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1267 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1268 }
1269 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1270 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1271 init_softstack_frame (file, alignment, sz);
1272
1273 if (cfun->machine->has_simtreg)
1274 {
1275 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1276 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1277 align = MAX (align, GET_MODE_SIZE (DImode));
1278 if (!crtl->is_leaf || cfun->calls_alloca)
1279 simtsz = HOST_WIDE_INT_M1U;
1280 if (simtsz == HOST_WIDE_INT_M1U)
1281 simtsz = nvptx_softstack_size;
1282 if (cfun->machine->has_softstack)
1283 simtsz += POINTER_SIZE / 8;
1284 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1285 if (align > GET_MODE_SIZE (DImode))
1286 simtsz += align - GET_MODE_SIZE (DImode);
1287 if (simtsz)
1288 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1289 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1290 }
1291 /* Declare the pseudos we have as ptx registers. */
1292 int maxregs = max_reg_num ();
1293 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1294 {
1295 if (regno_reg_rtx[i] != const0_rtx)
1296 {
1297 machine_mode mode = PSEUDO_REGNO_MODE (i);
1298 machine_mode split = maybe_split_mode (mode);
1299
1300 if (split_mode_p (mode))
1301 mode = split;
1302 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1303 output_reg (file, i, split, -2);
1304 fprintf (file, ";\n");
1305 }
1306 }
1307
1308 /* Emit axis predicates. */
1309 if (cfun->machine->axis_predicate[0])
1310 nvptx_init_axis_predicate (file,
1311 REGNO (cfun->machine->axis_predicate[0]), "y");
1312 if (cfun->machine->axis_predicate[1])
1313 nvptx_init_axis_predicate (file,
1314 REGNO (cfun->machine->axis_predicate[1]), "x");
1315 if (cfun->machine->unisimt_predicate
1316 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1317 nvptx_init_unisimt_predicate (file);
1318 }
1319
1320 /* Output code for switching uniform-simt state. ENTERING indicates whether
1321 we are entering or leaving a non-uniform execution region. */
1322
1323 static void
1324 nvptx_output_unisimt_switch (FILE *file, bool entering)
1325 {
1326 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1327 return;
1328 fprintf (file, "\t{\n");
1329 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1330 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1331 if (!crtl->is_leaf)
1332 {
1333 int loc = REGNO (cfun->machine->unisimt_location);
1334 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1335 }
1336 if (cfun->machine->unisimt_predicate)
1337 {
1338 int master = REGNO (cfun->machine->unisimt_master);
1339 int pred = REGNO (cfun->machine->unisimt_predicate);
1340 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1341 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1342 master, entering ? "%ustmp2" : "0");
1343 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1344 }
1345 fprintf (file, "\t}\n");
1346 }
1347
1348 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1349 ENTERING indicates whether we are entering or leaving non-uniform execution.
1350 PTR is the register pointing to allocated storage, it is assigned to on
1351 entering and used to restore state on leaving. SIZE and ALIGN are used only
1352 on entering. */
1353
1354 static void
1355 nvptx_output_softstack_switch (FILE *file, bool entering,
1356 rtx ptr, rtx size, rtx align)
1357 {
1358 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1359 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1360 return;
1361 int bits = POINTER_SIZE, regno = REGNO (ptr);
1362 fprintf (file, "\t{\n");
1363 if (entering)
1364 {
1365 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1366 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1367 cfun->machine->simt_stack_size);
1368 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1369 if (CONST_INT_P (size))
1370 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1371 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1372 else
1373 output_reg (file, REGNO (size), VOIDmode);
1374 fputs (";\n", file);
1375 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1376 fprintf (file,
1377 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1378 bits, regno, regno, UINTVAL (align));
1379 }
1380 if (cfun->machine->has_softstack)
1381 {
1382 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1383 if (entering)
1384 {
1385 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1386 bits, regno, bits / 8, reg_stack);
1387 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1388 bits, reg_stack, regno, bits / 8);
1389 }
1390 else
1391 {
1392 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1393 bits, reg_stack, regno, bits / 8);
1394 }
1395 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1396 }
1397 fprintf (file, "\t}\n");
1398 }
1399
1400 /* Output code to enter non-uniform execution region. DEST is a register
1401 to hold a per-lane allocation given by SIZE and ALIGN. */
1402
1403 const char *
1404 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1405 {
1406 nvptx_output_unisimt_switch (asm_out_file, true);
1407 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1408 return "";
1409 }
1410
1411 /* Output code to leave non-uniform execution region. SRC is the register
1412 holding per-lane storage previously allocated by omp_simt_enter insn. */
1413
1414 const char *
1415 nvptx_output_simt_exit (rtx src)
1416 {
1417 nvptx_output_unisimt_switch (asm_out_file, false);
1418 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1419 return "";
1420 }
1421
1422 /* Output instruction that sets soft stack pointer in shared memory to the
1423 value in register given by SRC_REGNO. */
1424
1425 const char *
1426 nvptx_output_set_softstack (unsigned src_regno)
1427 {
1428 if (cfun->machine->has_softstack && !crtl->is_leaf)
1429 {
1430 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1431 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1432 output_reg (asm_out_file, src_regno, VOIDmode);
1433 fprintf (asm_out_file, ";\n");
1434 }
1435 return "";
1436 }
1437 /* Output a return instruction. Also copy the return value to its outgoing
1438 location. */
1439
1440 const char *
1441 nvptx_output_return (void)
1442 {
1443 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1444
1445 if (mode != VOIDmode)
1446 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1447 nvptx_ptx_type_from_mode (mode, false),
1448 reg_names[NVPTX_RETURN_REGNUM],
1449 reg_names[NVPTX_RETURN_REGNUM]);
1450
1451 return "ret;";
1452 }
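/* For a function whose return_mode is SImode this prints something like
   "st.param.u32 [%value_out], %value;" before the "ret;" (illustrative;
   the actual name is reg_names[NVPTX_RETURN_REGNUM]).  */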
1453
1454 /* Terminate a function by writing a closing brace to FILE. */
1455
1456 void
1457 nvptx_function_end (FILE *file)
1458 {
1459 fprintf (file, "}\n");
1460 }
1461 \f
1462 /* Decide whether we can make a sibling call to a function. For ptx, we
1463 can't. */
1464
1465 static bool
1466 nvptx_function_ok_for_sibcall (tree, tree)
1467 {
1468 return false;
1469 }
1470
1471 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1472
1473 static rtx
1474 nvptx_get_drap_rtx (void)
1475 {
1476 if (TARGET_SOFT_STACK && stack_realign_drap)
1477 return arg_pointer_rtx;
1478 return NULL_RTX;
1479 }
1480
1481 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1482 argument to the next call. */
1483
1484 static void
1485 nvptx_call_args (rtx arg, tree fntype)
1486 {
1487 if (!cfun->machine->doing_call)
1488 {
1489 cfun->machine->doing_call = true;
1490 cfun->machine->is_varadic = false;
1491 cfun->machine->num_args = 0;
1492
1493 if (fntype && stdarg_p (fntype))
1494 {
1495 cfun->machine->is_varadic = true;
1496 cfun->machine->has_varadic = true;
1497 cfun->machine->num_args++;
1498 }
1499 }
1500
1501 if (REG_P (arg) && arg != pc_rtx)
1502 {
1503 cfun->machine->num_args++;
1504 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1505 cfun->machine->call_args);
1506 }
1507 }
1508
1509 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1510 information we recorded. */
1511
1512 static void
1513 nvptx_end_call_args (void)
1514 {
1515 cfun->machine->doing_call = false;
1516 free_EXPR_LIST_list (&cfun->machine->call_args);
1517 }
1518
1519 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1520 track of whether calls involving static chains or varargs were seen
1521 in the current function.
1522 For libcalls, maintain a hash table of decls we have seen, and
1523 record a function decl for later when encountering a new one. */
1524
1525 void
1526 nvptx_expand_call (rtx retval, rtx address)
1527 {
1528 rtx callee = XEXP (address, 0);
1529 rtx varargs = NULL_RTX;
1530 unsigned parallel = 0;
1531
1532 if (!call_insn_operand (callee, Pmode))
1533 {
1534 callee = force_reg (Pmode, callee);
1535 address = change_address (address, QImode, callee);
1536 }
1537
1538 if (GET_CODE (callee) == SYMBOL_REF)
1539 {
1540 tree decl = SYMBOL_REF_DECL (callee);
1541 if (decl != NULL_TREE)
1542 {
1543 if (DECL_STATIC_CHAIN (decl))
1544 cfun->machine->has_chain = true;
1545
1546 tree attr = oacc_get_fn_attrib (decl);
1547 if (attr)
1548 {
1549 tree dims = TREE_VALUE (attr);
1550
1551 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1552 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1553 {
1554 if (TREE_PURPOSE (dims)
1555 && !integer_zerop (TREE_PURPOSE (dims)))
1556 break;
1557 /* Not on this axis. */
1558 parallel ^= GOMP_DIM_MASK (ix);
1559 dims = TREE_CHAIN (dims);
1560 }
1561 }
1562 }
1563 }
1564
1565 unsigned nargs = cfun->machine->num_args;
1566 if (cfun->machine->is_varadic)
1567 {
1568 varargs = gen_reg_rtx (Pmode);
1569 emit_move_insn (varargs, stack_pointer_rtx);
1570 }
1571
1572 rtvec vec = rtvec_alloc (nargs + 1);
1573 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1574 int vec_pos = 0;
1575
1576 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1577 rtx tmp_retval = retval;
1578 if (retval)
1579 {
1580 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1581 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1582 call = gen_rtx_SET (tmp_retval, call);
1583 }
1584 XVECEXP (pat, 0, vec_pos++) = call;
1585
1586 /* Construct the call insn, including a USE for each argument pseudo
1587 register. These will be used when printing the insn. */
1588 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1589 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1590
1591 if (varargs)
1592 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1593
1594 gcc_assert (vec_pos == XVECLEN (pat, 0));
1595
1596 nvptx_emit_forking (parallel, true);
1597 emit_call_insn (pat);
1598 nvptx_emit_joining (parallel, true);
1599
1600 if (tmp_retval != retval)
1601 emit_move_insn (retval, tmp_retval);
1602 }
1603
1604 /* Emit a comparison COMPARE, and return the new test to be used in the
1605 jump. */
1606
1607 rtx
1608 nvptx_expand_compare (rtx compare)
1609 {
1610 rtx pred = gen_reg_rtx (BImode);
1611 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1612 XEXP (compare, 0), XEXP (compare, 1));
1613 emit_insn (gen_rtx_SET (pred, cmp));
1614 return gen_rtx_NE (BImode, pred, const0_rtx);
1615 }
1616
1617 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1618
1619 void
1620 nvptx_expand_oacc_fork (unsigned mode)
1621 {
1622 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1623 }
1624
1625 void
1626 nvptx_expand_oacc_join (unsigned mode)
1627 {
1628 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1629 }
1630
1631 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1632 objects. */
1633
1634 static rtx
1635 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1636 {
1637 rtx res;
1638
1639 switch (GET_MODE (src))
1640 {
1641 case E_DImode:
1642 res = gen_unpackdisi2 (dst0, dst1, src);
1643 break;
1644 case E_DFmode:
1645 res = gen_unpackdfsi2 (dst0, dst1, src);
1646 break;
1647 default: gcc_unreachable ();
1648 }
1649 return res;
1650 }
1651
1652 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1653 object. */
1654
1655 static rtx
1656 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1657 {
1658 rtx res;
1659
1660 switch (GET_MODE (dst))
1661 {
1662 case E_DImode:
1663 res = gen_packsidi2 (dst, src0, src1);
1664 break;
1665 case E_DFmode:
1666 res = gen_packsidf2 (dst, src0, src1);
1667 break;
1668 default: gcc_unreachable ();
1669 }
1670 return res;
1671 }
1672
1673 /* Generate an instruction or sequence to shuffle SRC into DST across
1674 the lanes of a single warp, according to IDX and KIND. */
1675
1676 rtx
1677 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1678 {
1679 rtx res;
1680
1681 switch (GET_MODE (dst))
1682 {
1683 case E_SImode:
1684 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1685 break;
1686 case E_SFmode:
1687 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1688 break;
1689 case E_DImode:
1690 case E_DFmode:
1691 {
1692 rtx tmp0 = gen_reg_rtx (SImode);
1693 rtx tmp1 = gen_reg_rtx (SImode);
1694
1695 start_sequence ();
1696 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1697 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1698 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1699 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1700 res = get_insns ();
1701 end_sequence ();
1702 }
1703 break;
1704 case E_BImode:
1705 {
1706 rtx tmp = gen_reg_rtx (SImode);
1707
1708 start_sequence ();
1709 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1710 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1711 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1712 res = get_insns ();
1713 end_sequence ();
1714 }
1715 break;
1716 case E_QImode:
1717 case E_HImode:
1718 {
1719 rtx tmp = gen_reg_rtx (SImode);
1720
1721 start_sequence ();
1722 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1723 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1724 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1725 tmp)));
1726 res = get_insns ();
1727 end_sequence ();
1728 }
1729 break;
1730
1731 default:
1732 gcc_unreachable ();
1733 }
1734 return res;
1735 }
1736
1737 /* Generate an instruction or sequence to broadcast register REG
1738 across the vectors of a single warp. */
1739
1740 static rtx
1741 nvptx_gen_vcast (rtx reg)
1742 {
1743 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1744 }
1745
1746 /* Structure used when generating a worker-level spill or fill. */
1747
1748 struct wcast_data_t
1749 {
1750 rtx base; /* Register holding base addr of buffer. */
1751 rtx ptr; /* Iteration var, if needed. */
1752 unsigned offset; /* Offset into worker buffer. */
1753 };
1754
1755 /* Direction of the spill/fill and looping setup/teardown indicator. */
1756
1757 enum propagate_mask
1758 {
1759 PM_read = 1 << 0,
1760 PM_write = 1 << 1,
1761 PM_loop_begin = 1 << 2,
1762 PM_loop_end = 1 << 3,
1763
1764 PM_read_write = PM_read | PM_write
1765 };
1766
1767 /* Generate instruction(s) to spill or fill register REG to/from the
1768 worker broadcast array. PM indicates what is to be done, REP
1769 how many loop iterations will be executed (0 for not a loop). */
1770
1771 static rtx
1772 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1773 {
1774 rtx res;
1775 machine_mode mode = GET_MODE (reg);
1776
1777 switch (mode)
1778 {
1779 case E_BImode:
1780 {
1781 rtx tmp = gen_reg_rtx (SImode);
1782
1783 start_sequence ();
1784 if (pm & PM_read)
1785 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1786 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1787 if (pm & PM_write)
1788 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1789 res = get_insns ();
1790 end_sequence ();
1791 }
1792 break;
1793
1794 default:
1795 {
1796 rtx addr = data->ptr;
1797
1798 if (!addr)
1799 {
1800 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1801
1802 if (align > worker_bcast_align)
1803 worker_bcast_align = align;
1804 data->offset = (data->offset + align - 1) & ~(align - 1);
1805 addr = data->base;
1806 if (data->offset)
1807 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1808 }
1809
1810 addr = gen_rtx_MEM (mode, addr);
1811 if (pm == PM_read)
1812 res = gen_rtx_SET (addr, reg);
1813 else if (pm == PM_write)
1814 res = gen_rtx_SET (reg, addr);
1815 else
1816 gcc_unreachable ();
1817
1818 if (data->ptr)
1819 {
1820 /* We're using a ptr, increment it. */
1821 start_sequence ();
1822
1823 emit_insn (res);
1824 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1825 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1826 res = get_insns ();
1827 end_sequence ();
1828 }
1829 else
1830 rep = 1;
1831 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1832 }
1833 break;
1834 }
1835 return res;
1836 }
1837 \f
1838 /* Returns true if X is a valid address for use in a memory reference. */
1839
1840 static bool
1841 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1842 {
1843 enum rtx_code code = GET_CODE (x);
1844
1845 switch (code)
1846 {
1847 case REG:
1848 return true;
1849
1850 case PLUS:
1851 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1852 return true;
1853 return false;
1854
1855 case CONST:
1856 case SYMBOL_REF:
1857 case LABEL_REF:
1858 return true;
1859
1860 default:
1861 return false;
1862 }
1863 }
1864 \f
1865 /* Machinery to output constant initializers. When beginning an
1866 initializer, we decide on a fragment size (which is visible in ptx
1867 in the type used), and then all initializer data is buffered until
1868 a fragment is filled and ready to be written out. */
1869
1870 static struct
1871 {
1872 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1873 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1874 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1875 out. */
1876 unsigned size; /* Fragment size to accumulate. */
1877 unsigned offset; /* Offset within current fragment. */
1878 bool started; /* Whether we've output any initializer. */
1879 } init_frag;
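/* Worked example (illustrative): with an 8-byte fragment size,
   assembling the byte values 0x11 and then 0x22 through
   nvptx_assemble_value below leaves init_frag.val == 0x2211 and
   init_frag.offset == 2; nothing is printed until the offset reaches
   8, at which point output_init_frag flushes the accumulated
   fragment. */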
1880
1881 /* The current fragment is full, write it out. SYM may provide a
1882 symbolic reference we should output, in which case the fragment
1883 value is the addend. */
1884
1885 static void
1886 output_init_frag (rtx sym)
1887 {
1888 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1889 unsigned HOST_WIDE_INT val = init_frag.val;
1890
1891 init_frag.started = true;
1892 init_frag.val = 0;
1893 init_frag.offset = 0;
1894 init_frag.remaining--;
1895
1896 if (sym)
1897 {
1898 fprintf (asm_out_file, "generic(");
1899 output_address (VOIDmode, sym);
1900 fprintf (asm_out_file, val ? ") + " : ")");
1901 }
1902
1903 if (!sym || val)
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1905 }
1906
1907 /* Add value VAL of size SIZE to the data we're emitting, and keep
1908 writing out chunks as they fill up. */
1909
1910 static void
1911 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1912 {
1913 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1914
1915 for (unsigned part = 0; size; size -= part)
1916 {
1917 val >>= part * BITS_PER_UNIT;
1918 part = init_frag.size - init_frag.offset;
1919 if (part > size)
1920 part = size;
1921
1922 unsigned HOST_WIDE_INT partial
1923 = val << (init_frag.offset * BITS_PER_UNIT);
1924 init_frag.val |= partial & init_frag.mask;
1925 init_frag.offset += part;
1926
1927 if (init_frag.offset == init_frag.size)
1928 output_init_frag (NULL);
1929 }
1930 }
1931
1932 /* Target hook for assembling integer object X of size SIZE. */
1933
1934 static bool
1935 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1936 {
1937 HOST_WIDE_INT val = 0;
1938
1939 switch (GET_CODE (x))
1940 {
1941 default:
1942 /* Let the generic machinery figure it out, usually for a
1943 CONST_WIDE_INT. */
1944 return false;
1945
1946 case CONST_INT:
1947 nvptx_assemble_value (INTVAL (x), size);
1948 break;
1949
1950 case CONST:
1951 x = XEXP (x, 0);
1952 gcc_assert (GET_CODE (x) == PLUS);
1953 val = INTVAL (XEXP (x, 1));
1954 x = XEXP (x, 0);
1955 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1956 /* FALLTHROUGH */
1957
1958 case SYMBOL_REF:
1959 gcc_assert (size == init_frag.size);
1960 if (init_frag.offset)
1961 sorry ("cannot emit unaligned pointers in ptx assembly");
1962
1963 nvptx_maybe_record_fnsym (x);
1964 init_frag.val = val;
1965 output_init_frag (x);
1966 break;
1967 }
1968
1969 return true;
1970 }
1971
1972 /* Output SIZE zero bytes. We ignore the FILE argument since the
1973 functions we're calling to perform the output just use
1974 asm_out_file. */
1975
1976 void
1977 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1978 {
1979 /* Finish the current fragment, if it's started. */
1980 if (init_frag.offset)
1981 {
1982 unsigned part = init_frag.size - init_frag.offset;
1983 if (part > size)
1984 part = (unsigned) size;
1985 size -= part;
1986 nvptx_assemble_value (0, part);
1987 }
1988
1989 /* If this skip doesn't terminate the initializer, write as many
1990 remaining pieces as possible directly. */
1991 if (size < init_frag.remaining * init_frag.size)
1992 {
1993 while (size >= init_frag.size)
1994 {
1995 size -= init_frag.size;
1996 output_init_frag (NULL_RTX);
1997 }
1998 if (size)
1999 nvptx_assemble_value (0, size);
2000 }
2001 }
2002
2003 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2004 ignore the FILE arg. */
2005
2006 void
2007 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2008 {
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2010 nvptx_assemble_value (str[i], 1);
2011 }
2012
2013 /* Emit a PTX variable decl and prepare for emission of its
2014 initializer. NAME is the symbol name and SECTION the PTX data
2015 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2016 The caller has already emitted any indentation and linkage
2017 specifier. It is responsible for any initializer, terminating ;
2018 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2019 this is the opposite way round from how PTX wants them! */
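/* For instance (illustrative only): a 16-byte global array of 32-bit
   integers aligned to 4 bytes is declared roughly as
       .global .align 4 .u32 name[4]
   with any initializer and the terminating ';' appended by the
   caller. */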
2020
2021 static void
2022 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2023 const_tree type, HOST_WIDE_INT size, unsigned align)
2024 {
2025 while (TREE_CODE (type) == ARRAY_TYPE)
2026 type = TREE_TYPE (type);
2027
2028 if (TREE_CODE (type) == VECTOR_TYPE
2029 || TREE_CODE (type) == COMPLEX_TYPE)
2030 /* Neither vector nor complex types can contain the other. */
2031 type = TREE_TYPE (type);
2032
2033 unsigned elt_size = int_size_in_bytes (type);
2034
2035 /* Largest mode we're prepared to accept. For BLKmode types we
2036 don't know if it'll contain pointer constants, so we have to choose
2037 pointer size, otherwise we can choose DImode. */
2038 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2039
2040 elt_size |= GET_MODE_SIZE (elt_mode);
2041 elt_size &= -elt_size; /* Extract LSB set. */
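  /* For example, an array of 4-byte ints capped at DImode gives
     4 | 8 == 12 and 12 & -12 == 4, so a .u32 fragment is chosen
     (illustrative arithmetic only). */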
2042
2043 init_frag.size = elt_size;
2044 /* Avoid undefined shift behavior by using '2'. */
2045 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2046 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2047 init_frag.val = 0;
2048 init_frag.offset = 0;
2049 init_frag.started = false;
2050 /* Size might not be a multiple of elt size, if there's an
2051 initialized trailing struct array with smaller type than
2052 elt_size. */
2053 init_frag.remaining = (size + elt_size - 1) / elt_size;
2054
2055 fprintf (file, "%s .align %d .u%d ",
2056 section, align / BITS_PER_UNIT,
2057 elt_size * BITS_PER_UNIT);
2058 assemble_name (file, name);
2059
2060 if (size)
2061 /* We make everything an array, to simplify any initialization
2062 emission. */
2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2064 }
2065
2066 /* Called when the initializer for a decl has been completely output through
2067 combinations of the three functions above. */
2068
2069 static void
2070 nvptx_assemble_decl_end (void)
2071 {
2072 if (init_frag.offset)
2073 /* This can happen with a packed struct with trailing array member. */
2074 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2075 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2076 }
2077
2078 /* Output an uninitialized common or file-scope variable. */
2079
2080 void
2081 nvptx_output_aligned_decl (FILE *file, const char *name,
2082 const_tree decl, HOST_WIDE_INT size, unsigned align)
2083 {
2084 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2085
2086 /* If this is public, it is common. The nearest thing we have to
2087 common is weak. */
2088 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2089
2090 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2091 TREE_TYPE (decl), size, align);
2092 nvptx_assemble_decl_end ();
2093 }
2094
2095 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2096 writing a constant variable EXP with NAME and SIZE and its
2097 initializer to FILE. */
2098
2099 static void
2100 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2101 const_tree exp, HOST_WIDE_INT obj_size)
2102 {
2103 write_var_marker (file, true, false, name);
2104
2105 fprintf (file, "\t");
2106
2107 tree type = TREE_TYPE (exp);
2108 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2109 TYPE_ALIGN (type));
2110 }
2111
2112 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2113 a variable DECL with NAME to FILE. */
2114
2115 void
2116 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2117 {
2118 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2119
2120 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2121 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2122
2123 tree type = TREE_TYPE (decl);
2124 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2125 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2126 type, obj_size, DECL_ALIGN (decl));
2127 }
2128
2129 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2130
2131 static void
2132 nvptx_globalize_label (FILE *, const char *)
2133 {
2134 }
2135
2136 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2137 declaration only for variable DECL with NAME to FILE. */
2138
2139 static void
2140 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2141 {
2142 /* The middle end can place constant pool decls into the varpool as
2143 undefined. Until that is fixed, catch the problem here. */
2144 if (DECL_IN_CONSTANT_POOL (decl))
2145 return;
2146
2147 /* We support weak definitions, and hence have the right
2148 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2149 if (DECL_WEAK (decl))
2150 error_at (DECL_SOURCE_LOCATION (decl),
2151 "PTX does not support weak declarations"
2152 " (only weak definitions)");
2153 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2154
2155 fprintf (file, "\t.extern ");
2156 tree size = DECL_SIZE_UNIT (decl);
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2159 DECL_ALIGN (decl));
2160 nvptx_assemble_decl_end ();
2161 }
2162
2163 /* Output a pattern for a move instruction. */
2164
2165 const char *
2166 nvptx_output_mov_insn (rtx dst, rtx src)
2167 {
2168 machine_mode dst_mode = GET_MODE (dst);
2169 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2170 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2171 machine_mode src_inner = (GET_CODE (src) == SUBREG
2172 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2173
2174 rtx sym = src;
2175 if (GET_CODE (sym) == CONST)
2176 sym = XEXP (XEXP (sym, 0), 0);
2177 if (SYMBOL_REF_P (sym))
2178 {
2179 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2180 return "%.\tcvta%D1%t0\t%0, %1;";
2181 nvptx_maybe_record_fnsym (sym);
2182 }
2183
2184 if (src_inner == dst_inner)
2185 return "%.\tmov%t0\t%0, %1;";
2186
2187 if (CONSTANT_P (src))
2188 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2189 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2190 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2191
2192 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2193 {
2194 if (GET_MODE_BITSIZE (dst_mode) == 128
2195 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2196 {
2197 /* mov.b128 is not supported. */
2198 if (dst_inner == V2DImode && src_inner == TImode)
2199 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2200 else if (dst_inner == TImode && src_inner == V2DImode)
2201 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2202
2203 gcc_unreachable ();
2204 }
2205 return "%.\tmov.b%T0\t%0, %1;";
2206 }
2207
2208 return "%.\tcvt%t0%t1\t%0, %1;";
2209 }
2210
2211 static void nvptx_print_operand (FILE *, rtx, int);
2212
2213 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2214 involves writing .param declarations and in/out copies into them. For
2215 indirect calls, also write the .callprototype. */
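/* A hand-written sketch (illustrative; register and symbol names are
   made up) of the kind of sequence produced for a direct call taking
   one SImode argument and returning an SImode value:

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
	ld.param.u32 %r24, [%value_in];
	}
*/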
2216
2217 const char *
2218 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2219 {
2220 char buf[16];
2221 static int labelno;
2222 bool needs_tgt = register_operand (callee, Pmode);
2223 rtx pat = PATTERN (insn);
2224 if (GET_CODE (pat) == COND_EXEC)
2225 pat = COND_EXEC_CODE (pat);
2226 int arg_end = XVECLEN (pat, 0);
2227 tree decl = NULL_TREE;
2228
2229 fprintf (asm_out_file, "\t{\n");
2230 if (result != NULL)
2231 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2232 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2233 reg_names[NVPTX_RETURN_REGNUM]);
2234
2235 /* Ensure we have a ptx declaration in the output if necessary. */
2236 if (GET_CODE (callee) == SYMBOL_REF)
2237 {
2238 decl = SYMBOL_REF_DECL (callee);
2239 if (!decl
2240 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2241 nvptx_record_libfunc (callee, result, pat);
2242 else if (DECL_EXTERNAL (decl))
2243 nvptx_record_fndecl (decl);
2244 }
2245
2246 if (needs_tgt)
2247 {
2248 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2249 labelno++;
2250 ASM_OUTPUT_LABEL (asm_out_file, buf);
2251 std::stringstream s;
2252 write_fn_proto_from_insn (s, NULL, result, pat);
2253 fputs (s.str().c_str(), asm_out_file);
2254 }
2255
2256 for (int argno = 1; argno < arg_end; argno++)
2257 {
2258 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2259 machine_mode mode = GET_MODE (t);
2260 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2261
2262 /* Mode splitting has already been done. */
2263 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2264 "\t\tst.param%s [%%out_arg%d], ",
2265 ptx_type, argno, ptx_type, argno);
2266 output_reg (asm_out_file, REGNO (t), VOIDmode);
2267 fprintf (asm_out_file, ";\n");
2268 }
2269
2270 /* The '.' stands for the call's predicate, if any. */
2271 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2272 fprintf (asm_out_file, "\t\tcall ");
2273 if (result != NULL_RTX)
2274 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2275
2276 if (decl)
2277 {
2278 const char *name = get_fnname_from_decl (decl);
2279 name = nvptx_name_replacement (name);
2280 assemble_name (asm_out_file, name);
2281 }
2282 else
2283 output_address (VOIDmode, callee);
2284
2285 const char *open = "(";
2286 for (int argno = 1; argno < arg_end; argno++)
2287 {
2288 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2289 open = "";
2290 }
2291 if (decl && DECL_STATIC_CHAIN (decl))
2292 {
2293 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2294 open = "";
2295 }
2296 if (!open[0])
2297 fprintf (asm_out_file, ")");
2298
2299 if (needs_tgt)
2300 {
2301 fprintf (asm_out_file, ", ");
2302 assemble_name (asm_out_file, buf);
2303 }
2304 fprintf (asm_out_file, ";\n");
2305
2306 if (find_reg_note (insn, REG_NORETURN, NULL))
2307 {
2308 /* No-return functions confuse the PTX JIT, as it doesn't realize
2309 the flow control barrier they imply. It can seg fault if it
2310 encounters what looks like an unexitable loop. Emit a trailing
2311 trap and exit, which it does grok. */
2312 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2313 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2314 }
2315
2316 if (result)
2317 {
2318 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2319
2320 if (!rval[0])
2321 /* We must escape the '%' that starts RETURN_REGNUM. */
2322 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2323 reg_names[NVPTX_RETURN_REGNUM]);
2324 return rval;
2325 }
2326
2327 return "}";
2328 }
2329
2330 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2331
2332 static bool
2333 nvptx_print_operand_punct_valid_p (unsigned char c)
2334 {
2335 return c == '.' || c == '#';
2336 }
2337
2338 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2339
2340 static void
2341 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2342 {
2343 rtx off;
2344 if (GET_CODE (x) == CONST)
2345 x = XEXP (x, 0);
2346 switch (GET_CODE (x))
2347 {
2348 case PLUS:
2349 off = XEXP (x, 1);
2350 output_address (VOIDmode, XEXP (x, 0));
2351 fprintf (file, "+");
2352 output_address (VOIDmode, off);
2353 break;
2354
2355 case SYMBOL_REF:
2356 case LABEL_REF:
2357 output_addr_const (file, x);
2358 break;
2359
2360 default:
2361 gcc_assert (GET_CODE (x) != MEM);
2362 nvptx_print_operand (file, x, 0);
2363 break;
2364 }
2365 }
2366
2367 /* Write assembly language output for the address ADDR to FILE. */
2368
2369 static void
2370 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2371 {
2372 nvptx_print_address_operand (file, addr, mode);
2373 }
2374
2375 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2376
2377 Meaning of CODE:
2378 . -- print the predicate for the instruction or an empty string for an
2379 unconditional one.
2380 # -- print a rounding mode for the instruction
2381
2382 A -- print a data area for a MEM
2383 c -- print an opcode suffix for a comparison operator, including a type code
2384 D -- print a data area for a MEM operand
2385 S -- print a shuffle kind specified by CONST_INT
2386 t -- print a type opcode suffix, promoting QImode to 32 bits
2387 T -- print a type size in bits
2388 u -- print a type opcode suffix without promotions. */
2389
2390 static void
2391 nvptx_print_operand (FILE *file, rtx x, int code)
2392 {
2393 if (code == '.')
2394 {
2395 x = current_insn_predicate;
2396 if (x)
2397 {
2398 fputs ("@", file);
2399 if (GET_CODE (x) == EQ)
2400 fputs ("!", file);
2401 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2402 }
2403 return;
2404 }
2405 else if (code == '#')
2406 {
2407 fputs (".rn", file);
2408 return;
2409 }
2410
2411 enum rtx_code x_code = GET_CODE (x);
2412 machine_mode mode = GET_MODE (x);
2413
2414 switch (code)
2415 {
2416 case 'A':
2417 x = XEXP (x, 0);
2418 /* FALLTHROUGH. */
2419
2420 case 'D':
2421 if (GET_CODE (x) == CONST)
2422 x = XEXP (x, 0);
2423 if (GET_CODE (x) == PLUS)
2424 x = XEXP (x, 0);
2425
2426 if (GET_CODE (x) == SYMBOL_REF)
2427 fputs (section_for_sym (x), file);
2428 break;
2429
2430 case 't':
2431 case 'u':
2432 if (x_code == SUBREG)
2433 {
2434 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2435 if (VECTOR_MODE_P (inner_mode)
2436 && (GET_MODE_SIZE (mode)
2437 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2438 mode = GET_MODE_INNER (inner_mode);
2439 else if (split_mode_p (inner_mode))
2440 mode = maybe_split_mode (inner_mode);
2441 else
2442 mode = inner_mode;
2443 }
2444 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2445 break;
2446
2447 case 'H':
2448 case 'L':
2449 {
2450 rtx inner_x = SUBREG_REG (x);
2451 machine_mode inner_mode = GET_MODE (inner_x);
2452 machine_mode split = maybe_split_mode (inner_mode);
2453
2454 output_reg (file, REGNO (inner_x), split,
2455 (code == 'H'
2456 ? GET_MODE_SIZE (inner_mode) / 2
2457 : 0));
2458 }
2459 break;
2460
2461 case 'S':
2462 {
2463 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2464 /* Same order as nvptx_shuffle_kind. */
2465 static const char *const kinds[] =
2466 {".up", ".down", ".bfly", ".idx"};
2467 fputs (kinds[kind], file);
2468 }
2469 break;
2470
2471 case 'T':
2472 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2473 break;
2474
2475 case 'j':
2476 fprintf (file, "@");
2477 goto common;
2478
2479 case 'J':
2480 fprintf (file, "@!");
2481 goto common;
2482
2483 case 'c':
2484 mode = GET_MODE (XEXP (x, 0));
2485 switch (x_code)
2486 {
2487 case EQ:
2488 fputs (".eq", file);
2489 break;
2490 case NE:
2491 if (FLOAT_MODE_P (mode))
2492 fputs (".neu", file);
2493 else
2494 fputs (".ne", file);
2495 break;
2496 case LE:
2497 case LEU:
2498 fputs (".le", file);
2499 break;
2500 case GE:
2501 case GEU:
2502 fputs (".ge", file);
2503 break;
2504 case LT:
2505 case LTU:
2506 fputs (".lt", file);
2507 break;
2508 case GT:
2509 case GTU:
2510 fputs (".gt", file);
2511 break;
2512 case LTGT:
2513 fputs (".ne", file);
2514 break;
2515 case UNEQ:
2516 fputs (".equ", file);
2517 break;
2518 case UNLE:
2519 fputs (".leu", file);
2520 break;
2521 case UNGE:
2522 fputs (".geu", file);
2523 break;
2524 case UNLT:
2525 fputs (".ltu", file);
2526 break;
2527 case UNGT:
2528 fputs (".gtu", file);
2529 break;
2530 case UNORDERED:
2531 fputs (".nan", file);
2532 break;
2533 case ORDERED:
2534 fputs (".num", file);
2535 break;
2536 default:
2537 gcc_unreachable ();
2538 }
2539 if (FLOAT_MODE_P (mode)
2540 || x_code == EQ || x_code == NE
2541 || x_code == GEU || x_code == GTU
2542 || x_code == LEU || x_code == LTU)
2543 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2544 else
2545 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2546 break;
2547 default:
2548 common:
2549 switch (x_code)
2550 {
2551 case SUBREG:
2552 {
2553 rtx inner_x = SUBREG_REG (x);
2554 machine_mode inner_mode = GET_MODE (inner_x);
2555 machine_mode split = maybe_split_mode (inner_mode);
2556
2557 if (VECTOR_MODE_P (inner_mode)
2558 && (GET_MODE_SIZE (mode)
2559 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2560 {
2561 output_reg (file, REGNO (inner_x), VOIDmode);
2562 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2563 }
2564 else if (split_mode_p (inner_mode)
2565 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2566 output_reg (file, REGNO (inner_x), split);
2567 else
2568 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2569 }
2570 break;
2571
2572 case REG:
2573 output_reg (file, REGNO (x), maybe_split_mode (mode));
2574 break;
2575
2576 case MEM:
2577 fputc ('[', file);
2578 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2579 fputc (']', file);
2580 break;
2581
2582 case CONST_INT:
2583 output_addr_const (file, x);
2584 break;
2585
2586 case CONST:
2587 case SYMBOL_REF:
2588 case LABEL_REF:
2589 /* We could use output_addr_const, but that can print things like
2590 "x-8", which breaks ptxas. Need to ensure it is output as
2591 "x+-8". */
2592 nvptx_print_address_operand (file, x, VOIDmode);
2593 break;
2594
2595 case CONST_DOUBLE:
2596 long vals[2];
2597 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2598 vals[0] &= 0xffffffff;
2599 vals[1] &= 0xffffffff;
2600 if (mode == SFmode)
2601 fprintf (file, "0f%08lx", vals[0]);
2602 else
2603 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2604 break;
2605
2606 case CONST_VECTOR:
2607 {
2608 unsigned n = CONST_VECTOR_NUNITS (x);
2609 fprintf (file, "{ ");
2610 for (unsigned i = 0; i < n; ++i)
2611 {
2612 if (i != 0)
2613 fprintf (file, ", ");
2614
2615 rtx elem = CONST_VECTOR_ELT (x, i);
2616 output_addr_const (file, elem);
2617 }
2618 fprintf (file, " }");
2619 }
2620 break;
2621
2622 default:
2623 output_addr_const (file, x);
2624 }
2625 }
2626 }
2627 \f
2628 /* Record replacement regs used to deal with subreg operands. */
2629 struct reg_replace
2630 {
2631 rtx replacement[MAX_RECOG_OPERANDS];
2632 machine_mode mode;
2633 int n_allocated;
2634 int n_in_use;
2635 };
2636
2637 /* Allocate or reuse a replacement in R and return the rtx. */
2638
2639 static rtx
2640 get_replacement (struct reg_replace *r)
2641 {
2642 if (r->n_allocated == r->n_in_use)
2643 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2644 return r->replacement[r->n_in_use++];
2645 }
2646
2647 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2648 the presence of subregs would break the rules for most instructions.
2649 Replace them with a suitable new register of the right size, plus
2650 conversion copyin/copyout instructions. */
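/* As an illustrative example: an input operand (subreg:SI (reg:QI R) 0)
   is replaced by a fresh SImode register zero-extended from R just
   before the instruction, while an output operand of that shape gets a
   truncating copy back into R emitted just after it. */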
2651
2652 static void
2653 nvptx_reorg_subreg (void)
2654 {
2655 struct reg_replace qiregs, hiregs, siregs, diregs;
2656 rtx_insn *insn, *next;
2657
2658 qiregs.n_allocated = 0;
2659 hiregs.n_allocated = 0;
2660 siregs.n_allocated = 0;
2661 diregs.n_allocated = 0;
2662 qiregs.mode = QImode;
2663 hiregs.mode = HImode;
2664 siregs.mode = SImode;
2665 diregs.mode = DImode;
2666
2667 for (insn = get_insns (); insn; insn = next)
2668 {
2669 next = NEXT_INSN (insn);
2670 if (!NONDEBUG_INSN_P (insn)
2671 || asm_noperands (PATTERN (insn)) >= 0
2672 || GET_CODE (PATTERN (insn)) == USE
2673 || GET_CODE (PATTERN (insn)) == CLOBBER)
2674 continue;
2675
2676 qiregs.n_in_use = 0;
2677 hiregs.n_in_use = 0;
2678 siregs.n_in_use = 0;
2679 diregs.n_in_use = 0;
2680 extract_insn (insn);
2681 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2682
2683 for (int i = 0; i < recog_data.n_operands; i++)
2684 {
2685 rtx op = recog_data.operand[i];
2686 if (GET_CODE (op) != SUBREG)
2687 continue;
2688
2689 rtx inner = SUBREG_REG (op);
2690
2691 machine_mode outer_mode = GET_MODE (op);
2692 machine_mode inner_mode = GET_MODE (inner);
2693 gcc_assert (s_ok);
2694 if (s_ok
2695 && (GET_MODE_PRECISION (inner_mode)
2696 >= GET_MODE_PRECISION (outer_mode)))
2697 continue;
2698 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2699 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2700 : outer_mode == HImode ? &hiregs
2701 : outer_mode == SImode ? &siregs
2702 : &diregs);
2703 rtx new_reg = get_replacement (r);
2704
2705 if (recog_data.operand_type[i] != OP_OUT)
2706 {
2707 enum rtx_code code;
2708 if (GET_MODE_PRECISION (inner_mode)
2709 < GET_MODE_PRECISION (outer_mode))
2710 code = ZERO_EXTEND;
2711 else
2712 code = TRUNCATE;
2713
2714 rtx pat = gen_rtx_SET (new_reg,
2715 gen_rtx_fmt_e (code, outer_mode, inner));
2716 emit_insn_before (pat, insn);
2717 }
2718
2719 if (recog_data.operand_type[i] != OP_IN)
2720 {
2721 enum rtx_code code;
2722 if (GET_MODE_PRECISION (inner_mode)
2723 < GET_MODE_PRECISION (outer_mode))
2724 code = TRUNCATE;
2725 else
2726 code = ZERO_EXTEND;
2727
2728 rtx pat = gen_rtx_SET (inner,
2729 gen_rtx_fmt_e (code, inner_mode, new_reg));
2730 emit_insn_after (pat, insn);
2731 }
2732 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2733 }
2734 }
2735 }
2736
2737 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2738 first use. */
2739
2740 static rtx
2741 nvptx_get_unisimt_master ()
2742 {
2743 rtx &master = cfun->machine->unisimt_master;
2744 return master ? master : master = gen_reg_rtx (SImode);
2745 }
2746
2747 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2748
2749 static rtx
2750 nvptx_get_unisimt_predicate ()
2751 {
2752 rtx &pred = cfun->machine->unisimt_predicate;
2753 return pred ? pred : pred = gen_reg_rtx (BImode);
2754 }
2755
2756 /* Return true if given call insn references one of the functions provided by
2757 the CUDA runtime: malloc, free, vprintf. */
2758
2759 static bool
2760 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2761 {
2762 rtx pat = PATTERN (insn);
2763 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2764 pat = XVECEXP (pat, 0, 0);
2765 if (GET_CODE (pat) == SET)
2766 pat = SET_SRC (pat);
2767 gcc_checking_assert (GET_CODE (pat) == CALL
2768 && GET_CODE (XEXP (pat, 0)) == MEM);
2769 rtx addr = XEXP (XEXP (pat, 0), 0);
2770 if (GET_CODE (addr) != SYMBOL_REF)
2771 return false;
2772 const char *name = XSTR (addr, 0);
2773 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2774 references with forced assembler name refer to PTX syscalls. For vprintf,
2775 accept both normal and forced-assembler-name references. */
2776 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2777 || !strcmp (name, "*malloc")
2778 || !strcmp (name, "*free"));
2779 }
2780
2781 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2782 propagate its value from lane MASTER to current lane. */
2783
2784 static void
2785 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2786 {
2787 rtx reg;
2788 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2789 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2790 }
2791
2792 /* Adjust code for uniform-simt code generation variant by making atomics and
2793 "syscalls" conditionally executed, and inserting shuffle-based propagation
2794 for registers being set. */
2795
2796 static void
2797 nvptx_reorg_uniform_simt ()
2798 {
2799 rtx_insn *insn, *next;
2800
2801 for (insn = get_insns (); insn; insn = next)
2802 {
2803 next = NEXT_INSN (insn);
2804 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2805 && !(NONJUMP_INSN_P (insn)
2806 && GET_CODE (PATTERN (insn)) == PARALLEL
2807 && get_attr_atomic (insn)))
2808 continue;
2809 rtx pat = PATTERN (insn);
2810 rtx master = nvptx_get_unisimt_master ();
2811 for (int i = 0; i < XVECLEN (pat, 0); i++)
2812 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2813 rtx pred = nvptx_get_unisimt_predicate ();
2814 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2815 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2816 validate_change (insn, &PATTERN (insn), pat, false);
2817 }
2818 }
2819
2820 /* Loop structure of the function. The entire function is described as
2821 a NULL loop. */
2822
2823 struct parallel
2824 {
2825 /* Parent parallel. */
2826 parallel *parent;
2827
2828 /* Next sibling parallel. */
2829 parallel *next;
2830
2831 /* First child parallel. */
2832 parallel *inner;
2833
2834 /* Partitioning mask of the parallel. */
2835 unsigned mask;
2836
2837 /* Partitioning used within inner parallels. */
2838 unsigned inner_mask;
2839
2840 /* Location of parallel forked and join. The forked is the first
2841 block in the parallel and the join is the first block after
2842 the partition. */
2843 basic_block forked_block;
2844 basic_block join_block;
2845
2846 rtx_insn *forked_insn;
2847 rtx_insn *join_insn;
2848
2849 rtx_insn *fork_insn;
2850 rtx_insn *joining_insn;
2851
2852 /* Basic blocks in this parallel, but not in child parallels. The
2853 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2854 blocks are not. */
2855 auto_vec<basic_block> blocks;
2856
2857 public:
2858 parallel (parallel *parent, unsigned mode);
2859 ~parallel ();
2860 };
2861
2862 /* Constructor links the new parallel into its parent's chain of
2863 children. */
2864
2865 parallel::parallel (parallel *parent_, unsigned mask_)
2866 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2867 {
2868 forked_block = join_block = 0;
2869 forked_insn = join_insn = 0;
2870 fork_insn = joining_insn = 0;
2871
2872 if (parent)
2873 {
2874 next = parent->inner;
2875 parent->inner = this;
2876 }
2877 }
2878
2879 parallel::~parallel ()
2880 {
2881 delete inner;
2882 delete next;
2883 }
2884
2885 /* Map of basic blocks to insns */
2886 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2887
2888 /* A tuple of an insn of interest and the BB in which it resides. */
2889 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2890 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2891
2892 /* Split basic blocks such that each forked and join unspec is at
2893 the start of its basic block. Thus afterwards each block will
2894 have a single partitioning mode. We also do the same for return
2895 insns, as they are executed by every thread. Return the
2896 partitioning mode of the function as a whole. Populate MAP with
2897 head and tail blocks. We also clear the BB visited flag, which is
2898 used when finding partitions. */
2899
2900 static void
2901 nvptx_split_blocks (bb_insn_map_t *map)
2902 {
2903 insn_bb_vec_t worklist;
2904 basic_block block;
2905 rtx_insn *insn;
2906
2907 /* Locate all the reorg instructions of interest. */
2908 FOR_ALL_BB_FN (block, cfun)
2909 {
2910 bool seen_insn = false;
2911
2912 /* Clear visited flag, for use by parallel locator */
2913 block->flags &= ~BB_VISITED;
2914
2915 FOR_BB_INSNS (block, insn)
2916 {
2917 if (!INSN_P (insn))
2918 continue;
2919 switch (recog_memoized (insn))
2920 {
2921 default:
2922 seen_insn = true;
2923 continue;
2924 case CODE_FOR_nvptx_forked:
2925 case CODE_FOR_nvptx_join:
2926 break;
2927
2928 case CODE_FOR_return:
2929 /* We also need to split just before return insns, as
2930 that insn needs executing by all threads, but the
2931 block it is in probably does not. */
2932 break;
2933 }
2934
2935 if (seen_insn)
2936 /* We've found an instruction that must be at the start of
2937 a block, but isn't. Add it to the worklist. */
2938 worklist.safe_push (insn_bb_t (insn, block));
2939 else
2940 /* It was already the first instruction. Just add it to
2941 the map. */
2942 map->get_or_insert (block) = insn;
2943 seen_insn = true;
2944 }
2945 }
2946
2947 /* Split blocks on the worklist. */
2948 unsigned ix;
2949 insn_bb_t *elt;
2950 basic_block remap = 0;
2951 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2952 {
2953 if (remap != elt->second)
2954 {
2955 block = elt->second;
2956 remap = block;
2957 }
2958
2959 /* Split block before insn. The insn is in the new block */
2960 edge e = split_block (block, PREV_INSN (elt->first));
2961
2962 block = e->dest;
2963 map->get_or_insert (block) = elt->first;
2964 }
2965 }
2966
2967 /* BLOCK is a basic block containing a head or tail instruction.
2968 Locate the associated prehead or pretail instruction, which must be
2969 in the single predecessor block. */
2970
2971 static rtx_insn *
2972 nvptx_discover_pre (basic_block block, int expected)
2973 {
2974 gcc_assert (block->preds->length () == 1);
2975 basic_block pre_block = (*block->preds)[0]->src;
2976 rtx_insn *pre_insn;
2977
2978 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2979 pre_insn = PREV_INSN (pre_insn))
2980 gcc_assert (pre_insn != BB_HEAD (pre_block));
2981
2982 gcc_assert (recog_memoized (pre_insn) == expected);
2983 return pre_insn;
2984 }
2985
2986 /* Dump this parallel and all its inner parallels. */
2987
2988 static void
2989 nvptx_dump_pars (parallel *par, unsigned depth)
2990 {
2991 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2992 depth, par->mask,
2993 par->forked_block ? par->forked_block->index : -1,
2994 par->join_block ? par->join_block->index : -1);
2995
2996 fprintf (dump_file, " blocks:");
2997
2998 basic_block block;
2999 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3000 fprintf (dump_file, " %d", block->index);
3001 fprintf (dump_file, "\n");
3002 if (par->inner)
3003 nvptx_dump_pars (par->inner, depth + 1);
3004
3005 if (par->next)
3006 nvptx_dump_pars (par->next, depth);
3007 }
3008
3009 /* If BLOCK contains a fork/join marker, process it to create or
3010 terminate a loop structure. Add this block to the current loop,
3011 and then walk successor blocks. */
3012
3013 static parallel *
3014 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3015 {
3016 if (block->flags & BB_VISITED)
3017 return par;
3018 block->flags |= BB_VISITED;
3019
3020 if (rtx_insn **endp = map->get (block))
3021 {
3022 rtx_insn *end = *endp;
3023
3024 /* This is a block head or tail, or return instruction. */
3025 switch (recog_memoized (end))
3026 {
3027 case CODE_FOR_return:
3028 /* Return instructions are in their own block, and we
3029 don't need to do anything more. */
3030 return par;
3031
3032 case CODE_FOR_nvptx_forked:
3033 /* Loop head, create a new inner loop and add it into
3034 our parent's child list. */
3035 {
3036 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3037
3038 gcc_assert (mask);
3039 par = new parallel (par, mask);
3040 par->forked_block = block;
3041 par->forked_insn = end;
3042 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3043 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3044 par->fork_insn
3045 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3046 }
3047 break;
3048
3049 case CODE_FOR_nvptx_join:
3050 /* A loop tail. Finish the current loop and return to
3051 parent. */
3052 {
3053 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3054
3055 gcc_assert (par->mask == mask);
3056 par->join_block = block;
3057 par->join_insn = end;
3058 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3059 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3060 par->joining_insn
3061 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3062 par = par->parent;
3063 }
3064 break;
3065
3066 default:
3067 gcc_unreachable ();
3068 }
3069 }
3070
3071 if (par)
3072 /* Add this block onto the current loop's list of blocks. */
3073 par->blocks.safe_push (block);
3074 else
3075 /* This must be the entry block. Create a NULL parallel. */
3076 par = new parallel (0, 0);
3077
3078 /* Walk successor blocks. */
3079 edge e;
3080 edge_iterator ei;
3081
3082 FOR_EACH_EDGE (e, ei, block->succs)
3083 nvptx_find_par (map, par, e->dest);
3084
3085 return par;
3086 }
3087
3088 /* DFS walk the CFG looking for fork & join markers. Construct
3089 loop structures as we go. MAP is a mapping of basic blocks
3090 to head & tail markers, discovered when splitting blocks. This
3091 speeds up the discovery. We rely on the BB visited flag having
3092 been cleared when splitting blocks. */
3093
3094 static parallel *
3095 nvptx_discover_pars (bb_insn_map_t *map)
3096 {
3097 basic_block block;
3098
3099 /* Mark exit blocks as visited. */
3100 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3101 block->flags |= BB_VISITED;
3102
3103 /* And entry block as not. */
3104 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3105 block->flags &= ~BB_VISITED;
3106
3107 parallel *par = nvptx_find_par (map, 0, block);
3108
3109 if (dump_file)
3110 {
3111 fprintf (dump_file, "\nLoops\n");
3112 nvptx_dump_pars (par, 0);
3113 fprintf (dump_file, "\n");
3114 }
3115
3116 return par;
3117 }
3118
3119 /* Analyse a group of BBs within a partitioned region and create N
3120 Single-Entry-Single-Exit regions. Some of those regions will be
3121 trivial ones consisting of a single BB. The blocks of a
3122 partitioned region might form a set of disjoint graphs -- because
3123 the region encloses a differently partitioned sub-region.
3124
3125 We use the linear time algorithm described in 'Finding Regions Fast:
3126 Single Entry Single Exit and Control Regions in Linear Time'
3127 Johnson, Pearson & Pingali. That algorithm deals with complete
3128 CFGs, where a back edge is inserted from END to START, and thus the
3129 problem becomes one of finding equivalent loops.
3130
3131 In this case we have a partial CFG. We complete it by redirecting
3132 any incoming edge to the graph to be from an arbitrary external BB,
3133 and similarly redirecting any outgoing edge to be to that BB.
3134 Thus we end up with a closed graph.
3135
3136 The algorithm works by building a spanning tree of an undirected
3137 graph and keeping track of back edges from nodes further from the
3138 root in the tree to nodes nearer to the root in the tree. In the
3139 description below, the root is up and the tree grows downwards.
3140
3141 We avoid having to deal with degenerate back-edges to the same
3142 block, by splitting each BB into 3 -- one for input edges, one for
3143 the node itself and one for the output edges. Such back edges are
3144 referred to as 'Brackets'. Cycle equivalent nodes will have the
3145 same set of brackets.
3146
3147 Determining bracket equivalency is done by maintaining a list of
3148 brackets in such a manner that the list length and final bracket
3149 uniquely identify the set.
3150
3151 We use coloring to mark all BBs with cycle equivalency with the
3152 same color. This is the output of the 'Finding Regions Fast'
3153 algorithm. Notice it doesn't actually find the set of nodes within
3154 a particular region, just unordered sets of nodes that are the
3155 entries and exits of SESE regions.
3156
3157 After determining cycle equivalency, we need to find the minimal
3158 set of SESE regions. Do this with a DFS coloring walk of the
3159 complete graph. We're either 'looking' or 'coloring'. When
3160 looking, and we're in the subgraph, we start coloring with the color of
3161 the current node, and remember that node as the start of the
3162 current color's SESE region. Every time we go to a new node, we
3163 decrement the count of nodes with that color. If it reaches zero,
3164 we remember that node as the end of the current color's SESE region
3165 and return to 'looking'. Otherwise we color the node the current
3166 color.
3167
3168 This way we end up with coloring the inside of non-trivial SESE
3169 regions with the color of that region. */
3170
3171 /* A pair of BBs. We use this to represent SESE regions. */
3172 typedef std::pair<basic_block, basic_block> bb_pair_t;
3173 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3174
3175 /* A node in the undirected CFG. The discriminator SECOND indicates just
3176 above or just below the BB indicated by FIRST. */
3177 typedef std::pair<basic_block, int> pseudo_node_t;
3178
3179 /* A bracket indicates an edge towards the root of the spanning tree of the
3180 undirected graph. Each bracket has a color, determined
3181 from the current set of brackets. */
3182 struct bracket
3183 {
3184 pseudo_node_t back; /* Back target */
3185
3186 /* Current color and size of set. */
3187 unsigned color;
3188 unsigned size;
3189
3190 bracket (pseudo_node_t back_)
3191 : back (back_), color (~0u), size (~0u)
3192 {
3193 }
3194
3195 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3196 {
3197 if (length != size)
3198 {
3199 size = length;
3200 color = color_counts.length ();
3201 color_counts.quick_push (0);
3202 }
3203 color_counts[color]++;
3204 return color;
3205 }
3206 };
3207
3208 typedef auto_vec<bracket> bracket_vec_t;
3209
3210 /* Basic block info for finding SESE regions. */
3211
3212 struct bb_sese
3213 {
3214 int node; /* Node number in spanning tree. */
3215 int parent; /* Parent node number. */
3216
3217 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3218 edges arrive at pseudo-node Ai and the outgoing edges leave at
3219 pseudo-node Ao. We have to remember which way we arrived at a
3220 particular node when generating the spanning tree. dir > 0 means
3221 we arrived at Ai, dir < 0 means we arrived at Ao. */
3222 int dir;
3223
3224 /* Lowest numbered pseudo-node reached via a backedge from this
3225 node, or any descendant. */
3226 pseudo_node_t high;
3227
3228 int color; /* Cycle-equivalence color */
3229
3230 /* Stack of brackets for this node. */
3231 bracket_vec_t brackets;
3232
3233 bb_sese (unsigned node_, unsigned p, int dir_)
3234 :node (node_), parent (p), dir (dir_)
3235 {
3236 }
3237 ~bb_sese ();
3238
3239 /* Push a bracket ending at BACK. */
3240 void push (const pseudo_node_t &back)
3241 {
3242 if (dump_file)
3243 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3244 back.first ? back.first->index : 0, back.second);
3245 brackets.safe_push (bracket (back));
3246 }
3247
3248 void append (bb_sese *child);
3249 void remove (const pseudo_node_t &);
3250
3251 /* Set node's color. */
3252 void set_color (auto_vec<unsigned> &color_counts)
3253 {
3254 color = brackets.last ().get_color (color_counts, brackets.length ());
3255 }
3256 };
3257
3258 bb_sese::~bb_sese ()
3259 {
3260 }
3261
3262 /* Destructively append CHILD's brackets. */
3263
3264 void
3265 bb_sese::append (bb_sese *child)
3266 {
3267 if (int len = child->brackets.length ())
3268 {
3269 int ix;
3270
3271 if (dump_file)
3272 {
3273 for (ix = 0; ix < len; ix++)
3274 {
3275 const pseudo_node_t &pseudo = child->brackets[ix].back;
3276 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3277 child->node, pseudo.first ? pseudo.first->index : 0,
3278 pseudo.second);
3279 }
3280 }
3281 if (!brackets.length ())
3282 std::swap (brackets, child->brackets);
3283 else
3284 {
3285 brackets.reserve (len);
3286 for (ix = 0; ix < len; ix++)
3287 brackets.quick_push (child->brackets[ix]);
3288 }
3289 }
3290 }
3291
3292 /* Remove brackets that terminate at PSEUDO. */
3293
3294 void
3295 bb_sese::remove (const pseudo_node_t &pseudo)
3296 {
3297 unsigned removed = 0;
3298 int len = brackets.length ();
3299
3300 for (int ix = 0; ix < len; ix++)
3301 {
3302 if (brackets[ix].back == pseudo)
3303 {
3304 if (dump_file)
3305 fprintf (dump_file, "Removing backedge %d:%+d\n",
3306 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3307 removed++;
3308 }
3309 else if (removed)
3310 brackets[ix-removed] = brackets[ix];
3311 }
3312 while (removed--)
3313 brackets.pop ();
3314 }
3315
3316 /* Accessors for BB's aux pointer. */
3317 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3318 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3319
3320 /* DFS walk creating SESE data structures. Only cover nodes with
3321 BB_VISITED set. Append discovered blocks to LIST. We number in
3322 increments of 3 so that the above and below pseudo nodes can be
3323 implicitly numbered too. */
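/* That is, a block numbered N implicitly owns pseudo-nodes N-1 and N+1
   for its incoming and outgoing sides; numbering starts at 2 and
   advances by 3 per block. */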
3324
3325 static int
3326 nvptx_sese_number (int n, int p, int dir, basic_block b,
3327 auto_vec<basic_block> *list)
3328 {
3329 if (BB_GET_SESE (b))
3330 return n;
3331
3332 if (dump_file)
3333 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3334 b->index, n, p, dir);
3335
3336 BB_SET_SESE (b, new bb_sese (n, p, dir));
3337 p = n;
3338
3339 n += 3;
3340 list->quick_push (b);
3341
3342 /* First walk the nodes on the 'other side' of this node, then walk
3343 the nodes on the same side. */
3344 for (unsigned ix = 2; ix; ix--)
3345 {
3346 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3347 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3348 : offsetof (edge_def, src));
3349 edge e;
3350 edge_iterator (ei);
3351
3352 FOR_EACH_EDGE (e, ei, edges)
3353 {
3354 basic_block target = *(basic_block *)((char *)e + offset);
3355
3356 if (target->flags & BB_VISITED)
3357 n = nvptx_sese_number (n, p, dir, target, list);
3358 }
3359 dir = -dir;
3360 }
3361 return n;
3362 }
3363
3364 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3365 EDGES are the outgoing edges and OFFSET is the offset to the src
3366 or dst block on the edges. */
3367
3368 static void
3369 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3370 vec<edge, va_gc> *edges, size_t offset)
3371 {
3372 edge e;
3373 edge_iterator (ei);
3374 int hi_back = depth;
3375 pseudo_node_t node_back (0, depth);
3376 int hi_child = depth;
3377 pseudo_node_t node_child (0, depth);
3378 basic_block child = NULL;
3379 unsigned num_children = 0;
3380 int usd = -dir * sese->dir;
3381
3382 if (dump_file)
3383 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3384 me->index, sese->node, dir);
3385
3386 if (dir < 0)
3387 {
3388 /* This is the above pseudo-child. It has the BB itself as an
3389 additional child node. */
3390 node_child = sese->high;
3391 hi_child = node_child.second;
3392 if (node_child.first)
3393 hi_child += BB_GET_SESE (node_child.first)->node;
3394 num_children++;
3395 }
3396
3397 /* Examine each edge.
3398 - if it is a child (a) append its bracket list and (b) record
3399 whether it is the child with the highest reaching bracket.
3400 - if it is an edge to ancestor, record whether it's the highest
3401 reaching backlink. */
3402 FOR_EACH_EDGE (e, ei, edges)
3403 {
3404 basic_block target = *(basic_block *)((char *)e + offset);
3405
3406 if (bb_sese *t_sese = BB_GET_SESE (target))
3407 {
3408 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3409 {
3410 /* Child node. Append its bracket list. */
3411 num_children++;
3412 sese->append (t_sese);
3413
3414 /* Compare its hi value. */
3415 int t_hi = t_sese->high.second;
3416
3417 if (basic_block child_hi_block = t_sese->high.first)
3418 t_hi += BB_GET_SESE (child_hi_block)->node;
3419
3420 if (hi_child > t_hi)
3421 {
3422 hi_child = t_hi;
3423 node_child = t_sese->high;
3424 child = target;
3425 }
3426 }
3427 else if (t_sese->node < sese->node + dir
3428 && !(dir < 0 && sese->parent == t_sese->node))
3429 {
3430 /* Non-parental ancestor node -- a backlink. */
3431 int d = usd * t_sese->dir;
3432 int back = t_sese->node + d;
3433
3434 if (hi_back > back)
3435 {
3436 hi_back = back;
3437 node_back = pseudo_node_t (target, d);
3438 }
3439 }
3440 }
3441 else
3442 { /* Fallen off graph, backlink to entry node. */
3443 hi_back = 0;
3444 node_back = pseudo_node_t (0, 0);
3445 }
3446 }
3447
3448 /* Remove any brackets that terminate at this pseudo node. */
3449 sese->remove (pseudo_node_t (me, dir));
3450
3451 /* Now push any backlinks from this pseudo node. */
3452 FOR_EACH_EDGE (e, ei, edges)
3453 {
3454 basic_block target = *(basic_block *)((char *)e + offset);
3455 if (bb_sese *t_sese = BB_GET_SESE (target))
3456 {
3457 if (t_sese->node < sese->node + dir
3458 && !(dir < 0 && sese->parent == t_sese->node))
3459 /* Non-parental ancestor node - backedge from me. */
3460 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3461 }
3462 else
3463 {
3464 /* back edge to entry node */
3465 sese->push (pseudo_node_t (0, 0));
3466 }
3467 }
3468
3469 /* If this node leads directly or indirectly to a no-return region of
3470 the graph, then fake a backedge to entry node. */
3471 if (!sese->brackets.length () || !edges || !edges->length ())
3472 {
3473 hi_back = 0;
3474 node_back = pseudo_node_t (0, 0);
3475 sese->push (node_back);
3476 }
3477
3478 /* Record the highest reaching backedge from us or a descendant. */
3479 sese->high = hi_back < hi_child ? node_back : node_child;
3480
3481 if (num_children > 1)
3482 {
3483 /* There is more than one child -- this is a Y shaped piece of
3484 spanning tree. We have to insert a fake backedge from this
3485 node to the highest ancestor reached by not-the-highest
3486 reaching child. Note that there may be multiple children
3487 with backedges to the same highest node. That's ok and we
3488 insert the edge to that highest node. */
3489 hi_child = depth;
3490 if (dir < 0 && child)
3491 {
3492 node_child = sese->high;
3493 hi_child = node_child.second;
3494 if (node_child.first)
3495 hi_child += BB_GET_SESE (node_child.first)->node;
3496 }
3497
3498 FOR_EACH_EDGE (e, ei, edges)
3499 {
3500 basic_block target = *(basic_block *)((char *)e + offset);
3501
3502 if (target == child)
3503 /* Ignore the highest child. */
3504 continue;
3505
3506 bb_sese *t_sese = BB_GET_SESE (target);
3507 if (!t_sese)
3508 continue;
3509 if (t_sese->parent != sese->node)
3510 /* Not a child. */
3511 continue;
3512
3513 /* Compare its hi value. */
3514 int t_hi = t_sese->high.second;
3515
3516 if (basic_block child_hi_block = t_sese->high.first)
3517 t_hi += BB_GET_SESE (child_hi_block)->node;
3518
3519 if (hi_child > t_hi)
3520 {
3521 hi_child = t_hi;
3522 node_child = t_sese->high;
3523 }
3524 }
3525
3526 sese->push (node_child);
3527 }
3528 }
3529
3530
3531 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3532 proceed to successors. Set SESE entry and exit nodes of
3533 REGIONS. */
3534
3535 static void
3536 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3537 basic_block block, int coloring)
3538 {
3539 bb_sese *sese = BB_GET_SESE (block);
3540
3541 if (block->flags & BB_VISITED)
3542 {
3543 /* If we've already encountered this block, either we must not
3544 be coloring, or it must have been colored the current color. */
3545 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3546 return;
3547 }
3548
3549 block->flags |= BB_VISITED;
3550
3551 if (sese)
3552 {
3553 if (coloring < 0)
3554 {
3555 /* Start coloring a region. */
3556 regions[sese->color].first = block;
3557 coloring = sese->color;
3558 }
3559
3560 if (!--color_counts[sese->color] && sese->color == coloring)
3561 {
3562 /* Found final block of SESE region. */
3563 regions[sese->color].second = block;
3564 coloring = -1;
3565 }
3566 else
3567 /* Color the node, so we can assert on revisiting the node
3568 that the graph is indeed SESE. */
3569 sese->color = coloring;
3570 }
3571 else
3572 /* Fallen off the subgraph, we cannot be coloring. */
3573 gcc_assert (coloring < 0);
3574
3575 /* Walk each successor block. */
3576 if (block->succs && block->succs->length ())
3577 {
3578 edge e;
3579 edge_iterator ei;
3580
3581 FOR_EACH_EDGE (e, ei, block->succs)
3582 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3583 }
3584 else
3585 gcc_assert (coloring < 0);
3586 }
3587
3588 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3589 end up with NULL entries in it. */
3590
3591 static void
3592 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3593 {
3594 basic_block block;
3595 int ix;
3596
3597 /* First clear each BB of the whole function. */
3598 FOR_ALL_BB_FN (block, cfun)
3599 {
3600 block->flags &= ~BB_VISITED;
3601 BB_SET_SESE (block, 0);
3602 }
3603
3604 /* Mark blocks in the function that are in this graph. */
3605 for (ix = 0; blocks.iterate (ix, &block); ix++)
3606 block->flags |= BB_VISITED;
3607
3608 /* Counts of nodes assigned to each color. There cannot be more
3609 colors than blocks (and hopefully there will be fewer). */
3610 auto_vec<unsigned> color_counts;
3611 color_counts.reserve (blocks.length ());
3612
3613 /* Worklist of nodes in the spanning tree. Again, there cannot be
3614 more nodes in the tree than blocks (there will be fewer if the
3615 CFG of blocks is disjoint). */
3616 auto_vec<basic_block> spanlist;
3617 spanlist.reserve (blocks.length ());
3618
3619 /* Make sure every block has its cycle class determined. */
3620 for (ix = 0; blocks.iterate (ix, &block); ix++)
3621 {
3622 if (BB_GET_SESE (block))
3623 /* We already met this block in an earlier graph solve. */
3624 continue;
3625
3626 if (dump_file)
3627 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3628
3629 /* Number the nodes reachable from BLOCK in initial DFS order. */
3630 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3631
3632 /* Now walk in reverse DFS order to find cycle equivalents. */
3633 while (spanlist.length ())
3634 {
3635 block = spanlist.pop ();
3636 bb_sese *sese = BB_GET_SESE (block);
3637
3638 /* Do the pseudo node below. */
3639 nvptx_sese_pseudo (block, sese, depth, +1,
3640 sese->dir > 0 ? block->succs : block->preds,
3641 (sese->dir > 0 ? offsetof (edge_def, dest)
3642 : offsetof (edge_def, src)));
3643 sese->set_color (color_counts);
3644 /* Do the pseudo node above. */
3645 nvptx_sese_pseudo (block, sese, depth, -1,
3646 sese->dir < 0 ? block->succs : block->preds,
3647 (sese->dir < 0 ? offsetof (edge_def, dest)
3648 : offsetof (edge_def, src)));
3649 }
3650 if (dump_file)
3651 fprintf (dump_file, "\n");
3652 }
3653
3654 if (dump_file)
3655 {
3656 unsigned count;
3657 const char *comma = "";
3658
3659 fprintf (dump_file, "Found %d cycle equivalents\n",
3660 color_counts.length ());
3661 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3662 {
3663 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3664
3665 comma = "";
3666 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3667 if (BB_GET_SESE (block)->color == ix)
3668 {
3669 block->flags |= BB_VISITED;
3670 fprintf (dump_file, "%s%d", comma, block->index);
3671 comma=",";
3672 }
3673 fprintf (dump_file, "}");
3674 comma = ", ";
3675 }
3676 fprintf (dump_file, "\n");
3677 }
3678
3679 /* Now we've colored every block in the subgraph. We now need to
3680 determine the minimal set of SESE regions that cover that
3681 subgraph. Do this with a DFS walk of the complete function.
3682 During the walk we're either 'looking' or 'coloring'. When we
3683 reach the last node of a particular color, we stop coloring and
3684 return to looking. */
3685
3686 /* There cannot be more SESE regions than colors. */
3687 regions.reserve (color_counts.length ());
3688 for (ix = color_counts.length (); ix--;)
3689 regions.quick_push (bb_pair_t (0, 0));
3690
3691 for (ix = 0; blocks.iterate (ix, &block); ix++)
3692 block->flags &= ~BB_VISITED;
3693
3694 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3695
3696 if (dump_file)
3697 {
3698 const char *comma = "";
3699 int len = regions.length ();
3700
3701 fprintf (dump_file, "SESE regions:");
3702 for (ix = 0; ix != len; ix++)
3703 {
3704 basic_block from = regions[ix].first;
3705 basic_block to = regions[ix].second;
3706
3707 if (from)
3708 {
3709 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3710 if (to != from)
3711 fprintf (dump_file, "->%d", to->index);
3712
3713 int color = BB_GET_SESE (from)->color;
3714
3715 /* Print the blocks within the region (excluding ends). */
3716 FOR_EACH_BB_FN (block, cfun)
3717 {
3718 bb_sese *sese = BB_GET_SESE (block);
3719
3720 if (sese && sese->color == color
3721 && block != from && block != to)
3722 fprintf (dump_file, ".%d", block->index);
3723 }
3724 fprintf (dump_file, "}");
3725 }
3726 comma = ",";
3727 }
3728 fprintf (dump_file, "\n\n");
3729 }
3730
3731 for (ix = 0; blocks.iterate (ix, &block); ix++)
3732 delete BB_GET_SESE (block);
3733 }
3734
3735 #undef BB_SET_SESE
3736 #undef BB_GET_SESE
3737
3738 /* Propagate live state at the start of a partitioned region. BLOCK
3739 provides the live register information, and might not contain
3740 INSN. Propagation is inserted just after INSN. RW indicates whether
3741 we are reading and/or writing state. This
3742 separation is needed for worker-level propagation, where we
3743 essentially do a spill & fill. FN is the underlying worker
3744 function to generate the propagation instructions for a single
3745 register. DATA is user data.
3746
3747 We propagate the live register set and the entire frame. We could
3748 do better by (a) propagating just the live set that is used within
3749 the partitioned regions and (b) only propagating stack entries that
3750 are used. The latter might be quite hard to determine. */
3751
3752 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3753
3754 static void
3755 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3756 propagator_fn fn, void *data)
3757 {
3758 bitmap live = DF_LIVE_IN (block);
3759 bitmap_iterator iterator;
3760 unsigned ix;
3761
3762 /* Copy the frame array. */
3763 HOST_WIDE_INT fs = get_frame_size ();
3764 if (fs)
3765 {
3766 rtx tmp = gen_reg_rtx (DImode);
3767 rtx idx = NULL_RTX;
3768 rtx ptr = gen_reg_rtx (Pmode);
3769 rtx pred = NULL_RTX;
3770 rtx_code_label *label = NULL;
3771
3772 /* The frame size might not be DImode compatible, but the frame
3773 array's declaration will be. So it's ok to round up here. */
3774 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
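      /* fs is now a count of DImode words, rounded up; e.g. a 12-byte
         frame becomes fs == 2.  */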
3775 /* Detect single iteration loop. */
3776 if (fs == 1)
3777 fs = 0;
3778
3779 start_sequence ();
3780 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3781 if (fs)
3782 {
3783 idx = gen_reg_rtx (SImode);
3784 pred = gen_reg_rtx (BImode);
3785 label = gen_label_rtx ();
3786
3787 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3788 /* Allow worker function to initialize anything needed. */
3789 rtx init = fn (tmp, PM_loop_begin, fs, data);
3790 if (init)
3791 emit_insn (init);
3792 emit_label (label);
3793 LABEL_NUSES (label)++;
3794 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3795 }
3796 if (rw & PM_read)
3797 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3798 emit_insn (fn (tmp, rw, fs, data));
3799 if (rw & PM_write)
3800 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3801 if (fs)
3802 {
3803 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3804 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3805 emit_insn (gen_br_true_uni (pred, label));
3806 rtx fini = fn (tmp, PM_loop_end, fs, data);
3807 if (fini)
3808 emit_insn (fini);
3809 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3810 }
3811 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3812 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3813 rtx cpy = get_insns ();
3814 end_sequence ();
3815 insn = emit_insn_after (cpy, insn);
3816 }
3817
3818 /* Copy live registers. */
3819 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3820 {
3821 rtx reg = regno_reg_rtx[ix];
3822
3823 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3824 {
3825 rtx bcast = fn (reg, rw, 0, data);
3826
3827 insn = emit_insn_after (bcast, insn);
3828 }
3829 }
3830 }
3831
3832 /* Worker for nvptx_vpropagate. */
3833
3834 static rtx
3835 vprop_gen (rtx reg, propagate_mask pm,
3836 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3837 {
3838 if (!(pm & PM_read_write))
3839 return 0;
3840
3841 return nvptx_gen_vcast (reg);
3842 }
3843
3844 /* Propagate state that is live at start of BLOCK across the vectors
3845 of a single warp. Propagation is inserted just after INSN. */
3846
3847 static void
3848 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3849 {
3850 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3851 }
3852
3853 /* Worker for nvptx_wpropagate. */
3854
3855 static rtx
3856 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3857 {
3858 wcast_data_t *data = (wcast_data_t *)data_;
3859
3860 if (pm & PM_loop_begin)
3861 {
3862 /* Starting a loop, initialize pointer. */
3863 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3864
3865 if (align > worker_bcast_align)
3866 worker_bcast_align = align;
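      /* Round the accumulated offset up to this value's alignment
         (a power of two).  */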
3867 data->offset = (data->offset + align - 1) & ~(align - 1);
3868
3869 data->ptr = gen_reg_rtx (Pmode);
3870
3871 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3872 }
3873 else if (pm & PM_loop_end)
3874 {
3875 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3876 data->ptr = NULL_RTX;
3877 return clobber;
3878 }
3879 else
3880 return nvptx_gen_wcast (reg, pm, rep, data);
3881 }
3882
3883 /* Spill or fill the state that is live at the start of BLOCK.  PRE_P
3884 indicates if this is just before partitioned mode (do spill), or
3885 just after it starts (do fill). Sequence is inserted just after
3886 INSN. */
3887
3888 static void
3889 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3890 {
3891 wcast_data_t data;
3892
3893 data.base = gen_reg_rtx (Pmode);
3894 data.offset = 0;
3895 data.ptr = NULL_RTX;
3896
3897 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3898 if (data.offset)
3899 {
3900 /* Stuff was emitted, initialize the base pointer now. */
3901 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3902 emit_insn_after (init, insn);
3903
3904 if (worker_bcast_size < data.offset)
3905 worker_bcast_size = data.offset;
3906 }
3907 }
3908
3909 /* Emit a worker-level synchronization barrier. We use different
3910 markers for before and after synchronizations. */
3911
3912 static rtx
3913 nvptx_wsync (bool after)
3914 {
3915 return gen_nvptx_barsync (GEN_INT (after));
3916 }
3917
3918 #if WORKAROUND_PTXJIT_BUG
3919 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3920 real insns. */
3921
3922 static rtx_insn *
3923 bb_first_real_insn (basic_block bb)
3924 {
3925 rtx_insn *insn;
3926
3927 /* Find the first real insn of BB.  */
3928 FOR_BB_INSNS (bb, insn)
3929 if (INSN_P (insn))
3930 return insn;
3931
3932 return 0;
3933 }
3934 #endif
3935
3936 /* Single neutering according to MASK. FROM is the incoming block and
3937 TO is the outgoing block. These may be the same block. Insert at
3938 start of FROM:
3939
3940 if (tid.<axis>) goto end.
3941
3942 and insert before ending branch of TO (if there is such an insn):
3943
3944 end:
3945 <possibly-broadcast-cond>
3946 <branch>
3947
3948 We currently only use different FROM and TO when skipping an entire
3949 loop. We could do more if we detected superblocks. */
3950
3951 static void
3952 nvptx_single (unsigned mask, basic_block from, basic_block to)
3953 {
3954 rtx_insn *head = BB_HEAD (from);
3955 rtx_insn *tail = BB_END (to);
3956 unsigned skip_mask = mask;
3957
3958 while (true)
3959 {
3960 /* Find first insn of from block. */
3961 while (head != BB_END (from) && !INSN_P (head))
3962 head = NEXT_INSN (head);
3963
3964 if (from == to)
3965 break;
3966
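      /* If FROM consists only of an unconditional jump to a block with
         no other predecessors, follow the jump and continue the search
         for the first real insn there.  */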
3967 if (!(JUMP_P (head) && single_succ_p (from)))
3968 break;
3969
3970 basic_block jump_target = single_succ (from);
3971 if (!single_pred_p (jump_target))
3972 break;
3973
3974 from = jump_target;
3975 head = BB_HEAD (from);
3976 }
3977
3978 /* Find the last insn of the TO block.  */
3979 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3980 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3981 tail = PREV_INSN (tail);
3982
3983 /* Detect if tail is a branch. */
3984 rtx tail_branch = NULL_RTX;
3985 rtx cond_branch = NULL_RTX;
3986 if (tail && INSN_P (tail))
3987 {
3988 tail_branch = PATTERN (tail);
3989 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3990 tail_branch = NULL_RTX;
3991 else
3992 {
3993 cond_branch = SET_SRC (tail_branch);
3994 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3995 cond_branch = NULL_RTX;
3996 }
3997 }
3998
3999 if (tail == head)
4000 {
4001 /* If this is empty, do nothing. */
4002 if (!head || !INSN_P (head))
4003 return;
4004
4005 /* If this is a dummy insn, do nothing. */
4006 switch (recog_memoized (head))
4007 {
4008 default:
4009 break;
4010 case CODE_FOR_nvptx_fork:
4011 case CODE_FOR_nvptx_forked:
4012 case CODE_FOR_nvptx_joining:
4013 case CODE_FOR_nvptx_join:
4014 return;
4015 }
4016
4017 if (cond_branch)
4018 {
4019 /* If we're only doing vector single, there's no need to
4020 emit skip code because we'll not insert anything. */
4021 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4022 skip_mask = 0;
4023 }
4024 else if (tail_branch)
4025 /* Block with only unconditional branch. Nothing to do. */
4026 return;
4027 }
4028
4029 /* Insert the vector test inside the worker test. */
4030 unsigned mode;
4031 rtx_insn *before = tail;
4032 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4033 if (GOMP_DIM_MASK (mode) & skip_mask)
4034 {
4035 rtx_code_label *label = gen_label_rtx ();
4036 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4037
4038 if (!pred)
4039 {
4040 pred = gen_reg_rtx (BImode);
4041 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4042 }
4043
4044 rtx br;
4045 if (mode == GOMP_DIM_VECTOR)
4046 br = gen_br_true (pred, label);
4047 else
4048 br = gen_br_true_uni (pred, label);
4049 emit_insn_before (br, head);
4050
4051 LABEL_NUSES (label)++;
4052 if (tail_branch)
4053 before = emit_label_before (label, before);
4054 else
4055 emit_label_after (label, tail);
4056 }
4057
4058 /* Now deal with propagating the branch condition. */
4059 if (cond_branch)
4060 {
4061 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4062
4063 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4064 {
4065 /* Vector mode only, do a shuffle. */
4066 #if WORKAROUND_PTXJIT_BUG
4067 /* The branch condition %rcond is propagated like this:
4068
4069 {
4070 .reg .u32 %x;
4071 mov.u32 %x,%tid.x;
4072 setp.ne.u32 %rnotvzero,%x,0;
4073 }
4074
4075 @%rnotvzero bra Lskip;
4076 setp.<op>.<type> %rcond,op1,op2;
4077 Lskip:
4078 selp.u32 %rcondu32,1,0,%rcond;
4079 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4080 setp.ne.u32 %rcond,%rcondu32,0;
4081
4082 There seems to be a bug in the ptx JIT compiler (observed at driver
4083 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4084 unless %rcond is initialized to something before 'bra Lskip'. The
4085 bug is not observed with ptxas from cuda 8.0.61.
4086
4087 It is true that the code is non-trivial: at Lskip, %rcond is
4088 uninitialized in threads 1-31, and after the selp the same holds
4089 for %rcondu32. But shfl propagates the defined value in thread 0
4090 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4091 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4092
4093 There is nothing in the PTX spec to suggest that this is wrong, or
4094 to explain why the extra initialization is needed. So, we classify
4095 it as a JIT bug, and the extra initialization as a workaround.  */
4096 emit_insn_before (gen_movbi (pvar, const0_rtx),
4097 bb_first_real_insn (from));
4098 #endif
4099 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4100 }
4101 else
4102 {
4103 /* Includes worker mode, do spill & fill. By construction
4104 we should never have worker mode only. */
4105 wcast_data_t data;
4106
4107 data.base = worker_bcast_sym;
4108 data.ptr = 0;
4109
4110 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4111 worker_bcast_size = GET_MODE_SIZE (SImode);
4112
4113 data.offset = 0;
4114 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4115 before);
4116 /* Barrier so other workers can see the write. */
4117 emit_insn_before (nvptx_wsync (false), tail);
4118 data.offset = 0;
4119 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4120 /* This barrier is needed to avoid worker zero clobbering
4121 the broadcast buffer before all the other workers have
4122 had a chance to read this instance of it. */
4123 emit_insn_before (nvptx_wsync (true), tail);
4124 }
4125
4126 extract_insn (tail);
4127 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4128 UNSPEC_BR_UNIFIED);
4129 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4130 }
4131 }
4132
4133 /* PAR is a parallel that is being skipped in its entirety according to
4134 MASK. Treat this as skipping a superblock starting at forked
4135 and ending at joining. */
4136
4137 static void
4138 nvptx_skip_par (unsigned mask, parallel *par)
4139 {
4140 basic_block tail = par->join_block;
4141 gcc_assert (tail->preds->length () == 1);
4142
4143 basic_block pre_tail = (*tail->preds)[0]->src;
4144 gcc_assert (pre_tail->succs->length () == 1);
4145
4146 nvptx_single (mask, par->forked_block, pre_tail);
4147 }
4148
4149 /* If PAR has a single inner parallel and PAR itself only contains
4150 empty entry and exit blocks, swallow the inner PAR. */
4151
4152 static void
4153 nvptx_optimize_inner (parallel *par)
4154 {
4155 parallel *inner = par->inner;
4156
4157 /* We mustn't be the outer dummy par. */
4158 if (!par->mask)
4159 return;
4160
4161 /* We must have a single inner par. */
4162 if (!inner || inner->next)
4163 return;
4164
4165 /* We must only contain 2 blocks ourselves -- the head and tail of
4166 the inner par. */
4167 if (par->blocks.length () != 2)
4168 return;
4169
4170 /* The partitionings must be disjoint.  As we only have vector and
4171 worker partitioning, this is sufficient to guarantee the pars
4172 have adjacent partitioning. */
4173 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4174 /* This indicates malformed code generation. */
4175 return;
4176
4177 /* The outer forked insn should be immediately followed by the inner
4178 fork insn. */
4179 rtx_insn *forked = par->forked_insn;
4180 rtx_insn *fork = BB_END (par->forked_block);
4181
4182 if (NEXT_INSN (forked) != fork)
4183 return;
4184 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4185
4186 /* The outer joining insn must immediately follow the inner join
4187 insn. */
4188 rtx_insn *joining = par->joining_insn;
4189 rtx_insn *join = inner->join_insn;
4190 if (NEXT_INSN (join) != joining)
4191 return;
4192
4193 /* Preconditions met. Swallow the inner par. */
4194 if (dump_file)
4195 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4196 inner->mask, inner->forked_block->index,
4197 inner->join_block->index,
4198 par->mask, par->forked_block->index, par->join_block->index);
4199
4200 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4201
4202 par->blocks.reserve (inner->blocks.length ());
4203 while (inner->blocks.length ())
4204 par->blocks.quick_push (inner->blocks.pop ());
4205
4206 par->inner = inner->inner;
4207 inner->inner = NULL;
4208
4209 delete inner;
4210 }
4211
4212 /* Process the parallel PAR and all its contained
4213 parallels. We do everything but the neutering. Return mask of
4214 partitioned modes used within this parallel. */
4215
4216 static unsigned
4217 nvptx_process_pars (parallel *par)
4218 {
4219 if (nvptx_optimize)
4220 nvptx_optimize_inner (par);
4221
4222 unsigned inner_mask = par->mask;
4223
4224 /* Do the inner parallels first. */
4225 if (par->inner)
4226 {
4227 par->inner_mask = nvptx_process_pars (par->inner);
4228 inner_mask |= par->inner_mask;
4229 }
4230
4231 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4232 /* No propagation needed for a call. */;
4233 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4234 {
4235 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4236 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4237 /* Insert begin and end synchronizations. */
4238 emit_insn_after (nvptx_wsync (false), par->forked_insn);
4239 emit_insn_before (nvptx_wsync (true), par->joining_insn);
4240 }
4241 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4242 nvptx_vpropagate (par->forked_block, par->forked_insn);
4243
4244 /* Now do siblings. */
4245 if (par->next)
4246 inner_mask |= nvptx_process_pars (par->next);
4247 return inner_mask;
4248 }
4249
4250 /* Neuter the parallel described by PAR. We recurse in depth-first
4251 order. MODES are the partitioning of the execution and OUTER is
4252 the partitioning of the parallels we are contained in. */
4253
4254 static void
4255 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4256 {
4257 unsigned me = (par->mask
4258 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4259 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4260 unsigned skip_mask = 0, neuter_mask = 0;
4261
4262 if (par->inner)
4263 nvptx_neuter_pars (par->inner, modes, outer | me);
4264
4265 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4266 {
4267 if ((outer | me) & GOMP_DIM_MASK (mode))
4268 {} /* Mode is partitioned: no neutering. */
4269 else if (!(modes & GOMP_DIM_MASK (mode)))
4270 {} /* Mode is not used: nothing to do. */
4271 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4272 || !par->forked_insn)
4273 /* Partitioned in inner parallels, or we're not partitioned
4274 at all: neuter individual blocks. */
4275 neuter_mask |= GOMP_DIM_MASK (mode);
4276 else if (!par->parent || !par->parent->forked_insn
4277 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4278 /* Parent isn't a parallel, or it already contains this partitioning:
4279 skip the parallel at this level.  */
4280 skip_mask |= GOMP_DIM_MASK (mode);
4281 else
4282 {} /* Parent will skip this parallel itself. */
4283 }
4284
4285 if (neuter_mask)
4286 {
4287 int ix, len;
4288
4289 if (nvptx_optimize)
4290 {
4291 /* Neuter whole SESE regions. */
4292 bb_pair_vec_t regions;
4293
4294 nvptx_find_sese (par->blocks, regions);
4295 len = regions.length ();
4296 for (ix = 0; ix != len; ix++)
4297 {
4298 basic_block from = regions[ix].first;
4299 basic_block to = regions[ix].second;
4300
4301 if (from)
4302 nvptx_single (neuter_mask, from, to);
4303 else
4304 gcc_assert (!to);
4305 }
4306 }
4307 else
4308 {
4309 /* Neuter each BB individually. */
4310 len = par->blocks.length ();
4311 for (ix = 0; ix != len; ix++)
4312 {
4313 basic_block block = par->blocks[ix];
4314
4315 nvptx_single (neuter_mask, block, block);
4316 }
4317 }
4318 }
4319
4320 if (skip_mask)
4321 nvptx_skip_par (skip_mask, par);
4322
4323 if (par->next)
4324 nvptx_neuter_pars (par->next, modes, outer);
4325 }
4326
4327 /* PTX-specific reorganization
4328 - Split blocks at fork and join instructions
4329 - Compute live registers
4330 - Mark now-unused registers, so function begin doesn't declare
4331 unused registers.
4332 - Insert state propagation when entering partitioned mode
4333 - Insert neutering instructions when in single mode
4334 - Replace subregs with suitable sequences.
4335 */
4336
4337 static void
4338 nvptx_reorg (void)
4339 {
4340 /* We are freeing block_for_insn in the toplev to keep compatibility
4341 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4342 compute_bb_for_insn ();
4343
4344 thread_prologue_and_epilogue_insns ();
4345
4346 /* Split blocks and record interesting unspecs. */
4347 bb_insn_map_t bb_insn_map;
4348
4349 nvptx_split_blocks (&bb_insn_map);
4350
4351 /* Compute live regs */
4352 df_clear_flags (DF_LR_RUN_DCE);
4353 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4354 df_live_add_problem ();
4355 df_live_set_all_dirty ();
4356 df_analyze ();
4357 regstat_init_n_sets_and_refs ();
4358
4359 if (dump_file)
4360 df_dump (dump_file);
4361
4362 /* Mark unused regs as unused. */
4363 int max_regs = max_reg_num ();
4364 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4365 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4366 regno_reg_rtx[i] = const0_rtx;
4367
4368 /* Determine launch dimensions of the function. If it is not an
4369 offloaded function (i.e. this is a regular compiler), the
4370 function has no neutering. */
4371 tree attr = oacc_get_fn_attrib (current_function_decl);
4372 if (attr)
4373 {
4374 /* If we determined this mask before RTL expansion, we could
4375 elide emission of some levels of forks and joins. */
4376 unsigned mask = 0;
4377 tree dims = TREE_VALUE (attr);
4378 unsigned ix;
4379
4380 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4381 {
4382 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4383 tree allowed = TREE_PURPOSE (dims);
4384
4385 if (size != 1 && !(allowed && integer_zerop (allowed)))
4386 mask |= GOMP_DIM_MASK (ix);
4387 }
4388 /* If there is worker neutering, there must be vector
4389 neutering. Otherwise the hardware will fail. */
4390 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4391 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4392
4393 /* Discover & process partitioned regions. */
4394 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4395 nvptx_process_pars (pars);
4396 nvptx_neuter_pars (pars, mask, 0);
4397 delete pars;
4398 }
4399
4400 /* Replace subregs. */
4401 nvptx_reorg_subreg ();
4402
4403 if (TARGET_UNIFORM_SIMT)
4404 nvptx_reorg_uniform_simt ();
4405
4406 regstat_free_n_sets_and_refs ();
4407
4408 df_finish_pass (true);
4409 }
4410 \f
4411 /* Handle a "kernel" attribute; arguments as in
4412 struct attribute_spec.handler. */
4413
4414 static tree
4415 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4416 int ARG_UNUSED (flags), bool *no_add_attrs)
4417 {
4418 tree decl = *node;
4419
4420 if (TREE_CODE (decl) != FUNCTION_DECL)
4421 {
4422 error ("%qE attribute only applies to functions", name);
4423 *no_add_attrs = true;
4424 }
4425 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4426 {
4427 error ("%qE attribute requires a void return type", name);
4428 *no_add_attrs = true;
4429 }
4430
4431 return NULL_TREE;
4432 }
4433
4434 /* Handle a "shared" attribute; arguments as in
4435 struct attribute_spec.handler. */
4436
4437 static tree
4438 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4439 int ARG_UNUSED (flags), bool *no_add_attrs)
4440 {
4441 tree decl = *node;
4442
4443 if (TREE_CODE (decl) != VAR_DECL)
4444 {
4445 error ("%qE attribute only applies to variables", name);
4446 *no_add_attrs = true;
4447 }
4448 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4449 {
4450 error ("%qE attribute not allowed with auto storage class", name);
4451 *no_add_attrs = true;
4452 }
4453
4454 return NULL_TREE;
4455 }
4456
4457 /* Table of valid machine attributes. */
4458 static const struct attribute_spec nvptx_attribute_table[] =
4459 {
4460 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4461 affects_type_identity, handler, exclude } */
4462 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4463 NULL },
4464 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4465 NULL },
4466 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4467 };
4468 \f
4469 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4470
4471 static HOST_WIDE_INT
4472 nvptx_vector_alignment (const_tree type)
4473 {
4474 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4475
4476 return MIN (align, BIGGEST_ALIGNMENT);
4477 }
4478
4479 /* Indicate that INSN cannot be duplicated. */
4480
4481 static bool
4482 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4483 {
4484 switch (recog_memoized (insn))
4485 {
4486 case CODE_FOR_nvptx_shufflesi:
4487 case CODE_FOR_nvptx_shufflesf:
4488 case CODE_FOR_nvptx_barsync:
4489 case CODE_FOR_nvptx_fork:
4490 case CODE_FOR_nvptx_forked:
4491 case CODE_FOR_nvptx_joining:
4492 case CODE_FOR_nvptx_join:
4493 return true;
4494 default:
4495 return false;
4496 }
4497 }
4498
4499 /* Section anchors do not work. Initialization for flag_section_anchor
4500 probes the existence of the anchoring target hooks and prevents
4501 anchoring if they don't exist.  However, we may be used alongside
4502 a host-side compiler that does support anchoring, and hence see
4503 the anchor flag set (as it's not recalculated). So provide an
4504 implementation denying anchoring. */
4505
4506 static bool
4507 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4508 {
4509 return false;
4510 }
4511 \f
4512 /* Record a symbol for mkoffload to enter into the mapping table. */
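/* For example, the emitted lines have the form (names and dimension
   values illustrative):
     //:VAR_MAP "decl_name"
     //:FUNC_MAP "decl_name", 0x1, 0x20, 0x20
   where the hexadecimal values are the recorded launch dimensions.  */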
4513
4514 static void
4515 nvptx_record_offload_symbol (tree decl)
4516 {
4517 switch (TREE_CODE (decl))
4518 {
4519 case VAR_DECL:
4520 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4521 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4522 break;
4523
4524 case FUNCTION_DECL:
4525 {
4526 tree attr = oacc_get_fn_attrib (decl);
4527 /* OpenMP offloading does not set this attribute. */
4528 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4529
4530 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4531 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4532
4533 for (; dims; dims = TREE_CHAIN (dims))
4534 {
4535 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4536
4537 gcc_assert (!TREE_PURPOSE (dims));
4538 fprintf (asm_out_file, ", %#x", size);
4539 }
4540
4541 fprintf (asm_out_file, "\n");
4542 }
4543 break;
4544
4545 default:
4546 gcc_unreachable ();
4547 }
4548 }
4549
4550 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4551 at the start of a file. */
4552
4553 static void
4554 nvptx_file_start (void)
4555 {
4556 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4557 fputs ("\t.version\t3.1\n", asm_out_file);
4558 fputs ("\t.target\tsm_30\n", asm_out_file);
4559 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4560 fputs ("// END PREAMBLE\n", asm_out_file);
4561 }
4562
4563 /* Emit a declaration for a worker-level buffer in .shared memory. */
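/* For example, with an alignment of 8 and a size of 256 bytes this
   prints (buffer name illustrative):
     .shared .align 8 .u8 __worker_bcast[256];  */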
4564
4565 static void
4566 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4567 {
4568 const char *name = XSTR (sym, 0);
4569
4570 write_var_marker (file, true, false, name);
4571 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4572 align, name, size);
4573 }
4574
4575 /* Write out the function declarations we've collected and declare storage
4576 for the broadcast buffer. */
4577
4578 static void
4579 nvptx_file_end (void)
4580 {
4581 hash_table<tree_hasher>::iterator iter;
4582 tree decl;
4583 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4584 nvptx_record_fndecl (decl);
4585 fputs (func_decls.str().c_str(), asm_out_file);
4586
4587 if (worker_bcast_size)
4588 write_worker_buffer (asm_out_file, worker_bcast_sym,
4589 worker_bcast_align, worker_bcast_size);
4590
4591 if (worker_red_size)
4592 write_worker_buffer (asm_out_file, worker_red_sym,
4593 worker_red_align, worker_red_size);
4594
4595 if (need_softstack_decl)
4596 {
4597 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4598 /* 32 is the maximum number of warps in a block. Even though it's an
4599 external declaration, emit the array size explicitly; otherwise, it
4600 may fail at PTX JIT time if the definition is later in link order. */
4601 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4602 POINTER_SIZE);
4603 }
4604 if (need_unisimt_decl)
4605 {
4606 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4607 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4608 }
4609 }
4610
4611 /* Expander for the shuffle builtins. */
4612
4613 static rtx
4614 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4615 {
4616 if (ignore)
4617 return target;
4618
4619 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4620 NULL_RTX, mode, EXPAND_NORMAL);
4621 if (!REG_P (src))
4622 src = copy_to_mode_reg (mode, src);
4623
4624 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4625 NULL_RTX, SImode, EXPAND_NORMAL);
4626 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4627 NULL_RTX, SImode, EXPAND_NORMAL);
4628
4629 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4630 idx = copy_to_mode_reg (SImode, idx);
4631
4632 rtx pat = nvptx_gen_shuffle (target, src, idx,
4633 (nvptx_shuffle_kind) INTVAL (op));
4634 if (pat)
4635 emit_insn (pat);
4636
4637 return target;
4638 }
4639
4640 /* Worker reduction address expander. */
4641
4642 static rtx
4643 nvptx_expand_worker_addr (tree exp, rtx target,
4644 machine_mode ARG_UNUSED (mode), int ignore)
4645 {
4646 if (ignore)
4647 return target;
4648
4649 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4650 if (align > worker_red_align)
4651 worker_red_align = align;
4652
4653 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4654 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4655 if (size + offset > worker_red_size)
4656 worker_red_size = size + offset;
4657
4658 rtx addr = worker_red_sym;
4659 if (offset)
4660 {
4661 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4662 addr = gen_rtx_CONST (Pmode, addr);
4663 }
4664
4665 emit_move_insn (target, addr);
4666
4667 return target;
4668 }
4669
4670 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4671 not require taking the address of any object, other than the memory
4672 cell being operated on. */
4673
4674 static rtx
4675 nvptx_expand_cmp_swap (tree exp, rtx target,
4676 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4677 {
4678 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4679
4680 if (!target)
4681 target = gen_reg_rtx (mode);
4682
4683 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4684 NULL_RTX, Pmode, EXPAND_NORMAL);
4685 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4686 NULL_RTX, mode, EXPAND_NORMAL);
4687 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4688 NULL_RTX, mode, EXPAND_NORMAL);
4689 rtx pat;
4690
4691 mem = gen_rtx_MEM (mode, mem);
4692 if (!REG_P (cmp))
4693 cmp = copy_to_mode_reg (mode, cmp);
4694 if (!REG_P (src))
4695 src = copy_to_mode_reg (mode, src);
4696
4697 if (mode == SImode)
4698 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4699 else
4700 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4701
4702 emit_insn (pat);
4703
4704 return target;
4705 }
4706
4707
4708 /* Codes for all the NVPTX builtins. */
4709 enum nvptx_builtins
4710 {
4711 NVPTX_BUILTIN_SHUFFLE,
4712 NVPTX_BUILTIN_SHUFFLELL,
4713 NVPTX_BUILTIN_WORKER_ADDR,
4714 NVPTX_BUILTIN_CMP_SWAP,
4715 NVPTX_BUILTIN_CMP_SWAPLL,
4716 NVPTX_BUILTIN_MAX
4717 };
4718
4719 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4720
4721 /* Return the NVPTX builtin for CODE. */
4722
4723 static tree
4724 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4725 {
4726 if (code >= NVPTX_BUILTIN_MAX)
4727 return error_mark_node;
4728
4729 return nvptx_builtin_decls[code];
4730 }
4731
4732 /* Set up all builtin functions for this target. */
4733
4734 static void
4735 nvptx_init_builtins (void)
4736 {
4737 #define DEF(ID, NAME, T) \
4738 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4739 = add_builtin_function ("__builtin_nvptx_" NAME, \
4740 build_function_type_list T, \
4741 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4742 #define ST sizetype
4743 #define UINT unsigned_type_node
4744 #define LLUINT long_long_unsigned_type_node
4745 #define PTRVOID ptr_type_node
4746
4747 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4748 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4749 DEF (WORKER_ADDR, "worker_addr",
4750 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4751 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4752 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4753
4754 #undef DEF
4755 #undef ST
4756 #undef UINT
4757 #undef LLUINT
4758 #undef PTRVOID
4759 }
4760
4761 /* Expand an expression EXP that calls a built-in function,
4762 with result going to TARGET if that's convenient
4763 (and in mode MODE if that's convenient).
4764 SUBTARGET may be used as the target for computing one of EXP's operands.
4765 IGNORE is nonzero if the value is to be ignored. */
4766
4767 static rtx
4768 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4769 machine_mode mode, int ignore)
4770 {
4771 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4772 switch (DECL_FUNCTION_CODE (fndecl))
4773 {
4774 case NVPTX_BUILTIN_SHUFFLE:
4775 case NVPTX_BUILTIN_SHUFFLELL:
4776 return nvptx_expand_shuffle (exp, target, mode, ignore);
4777
4778 case NVPTX_BUILTIN_WORKER_ADDR:
4779 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4780
4781 case NVPTX_BUILTIN_CMP_SWAP:
4782 case NVPTX_BUILTIN_CMP_SWAPLL:
4783 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4784
4785 default: gcc_unreachable ();
4786 }
4787 }
4788 \f
4789 /* Define dimension sizes for known hardware. */
4790 #define PTX_VECTOR_LENGTH 32
4791 #define PTX_WORKER_LENGTH 32
4792 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
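/* The vector length matches the 32-thread warp size of current NVIDIA
   hardware; workers correspond to warps within a thread block.  */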
4793
4794 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4795
4796 static int
4797 nvptx_simt_vf ()
4798 {
4799 return PTX_VECTOR_LENGTH;
4800 }
4801
4802 /* Validate compute dimensions of an OpenACC offload or routine, fill
4803 in non-unity defaults. FN_LEVEL indicates the level at which a
4804 routine might spawn a loop. It is negative for non-routines. If
4805 DECL is null, we are validating the default dimensions. */
4806
4807 static bool
4808 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4809 {
4810 bool changed = false;
4811
4812 /* The vector size must be 32, unless this is a SEQ routine. */
4813 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4814 && dims[GOMP_DIM_VECTOR] >= 0
4815 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4816 {
4817 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4818 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4819 dims[GOMP_DIM_VECTOR]
4820 ? G_("using vector_length (%d), ignoring %d")
4821 : G_("using vector_length (%d), ignoring runtime setting"),
4822 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4823 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4824 changed = true;
4825 }
4826
4827 /* Check that the number of workers is not too large.  */
4828 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4829 {
4830 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4831 "using num_workers (%d), ignoring %d",
4832 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4833 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4834 changed = true;
4835 }
4836
4837 if (!decl)
4838 {
4839 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4840 if (dims[GOMP_DIM_WORKER] < 0)
4841 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4842 if (dims[GOMP_DIM_GANG] < 0)
4843 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4844 changed = true;
4845 }
4846
4847 return changed;
4848 }
4849
4850 /* Return maximum dimension size, or zero for unbounded. */
4851
4852 static int
4853 nvptx_dim_limit (int axis)
4854 {
4855 switch (axis)
4856 {
4857 case GOMP_DIM_WORKER:
4858 return PTX_WORKER_LENGTH;
4859
4860 case GOMP_DIM_VECTOR:
4861 return PTX_VECTOR_LENGTH;
4862
4863 default:
4864 break;
4865 }
4866 return 0;
4867 }
4868
4869 /* Determine whether fork & joins are needed. */
4870
4871 static bool
4872 nvptx_goacc_fork_join (gcall *call, const int dims[],
4873 bool ARG_UNUSED (is_fork))
4874 {
4875 tree arg = gimple_call_arg (call, 2);
4876 unsigned axis = TREE_INT_CST_LOW (arg);
4877
4878 /* We only care about worker and vector partitioning. */
4879 if (axis < GOMP_DIM_WORKER)
4880 return false;
4881
4882 /* If the size is 1, there's no partitioning. */
4883 if (dims[axis] == 1)
4884 return false;
4885
4886 return true;
4887 }
4888
4889 /* Generate a PTX builtin function call that returns the address in
4890 the worker reduction buffer at OFFSET. TYPE is the type of the
4891 data at that location. */
4892
4893 static tree
4894 nvptx_get_worker_red_addr (tree type, tree offset)
4895 {
4896 machine_mode mode = TYPE_MODE (type);
4897 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4898 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4899 tree align = build_int_cst (unsigned_type_node,
4900 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4901 tree call = build_call_expr (fndecl, 3, offset, size, align);
4902
4903 return fold_convert (build_pointer_type (type), call);
4904 }
4905
4906 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR.  This function
4907 will cast the variable if necessary. */
4908
4909 static void
4910 nvptx_generate_vector_shuffle (location_t loc,
4911 tree dest_var, tree var, unsigned shift,
4912 gimple_seq *seq)
4913 {
4914 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4915 tree_code code = NOP_EXPR;
4916 tree arg_type = unsigned_type_node;
4917 tree var_type = TREE_TYPE (var);
4918 tree dest_type = var_type;
4919
4920 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4921 var_type = TREE_TYPE (var_type);
4922
4923 if (TREE_CODE (var_type) == REAL_TYPE)
4924 code = VIEW_CONVERT_EXPR;
4925
4926 if (TYPE_SIZE (var_type)
4927 == TYPE_SIZE (long_long_unsigned_type_node))
4928 {
4929 fn = NVPTX_BUILTIN_SHUFFLELL;
4930 arg_type = long_long_unsigned_type_node;
4931 }
4932
4933 tree call = nvptx_builtin_decl (fn, true);
4934 tree bits = build_int_cst (unsigned_type_node, shift);
4935 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4936 tree expr;
4937
4938 if (var_type != dest_type)
4939 {
4940 /* Do real and imaginary parts separately. */
4941 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4942 real = fold_build1 (code, arg_type, real);
4943 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4944 real = fold_build1 (code, var_type, real);
4945
4946 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4947 imag = fold_build1 (code, arg_type, imag);
4948 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4949 imag = fold_build1 (code, var_type, imag);
4950
4951 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4952 }
4953 else
4954 {
4955 expr = fold_build1 (code, arg_type, var);
4956 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4957 expr = fold_build1 (code, dest_type, expr);
4958 }
4959
4960 gimplify_assign (dest_var, expr, seq);
4961 }
4962
4963 /* Lazily generate the global lock var decl and return its address. */
4964
4965 static tree
4966 nvptx_global_lock_addr ()
4967 {
4968 tree v = global_lock_var;
4969
4970 if (!v)
4971 {
4972 tree name = get_identifier ("__reduction_lock");
4973 tree type = build_qualified_type (unsigned_type_node,
4974 TYPE_QUAL_VOLATILE);
4975 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4976 global_lock_var = v;
4977 DECL_ARTIFICIAL (v) = 1;
4978 DECL_EXTERNAL (v) = 1;
4979 TREE_STATIC (v) = 1;
4980 TREE_PUBLIC (v) = 1;
4981 TREE_USED (v) = 1;
4982 mark_addressable (v);
4983 mark_decl_referenced (v);
4984 }
4985
4986 return build_fold_addr_expr (v);
4987 }
4988
4989 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4990 GSI.  We use a lockless scheme for nearly all cases, which looks
4991 like:
4992 actual = initval(OP);
4993 do {
4994 guess = actual;
4995 write = guess OP myval;
4996 actual = cmp&swap (ptr, guess, write)
4997 } while (actual bit-different-to guess);
4998 return write;
4999
5000 This relies on a cmp&swap instruction, which is available for 32-
5001 and 64-bit types. Larger types must use a locking scheme. */
5002
5003 static tree
5004 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5005 tree ptr, tree var, tree_code op)
5006 {
5007 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5008 tree_code code = NOP_EXPR;
5009 tree arg_type = unsigned_type_node;
5010 tree var_type = TREE_TYPE (var);
5011
5012 if (TREE_CODE (var_type) == COMPLEX_TYPE
5013 || TREE_CODE (var_type) == REAL_TYPE)
5014 code = VIEW_CONVERT_EXPR;
5015
5016 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5017 {
5018 arg_type = long_long_unsigned_type_node;
5019 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5020 }
5021
5022 tree swap_fn = nvptx_builtin_decl (fn, true);
5023
5024 gimple_seq init_seq = NULL;
5025 tree init_var = make_ssa_name (arg_type);
5026 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5027 init_expr = fold_build1 (code, arg_type, init_expr);
5028 gimplify_assign (init_var, init_expr, &init_seq);
5029 gimple *init_end = gimple_seq_last (init_seq);
5030
5031 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5032
5033 /* Split the block just after the init stmts. */
5034 basic_block pre_bb = gsi_bb (*gsi);
5035 edge pre_edge = split_block (pre_bb, init_end);
5036 basic_block loop_bb = pre_edge->dest;
5037 pre_bb = pre_edge->src;
5038 /* Reset the iterator. */
5039 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5040
5041 tree expect_var = make_ssa_name (arg_type);
5042 tree actual_var = make_ssa_name (arg_type);
5043 tree write_var = make_ssa_name (arg_type);
5044
5045 /* Build and insert the reduction calculation. */
5046 gimple_seq red_seq = NULL;
5047 tree write_expr = fold_build1 (code, var_type, expect_var);
5048 write_expr = fold_build2 (op, var_type, write_expr, var);
5049 write_expr = fold_build1 (code, arg_type, write_expr);
5050 gimplify_assign (write_var, write_expr, &red_seq);
5051
5052 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5053
5054 /* Build & insert the cmp&swap sequence. */
5055 gimple_seq latch_seq = NULL;
5056 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5057 ptr, expect_var, write_var);
5058 gimplify_assign (actual_var, swap_expr, &latch_seq);
5059
5060 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5061 NULL_TREE, NULL_TREE);
5062 gimple_seq_add_stmt (&latch_seq, cond);
5063
5064 gimple *latch_end = gimple_seq_last (latch_seq);
5065 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5066
5067 /* Split the block just after the latch stmts. */
5068 edge post_edge = split_block (loop_bb, latch_end);
5069 basic_block post_bb = post_edge->dest;
5070 loop_bb = post_edge->src;
5071 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5072
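  /* Turn the fallthru edge into the true edge (cmp&swap succeeded, so
     exit the loop) and add a false back edge that retries.  */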
5073 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5074 post_edge->probability = profile_probability::even ();
5075 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5076 loop_edge->probability = profile_probability::even ();
5077 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5078 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5079
5080 gphi *phi = create_phi_node (expect_var, loop_bb);
5081 add_phi_arg (phi, init_var, pre_edge, loc);
5082 add_phi_arg (phi, actual_var, loop_edge, loc);
5083
5084 loop *loop = alloc_loop ();
5085 loop->header = loop_bb;
5086 loop->latch = loop_bb;
5087 add_loop (loop, loop_bb->loop_father);
5088
5089 return fold_build1 (code, var_type, write_var);
5090 }
5091
5092 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5093 GSI. This is necessary for types larger than 64 bits, where there
5094 is no cmp&swap instruction to implement a lockless scheme. We use
5095 a lock variable in global memory.
5096
5097 while (cmp&swap (&lock_var, 0, 1))
5098 continue;
5099 T accum = *ptr;
5100 accum = accum OP var;
5101 *ptr = accum;
5102 cmp&swap (&lock_var, 1, 0);
5103 return accum;
5104
5105 A lock in global memory is necessary to force execution engine
5106 descheduling and avoid resource starvation that can occur if the
5107 lock is in .shared memory. */
5108
5109 static tree
5110 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5111 tree ptr, tree var, tree_code op)
5112 {
5113 tree var_type = TREE_TYPE (var);
5114 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5115 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5116 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5117
5118 /* Split the block just before the gsi. Insert a gimple nop to make
5119 this easier. */
5120 gimple *nop = gimple_build_nop ();
5121 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5122 basic_block entry_bb = gsi_bb (*gsi);
5123 edge entry_edge = split_block (entry_bb, nop);
5124 basic_block lock_bb = entry_edge->dest;
5125 /* Reset the iterator. */
5126 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5127
5128 /* Build and insert the locking sequence. */
5129 gimple_seq lock_seq = NULL;
5130 tree lock_var = make_ssa_name (unsigned_type_node);
5131 tree lock_expr = nvptx_global_lock_addr ();
5132 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5133 uns_unlocked, uns_locked);
5134 gimplify_assign (lock_var, lock_expr, &lock_seq);
5135 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5136 NULL_TREE, NULL_TREE);
5137 gimple_seq_add_stmt (&lock_seq, cond);
5138 gimple *lock_end = gimple_seq_last (lock_seq);
5139 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5140
5141 /* Split the block just after the lock sequence. */
5142 edge locked_edge = split_block (lock_bb, lock_end);
5143 basic_block update_bb = locked_edge->dest;
5144 lock_bb = locked_edge->src;
5145 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5146
5147 /* Create the lock loop ... */
5148 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5149 locked_edge->probability = profile_probability::even ();
5150 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5151 loop_edge->probability = profile_probability::even ();
5152 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5153 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5154
5155 /* ... and the loop structure. */
5156 loop *lock_loop = alloc_loop ();
5157 lock_loop->header = lock_bb;
5158 lock_loop->latch = lock_bb;
5159 lock_loop->nb_iterations_estimate = 1;
5160 lock_loop->any_estimate = true;
5161 add_loop (lock_loop, entry_bb->loop_father);
5162
5163 /* Build and insert the reduction calculation. */
5164 gimple_seq red_seq = NULL;
5165 tree acc_in = make_ssa_name (var_type);
5166 tree ref_in = build_simple_mem_ref (ptr);
5167 TREE_THIS_VOLATILE (ref_in) = 1;
5168 gimplify_assign (acc_in, ref_in, &red_seq);
5169
5170 tree acc_out = make_ssa_name (var_type);
5171 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5172 gimplify_assign (acc_out, update_expr, &red_seq);
5173
5174 tree ref_out = build_simple_mem_ref (ptr);
5175 TREE_THIS_VOLATILE (ref_out) = 1;
5176 gimplify_assign (ref_out, acc_out, &red_seq);
5177
5178 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5179
5180 /* Build & insert the unlock sequence. */
5181 gimple_seq unlock_seq = NULL;
5182 tree unlock_expr = nvptx_global_lock_addr ();
5183 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5184 uns_locked, uns_unlocked);
5185 gimplify_and_add (unlock_expr, &unlock_seq);
5186 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5187
5188 return acc_out;
5189 }
5190
5191 /* Emit a sequence to update a reduction accumulator at *PTR with the
5192 value held in VAR using operator OP. Return the updated value.
5193
5194 TODO: optimize for atomic ops and independent complex ops.  */
5195
5196 static tree
5197 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5198 tree ptr, tree var, tree_code op)
5199 {
5200 tree type = TREE_TYPE (var);
5201 tree size = TYPE_SIZE (type);
5202
5203 if (size == TYPE_SIZE (unsigned_type_node)
5204 || size == TYPE_SIZE (long_long_unsigned_type_node))
5205 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5206 else
5207 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5208 }
5209
5210 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5211
5212 static void
5213 nvptx_goacc_reduction_setup (gcall *call)
5214 {
5215 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5216 tree lhs = gimple_call_lhs (call);
5217 tree var = gimple_call_arg (call, 2);
5218 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5219 gimple_seq seq = NULL;
5220
5221 push_gimplify_context (true);
5222
5223 if (level != GOMP_DIM_GANG)
5224 {
5225 /* Copy the receiver object. */
5226 tree ref_to_res = gimple_call_arg (call, 1);
5227
5228 if (!integer_zerop (ref_to_res))
5229 var = build_simple_mem_ref (ref_to_res);
5230 }
5231
5232 if (level == GOMP_DIM_WORKER)
5233 {
5234 /* Store incoming value to worker reduction buffer. */
5235 tree offset = gimple_call_arg (call, 5);
5236 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5237 tree ptr = make_ssa_name (TREE_TYPE (call));
5238
5239 gimplify_assign (ptr, call, &seq);
5240 tree ref = build_simple_mem_ref (ptr);
5241 TREE_THIS_VOLATILE (ref) = 1;
5242 gimplify_assign (ref, var, &seq);
5243 }
5244
5245 if (lhs)
5246 gimplify_assign (lhs, var, &seq);
5247
5248 pop_gimplify_context (NULL);
5249 gsi_replace_with_seq (&gsi, seq, true);
5250 }
5251
5252 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
5253
5254 static void
5255 nvptx_goacc_reduction_init (gcall *call)
5256 {
5257 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5258 tree lhs = gimple_call_lhs (call);
5259 tree var = gimple_call_arg (call, 2);
5260 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5261 enum tree_code rcode
5262 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5263 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5264 TREE_TYPE (var));
5265 gimple_seq seq = NULL;
5266
5267 push_gimplify_context (true);
5268
5269 if (level == GOMP_DIM_VECTOR)
5270 {
5271 /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
5272 tree tid = make_ssa_name (integer_type_node);
5273 tree dim_vector = gimple_call_arg (call, 3);
5274 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5275 dim_vector);
5276 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5277 NULL_TREE, NULL_TREE);
5278
5279 gimple_call_set_lhs (tid_call, tid);
5280 gimple_seq_add_stmt (&seq, tid_call);
5281 gimple_seq_add_stmt (&seq, cond_stmt);
5282
5283 /* Split the block just after the call. */
5284 edge init_edge = split_block (gsi_bb (gsi), call);
5285 basic_block init_bb = init_edge->dest;
5286 basic_block call_bb = init_edge->src;
5287
5288 /* Fixup flags from call_bb to init_bb. */
5289 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5290 init_edge->probability = profile_probability::even ();
5291
5292 /* Set the initialization stmts. */
5293 gimple_seq init_seq = NULL;
5294 tree init_var = make_ssa_name (TREE_TYPE (var));
5295 gimplify_assign (init_var, init, &init_seq);
5296 gsi = gsi_start_bb (init_bb);
5297 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5298
5299 /* Split block just after the init stmt. */
5300 gsi_prev (&gsi);
5301 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5302 basic_block dst_bb = inited_edge->dest;
5303
5304 /* Create false edge from call_bb to dst_bb. */
5305 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5306 nop_edge->probability = profile_probability::even ();
5307
5308 /* Create phi node in dst block. */
5309 gphi *phi = create_phi_node (lhs, dst_bb);
5310 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5311 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5312
5313 /* Reset dominator of dst bb. */
5314 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5315
5316 /* Reset the gsi. */
5317 gsi = gsi_for_stmt (call);
5318 }
5319 else
5320 {
5321 if (level == GOMP_DIM_GANG)
5322 {
5323 /* If there's no receiver object, propagate the incoming VAR. */
5324 tree ref_to_res = gimple_call_arg (call, 1);
5325 if (integer_zerop (ref_to_res))
5326 init = var;
5327 }
5328
5329 gimplify_assign (lhs, init, &seq);
5330 }
5331
5332 pop_gimplify_context (NULL);
5333 gsi_replace_with_seq (&gsi, seq, true);
5334 }
5335
5336 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
5337
5338 static void
5339 nvptx_goacc_reduction_fini (gcall *call)
5340 {
5341 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5342 tree lhs = gimple_call_lhs (call);
5343 tree ref_to_res = gimple_call_arg (call, 1);
5344 tree var = gimple_call_arg (call, 2);
5345 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5346 enum tree_code op
5347 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5348 gimple_seq seq = NULL;
5349 tree r = NULL_TREE;
5350
5351 push_gimplify_context (true);
5352
5353 if (level == GOMP_DIM_VECTOR)
5354 {
5355 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5356 but that requires a method of emitting a unified jump at the
5357 gimple level. */
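      /* With PTX_VECTOR_LENGTH == 32 the loop below combines lanes at
         shuffle distances 16, 8, 4, 2 and 1, so lane 0 ends up holding
         the reduction over the whole warp.  */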
5358 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5359 {
5360 tree other_var = make_ssa_name (TREE_TYPE (var));
5361 nvptx_generate_vector_shuffle (gimple_location (call),
5362 other_var, var, shfl, &seq);
5363
5364 r = make_ssa_name (TREE_TYPE (var));
5365 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5366 var, other_var), &seq);
5367 var = r;
5368 }
5369 }
5370 else
5371 {
5372 tree accum = NULL_TREE;
5373
5374 if (level == GOMP_DIM_WORKER)
5375 {
5376 /* Get reduction buffer address. */
5377 tree offset = gimple_call_arg (call, 5);
5378 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5379 tree ptr = make_ssa_name (TREE_TYPE (call));
5380
5381 gimplify_assign (ptr, call, &seq);
5382 accum = ptr;
5383 }
5384 else if (integer_zerop (ref_to_res))
5385 r = var;
5386 else
5387 accum = ref_to_res;
5388
5389 if (accum)
5390 {
5391 /* UPDATE the accumulator. */
5392 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5393 seq = NULL;
5394 r = nvptx_reduction_update (gimple_location (call), &gsi,
5395 accum, var, op);
5396 }
5397 }
5398
5399 if (lhs)
5400 gimplify_assign (lhs, r, &seq);
5401 pop_gimplify_context (NULL);
5402
5403 gsi_replace_with_seq (&gsi, seq, true);
5404 }
5405
5406 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5407
5408 static void
5409 nvptx_goacc_reduction_teardown (gcall *call)
5410 {
5411 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5412 tree lhs = gimple_call_lhs (call);
5413 tree var = gimple_call_arg (call, 2);
5414 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5415 gimple_seq seq = NULL;
5416
5417 push_gimplify_context (true);
5418 if (level == GOMP_DIM_WORKER)
5419 {
5420 /* Read the worker reduction buffer. */
5421 tree offset = gimple_call_arg (call, 5);
5422 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5423 tree ptr = make_ssa_name (TREE_TYPE (call));
5424
5425 gimplify_assign (ptr, call, &seq);
5426 var = build_simple_mem_ref (ptr);
5427 TREE_THIS_VOLATILE (var) = 1;
5428 }
5429
5430 if (level != GOMP_DIM_GANG)
5431 {
5432 /* Write to the receiver object. */
5433 tree ref_to_res = gimple_call_arg (call, 1);
5434
5435 if (!integer_zerop (ref_to_res))
5436 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5437 }
5438
5439 if (lhs)
5440 gimplify_assign (lhs, var, &seq);
5441
5442 pop_gimplify_context (NULL);
5443
5444 gsi_replace_with_seq (&gsi, seq, true);
5445 }
5446
5447 /* NVPTX reduction expander. */
5448
5449 static void
5450 nvptx_goacc_reduction (gcall *call)
5451 {
5452 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5453
5454 switch (code)
5455 {
5456 case IFN_GOACC_REDUCTION_SETUP:
5457 nvptx_goacc_reduction_setup (call);
5458 break;
5459
5460 case IFN_GOACC_REDUCTION_INIT:
5461 nvptx_goacc_reduction_init (call);
5462 break;
5463
5464 case IFN_GOACC_REDUCTION_FINI:
5465 nvptx_goacc_reduction_fini (call);
5466 break;
5467
5468 case IFN_GOACC_REDUCTION_TEARDOWN:
5469 nvptx_goacc_reduction_teardown (call);
5470 break;
5471
5472 default:
5473 gcc_unreachable ();
5474 }
5475 }
5476
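/* Do not allow constants to be forced into memory: the backend never
   uses a constant pool.  */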
5477 static bool
5478 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5479 rtx x ATTRIBUTE_UNUSED)
5480 {
5481 return true;
5482 }
5483
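/* Report which vector modes are supported; only V2SImode and V2DImode
   are handled.  */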
5484 static bool
5485 nvptx_vector_mode_supported (machine_mode mode)
5486 {
5487 return (mode == V2SImode
5488 || mode == V2DImode);
5489 }
5490
5491 /* Return the preferred mode for vectorizing scalar MODE. */
5492
5493 static machine_mode
5494 nvptx_preferred_simd_mode (scalar_mode mode)
5495 {
5496 switch (mode)
5497 {
5498 case E_DImode:
5499 return V2DImode;
5500 case E_SImode:
5501 return V2SImode;
5502
5503 default:
5504 return default_preferred_simd_mode (mode);
5505 }
5506 }
5507
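/* Adjust the alignment used for data of TYPE.  TImode-sized integers
   are given the alignment of the mode they will be split into (see
   maybe_split_mode); everything else keeps BASIC_ALIGN.  */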
5508 unsigned int
5509 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5510 {
5511 if (TREE_CODE (type) == INTEGER_TYPE)
5512 {
5513 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5514 if (size == GET_MODE_SIZE (TImode))
5515 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5516 }
5517
5518 return basic_align;
5519 }
5520
5521 /* Implement TARGET_MODES_TIEABLE_P. */
5522
5523 static bool
5524 nvptx_modes_tieable_p (machine_mode, machine_mode)
5525 {
5526 return false;
5527 }
5528
5529 /* Implement TARGET_HARD_REGNO_NREGS. */
5530
5531 static unsigned int
5532 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5533 {
5534 return 1;
5535 }
5536
5537 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5538
5539 static bool
5540 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5541 {
5542 return false;
5543 }
5544
5545 #undef TARGET_OPTION_OVERRIDE
5546 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5547
5548 #undef TARGET_ATTRIBUTE_TABLE
5549 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5550
5551 #undef TARGET_LRA_P
5552 #define TARGET_LRA_P hook_bool_void_false
5553
5554 #undef TARGET_LEGITIMATE_ADDRESS_P
5555 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5556
5557 #undef TARGET_PROMOTE_FUNCTION_MODE
5558 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5559
5560 #undef TARGET_FUNCTION_ARG
5561 #define TARGET_FUNCTION_ARG nvptx_function_arg
5562 #undef TARGET_FUNCTION_INCOMING_ARG
5563 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5564 #undef TARGET_FUNCTION_ARG_ADVANCE
5565 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5566 #undef TARGET_FUNCTION_ARG_BOUNDARY
5567 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5568 #undef TARGET_PASS_BY_REFERENCE
5569 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5570 #undef TARGET_FUNCTION_VALUE_REGNO_P
5571 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5572 #undef TARGET_FUNCTION_VALUE
5573 #define TARGET_FUNCTION_VALUE nvptx_function_value
5574 #undef TARGET_LIBCALL_VALUE
5575 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5576 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5577 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5578 #undef TARGET_GET_DRAP_RTX
5579 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5580 #undef TARGET_SPLIT_COMPLEX_ARG
5581 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5582 #undef TARGET_RETURN_IN_MEMORY
5583 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5584 #undef TARGET_OMIT_STRUCT_RETURN_REG
5585 #define TARGET_OMIT_STRUCT_RETURN_REG true
5586 #undef TARGET_STRICT_ARGUMENT_NAMING
5587 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5588 #undef TARGET_CALL_ARGS
5589 #define TARGET_CALL_ARGS nvptx_call_args
5590 #undef TARGET_END_CALL_ARGS
5591 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5592
5593 #undef TARGET_ASM_FILE_START
5594 #define TARGET_ASM_FILE_START nvptx_file_start
5595 #undef TARGET_ASM_FILE_END
5596 #define TARGET_ASM_FILE_END nvptx_file_end
5597 #undef TARGET_ASM_GLOBALIZE_LABEL
5598 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5599 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5600 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5601 #undef TARGET_PRINT_OPERAND
5602 #define TARGET_PRINT_OPERAND nvptx_print_operand
5603 #undef TARGET_PRINT_OPERAND_ADDRESS
5604 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5605 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5606 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5607 #undef TARGET_ASM_INTEGER
5608 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5609 #undef TARGET_ASM_DECL_END
5610 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5611 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5612 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5613 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5614 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5615 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5616 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5617
5618 #undef TARGET_MACHINE_DEPENDENT_REORG
5619 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5620 #undef TARGET_NO_REGISTER_ALLOCATION
5621 #define TARGET_NO_REGISTER_ALLOCATION true
5622
5623 #undef TARGET_ENCODE_SECTION_INFO
5624 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5625 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5626 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5627
5628 #undef TARGET_VECTOR_ALIGNMENT
5629 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5630
5631 #undef TARGET_CANNOT_COPY_INSN_P
5632 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5633
5634 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5635 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5636
5637 #undef TARGET_INIT_BUILTINS
5638 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5639 #undef TARGET_EXPAND_BUILTIN
5640 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5641 #undef TARGET_BUILTIN_DECL
5642 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5643
5644 #undef TARGET_SIMT_VF
5645 #define TARGET_SIMT_VF nvptx_simt_vf
5646
5647 #undef TARGET_GOACC_VALIDATE_DIMS
5648 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5649
5650 #undef TARGET_GOACC_DIM_LIMIT
5651 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5652
5653 #undef TARGET_GOACC_FORK_JOIN
5654 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5655
5656 #undef TARGET_GOACC_REDUCTION
5657 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5658
5659 #undef TARGET_CANNOT_FORCE_CONST_MEM
5660 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5661
5662 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5663 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5664
5665 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5666 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5667 nvptx_preferred_simd_mode
5668
5669 #undef TARGET_MODES_TIEABLE_P
5670 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5671
5672 #undef TARGET_HARD_REGNO_NREGS
5673 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5674
5675 #undef TARGET_CAN_CHANGE_MODE_CLASS
5676 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5677
5678 struct gcc_target targetm = TARGET_INITIALIZER;
5679
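/* gt-nvptx.h is generated by gengtype and provides the garbage-collection root tables for this file. */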
5680 #include "gt-nvptx.h"