gcc/config/nvptx/nvptx.c
1 /* Target code for NVPTX.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include <sstream>
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "cfghooks.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "expmed.h"
33 #include "optabs.h"
34 #include "regs.h"
35 #include "emit-rtl.h"
36 #include "recog.h"
37 #include "diagnostic.h"
38 #include "alias.h"
39 #include "insn-flags.h"
40 #include "output.h"
41 #include "insn-attr.h"
42 #include "flags.h"
43 #include "dojump.h"
44 #include "explow.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "stmt.h"
48 #include "expr.h"
49 #include "tm-preds.h"
50 #include "tm-constrs.h"
51 #include "langhooks.h"
52 #include "dbxout.h"
53 #include "cfgrtl.h"
54 #include "gimple.h"
55 #include "stor-layout.h"
56 #include "builtins.h"
57 #include "omp-low.h"
58 #include "gomp-constants.h"
59 #include "dumpfile.h"
60 #include "internal-fn.h"
61 #include "gimple-iterator.h"
62 #include "stringpool.h"
63 #include "tree-vrp.h"
64 #include "tree-ssa-operands.h"
65 #include "tree-ssanames.h"
66 #include "gimplify.h"
67 #include "tree-phinodes.h"
68 #include "cfgloop.h"
69 #include "fold-const.h"
70
71 /* This file should be included last. */
72 #include "target-def.h"
73
74 /* The kind of shuffle instruction. */
75 enum nvptx_shuffle_kind
76 {
77 SHUFFLE_UP,
78 SHUFFLE_DOWN,
79 SHUFFLE_BFLY,
80 SHUFFLE_IDX,
81 SHUFFLE_MAX
82 };
83
84 /* The various PTX memory areas an object might reside in. */
85 enum nvptx_data_area
86 {
87 DATA_AREA_GENERIC,
88 DATA_AREA_GLOBAL,
89 DATA_AREA_SHARED,
90 DATA_AREA_LOCAL,
91 DATA_AREA_CONST,
92 DATA_AREA_PARAM,
93 DATA_AREA_MAX
94 };
95
96 /* We record the data area in the target symbol flags. */
97 #define SYMBOL_DATA_AREA(SYM) \
98 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
99 & 7)
100 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
101 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
102
103 /* Record the function decls we've written, and the libfuncs and function
104 decls corresponding to them. */
105 static std::stringstream func_decls;
106
107 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
108 {
109 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
110 static bool equal (rtx a, rtx b) { return a == b; }
111 };
112
113 static GTY((cache))
114 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
115
116 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
117 {
118 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
119 static bool equal (tree a, tree b) { return a == b; }
120 };
121
122 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
123 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
124
125 /* Buffer needed to broadcast across workers. This is used for both
126 worker-neutering and worker broadcasting. It is shared by all
127 functions emitted. The buffer is placed in shared memory. It'd be
128 nice if PTX supported common blocks, because then this could be
129 shared across TUs (taking the largest size). */
130 static unsigned worker_bcast_size;
131 static unsigned worker_bcast_align;
132 static GTY(()) rtx worker_bcast_sym;
133
134 /* Buffer needed for worker reductions. This has to be distinct from
135 the worker broadcast array, as both may be live concurrently. */
136 static unsigned worker_red_size;
137 static unsigned worker_red_align;
138 static GTY(()) rtx worker_red_sym;
139
140 /* Global lock variable, needed for 128bit worker & gang reductions. */
141 static GTY(()) tree global_lock_var;
142
143 /* Allocate a new, cleared machine_function structure. */
144
145 static struct machine_function *
146 nvptx_init_machine_status (void)
147 {
148 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
149 p->return_mode = VOIDmode;
150 return p;
151 }
152
153 /* Implement TARGET_OPTION_OVERRIDE. */
154
155 static void
156 nvptx_option_override (void)
157 {
158 init_machine_status = nvptx_init_machine_status;
159
160 /* Set toplevel_reorder, unless explicitly disabled. We need
161 reordering so that we emit necessary assembler decls of
162 undeclared variables. */
163 if (!global_options_set.x_flag_toplevel_reorder)
164 flag_toplevel_reorder = 1;
165
166 /* Set flag_no_common, unless explicitly disabled. We fake common
167 using .weak, and that's not entirely accurate, so avoid it
168 unless forced. */
169 if (!global_options_set.x_flag_no_common)
170 flag_no_common = 1;
171
172 /* Assumes that it will see only hard registers. */
173 flag_var_tracking = 0;
174
175 if (nvptx_optimize < 0)
176 nvptx_optimize = optimize > 0;
177
178 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
179 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
180 declared_libfuncs_htab
181 = hash_table<declared_libfunc_hasher>::create_ggc (17);
182
183 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
184 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
185 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
186
187 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
188 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
189 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
190 }
191
192 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
193 deal with ptx idiosyncrasies. */
194
195 const char *
196 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
197 {
198 switch (mode)
199 {
200 case BLKmode:
201 return ".b8";
202 case BImode:
203 return ".pred";
204 case QImode:
205 if (promote)
206 return ".u32";
207 else
208 return ".u8";
209 case HImode:
210 return ".u16";
211 case SImode:
212 return ".u32";
213 case DImode:
214 return ".u64";
215
216 case SFmode:
217 return ".f32";
218 case DFmode:
219 return ".f64";
220
221 default:
222 gcc_unreachable ();
223 }
224 }
225
226 /* Encode the PTX data area that DECL (which might not actually be a
227 _DECL) should reside in. */
228
229 static void
230 nvptx_encode_section_info (tree decl, rtx rtl, int first)
231 {
232 default_encode_section_info (decl, rtl, first);
233 if (first && MEM_P (rtl))
234 {
235 nvptx_data_area area = DATA_AREA_GENERIC;
236
237 if (TREE_CONSTANT (decl))
238 area = DATA_AREA_CONST;
239 else if (TREE_CODE (decl) == VAR_DECL)
240 /* TODO: This would be a good place to check for a .shared or
241 other section name. */
242 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
243
244 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
245 }
246 }
247
248 /* Return the PTX name of the data area in which SYM should be
249 placed. The symbol must have already been processed by
250 nvptx_encode_section_info, or equivalent. */
251
252 static const char *
253 section_for_sym (rtx sym)
254 {
255 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
256 /* Same order as nvptx_data_area enum. */
257 static char const *const areas[] =
258 {"", ".global", ".shared", ".local", ".const", ".param"};
259
260 return areas[area];
261 }
262
263 /* Similarly for a decl. */
264
265 static const char *
266 section_for_decl (const_tree decl)
267 {
268 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
269 }
270
271 /* Check NAME for special function names and redirect them by returning a
272 replacement. This applies to malloc, free and realloc, for which we
273 want to use libgcc wrappers, and call, which triggers a bug in
274 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
275 not active in an offload compiler -- the names are all set by the
276 host-side compiler. */
277
278 static const char *
279 nvptx_name_replacement (const char *name)
280 {
281 if (strcmp (name, "call") == 0)
282 return "__nvptx_call";
283 if (strcmp (name, "malloc") == 0)
284 return "__nvptx_malloc";
285 if (strcmp (name, "free") == 0)
286 return "__nvptx_free";
287 if (strcmp (name, "realloc") == 0)
288 return "__nvptx_realloc";
289 return name;
290 }
291
292 /* If MODE should be treated as two registers of an inner mode, return
293 that inner mode. Otherwise return VOIDmode. */
294
295 static machine_mode
296 maybe_split_mode (machine_mode mode)
297 {
298 if (COMPLEX_MODE_P (mode))
299 return GET_MODE_INNER (mode);
300
301 if (mode == TImode)
302 return DImode;
303
304 return VOIDmode;
305 }
306
307 /* Output a register, subreg, or register pair (with optional
308 enclosing braces). */
309
310 static void
311 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
312 int subreg_offset = -1)
313 {
314 if (inner_mode == VOIDmode)
315 {
316 if (HARD_REGISTER_NUM_P (regno))
317 fprintf (file, "%s", reg_names[regno]);
318 else
319 fprintf (file, "%%r%d", regno);
320 }
321 else if (subreg_offset >= 0)
322 {
323 output_reg (file, regno, VOIDmode);
324 fprintf (file, "$%d", subreg_offset);
325 }
326 else
327 {
328 if (subreg_offset == -1)
329 fprintf (file, "{");
330 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
331 fprintf (file, ",");
332 output_reg (file, regno, inner_mode, 0);
333 if (subreg_offset == -1)
334 fprintf (file, "}");
335 }
336 }
337
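/* For illustration (hypothetical register numbers): a plain pseudo 23
   prints as "%r23"; with a non-VOIDmode INNER_MODE such as DImode and the
   default SUBREG_OFFSET of -1 the same pseudo prints as the brace-wrapped
   pair "{%r23$8,%r23$0}", while an offset of -2 gives the unbraced form
   "%r23$8,%r23$0" used when declaring the pair.  */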
338 /* Emit forking instructions for MASK. */
339
340 static void
341 nvptx_emit_forking (unsigned mask, bool is_call)
342 {
343 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
344 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
345 if (mask)
346 {
347 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
348
349 /* Emit fork at all levels. This helps form SESE regions, as
350 it creates a block with a single successor before entering a
351 partitioned region. That is a good candidate for the end of
352 an SESE region. */
353 if (!is_call)
354 emit_insn (gen_nvptx_fork (op));
355 emit_insn (gen_nvptx_forked (op));
356 }
357 }
358
359 /* Emit joining instructions for MASK. */
360
361 static void
362 nvptx_emit_joining (unsigned mask, bool is_call)
363 {
364 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
365 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
366 if (mask)
367 {
368 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
369
370 /* Emit joining for all non-call pars to ensure there's a single
371 predecessor for the block the join insn ends up in. This is
372 needed for skipping entire loops. */
373 if (!is_call)
374 emit_insn (gen_nvptx_joining (op));
375 emit_insn (gen_nvptx_join (op));
376 }
377 }
378
379 \f
380 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
381 returned in memory. Integer and floating types supported by the
382 machine are passed in registers, everything else is passed in
383 memory. Complex types are split. */
384
385 static bool
386 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
387 {
388 if (type)
389 {
390 if (AGGREGATE_TYPE_P (type))
391 return true;
392 if (TREE_CODE (type) == VECTOR_TYPE)
393 return true;
394 }
395
396 if (!for_return && COMPLEX_MODE_P (mode))
397 /* Complex types are passed as two underlying args. */
398 mode = GET_MODE_INNER (mode);
399
400 if (GET_MODE_CLASS (mode) != MODE_INT
401 && GET_MODE_CLASS (mode) != MODE_FLOAT)
402 return true;
403
404 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
405 return true;
406
407 return false;
408 }
409
410 /* A non-memory argument of mode MODE is being passed, determine the mode it
411 should be promoted to. This is also used for determining return
412 type promotion. */
413
414 static machine_mode
415 promote_arg (machine_mode mode, bool prototyped)
416 {
417 if (!prototyped && mode == SFmode)
418 /* K&R float promotion for unprototyped functions. */
419 mode = DFmode;
420 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
421 mode = SImode;
422
423 return mode;
424 }
425
426 /* A non-memory return type of MODE is being returned. Determine the
427 mode it should be promoted to. */
428
429 static machine_mode
430 promote_return (machine_mode mode)
431 {
432 return promote_arg (mode, true);
433 }
434
435 /* Implement TARGET_FUNCTION_ARG. */
436
437 static rtx
438 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
439 const_tree, bool named)
440 {
441 if (mode == VOIDmode || !named)
442 return NULL_RTX;
443
444 return gen_reg_rtx (mode);
445 }
446
447 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
448
449 static rtx
450 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
451 const_tree, bool named)
452 {
453 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
454
455 if (mode == VOIDmode || !named)
456 return NULL_RTX;
457
458 /* No need to deal with split modes here, the only case that can
459 happen is complex modes and those are dealt with by
460 TARGET_SPLIT_COMPLEX_ARG. */
461 return gen_rtx_UNSPEC (mode,
462 gen_rtvec (1, GEN_INT (cum->count)),
463 UNSPEC_ARG_REG);
464 }
465
466 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
467
468 static void
469 nvptx_function_arg_advance (cumulative_args_t cum_v,
470 machine_mode ARG_UNUSED (mode),
471 const_tree ARG_UNUSED (type),
472 bool ARG_UNUSED (named))
473 {
474 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
475
476 cum->count++;
477 }
478
479 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
480
481 For nvptx this is only used for variadic args. The type has already
482 been promoted and/or converted to invisible reference. */
483
484 static unsigned
485 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
486 {
487 return GET_MODE_ALIGNMENT (mode);
488 }
489
490 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
491
492 For nvptx, we know how to handle functions declared as stdarg: by
493 passing an extra pointer to the unnamed arguments. However, the
494 Fortran frontend can produce a different situation, where a
495 function pointer is declared with no arguments, but the actual
496 function and calls to it take more arguments. In that case, we
497 want to ensure the call matches the definition of the function. */
498
499 static bool
500 nvptx_strict_argument_naming (cumulative_args_t cum_v)
501 {
502 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
503
504 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
505 }
506
507 /* Implement TARGET_LIBCALL_VALUE. */
508
509 static rtx
510 nvptx_libcall_value (machine_mode mode, const_rtx)
511 {
512 if (!cfun || !cfun->machine->doing_call)
513 /* Pretend to return in a hard reg for early uses before pseudos can be
514 generated. */
515 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
516
517 return gen_reg_rtx (mode);
518 }
519
520 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
521 where function FUNC returns or receives a value of data type TYPE. */
522
523 static rtx
524 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
525 bool outgoing)
526 {
527 machine_mode mode = promote_return (TYPE_MODE (type));
528
529 if (outgoing)
530 {
531 gcc_assert (cfun);
532 cfun->machine->return_mode = mode;
533 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
534 }
535
536 return nvptx_libcall_value (mode, NULL_RTX);
537 }
538
539 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
540
541 static bool
542 nvptx_function_value_regno_p (const unsigned int regno)
543 {
544 return regno == NVPTX_RETURN_REGNUM;
545 }
546
547 /* Types with a mode other than those supported by the machine are passed by
548 reference in memory. */
549
550 static bool
551 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
552 machine_mode mode, const_tree type,
553 bool ARG_UNUSED (named))
554 {
555 return pass_in_memory (mode, type, false);
556 }
557
558 /* Implement TARGET_RETURN_IN_MEMORY. */
559
560 static bool
561 nvptx_return_in_memory (const_tree type, const_tree)
562 {
563 return pass_in_memory (TYPE_MODE (type), type, true);
564 }
565
566 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
567
568 static machine_mode
569 nvptx_promote_function_mode (const_tree type, machine_mode mode,
570 int *ARG_UNUSED (punsignedp),
571 const_tree funtype, int for_return)
572 {
573 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
574 }
575
576 /* Helper for write_arg. Emit a single PTX argument of MODE, either
577 in a prototype, or as copy in a function prologue. ARGNO is the
578 index of this argument in the PTX function. FOR_REG is negative,
579 if we're emitting the PTX prototype. It is zero if we're copying
580 to an argument register and it is greater than zero if we're
581 copying to a specific hard register. */
582
583 static int
584 write_arg_mode (std::stringstream &s, int for_reg, int argno,
585 machine_mode mode)
586 {
587 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
588
589 if (for_reg < 0)
590 {
591 /* Writing PTX prototype. */
592 s << (argno ? ", " : " (");
593 s << ".param" << ptx_type << " %in_ar" << argno;
594 }
595 else
596 {
597 s << "\t.reg" << ptx_type << " ";
598 if (for_reg)
599 s << reg_names[for_reg];
600 else
601 s << "%ar" << argno;
602 s << ";\n";
603 if (argno >= 0)
604 {
605 s << "\tld.param" << ptx_type << " ";
606 if (for_reg)
607 s << reg_names[for_reg];
608 else
609 s << "%ar" << argno;
610 s << ", [%in_ar" << argno << "];\n";
611 }
612 }
613 return argno + 1;
614 }
615
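/* For illustration (hypothetical argument): with ARGNO 0 and SImode, the
   prototype form (FOR_REG < 0) appends " (.param.u32 %in_ar0", while the
   prologue form (FOR_REG == 0) emits roughly
	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];  */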
616 /* Process function parameter TYPE to emit one or more PTX
617 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
618 is true, if this is a prototyped function, rather than an old-style
619 C declaration. Returns the next argument number to use.
620
621 The promotion behavior here must match the regular GCC function
622 parameter marshalling machinery. */
623
624 static int
625 write_arg_type (std::stringstream &s, int for_reg, int argno,
626 tree type, bool prototyped)
627 {
628 machine_mode mode = TYPE_MODE (type);
629
630 if (mode == VOIDmode)
631 return argno;
632
633 if (pass_in_memory (mode, type, false))
634 mode = Pmode;
635 else
636 {
637 bool split = TREE_CODE (type) == COMPLEX_TYPE;
638
639 if (split)
640 {
641 /* Complex types are sent as two separate args. */
642 type = TREE_TYPE (type);
643 mode = TYPE_MODE (type);
644 prototyped = true;
645 }
646
647 mode = promote_arg (mode, prototyped);
648 if (split)
649 argno = write_arg_mode (s, for_reg, argno, mode);
650 }
651
652 return write_arg_mode (s, for_reg, argno, mode);
653 }
654
655 /* Emit a PTX return as a prototype or function prologue declaration
656 for MODE. */
657
658 static void
659 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
660 {
661 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
662 const char *pfx = "\t.reg";
663 const char *sfx = ";\n";
664
665 if (for_proto)
666 pfx = "(.param", sfx = "_out) ";
667
668 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
669 }
670
671 /* Process a function return TYPE to emit a PTX return as a prototype
672 or function prologue declaration. Returns true if return is via an
673 additional pointer parameter. The promotion behavior here must
674 match the regular GCC function return marshalling. */
675
676 static bool
677 write_return_type (std::stringstream &s, bool for_proto, tree type)
678 {
679 machine_mode mode = TYPE_MODE (type);
680
681 if (mode == VOIDmode)
682 return false;
683
684 bool return_in_mem = pass_in_memory (mode, type, true);
685
686 if (return_in_mem)
687 {
688 if (for_proto)
689 return return_in_mem;
690
691 /* Named return values can cause us to return a pointer as well
692 as expect an argument for the return location. This is
693 optimization-level specific, so no caller can make use of
694 this data, but more importantly for us, we must ensure it
695 doesn't change the PTX prototype. */
696 mode = (machine_mode) cfun->machine->return_mode;
697
698 if (mode == VOIDmode)
699 return return_in_mem;
700
701 /* Clear return_mode to inhibit copy of retval to non-existent
702 retval parameter. */
703 cfun->machine->return_mode = VOIDmode;
704 }
705 else
706 mode = promote_return (mode);
707
708 write_return_mode (s, for_proto, mode);
709
710 return return_in_mem;
711 }
712
713 /* Look for attributes in ATTRS that would indicate we must write a function
714 as a .entry kernel rather than a .func. Return true if one is found. */
715
716 static bool
717 write_as_kernel (tree attrs)
718 {
719 return (lookup_attribute ("kernel", attrs) != NULL_TREE
720 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
721 }
722
723 /* Emit a linker marker for a function decl or defn. */
724
725 static void
726 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
727 const char *name)
728 {
729 s << "\n// BEGIN";
730 if (globalize)
731 s << " GLOBAL";
732 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
733 s << name << "\n";
734 }
735
736 /* Emit a linker marker for a variable decl or defn. */
737
738 static void
739 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
740 {
741 fprintf (file, "\n// BEGIN%s VAR %s: ",
742 globalize ? " GLOBAL" : "",
743 is_defn ? "DEF" : "DECL");
744 assemble_name_raw (file, name);
745 fputs ("\n", file);
746 }
747
748 /* Write a .func or .kernel declaration or definition along with
749 a helper comment for use by ld. S is the stream to write to, DECL
750 the decl for the function with name NAME. For definitions, emit
751 a declaration too. */
752
753 static const char *
754 write_fn_proto (std::stringstream &s, bool is_defn,
755 const char *name, const_tree decl)
756 {
757 if (is_defn)
758 /* Emit a declaration. The PTX assembler gets upset without it. */
759 name = write_fn_proto (s, false, name, decl);
760 else
761 {
762 /* Avoid repeating the name replacement. */
763 name = nvptx_name_replacement (name);
764 if (name[0] == '*')
765 name++;
766 }
767
768 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
769
770 /* PTX declaration. */
771 if (DECL_EXTERNAL (decl))
772 s << ".extern ";
773 else if (TREE_PUBLIC (decl))
774 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
775 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
776
777 tree fntype = TREE_TYPE (decl);
778 tree result_type = TREE_TYPE (fntype);
779
780 /* atomic_compare_exchange_$n builtins have an exceptional calling
781 convention. */
782 int not_atomic_weak_arg = -1;
783 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
784 switch (DECL_FUNCTION_CODE (decl))
785 {
786 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
787 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
788 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
789 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
790 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
791 /* These atomics skip the 'weak' parm in an actual library
792 call. We must skip it in the prototype too. */
793 not_atomic_weak_arg = 3;
794 break;
795
796 default:
797 break;
798 }
799
800 /* Declare the result. */
801 bool return_in_mem = write_return_type (s, true, result_type);
802
803 s << name;
804
805 int argno = 0;
806
807 /* Emit argument list. */
808 if (return_in_mem)
809 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
810
811 /* We get:
812 NULL in TYPE_ARG_TYPES, for old-style functions
813 NULL in DECL_ARGUMENTS, for builtin functions without another
814 declaration.
815 So we have to pick the best one we have. */
816 tree args = TYPE_ARG_TYPES (fntype);
817 bool prototyped = true;
818 if (!args)
819 {
820 args = DECL_ARGUMENTS (decl);
821 prototyped = false;
822 }
823
824 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
825 {
826 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
827
828 if (not_atomic_weak_arg)
829 argno = write_arg_type (s, -1, argno, type, prototyped);
830 else
831 gcc_assert (type == boolean_type_node);
832 }
833
834 if (stdarg_p (fntype))
835 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
836
837 if (DECL_STATIC_CHAIN (decl))
838 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
839
840 if (!argno && strcmp (name, "main") == 0)
841 {
842 argno = write_arg_type (s, -1, argno, integer_type_node, true);
843 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
844 }
845
846 if (argno)
847 s << ")";
848
849 s << (is_defn ? "\n" : ";\n");
850
851 return name;
852 }
853
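/* For illustration (a hypothetical public definition of
   "int foo (int, float)", assuming the conventional "%value" name for
   NVPTX_RETURN_REGNUM): the declaration pass emits roughly
	// BEGIN GLOBAL FUNCTION DECL: foo
	.visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0, .param.f32 %in_ar1);  */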
854 /* Construct a function declaration from a call insn. This can be
855 necessary for two reasons - either we have an indirect call which
856 requires a .callprototype declaration, or we have a libcall
857 generated by emit_library_call for which no decl exists. */
858
859 static void
860 write_fn_proto_from_insn (std::stringstream &s, const char *name,
861 rtx result, rtx pat)
862 {
863 if (!name)
864 {
865 s << "\t.callprototype ";
866 name = "_";
867 }
868 else
869 {
870 name = nvptx_name_replacement (name);
871 write_fn_marker (s, false, true, name);
872 s << "\t.extern .func ";
873 }
874
875 if (result != NULL_RTX)
876 write_return_mode (s, true, GET_MODE (result));
877
878 s << name;
879
880 int arg_end = XVECLEN (pat, 0);
881 for (int i = 1; i < arg_end; i++)
882 {
883 /* We don't have to deal with mode splitting & promotion here,
884 as that was already done when generating the call
885 sequence. */
886 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
887
888 write_arg_mode (s, -1, i - 1, mode);
889 }
890 if (arg_end != 1)
891 s << ")";
892 s << ";\n";
893 }
894
895 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
896 table and write a ptx prototype. These are emitted at end of
897 compilation. */
898
899 static void
900 nvptx_record_fndecl (tree decl)
901 {
902 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
903 if (*slot == NULL)
904 {
905 *slot = decl;
906 const char *name = get_fnname_from_decl (decl);
907 write_fn_proto (func_decls, false, name, decl);
908 }
909 }
910
911 /* Record a libcall or unprototyped external function. CALLEE is the
912 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
913 declaration for it. */
914
915 static void
916 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
917 {
918 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
919 if (*slot == NULL)
920 {
921 *slot = callee;
922
923 const char *name = XSTR (callee, 0);
924 write_fn_proto_from_insn (func_decls, name, retval, pat);
925 }
926 }
927
928 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
929 is prototyped, record it now. Otherwise record it as needed at end
930 of compilation, when we might have more information about it. */
931
932 void
933 nvptx_record_needed_fndecl (tree decl)
934 {
935 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
936 {
937 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
938 if (*slot == NULL)
939 *slot = decl;
940 }
941 else
942 nvptx_record_fndecl (decl);
943 }
944
945 /* SYM is a SYMBOL_REF. If it refers to an external function, record
946 it as needed. */
947
948 static void
949 nvptx_maybe_record_fnsym (rtx sym)
950 {
951 tree decl = SYMBOL_REF_DECL (sym);
952
953 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
954 nvptx_record_needed_fndecl (decl);
955 }
956
957 /* Emit a local array to hold some part of a conventional stack frame
958 and initialize REGNO to point to it. If the size is zero, it'll
959 never be valid to dereference, so we can simply initialize to
960 zero. */
961
962 static void
963 init_frame (FILE *file, int regno, unsigned align, unsigned size)
964 {
965 if (size)
966 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
967 align, reg_names[regno], size);
968 fprintf (file, "\t.reg.u%d %s;\n",
969 POINTER_SIZE, reg_names[regno]);
970 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
971 : "\tmov.u%d %s, 0;\n"),
972 POINTER_SIZE, reg_names[regno], reg_names[regno]);
973 }
974
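/* For illustration (assuming the conventional "%frame" name for
   FRAME_POINTER_REGNUM and 64-bit pointers): a 16-byte, 8-aligned frame
   is emitted roughly as
	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;
   whereas a zero-sized frame gets only the .reg declaration followed by
   "mov.u64 %frame, 0;".  */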
975 /* Emit code to initialize the REGNO predicate register to indicate
976 whether we are not lane zero on the NAME axis. */
977
978 static void
979 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
980 {
981 fprintf (file, "\t{\n");
982 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
983 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
984 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
985 fprintf (file, "\t}\n");
986 }
987
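/* For illustration (hypothetical predicate register 42 on the "y" axis),
   the block emitted above is roughly
	{
		.reg.u32	%y;
		mov.u32	%y, %tid.y;
		setp.ne.u32	%r42, %y, 0;
	}  */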
988 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
989 function, including local var decls and copies from the arguments to
990 local regs. */
991
992 void
993 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
994 {
995 tree fntype = TREE_TYPE (decl);
996 tree result_type = TREE_TYPE (fntype);
997 int argno = 0;
998
999 /* We construct the initial part of the function into a string
1000 stream, in order to share the prototype writing code. */
1001 std::stringstream s;
1002 write_fn_proto (s, true, name, decl);
1003 s << "{\n";
1004
1005 bool return_in_mem = write_return_type (s, false, result_type);
1006 if (return_in_mem)
1007 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1008
1009 /* Declare and initialize incoming arguments. */
1010 tree args = TYPE_ARG_TYPES (fntype);
1011 bool prototyped = true;
1012 if (!args)
1013 {
1014 args = DECL_ARGUMENTS (decl);
1015 prototyped = false;
1016 }
1017
1018 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1019 {
1020 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1021
1022 argno = write_arg_type (s, 0, argno, type, prototyped);
1023 }
1024
1025 if (stdarg_p (fntype))
1026 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1027 true);
1028
1029 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1030 write_arg_type (s, STATIC_CHAIN_REGNUM,
1031 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1032 true);
1033
1034 fprintf (file, "%s", s.str().c_str());
1035
1036 /* Declare a local var for outgoing varargs. */
1037 if (cfun->machine->has_varadic)
1038 init_frame (file, STACK_POINTER_REGNUM,
1039 UNITS_PER_WORD, crtl->outgoing_args_size);
1040
1041 /* Declare a local variable for the frame. Force its size to be
1042 DImode-compatible. */
1043 HOST_WIDE_INT sz = get_frame_size ();
1044 if (sz || cfun->machine->has_chain)
1045 init_frame (file, FRAME_POINTER_REGNUM,
1046 crtl->stack_alignment_needed / BITS_PER_UNIT,
1047 (sz + GET_MODE_SIZE (DImode) - 1)
1048 & ~(HOST_WIDE_INT)(GET_MODE_SIZE (DImode) - 1));
1049
1050 /* Declare the pseudos we have as ptx registers. */
1051 int maxregs = max_reg_num ();
1052 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1053 {
1054 if (regno_reg_rtx[i] != const0_rtx)
1055 {
1056 machine_mode mode = PSEUDO_REGNO_MODE (i);
1057 machine_mode split = maybe_split_mode (mode);
1058
1059 if (split != VOIDmode)
1060 mode = split;
1061 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1062 output_reg (file, i, split, -2);
1063 fprintf (file, ";\n");
1064 }
1065 }
1066
1067 /* Emit axis predicates. */
1068 if (cfun->machine->axis_predicate[0])
1069 nvptx_init_axis_predicate (file,
1070 REGNO (cfun->machine->axis_predicate[0]), "y");
1071 if (cfun->machine->axis_predicate[1])
1072 nvptx_init_axis_predicate (file,
1073 REGNO (cfun->machine->axis_predicate[1]), "x");
1074 }
1075
1076 /* Output a return instruction. Also copy the return value to its outgoing
1077 location. */
1078
1079 const char *
1080 nvptx_output_return (void)
1081 {
1082 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1083
1084 if (mode != VOIDmode)
1085 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1086 nvptx_ptx_type_from_mode (mode, false),
1087 reg_names[NVPTX_RETURN_REGNUM],
1088 reg_names[NVPTX_RETURN_REGNUM]);
1089
1090 return "ret;";
1091 }
1092
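/* For illustration (assuming the conventional "%value" name for
   NVPTX_RETURN_REGNUM): a function returning an SImode value ends
   roughly with
	st.param.u32	[%value_out], %value;
	ret;  */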
1093 /* Terminate a function by writing a closing brace to FILE. */
1094
1095 void
1096 nvptx_function_end (FILE *file)
1097 {
1098 fprintf (file, "}\n");
1099 }
1100 \f
1101 /* Decide whether we can make a sibling call to a function. For ptx, we
1102 can't. */
1103
1104 static bool
1105 nvptx_function_ok_for_sibcall (tree, tree)
1106 {
1107 return false;
1108 }
1109
1110 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1111
1112 static rtx
1113 nvptx_get_drap_rtx (void)
1114 {
1115 return NULL_RTX;
1116 }
1117
1118 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1119 argument to the next call. */
1120
1121 static void
1122 nvptx_call_args (rtx arg, tree fntype)
1123 {
1124 if (!cfun->machine->doing_call)
1125 {
1126 cfun->machine->doing_call = true;
1127 cfun->machine->is_varadic = false;
1128 cfun->machine->num_args = 0;
1129
1130 if (fntype && stdarg_p (fntype))
1131 {
1132 cfun->machine->is_varadic = true;
1133 cfun->machine->has_varadic = true;
1134 cfun->machine->num_args++;
1135 }
1136 }
1137
1138 if (REG_P (arg) && arg != pc_rtx)
1139 {
1140 cfun->machine->num_args++;
1141 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1142 cfun->machine->call_args);
1143 }
1144 }
1145
1146 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1147 information we recorded. */
1148
1149 static void
1150 nvptx_end_call_args (void)
1151 {
1152 cfun->machine->doing_call = false;
1153 free_EXPR_LIST_list (&cfun->machine->call_args);
1154 }
1155
1156 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1157 track of whether calls involving static chains or varargs were seen
1158 in the current function.
1159 For libcalls, maintain a hash table of decls we have seen, and
1160 record a function decl for later when encountering a new one. */
1161
1162 void
1163 nvptx_expand_call (rtx retval, rtx address)
1164 {
1165 rtx callee = XEXP (address, 0);
1166 rtx varargs = NULL_RTX;
1167 unsigned parallel = 0;
1168
1169 if (!call_insn_operand (callee, Pmode))
1170 {
1171 callee = force_reg (Pmode, callee);
1172 address = change_address (address, QImode, callee);
1173 }
1174
1175 if (GET_CODE (callee) == SYMBOL_REF)
1176 {
1177 tree decl = SYMBOL_REF_DECL (callee);
1178 if (decl != NULL_TREE)
1179 {
1180 if (DECL_STATIC_CHAIN (decl))
1181 cfun->machine->has_chain = true;
1182
1183 tree attr = get_oacc_fn_attrib (decl);
1184 if (attr)
1185 {
1186 tree dims = TREE_VALUE (attr);
1187
1188 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1189 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1190 {
1191 if (TREE_PURPOSE (dims)
1192 && !integer_zerop (TREE_PURPOSE (dims)))
1193 break;
1194 /* Not on this axis. */
1195 parallel ^= GOMP_DIM_MASK (ix);
1196 dims = TREE_CHAIN (dims);
1197 }
1198 }
1199 }
1200 }
1201
1202 unsigned nargs = cfun->machine->num_args;
1203 if (cfun->machine->is_varadic)
1204 {
1205 varargs = gen_reg_rtx (Pmode);
1206 emit_move_insn (varargs, stack_pointer_rtx);
1207 }
1208
1209 rtvec vec = rtvec_alloc (nargs + 1);
1210 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1211 int vec_pos = 0;
1212
1213 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1214 rtx tmp_retval = retval;
1215 if (retval)
1216 {
1217 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1218 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1219 call = gen_rtx_SET (tmp_retval, call);
1220 }
1221 XVECEXP (pat, 0, vec_pos++) = call;
1222
1223 /* Construct the call insn, including a USE for each argument pseudo
1224 register. These will be used when printing the insn. */
1225 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1226 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1227
1228 if (varargs)
1229 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1230
1231 gcc_assert (vec_pos == XVECLEN (pat, 0));
1232
1233 nvptx_emit_forking (parallel, true);
1234 emit_call_insn (pat);
1235 nvptx_emit_joining (parallel, true);
1236
1237 if (tmp_retval != retval)
1238 emit_move_insn (retval, tmp_retval);
1239 }
1240
1241 /* Emit a comparison COMPARE, and return the new test to be used in the
1242 jump. */
1243
1244 rtx
1245 nvptx_expand_compare (rtx compare)
1246 {
1247 rtx pred = gen_reg_rtx (BImode);
1248 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1249 XEXP (compare, 0), XEXP (compare, 1));
1250 emit_insn (gen_rtx_SET (pred, cmp));
1251 return gen_rtx_NE (BImode, pred, const0_rtx);
1252 }
1253
1254 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1255
1256 void
1257 nvptx_expand_oacc_fork (unsigned mode)
1258 {
1259 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1260 }
1261
1262 void
1263 nvptx_expand_oacc_join (unsigned mode)
1264 {
1265 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1266 }
1267
1268 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1269 objects. */
1270
1271 static rtx
1272 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1273 {
1274 rtx res;
1275
1276 switch (GET_MODE (src))
1277 {
1278 case DImode:
1279 res = gen_unpackdisi2 (dst0, dst1, src);
1280 break;
1281 case DFmode:
1282 res = gen_unpackdfsi2 (dst0, dst1, src);
1283 break;
1284 default: gcc_unreachable ();
1285 }
1286 return res;
1287 }
1288
1289 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1290 object. */
1291
1292 static rtx
1293 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1294 {
1295 rtx res;
1296
1297 switch (GET_MODE (dst))
1298 {
1299 case DImode:
1300 res = gen_packsidi2 (dst, src0, src1);
1301 break;
1302 case DFmode:
1303 res = gen_packsidf2 (dst, src0, src1);
1304 break;
1305 default: gcc_unreachable ();
1306 }
1307 return res;
1308 }
1309
1310 /* Generate an instruction or sequence to broadcast register REG
1311 across the vectors of a single warp. */
1312
1313 static rtx
1314 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1315 {
1316 rtx res;
1317
1318 switch (GET_MODE (dst))
1319 {
1320 case SImode:
1321 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1322 break;
1323 case SFmode:
1324 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1325 break;
1326 case DImode:
1327 case DFmode:
1328 {
1329 rtx tmp0 = gen_reg_rtx (SImode);
1330 rtx tmp1 = gen_reg_rtx (SImode);
1331
1332 start_sequence ();
1333 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1334 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1335 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1336 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1337 res = get_insns ();
1338 end_sequence ();
1339 }
1340 break;
1341 case BImode:
1342 {
1343 rtx tmp = gen_reg_rtx (SImode);
1344
1345 start_sequence ();
1346 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1347 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1348 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1349 res = get_insns ();
1350 end_sequence ();
1351 }
1352 break;
1353 case QImode:
1354 case HImode:
1355 {
1356 rtx tmp = gen_reg_rtx (SImode);
1357
1358 start_sequence ();
1359 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1360 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1361 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1362 tmp)));
1363 res = get_insns ();
1364 end_sequence ();
1365 }
1366 break;
1367
1368 default:
1369 gcc_unreachable ();
1370 }
1371 return res;
1372 }
1373
1374 /* Generate an instruction or sequence to broadcast register REG
1375 across the vectors of a single warp. */
1376
1377 static rtx
1378 nvptx_gen_vcast (rtx reg)
1379 {
1380 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1381 }
1382
1383 /* Structure used when generating a worker-level spill or fill. */
1384
1385 struct wcast_data_t
1386 {
1387 rtx base; /* Register holding base addr of buffer. */
1388 rtx ptr; /* Iteration var, if needed. */
1389 unsigned offset; /* Offset into worker buffer. */
1390 };
1391
1392 /* Direction of the spill/fill and looping setup/teardown indicator. */
1393
1394 enum propagate_mask
1395 {
1396 PM_read = 1 << 0,
1397 PM_write = 1 << 1,
1398 PM_loop_begin = 1 << 2,
1399 PM_loop_end = 1 << 3,
1400
1401 PM_read_write = PM_read | PM_write
1402 };
1403
1404 /* Generate instruction(s) to spill or fill register REG to/from the
1405 worker broadcast array. PM indicates what is to be done, REP
1406 how many loop iterations will be executed (0 for not a loop). */
1407
1408 static rtx
1409 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1410 {
1411 rtx res;
1412 machine_mode mode = GET_MODE (reg);
1413
1414 switch (mode)
1415 {
1416 case BImode:
1417 {
1418 rtx tmp = gen_reg_rtx (SImode);
1419
1420 start_sequence ();
1421 if (pm & PM_read)
1422 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1423 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1424 if (pm & PM_write)
1425 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1426 res = get_insns ();
1427 end_sequence ();
1428 }
1429 break;
1430
1431 default:
1432 {
1433 rtx addr = data->ptr;
1434
1435 if (!addr)
1436 {
1437 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1438
1439 if (align > worker_bcast_align)
1440 worker_bcast_align = align;
1441 data->offset = (data->offset + align - 1) & ~(align - 1);
1442 addr = data->base;
1443 if (data->offset)
1444 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1445 }
1446
1447 addr = gen_rtx_MEM (mode, addr);
1448 if (pm == PM_read)
1449 res = gen_rtx_SET (addr, reg);
1450 else if (pm == PM_write)
1451 res = gen_rtx_SET (reg, addr);
1452 else
1453 gcc_unreachable ();
1454
1455 if (data->ptr)
1456 {
1457 /* We're using a ptr, increment it. */
1458 start_sequence ();
1459
1460 emit_insn (res);
1461 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1462 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1463 res = get_insns ();
1464 end_sequence ();
1465 }
1466 else
1467 rep = 1;
1468 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1469 }
1470 break;
1471 }
1472 return res;
1473 }
1474 \f
1475 /* Returns true if X is a valid address for use in a memory reference. */
1476
1477 static bool
1478 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1479 {
1480 enum rtx_code code = GET_CODE (x);
1481
1482 switch (code)
1483 {
1484 case REG:
1485 return true;
1486
1487 case PLUS:
1488 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1489 return true;
1490 return false;
1491
1492 case CONST:
1493 case SYMBOL_REF:
1494 case LABEL_REF:
1495 return true;
1496
1497 default:
1498 return false;
1499 }
1500 }
1501 \f
1502 /* Machinery to output constant initializers. When beginning an
1503 initializer, we decide on a fragment size (which is visible in ptx
1504 in the type used), and then all initializer data is buffered until
1505 a fragment is filled and ready to be written out. */
1506
1507 static struct
1508 {
1509 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1510 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1511 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1512 out. */
1513 unsigned size; /* Fragment size to accumulate. */
1514 unsigned offset; /* Offset within current fragment. */
1515 bool started; /* Whether we've output any initializer. */
1516 } init_frag;
1517
1518 /* The current fragment is full, write it out. SYM may provide a
1519 symbolic reference we should output, in which case the fragment
1520 value is the addend. */
1521
1522 static void
1523 output_init_frag (rtx sym)
1524 {
1525 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1526 unsigned HOST_WIDE_INT val = init_frag.val;
1527
1528 init_frag.started = true;
1529 init_frag.val = 0;
1530 init_frag.offset = 0;
1531 init_frag.remaining--;
1532
1533 if (sym)
1534 {
1535 fprintf (asm_out_file, "generic(");
1536 output_address (VOIDmode, sym);
1537 fprintf (asm_out_file, val ? ") + " : ")");
1538 }
1539
1540 if (!sym || val)
1541 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1542 }
1543
1544 /* Add value VAL of size SIZE to the data we're emitting, and keep
1545 writing out chunks as they fill up. */
1546
1547 static void
1548 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1549 {
1550 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1551
1552 for (unsigned part = 0; size; size -= part)
1553 {
1554 val >>= part * BITS_PER_UNIT;
1555 part = init_frag.size - init_frag.offset;
1556 if (part > size)
1557 part = size;
1558
1559 unsigned HOST_WIDE_INT partial
1560 = val << (init_frag.offset * BITS_PER_UNIT);
1561 init_frag.val |= partial & init_frag.mask;
1562 init_frag.offset += part;
1563
1564 if (init_frag.offset == init_frag.size)
1565 output_init_frag (NULL);
1566 }
1567 }
1568
1569 /* Target hook for assembling integer object X of size SIZE. */
1570
1571 static bool
1572 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1573 {
1574 HOST_WIDE_INT val = 0;
1575
1576 switch (GET_CODE (x))
1577 {
1578 default:
1579 /* Let the generic machinery figure it out, usually for a
1580 CONST_WIDE_INT. */
1581 return false;
1582
1583 case CONST_INT:
1584 nvptx_assemble_value (INTVAL (x), size);
1585 break;
1586
1587 case CONST:
1588 x = XEXP (x, 0);
1589 gcc_assert (GET_CODE (x) == PLUS);
1590 val = INTVAL (XEXP (x, 1));
1591 x = XEXP (x, 0);
1592 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1593 /* FALLTHROUGH */
1594
1595 case SYMBOL_REF:
1596 gcc_assert (size == init_frag.size);
1597 if (init_frag.offset)
1598 sorry ("cannot emit unaligned pointers in ptx assembly");
1599
1600 nvptx_maybe_record_fnsym (x);
1601 init_frag.val = val;
1602 output_init_frag (x);
1603 break;
1604 }
1605
1606 return true;
1607 }
1608
1609 /* Output SIZE zero bytes. We ignore the FILE argument since the
1610 functions we're calling to perform the output just use
1611 asm_out_file. */
1612
1613 void
1614 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1615 {
1616 /* Finish the current fragment, if it's started. */
1617 if (init_frag.offset)
1618 {
1619 unsigned part = init_frag.size - init_frag.offset;
1620 if (part > size)
1621 part = (unsigned) size;
1622 size -= part;
1623 nvptx_assemble_value (0, part);
1624 }
1625
1626 /* If this skip doesn't terminate the initializer, write as many
1627 remaining pieces as possible directly. */
1628 if (size < init_frag.remaining * init_frag.size)
1629 {
1630 while (size >= init_frag.size)
1631 {
1632 size -= init_frag.size;
1633 output_init_frag (NULL_RTX);
1634 }
1635 if (size)
1636 nvptx_assemble_value (0, size);
1637 }
1638 }
1639
1640 /* Output a string STR with length SIZE. As in nvptx_output_skip we
1641 ignore the FILE arg. */
1642
1643 void
1644 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1645 {
1646 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1647 nvptx_assemble_value (str[i], 1);
1648 }
1649
1650 /* Emit a PTX variable decl and prepare for emission of its
1651 initializer. NAME is the symbol name and SECTION the PTX data
1652 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1653 The caller has already emitted any indentation and linkage
1654 specifier. It is responsible for any initializer, terminating ;
1655 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1656 this is the opposite way round from how PTX wants them! */
1657
1658 static void
1659 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1660 const_tree type, HOST_WIDE_INT size, unsigned align)
1661 {
1662 while (TREE_CODE (type) == ARRAY_TYPE)
1663 type = TREE_TYPE (type);
1664
1665 if (TREE_CODE (type) == VECTOR_TYPE
1666 || TREE_CODE (type) == COMPLEX_TYPE)
1667 /* Neither vector nor complex types can contain the other. */
1668 type = TREE_TYPE (type);
1669
1670 unsigned elt_size = int_size_in_bytes (type);
1671
1672 /* Largest mode we're prepared to accept. For BLKmode types we
1673 don't know if it'll contain pointer constants, so have to choose
1674 pointer size, otherwise we can choose DImode. */
1675 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1676
1677 elt_size |= GET_MODE_SIZE (elt_mode);
1678 elt_size &= -elt_size; /* Extract LSB set. */
1679
1680 init_frag.size = elt_size;
1681 /* Avoid undefined shift behavior by using '2'. */
1682 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1683 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1684 init_frag.val = 0;
1685 init_frag.offset = 0;
1686 init_frag.started = false;
1687 /* Size might not be a multiple of elt size, if there's an
1688 initialized trailing struct array with smaller type than
1689 elt_size. */
1690 init_frag.remaining = (size + elt_size - 1) / elt_size;
1691
1692 fprintf (file, "%s .align %d .u%d ",
1693 section, align / BITS_PER_UNIT,
1694 elt_size * BITS_PER_UNIT);
1695 assemble_name (file, name);
1696
1697 if (size)
1698 /* We make everything an array, to simplify any initialization
1699 emission. */
1700 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
1701 }
1702
1703 /* Called when the initializer for a decl has been completely output through
1704 combinations of the three functions above. */
1705
1706 static void
1707 nvptx_assemble_decl_end (void)
1708 {
1709 if (init_frag.offset)
1710 /* This can happen with a packed struct with a trailing array member. */
1711 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1712 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
1713 }
1714
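/* For illustration (hypothetical constant "name"): emitting the 3-byte
   string "ok" (including its NUL) into the .const area through
   nvptx_assemble_decl_begin, nvptx_output_ascii and
   nvptx_assemble_decl_end produces roughly
	.const .align 1 .u8 name[3] = { 111, 107, 0 };  */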
1715 /* Output an uninitialized common or file-scope variable. */
1716
1717 void
1718 nvptx_output_aligned_decl (FILE *file, const char *name,
1719 const_tree decl, HOST_WIDE_INT size, unsigned align)
1720 {
1721 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1722
1723 /* If this is public, it is common. The nearest thing we have to
1724 common is weak. */
1725 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1726
1727 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1728 TREE_TYPE (decl), size, align);
1729 nvptx_assemble_decl_end ();
1730 }
1731
1732 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1733 writing a constant variable EXP with NAME and SIZE and its
1734 initializer to FILE. */
1735
1736 static void
1737 nvptx_asm_declare_constant_name (FILE *file, const char *name,
1738 const_tree exp, HOST_WIDE_INT obj_size)
1739 {
1740 write_var_marker (file, true, false, name);
1741
1742 fprintf (file, "\t");
1743
1744 tree type = TREE_TYPE (exp);
1745 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1746 TYPE_ALIGN (type));
1747 }
1748
1749 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1750 a variable DECL with NAME to FILE. */
1751
1752 void
1753 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1754 {
1755 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1756
1757 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1758 : DECL_WEAK (decl) ? ".weak " : ".visible "));
1759
1760 tree type = TREE_TYPE (decl);
1761 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1762 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1763 type, obj_size, DECL_ALIGN (decl));
1764 }
1765
1766 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
1767
1768 static void
1769 nvptx_globalize_label (FILE *, const char *)
1770 {
1771 }
1772
1773 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1774 declaration only for variable DECL with NAME to FILE. */
1775
1776 static void
1777 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1778 {
1779 /* The middle end can place constant pool decls into the varpool as
1780 undefined. Until that is fixed, catch the problem here. */
1781 if (DECL_IN_CONSTANT_POOL (decl))
1782 return;
1783
1784 /* We support weak definitions, and hence have the right
1785 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
1786 if (DECL_WEAK (decl))
1787 error_at (DECL_SOURCE_LOCATION (decl),
1788 "PTX does not support weak declarations"
1789 " (only weak definitions)");
1790 write_var_marker (file, false, TREE_PUBLIC (decl), name);
1791
1792 fprintf (file, "\t.extern ");
1793 tree size = DECL_SIZE_UNIT (decl);
1794 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1795 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
1796 DECL_ALIGN (decl));
1797 nvptx_assemble_decl_end ();
1798 }
1799
1800 /* Output a pattern for a move instruction. */
1801
1802 const char *
1803 nvptx_output_mov_insn (rtx dst, rtx src)
1804 {
1805 machine_mode dst_mode = GET_MODE (dst);
1806 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
1807 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
1808 machine_mode src_inner = (GET_CODE (src) == SUBREG
1809 ? GET_MODE (XEXP (src, 0)) : dst_mode);
1810
1811 rtx sym = src;
1812 if (GET_CODE (sym) == CONST)
1813 sym = XEXP (XEXP (sym, 0), 0);
1814 if (SYMBOL_REF_P (sym))
1815 {
1816 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
1817 return "%.\tcvta%D1%t0\t%0, %1;";
1818 nvptx_maybe_record_fnsym (sym);
1819 }
1820
1821 if (src_inner == dst_inner)
1822 return "%.\tmov%t0\t%0, %1;";
1823
1824 if (CONSTANT_P (src))
1825 return (GET_MODE_CLASS (dst_inner) == MODE_INT
1826 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
1827 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
1828
1829 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
1830 return "%.\tmov.b%T0\t%0, %1;";
1831
1832 return "%.\tcvt%t0%t1\t%0, %1;";
1833 }
1834
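/* For illustration (hypothetical operands): moving the address of a
   .global-area symbol "arr" into a DImode pseudo selects the cvta
   template above and prints roughly "cvta.global.u64 %r23, arr;"; a
   same-size register-to-register copy prints as a plain "mov".  */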
1835 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
1836 involves writing .param declarations and in/out copies into them. For
1837 indirect calls, also write the .callprototype. */
1838
1839 const char *
1840 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1841 {
1842 char buf[16];
1843 static int labelno;
1844 bool needs_tgt = register_operand (callee, Pmode);
1845 rtx pat = PATTERN (insn);
1846 int arg_end = XVECLEN (pat, 0);
1847 tree decl = NULL_TREE;
1848
1849 fprintf (asm_out_file, "\t{\n");
1850 if (result != NULL)
1851 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
1852 nvptx_ptx_type_from_mode (GET_MODE (result), false),
1853 reg_names[NVPTX_RETURN_REGNUM]);
1854
1855 /* Ensure we have a ptx declaration in the output if necessary. */
1856 if (GET_CODE (callee) == SYMBOL_REF)
1857 {
1858 decl = SYMBOL_REF_DECL (callee);
1859 if (!decl
1860 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1861 nvptx_record_libfunc (callee, result, pat);
1862 else if (DECL_EXTERNAL (decl))
1863 nvptx_record_fndecl (decl);
1864 }
1865
1866 if (needs_tgt)
1867 {
1868 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1869 labelno++;
1870 ASM_OUTPUT_LABEL (asm_out_file, buf);
1871 std::stringstream s;
1872 write_fn_proto_from_insn (s, NULL, result, pat);
1873 fputs (s.str().c_str(), asm_out_file);
1874 }
1875
1876 for (int argno = 1; argno < arg_end; argno++)
1877 {
1878 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1879 machine_mode mode = GET_MODE (t);
1880 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
1881
1882 /* Mode splitting has already been done. */
1883 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
1884 "\t\tst.param%s [%%out_arg%d], ",
1885 ptx_type, argno, ptx_type, argno);
1886 output_reg (asm_out_file, REGNO (t), VOIDmode);
1887 fprintf (asm_out_file, ";\n");
1888 }
1889
1890 fprintf (asm_out_file, "\t\tcall ");
1891 if (result != NULL_RTX)
1892 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
1893
1894 if (decl)
1895 {
1896 const char *name = get_fnname_from_decl (decl);
1897 name = nvptx_name_replacement (name);
1898 assemble_name (asm_out_file, name);
1899 }
1900 else
1901 output_address (VOIDmode, callee);
1902
1903 const char *open = "(";
1904 for (int argno = 1; argno < arg_end; argno++)
1905 {
1906 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1907 open = "";
1908 }
1909 if (decl && DECL_STATIC_CHAIN (decl))
1910 {
1911 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
1912 open = "";
1913 }
1914 if (!open[0])
1915 fprintf (asm_out_file, ")");
1916
1917 if (needs_tgt)
1918 {
1919 fprintf (asm_out_file, ", ");
1920 assemble_name (asm_out_file, buf);
1921 }
1922 fprintf (asm_out_file, ";\n");
1923
1924 if (find_reg_note (insn, REG_NORETURN, NULL))
1925 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
1926 the flow control barrier they imply. It can seg fault if it
1927 encounters what looks like an unexitable loop. Emit a trailing
1928 trap, which it does grok. */
1929 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1930
1931 if (result)
1932 {
1933 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
1934
1935 if (!rval[0])
1936 /* We must escape the '%' that starts RETURN_REGNUM. */
1937 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
1938 reg_names[NVPTX_RETURN_REGNUM]);
1939 return rval;
1940 }
1941
1942 return "}";
1943 }
1944
1945 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1946
1947 static bool
1948 nvptx_print_operand_punct_valid_p (unsigned char c)
1949 {
1950 return c == '.' || c == '#';
1951 }
1952
1953 static void nvptx_print_operand (FILE *, rtx, int);
1954
1955 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1956
1957 static void
1958 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1959 {
1960 rtx off;
1961 if (GET_CODE (x) == CONST)
1962 x = XEXP (x, 0);
1963 switch (GET_CODE (x))
1964 {
1965 case PLUS:
1966 off = XEXP (x, 1);
1967 output_address (VOIDmode, XEXP (x, 0));
1968 fprintf (file, "+");
1969 output_address (VOIDmode, off);
1970 break;
1971
1972 case SYMBOL_REF:
1973 case LABEL_REF:
1974 output_addr_const (file, x);
1975 break;
1976
1977 default:
1978 gcc_assert (GET_CODE (x) != MEM);
1979 nvptx_print_operand (file, x, 0);
1980 break;
1981 }
1982 }
1983
1984 /* Write assembly language output for the address ADDR to FILE. */
1985
1986 static void
1987 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1988 {
1989 nvptx_print_address_operand (file, addr, mode);
1990 }
1991
1992 /* Print an operand, X, to FILE, with an optional modifier in CODE.
1993
1994 Meaning of CODE:
1995 . -- print the predicate for the instruction or an empty string for an
1996 unconditional one.
1997 # -- print a rounding mode for the instruction
1998
1999 A -- print a data area for a MEM
2000 c -- print an opcode suffix for a comparison operator, including a type code
2001 D -- print a data area for a MEM operand
2002 S -- print a shuffle kind specified by CONST_INT
2003 t -- print a type opcode suffix, promoting QImode to 32 bits
2004 T -- print a type size in bits
2005 u -- print a type opcode suffix without promotions. */
2006
2007 static void
2008 nvptx_print_operand (FILE *file, rtx x, int code)
2009 {
2010 if (code == '.')
2011 {
2012 x = current_insn_predicate;
2013 if (x)
2014 {
2015 unsigned int regno = REGNO (XEXP (x, 0));
2016 fputs ("[", file);
2017 if (GET_CODE (x) == EQ)
2018 fputs ("!", file);
2019 fputs (reg_names [regno], file);
2020 fputs ("]", file);
2021 }
2022 return;
2023 }
2024 else if (code == '#')
2025 {
2026 fputs (".rn", file);
2027 return;
2028 }
2029
2030 enum rtx_code x_code = GET_CODE (x);
2031 machine_mode mode = GET_MODE (x);
2032
2033 switch (code)
2034 {
2035 case 'A':
2036 x = XEXP (x, 0);
2037 /* FALLTHROUGH. */
2038
2039 case 'D':
2040 if (GET_CODE (x) == CONST)
2041 x = XEXP (x, 0);
2042 if (GET_CODE (x) == PLUS)
2043 x = XEXP (x, 0);
2044
2045 if (GET_CODE (x) == SYMBOL_REF)
2046 fputs (section_for_sym (x), file);
2047 break;
2048
2049 case 't':
2050 case 'u':
2051 if (x_code == SUBREG)
2052 {
2053 mode = GET_MODE (SUBREG_REG (x));
2054 if (mode == TImode)
2055 mode = DImode;
2056 else if (COMPLEX_MODE_P (mode))
2057 mode = GET_MODE_INNER (mode);
2058 }
2059 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2060 break;
2061
2062 case 'S':
2063 {
2064 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2065 /* Same order as nvptx_shuffle_kind. */
2066 static const char *const kinds[] =
2067 {".up", ".down", ".bfly", ".idx"};
2068 fputs (kinds[kind], file);
2069 }
2070 break;
2071
2072 case 'T':
2073 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2074 break;
2075
2076 case 'j':
2077 fprintf (file, "@");
2078 goto common;
2079
2080 case 'J':
2081 fprintf (file, "@!");
2082 goto common;
2083
2084 case 'c':
2085 mode = GET_MODE (XEXP (x, 0));
2086 switch (x_code)
2087 {
2088 case EQ:
2089 fputs (".eq", file);
2090 break;
2091 case NE:
2092 if (FLOAT_MODE_P (mode))
2093 fputs (".neu", file);
2094 else
2095 fputs (".ne", file);
2096 break;
2097 case LE:
2098 case LEU:
2099 fputs (".le", file);
2100 break;
2101 case GE:
2102 case GEU:
2103 fputs (".ge", file);
2104 break;
2105 case LT:
2106 case LTU:
2107 fputs (".lt", file);
2108 break;
2109 case GT:
2110 case GTU:
2111 fputs (".gt", file);
2112 break;
2113 case LTGT:
2114 fputs (".ne", file);
2115 break;
2116 case UNEQ:
2117 fputs (".equ", file);
2118 break;
2119 case UNLE:
2120 fputs (".leu", file);
2121 break;
2122 case UNGE:
2123 fputs (".geu", file);
2124 break;
2125 case UNLT:
2126 fputs (".ltu", file);
2127 break;
2128 case UNGT:
2129 fputs (".gtu", file);
2130 break;
2131 case UNORDERED:
2132 fputs (".nan", file);
2133 break;
2134 case ORDERED:
2135 fputs (".num", file);
2136 break;
2137 default:
2138 gcc_unreachable ();
2139 }
2140 if (FLOAT_MODE_P (mode)
2141 || x_code == EQ || x_code == NE
2142 || x_code == GEU || x_code == GTU
2143 || x_code == LEU || x_code == LTU)
2144 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2145 else
2146 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2147 break;
2148 default:
2149 common:
2150 switch (x_code)
2151 {
2152 case SUBREG:
2153 {
2154 rtx inner_x = SUBREG_REG (x);
2155 machine_mode inner_mode = GET_MODE (inner_x);
2156 machine_mode split = maybe_split_mode (inner_mode);
2157
2158 if (split != VOIDmode
2159 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2160 output_reg (file, REGNO (inner_x), split);
2161 else
2162 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2163 }
2164 break;
2165
2166 case REG:
2167 output_reg (file, REGNO (x), maybe_split_mode (mode));
2168 break;
2169
2170 case MEM:
2171 fputc ('[', file);
2172 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2173 fputc (']', file);
2174 break;
2175
2176 case CONST_INT:
2177 output_addr_const (file, x);
2178 break;
2179
2180 case CONST:
2181 case SYMBOL_REF:
2182 case LABEL_REF:
2183 /* We could use output_addr_const, but that can print things like
2184 "x-8", which breaks ptxas. Need to ensure it is output as
2185 "x+-8". */
2186 nvptx_print_address_operand (file, x, VOIDmode);
2187 break;
2188
2189 case CONST_DOUBLE:
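/* PTX writes floating-point literals as raw bit patterns: "0f" followed
by 8 hex digits for single precision, "0d" followed by 16 hex digits
for double. Mask each half to 32 bits in case 'long' is wider. */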
2190 long vals[2];
2191 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2192 vals[0] &= 0xffffffff;
2193 vals[1] &= 0xffffffff;
2194 if (mode == SFmode)
2195 fprintf (file, "0f%08lx", vals[0]);
2196 else
2197 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2198 break;
2199
2200 default:
2201 output_addr_const (file, x);
2202 }
2203 }
2204 }
2205 \f
2206 /* Record replacement regs used to deal with subreg operands. */
2207 struct reg_replace
2208 {
2209 rtx replacement[MAX_RECOG_OPERANDS];
2210 machine_mode mode;
2211 int n_allocated;
2212 int n_in_use;
2213 };
2214
2215 /* Allocate or reuse a replacement in R and return the rtx. */
2216
2217 static rtx
2218 get_replacement (struct reg_replace *r)
2219 {
2220 if (r->n_allocated == r->n_in_use)
2221 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2222 return r->replacement[r->n_in_use++];
2223 }
2224
2225 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2226 the presence of subregs would break the rules for most instructions.
2227 Replace them with a suitable new register of the right size, plus
2228 conversion copyin/copyout instructions. */
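/* For example (illustrative RTL, not from a real dump), an insn that
reads (subreg:SI (reg:DI 100) 0) gets a fresh SImode register R and

(set (reg:SI R) (truncate:SI (reg:DI 100)))

emitted before it, with the subreg operand replaced by (reg:SI R);
an insn that writes such a subreg instead gets

(set (reg:DI 100) (zero_extend:DI (reg:SI R)))

emitted after it. */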
2229
2230 static void
2231 nvptx_reorg_subreg (void)
2232 {
2233 struct reg_replace qiregs, hiregs, siregs, diregs;
2234 rtx_insn *insn, *next;
2235
2236 qiregs.n_allocated = 0;
2237 hiregs.n_allocated = 0;
2238 siregs.n_allocated = 0;
2239 diregs.n_allocated = 0;
2240 qiregs.mode = QImode;
2241 hiregs.mode = HImode;
2242 siregs.mode = SImode;
2243 diregs.mode = DImode;
2244
2245 for (insn = get_insns (); insn; insn = next)
2246 {
2247 next = NEXT_INSN (insn);
2248 if (!NONDEBUG_INSN_P (insn)
2249 || asm_noperands (PATTERN (insn)) >= 0
2250 || GET_CODE (PATTERN (insn)) == USE
2251 || GET_CODE (PATTERN (insn)) == CLOBBER)
2252 continue;
2253
2254 qiregs.n_in_use = 0;
2255 hiregs.n_in_use = 0;
2256 siregs.n_in_use = 0;
2257 diregs.n_in_use = 0;
2258 extract_insn (insn);
2259 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2260
2261 for (int i = 0; i < recog_data.n_operands; i++)
2262 {
2263 rtx op = recog_data.operand[i];
2264 if (GET_CODE (op) != SUBREG)
2265 continue;
2266
2267 rtx inner = SUBREG_REG (op);
2268
2269 machine_mode outer_mode = GET_MODE (op);
2270 machine_mode inner_mode = GET_MODE (inner);
2271 gcc_assert (s_ok);
2272 if (s_ok
2273 && (GET_MODE_PRECISION (inner_mode)
2274 >= GET_MODE_PRECISION (outer_mode)))
2275 continue;
2276 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2277 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2278 : outer_mode == HImode ? &hiregs
2279 : outer_mode == SImode ? &siregs
2280 : &diregs);
2281 rtx new_reg = get_replacement (r);
2282
2283 if (recog_data.operand_type[i] != OP_OUT)
2284 {
2285 enum rtx_code code;
2286 if (GET_MODE_PRECISION (inner_mode)
2287 < GET_MODE_PRECISION (outer_mode))
2288 code = ZERO_EXTEND;
2289 else
2290 code = TRUNCATE;
2291
2292 rtx pat = gen_rtx_SET (new_reg,
2293 gen_rtx_fmt_e (code, outer_mode, inner));
2294 emit_insn_before (pat, insn);
2295 }
2296
2297 if (recog_data.operand_type[i] != OP_IN)
2298 {
2299 enum rtx_code code;
2300 if (GET_MODE_PRECISION (inner_mode)
2301 < GET_MODE_PRECISION (outer_mode))
2302 code = TRUNCATE;
2303 else
2304 code = ZERO_EXTEND;
2305
2306 rtx pat = gen_rtx_SET (inner,
2307 gen_rtx_fmt_e (code, inner_mode, new_reg));
2308 emit_insn_after (pat, insn);
2309 }
2310 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2311 }
2312 }
2313 }
2314
2315 /* Loop structure of the function. The entire function is described as
2316 a NULL loop. */
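/* As a sketch (illustrative only), an offloaded function containing a
worker-partitioned loop that itself contains a vector-partitioned loop
is represented as a NULL (mask 0) outer parallel whose 'inner' child is
the worker parallel, whose 'inner' child in turn is the vector
parallel; sibling loops hang off 'next'. */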
2317
2318 struct parallel
2319 {
2320 /* Parent parallel. */
2321 parallel *parent;
2322
2323 /* Next sibling parallel. */
2324 parallel *next;
2325
2326 /* First child parallel. */
2327 parallel *inner;
2328
2329 /* Partitioning mask of the parallel. */
2330 unsigned mask;
2331
2332 /* Partitioning used within inner parallels. */
2333 unsigned inner_mask;
2334
2335 /* Location of the parallel's forked and join markers. The forked
2336 block is the first block in the parallel and the join block is the
2337 first block after the partition. */
2338 basic_block forked_block;
2339 basic_block join_block;
2340
2341 rtx_insn *forked_insn;
2342 rtx_insn *join_insn;
2343
2344 rtx_insn *fork_insn;
2345 rtx_insn *joining_insn;
2346
2347 /* Basic blocks in this parallel, but not in child parallels. The
2348 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2349 blocks are not. */
2350 auto_vec<basic_block> blocks;
2351
2352 public:
2353 parallel (parallel *parent, unsigned mode);
2354 ~parallel ();
2355 };
2356
2357 /* Constructor links the new parallel into its parent's chain of
2358 children. */
2359
2360 parallel::parallel (parallel *parent_, unsigned mask_)
2361 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2362 {
2363 forked_block = join_block = 0;
2364 forked_insn = join_insn = 0;
2365 fork_insn = joining_insn = 0;
2366
2367 if (parent)
2368 {
2369 next = parent->inner;
2370 parent->inner = this;
2371 }
2372 }
2373
2374 parallel::~parallel ()
2375 {
2376 delete inner;
2377 delete next;
2378 }
2379
2380 /* Map of basic blocks to insns. */
2381 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2382
2383 /* A tuple of an insn of interest and the BB in which it resides. */
2384 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2385 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2386
2387 /* Split basic blocks such that each forked and join unspec is at
2388 the start of its basic block. Thus afterwards each block will
2389 have a single partitioning mode. We also do the same for return
2390 insns, as they are executed by every thread. Populate MAP with
2391 the head and tail blocks and the marker insn found in each. We
2392 also clear the BB visited flag, which is used when finding
2393 partitions. */
2394
2395 static void
2396 nvptx_split_blocks (bb_insn_map_t *map)
2397 {
2398 insn_bb_vec_t worklist;
2399 basic_block block;
2400 rtx_insn *insn;
2401
2402 /* Locate all the reorg instructions of interest. */
2403 FOR_ALL_BB_FN (block, cfun)
2404 {
2405 bool seen_insn = false;
2406
2407 /* Clear visited flag, for use by the parallel locator. */
2408 block->flags &= ~BB_VISITED;
2409
2410 FOR_BB_INSNS (block, insn)
2411 {
2412 if (!INSN_P (insn))
2413 continue;
2414 switch (recog_memoized (insn))
2415 {
2416 default:
2417 seen_insn = true;
2418 continue;
2419 case CODE_FOR_nvptx_forked:
2420 case CODE_FOR_nvptx_join:
2421 break;
2422
2423 case CODE_FOR_return:
2424 /* We also need to split just before return insns, as
2425 that insn needs executing by all threads, but the
2426 block it is in probably does not. */
2427 break;
2428 }
2429
2430 if (seen_insn)
2431 /* We've found an instruction that must be at the start of
2432 a block, but isn't. Add it to the worklist. */
2433 worklist.safe_push (insn_bb_t (insn, block));
2434 else
2435 /* It was already the first instruction. Just add it to
2436 the map. */
2437 map->get_or_insert (block) = insn;
2438 seen_insn = true;
2439 }
2440 }
2441
2442 /* Split blocks on the worklist. */
2443 unsigned ix;
2444 insn_bb_t *elt;
2445 basic_block remap = 0;
2446 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2447 {
2448 if (remap != elt->second)
2449 {
2450 block = elt->second;
2451 remap = block;
2452 }
2453
2454 /* Split block before insn. The insn is in the new block. */
2455 edge e = split_block (block, PREV_INSN (elt->first));
2456
2457 block = e->dest;
2458 map->get_or_insert (block) = elt->first;
2459 }
2460 }
2461
2462 /* BLOCK is a basic block containing a head or tail instruction.
2463 Locate the associated prehead or pretail instruction, which must be
2464 in the single predecessor block. */
2465
2466 static rtx_insn *
2467 nvptx_discover_pre (basic_block block, int expected)
2468 {
2469 gcc_assert (block->preds->length () == 1);
2470 basic_block pre_block = (*block->preds)[0]->src;
2471 rtx_insn *pre_insn;
2472
2473 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2474 pre_insn = PREV_INSN (pre_insn))
2475 gcc_assert (pre_insn != BB_HEAD (pre_block));
2476
2477 gcc_assert (recog_memoized (pre_insn) == expected);
2478 return pre_insn;
2479 }
2480
2481 /* Dump this parallel and all its inner parallels. */
2482
2483 static void
2484 nvptx_dump_pars (parallel *par, unsigned depth)
2485 {
2486 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2487 depth, par->mask,
2488 par->forked_block ? par->forked_block->index : -1,
2489 par->join_block ? par->join_block->index : -1);
2490
2491 fprintf (dump_file, " blocks:");
2492
2493 basic_block block;
2494 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2495 fprintf (dump_file, " %d", block->index);
2496 fprintf (dump_file, "\n");
2497 if (par->inner)
2498 nvptx_dump_pars (par->inner, depth + 1);
2499
2500 if (par->next)
2501 nvptx_dump_pars (par->next, depth);
2502 }
2503
2504 /* If BLOCK contains a fork/join marker, process it to create or
2505 terminate a loop structure. Add this block to the current loop,
2506 and then walk successor blocks. */
2507
2508 static parallel *
2509 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2510 {
2511 if (block->flags & BB_VISITED)
2512 return par;
2513 block->flags |= BB_VISITED;
2514
2515 if (rtx_insn **endp = map->get (block))
2516 {
2517 rtx_insn *end = *endp;
2518
2519 /* This is a block head or tail, or return instruction. */
2520 switch (recog_memoized (end))
2521 {
2522 case CODE_FOR_return:
2523 /* Return instructions are in their own block, and we
2524 don't need to do anything more. */
2525 return par;
2526
2527 case CODE_FOR_nvptx_forked:
2528 /* Loop head, create a new inner loop and add it into
2529 our parent's child list. */
2530 {
2531 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2532
2533 gcc_assert (mask);
2534 par = new parallel (par, mask);
2535 par->forked_block = block;
2536 par->forked_insn = end;
2537 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2538 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2539 par->fork_insn
2540 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2541 }
2542 break;
2543
2544 case CODE_FOR_nvptx_join:
2545 /* A loop tail. Finish the current loop and return to
2546 parent. */
2547 {
2548 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2549
2550 gcc_assert (par->mask == mask);
2551 par->join_block = block;
2552 par->join_insn = end;
2553 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2554 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2555 par->joining_insn
2556 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2557 par = par->parent;
2558 }
2559 break;
2560
2561 default:
2562 gcc_unreachable ();
2563 }
2564 }
2565
2566 if (par)
2567 /* Add this block onto the current loop's list of blocks. */
2568 par->blocks.safe_push (block);
2569 else
2570 /* This must be the entry block. Create a NULL parallel. */
2571 par = new parallel (0, 0);
2572
2573 /* Walk successor blocks. */
2574 edge e;
2575 edge_iterator ei;
2576
2577 FOR_EACH_EDGE (e, ei, block->succs)
2578 nvptx_find_par (map, par, e->dest);
2579
2580 return par;
2581 }
2582
2583 /* DFS walk the CFG looking for fork & join markers. Construct
2584 loop structures as we go. MAP is a mapping of basic blocks
2585 to head & tail markers, discovered when splitting blocks. This
2586 speeds up the discovery. We rely on the BB visited flag having
2587 been cleared when splitting blocks. */
2588
2589 static parallel *
2590 nvptx_discover_pars (bb_insn_map_t *map)
2591 {
2592 basic_block block;
2593
2594 /* Mark exit blocks as visited. */
2595 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2596 block->flags |= BB_VISITED;
2597
2598 /* And entry block as not. */
2599 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2600 block->flags &= ~BB_VISITED;
2601
2602 parallel *par = nvptx_find_par (map, 0, block);
2603
2604 if (dump_file)
2605 {
2606 fprintf (dump_file, "\nLoops\n");
2607 nvptx_dump_pars (par, 0);
2608 fprintf (dump_file, "\n");
2609 }
2610
2611 return par;
2612 }
2613
2614 /* Analyse a group of BBs within a partitioned region and create N
2615 Single-Entry-Single-Exit regions. Some of those regions will be
2616 trivial ones consisting of a single BB. The blocks of a
2617 partitioned region might form a set of disjoint graphs -- because
2618 the region encloses a differently partitioned sub-region.
2619
2620 We use the linear time algorithm described in 'Finding Regions Fast:
2621 Single Entry Single Exit and control Regions in Linear Time'
2622 Johnson, Pearson & Pingali. That algorithm deals with complete
2623 CFGs, where a back edge is inserted from END to START, and thus the
2624 problem becomes one of finding equivalent loops.
2625
2626 In this case we have a partial CFG. We complete it by redirecting
2627 any incoming edge to the graph to be from an arbitrary external BB,
2628 and similarly redirecting any outgoing edge to be to that BB.
2629 Thus we end up with a closed graph.
2630
2631 The algorithm works by building a spanning tree of an undirected
2632 graph and keeping track of back edges from nodes further from the
2633 root in the tree to nodes nearer to the root in the tree. In the
2634 description below, the root is up and the tree grows downwards.
2635
2636 We avoid having to deal with degenerate back-edges to the same
2637 block, by splitting each BB into 3 -- one for input edges, one for
2638 the node itself and one for the output edges. Such back edges are
2639 referred to as 'Brackets'. Cycle equivalent nodes will have the
2640 same set of brackets.
2641
2642 Determining bracket equivalency is done by maintaining a list of
2643 brackets in such a manner that the list length and final bracket
2644 uniquely identify the set.
2645
2646 We use coloring to mark all BBs with cycle equivalency with the
2647 same color. This is the output of the 'Finding Regions Fast'
2648 algorithm. Notice it doesn't actually find the set of nodes within
2649 a particular region, just unordered sets of nodes that are the
2650 entries and exits of SESE regions.
2651
2652 After determining cycle equivalency, we need to find the minimal
2653 set of SESE regions. Do this with a DFS coloring walk of the
2654 complete graph. We're either 'looking' or 'coloring'. When
2655 looking, and we're in the subgraph, we start coloring the color of
2656 the current node, and remember that node as the start of the
2657 current color's SESE region. Every time we go to a new node, we
2658 decrement the count of nodes with that color. If it reaches zero,
2659 we remember that node as the end of the current color's SESE region
2660 and return to 'looking'. Otherwise we color the node the current
2661 color.
2662
2663 This way we end up with coloring the inside of non-trivial SESE
2664 regions with the color of that region. */
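/* As a small worked example (illustrative, not from a real dump):
for a diamond CFG A -> {B, C} -> D, completed with the fake back
edge D -> A, the bracket sets of A and D coincide, so they receive
the same color and bound the one non-trivial SESE region {A..D};
B and C each end up as trivial single-block regions. */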
2665
2666 /* A pair of BBs. We use this to represent SESE regions. */
2667 typedef std::pair<basic_block, basic_block> bb_pair_t;
2668 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2669
2670 /* A node in the undirected CFG. The discriminator SECOND indicates just
2671 above or just below the BB indicated by FIRST. */
2672 typedef std::pair<basic_block, int> pseudo_node_t;
2673
2674 /* A bracket indicates an edge towards the root of the spanning tree of the
2675 undirected graph. Each bracket has a color, determined
2676 from the current set of brackets. */
2677 struct bracket
2678 {
2679 pseudo_node_t back; /* Back target */
2680
2681 /* Current color and size of set. */
2682 unsigned color;
2683 unsigned size;
2684
2685 bracket (pseudo_node_t back_)
2686 : back (back_), color (~0u), size (~0u)
2687 {
2688 }
2689
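/* Return the color of the bracket set whose topmost bracket is this
one and whose length is LENGTH. A fresh color is allocated whenever
LENGTH differs from the length seen on the previous query; the
per-color node counts are maintained in COLOR_COUNTS. */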
2690 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2691 {
2692 if (length != size)
2693 {
2694 size = length;
2695 color = color_counts.length ();
2696 color_counts.quick_push (0);
2697 }
2698 color_counts[color]++;
2699 return color;
2700 }
2701 };
2702
2703 typedef auto_vec<bracket> bracket_vec_t;
2704
2705 /* Basic block info for finding SESE regions. */
2706
2707 struct bb_sese
2708 {
2709 int node; /* Node number in spanning tree. */
2710 int parent; /* Parent node number. */
2711
2712 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2713 edges arrive at pseudo-node Ai and the outgoing edges leave at
2714 pseudo-node Ao. We have to remember which way we arrived at a
2715 particular node when generating the spanning tree. dir > 0 means
2716 we arrived at Ai, dir < 0 means we arrived at Ao. */
2717 int dir;
2718
2719 /* Lowest numbered pseudo-node reached via a backedge from this
2720 node, or any descendant. */
2721 pseudo_node_t high;
2722
2723 int color; /* Cycle-equivalence color */
2724
2725 /* Stack of brackets for this node. */
2726 bracket_vec_t brackets;
2727
2728 bb_sese (unsigned node_, unsigned p, int dir_)
2729 :node (node_), parent (p), dir (dir_)
2730 {
2731 }
2732 ~bb_sese ();
2733
2734 /* Push a bracket ending at BACK. */
2735 void push (const pseudo_node_t &back)
2736 {
2737 if (dump_file)
2738 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2739 back.first ? back.first->index : 0, back.second);
2740 brackets.safe_push (bracket (back));
2741 }
2742
2743 void append (bb_sese *child);
2744 void remove (const pseudo_node_t &);
2745
2746 /* Set node's color. */
2747 void set_color (auto_vec<unsigned> &color_counts)
2748 {
2749 color = brackets.last ().get_color (color_counts, brackets.length ());
2750 }
2751 };
2752
2753 bb_sese::~bb_sese ()
2754 {
2755 }
2756
2757 /* Destructively append CHILD's brackets. */
2758
2759 void
2760 bb_sese::append (bb_sese *child)
2761 {
2762 if (int len = child->brackets.length ())
2763 {
2764 int ix;
2765
2766 if (dump_file)
2767 {
2768 for (ix = 0; ix < len; ix++)
2769 {
2770 const pseudo_node_t &pseudo = child->brackets[ix].back;
2771 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2772 child->node, pseudo.first ? pseudo.first->index : 0,
2773 pseudo.second);
2774 }
2775 }
2776 if (!brackets.length ())
2777 std::swap (brackets, child->brackets);
2778 else
2779 {
2780 brackets.reserve (len);
2781 for (ix = 0; ix < len; ix++)
2782 brackets.quick_push (child->brackets[ix]);
2783 }
2784 }
2785 }
2786
2787 /* Remove brackets that terminate at PSEUDO. */
2788
2789 void
2790 bb_sese::remove (const pseudo_node_t &pseudo)
2791 {
2792 unsigned removed = 0;
2793 int len = brackets.length ();
2794
2795 for (int ix = 0; ix < len; ix++)
2796 {
2797 if (brackets[ix].back == pseudo)
2798 {
2799 if (dump_file)
2800 fprintf (dump_file, "Removing backedge %d:%+d\n",
2801 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2802 removed++;
2803 }
2804 else if (removed)
2805 brackets[ix-removed] = brackets[ix];
2806 }
2807 while (removed--)
2808 brackets.pop ();
2809 }
2810
2811 /* Accessors for BB's aux pointer. */
2812 #define BB_SET_SESE(B, S) ((B)->aux = (S))
2813 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2814
2815 /* DFS walk creating SESE data structures. Only cover nodes with
2816 BB_VISITED set. Append discovered blocks to LIST. We number in
2817 increments of 3 so that the above and below pseudo nodes can be
2818 implicitly numbered too. */
2819
2820 static int
2821 nvptx_sese_number (int n, int p, int dir, basic_block b,
2822 auto_vec<basic_block> *list)
2823 {
2824 if (BB_GET_SESE (b))
2825 return n;
2826
2827 if (dump_file)
2828 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2829 b->index, n, p, dir);
2830
2831 BB_SET_SESE (b, new bb_sese (n, p, dir));
2832 p = n;
2833
2834 n += 3;
2835 list->quick_push (b);
2836
2837 /* First walk the nodes on the 'other side' of this node, then walk
2838 the nodes on the same side. */
2839 for (unsigned ix = 2; ix; ix--)
2840 {
2841 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2842 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2843 : offsetof (edge_def, src));
2844 edge e;
2845 edge_iterator ei;
2846
2847 FOR_EACH_EDGE (e, ei, edges)
2848 {
2849 basic_block target = *(basic_block *)((char *)e + offset);
2850
2851 if (target->flags & BB_VISITED)
2852 n = nvptx_sese_number (n, p, dir, target, list);
2853 }
2854 dir = -dir;
2855 }
2856 return n;
2857 }
2858
2859 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2860 EDGES are the outgoing edges and OFFSET is the offset to the src
2861 or dst block on the edges. */
2862
2863 static void
2864 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2865 vec<edge, va_gc> *edges, size_t offset)
2866 {
2867 edge e;
2868 edge_iterator ei;
2869 int hi_back = depth;
2870 pseudo_node_t node_back (0, depth);
2871 int hi_child = depth;
2872 pseudo_node_t node_child (0, depth);
2873 basic_block child = NULL;
2874 unsigned num_children = 0;
2875 int usd = -dir * sese->dir;
2876
2877 if (dump_file)
2878 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2879 me->index, sese->node, dir);
2880
2881 if (dir < 0)
2882 {
2883 /* This is the above pseudo-child. It has the BB itself as an
2884 additional child node. */
2885 node_child = sese->high;
2886 hi_child = node_child.second;
2887 if (node_child.first)
2888 hi_child += BB_GET_SESE (node_child.first)->node;
2889 num_children++;
2890 }
2891
2892 /* Examine each edge.
2893 - if it is a child (a) append its bracket list and (b) record
2894 whether it is the child with the highest reaching bracket.
2895 - if it is an edge to an ancestor, record whether it is the highest
2896 reaching backlink. */
2897 FOR_EACH_EDGE (e, ei, edges)
2898 {
2899 basic_block target = *(basic_block *)((char *)e + offset);
2900
2901 if (bb_sese *t_sese = BB_GET_SESE (target))
2902 {
2903 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2904 {
2905 /* Child node. Append its bracket list. */
2906 num_children++;
2907 sese->append (t_sese);
2908
2909 /* Compare its hi value. */
2910 int t_hi = t_sese->high.second;
2911
2912 if (basic_block child_hi_block = t_sese->high.first)
2913 t_hi += BB_GET_SESE (child_hi_block)->node;
2914
2915 if (hi_child > t_hi)
2916 {
2917 hi_child = t_hi;
2918 node_child = t_sese->high;
2919 child = target;
2920 }
2921 }
2922 else if (t_sese->node < sese->node + dir
2923 && !(dir < 0 && sese->parent == t_sese->node))
2924 {
2925 /* Non-parental ancestor node -- a backlink. */
2926 int d = usd * t_sese->dir;
2927 int back = t_sese->node + d;
2928
2929 if (hi_back > back)
2930 {
2931 hi_back = back;
2932 node_back = pseudo_node_t (target, d);
2933 }
2934 }
2935 }
2936 else
2937 { /* Fallen off graph, backlink to entry node. */
2938 hi_back = 0;
2939 node_back = pseudo_node_t (0, 0);
2940 }
2941 }
2942
2943 /* Remove any brackets that terminate at this pseudo node. */
2944 sese->remove (pseudo_node_t (me, dir));
2945
2946 /* Now push any backlinks from this pseudo node. */
2947 FOR_EACH_EDGE (e, ei, edges)
2948 {
2949 basic_block target = *(basic_block *)((char *)e + offset);
2950 if (bb_sese *t_sese = BB_GET_SESE (target))
2951 {
2952 if (t_sese->node < sese->node + dir
2953 && !(dir < 0 && sese->parent == t_sese->node))
2954 /* Non-parental ancestor node - backedge from me. */
2955 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2956 }
2957 else
2958 {
2959 /* Back edge to entry node. */
2960 sese->push (pseudo_node_t (0, 0));
2961 }
2962 }
2963
2964 /* If this node leads directly or indirectly to a no-return region of
2965 the graph, then fake a backedge to entry node. */
2966 if (!sese->brackets.length () || !edges || !edges->length ())
2967 {
2968 hi_back = 0;
2969 node_back = pseudo_node_t (0, 0);
2970 sese->push (node_back);
2971 }
2972
2973 /* Record the highest reaching backedge from us or a descendant. */
2974 sese->high = hi_back < hi_child ? node_back : node_child;
2975
2976 if (num_children > 1)
2977 {
2978 /* There is more than one child -- this is a Y shaped piece of
2979 spanning tree. We have to insert a fake backedge from this
2980 node to the highest ancestor reached by not-the-highest
2981 reaching child. Note that there may be multiple children
2982 with backedges to the same highest node. That's ok and we
2983 insert the edge to that highest node. */
2984 hi_child = depth;
2985 if (dir < 0 && child)
2986 {
2987 node_child = sese->high;
2988 hi_child = node_child.second;
2989 if (node_child.first)
2990 hi_child += BB_GET_SESE (node_child.first)->node;
2991 }
2992
2993 FOR_EACH_EDGE (e, ei, edges)
2994 {
2995 basic_block target = *(basic_block *)((char *)e + offset);
2996
2997 if (target == child)
2998 /* Ignore the highest child. */
2999 continue;
3000
3001 bb_sese *t_sese = BB_GET_SESE (target);
3002 if (!t_sese)
3003 continue;
3004 if (t_sese->parent != sese->node)
3005 /* Not a child. */
3006 continue;
3007
3008 /* Compare its hi value. */
3009 int t_hi = t_sese->high.second;
3010
3011 if (basic_block child_hi_block = t_sese->high.first)
3012 t_hi += BB_GET_SESE (child_hi_block)->node;
3013
3014 if (hi_child > t_hi)
3015 {
3016 hi_child = t_hi;
3017 node_child = t_sese->high;
3018 }
3019 }
3020
3021 sese->push (node_child);
3022 }
3023 }
3024
3025
3026 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3027 proceed to successors. Set SESE entry and exit nodes of
3028 REGIONS. */
3029
3030 static void
3031 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3032 basic_block block, int coloring)
3033 {
3034 bb_sese *sese = BB_GET_SESE (block);
3035
3036 if (block->flags & BB_VISITED)
3037 {
3038 /* If we've already encountered this block, either we must not
3039 be coloring, or it must have been colored the current color. */
3040 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3041 return;
3042 }
3043
3044 block->flags |= BB_VISITED;
3045
3046 if (sese)
3047 {
3048 if (coloring < 0)
3049 {
3050 /* Start coloring a region. */
3051 regions[sese->color].first = block;
3052 coloring = sese->color;
3053 }
3054
3055 if (!--color_counts[sese->color] && sese->color == coloring)
3056 {
3057 /* Found final block of SESE region. */
3058 regions[sese->color].second = block;
3059 coloring = -1;
3060 }
3061 else
3062 /* Color the node, so we can assert on revisiting the node
3063 that the graph is indeed SESE. */
3064 sese->color = coloring;
3065 }
3066 else
3067 /* Fallen off the subgraph, we cannot be coloring. */
3068 gcc_assert (coloring < 0);
3069
3070 /* Walk each successor block. */
3071 if (block->succs && block->succs->length ())
3072 {
3073 edge e;
3074 edge_iterator ei;
3075
3076 FOR_EACH_EDGE (e, ei, block->succs)
3077 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3078 }
3079 else
3080 gcc_assert (coloring < 0);
3081 }
3082
3083 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3084 end up with NULL entries in it. */
3085
3086 static void
3087 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3088 {
3089 basic_block block;
3090 int ix;
3091
3092 /* First clear each BB of the whole function. */
3093 FOR_EACH_BB_FN (block, cfun)
3094 {
3095 block->flags &= ~BB_VISITED;
3096 BB_SET_SESE (block, 0);
3097 }
3098 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3099 block->flags &= ~BB_VISITED;
3100 BB_SET_SESE (block, 0);
3101 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3102 block->flags &= ~BB_VISITED;
3103 BB_SET_SESE (block, 0);
3104
3105 /* Mark blocks in the function that are in this graph. */
3106 for (ix = 0; blocks.iterate (ix, &block); ix++)
3107 block->flags |= BB_VISITED;
3108
3109 /* Counts of nodes assigned to each color. There cannot be more
3110 colors than blocks (and hopefully there will be fewer). */
3111 auto_vec<unsigned> color_counts;
3112 color_counts.reserve (blocks.length ());
3113
3114 /* Worklist of nodes in the spanning tree. Again, there cannot be
3115 more nodes in the tree than blocks (there will be fewer if the
3116 CFG of blocks is disjoint). */
3117 auto_vec<basic_block> spanlist;
3118 spanlist.reserve (blocks.length ());
3119
3120 /* Make sure every block has its cycle class determined. */
3121 for (ix = 0; blocks.iterate (ix, &block); ix++)
3122 {
3123 if (BB_GET_SESE (block))
3124 /* We already met this block in an earlier graph solve. */
3125 continue;
3126
3127 if (dump_file)
3128 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3129
3130 /* Number the nodes reachable from block, in initial DFS order. */
3131 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3132
3133 /* Now walk in reverse DFS order to find cycle equivalents. */
3134 while (spanlist.length ())
3135 {
3136 block = spanlist.pop ();
3137 bb_sese *sese = BB_GET_SESE (block);
3138
3139 /* Do the pseudo node below. */
3140 nvptx_sese_pseudo (block, sese, depth, +1,
3141 sese->dir > 0 ? block->succs : block->preds,
3142 (sese->dir > 0 ? offsetof (edge_def, dest)
3143 : offsetof (edge_def, src)));
3144 sese->set_color (color_counts);
3145 /* Do the pseudo node above. */
3146 nvptx_sese_pseudo (block, sese, depth, -1,
3147 sese->dir < 0 ? block->succs : block->preds,
3148 (sese->dir < 0 ? offsetof (edge_def, dest)
3149 : offsetof (edge_def, src)));
3150 }
3151 if (dump_file)
3152 fprintf (dump_file, "\n");
3153 }
3154
3155 if (dump_file)
3156 {
3157 unsigned count;
3158 const char *comma = "";
3159
3160 fprintf (dump_file, "Found %d cycle equivalents\n",
3161 color_counts.length ());
3162 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3163 {
3164 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3165
3166 comma = "";
3167 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3168 if (BB_GET_SESE (block)->color == ix)
3169 {
3170 block->flags |= BB_VISITED;
3171 fprintf (dump_file, "%s%d", comma, block->index);
3172 comma=",";
3173 }
3174 fprintf (dump_file, "}");
3175 comma = ", ";
3176 }
3177 fprintf (dump_file, "\n");
3178 }
3179
3180 /* Now we've colored every block in the subgraph. We now need to
3181 determine the minimal set of SESE regions that cover that
3182 subgraph. Do this with a DFS walk of the complete function.
3183 During the walk we're either 'looking' or 'coloring'. When we
3184 reach the last node of a particular color, we stop coloring and
3185 return to looking. */
3186
3187 /* There cannot be more SESE regions than colors. */
3188 regions.reserve (color_counts.length ());
3189 for (ix = color_counts.length (); ix--;)
3190 regions.quick_push (bb_pair_t (0, 0));
3191
3192 for (ix = 0; blocks.iterate (ix, &block); ix++)
3193 block->flags &= ~BB_VISITED;
3194
3195 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3196
3197 if (dump_file)
3198 {
3199 const char *comma = "";
3200 int len = regions.length ();
3201
3202 fprintf (dump_file, "SESE regions:");
3203 for (ix = 0; ix != len; ix++)
3204 {
3205 basic_block from = regions[ix].first;
3206 basic_block to = regions[ix].second;
3207
3208 if (from)
3209 {
3210 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3211 if (to != from)
3212 fprintf (dump_file, "->%d", to->index);
3213
3214 int color = BB_GET_SESE (from)->color;
3215
3216 /* Print the blocks within the region (excluding ends). */
3217 FOR_EACH_BB_FN (block, cfun)
3218 {
3219 bb_sese *sese = BB_GET_SESE (block);
3220
3221 if (sese && sese->color == color
3222 && block != from && block != to)
3223 fprintf (dump_file, ".%d", block->index);
3224 }
3225 fprintf (dump_file, "}");
3226 }
3227 comma = ",";
3228 }
3229 fprintf (dump_file, "\n\n");
3230 }
3231
3232 for (ix = 0; blocks.iterate (ix, &block); ix++)
3233 delete BB_GET_SESE (block);
3234 }
3235
3236 #undef BB_SET_SESE
3237 #undef BB_GET_SESE
3238
3239 /* Propagate live state at the start of a partitioned region. BLOCK
3240 provides the live register information, and might not contain
3241 INSN. Propagation is inserted just after INSN. RW indicates whether
3242 we are reading and/or writing state. This
3243 separation is needed for worker-level propagation where we
3244 essentially do a spill & fill. FN is the underlying worker
3245 function to generate the propagation instructions for a single
3246 register. DATA is user data.
3247
3248 We propagate the live register set and the entire frame. We could
3249 do better by (a) propagating just the live set that is used within
3250 the partitioned regions and (b) only propagating stack entries that
3251 are used. The latter might be quite hard to determine. */
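/* In outline (illustrative pseudo-code, not actual emitted RTL), for a
nonzero frame nvptx_propagate below builds:

ptr = frame_pointer;
idx = frame-size-in-DImode-words;
<loop-begin sequence from FN>
loop:
idx -= 1;
tmp = *ptr;		// if reading
<propagate tmp via FN>
*ptr = tmp;		// if writing
ptr += 8;
if (idx != 0) goto loop;
<loop-end sequence from FN>

and then propagates each live pseudo register individually. */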
3252
3253 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3254
3255 static void
3256 nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3257 propagator_fn fn, void *data)
3258 {
3259 bitmap live = DF_LIVE_IN (block);
3260 bitmap_iterator iterator;
3261 unsigned ix;
3262
3263 /* Copy the frame array. */
3264 HOST_WIDE_INT fs = get_frame_size ();
3265 if (fs)
3266 {
3267 rtx tmp = gen_reg_rtx (DImode);
3268 rtx idx = NULL_RTX;
3269 rtx ptr = gen_reg_rtx (Pmode);
3270 rtx pred = NULL_RTX;
3271 rtx_code_label *label = NULL;
3272
3273 /* The frame size might not be DImode compatible, but the frame
3274 array's declaration will be. So it's ok to round up here. */
3275 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3276 /* Detect single iteration loop. */
3277 if (fs == 1)
3278 fs = 0;
3279
3280 start_sequence ();
3281 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3282 if (fs)
3283 {
3284 idx = gen_reg_rtx (SImode);
3285 pred = gen_reg_rtx (BImode);
3286 label = gen_label_rtx ();
3287
3288 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3289 /* Allow worker function to initialize anything needed. */
3290 rtx init = fn (tmp, PM_loop_begin, fs, data);
3291 if (init)
3292 emit_insn (init);
3293 emit_label (label);
3294 LABEL_NUSES (label)++;
3295 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3296 }
3297 if (rw & PM_read)
3298 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3299 emit_insn (fn (tmp, rw, fs, data));
3300 if (rw & PM_write)
3301 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3302 if (fs)
3303 {
3304 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3305 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3306 emit_insn (gen_br_true_uni (pred, label));
3307 rtx fini = fn (tmp, PM_loop_end, fs, data);
3308 if (fini)
3309 emit_insn (fini);
3310 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3311 }
3312 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3313 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3314 rtx cpy = get_insns ();
3315 end_sequence ();
3316 insn = emit_insn_after (cpy, insn);
3317 }
3318
3319 /* Copy live registers. */
3320 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3321 {
3322 rtx reg = regno_reg_rtx[ix];
3323
3324 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3325 {
3326 rtx bcast = fn (reg, rw, 0, data);
3327
3328 insn = emit_insn_after (bcast, insn);
3329 }
3330 }
3331 }
3332
3333 /* Worker for nvptx_vpropagate. */
3334
3335 static rtx
3336 vprop_gen (rtx reg, propagate_mask pm,
3337 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3338 {
3339 if (!(pm & PM_read_write))
3340 return 0;
3341
3342 return nvptx_gen_vcast (reg);
3343 }
3344
3345 /* Propagate state that is live at start of BLOCK across the vectors
3346 of a single warp. Propagation is inserted just after INSN. */
3347
3348 static void
3349 nvptx_vpropagate (basic_block block, rtx_insn *insn)
3350 {
3351 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3352 }
3353
3354 /* Worker for nvptx_wpropagate. */
3355
3356 static rtx
3357 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3358 {
3359 wcast_data_t *data = (wcast_data_t *)data_;
3360
3361 if (pm & PM_loop_begin)
3362 {
3363 /* Starting a loop, initialize pointer. */
3364 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3365
3366 if (align > worker_bcast_align)
3367 worker_bcast_align = align;
3368 data->offset = (data->offset + align - 1) & ~(align - 1);
3369
3370 data->ptr = gen_reg_rtx (Pmode);
3371
3372 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3373 }
3374 else if (pm & PM_loop_end)
3375 {
3376 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3377 data->ptr = NULL_RTX;
3378 return clobber;
3379 }
3380 else
3381 return nvptx_gen_wcast (reg, pm, rep, data);
3382 }
3383
3384 /* Spill or fill live state that is live at start of BLOCK. PRE_P
3385 indicates if this is just before partitioned mode (do spill), or
3386 just after it starts (do fill). Sequence is inserted just after
3387 INSN. */
3388
3389 static void
3390 nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3391 {
3392 wcast_data_t data;
3393
3394 data.base = gen_reg_rtx (Pmode);
3395 data.offset = 0;
3396 data.ptr = NULL_RTX;
3397
3398 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3399 if (data.offset)
3400 {
3401 /* Stuff was emitted, initialize the base pointer now. */
3402 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3403 emit_insn_after (init, insn);
3404
3405 if (worker_bcast_size < data.offset)
3406 worker_bcast_size = data.offset;
3407 }
3408 }
3409
3410 /* Emit a worker-level synchronization barrier. We use different
3411 markers for before and after synchronizations. */
3412
3413 static rtx
3414 nvptx_wsync (bool after)
3415 {
3416 return gen_nvptx_barsync (GEN_INT (after));
3417 }
3418
3419 /* Single neutering according to MASK. FROM is the incoming block and
3420 TO is the outgoing block. These may be the same block. Insert at
3421 start of FROM:
3422
3423 if (tid.<axis>) goto end.
3424
3425 and insert before ending branch of TO (if there is such an insn):
3426
3427 end:
3428 <possibly-broadcast-cond>
3429 <branch>
3430
3431 We currently only use different FROM and TO when skipping an entire
3432 loop. We could do more if we detected superblocks. */
3433
3434 static void
3435 nvptx_single (unsigned mask, basic_block from, basic_block to)
3436 {
3437 rtx_insn *head = BB_HEAD (from);
3438 rtx_insn *tail = BB_END (to);
3439 unsigned skip_mask = mask;
3440
3441 /* Find first insn of FROM block. */
3442 while (head != BB_END (from) && !INSN_P (head))
3443 head = NEXT_INSN (head);
3444
3445 /* Find last insn of TO block. */
3446 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3447 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3448 tail = PREV_INSN (tail);
3449
3450 /* Detect if tail is a branch. */
3451 rtx tail_branch = NULL_RTX;
3452 rtx cond_branch = NULL_RTX;
3453 if (tail && INSN_P (tail))
3454 {
3455 tail_branch = PATTERN (tail);
3456 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3457 tail_branch = NULL_RTX;
3458 else
3459 {
3460 cond_branch = SET_SRC (tail_branch);
3461 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3462 cond_branch = NULL_RTX;
3463 }
3464 }
3465
3466 if (tail == head)
3467 {
3468 /* If this is empty, do nothing. */
3469 if (!head || !INSN_P (head))
3470 return;
3471
3472 /* If this is a dummy insn, do nothing. */
3473 switch (recog_memoized (head))
3474 {
3475 default:
3476 break;
3477 case CODE_FOR_nvptx_fork:
3478 case CODE_FOR_nvptx_forked:
3479 case CODE_FOR_nvptx_joining:
3480 case CODE_FOR_nvptx_join:
3481 return;
3482 }
3483
3484 if (cond_branch)
3485 {
3486 /* If we're only doing vector single, there's no need to
3487 emit skip code because we'll not insert anything. */
3488 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3489 skip_mask = 0;
3490 }
3491 else if (tail_branch)
3492 /* Block with only unconditional branch. Nothing to do. */
3493 return;
3494 }
3495
3496 /* Insert the vector test inside the worker test. */
3497 unsigned mode;
3498 rtx_insn *before = tail;
3499 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3500 if (GOMP_DIM_MASK (mode) & skip_mask)
3501 {
3502 rtx_code_label *label = gen_label_rtx ();
3503 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3504
3505 if (!pred)
3506 {
3507 pred = gen_reg_rtx (BImode);
3508 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3509 }
3510
3511 rtx br;
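/* Vector-mode skips use a per-lane branch, worker-mode skips the
unified branch (the worker predicate should be uniform across the
lanes of a warp). */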
3512 if (mode == GOMP_DIM_VECTOR)
3513 br = gen_br_true (pred, label);
3514 else
3515 br = gen_br_true_uni (pred, label);
3516 emit_insn_before (br, head);
3517
3518 LABEL_NUSES (label)++;
3519 if (tail_branch)
3520 before = emit_label_before (label, before);
3521 else
3522 emit_label_after (label, tail);
3523 }
3524
3525 /* Now deal with propagating the branch condition. */
3526 if (cond_branch)
3527 {
3528 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3529
3530 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3531 {
3532 /* Vector mode only, do a shuffle. */
3533 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3534 }
3535 else
3536 {
3537 /* Includes worker mode, do spill & fill. By construction
3538 we should never have worker mode only. */
3539 wcast_data_t data;
3540
3541 data.base = worker_bcast_sym;
3542 data.ptr = 0;
3543
3544 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3545 worker_bcast_size = GET_MODE_SIZE (SImode);
3546
3547 data.offset = 0;
3548 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3549 before);
3550 /* Barrier so other workers can see the write. */
3551 emit_insn_before (nvptx_wsync (false), tail);
3552 data.offset = 0;
3553 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3554 /* This barrier is needed to avoid worker zero clobbering
3555 the broadcast buffer before all the other workers have
3556 had a chance to read this instance of it. */
3557 emit_insn_before (nvptx_wsync (true), tail);
3558 }
3559
3560 extract_insn (tail);
3561 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3562 UNSPEC_BR_UNIFIED);
3563 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3564 }
3565 }
3566
3567 /* PAR is a parallel that is being skipped in its entirety according to
3568 MASK. Treat this as skipping a superblock starting at forked
3569 and ending at joining. */
3570
3571 static void
3572 nvptx_skip_par (unsigned mask, parallel *par)
3573 {
3574 basic_block tail = par->join_block;
3575 gcc_assert (tail->preds->length () == 1);
3576
3577 basic_block pre_tail = (*tail->preds)[0]->src;
3578 gcc_assert (pre_tail->succs->length () == 1);
3579
3580 nvptx_single (mask, par->forked_block, pre_tail);
3581 }
3582
3583 /* If PAR has a single inner parallel and PAR itself only contains
3584 empty entry and exit blocks, swallow the inner PAR. */
3585
3586 static void
3587 nvptx_optimize_inner (parallel *par)
3588 {
3589 parallel *inner = par->inner;
3590
3591 /* We mustn't be the outer dummy par. */
3592 if (!par->mask)
3593 return;
3594
3595 /* We must have a single inner par. */
3596 if (!inner || inner->next)
3597 return;
3598
3599 /* We must only contain 2 blocks ourselves -- the head and tail of
3600 the inner par. */
3601 if (par->blocks.length () != 2)
3602 return;
3603
3604 /* The two partitionings must be disjoint. As we only have vector and
3605 worker partitioning, this is sufficient to guarantee the pars
3606 have adjacent partitioning. */
3607 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3608 /* This indicates malformed code generation. */
3609 return;
3610
3611 /* The outer forked insn should be immediately followed by the inner
3612 fork insn. */
3613 rtx_insn *forked = par->forked_insn;
3614 rtx_insn *fork = BB_END (par->forked_block);
3615
3616 if (NEXT_INSN (forked) != fork)
3617 return;
3618 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3619
3620 /* The outer joining insn must immediately follow the inner join
3621 insn. */
3622 rtx_insn *joining = par->joining_insn;
3623 rtx_insn *join = inner->join_insn;
3624 if (NEXT_INSN (join) != joining)
3625 return;
3626
3627 /* Preconditions met. Swallow the inner par. */
3628 if (dump_file)
3629 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3630 inner->mask, inner->forked_block->index,
3631 inner->join_block->index,
3632 par->mask, par->forked_block->index, par->join_block->index);
3633
3634 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3635
3636 par->blocks.reserve (inner->blocks.length ());
3637 while (inner->blocks.length ())
3638 par->blocks.quick_push (inner->blocks.pop ());
3639
3640 par->inner = inner->inner;
3641 inner->inner = NULL;
3642
3643 delete inner;
3644 }
3645
3646 /* Process the parallel PAR and all its contained
3647 parallels. We do everything but the neutering. Return mask of
3648 partitioned modes used within this parallel. */
3649
3650 static unsigned
3651 nvptx_process_pars (parallel *par)
3652 {
3653 if (nvptx_optimize)
3654 nvptx_optimize_inner (par);
3655
3656 unsigned inner_mask = par->mask;
3657
3658 /* Do the inner parallels first. */
3659 if (par->inner)
3660 {
3661 par->inner_mask = nvptx_process_pars (par->inner);
3662 inner_mask |= par->inner_mask;
3663 }
3664
3665 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3666 /* No propagation needed for a call. */;
3667 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3668 {
3669 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3670 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3671 /* Insert begin and end synchronizations. */
3672 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3673 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3674 }
3675 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3676 nvptx_vpropagate (par->forked_block, par->forked_insn);
3677
3678 /* Now do siblings. */
3679 if (par->next)
3680 inner_mask |= nvptx_process_pars (par->next);
3681 return inner_mask;
3682 }
3683
3684 /* Neuter the parallel described by PAR. We recurse in depth-first
3685 order. MODES are the partitioning of the execution and OUTER is
3686 the partitioning of the parallels we are contained in. */
3687
3688 static void
3689 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3690 {
3691 unsigned me = (par->mask
3692 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3693 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3694 unsigned skip_mask = 0, neuter_mask = 0;
3695
3696 if (par->inner)
3697 nvptx_neuter_pars (par->inner, modes, outer | me);
3698
3699 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3700 {
3701 if ((outer | me) & GOMP_DIM_MASK (mode))
3702 {} /* Mode is partitioned: no neutering. */
3703 else if (!(modes & GOMP_DIM_MASK (mode)))
3704 {} /* Mode is not used: nothing to do. */
3705 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3706 || !par->forked_insn)
3707 /* Partitioned in inner parallels, or we're not partitioned
3708 at all: neuter individual blocks. */
3709 neuter_mask |= GOMP_DIM_MASK (mode);
3710 else if (!par->parent || !par->parent->forked_insn
3711 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3712 /* Parent isn't a parallel, or already contains this partitioning:
3713 skip the parallel at this level. */
3714 skip_mask |= GOMP_DIM_MASK (mode);
3715 else
3716 {} /* Parent will skip this parallel itself. */
3717 }
3718
3719 if (neuter_mask)
3720 {
3721 int ix, len;
3722
3723 if (nvptx_optimize)
3724 {
3725 /* Neuter whole SESE regions. */
3726 bb_pair_vec_t regions;
3727
3728 nvptx_find_sese (par->blocks, regions);
3729 len = regions.length ();
3730 for (ix = 0; ix != len; ix++)
3731 {
3732 basic_block from = regions[ix].first;
3733 basic_block to = regions[ix].second;
3734
3735 if (from)
3736 nvptx_single (neuter_mask, from, to);
3737 else
3738 gcc_assert (!to);
3739 }
3740 }
3741 else
3742 {
3743 /* Neuter each BB individually. */
3744 len = par->blocks.length ();
3745 for (ix = 0; ix != len; ix++)
3746 {
3747 basic_block block = par->blocks[ix];
3748
3749 nvptx_single (neuter_mask, block, block);
3750 }
3751 }
3752 }
3753
3754 if (skip_mask)
3755 nvptx_skip_par (skip_mask, par);
3756
3757 if (par->next)
3758 nvptx_neuter_pars (par->next, modes, outer);
3759 }
3760
3761 /* PTX-specific reorganization
3762 - Split blocks at fork and join instructions
3763 - Compute live registers
3764 - Mark now-unused registers, so function begin doesn't declare
3765 unused registers.
3766 - Insert state propagation when entering partitioned mode
3767 - Insert neutering instructions when in single mode
3768 - Replace subregs with suitable sequences.
3769 */
3770
3771 static void
3772 nvptx_reorg (void)
3773 {
3774 /* We are freeing block_for_insn in the toplev to keep compatibility
3775 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3776 compute_bb_for_insn ();
3777
3778 thread_prologue_and_epilogue_insns ();
3779
3780 /* Split blocks and record interesting unspecs. */
3781 bb_insn_map_t bb_insn_map;
3782
3783 nvptx_split_blocks (&bb_insn_map);
3784
3785 /* Compute live regs */
3786 df_clear_flags (DF_LR_RUN_DCE);
3787 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3788 df_live_add_problem ();
3789 df_live_set_all_dirty ();
3790 df_analyze ();
3791 regstat_init_n_sets_and_refs ();
3792
3793 if (dump_file)
3794 df_dump (dump_file);
3795
3796 /* Mark unused regs as unused. */
3797 int max_regs = max_reg_num ();
3798 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3799 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3800 regno_reg_rtx[i] = const0_rtx;
3801
3802 /* Determine launch dimensions of the function. If it is not an
3803 offloaded function (i.e. this is a regular compilation), the
3804 function has no neutering. */
3805 tree attr = get_oacc_fn_attrib (current_function_decl);
3806 if (attr)
3807 {
3808 /* If we determined this mask before RTL expansion, we could
3809 elide emission of some levels of forks and joins. */
3810 unsigned mask = 0;
3811 tree dims = TREE_VALUE (attr);
3812 unsigned ix;
3813
3814 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3815 {
3816 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3817 tree allowed = TREE_PURPOSE (dims);
3818
3819 if (size != 1 && !(allowed && integer_zerop (allowed)))
3820 mask |= GOMP_DIM_MASK (ix);
3821 }
3822 /* If there is worker neutering, there must be vector
3823 neutering. Otherwise the hardware will fail. */
3824 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3825 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3826
3827 /* Discover & process partitioned regions. */
3828 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3829 nvptx_process_pars (pars);
3830 nvptx_neuter_pars (pars, mask, 0);
3831 delete pars;
3832 }
3833
3834 /* Replace subregs. */
3835 nvptx_reorg_subreg ();
3836
3837 regstat_free_n_sets_and_refs ();
3838
3839 df_finish_pass (true);
3840 }
3841 \f
3842 /* Handle a "kernel" attribute; arguments as in
3843 struct attribute_spec.handler. */
3844
3845 static tree
3846 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3847 int ARG_UNUSED (flags), bool *no_add_attrs)
3848 {
3849 tree decl = *node;
3850
3851 if (TREE_CODE (decl) != FUNCTION_DECL)
3852 {
3853 error ("%qE attribute only applies to functions", name);
3854 *no_add_attrs = true;
3855 }
3856 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
3857 {
3858 error ("%qE attribute requires a void return type", name);
3859 *no_add_attrs = true;
3860 }
3861
3862 return NULL_TREE;
3863 }
3864
3865 /* Table of valid machine attributes. */
3866 static const struct attribute_spec nvptx_attribute_table[] =
3867 {
3868 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
3869 affects_type_identity } */
3870 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
3871 { NULL, 0, 0, false, false, false, NULL, false }
3872 };
3873 \f
3874 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
3875
3876 static HOST_WIDE_INT
3877 nvptx_vector_alignment (const_tree type)
3878 {
3879 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3880
3881 return MIN (align, BIGGEST_ALIGNMENT);
3882 }
3883
3884 /* Indicate that INSN cannot be duplicated. */
3885
3886 static bool
3887 nvptx_cannot_copy_insn_p (rtx_insn *insn)
3888 {
3889 switch (recog_memoized (insn))
3890 {
3891 case CODE_FOR_nvptx_shufflesi:
3892 case CODE_FOR_nvptx_shufflesf:
3893 case CODE_FOR_nvptx_barsync:
3894 case CODE_FOR_nvptx_fork:
3895 case CODE_FOR_nvptx_forked:
3896 case CODE_FOR_nvptx_joining:
3897 case CODE_FOR_nvptx_join:
3898 return true;
3899 default:
3900 return false;
3901 }
3902 }
3903
3904 /* Section anchors do not work. Initialization of flag_section_anchors
3905 probes the existence of the anchoring target hooks and prevents
3906 anchoring if they don't exist. However, we may be used together with
3907 a host-side compiler that does support anchoring, and hence see
3908 the anchor flag set (as it is not recalculated). So provide an
3909 implementation that denies anchoring. */
3910
3911 static bool
3912 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
3913 {
3914 return false;
3915 }
3916 \f
3917 /* Record a symbol for mkoffload to enter into the mapping table. */
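/* Illustratively (the names and dimension values here are made up),
   the records written by this function look like:
     //:VAR_MAP "a_mapped_variable"
     //:FUNC_MAP "an_offloaded_function", 0x1, 0x20, 0x20
   with one hex value per GOMP_DIM_* axis for functions; mkoffload
   scans for these markers when building its mapping table. */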
3918
3919 static void
3920 nvptx_record_offload_symbol (tree decl)
3921 {
3922 switch (TREE_CODE (decl))
3923 {
3924 case VAR_DECL:
3925 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
3926 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3927 break;
3928
3929 case FUNCTION_DECL:
3930 {
3931 tree attr = get_oacc_fn_attrib (decl);
3932 tree dims = TREE_VALUE (attr);
3933 unsigned ix;
3934
3935 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
3936 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
3937
3938 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3939 {
3940 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3941
3942 gcc_assert (!TREE_PURPOSE (dims));
3943 fprintf (asm_out_file, ", %#x", size);
3944 }
3945
3946 fprintf (asm_out_file, "\n");
3947 }
3948 break;
3949
3950 default:
3951 gcc_unreachable ();
3952 }
3953 }
3954
3955 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3956 at the start of a file. */
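/* On a 64-bit configuration (Pmode of 64 bits) the emitted preamble
   comes out as:
     // BEGIN PREAMBLE
     .version 3.1
     .target sm_30
     .address_size 64
     // END PREAMBLE  */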
3957
3958 static void
3959 nvptx_file_start (void)
3960 {
3961 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3962 fputs ("\t.version\t3.1\n", asm_out_file);
3963 fputs ("\t.target\tsm_30\n", asm_out_file);
3964 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3965 fputs ("// END PREAMBLE\n", asm_out_file);
3966 }
3967
3968 /* Emit a declaration for a worker-level buffer in .shared memory. */
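/* For example, a 128-byte buffer with 8-byte alignment would be
   emitted as (the symbol name here is illustrative):
     .shared .align 8 .u8 __some_worker_buffer[128];  */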
3969
3970 static void
3971 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
3972 {
3973 const char *name = XSTR (sym, 0);
3974
3975 write_var_marker (file, true, false, name);
3976 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
3977 align, name, size);
3978 }
3979
3980 /* Write out the function declarations we've collected and declare storage
3981 for the worker broadcast and reduction buffers. */
3982
3983 static void
3984 nvptx_file_end (void)
3985 {
3986 hash_table<tree_hasher>::iterator iter;
3987 tree decl;
3988 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
3989 nvptx_record_fndecl (decl);
3990 fputs (func_decls.str().c_str(), asm_out_file);
3991
3992 if (worker_bcast_size)
3993 write_worker_buffer (asm_out_file, worker_bcast_sym,
3994 worker_bcast_align, worker_bcast_size);
3995
3996 if (worker_red_size)
3997 write_worker_buffer (asm_out_file, worker_red_sym,
3998 worker_red_align, worker_red_size);
3999 }
4000
4001 /* Expander for the shuffle builtins. */
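/* These builtins are normally only generated internally (see the
   vector reduction code below), but a direct call would look roughly
   like
     unsigned y = __builtin_nvptx_shuffle (x, 16, 1);
   where 1 is SHUFFLE_DOWN, fetching X from the lane 16 positions
   away in the warp (an informal sketch of the intended use). */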
4002
4003 static rtx
4004 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4005 {
4006 if (ignore)
4007 return target;
4008
4009 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4010 NULL_RTX, mode, EXPAND_NORMAL);
4011 if (!REG_P (src))
4012 src = copy_to_mode_reg (mode, src);
4013
4014 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4015 NULL_RTX, SImode, EXPAND_NORMAL);
4016 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4017 NULL_RTX, SImode, EXPAND_NORMAL);
4018
4019 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4020 idx = copy_to_mode_reg (SImode, idx);
4021
4022 rtx pat = nvptx_gen_shuffle (target, src, idx,
4023 (nvptx_shuffle_kind) INTVAL (op));
4024 if (pat)
4025 emit_insn (pat);
4026
4027 return target;
4028 }
4029
4030 /* Worker reduction address expander. */
4031
4032 static rtx
4033 nvptx_expand_worker_addr (tree exp, rtx target,
4034 machine_mode ARG_UNUSED (mode), int ignore)
4035 {
4036 if (ignore)
4037 return target;
4038
4039 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4040 if (align > worker_red_align)
4041 worker_red_align = align;
4042
4043 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4044 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4045 if (size + offset > worker_red_size)
4046 worker_red_size = size + offset;
4047
4048 rtx addr = worker_red_sym;
4049 if (offset)
4050 {
4051 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4052 addr = gen_rtx_CONST (Pmode, addr);
4053 }
4054
4055 emit_move_insn (target, addr);
4056
4057 return target;
4058 }
4059
4060 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4061 not require taking the address of any object, other than the memory
4062 cell being operated on. */
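/* A sketch of the intended use (the real uses are generated by the
   reduction lock code below):
     unsigned old = __builtin_nvptx_cmp_swap (&word, expected, desired);
   atomically stores DESIRED into WORD when it currently holds
   EXPECTED, and returns the previous contents either way. */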
4063
4064 static rtx
4065 nvptx_expand_cmp_swap (tree exp, rtx target,
4066 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4067 {
4068 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4069
4070 if (!target)
4071 target = gen_reg_rtx (mode);
4072
4073 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4074 NULL_RTX, Pmode, EXPAND_NORMAL);
4075 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4076 NULL_RTX, mode, EXPAND_NORMAL);
4077 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4078 NULL_RTX, mode, EXPAND_NORMAL);
4079 rtx pat;
4080
4081 mem = gen_rtx_MEM (mode, mem);
4082 if (!REG_P (cmp))
4083 cmp = copy_to_mode_reg (mode, cmp);
4084 if (!REG_P (src))
4085 src = copy_to_mode_reg (mode, src);
4086
4087 if (mode == SImode)
4088 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4089 else
4090 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4091
4092 emit_insn (pat);
4093
4094 return target;
4095 }
4096
4097
4098 /* Codes for all the NVPTX builtins. */
4099 enum nvptx_builtins
4100 {
4101 NVPTX_BUILTIN_SHUFFLE,
4102 NVPTX_BUILTIN_SHUFFLELL,
4103 NVPTX_BUILTIN_WORKER_ADDR,
4104 NVPTX_BUILTIN_CMP_SWAP,
4105 NVPTX_BUILTIN_CMP_SWAPLL,
4106 NVPTX_BUILTIN_MAX
4107 };
4108
4109 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4110
4111 /* Return the NVPTX builtin for CODE. */
4112
4113 static tree
4114 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4115 {
4116 if (code >= NVPTX_BUILTIN_MAX)
4117 return error_mark_node;
4118
4119 return nvptx_builtin_decls[code];
4120 }
4121
4122 /* Set up all builtin functions for this target. */
4123
4124 static void
4125 nvptx_init_builtins (void)
4126 {
4127 #define DEF(ID, NAME, T) \
4128 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4129 = add_builtin_function ("__builtin_nvptx_" NAME, \
4130 build_function_type_list T, \
4131 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4132 #define ST sizetype
4133 #define UINT unsigned_type_node
4134 #define LLUINT long_long_unsigned_type_node
4135 #define PTRVOID ptr_type_node
4136
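  /* As an illustration, the first DEF below registers
     __builtin_nvptx_shuffle with the C signature
       unsigned (unsigned, unsigned, unsigned)
     (the return type comes first in build_function_type_list). */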
4137 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4138 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4139 DEF (WORKER_ADDR, "worker_addr",
4140 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4141 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4142 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4143
4144 #undef DEF
4145 #undef ST
4146 #undef UINT
4147 #undef LLUINT
4148 #undef PTRVOID
4149 }
4150
4151 /* Expand an expression EXP that calls a built-in function,
4152 with result going to TARGET if that's convenient
4153 (and in mode MODE if that's convenient).
4154 SUBTARGET may be used as the target for computing one of EXP's operands.
4155 IGNORE is nonzero if the value is to be ignored. */
4156
4157 static rtx
4158 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4159 machine_mode mode, int ignore)
4160 {
4161 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4162 switch (DECL_FUNCTION_CODE (fndecl))
4163 {
4164 case NVPTX_BUILTIN_SHUFFLE:
4165 case NVPTX_BUILTIN_SHUFFLELL:
4166 return nvptx_expand_shuffle (exp, target, mode, ignore);
4167
4168 case NVPTX_BUILTIN_WORKER_ADDR:
4169 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4170
4171 case NVPTX_BUILTIN_CMP_SWAP:
4172 case NVPTX_BUILTIN_CMP_SWAPLL:
4173 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4174
4175 default: gcc_unreachable ();
4176 }
4177 }
4178 \f
4179 /* Define dimension sizes for known hardware. */
4180 #define PTX_VECTOR_LENGTH 32
4181 #define PTX_WORKER_LENGTH 32
4182 #define PTX_GANG_DEFAULT 32
4183
4184 /* Validate compute dimensions of an OpenACC offload or routine, fill
4185 in non-unity defaults. FN_LEVEL indicates the level at which a
4186 routine might spawn a loop. It is negative for non-routines. If
4187 DECL is null, we are validating the default dimensions. */
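/* For instance, with the limits above a region requesting
   vector_length (64) is diagnosed roughly as
     warning: using vector_length (32), ignoring 64
   and the vector dimension is clamped back to PTX_VECTOR_LENGTH
   (an illustration of the checks below, not verbatim output). */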
4188
4189 static bool
4190 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4191 {
4192 bool changed = false;
4193
4194 /* The vector size must be 32, unless this is a SEQ routine. */
4195 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4196 && dims[GOMP_DIM_VECTOR] >= 0
4197 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4198 {
4199 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4200 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4201 dims[GOMP_DIM_VECTOR]
4202 ? "using vector_length (%d), ignoring %d"
4203 : "using vector_length (%d), ignoring runtime setting",
4204 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4205 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4206 changed = true;
4207 }
4208
4209 /* Check that the requested number of workers is not too large. */
4210 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4211 {
4212 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
4213 "using num_workers (%d), ignoring %d",
4214 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4215 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4216 changed = true;
4217 }
4218
4219 if (!decl)
4220 {
4221 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4222 if (dims[GOMP_DIM_WORKER] < 0)
4223 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4224 if (dims[GOMP_DIM_GANG] < 0)
4225 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4226 changed = true;
4227 }
4228
4229 return changed;
4230 }
4231
4232 /* Return maximum dimension size, or zero for unbounded. */
4233
4234 static int
4235 nvptx_dim_limit (int axis)
4236 {
4237 switch (axis)
4238 {
4239 case GOMP_DIM_WORKER:
4240 return PTX_WORKER_LENGTH;
4241
4242 case GOMP_DIM_VECTOR:
4243 return PTX_VECTOR_LENGTH;
4244
4245 default:
4246 break;
4247 }
4248 return 0;
4249 }
4250
4251 /* Determine whether fork & join markers are needed. */
4252
4253 static bool
4254 nvptx_goacc_fork_join (gcall *call, const int dims[],
4255 bool ARG_UNUSED (is_fork))
4256 {
4257 tree arg = gimple_call_arg (call, 2);
4258 unsigned axis = TREE_INT_CST_LOW (arg);
4259
4260 /* We only care about worker and vector partitioning. */
4261 if (axis < GOMP_DIM_WORKER)
4262 return false;
4263
4264 /* If the size is 1, there's no partitioning. */
4265 if (dims[axis] == 1)
4266 return false;
4267
4268 return true;
4269 }
4270
4271 /* Generate a PTX builtin function call that returns the address in
4272 the worker reduction buffer at OFFSET. TYPE is the type of the
4273 data at that location. */
4274
4275 static tree
4276 nvptx_get_worker_red_addr (tree type, tree offset)
4277 {
4278 machine_mode mode = TYPE_MODE (type);
4279 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4280 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4281 tree align = build_int_cst (unsigned_type_node,
4282 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4283 tree call = build_call_expr (fndecl, 3, offset, size, align);
4284
4285 return fold_convert (build_pointer_type (type), call);
4286 }
4287
4288 /* Emit a SHFL.DOWN of VAR into DEST_VAR using shift amount SHIFT. This
4289 function will cast the variable if necessary. */
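/* For a double operand the generated gimple is roughly (a sketch only):
     tmp  = VIEW_CONVERT_EXPR<long long unsigned int> (var);
     tmp2 = __builtin_nvptx_shufflell (tmp, shift, SHUFFLE_DOWN);
     dest = VIEW_CONVERT_EXPR<double> (tmp2);  */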
4290
4291 static void
4292 nvptx_generate_vector_shuffle (location_t loc,
4293 tree dest_var, tree var, unsigned shift,
4294 gimple_seq *seq)
4295 {
4296 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4297 tree_code code = NOP_EXPR;
4298 tree arg_type = unsigned_type_node;
4299 tree var_type = TREE_TYPE (var);
4300 tree dest_type = var_type;
4301
4302 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4303 var_type = TREE_TYPE (var_type);
4304
4305 if (TREE_CODE (var_type) == REAL_TYPE)
4306 code = VIEW_CONVERT_EXPR;
4307
4308 if (TYPE_SIZE (var_type)
4309 == TYPE_SIZE (long_long_unsigned_type_node))
4310 {
4311 fn = NVPTX_BUILTIN_SHUFFLELL;
4312 arg_type = long_long_unsigned_type_node;
4313 }
4314
4315 tree call = nvptx_builtin_decl (fn, true);
4316 tree bits = build_int_cst (unsigned_type_node, shift);
4317 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4318 tree expr;
4319
4320 if (var_type != dest_type)
4321 {
4322 /* Do real and imaginary parts separately. */
4323 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4324 real = fold_build1 (code, arg_type, real);
4325 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4326 real = fold_build1 (code, var_type, real);
4327
4328 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4329 imag = fold_build1 (code, arg_type, imag);
4330 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4331 imag = fold_build1 (code, var_type, imag);
4332
4333 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4334 }
4335 else
4336 {
4337 expr = fold_build1 (code, arg_type, var);
4338 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4339 expr = fold_build1 (code, dest_type, expr);
4340 }
4341
4342 gimplify_assign (dest_var, expr, seq);
4343 }
4344
4345 /* Lazily generate the global lock var decl and return its address. */
4346
4347 static tree
4348 nvptx_global_lock_addr ()
4349 {
4350 tree v = global_lock_var;
4351
4352 if (!v)
4353 {
4354 tree name = get_identifier ("__reduction_lock");
4355 tree type = build_qualified_type (unsigned_type_node,
4356 TYPE_QUAL_VOLATILE);
4357 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4358 global_lock_var = v;
4359 DECL_ARTIFICIAL (v) = 1;
4360 DECL_EXTERNAL (v) = 1;
4361 TREE_STATIC (v) = 1;
4362 TREE_PUBLIC (v) = 1;
4363 TREE_USED (v) = 1;
4364 mark_addressable (v);
4365 mark_decl_referenced (v);
4366 }
4367
4368 return build_fold_addr_expr (v);
4369 }
4370
4371 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
4372 GSI. We use a lockless scheme for nearly all cases, which looks
4373 like:
4374 actual = initval(OP);
4375 do {
4376 guess = actual;
4377 write = guess OP myval;
4378 actual = cmp&swap (ptr, guess, write)
4379 } while (actual bit-different-to guess);
4380 return write;
4381
4382 This relies on a cmp&swap instruction, which is available for 32-
4383 and 64-bit types. Larger types must use a locking scheme. */
4384
4385 static tree
4386 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4387 tree ptr, tree var, tree_code op)
4388 {
4389 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4390 tree_code code = NOP_EXPR;
4391 tree arg_type = unsigned_type_node;
4392 tree var_type = TREE_TYPE (var);
4393
4394 if (TREE_CODE (var_type) == COMPLEX_TYPE
4395 || TREE_CODE (var_type) == REAL_TYPE)
4396 code = VIEW_CONVERT_EXPR;
4397
4398 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
4399 {
4400 arg_type = long_long_unsigned_type_node;
4401 fn = NVPTX_BUILTIN_CMP_SWAPLL;
4402 }
4403
4404 tree swap_fn = nvptx_builtin_decl (fn, true);
4405
4406 gimple_seq init_seq = NULL;
4407 tree init_var = make_ssa_name (arg_type);
4408 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4409 init_expr = fold_build1 (code, arg_type, init_expr);
4410 gimplify_assign (init_var, init_expr, &init_seq);
4411 gimple *init_end = gimple_seq_last (init_seq);
4412
4413 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4414
4415 /* Split the block just after the init stmts. */
4416 basic_block pre_bb = gsi_bb (*gsi);
4417 edge pre_edge = split_block (pre_bb, init_end);
4418 basic_block loop_bb = pre_edge->dest;
4419 pre_bb = pre_edge->src;
4420 /* Reset the iterator. */
4421 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4422
4423 tree expect_var = make_ssa_name (arg_type);
4424 tree actual_var = make_ssa_name (arg_type);
4425 tree write_var = make_ssa_name (arg_type);
4426
4427 /* Build and insert the reduction calculation. */
4428 gimple_seq red_seq = NULL;
4429 tree write_expr = fold_build1 (code, var_type, expect_var);
4430 write_expr = fold_build2 (op, var_type, write_expr, var);
4431 write_expr = fold_build1 (code, arg_type, write_expr);
4432 gimplify_assign (write_var, write_expr, &red_seq);
4433
4434 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4435
4436 /* Build & insert the cmp&swap sequence. */
4437 gimple_seq latch_seq = NULL;
4438 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4439 ptr, expect_var, write_var);
4440 gimplify_assign (actual_var, swap_expr, &latch_seq);
4441
4442 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4443 NULL_TREE, NULL_TREE);
4444 gimple_seq_add_stmt (&latch_seq, cond);
4445
4446 gimple *latch_end = gimple_seq_last (latch_seq);
4447 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
4448
4449 /* Split the block just after the latch stmts. */
4450 edge post_edge = split_block (loop_bb, latch_end);
4451 basic_block post_bb = post_edge->dest;
4452 loop_bb = post_edge->src;
4453 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4454
4455 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4456 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4457 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4458 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4459
4460 gphi *phi = create_phi_node (expect_var, loop_bb);
4461 add_phi_arg (phi, init_var, pre_edge, loc);
4462 add_phi_arg (phi, actual_var, loop_edge, loc);
4463
4464 loop *loop = alloc_loop ();
4465 loop->header = loop_bb;
4466 loop->latch = loop_bb;
4467 add_loop (loop, loop_bb->loop_father);
4468
4469 return fold_build1 (code, var_type, write_var);
4470 }
4471
4472 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
4473 GSI. This is necessary for types larger than 64 bits, where there
4474 is no cmp&swap instruction to implement a lockless scheme. We use
4475 a lock variable in global memory.
4476
4477 while (cmp&swap (&lock_var, 0, 1))
4478 continue;
4479 T accum = *ptr;
4480 accum = accum OP var;
4481 *ptr = accum;
4482 cmp&swap (&lock_var, 1, 0);
4483 return accum;
4484
4485 A lock in global memory is necessary to force execution engine
4486 descheduling and avoid resource starvation that can occur if the
4487 lock is in .shared memory. */
4488
4489 static tree
4490 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4491 tree ptr, tree var, tree_code op)
4492 {
4493 tree var_type = TREE_TYPE (var);
4494 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4495 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4496 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4497
4498 /* Split the block just before the gsi. Insert a gimple nop to make
4499 this easier. */
4500 gimple *nop = gimple_build_nop ();
4501 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4502 basic_block entry_bb = gsi_bb (*gsi);
4503 edge entry_edge = split_block (entry_bb, nop);
4504 basic_block lock_bb = entry_edge->dest;
4505 /* Reset the iterator. */
4506 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4507
4508 /* Build and insert the locking sequence. */
4509 gimple_seq lock_seq = NULL;
4510 tree lock_var = make_ssa_name (unsigned_type_node);
4511 tree lock_expr = nvptx_global_lock_addr ();
4512 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4513 uns_unlocked, uns_locked);
4514 gimplify_assign (lock_var, lock_expr, &lock_seq);
4515 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4516 NULL_TREE, NULL_TREE);
4517 gimple_seq_add_stmt (&lock_seq, cond);
4518 gimple *lock_end = gimple_seq_last (lock_seq);
4519 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4520
4521 /* Split the block just after the lock sequence. */
4522 edge locked_edge = split_block (lock_bb, lock_end);
4523 basic_block update_bb = locked_edge->dest;
4524 lock_bb = locked_edge->src;
4525 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4526
4527 /* Create the lock loop ... */
4528 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4529 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4530 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4531 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4532
4533 /* ... and the loop structure. */
4534 loop *lock_loop = alloc_loop ();
4535 lock_loop->header = lock_bb;
4536 lock_loop->latch = lock_bb;
4537 lock_loop->nb_iterations_estimate = 1;
4538 lock_loop->any_estimate = true;
4539 add_loop (lock_loop, entry_bb->loop_father);
4540
4541 /* Build and insert the reduction calculation. */
4542 gimple_seq red_seq = NULL;
4543 tree acc_in = make_ssa_name (var_type);
4544 tree ref_in = build_simple_mem_ref (ptr);
4545 TREE_THIS_VOLATILE (ref_in) = 1;
4546 gimplify_assign (acc_in, ref_in, &red_seq);
4547
4548 tree acc_out = make_ssa_name (var_type);
4549 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4550 gimplify_assign (acc_out, update_expr, &red_seq);
4551
4552 tree ref_out = build_simple_mem_ref (ptr);
4553 TREE_THIS_VOLATILE (ref_out) = 1;
4554 gimplify_assign (ref_out, acc_out, &red_seq);
4555
4556 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4557
4558 /* Build & insert the unlock sequence. */
4559 gimple_seq unlock_seq = NULL;
4560 tree unlock_expr = nvptx_global_lock_addr ();
4561 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4562 uns_locked, uns_unlocked);
4563 gimplify_and_add (unlock_expr, &unlock_seq);
4564 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4565
4566 return acc_out;
4567 }
4568
4569 /* Emit a sequence to update a reduction accumulator at *PTR with the
4570 value held in VAR using operator OP. Return the updated value.
4571
4572 TODO: optimize for atomic ops and independent complex ops. */
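/* The dispatch below is purely by size: 32- and 64-bit values take
   the lockless cmp&swap loop above, anything larger (e.g. complex
   double) falls back to the global-lock path. */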
4573
4574 static tree
4575 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4576 tree ptr, tree var, tree_code op)
4577 {
4578 tree type = TREE_TYPE (var);
4579 tree size = TYPE_SIZE (type);
4580
4581 if (size == TYPE_SIZE (unsigned_type_node)
4582 || size == TYPE_SIZE (long_long_unsigned_type_node))
4583 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4584 else
4585 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
4586 }
4587
4588 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4589
4590 static void
4591 nvptx_goacc_reduction_setup (gcall *call)
4592 {
4593 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4594 tree lhs = gimple_call_lhs (call);
4595 tree var = gimple_call_arg (call, 2);
4596 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4597 gimple_seq seq = NULL;
4598
4599 push_gimplify_context (true);
4600
4601 if (level != GOMP_DIM_GANG)
4602 {
4603 /* Copy the receiver object. */
4604 tree ref_to_res = gimple_call_arg (call, 1);
4605
4606 if (!integer_zerop (ref_to_res))
4607 var = build_simple_mem_ref (ref_to_res);
4608 }
4609
4610 if (level == GOMP_DIM_WORKER)
4611 {
4612 /* Store incoming value to worker reduction buffer. */
4613 tree offset = gimple_call_arg (call, 5);
4614 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4615 tree ptr = make_ssa_name (TREE_TYPE (call));
4616
4617 gimplify_assign (ptr, call, &seq);
4618 tree ref = build_simple_mem_ref (ptr);
4619 TREE_THIS_VOLATILE (ref) = 1;
4620 gimplify_assign (ref, var, &seq);
4621 }
4622
4623 if (lhs)
4624 gimplify_assign (lhs, var, &seq);
4625
4626 pop_gimplify_context (NULL);
4627 gsi_replace_with_seq (&gsi, seq, true);
4628 }
4629
4630 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
4631
4632 static void
4633 nvptx_goacc_reduction_init (gcall *call)
4634 {
4635 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4636 tree lhs = gimple_call_lhs (call);
4637 tree var = gimple_call_arg (call, 2);
4638 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4639 enum tree_code rcode
4640 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4641 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4642 TREE_TYPE (var));
4643 gimple_seq seq = NULL;
4644
4645 push_gimplify_context (true);
4646
4647 if (level == GOMP_DIM_VECTOR)
4648 {
4649 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
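/* The control flow built here has the effect (informally) of
     lhs = (tid != 0) ? INIT_VAL (OP) : var;
   i.e. lane 0 keeps the incoming value while the other lanes start
   from the operator's neutral element; the phi below merges them. */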
4650 tree tid = make_ssa_name (integer_type_node);
4651 tree dim_vector = gimple_call_arg (call, 3);
4652 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4653 dim_vector);
4654 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4655 NULL_TREE, NULL_TREE);
4656
4657 gimple_call_set_lhs (tid_call, tid);
4658 gimple_seq_add_stmt (&seq, tid_call);
4659 gimple_seq_add_stmt (&seq, cond_stmt);
4660
4661 /* Split the block just after the call. */
4662 edge init_edge = split_block (gsi_bb (gsi), call);
4663 basic_block init_bb = init_edge->dest;
4664 basic_block call_bb = init_edge->src;
4665
4666 /* Fixup flags from call_bb to init_bb. */
4667 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
4668
4669 /* Set the initialization stmts. */
4670 gimple_seq init_seq = NULL;
4671 tree init_var = make_ssa_name (TREE_TYPE (var));
4672 gimplify_assign (init_var, init, &init_seq);
4673 gsi = gsi_start_bb (init_bb);
4674 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
4675
4676 /* Split block just after the init stmt. */
4677 gsi_prev (&gsi);
4678 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
4679 basic_block dst_bb = inited_edge->dest;
4680
4681 /* Create false edge from call_bb to dst_bb. */
4682 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
4683
4684 /* Create phi node in dst block. */
4685 gphi *phi = create_phi_node (lhs, dst_bb);
4686 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
4687 add_phi_arg (phi, var, nop_edge, gimple_location (call));
4688
4689 /* Reset dominator of dst bb. */
4690 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
4691
4692 /* Reset the gsi. */
4693 gsi = gsi_for_stmt (call);
4694 }
4695 else
4696 {
4697 if (level == GOMP_DIM_GANG)
4698 {
4699 /* If there's no receiver object, propagate the incoming VAR. */
4700 tree ref_to_res = gimple_call_arg (call, 1);
4701 if (integer_zerop (ref_to_res))
4702 init = var;
4703 }
4704
4705 gimplify_assign (lhs, init, &seq);
4706 }
4707
4708 pop_gimplify_context (NULL);
4709 gsi_replace_with_seq (&gsi, seq, true);
4710 }
4711
4712 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
4713
4714 static void
4715 nvptx_goacc_reduction_fini (gcall *call)
4716 {
4717 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4718 tree lhs = gimple_call_lhs (call);
4719 tree ref_to_res = gimple_call_arg (call, 1);
4720 tree var = gimple_call_arg (call, 2);
4721 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4722 enum tree_code op
4723 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4724 gimple_seq seq = NULL;
4725 tree r = NULL_TREE;
4726
4727 push_gimplify_context (true);
4728
4729 if (level == GOMP_DIM_VECTOR)
4730 {
4731 /* Emit binary shuffle tree. TODO: emit this as an actual loop,
4732 but that requires a method of emitting a unified jump at the
4733 gimple level. */
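/* With the 32-lane vector length the loop below emits shuffles with
   shifts 16, 8, 4, 2 and 1, folding the value from the distant lane
   into VAR each time, so that lane 0 ends up holding the reduction
   of the whole warp (an informal description of the effect). */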
4734 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
4735 {
4736 tree other_var = make_ssa_name (TREE_TYPE (var));
4737 nvptx_generate_vector_shuffle (gimple_location (call),
4738 other_var, var, shfl, &seq);
4739
4740 r = make_ssa_name (TREE_TYPE (var));
4741 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
4742 var, other_var), &seq);
4743 var = r;
4744 }
4745 }
4746 else
4747 {
4748 tree accum = NULL_TREE;
4749
4750 if (level == GOMP_DIM_WORKER)
4751 {
4752 /* Get reduction buffer address. */
4753 tree offset = gimple_call_arg (call, 5);
4754 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4755 tree ptr = make_ssa_name (TREE_TYPE (call));
4756
4757 gimplify_assign (ptr, call, &seq);
4758 accum = ptr;
4759 }
4760 else if (integer_zerop (ref_to_res))
4761 r = var;
4762 else
4763 accum = ref_to_res;
4764
4765 if (accum)
4766 {
4767 /* UPDATE the accumulator. */
4768 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4769 seq = NULL;
4770 r = nvptx_reduction_update (gimple_location (call), &gsi,
4771 accum, var, op);
4772 }
4773 }
4774
4775 if (lhs)
4776 gimplify_assign (lhs, r, &seq);
4777 pop_gimplify_context (NULL);
4778
4779 gsi_replace_with_seq (&gsi, seq, true);
4780 }
4781
4782 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
4783
4784 static void
4785 nvptx_goacc_reduction_teardown (gcall *call)
4786 {
4787 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4788 tree lhs = gimple_call_lhs (call);
4789 tree var = gimple_call_arg (call, 2);
4790 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4791 gimple_seq seq = NULL;
4792
4793 push_gimplify_context (true);
4794 if (level == GOMP_DIM_WORKER)
4795 {
4796 /* Read the worker reduction buffer. */
4797 tree offset = gimple_call_arg (call, 5);
4798 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4799 tree ptr = make_ssa_name (TREE_TYPE (call));
4800
4801 gimplify_assign (ptr, call, &seq);
4802 var = build_simple_mem_ref (ptr);
4803 TREE_THIS_VOLATILE (var) = 1;
4804 }
4805
4806 if (level != GOMP_DIM_GANG)
4807 {
4808 /* Write to the receiver object. */
4809 tree ref_to_res = gimple_call_arg (call, 1);
4810
4811 if (!integer_zerop (ref_to_res))
4812 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
4813 }
4814
4815 if (lhs)
4816 gimplify_assign (lhs, var, &seq);
4817
4818 pop_gimplify_context (NULL);
4819
4820 gsi_replace_with_seq (&gsi, seq, true);
4821 }
4822
4823 /* NVPTX reduction expander. */
4824
4825 static void
4826 nvptx_goacc_reduction (gcall *call)
4827 {
4828 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
4829
4830 switch (code)
4831 {
4832 case IFN_GOACC_REDUCTION_SETUP:
4833 nvptx_goacc_reduction_setup (call);
4834 break;
4835
4836 case IFN_GOACC_REDUCTION_INIT:
4837 nvptx_goacc_reduction_init (call);
4838 break;
4839
4840 case IFN_GOACC_REDUCTION_FINI:
4841 nvptx_goacc_reduction_fini (call);
4842 break;
4843
4844 case IFN_GOACC_REDUCTION_TEARDOWN:
4845 nvptx_goacc_reduction_teardown (call);
4846 break;
4847
4848 default:
4849 gcc_unreachable ();
4850 }
4851 }
4852
4853 #undef TARGET_OPTION_OVERRIDE
4854 #define TARGET_OPTION_OVERRIDE nvptx_option_override
4855
4856 #undef TARGET_ATTRIBUTE_TABLE
4857 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
4858
4859 #undef TARGET_LRA_P
4860 #define TARGET_LRA_P hook_bool_void_false
4861
4862 #undef TARGET_LEGITIMATE_ADDRESS_P
4863 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
4864
4865 #undef TARGET_PROMOTE_FUNCTION_MODE
4866 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
4867
4868 #undef TARGET_FUNCTION_ARG
4869 #define TARGET_FUNCTION_ARG nvptx_function_arg
4870 #undef TARGET_FUNCTION_INCOMING_ARG
4871 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
4872 #undef TARGET_FUNCTION_ARG_ADVANCE
4873 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
4874 #undef TARGET_FUNCTION_ARG_BOUNDARY
4875 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
4876 #undef TARGET_PASS_BY_REFERENCE
4877 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
4878 #undef TARGET_FUNCTION_VALUE_REGNO_P
4879 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
4880 #undef TARGET_FUNCTION_VALUE
4881 #define TARGET_FUNCTION_VALUE nvptx_function_value
4882 #undef TARGET_LIBCALL_VALUE
4883 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
4884 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
4885 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
4886 #undef TARGET_GET_DRAP_RTX
4887 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
4888 #undef TARGET_SPLIT_COMPLEX_ARG
4889 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
4890 #undef TARGET_RETURN_IN_MEMORY
4891 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
4892 #undef TARGET_OMIT_STRUCT_RETURN_REG
4893 #define TARGET_OMIT_STRUCT_RETURN_REG true
4894 #undef TARGET_STRICT_ARGUMENT_NAMING
4895 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
4896 #undef TARGET_CALL_ARGS
4897 #define TARGET_CALL_ARGS nvptx_call_args
4898 #undef TARGET_END_CALL_ARGS
4899 #define TARGET_END_CALL_ARGS nvptx_end_call_args
4900
4901 #undef TARGET_ASM_FILE_START
4902 #define TARGET_ASM_FILE_START nvptx_file_start
4903 #undef TARGET_ASM_FILE_END
4904 #define TARGET_ASM_FILE_END nvptx_file_end
4905 #undef TARGET_ASM_GLOBALIZE_LABEL
4906 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
4907 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
4908 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
4909 #undef TARGET_PRINT_OPERAND
4910 #define TARGET_PRINT_OPERAND nvptx_print_operand
4911 #undef TARGET_PRINT_OPERAND_ADDRESS
4912 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
4913 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
4914 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
4915 #undef TARGET_ASM_INTEGER
4916 #define TARGET_ASM_INTEGER nvptx_assemble_integer
4917 #undef TARGET_ASM_DECL_END
4918 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
4919 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
4920 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
4921 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
4922 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
4923 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
4924 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
4925
4926 #undef TARGET_MACHINE_DEPENDENT_REORG
4927 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
4928 #undef TARGET_NO_REGISTER_ALLOCATION
4929 #define TARGET_NO_REGISTER_ALLOCATION true
4930
4931 #undef TARGET_ENCODE_SECTION_INFO
4932 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
4933 #undef TARGET_RECORD_OFFLOAD_SYMBOL
4934 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
4935
4936 #undef TARGET_VECTOR_ALIGNMENT
4937 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
4938
4939 #undef TARGET_CANNOT_COPY_INSN_P
4940 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
4941
4942 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
4943 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
4944
4945 #undef TARGET_INIT_BUILTINS
4946 #define TARGET_INIT_BUILTINS nvptx_init_builtins
4947 #undef TARGET_EXPAND_BUILTIN
4948 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
4949 #undef TARGET_BUILTIN_DECL
4950 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
4951
4952 #undef TARGET_GOACC_VALIDATE_DIMS
4953 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
4954
4955 #undef TARGET_GOACC_DIM_LIMIT
4956 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
4957
4958 #undef TARGET_GOACC_FORK_JOIN
4959 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
4960
4961 #undef TARGET_GOACC_REDUCTION
4962 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
4963
4964 struct gcc_target targetm = TARGET_INITIALIZER;
4965
4966 #include "gt-nvptx.h"