1 /* Target code for NVPTX.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
64 #include "dumpfile.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
68 #include "attribs.h"
69 #include "tree-vrp.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
72 #include "gimplify.h"
73 #include "tree-phinodes.h"
74 #include "cfgloop.h"
75 #include "fold-const.h"
76 #include "intl.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
84
85 /* The PTX concept CTA (Cooperative Thread Array) maps to the CUDA concept
86 thread block, which has had a maximum number of threads of 1024 since CUDA
87 version 2.x. */
88 #define PTX_CTA_SIZE 1024
89
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
92
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
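/* I.e. barrier 0 is reserved for CTA-wide synchronization, leaving
   barriers 1 to 15 available for per-worker synchronization.  */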
97
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
102
103 /* The various PTX memory areas an object might reside in. */
104 enum nvptx_data_area
105 {
106 DATA_AREA_GENERIC,
107 DATA_AREA_GLOBAL,
108 DATA_AREA_SHARED,
109 DATA_AREA_LOCAL,
110 DATA_AREA_CONST,
111 DATA_AREA_PARAM,
112 DATA_AREA_MAX
113 };
114
115 rtx
116 gen_set_softstack_insn (rtx op)
117 {
118 gcc_assert (GET_MODE (op) == Pmode);
119 if (GET_MODE (op) == DImode)
120 return gen_set_softstack_di (op);
121 else if (GET_MODE (op) == SImode)
122 return gen_set_softstack_si (op);
123 else
124 gcc_unreachable ();
125 }
126
127 /* We record the data area in the target symbol flags. */
128 #define SYMBOL_DATA_AREA(SYM) \
129 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
130 & 7)
131 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
132 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
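/* A minimal usage sketch (with a hypothetical SYMBOL_REF 'sym'):

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   The area occupies the three flag bits above SYMBOL_FLAG_MACH_DEP_SHIFT,
   which is enough to encode every value up to DATA_AREA_MAX.  */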
133
134 /* Record the function decls we've written, and the libfuncs and function
135 decls corresponding to them. */
136 static std::stringstream func_decls;
137
138 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
139 {
140 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
141 static bool equal (rtx a, rtx b) { return a == b; }
142 };
143
144 static GTY((cache))
145 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
146
147 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
148 {
149 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
150 static bool equal (tree a, tree b) { return a == b; }
151 };
152
153 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
154 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
155
156 /* Buffer needed to broadcast across workers and vectors. This is
157 used for both worker-neutering and worker broadcasting, and
158 vector-neutering and vector broadcasting when vector_length > 32. It
159 is shared by all functions emitted. The buffer is placed in shared
160 memory. It'd be nice if PTX supported common blocks, because then
161 this could be shared across TUs (taking the largest size). */
162 static unsigned oacc_bcast_size;
163 static unsigned oacc_bcast_partition;
164 static unsigned oacc_bcast_align;
165 static GTY(()) rtx oacc_bcast_sym;
166
167 /* Buffer needed for worker reductions. This has to be distinct from
168 the worker broadcast array, as both may be live concurrently. */
169 static unsigned worker_red_size;
170 static unsigned worker_red_align;
171 static GTY(()) rtx worker_red_sym;
172
173 /* Buffer needed for vector reductions, when vector_length >
174 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
175 array, as both may be live concurrently. */
176 static unsigned vector_red_size;
177 static unsigned vector_red_align;
178 static unsigned vector_red_partition;
179 static GTY(()) rtx vector_red_sym;
180
181 /* Global lock variable, needed for 128bit worker & gang reductions. */
182 static GTY(()) tree global_lock_var;
183
184 /* True if any function references __nvptx_stacks. */
185 static bool need_softstack_decl;
186
187 /* True if any function references __nvptx_uni. */
188 static bool need_unisimt_decl;
189
190 static int nvptx_mach_max_workers ();
191
192 /* Allocate a new, cleared machine_function structure. */
193
194 static struct machine_function *
195 nvptx_init_machine_status (void)
196 {
197 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
198 p->return_mode = VOIDmode;
199 return p;
200 }
201
202 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
203 and -fopenacc is also enabled. */
204
205 static void
206 diagnose_openacc_conflict (bool optval, const char *optname)
207 {
208 if (flag_openacc && optval)
209 error ("option %s is not supported together with %<-fopenacc%>", optname);
210 }
211
212 /* Implement TARGET_OPTION_OVERRIDE. */
213
214 static void
215 nvptx_option_override (void)
216 {
217 init_machine_status = nvptx_init_machine_status;
218
219 /* Set toplevel_reorder, unless explicitly disabled. We need
220 reordering so that we emit necessary assembler decls of
221 undeclared variables. */
222 if (!global_options_set.x_flag_toplevel_reorder)
223 flag_toplevel_reorder = 1;
224
225 debug_nonbind_markers_p = 0;
226
227 /* Set flag_no_common, unless explicitly disabled. We fake common
228 using .weak, and that's not entirely accurate, so avoid it
229 unless forced. */
230 if (!global_options_set.x_flag_no_common)
231 flag_no_common = 1;
232
233 /* The patch area requires nops, which we don't have. */
234 if (function_entry_patch_area_size > 0)
235 sorry ("not generating patch area, nops not supported");
236
237 /* Assumes that it will see only hard registers. */
238 flag_var_tracking = 0;
239
240 if (nvptx_optimize < 0)
241 nvptx_optimize = optimize > 0;
242
243 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
244 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
245 declared_libfuncs_htab
246 = hash_table<declared_libfunc_hasher>::create_ggc (17);
247
248 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
249 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
250 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
251 oacc_bcast_partition = 0;
252
253 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
254 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
255 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
256
257 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
258 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
259 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
260 vector_red_partition = 0;
261
262 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
263 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
264 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
265
266 if (TARGET_GOMP)
267 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
268 }
269
270 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
271 deal with ptx idiosyncrasies. */
272
273 const char *
274 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
275 {
276 switch (mode)
277 {
278 case E_BLKmode:
279 return ".b8";
280 case E_BImode:
281 return ".pred";
282 case E_QImode:
283 if (promote)
284 return ".u32";
285 else
286 return ".u8";
287 case E_HImode:
288 return ".u16";
289 case E_SImode:
290 return ".u32";
291 case E_DImode:
292 return ".u64";
293
294 case E_SFmode:
295 return ".f32";
296 case E_DFmode:
297 return ".f64";
298
299 case E_V2SImode:
300 return ".v2.u32";
301 case E_V2DImode:
302 return ".v2.u64";
303
304 default:
305 gcc_unreachable ();
306 }
307 }
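/* For instance, nvptx_ptx_type_from_mode (SImode, false) returns ".u32",
   and QImode returns ".u8" unpromoted but ".u32" with PROMOTE set.  */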
308
309 /* Encode the PTX data area that DECL (which might not actually be a
310 _DECL) should reside in. */
311
312 static void
313 nvptx_encode_section_info (tree decl, rtx rtl, int first)
314 {
315 default_encode_section_info (decl, rtl, first);
316 if (first && MEM_P (rtl))
317 {
318 nvptx_data_area area = DATA_AREA_GENERIC;
319
320 if (TREE_CONSTANT (decl))
321 area = DATA_AREA_CONST;
322 else if (TREE_CODE (decl) == VAR_DECL)
323 {
324 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
325 {
326 area = DATA_AREA_SHARED;
327 if (DECL_INITIAL (decl))
328 error ("static initialization of variable %q+D in %<.shared%>"
329 " memory is not supported", decl);
330 }
331 else
332 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
333 }
334
335 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
336 }
337 }
338
339 /* Return the PTX name of the data area in which SYM should be
340 placed. The symbol must have already been processed by
341 nvptx_encode_section_info, or equivalent. */
342
343 static const char *
344 section_for_sym (rtx sym)
345 {
346 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
347 /* Same order as nvptx_data_area enum. */
348 static char const *const areas[] =
349 {"", ".global", ".shared", ".local", ".const", ".param"};
350
351 return areas[area];
352 }
353
354 /* Similarly for a decl. */
355
356 static const char *
357 section_for_decl (const_tree decl)
358 {
359 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
360 }
361
362 /* Check NAME for special function names and redirect them by returning a
363 replacement. This applies to malloc, free and realloc, for which we
364 want to use libgcc wrappers, and call, which triggers a bug in
365 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
366 not active in an offload compiler -- the names are all set by the
367 host-side compiler. */
368
369 static const char *
370 nvptx_name_replacement (const char *name)
371 {
372 if (strcmp (name, "call") == 0)
373 return "__nvptx_call";
374 if (strcmp (name, "malloc") == 0)
375 return "__nvptx_malloc";
376 if (strcmp (name, "free") == 0)
377 return "__nvptx_free";
378 if (strcmp (name, "realloc") == 0)
379 return "__nvptx_realloc";
380 return name;
381 }
382
383 /* If MODE should be treated as two registers of an inner mode, return
384 that inner mode. Otherwise return VOIDmode. */
385
386 static machine_mode
387 maybe_split_mode (machine_mode mode)
388 {
389 if (COMPLEX_MODE_P (mode))
390 return GET_MODE_INNER (mode);
391
392 if (mode == TImode)
393 return DImode;
394
395 return VOIDmode;
396 }
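/* For example, TImode is split into two DImode registers, and a complex
   mode such as DCmode yields its DFmode component; anything else (e.g.
   SImode) returns VOIDmode, meaning no split.  */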
397
398 /* Return true if mode should be treated as two registers. */
399
400 static bool
401 split_mode_p (machine_mode mode)
402 {
403 return maybe_split_mode (mode) != VOIDmode;
404 }
405
406 /* Output a register, subreg, or register pair (with optional
407 enclosing braces). */
408
409 static void
410 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
411 int subreg_offset = -1)
412 {
413 if (inner_mode == VOIDmode)
414 {
415 if (HARD_REGISTER_NUM_P (regno))
416 fprintf (file, "%s", reg_names[regno]);
417 else
418 fprintf (file, "%%r%d", regno);
419 }
420 else if (subreg_offset >= 0)
421 {
422 output_reg (file, regno, VOIDmode);
423 fprintf (file, "$%d", subreg_offset);
424 }
425 else
426 {
427 if (subreg_offset == -1)
428 fprintf (file, "{");
429 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
430 fprintf (file, ",");
431 output_reg (file, regno, inner_mode, 0);
432 if (subreg_offset == -1)
433 fprintf (file, "}");
434 }
435 }
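/* As a sketch: for a TImode pseudo-register 23 (inner mode DImode), the
   default SUBREG_OFFSET of -1 prints the braced pair "{%r23$8,%r23$0}",
   while -2 (used when declaring the register pair) prints the same pair
   without the enclosing braces.  */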
436
437 /* Emit forking instructions for MASK. */
438
439 static void
440 nvptx_emit_forking (unsigned mask, bool is_call)
441 {
442 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
443 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
444 if (mask)
445 {
446 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
447
448 /* Emit fork at all levels. This helps form SESE regions, as
449 it creates a block with a single successor before entering a
450 partitioned region. That is a good candidate for the end of
451 an SESE region. */
452 emit_insn (gen_nvptx_fork (op));
453 emit_insn (gen_nvptx_forked (op));
454 }
455 }
456
457 /* Emit joining instructions for MASK. */
458
459 static void
460 nvptx_emit_joining (unsigned mask, bool is_call)
461 {
462 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
463 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
464 if (mask)
465 {
466 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
467
468 /* Emit joining for all non-call pars to ensure there's a single
469 predecessor for the block the join insn ends up in. This is
470 needed for skipping entire loops. */
471 emit_insn (gen_nvptx_joining (op));
472 emit_insn (gen_nvptx_join (op));
473 }
474 }
475
476 \f
477 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
478 returned in memory. Integer and floating types supported by the
479 machine are passed in registers, everything else is passed in
480 memory. Complex types are split. */
481
482 static bool
483 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
484 {
485 if (type)
486 {
487 if (AGGREGATE_TYPE_P (type))
488 return true;
489 if (TREE_CODE (type) == VECTOR_TYPE)
490 return true;
491 }
492
493 if (!for_return && COMPLEX_MODE_P (mode))
494 /* Complex types are passed as two underlying args. */
495 mode = GET_MODE_INNER (mode);
496
497 if (GET_MODE_CLASS (mode) != MODE_INT
498 && GET_MODE_CLASS (mode) != MODE_FLOAT)
499 return true;
500
501 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
502 return true;
503
504 return false;
505 }
506
507 /* A non-memory argument of mode MODE is being passed, determine the mode it
508 should be promoted to. This is also used for determining return
509 type promotion. */
510
511 static machine_mode
512 promote_arg (machine_mode mode, bool prototyped)
513 {
514 if (!prototyped && mode == SFmode)
515 /* K&R float promotion for unprototyped functions. */
516 mode = DFmode;
517 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
518 mode = SImode;
519
520 return mode;
521 }
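/* E.g. an unprototyped SFmode argument gets the K&R promotion to DFmode,
   and QImode or HImode arguments are widened to SImode.  */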
522
523 /* A non-memory return type of MODE is being returned. Determine the
524 mode it should be promoted to. */
525
526 static machine_mode
527 promote_return (machine_mode mode)
528 {
529 return promote_arg (mode, true);
530 }
531
532 /* Implement TARGET_FUNCTION_ARG. */
533
534 static rtx
535 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
536 const_tree, bool named)
537 {
538 if (mode == VOIDmode || !named)
539 return NULL_RTX;
540
541 return gen_reg_rtx (mode);
542 }
543
544 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
545
546 static rtx
547 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
548 const_tree, bool named)
549 {
550 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
551
552 if (mode == VOIDmode || !named)
553 return NULL_RTX;
554
555 /* No need to deal with split modes here, the only case that can
556 happen is complex modes and those are dealt with by
557 TARGET_SPLIT_COMPLEX_ARG. */
558 return gen_rtx_UNSPEC (mode,
559 gen_rtvec (1, GEN_INT (cum->count)),
560 UNSPEC_ARG_REG);
561 }
562
563 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
564
565 static void
566 nvptx_function_arg_advance (cumulative_args_t cum_v,
567 machine_mode ARG_UNUSED (mode),
568 const_tree ARG_UNUSED (type),
569 bool ARG_UNUSED (named))
570 {
571 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
572
573 cum->count++;
574 }
575
576 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
577
578 For nvptx, this is only used for variadic args. The type has already
579 been promoted and/or converted to invisible reference. */
580
581 static unsigned
582 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
583 {
584 return GET_MODE_ALIGNMENT (mode);
585 }
586
587 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
588
589 For nvptx, we know how to handle functions declared as stdarg: by
590 passing an extra pointer to the unnamed arguments. However, the
591 Fortran frontend can produce a different situation, where a
592 function pointer is declared with no arguments, but the actual
593 function and calls to it take more arguments. In that case, we
594 want to ensure the call matches the definition of the function. */
595
596 static bool
597 nvptx_strict_argument_naming (cumulative_args_t cum_v)
598 {
599 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
600
601 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
602 }
603
604 /* Implement TARGET_LIBCALL_VALUE. */
605
606 static rtx
607 nvptx_libcall_value (machine_mode mode, const_rtx)
608 {
609 if (!cfun || !cfun->machine->doing_call)
610 /* Pretend to return in a hard reg for early uses before pseudos can be
611 generated. */
612 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
613
614 return gen_reg_rtx (mode);
615 }
616
617 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
618 where function FUNC returns or receives a value of data type TYPE. */
619
620 static rtx
621 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
622 bool outgoing)
623 {
624 machine_mode mode = promote_return (TYPE_MODE (type));
625
626 if (outgoing)
627 {
628 gcc_assert (cfun);
629 cfun->machine->return_mode = mode;
630 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
631 }
632
633 return nvptx_libcall_value (mode, NULL_RTX);
634 }
635
636 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
637
638 static bool
639 nvptx_function_value_regno_p (const unsigned int regno)
640 {
641 return regno == NVPTX_RETURN_REGNUM;
642 }
643
644 /* Types with a mode other than those supported by the machine are passed by
645 reference in memory. */
646
647 static bool
648 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
649 machine_mode mode, const_tree type,
650 bool ARG_UNUSED (named))
651 {
652 return pass_in_memory (mode, type, false);
653 }
654
655 /* Implement TARGET_RETURN_IN_MEMORY. */
656
657 static bool
658 nvptx_return_in_memory (const_tree type, const_tree)
659 {
660 return pass_in_memory (TYPE_MODE (type), type, true);
661 }
662
663 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
664
665 static machine_mode
666 nvptx_promote_function_mode (const_tree type, machine_mode mode,
667 int *ARG_UNUSED (punsignedp),
668 const_tree funtype, int for_return)
669 {
670 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
671 }
672
673 /* Helper for write_arg. Emit a single PTX argument of MODE, either
674 in a prototype or as a copy in a function prologue. ARGNO is the
675 index of this argument in the PTX function. FOR_REG is negative if
676 we're emitting the PTX prototype, zero if we're copying to an
677 argument register, and greater than zero if we're copying to a
678 specific hard register. */
679
680 static int
681 write_arg_mode (std::stringstream &s, int for_reg, int argno,
682 machine_mode mode)
683 {
684 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
685
686 if (for_reg < 0)
687 {
688 /* Writing PTX prototype. */
689 s << (argno ? ", " : " (");
690 s << ".param" << ptx_type << " %in_ar" << argno;
691 }
692 else
693 {
694 s << "\t.reg" << ptx_type << " ";
695 if (for_reg)
696 s << reg_names[for_reg];
697 else
698 s << "%ar" << argno;
699 s << ";\n";
700 if (argno >= 0)
701 {
702 s << "\tld.param" << ptx_type << " ";
703 if (for_reg)
704 s << reg_names[for_reg];
705 else
706 s << "%ar" << argno;
707 s << ", [%in_ar" << argno << "];\n";
708 }
709 }
710 return argno + 1;
711 }
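/* A sketch of the output for an SImode argument number 1: with
   FOR_REG < 0 this appends ", .param.u32 %in_ar1" to the prototype;
   with FOR_REG == 0 the prologue gains

	.reg.u32 %ar1;
	ld.param.u32 %ar1, [%in_ar1];  */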
712
713 /* Process function parameter TYPE to emit one or more PTX
714 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
715 is true if this is a prototyped function, rather than an old-style
716 C declaration. Returns the next argument number to use.
717
718 The promotion behavior here must match the regular GCC function
719 parameter marshalling machinery. */
720
721 static int
722 write_arg_type (std::stringstream &s, int for_reg, int argno,
723 tree type, bool prototyped)
724 {
725 machine_mode mode = TYPE_MODE (type);
726
727 if (mode == VOIDmode)
728 return argno;
729
730 if (pass_in_memory (mode, type, false))
731 mode = Pmode;
732 else
733 {
734 bool split = TREE_CODE (type) == COMPLEX_TYPE;
735
736 if (split)
737 {
738 /* Complex types are sent as two separate args. */
739 type = TREE_TYPE (type);
740 mode = TYPE_MODE (type);
741 prototyped = true;
742 }
743
744 mode = promote_arg (mode, prototyped);
745 if (split)
746 argno = write_arg_mode (s, for_reg, argno, mode);
747 }
748
749 return write_arg_mode (s, for_reg, argno, mode);
750 }
751
752 /* Emit a PTX return as a prototype or function prologue declaration
753 for MODE. */
754
755 static void
756 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
757 {
758 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
759 const char *pfx = "\t.reg";
760 const char *sfx = ";\n";
761
762 if (for_proto)
763 pfx = "(.param", sfx = "_out) ";
764
765 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
766 }
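/* For an SImode return this writes "(.param.u32 %value_out) " in a
   prototype, or a "\t.reg.u32 %value;\n" declaration in a prologue
   (assuming the NVPTX_RETURN_REGNUM register prints as "%value").  */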
767
768 /* Process a function return TYPE to emit a PTX return as a prototype
769 or function prologue declaration. Returns true if return is via an
770 additional pointer parameter. The promotion behavior here must
771 match the regular GCC function return marshalling. */
772
773 static bool
774 write_return_type (std::stringstream &s, bool for_proto, tree type)
775 {
776 machine_mode mode = TYPE_MODE (type);
777
778 if (mode == VOIDmode)
779 return false;
780
781 bool return_in_mem = pass_in_memory (mode, type, true);
782
783 if (return_in_mem)
784 {
785 if (for_proto)
786 return return_in_mem;
787
788 /* Named return values can cause us to return a pointer as well
789 as expect an argument for the return location. This is
790 optimization-level specific, so no caller can make use of
791 this data, but more importantly for us, we must ensure it
792 doesn't change the PTX prototype. */
793 mode = (machine_mode) cfun->machine->return_mode;
794
795 if (mode == VOIDmode)
796 return return_in_mem;
797
798 /* Clear return_mode to inhibit copy of retval to non-existent
799 retval parameter. */
800 cfun->machine->return_mode = VOIDmode;
801 }
802 else
803 mode = promote_return (mode);
804
805 write_return_mode (s, for_proto, mode);
806
807 return return_in_mem;
808 }
809
810 /* Look for attributes in ATTRS that would indicate we must write a function
811 as a .entry kernel rather than a .func. Return true if one is found. */
812
813 static bool
814 write_as_kernel (tree attrs)
815 {
816 return (lookup_attribute ("kernel", attrs) != NULL_TREE
817 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
818 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
819 /* For OpenMP target regions, the corresponding kernel entry is emitted from
820 write_omp_entry as a separate function. */
821 }
822
823 /* Emit a linker marker for a function decl or defn. */
824
825 static void
826 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
827 const char *name)
828 {
829 s << "\n// BEGIN";
830 if (globalize)
831 s << " GLOBAL";
832 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
833 s << name << "\n";
834 }
835
836 /* Emit a linker marker for a variable decl or defn. */
837
838 static void
839 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
840 {
841 fprintf (file, "\n// BEGIN%s VAR %s: ",
842 globalize ? " GLOBAL" : "",
843 is_defn ? "DEF" : "DECL");
844 assemble_name_raw (file, name);
845 fputs ("\n", file);
846 }
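/* The resulting markers look like "// BEGIN GLOBAL FUNCTION DEF: foo"
   or "// BEGIN VAR DECL: bar"; as noted below, they serve as helper
   comments for ld when processing the generated assembly.  */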
847
848 /* Write a .func or .kernel declaration or definition along with
849 a helper comment for use by ld. S is the stream to write to, DECL
850 the decl for the function with name NAME. For definitions, emit
851 a declaration too. */
852
853 static const char *
854 write_fn_proto (std::stringstream &s, bool is_defn,
855 const char *name, const_tree decl)
856 {
857 if (is_defn)
858 /* Emit a declaration. The PTX assembler gets upset without it. */
859 name = write_fn_proto (s, false, name, decl);
860 else
861 {
862 /* Avoid repeating the name replacement. */
863 name = nvptx_name_replacement (name);
864 if (name[0] == '*')
865 name++;
866 }
867
868 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
869
870 /* PTX declaration. */
871 if (DECL_EXTERNAL (decl))
872 s << ".extern ";
873 else if (TREE_PUBLIC (decl))
874 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
875 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
876
877 tree fntype = TREE_TYPE (decl);
878 tree result_type = TREE_TYPE (fntype);
879
880 /* atomic_compare_exchange_$n builtins have an exceptional calling
881 convention. */
882 int not_atomic_weak_arg = -1;
883 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
884 switch (DECL_FUNCTION_CODE (decl))
885 {
886 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
887 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
888 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
889 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
890 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
891 /* These atomics skip the 'weak' parm in an actual library
892 call. We must skip it in the prototype too. */
893 not_atomic_weak_arg = 3;
894 break;
895
896 default:
897 break;
898 }
899
900 /* Declare the result. */
901 bool return_in_mem = write_return_type (s, true, result_type);
902
903 s << name;
904
905 int argno = 0;
906
907 /* Emit argument list. */
908 if (return_in_mem)
909 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
910
911 /* We get:
912 NULL in TYPE_ARG_TYPES, for old-style functions
913 NULL in DECL_ARGUMENTS, for builtin functions without another
914 declaration.
915 So we have to pick the best one we have. */
916 tree args = TYPE_ARG_TYPES (fntype);
917 bool prototyped = true;
918 if (!args)
919 {
920 args = DECL_ARGUMENTS (decl);
921 prototyped = false;
922 }
923
924 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
925 {
926 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
927
928 if (not_atomic_weak_arg)
929 argno = write_arg_type (s, -1, argno, type, prototyped);
930 else
931 gcc_assert (type == boolean_type_node);
932 }
933
934 if (stdarg_p (fntype))
935 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
936
937 if (DECL_STATIC_CHAIN (decl))
938 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
939
940 if (!argno && strcmp (name, "main") == 0)
941 {
942 argno = write_arg_type (s, -1, argno, integer_type_node, true);
943 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
944 }
945
946 if (argno)
947 s << ")";
948
949 s << (is_defn ? "\n" : ";\n");
950
951 return name;
952 }
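/* Putting that together, a public prototyped function such as
   "int foo (int)" is declared along these lines (a sketch):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */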
953
954 /* Construct a function declaration from a call insn. This can be
955 necessary for two reasons - either we have an indirect call which
956 requires a .callprototype declaration, or we have a libcall
957 generated by emit_library_call for which no decl exists. */
958
959 static void
960 write_fn_proto_from_insn (std::stringstream &s, const char *name,
961 rtx result, rtx pat)
962 {
963 if (!name)
964 {
965 s << "\t.callprototype ";
966 name = "_";
967 }
968 else
969 {
970 name = nvptx_name_replacement (name);
971 write_fn_marker (s, false, true, name);
972 s << "\t.extern .func ";
973 }
974
975 if (result != NULL_RTX)
976 write_return_mode (s, true, GET_MODE (result));
977
978 s << name;
979
980 int arg_end = XVECLEN (pat, 0);
981 for (int i = 1; i < arg_end; i++)
982 {
983 /* We don't have to deal with mode splitting & promotion here,
984 as that was already done when generating the call
985 sequence. */
986 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
987
988 write_arg_mode (s, -1, i - 1, mode);
989 }
990 if (arg_end != 1)
991 s << ")";
992 s << ";\n";
993 }
994
995 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
996 table and write a ptx prototype. These are emitted at end of
997 compilation. */
998
999 static void
1000 nvptx_record_fndecl (tree decl)
1001 {
1002 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
1003 if (*slot == NULL)
1004 {
1005 *slot = decl;
1006 const char *name = get_fnname_from_decl (decl);
1007 write_fn_proto (func_decls, false, name, decl);
1008 }
1009 }
1010
1011 /* Record a libcall or unprototyped external function. CALLEE is the
1012 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1013 declaration for it. */
1014
1015 static void
1016 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
1017 {
1018 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1019 if (*slot == NULL)
1020 {
1021 *slot = callee;
1022
1023 const char *name = XSTR (callee, 0);
1024 write_fn_proto_from_insn (func_decls, name, retval, pat);
1025 }
1026 }
1027
1028 /* DECL is an external FUNCTION_DECL that we're referencing. If it
1029 is prototyped, record it now. Otherwise record it as needed at end
1030 of compilation, when we might have more information about it. */
1031
1032 void
1033 nvptx_record_needed_fndecl (tree decl)
1034 {
1035 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1036 {
1037 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1038 if (*slot == NULL)
1039 *slot = decl;
1040 }
1041 else
1042 nvptx_record_fndecl (decl);
1043 }
1044
1045 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1046 it as needed. */
1047
1048 static void
1049 nvptx_maybe_record_fnsym (rtx sym)
1050 {
1051 tree decl = SYMBOL_REF_DECL (sym);
1052
1053 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1054 nvptx_record_needed_fndecl (decl);
1055 }
1056
1057 /* Emit a local array to hold some part of a conventional stack frame
1058 and initialize REGNO to point to it. If the size is zero, it'll
1059 never be valid to dereference, so we can simply initialize to
1060 zero. */
1061
1062 static void
1063 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1064 {
1065 if (size)
1066 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1067 align, reg_names[regno], size);
1068 fprintf (file, "\t.reg.u%d %s;\n",
1069 POINTER_SIZE, reg_names[regno]);
1070 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1071 : "\tmov.u%d %s, 0;\n"),
1072 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1073 }
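/* E.g. a 16-byte frame on a 64-bit target emits, as a sketch (assuming
   the frame pointer register prints as "%frame"):

	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;  */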
1074
1075 /* Emit soft stack frame setup sequence. */
1076
1077 static void
1078 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1079 {
1080 /* Maintain 64-bit stack alignment. */
1081 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1082 size = ROUND_UP (size, keep_align);
1083 int bits = POINTER_SIZE;
1084 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1085 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1086 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1087 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1088 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1089 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1090 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1091 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1092 fprintf (file, "\t{\n");
1093 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1094 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1095 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1096 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1097 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1098 bits == 64 ? ".wide" : ".lo", bits / 8);
1099 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1100
1101 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1102 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1103
1104 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1105 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1106 bits, reg_sspprev, reg_sspslot);
1107
1108 /* Initialize %frame = %sspprev - size. */
1109 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1110 bits, reg_frame, reg_sspprev, size);
1111
1112 /* Apply alignment, if larger than 64. */
1113 if (alignment > keep_align)
1114 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1115 bits, reg_frame, reg_frame, -alignment);
1116
1117 size = crtl->outgoing_args_size;
1118 gcc_assert (size % keep_align == 0);
1119
1120 /* Initialize %stack. */
1121 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1122 bits, reg_stack, reg_frame, size);
1123
1124 if (!crtl->is_leaf)
1125 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1126 bits, reg_sspslot, reg_stack);
1127 fprintf (file, "\t}\n");
1128 cfun->machine->has_softstack = true;
1129 need_softstack_decl = true;
1130 }
1131
1132 /* Emit code to initialize the REGNO predicate register to indicate
1133 whether we are not lane zero on the NAME axis. */
1134
1135 static void
1136 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1137 {
1138 fprintf (file, "\t{\n");
1139 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1140 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1141 {
1142 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1143 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1144 }
1145 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1146 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1147 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1148 {
1149 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1150 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1151 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1152 "// vector reduction buffer\n",
1153 REGNO (cfun->machine->red_partition),
1154 vector_red_partition);
1155 }
1156 /* Verify vector_red_size. */
1157 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1158 <= vector_red_size);
1159 fprintf (file, "\t}\n");
1160 }
1161
1162 /* Emit code to initialize OpenACC worker broadcast and synchronization
1163 registers. */
1164
1165 static void
1166 nvptx_init_oacc_workers (FILE *file)
1167 {
1168 fprintf (file, "\t{\n");
1169 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1170 if (cfun->machine->bcast_partition)
1171 {
1172 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1173 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1174 }
1175 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1176 if (cfun->machine->bcast_partition)
1177 {
1178 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1179 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1180 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1181 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1182 "// vector broadcast offset\n",
1183 REGNO (cfun->machine->bcast_partition),
1184 oacc_bcast_partition);
1185 }
1186 /* Verify oacc_bcast_size. */
1187 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1188 <= oacc_bcast_size);
1189 if (cfun->machine->sync_bar)
1190 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1191 "// vector synchronization barrier\n",
1192 REGNO (cfun->machine->sync_bar));
1193 fprintf (file, "\t}\n");
1194 }
1195
1196 /* Emit code to initialize predicate and master lane index registers for
1197 -muniform-simt code generation variant. */
1198
1199 static void
1200 nvptx_init_unisimt_predicate (FILE *file)
1201 {
1202 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1203 int loc = REGNO (cfun->machine->unisimt_location);
1204 int bits = POINTER_SIZE;
1205 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1206 fprintf (file, "\t{\n");
1207 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1208 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1209 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1210 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1211 bits == 64 ? ".wide" : ".lo");
1212 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1213 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1214 if (cfun->machine->unisimt_predicate)
1215 {
1216 int master = REGNO (cfun->machine->unisimt_master);
1217 int pred = REGNO (cfun->machine->unisimt_predicate);
1218 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1219 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1220 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1221 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1222 /* Compute predicate as 'tid.x == master'. */
1223 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1224 }
1225 fprintf (file, "\t}\n");
1226 need_unisimt_decl = true;
1227 }
1228
1229 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1230
1231 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1232 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1233 {
1234 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1235 __nvptx_uni[tid.y] = 0;
1236 gomp_nvptx_main (ORIG, arg);
1237 }
1238 ORIG itself should not be emitted as a PTX .entry function. */
1239
1240 static void
1241 write_omp_entry (FILE *file, const char *name, const char *orig)
1242 {
1243 static bool gomp_nvptx_main_declared;
1244 if (!gomp_nvptx_main_declared)
1245 {
1246 gomp_nvptx_main_declared = true;
1247 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1248 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1249 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1250 }
1251 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1252 #define NTID_Y "%ntid.y"
1253 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1254 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1255 {\n\
1256 .reg.u32 %r<3>;\n\
1257 .reg.u" PS " %R<4>;\n\
1258 mov.u32 %r0, %tid.y;\n\
1259 mov.u32 %r1, " NTID_Y ";\n\
1260 mov.u32 %r2, %ctaid.x;\n\
1261 cvt.u" PS ".u32 %R1, %r0;\n\
1262 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1263 mov.u" PS " %R0, __nvptx_stacks;\n\
1264 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1265 ld.param.u" PS " %R2, [%stack];\n\
1266 ld.param.u" PS " %R3, [%sz];\n\
1267 add.u" PS " %R2, %R2, %R3;\n\
1268 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1269 st.shared.u" PS " [%R0], %R2;\n\
1270 mov.u" PS " %R0, __nvptx_uni;\n\
1271 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1272 mov.u32 %r0, 0;\n\
1273 st.shared.u32 [%R0], %r0;\n\
1274 mov.u" PS " %R0, \0;\n\
1275 ld.param.u" PS " %R1, [%arg];\n\
1276 {\n\
1277 .param.u" PS " %P<2>;\n\
1278 st.param.u" PS " [%P0], %R0;\n\
1279 st.param.u" PS " [%P1], %R1;\n\
1280 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1281 }\n\
1282 ret.uni;\n\
1283 }\n"
1284 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1285 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1286 #undef ENTRY_TEMPLATE
1287 #undef NTID_Y
1288 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1289 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1290 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1291 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1292 need_softstack_decl = need_unisimt_decl = true;
1293 }
1294
1295 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1296 function, including local var decls and copies from the arguments to
1297 local regs. */
1298
1299 void
1300 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1301 {
1302 tree fntype = TREE_TYPE (decl);
1303 tree result_type = TREE_TYPE (fntype);
1304 int argno = 0;
1305
1306 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1307 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1308 {
1309 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1310 sprintf (buf, "%s$impl", name);
1311 write_omp_entry (file, name, buf);
1312 name = buf;
1313 }
1314 /* We construct the initial part of the function into a string
1315 stream, in order to share the prototype writing code. */
1316 std::stringstream s;
1317 write_fn_proto (s, true, name, decl);
1318 s << "{\n";
1319
1320 bool return_in_mem = write_return_type (s, false, result_type);
1321 if (return_in_mem)
1322 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1323
1324 /* Declare and initialize incoming arguments. */
1325 tree args = TYPE_ARG_TYPES (fntype);
1326 bool prototyped = true;
1327 if (!args)
1328 {
1329 args = DECL_ARGUMENTS (decl);
1330 prototyped = false;
1331 }
1332
1333 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1334 {
1335 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1336
1337 argno = write_arg_type (s, 0, argno, type, prototyped);
1338 }
1339
1340 if (stdarg_p (fntype))
1341 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1342 true);
1343
1344 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1345 write_arg_type (s, STATIC_CHAIN_REGNUM,
1346 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1347 true);
1348
1349 fprintf (file, "%s", s.str().c_str());
1350
1351 /* Usually 'crtl->is_leaf' is computed during register allocator
1352 initialization (which is not done on NVPTX) or for pressure-sensitive
1353 optimizations. Initialize it here, except if already set. */
1354 if (!crtl->is_leaf)
1355 crtl->is_leaf = leaf_function_p ();
1356
1357 HOST_WIDE_INT sz = get_frame_size ();
1358 bool need_frameptr = sz || cfun->machine->has_chain;
1359 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1360 if (!TARGET_SOFT_STACK)
1361 {
1362 /* Declare a local var for outgoing varargs. */
1363 if (cfun->machine->has_varadic)
1364 init_frame (file, STACK_POINTER_REGNUM,
1365 UNITS_PER_WORD, crtl->outgoing_args_size);
1366
1367 /* Declare a local variable for the frame. Force its size to be
1368 DImode-compatible. */
1369 if (need_frameptr)
1370 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1371 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1372 }
1373 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1374 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1375 init_softstack_frame (file, alignment, sz);
1376
1377 if (cfun->machine->has_simtreg)
1378 {
1379 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1380 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1381 align = MAX (align, GET_MODE_SIZE (DImode));
1382 if (!crtl->is_leaf || cfun->calls_alloca)
1383 simtsz = HOST_WIDE_INT_M1U;
1384 if (simtsz == HOST_WIDE_INT_M1U)
1385 simtsz = nvptx_softstack_size;
1386 if (cfun->machine->has_softstack)
1387 simtsz += POINTER_SIZE / 8;
1388 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1389 if (align > GET_MODE_SIZE (DImode))
1390 simtsz += align - GET_MODE_SIZE (DImode);
1391 if (simtsz)
1392 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1393 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1394 }
1395
1396 /* Restore the vector reduction partition register, if necessary.
1397 FIXME: Find out when and why this is necessary, and fix it. */
1398 if (cfun->machine->red_partition)
1399 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1400 = cfun->machine->red_partition;
1401
1402 /* Declare the pseudos we have as ptx registers. */
1403 int maxregs = max_reg_num ();
1404 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1405 {
1406 if (regno_reg_rtx[i] != const0_rtx)
1407 {
1408 machine_mode mode = PSEUDO_REGNO_MODE (i);
1409 machine_mode split = maybe_split_mode (mode);
1410
1411 if (split_mode_p (mode))
1412 mode = split;
1413 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1414 output_reg (file, i, split, -2);
1415 fprintf (file, ";\n");
1416 }
1417 }
1418
1419 /* Emit axis predicates. */
1420 if (cfun->machine->axis_predicate[0])
1421 nvptx_init_axis_predicate (file,
1422 REGNO (cfun->machine->axis_predicate[0]), "y");
1423 if (cfun->machine->axis_predicate[1])
1424 nvptx_init_axis_predicate (file,
1425 REGNO (cfun->machine->axis_predicate[1]), "x");
1426 if (cfun->machine->unisimt_predicate
1427 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1428 nvptx_init_unisimt_predicate (file);
1429 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1430 nvptx_init_oacc_workers (file);
1431 }
1432
1433 /* Output code for switching uniform-simt state. ENTERING indicates whether
1434 we are entering or leaving a non-uniform execution region. */
1435
1436 static void
1437 nvptx_output_unisimt_switch (FILE *file, bool entering)
1438 {
1439 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1440 return;
1441 fprintf (file, "\t{\n");
1442 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1443 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1444 if (!crtl->is_leaf)
1445 {
1446 int loc = REGNO (cfun->machine->unisimt_location);
1447 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1448 }
1449 if (cfun->machine->unisimt_predicate)
1450 {
1451 int master = REGNO (cfun->machine->unisimt_master);
1452 int pred = REGNO (cfun->machine->unisimt_predicate);
1453 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1454 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1455 master, entering ? "%ustmp2" : "0");
1456 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1457 }
1458 fprintf (file, "\t}\n");
1459 }
1460
1461 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1462 ENTERING indicates whether we are entering or leaving non-uniform execution.
1463 PTR is the register pointing to allocated storage; it is assigned on
1464 entering and used to restore state on leaving. SIZE and ALIGN are used only
1465 on entering. */
1466
1467 static void
1468 nvptx_output_softstack_switch (FILE *file, bool entering,
1469 rtx ptr, rtx size, rtx align)
1470 {
1471 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1472 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1473 return;
1474 int bits = POINTER_SIZE, regno = REGNO (ptr);
1475 fprintf (file, "\t{\n");
1476 if (entering)
1477 {
1478 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1479 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1480 cfun->machine->simt_stack_size);
1481 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1482 if (CONST_INT_P (size))
1483 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1484 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1485 else
1486 output_reg (file, REGNO (size), VOIDmode);
1487 fputs (";\n", file);
1488 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1489 fprintf (file,
1490 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1491 bits, regno, regno, UINTVAL (align));
1492 }
1493 if (cfun->machine->has_softstack)
1494 {
1495 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1496 if (entering)
1497 {
1498 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1499 bits, regno, bits / 8, reg_stack);
1500 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1501 bits, reg_stack, regno, bits / 8);
1502 }
1503 else
1504 {
1505 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1506 bits, reg_stack, regno, bits / 8);
1507 }
1508 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1509 }
1510 fprintf (file, "\t}\n");
1511 }
1512
1513 /* Output code to enter non-uniform execution region. DEST is a register
1514 to hold a per-lane allocation given by SIZE and ALIGN. */
1515
1516 const char *
1517 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1518 {
1519 nvptx_output_unisimt_switch (asm_out_file, true);
1520 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1521 return "";
1522 }
1523
1524 /* Output code to leave non-uniform execution region. SRC is the register
1525 holding per-lane storage previously allocated by omp_simt_enter insn. */
1526
1527 const char *
1528 nvptx_output_simt_exit (rtx src)
1529 {
1530 nvptx_output_unisimt_switch (asm_out_file, false);
1531 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1532 return "";
1533 }
1534
1535 /* Output instruction that sets soft stack pointer in shared memory to the
1536 value in register given by SRC_REGNO. */
1537
1538 const char *
1539 nvptx_output_set_softstack (unsigned src_regno)
1540 {
1541 if (cfun->machine->has_softstack && !crtl->is_leaf)
1542 {
1543 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1544 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1545 output_reg (asm_out_file, src_regno, VOIDmode);
1546 fprintf (asm_out_file, ";\n");
1547 }
1548 return "";
1549 }
1550 /* Output a return instruction. Also copy the return value to its outgoing
1551 location. */
1552
1553 const char *
1554 nvptx_output_return (void)
1555 {
1556 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1557
1558 if (mode != VOIDmode)
1559 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1560 nvptx_ptx_type_from_mode (mode, false),
1561 reg_names[NVPTX_RETURN_REGNUM],
1562 reg_names[NVPTX_RETURN_REGNUM]);
1563
1564 return "ret;";
1565 }
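/* E.g. for an SImode return value this prints (again assuming the
   return register is "%value"):

	st.param.u32	[%value_out], %value;
	ret;  */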
1566
1567 /* Terminate a function by writing a closing brace to FILE. */
1568
1569 void
1570 nvptx_function_end (FILE *file)
1571 {
1572 fprintf (file, "}\n");
1573 }
1574 \f
1575 /* Decide whether we can make a sibling call to a function. For ptx, we
1576 can't. */
1577
1578 static bool
1579 nvptx_function_ok_for_sibcall (tree, tree)
1580 {
1581 return false;
1582 }
1583
1584 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1585
1586 static rtx
1587 nvptx_get_drap_rtx (void)
1588 {
1589 if (TARGET_SOFT_STACK && stack_realign_drap)
1590 return arg_pointer_rtx;
1591 return NULL_RTX;
1592 }
1593
1594 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1595 argument to the next call. */
1596
1597 static void
1598 nvptx_call_args (rtx arg, tree fntype)
1599 {
1600 if (!cfun->machine->doing_call)
1601 {
1602 cfun->machine->doing_call = true;
1603 cfun->machine->is_varadic = false;
1604 cfun->machine->num_args = 0;
1605
1606 if (fntype && stdarg_p (fntype))
1607 {
1608 cfun->machine->is_varadic = true;
1609 cfun->machine->has_varadic = true;
1610 cfun->machine->num_args++;
1611 }
1612 }
1613
1614 if (REG_P (arg) && arg != pc_rtx)
1615 {
1616 cfun->machine->num_args++;
1617 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1618 cfun->machine->call_args);
1619 }
1620 }
1621
1622 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1623 information we recorded. */
1624
1625 static void
1626 nvptx_end_call_args (void)
1627 {
1628 cfun->machine->doing_call = false;
1629 free_EXPR_LIST_list (&cfun->machine->call_args);
1630 }
1631
1632 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1633 track of whether calls involving static chains or varargs were seen
1634 in the current function.
1635 For libcalls, maintain a hash table of decls we have seen, and
1636 record a function decl for later when encountering a new one. */
1637
1638 void
1639 nvptx_expand_call (rtx retval, rtx address)
1640 {
1641 rtx callee = XEXP (address, 0);
1642 rtx varargs = NULL_RTX;
1643 unsigned parallel = 0;
1644
1645 if (!call_insn_operand (callee, Pmode))
1646 {
1647 callee = force_reg (Pmode, callee);
1648 address = change_address (address, QImode, callee);
1649 }
1650
1651 if (GET_CODE (callee) == SYMBOL_REF)
1652 {
1653 tree decl = SYMBOL_REF_DECL (callee);
1654 if (decl != NULL_TREE)
1655 {
1656 if (DECL_STATIC_CHAIN (decl))
1657 cfun->machine->has_chain = true;
1658
1659 tree attr = oacc_get_fn_attrib (decl);
1660 if (attr)
1661 {
1662 tree dims = TREE_VALUE (attr);
1663
1664 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1665 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1666 {
1667 if (TREE_PURPOSE (dims)
1668 && !integer_zerop (TREE_PURPOSE (dims)))
1669 break;
1670 /* Not on this axis. */
1671 parallel ^= GOMP_DIM_MASK (ix);
1672 dims = TREE_CHAIN (dims);
1673 }
1674 }
1675 }
1676 }
1677
1678 unsigned nargs = cfun->machine->num_args;
1679 if (cfun->machine->is_varadic)
1680 {
1681 varargs = gen_reg_rtx (Pmode);
1682 emit_move_insn (varargs, stack_pointer_rtx);
1683 }
1684
1685 rtvec vec = rtvec_alloc (nargs + 1);
1686 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1687 int vec_pos = 0;
1688
1689 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1690 rtx tmp_retval = retval;
1691 if (retval)
1692 {
1693 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1694 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1695 call = gen_rtx_SET (tmp_retval, call);
1696 }
1697 XVECEXP (pat, 0, vec_pos++) = call;
1698
1699 /* Construct the call insn, including a USE for each argument pseudo
1700 register. These will be used when printing the insn. */
1701 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1702 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1703
1704 if (varargs)
1705 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1706
1707 gcc_assert (vec_pos == XVECLEN (pat, 0));
1708
1709 nvptx_emit_forking (parallel, true);
1710 emit_call_insn (pat);
1711 nvptx_emit_joining (parallel, true);
1712
1713 if (tmp_retval != retval)
1714 emit_move_insn (retval, tmp_retval);
1715 }
1716
1717 /* Emit a comparison COMPARE, and return the new test to be used in the
1718 jump. */
1719
1720 rtx
1721 nvptx_expand_compare (rtx compare)
1722 {
1723 rtx pred = gen_reg_rtx (BImode);
1724 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1725 XEXP (compare, 0), XEXP (compare, 1));
1726 emit_insn (gen_rtx_SET (pred, cmp));
1727 return gen_rtx_NE (BImode, pred, const0_rtx);
1728 }
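/* That is, the comparison itself lands in a fresh BImode predicate
   register, and the caller's jump tests the returned (ne pred 0).  */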
1729
1730 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1731
1732 void
1733 nvptx_expand_oacc_fork (unsigned mode)
1734 {
1735 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1736 }
1737
1738 void
1739 nvptx_expand_oacc_join (unsigned mode)
1740 {
1741 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1742 }
1743
1744 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1745 objects. */
1746
1747 static rtx
1748 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1749 {
1750 rtx res;
1751
1752 switch (GET_MODE (src))
1753 {
1754 case E_DImode:
1755 res = gen_unpackdisi2 (dst0, dst1, src);
1756 break;
1757 case E_DFmode:
1758 res = gen_unpackdfsi2 (dst0, dst1, src);
1759 break;
1760 default: gcc_unreachable ();
1761 }
1762 return res;
1763 }
1764
1765 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1766 object. */
1767
1768 static rtx
1769 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1770 {
1771 rtx res;
1772
1773 switch (GET_MODE (dst))
1774 {
1775 case E_DImode:
1776 res = gen_packsidi2 (dst, src0, src1);
1777 break;
1778 case E_DFmode:
1779 res = gen_packsidf2 (dst, src0, src1);
1780 break;
1781 default: gcc_unreachable ();
1782 }
1783 return res;
1784 }
1785
1786 /* Generate an instruction or sequence to perform a warp shuffle of
1787 SRC into DST, using shuffle kind KIND and lane operand IDX. */
1788
1789 rtx
1790 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1791 {
1792 rtx res;
1793
1794 switch (GET_MODE (dst))
1795 {
1796 case E_SImode:
1797 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1798 break;
1799 case E_SFmode:
1800 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1801 break;
1802 case E_DImode:
1803 case E_DFmode:
1804 {
1805 rtx tmp0 = gen_reg_rtx (SImode);
1806 rtx tmp1 = gen_reg_rtx (SImode);
1807
1808 start_sequence ();
1809 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1810 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1811 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1812 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1813 res = get_insns ();
1814 end_sequence ();
1815 }
1816 break;
1817 case E_BImode:
1818 {
1819 rtx tmp = gen_reg_rtx (SImode);
1820
1821 start_sequence ();
1822 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1823 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1824 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1825 res = get_insns ();
1826 end_sequence ();
1827 }
1828 break;
1829 case E_QImode:
1830 case E_HImode:
1831 {
1832 rtx tmp = gen_reg_rtx (SImode);
1833
1834 start_sequence ();
1835 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1836 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1837 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1838 tmp)));
1839 res = get_insns ();
1840 end_sequence ();
1841 }
1842 break;
1843
1844 default:
1845 gcc_unreachable ();
1846 }
1847 return res;
1848 }
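/* Editorial sketch: a DImode or DFmode shuffle goes through the
   unpack/shuffle/pack path above, so it comes out roughly as

       mov.b64 {%t0,%t1}, %src;
       shfl.idx.b32 %t0, %t0, %idx, 0x1f;
       shfl.idx.b32 %t1, %t1, %idx, 0x1f;
       mov.b64 %dst, {%t0,%t1};

   (pre-sm_70 shfl spelling; register names are assumptions).  */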
1849
1850 /* Generate an instruction or sequence to broadcast register REG
1851 across the vectors of a single warp. */
1852
1853 static rtx
1854 nvptx_gen_warp_bcast (rtx reg)
1855 {
1856 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1857 }
1858
1859 /* Structure used when generating a worker-level spill or fill. */
1860
1861 struct broadcast_data_t
1862 {
1863 rtx base; /* Register holding base addr of buffer. */
1864 rtx ptr; /* Iteration var, if needed. */
1865 unsigned offset; /* Offset into worker buffer. */
1866 };
1867
1868 /* Direction of the spill/fill and looping setup/teardown indicator. */
1869
1870 enum propagate_mask
1871 {
1872 PM_read = 1 << 0,
1873 PM_write = 1 << 1,
1874 PM_loop_begin = 1 << 2,
1875 PM_loop_end = 1 << 3,
1876
1877 PM_read_write = PM_read | PM_write
1878 };
1879
1880 /* Generate instruction(s) to spill or fill register REG to/from the
1881 worker broadcast array. PM indicates what is to be done, REP
1882 how many loop iterations will be executed (0 for not a loop). */
1883
1884 static rtx
1885 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1886 broadcast_data_t *data, bool vector)
1887 {
1888 rtx res;
1889 machine_mode mode = GET_MODE (reg);
1890
1891 switch (mode)
1892 {
1893 case E_BImode:
1894 {
1895 rtx tmp = gen_reg_rtx (SImode);
1896
1897 start_sequence ();
1898 if (pm & PM_read)
1899 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1900 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1901 if (pm & PM_write)
1902 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1903 res = get_insns ();
1904 end_sequence ();
1905 }
1906 break;
1907
1908 default:
1909 {
1910 rtx addr = data->ptr;
1911
1912 if (!addr)
1913 {
1914 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1915
1916 oacc_bcast_align = MAX (oacc_bcast_align, align);
1917 data->offset = ROUND_UP (data->offset, align);
1918 addr = data->base;
1919 gcc_assert (data->base != NULL);
1920 if (data->offset)
1921 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1922 }
1923
1924 addr = gen_rtx_MEM (mode, addr);
1925 if (pm == PM_read)
1926 res = gen_rtx_SET (addr, reg);
1927 else if (pm == PM_write)
1928 res = gen_rtx_SET (reg, addr);
1929 else
1930 gcc_unreachable ();
1931
1932 if (data->ptr)
1933 {
1934 /* We're using a ptr, increment it. */
1935 start_sequence ();
1936
1937 emit_insn (res);
1938 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1939 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1940 res = get_insns ();
1941 end_sequence ();
1942 }
1943 else
1944 rep = 1;
1945 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1946 }
1947 break;
1948 }
1949 return res;
1950 }
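/* Editorial sketch: for a 4-byte register the spill/fill generated
   above is a plain store to, or load from, the shared broadcast
   buffer, schematically

       st.shared.u32 [%bcastbuf+OFF], %r;   // PM_read: source lane writes
       ld.shared.u32 %r, [%bcastbuf+OFF];   // PM_write: other lanes read

   where the buffer name and OFF are illustrative; the separating
   barrier is emitted elsewhere.  */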
1951 \f
1952 /* Returns true if X is a valid address for use in a memory reference. */
1953
1954 static bool
1955 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1956 {
1957 enum rtx_code code = GET_CODE (x);
1958
1959 switch (code)
1960 {
1961 case REG:
1962 return true;
1963
1964 case PLUS:
1965 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1966 return true;
1967 return false;
1968
1969 case CONST:
1970 case SYMBOL_REF:
1971 case LABEL_REF:
1972 return true;
1973
1974 default:
1975 return false;
1976 }
1977 }
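/* Editorial note: the accepted forms match the PTX addressing modes
   [%r], [%r+imm] and [symbol] -- register, register plus constant
   offset, and symbolic address.  */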
1978 \f
1979 /* Machinery to output constant initializers. When beginning an
1980 initializer, we decide on a fragment size (which is visible in ptx
1981 in the type used), and then all initializer data is buffered until
1982 a fragment is filled and ready to be written out. */
1983
1984 static struct
1985 {
1986 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1987 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1988 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1989 out. */
1990 unsigned size; /* Fragment size to accumulate. */
1991 unsigned offset; /* Offset within current fragment. */
1992 bool started; /* Whether we've output any initializer. */
1993 } init_frag;
1994
1995 /* The current fragment is full, write it out. SYM may provide a
1996 symbolic reference we should output, in which case the fragment
1997 value is the addend. */
1998
1999 static void
2000 output_init_frag (rtx sym)
2001 {
2002 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2003 unsigned HOST_WIDE_INT val = init_frag.val;
2004
2005 init_frag.started = true;
2006 init_frag.val = 0;
2007 init_frag.offset = 0;
2008 init_frag.remaining--;
2009
2010 if (sym)
2011 {
2012 bool function = (SYMBOL_REF_DECL (sym)
2013 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2014 if (!function)
2015 fprintf (asm_out_file, "generic(");
2016 output_address (VOIDmode, sym);
2017 if (!function)
2018 fprintf (asm_out_file, ")");
2019 if (val)
2020 fprintf (asm_out_file, " + ");
2021 }
2022
2023 if (!sym || val)
2024 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2025 }
2026
2027 /* Add value VAL of size SIZE to the data we're emitting, and keep
2028 writing out chunks as they fill up. */
2029
2030 static void
2031 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2032 {
2033 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2034
2035 for (unsigned part = 0; size; size -= part)
2036 {
2037 val >>= part * BITS_PER_UNIT;
2038 part = init_frag.size - init_frag.offset;
2039 part = MIN (part, size);
2040
2041 unsigned HOST_WIDE_INT partial
2042 = val << (init_frag.offset * BITS_PER_UNIT);
2043 init_frag.val |= partial & init_frag.mask;
2044 init_frag.offset += part;
2045
2046 if (init_frag.offset == init_frag.size)
2047 output_init_frag (NULL);
2048 }
2049 }
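/* Worked example (editorial): with a fragment size of 4, emitting the
   value 1 with size 2 and then the value 2 with size 2 accumulates

       val = 0x0001 | (0x0002 << 16) = 0x00020001

   and, once offset reaches the fragment size, flushes it as the
   decimal initializer entry 131073.  */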
2050
2051 /* Target hook for assembling integer object X of size SIZE. */
2052
2053 static bool
2054 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2055 {
2056 HOST_WIDE_INT val = 0;
2057
2058 switch (GET_CODE (x))
2059 {
2060 default:
2061 /* Let the generic machinery figure it out, usually for a
2062 CONST_WIDE_INT. */
2063 return false;
2064
2065 case CONST_INT:
2066 nvptx_assemble_value (INTVAL (x), size);
2067 break;
2068
2069 case CONST:
2070 x = XEXP (x, 0);
2071 gcc_assert (GET_CODE (x) == PLUS);
2072 val = INTVAL (XEXP (x, 1));
2073 x = XEXP (x, 0);
2074 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2075 /* FALLTHROUGH */
2076
2077 case SYMBOL_REF:
2078 gcc_assert (size == init_frag.size);
2079 if (init_frag.offset)
2080 sorry ("cannot emit unaligned pointers in ptx assembly");
2081
2082 nvptx_maybe_record_fnsym (x);
2083 init_frag.val = val;
2084 output_init_frag (x);
2085 break;
2086 }
2087
2088 return true;
2089 }
2090
2091 /* Output SIZE zero bytes. We ignore the FILE argument since the
2092 functions we're calling to perform the output just use
2093 asm_out_file. */
2094
2095 void
2096 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2097 {
2098 /* Finish the current fragment, if it's started. */
2099 if (init_frag.offset)
2100 {
2101 unsigned part = init_frag.size - init_frag.offset;
2102 part = MIN (part, (unsigned)size);
2103 size -= part;
2104 nvptx_assemble_value (0, part);
2105 }
2106
2107 /* If this skip doesn't terminate the initializer, write as many
2108 remaining pieces as possible directly. */
2109 if (size < init_frag.remaining * init_frag.size)
2110 {
2111 while (size >= init_frag.size)
2112 {
2113 size -= init_frag.size;
2114 output_init_frag (NULL_RTX);
2115 }
2116 if (size)
2117 nvptx_assemble_value (0, size);
2118 }
2119 }
2120
2121 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2122 ignore the FILE arg. */
2123
2124 void
2125 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2126 {
2127 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2128 nvptx_assemble_value (str[i], 1);
2129 }
2130
2131 /* Return true if TYPE is a record type whose last field is an array
2132 without a given dimension, i.e. a flexible array member. */
2133
2134 static bool
2135 flexible_array_member_type_p (const_tree type)
2136 {
2137 if (TREE_CODE (type) != RECORD_TYPE)
2138 return false;
2139
2140 const_tree last_field = NULL_TREE;
2141 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2142 last_field = f;
2143
2144 if (!last_field)
2145 return false;
2146
2147 const_tree last_field_type = TREE_TYPE (last_field);
2148 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2149 return false;
2150
2151 return (! TYPE_DOMAIN (last_field_type)
2152 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2153 }
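/* Editorial example: the predicate above is true for

       struct S { int len; char data[]; };

   whose trailing array has no bounded domain, and false for
   struct T { int len; char data[4]; };.  */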
2154
2155 /* Emit a PTX variable decl and prepare for emission of its
2156 initializer. NAME is the symbol name and SECTION the PTX data
2157 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2158 The caller has already emitted any indentation and linkage
2159 specifier. It is responsible for any initializer, terminating ';'
2160 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly,
2161 the opposite way round from how PTX wants them! */
2162
2163 static void
2164 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2165 const_tree type, HOST_WIDE_INT size, unsigned align,
2166 bool undefined = false)
2167 {
2168 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2169 && (TYPE_DOMAIN (type) == NULL_TREE);
2170
2171 if (undefined && flexible_array_member_type_p (type))
2172 {
2173 size = 0;
2174 atype = true;
2175 }
2176
2177 while (TREE_CODE (type) == ARRAY_TYPE)
2178 type = TREE_TYPE (type);
2179
2180 if (TREE_CODE (type) == VECTOR_TYPE
2181 || TREE_CODE (type) == COMPLEX_TYPE)
2182 /* Neither vector nor complex types can contain the other. */
2183 type = TREE_TYPE (type);
2184
2185 unsigned elt_size = int_size_in_bytes (type);
2186
2187 /* Largest mode we're prepared to accept. For BLKmode types we
2188 don't know if it'll contain pointer constants, so have to choose
2189 pointer size, otherwise we can choose DImode. */
2190 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2191
2192 elt_size |= GET_MODE_SIZE (elt_mode);
2193 elt_size &= -elt_size; /* Extract LSB set. */
2194
2195 init_frag.size = elt_size;
2196 /* Avoid undefined shift behavior by using '2'. */
2197 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2198 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2199 init_frag.val = 0;
2200 init_frag.offset = 0;
2201 init_frag.started = false;
2202 /* Size might not be a multiple of elt size, if there's an
2203 initialized trailing struct array with smaller type than
2204 elt_size. */
2205 init_frag.remaining = (size + elt_size - 1) / elt_size;
2206
2207 fprintf (file, "%s .align %d .u%d ",
2208 section, align / BITS_PER_UNIT,
2209 elt_size * BITS_PER_UNIT);
2210 assemble_name (file, name);
2211
2212 if (size)
2213 /* We make everything an array, to simplify any initialization
2214 emission. */
2215 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2216 else if (atype)
2217 fprintf (file, "[]");
2218 }
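/* Editorial sketch: for a 4-byte-aligned  int x[4]  in the global data
   area, the output above is roughly

       .global .align 4 .u32 x[4]

   with the caller supplying the linkage specifier, any initializer and
   the final ';'. The .u32 element size follows from the LSB
   computation on elt_size above.  */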
2219
2220 /* Called when the initializer for a decl has been completely output through
2221 combinations of the three functions above. */
2222
2223 static void
2224 nvptx_assemble_decl_end (void)
2225 {
2226 if (init_frag.offset)
2227 /* This can happen with a packed struct with trailing array member. */
2228 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2229 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2230 }
2231
2232 /* Output an uninitialized common or file-scope variable. */
2233
2234 void
2235 nvptx_output_aligned_decl (FILE *file, const char *name,
2236 const_tree decl, HOST_WIDE_INT size, unsigned align)
2237 {
2238 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2239
2240 /* If this is public, it is common. The nearest thing we have to
2241 common is weak. */
2242 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2243
2244 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2245 TREE_TYPE (decl), size, align);
2246 nvptx_assemble_decl_end ();
2247 }
2248
2249 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2250 writing a constant variable EXP with NAME and SIZE and its
2251 initializer to FILE. */
2252
2253 static void
2254 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2255 const_tree exp, HOST_WIDE_INT obj_size)
2256 {
2257 write_var_marker (file, true, false, name);
2258
2259 fprintf (file, "\t");
2260
2261 tree type = TREE_TYPE (exp);
2262 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2263 TYPE_ALIGN (type));
2264 }
2265
2266 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2267 a variable DECL with NAME to FILE. */
2268
2269 void
2270 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2271 {
2272 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2273
2274 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2275 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2276
2277 tree type = TREE_TYPE (decl);
2278 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2279 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2280 type, obj_size, DECL_ALIGN (decl));
2281 }
2282
2283 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2284
2285 static void
2286 nvptx_globalize_label (FILE *, const char *)
2287 {
2288 }
2289
2290 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2291 declaration only for variable DECL with NAME to FILE. */
2292
2293 static void
2294 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2295 {
2296 /* The middle end can place constant pool decls into the varpool as
2297 undefined. Until that is fixed, catch the problem here. */
2298 if (DECL_IN_CONSTANT_POOL (decl))
2299 return;
2300
2301 /* We support weak definitions, and hence have the right
2302 ASM_WEAKEN_DECL definition; weak declarations, however, are not supported, so diagnose them here. */
2303 if (DECL_WEAK (decl))
2304 error_at (DECL_SOURCE_LOCATION (decl),
2305 "PTX does not support weak declarations"
2306 " (only weak definitions)");
2307 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2308
2309 fprintf (file, "\t.extern ");
2310 tree size = DECL_SIZE_UNIT (decl);
2311 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2312 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2313 DECL_ALIGN (decl), true);
2314 nvptx_assemble_decl_end ();
2315 }
2316
2317 /* Output a pattern for a move instruction. */
2318
2319 const char *
2320 nvptx_output_mov_insn (rtx dst, rtx src)
2321 {
2322 machine_mode dst_mode = GET_MODE (dst);
2323 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2324 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2325 machine_mode src_inner = (GET_CODE (src) == SUBREG
2326 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2327
2328 rtx sym = src;
2329 if (GET_CODE (sym) == CONST)
2330 sym = XEXP (XEXP (sym, 0), 0);
2331 if (SYMBOL_REF_P (sym))
2332 {
2333 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2334 return "%.\tcvta%D1%t0\t%0, %1;";
2335 nvptx_maybe_record_fnsym (sym);
2336 }
2337
2338 if (src_inner == dst_inner)
2339 return "%.\tmov%t0\t%0, %1;";
2340
2341 if (CONSTANT_P (src))
2342 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2343 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2344 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2345
2346 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2347 {
2348 if (GET_MODE_BITSIZE (dst_mode) == 128
2349 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2350 {
2351 /* mov.b128 is not supported. */
2352 if (dst_inner == V2DImode && src_inner == TImode)
2353 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2354 else if (dst_inner == TImode && src_inner == V2DImode)
2355 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2356
2357 gcc_unreachable ();
2358 }
2359 return "%.\tmov.b%T0\t%0, %1;";
2360 }
2361
2362 return "%.\tcvt%t0%t1\t%0, %1;";
2363 }
2364
2365 static void nvptx_print_operand (FILE *, rtx, int);
2366
2367 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2368 involves writing .param declarations and in/out copies into them. For
2369 indirect calls, also write the .callprototype. */
2370
2371 const char *
2372 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2373 {
2374 char buf[16];
2375 static int labelno;
2376 bool needs_tgt = register_operand (callee, Pmode);
2377 rtx pat = PATTERN (insn);
2378 if (GET_CODE (pat) == COND_EXEC)
2379 pat = COND_EXEC_CODE (pat);
2380 int arg_end = XVECLEN (pat, 0);
2381 tree decl = NULL_TREE;
2382
2383 fprintf (asm_out_file, "\t{\n");
2384 if (result != NULL)
2385 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2386 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2387 reg_names[NVPTX_RETURN_REGNUM]);
2388
2389 /* Ensure we have a ptx declaration in the output if necessary. */
2390 if (GET_CODE (callee) == SYMBOL_REF)
2391 {
2392 decl = SYMBOL_REF_DECL (callee);
2393 if (!decl
2394 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2395 nvptx_record_libfunc (callee, result, pat);
2396 else if (DECL_EXTERNAL (decl))
2397 nvptx_record_fndecl (decl);
2398 }
2399
2400 if (needs_tgt)
2401 {
2402 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2403 labelno++;
2404 ASM_OUTPUT_LABEL (asm_out_file, buf);
2405 std::stringstream s;
2406 write_fn_proto_from_insn (s, NULL, result, pat);
2407 fputs (s.str().c_str(), asm_out_file);
2408 }
2409
2410 for (int argno = 1; argno < arg_end; argno++)
2411 {
2412 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2413 machine_mode mode = GET_MODE (t);
2414 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2415
2416 /* Mode splitting has already been done. */
2417 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2418 "\t\tst.param%s [%%out_arg%d], ",
2419 ptx_type, argno, ptx_type, argno);
2420 output_reg (asm_out_file, REGNO (t), VOIDmode);
2421 fprintf (asm_out_file, ";\n");
2422 }
2423
2424 /* The '.' stands for the call's predicate, if any. */
2425 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2426 fprintf (asm_out_file, "\t\tcall ");
2427 if (result != NULL_RTX)
2428 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2429
2430 if (decl)
2431 {
2432 const char *name = get_fnname_from_decl (decl);
2433 name = nvptx_name_replacement (name);
2434 assemble_name (asm_out_file, name);
2435 }
2436 else
2437 output_address (VOIDmode, callee);
2438
2439 const char *open = "(";
2440 for (int argno = 1; argno < arg_end; argno++)
2441 {
2442 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2443 open = "";
2444 }
2445 if (decl && DECL_STATIC_CHAIN (decl))
2446 {
2447 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2448 open = "";
2449 }
2450 if (!open[0])
2451 fprintf (asm_out_file, ")");
2452
2453 if (needs_tgt)
2454 {
2455 fprintf (asm_out_file, ", ");
2456 assemble_name (asm_out_file, buf);
2457 }
2458 fprintf (asm_out_file, ";\n");
2459
2460 if (find_reg_note (insn, REG_NORETURN, NULL))
2461 {
2462 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2463 the flow-control barrier they imply. It can segfault if it
2464 encounters what looks like an inescapable loop. Emit a trailing
2465 trap and exit, which it does grok. */
2466 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2467 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2468 }
2469
2470 if (result)
2471 {
2472 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2473
2474 if (!rval[0])
2475 /* We must escape the '%' that starts RETURN_REGNUM. */
2476 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2477 reg_names[NVPTX_RETURN_REGNUM]);
2478 return rval;
2479 }
2480
2481 return "}";
2482 }
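/* Editorial sketch of the whole emitted sequence for a direct call
   such as  int r = f (a);  -- register names and the .u32 types are
   illustrative assumptions:

 	{
 		.param.u32 %value_in;
 		.param.u32 %out_arg1;
 		st.param.u32 [%out_arg1], %r23;
 		call (%value_in), f, (%out_arg1);
 		ld.param.u32 %r24, [%value_in];
 	}
*/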
2483
2484 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2485
2486 static bool
2487 nvptx_print_operand_punct_valid_p (unsigned char c)
2488 {
2489 return c == '.' || c == '#';
2490 }
2491
2492 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2493
2494 static void
2495 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2496 {
2497 rtx off;
2498 if (GET_CODE (x) == CONST)
2499 x = XEXP (x, 0);
2500 switch (GET_CODE (x))
2501 {
2502 case PLUS:
2503 off = XEXP (x, 1);
2504 output_address (VOIDmode, XEXP (x, 0));
2505 fprintf (file, "+");
2506 output_address (VOIDmode, off);
2507 break;
2508
2509 case SYMBOL_REF:
2510 case LABEL_REF:
2511 output_addr_const (file, x);
2512 break;
2513
2514 default:
2515 gcc_assert (GET_CODE (x) != MEM);
2516 nvptx_print_operand (file, x, 0);
2517 break;
2518 }
2519 }
2520
2521 /* Write assembly language output for the address ADDR to FILE. */
2522
2523 static void
2524 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2525 {
2526 nvptx_print_address_operand (file, addr, mode);
2527 }
2528
2529 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2530
2531 Meaning of CODE:
2532 . -- print the predicate for the instruction or an empty string for an
2533 unconditional one.
2534 # -- print a rounding mode for the instruction
2535 
2536 A -- print a data area for a MEM
2537 c -- print an opcode suffix for a comparison operator, including a type code
2538 D -- print a data area for a MEM operand
H -- print the high part of a register pair holding a split double-word mode
j -- print a '@' branch-predicate prefix before the operand
J -- print a '@!' negated branch-predicate prefix before the operand
L -- print the low part of a register pair holding a split double-word mode
2539 S -- print a shuffle kind specified by CONST_INT
2540 t -- print a type opcode suffix, promoting QImode to 32 bits
2541 T -- print a type size in bits
2542 u -- print a type opcode suffix without promotions. */
2543
2544 static void
2545 nvptx_print_operand (FILE *file, rtx x, int code)
2546 {
2547 if (code == '.')
2548 {
2549 x = current_insn_predicate;
2550 if (x)
2551 {
2552 fputs ("@", file);
2553 if (GET_CODE (x) == EQ)
2554 fputs ("!", file);
2555 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2556 }
2557 return;
2558 }
2559 else if (code == '#')
2560 {
2561 fputs (".rn", file);
2562 return;
2563 }
2564
2565 enum rtx_code x_code = GET_CODE (x);
2566 machine_mode mode = GET_MODE (x);
2567
2568 switch (code)
2569 {
2570 case 'A':
2571 x = XEXP (x, 0);
2572 /* FALLTHROUGH. */
2573
2574 case 'D':
2575 if (GET_CODE (x) == CONST)
2576 x = XEXP (x, 0);
2577 if (GET_CODE (x) == PLUS)
2578 x = XEXP (x, 0);
2579
2580 if (GET_CODE (x) == SYMBOL_REF)
2581 fputs (section_for_sym (x), file);
2582 break;
2583
2584 case 't':
2585 case 'u':
2586 if (x_code == SUBREG)
2587 {
2588 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2589 if (VECTOR_MODE_P (inner_mode)
2590 && (GET_MODE_SIZE (mode)
2591 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2592 mode = GET_MODE_INNER (inner_mode);
2593 else if (split_mode_p (inner_mode))
2594 mode = maybe_split_mode (inner_mode);
2595 else
2596 mode = inner_mode;
2597 }
2598 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2599 break;
2600
2601 case 'H':
2602 case 'L':
2603 {
2604 rtx inner_x = SUBREG_REG (x);
2605 machine_mode inner_mode = GET_MODE (inner_x);
2606 machine_mode split = maybe_split_mode (inner_mode);
2607
2608 output_reg (file, REGNO (inner_x), split,
2609 (code == 'H'
2610 ? GET_MODE_SIZE (inner_mode) / 2
2611 : 0));
2612 }
2613 break;
2614
2615 case 'S':
2616 {
2617 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2618 /* Same order as nvptx_shuffle_kind. */
2619 static const char *const kinds[] =
2620 {".up", ".down", ".bfly", ".idx"};
2621 fputs (kinds[kind], file);
2622 }
2623 break;
2624
2625 case 'T':
2626 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2627 break;
2628
2629 case 'j':
2630 fprintf (file, "@");
2631 goto common;
2632
2633 case 'J':
2634 fprintf (file, "@!");
2635 goto common;
2636
2637 case 'c':
2638 mode = GET_MODE (XEXP (x, 0));
2639 switch (x_code)
2640 {
2641 case EQ:
2642 fputs (".eq", file);
2643 break;
2644 case NE:
2645 if (FLOAT_MODE_P (mode))
2646 fputs (".neu", file);
2647 else
2648 fputs (".ne", file);
2649 break;
2650 case LE:
2651 case LEU:
2652 fputs (".le", file);
2653 break;
2654 case GE:
2655 case GEU:
2656 fputs (".ge", file);
2657 break;
2658 case LT:
2659 case LTU:
2660 fputs (".lt", file);
2661 break;
2662 case GT:
2663 case GTU:
2664 fputs (".gt", file);
2665 break;
2666 case LTGT:
2667 fputs (".ne", file);
2668 break;
2669 case UNEQ:
2670 fputs (".equ", file);
2671 break;
2672 case UNLE:
2673 fputs (".leu", file);
2674 break;
2675 case UNGE:
2676 fputs (".geu", file);
2677 break;
2678 case UNLT:
2679 fputs (".ltu", file);
2680 break;
2681 case UNGT:
2682 fputs (".gtu", file);
2683 break;
2684 case UNORDERED:
2685 fputs (".nan", file);
2686 break;
2687 case ORDERED:
2688 fputs (".num", file);
2689 break;
2690 default:
2691 gcc_unreachable ();
2692 }
2693 if (FLOAT_MODE_P (mode)
2694 || x_code == EQ || x_code == NE
2695 || x_code == GEU || x_code == GTU
2696 || x_code == LEU || x_code == LTU)
2697 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2698 else
2699 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2700 break;
2701 default:
2702 common:
2703 switch (x_code)
2704 {
2705 case SUBREG:
2706 {
2707 rtx inner_x = SUBREG_REG (x);
2708 machine_mode inner_mode = GET_MODE (inner_x);
2709 machine_mode split = maybe_split_mode (inner_mode);
2710
2711 if (VECTOR_MODE_P (inner_mode)
2712 && (GET_MODE_SIZE (mode)
2713 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2714 {
2715 output_reg (file, REGNO (inner_x), VOIDmode);
2716 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2717 }
2718 else if (split_mode_p (inner_mode)
2719 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2720 output_reg (file, REGNO (inner_x), split);
2721 else
2722 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2723 }
2724 break;
2725
2726 case REG:
2727 output_reg (file, REGNO (x), maybe_split_mode (mode));
2728 break;
2729
2730 case MEM:
2731 fputc ('[', file);
2732 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2733 fputc (']', file);
2734 break;
2735
2736 case CONST_INT:
2737 output_addr_const (file, x);
2738 break;
2739
2740 case CONST:
2741 case SYMBOL_REF:
2742 case LABEL_REF:
2743 /* We could use output_addr_const, but that can print things like
2744 "x-8", which breaks ptxas. Need to ensure it is output as
2745 "x+-8". */
2746 nvptx_print_address_operand (file, x, VOIDmode);
2747 break;
2748
2749 case CONST_DOUBLE:
2750 long vals[2];
2751 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2752 vals[0] &= 0xffffffff;
2753 vals[1] &= 0xffffffff;
2754 if (mode == SFmode)
2755 fprintf (file, "0f%08lx", vals[0]);
2756 else
2757 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2758 break;
2759
2760 case CONST_VECTOR:
2761 {
2762 unsigned n = CONST_VECTOR_NUNITS (x);
2763 fprintf (file, "{ ");
2764 for (unsigned i = 0; i < n; ++i)
2765 {
2766 if (i != 0)
2767 fprintf (file, ", ");
2768
2769 rtx elem = CONST_VECTOR_ELT (x, i);
2770 output_addr_const (file, elem);
2771 }
2772 fprintf (file, " }");
2773 }
2774 break;
2775
2776 default:
2777 output_addr_const (file, x);
2778 }
2779 }
2780 }
2781 \f
2782 /* Record replacement regs used to deal with subreg operands. */
2783 struct reg_replace
2784 {
2785 rtx replacement[MAX_RECOG_OPERANDS];
2786 machine_mode mode;
2787 int n_allocated;
2788 int n_in_use;
2789 };
2790
2791 /* Allocate or reuse a replacement in R and return the rtx. */
2792
2793 static rtx
2794 get_replacement (struct reg_replace *r)
2795 {
2796 if (r->n_allocated == r->n_in_use)
2797 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2798 return r->replacement[r->n_in_use++];
2799 }
2800
2801 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2802 the presence of subregs would break the rules for most instructions.
2803 Replace them with a suitable new register of the right size, plus
2804 conversion copyin/copyout instructions. */
2805
2806 static void
2807 nvptx_reorg_subreg (void)
2808 {
2809 struct reg_replace qiregs, hiregs, siregs, diregs;
2810 rtx_insn *insn, *next;
2811
2812 qiregs.n_allocated = 0;
2813 hiregs.n_allocated = 0;
2814 siregs.n_allocated = 0;
2815 diregs.n_allocated = 0;
2816 qiregs.mode = QImode;
2817 hiregs.mode = HImode;
2818 siregs.mode = SImode;
2819 diregs.mode = DImode;
2820
2821 for (insn = get_insns (); insn; insn = next)
2822 {
2823 next = NEXT_INSN (insn);
2824 if (!NONDEBUG_INSN_P (insn)
2825 || asm_noperands (PATTERN (insn)) >= 0
2826 || GET_CODE (PATTERN (insn)) == USE
2827 || GET_CODE (PATTERN (insn)) == CLOBBER)
2828 continue;
2829
2830 qiregs.n_in_use = 0;
2831 hiregs.n_in_use = 0;
2832 siregs.n_in_use = 0;
2833 diregs.n_in_use = 0;
2834 extract_insn (insn);
2835 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2836
2837 for (int i = 0; i < recog_data.n_operands; i++)
2838 {
2839 rtx op = recog_data.operand[i];
2840 if (GET_CODE (op) != SUBREG)
2841 continue;
2842
2843 rtx inner = SUBREG_REG (op);
2844
2845 machine_mode outer_mode = GET_MODE (op);
2846 machine_mode inner_mode = GET_MODE (inner);
2847 gcc_assert (s_ok);
2848 if (s_ok
2849 && (GET_MODE_PRECISION (inner_mode)
2850 >= GET_MODE_PRECISION (outer_mode)))
2851 continue;
2852 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2853 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2854 : outer_mode == HImode ? &hiregs
2855 : outer_mode == SImode ? &siregs
2856 : &diregs);
2857 rtx new_reg = get_replacement (r);
2858
2859 if (recog_data.operand_type[i] != OP_OUT)
2860 {
2861 enum rtx_code code;
2862 if (GET_MODE_PRECISION (inner_mode)
2863 < GET_MODE_PRECISION (outer_mode))
2864 code = ZERO_EXTEND;
2865 else
2866 code = TRUNCATE;
2867
2868 rtx pat = gen_rtx_SET (new_reg,
2869 gen_rtx_fmt_e (code, outer_mode, inner));
2870 emit_insn_before (pat, insn);
2871 }
2872
2873 if (recog_data.operand_type[i] != OP_IN)
2874 {
2875 enum rtx_code code;
2876 if (GET_MODE_PRECISION (inner_mode)
2877 < GET_MODE_PRECISION (outer_mode))
2878 code = TRUNCATE;
2879 else
2880 code = ZERO_EXTEND;
2881
2882 rtx pat = gen_rtx_SET (inner,
2883 gen_rtx_fmt_e (code, inner_mode, new_reg));
2884 emit_insn_after (pat, insn);
2885 }
2886 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2887 }
2888 }
2889 }
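/* Editorial sketch: given an input operand (subreg:QI (reg:SI 23) 0),
   the pass above allocates a fresh QImode register and emits

       (set (reg:QI new) (truncate:QI (reg:SI 23)))

   before the insn, then rewrites the operand to use the new register;
   an output operand gets the inverse (zero-extending) copy emitted
   after the insn.  */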
2890
2891 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2892 first use. */
2893
2894 static rtx
2895 nvptx_get_unisimt_master ()
2896 {
2897 rtx &master = cfun->machine->unisimt_master;
2898 return master ? master : master = gen_reg_rtx (SImode);
2899 }
2900
2901 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2902
2903 static rtx
2904 nvptx_get_unisimt_predicate ()
2905 {
2906 rtx &pred = cfun->machine->unisimt_predicate;
2907 return pred ? pred : pred = gen_reg_rtx (BImode);
2908 }
2909
2910 /* Return true if the given call insn references one of the functions
2911 provided by the CUDA runtime: malloc, free, vprintf. */
2912
2913 static bool
2914 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2915 {
2916 rtx pat = PATTERN (insn);
2917 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2918 pat = XVECEXP (pat, 0, 0);
2919 if (GET_CODE (pat) == SET)
2920 pat = SET_SRC (pat);
2921 gcc_checking_assert (GET_CODE (pat) == CALL
2922 && GET_CODE (XEXP (pat, 0)) == MEM);
2923 rtx addr = XEXP (XEXP (pat, 0), 0);
2924 if (GET_CODE (addr) != SYMBOL_REF)
2925 return false;
2926 const char *name = XSTR (addr, 0);
2927 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2928 references with forced assembler name refer to PTX syscalls. For vprintf,
2929 accept both normal and forced-assembler-name references. */
2930 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2931 || !strcmp (name, "*malloc")
2932 || !strcmp (name, "*free"));
2933 }
2934
2935 /* If the SET subexpression of INSN sets a register, emit a shuffle instruction
2936 to propagate its value from lane MASTER to the current lane. */
2937
2938 static void
2939 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2940 {
2941 rtx reg;
2942 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2943 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2944 }
2945
2946 /* Adjust code for the uniform-simt code generation variant by making atomics and
2947 "syscalls" conditionally executed, and inserting shuffle-based propagation
2948 for registers being set. */
2949
2950 static void
2951 nvptx_reorg_uniform_simt ()
2952 {
2953 rtx_insn *insn, *next;
2954
2955 for (insn = get_insns (); insn; insn = next)
2956 {
2957 next = NEXT_INSN (insn);
2958 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2959 && !(NONJUMP_INSN_P (insn)
2960 && GET_CODE (PATTERN (insn)) == PARALLEL
2961 && get_attr_atomic (insn)))
2962 continue;
2963 rtx pat = PATTERN (insn);
2964 rtx master = nvptx_get_unisimt_master ();
2965 for (int i = 0; i < XVECLEN (pat, 0); i++)
2966 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2967 rtx pred = nvptx_get_unisimt_predicate ();
2968 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2969 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2970 validate_change (insn, &PATTERN (insn), pat, false);
2971 }
2972 }
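/* Editorial sketch: after this pass an atomic inside a uniform-simt
   region executes on one lane only and its result is re-broadcast,
   schematically

       @%p atom.global.add.u32 %r, [addr], 1;   // master lane only
       shfl.idx.b32 %r, %r, %master, 0x1f;      // propagate to all lanes

   where %p and %master stand for the uniform-simt predicate and
   master-lane-index registers initialized elsewhere.  */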
2973
2974 /* Offloading function attributes. */
2975
2976 struct offload_attrs
2977 {
2978 unsigned mask;
2979 int num_gangs;
2980 int num_workers;
2981 int vector_length;
2982 };
2983
2984 /* Define entries for cfun->machine->axis_dim. */
2985
2986 #define MACH_VECTOR_LENGTH 0
2987 #define MACH_MAX_WORKERS 1
2988
2989 static void populate_offload_attrs (offload_attrs *oa);
2990
2991 static void
2992 init_axis_dim (void)
2993 {
2994 offload_attrs oa;
2995 int max_workers;
2996
2997 populate_offload_attrs (&oa);
2998
2999 if (oa.num_workers == 0)
3000 max_workers = PTX_CTA_SIZE / oa.vector_length;
3001 else
3002 max_workers = oa.num_workers;
3003
3004 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3005 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3006 cfun->machine->axis_dim_init_p = true;
3007 }
3008
3009 static int ATTRIBUTE_UNUSED
3010 nvptx_mach_max_workers ()
3011 {
3012 if (!cfun->machine->axis_dim_init_p)
3013 init_axis_dim ();
3014 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3015 }
3016
3017 static int ATTRIBUTE_UNUSED
3018 nvptx_mach_vector_length ()
3019 {
3020 if (!cfun->machine->axis_dim_init_p)
3021 init_axis_dim ();
3022 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3023 }
3024
3025 /* Loop structure of the function. The entire function is described as
3026 a NULL loop. */
3027
3028 struct parallel
3029 {
3030 /* Parent parallel. */
3031 parallel *parent;
3032
3033 /* Next sibling parallel. */
3034 parallel *next;
3035
3036 /* First child parallel. */
3037 parallel *inner;
3038
3039 /* Partitioning mask of the parallel. */
3040 unsigned mask;
3041
3042 /* Partitioning used within inner parallels. */
3043 unsigned inner_mask;
3044
3045 /* Location of parallel forked and join. The forked is the first
3046 block in the parallel and the join is the first block after
3047 the partition. */
3048 basic_block forked_block;
3049 basic_block join_block;
3050
3051 rtx_insn *forked_insn;
3052 rtx_insn *join_insn;
3053
3054 rtx_insn *fork_insn;
3055 rtx_insn *joining_insn;
3056
3057 /* Basic blocks in this parallel, but not in child parallels. The
3058 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3059 blocks are not. */
3060 auto_vec<basic_block> blocks;
3061
3062 public:
3063 parallel (parallel *parent, unsigned mode);
3064 ~parallel ();
3065 };
3066
3067 /* Constructor links the new parallel into its parent's chain of
3068 children. */
3069
3070 parallel::parallel (parallel *parent_, unsigned mask_)
3071 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3072 {
3073 forked_block = join_block = 0;
3074 forked_insn = join_insn = 0;
3075 fork_insn = joining_insn = 0;
3076
3077 if (parent)
3078 {
3079 next = parent->inner;
3080 parent->inner = this;
3081 }
3082 }
3083
3084 parallel::~parallel ()
3085 {
3086 delete inner;
3087 delete next;
3088 }
3089
3090 /* Map of basic blocks to insns. */
3091 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3092
3093 /* A tuple of an insn of interest and the BB in which it resides. */
3094 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3095 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3096
3097 /* Split basic blocks so that each forked and join unspec is at
3098 the start of its basic block. Thus afterwards each block will
3099 have a single partitioning mode. We also do the same for return
3100 insns, as they are executed by every thread. Return the
3101 partitioning mode of the function as a whole. Populate MAP with
3102 head and tail blocks. We also clear the BB visited flag, which is
3103 used when finding partitions. */
3104
3105 static void
3106 nvptx_split_blocks (bb_insn_map_t *map)
3107 {
3108 insn_bb_vec_t worklist;
3109 basic_block block;
3110 rtx_insn *insn;
3111
3112 /* Locate all the reorg instructions of interest. */
3113 FOR_ALL_BB_FN (block, cfun)
3114 {
3115 bool seen_insn = false;
3116
3117 /* Clear the visited flag, for use by the parallel locator. */
3118 block->flags &= ~BB_VISITED;
3119
3120 FOR_BB_INSNS (block, insn)
3121 {
3122 if (!INSN_P (insn))
3123 continue;
3124 switch (recog_memoized (insn))
3125 {
3126 default:
3127 seen_insn = true;
3128 continue;
3129 case CODE_FOR_nvptx_forked:
3130 case CODE_FOR_nvptx_join:
3131 break;
3132
3133 case CODE_FOR_return:
3134 /* We also need to split just before return insns, as
3135 that insn needs executing by all threads, but the
3136 block it is in probably does not. */
3137 break;
3138 }
3139
3140 if (seen_insn)
3141 /* We've found an instruction that must be at the start of
3142 a block, but isn't. Add it to the worklist. */
3143 worklist.safe_push (insn_bb_t (insn, block));
3144 else
3145 /* It was already the first instruction. Just add it to
3146 the map. */
3147 map->get_or_insert (block) = insn;
3148 seen_insn = true;
3149 }
3150 }
3151
3152 /* Split blocks on the worklist. */
3153 unsigned ix;
3154 insn_bb_t *elt;
3155 basic_block remap = 0;
3156 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3157 {
3158 if (remap != elt->second)
3159 {
3160 block = elt->second;
3161 remap = block;
3162 }
3163
3164 /* Split the block before the insn. The insn is in the new block. */
3165 edge e = split_block (block, PREV_INSN (elt->first));
3166
3167 block = e->dest;
3168 map->get_or_insert (block) = elt->first;
3169 }
3170 }
3171
3172 /* Return true if MASK contains parallelism that requires shared
3173 memory to broadcast. */
3174
3175 static bool
3176 nvptx_needs_shared_bcast (unsigned mask)
3177 {
3178 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3179 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3180 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3181
3182 return worker || large_vector;
3183 }
3184
3185 /* BLOCK is a basic block containing a head or tail instruction.
3186 Locate the associated prehead or pretail instruction, which must be
3187 in the single predecessor block. */
3188
3189 static rtx_insn *
3190 nvptx_discover_pre (basic_block block, int expected)
3191 {
3192 gcc_assert (block->preds->length () == 1);
3193 basic_block pre_block = (*block->preds)[0]->src;
3194 rtx_insn *pre_insn;
3195
3196 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3197 pre_insn = PREV_INSN (pre_insn))
3198 gcc_assert (pre_insn != BB_HEAD (pre_block));
3199
3200 gcc_assert (recog_memoized (pre_insn) == expected);
3201 return pre_insn;
3202 }
3203
3204 /* Dump this parallel and all its inner parallels. */
3205
3206 static void
3207 nvptx_dump_pars (parallel *par, unsigned depth)
3208 {
3209 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3210 depth, par->mask,
3211 par->forked_block ? par->forked_block->index : -1,
3212 par->join_block ? par->join_block->index : -1);
3213
3214 fprintf (dump_file, " blocks:");
3215
3216 basic_block block;
3217 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3218 fprintf (dump_file, " %d", block->index);
3219 fprintf (dump_file, "\n");
3220 if (par->inner)
3221 nvptx_dump_pars (par->inner, depth + 1);
3222
3223 if (par->next)
3224 nvptx_dump_pars (par->next, depth);
3225 }
3226
3227 /* If BLOCK contains a fork/join marker, process it to create or
3228 terminate a loop structure. Add this block to the current loop,
3229 and then walk successor blocks. */
3230
3231 static parallel *
3232 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3233 {
3234 if (block->flags & BB_VISITED)
3235 return par;
3236 block->flags |= BB_VISITED;
3237
3238 if (rtx_insn **endp = map->get (block))
3239 {
3240 rtx_insn *end = *endp;
3241
3242 /* This is a block head or tail, or return instruction. */
3243 switch (recog_memoized (end))
3244 {
3245 case CODE_FOR_return:
3246 /* Return instructions are in their own block, and we
3247 don't need to do anything more. */
3248 return par;
3249
3250 case CODE_FOR_nvptx_forked:
3251 /* Loop head, create a new inner loop and add it into
3252 our parent's child list. */
3253 {
3254 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3255
3256 gcc_assert (mask);
3257 par = new parallel (par, mask);
3258 par->forked_block = block;
3259 par->forked_insn = end;
3260 if (nvptx_needs_shared_bcast (mask))
3261 par->fork_insn
3262 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3263 }
3264 break;
3265
3266 case CODE_FOR_nvptx_join:
3267 /* A loop tail. Finish the current loop and return to
3268 parent. */
3269 {
3270 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3271
3272 gcc_assert (par->mask == mask);
3273 gcc_assert (par->join_block == NULL);
3274 par->join_block = block;
3275 par->join_insn = end;
3276 if (nvptx_needs_shared_bcast (mask))
3277 par->joining_insn
3278 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3279 par = par->parent;
3280 }
3281 break;
3282
3283 default:
3284 gcc_unreachable ();
3285 }
3286 }
3287
3288 if (par)
3289 /* Add this block onto the current loop's list of blocks. */
3290 par->blocks.safe_push (block);
3291 else
3292 /* This must be the entry block. Create a NULL parallel. */
3293 par = new parallel (0, 0);
3294
3295 /* Walk successor blocks. */
3296 edge e;
3297 edge_iterator ei;
3298
3299 FOR_EACH_EDGE (e, ei, block->succs)
3300 nvptx_find_par (map, par, e->dest);
3301
3302 return par;
3303 }
3304
3305 /* DFS walk the CFG looking for fork & join markers. Construct
3306 loop structures as we go. MAP is a mapping of basic blocks
3307 to head & tail markers, discovered when splitting blocks. This
3308 speeds up the discovery. We rely on the BB visited flag having
3309 been cleared when splitting blocks. */
3310
3311 static parallel *
3312 nvptx_discover_pars (bb_insn_map_t *map)
3313 {
3314 basic_block block;
3315
3316 /* Mark the exit block as visited. */
3317 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3318 block->flags |= BB_VISITED;
3319
3320 /* And the entry block as not. */
3321 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3322 block->flags &= ~BB_VISITED;
3323
3324 parallel *par = nvptx_find_par (map, 0, block);
3325
3326 if (dump_file)
3327 {
3328 fprintf (dump_file, "\nLoops\n");
3329 nvptx_dump_pars (par, 0);
3330 fprintf (dump_file, "\n");
3331 }
3332
3333 return par;
3334 }
3335
3336 /* Analyse a group of BBs within a partitioned region and create N
3337 Single-Entry-Single-Exit regions. Some of those regions will be
3338 trivial ones consisting of a single BB. The blocks of a
3339 partitioned region might form a set of disjoint graphs -- because
3340 the region encloses a differently partitoned sub region.
3341
3342 We use the linear time algorithm described in 'Finding Regions Fast:
3343 Single Entry Single Exit and control Regions in Linear Time'
3344 Johnson, Pearson & Pingali. That algorithm deals with complete
3345 CFGs, where a back edge is inserted from END to START, and thus the
3346 problem becomes one of finding equivalent loops.
3347
3348 In this case we have a partial CFG. We complete it by redirecting
3349 any incoming edge to the graph to be from an arbitrary external BB,
3350 and similarly redirecting any outgoing edge to be to that BB.
3351 Thus we end up with a closed graph.
3352
3353 The algorithm works by building a spanning tree of an undirected
3354 graph and keeping track of back edges from nodes further from the
3355 root in the tree to nodes nearer to the root in the tree. In the
3356 description below, the root is up and the tree grows downwards.
3357
3358 We avoid having to deal with degenerate back-edges to the same
3359 block, by splitting each BB into 3 -- one for input edges, one for
3360 the node itself and one for the output edges. Such back edges are
3361 referred to as 'Brackets'. Cycle equivalent nodes will have the
3362 same set of brackets.
3363
3364 Determining bracket equivalency is done by maintaining a list of
3365 brackets in such a manner that the list length and final bracket
3366 uniquely identify the set.
3367
3368 We use coloring to mark all BBs with cycle equivalency with the
3369 same color. This is the output of the 'Finding Regions Fast'
3370 algorithm. Notice it doesn't actually find the set of nodes within
3371 a particular region, just unordered sets of nodes that are the
3372 entries and exits of SESE regions.
3373
3374 After determining cycle equivalency, we need to find the minimal
3375 set of SESE regions. Do this with a DFS coloring walk of the
3376 complete graph. We're either 'looking' or 'coloring'. When
3377 looking, and we're in the subgraph, we start coloring the color of
3378 the current node, and remember that node as the start of the
3379 current color's SESE region. Every time we go to a new node, we
3380 decrement the count of nodes with that color. If it reaches zero,
3381 we remember that node as the end of the current color's SESE region
3382 and return to 'looking'. Otherwise we color the node the current
3383 color.
3384
3385 This way we end up with coloring the inside of non-trivial SESE
3386 regions with the color of that region. */
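/* Editorial example: in a diamond CFG A -> {B, C} -> D, completed with
   a back edge D -> A, every cycle passes through both A and D, so they
   receive the same bracket set and hence the same color; (A, D) is
   then found as a SESE region, while B and C form trivial
   single-block regions.  */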
3387
3388 /* A pair of BBs. We use this to represent SESE regions. */
3389 typedef std::pair<basic_block, basic_block> bb_pair_t;
3390 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3391
3392 /* A node in the undirected CFG. The discriminator SECOND indicates just
3393 above or just below the BB indicated by FIRST. */
3394 typedef std::pair<basic_block, int> pseudo_node_t;
3395
3396 /* A bracket indicates an edge towards the root of the spanning tree of the
3397 undirected graph. Each bracket has a color, determined
3398 from the current set of brackets. */
3399 struct bracket
3400 {
3401 pseudo_node_t back; /* Back target */
3402
3403 /* Current color and size of set. */
3404 unsigned color;
3405 unsigned size;
3406
3407 bracket (pseudo_node_t back_)
3408 : back (back_), color (~0u), size (~0u)
3409 {
3410 }
3411
3412 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3413 {
3414 if (length != size)
3415 {
3416 size = length;
3417 color = color_counts.length ();
3418 color_counts.quick_push (0);
3419 }
3420 color_counts[color]++;
3421 return color;
3422 }
3423 };
3424
3425 typedef auto_vec<bracket> bracket_vec_t;
3426
3427 /* Basic block info for finding SESE regions. */
3428
3429 struct bb_sese
3430 {
3431 int node; /* Node number in spanning tree. */
3432 int parent; /* Parent node number. */
3433
3434 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3435 edges arrive at pseudo-node Ai and the outgoing edges leave at
3436 pseudo-node Ao. We have to remember which way we arrived at a
3437 particular node when generating the spanning tree. dir > 0 means
3438 we arrived at Ai, dir < 0 means we arrived at Ao. */
3439 int dir;
3440
3441 /* Lowest numbered pseudo-node reached via a backedge from this
3442 node, or any descendant. */
3443 pseudo_node_t high;
3444
3445 int color; /* Cycle-equivalence color */
3446
3447 /* Stack of brackets for this node. */
3448 bracket_vec_t brackets;
3449
3450 bb_sese (unsigned node_, unsigned p, int dir_)
3451 :node (node_), parent (p), dir (dir_)
3452 {
3453 }
3454 ~bb_sese ();
3455
3456 /* Push a bracket ending at BACK. */
3457 void push (const pseudo_node_t &back)
3458 {
3459 if (dump_file)
3460 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3461 back.first ? back.first->index : 0, back.second);
3462 brackets.safe_push (bracket (back));
3463 }
3464
3465 void append (bb_sese *child);
3466 void remove (const pseudo_node_t &);
3467
3468 /* Set node's color. */
3469 void set_color (auto_vec<unsigned> &color_counts)
3470 {
3471 color = brackets.last ().get_color (color_counts, brackets.length ());
3472 }
3473 };
3474
3475 bb_sese::~bb_sese ()
3476 {
3477 }
3478
3479 /* Destructively append CHILD's brackets. */
3480
3481 void
3482 bb_sese::append (bb_sese *child)
3483 {
3484 if (int len = child->brackets.length ())
3485 {
3486 int ix;
3487
3488 if (dump_file)
3489 {
3490 for (ix = 0; ix < len; ix++)
3491 {
3492 const pseudo_node_t &pseudo = child->brackets[ix].back;
3493 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3494 child->node, pseudo.first ? pseudo.first->index : 0,
3495 pseudo.second);
3496 }
3497 }
3498 if (!brackets.length ())
3499 std::swap (brackets, child->brackets);
3500 else
3501 {
3502 brackets.reserve (len);
3503 for (ix = 0; ix < len; ix++)
3504 brackets.quick_push (child->brackets[ix]);
3505 }
3506 }
3507 }
3508
3509 /* Remove brackets that terminate at PSEUDO. */
3510
3511 void
3512 bb_sese::remove (const pseudo_node_t &pseudo)
3513 {
3514 unsigned removed = 0;
3515 int len = brackets.length ();
3516
3517 for (int ix = 0; ix < len; ix++)
3518 {
3519 if (brackets[ix].back == pseudo)
3520 {
3521 if (dump_file)
3522 fprintf (dump_file, "Removing backedge %d:%+d\n",
3523 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3524 removed++;
3525 }
3526 else if (removed)
3527 brackets[ix-removed] = brackets[ix];
3528 }
3529 while (removed--)
3530 brackets.pop ();
3531 }
3532
3533 /* Accessors for BB's aux pointer. */
3534 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3535 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3536
3537 /* DFS walk creating SESE data structures. Only cover nodes with
3538 BB_VISITED set. Append discovered blocks to LIST. We number in
3539 increments of 3 so that the above and below pseudo nodes can be
3540 implicitly numbered too. */
3541
3542 static int
3543 nvptx_sese_number (int n, int p, int dir, basic_block b,
3544 auto_vec<basic_block> *list)
3545 {
3546 if (BB_GET_SESE (b))
3547 return n;
3548
3549 if (dump_file)
3550 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3551 b->index, n, p, dir);
3552
3553 BB_SET_SESE (b, new bb_sese (n, p, dir));
3554 p = n;
3555
3556 n += 3;
3557 list->quick_push (b);
3558
3559 /* First walk the nodes on the 'other side' of this node, then walk
3560 the nodes on the same side. */
3561 for (unsigned ix = 2; ix; ix--)
3562 {
3563 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3564 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3565 : offsetof (edge_def, src));
3566 edge e;
3567 edge_iterator ei;
3568
3569 FOR_EACH_EDGE (e, ei, edges)
3570 {
3571 basic_block target = *(basic_block *)((char *)e + offset);
3572
3573 if (target->flags & BB_VISITED)
3574 n = nvptx_sese_number (n, p, dir, target, list);
3575 }
3576 dir = -dir;
3577 }
3578 return n;
3579 }
3580
3581 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3582 EDGES are the outgoing edges and OFFSET is the offset to the src
3583 or dst block on the edges. */
3584
3585 static void
3586 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3587 vec<edge, va_gc> *edges, size_t offset)
3588 {
3589 edge e;
3590 edge_iterator ei;
3591 int hi_back = depth;
3592 pseudo_node_t node_back (0, depth);
3593 int hi_child = depth;
3594 pseudo_node_t node_child (0, depth);
3595 basic_block child = NULL;
3596 unsigned num_children = 0;
3597 int usd = -dir * sese->dir;
3598
3599 if (dump_file)
3600 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3601 me->index, sese->node, dir);
3602
3603 if (dir < 0)
3604 {
3605 /* This is the above pseudo-child. It has the BB itself as an
3606 additional child node. */
3607 node_child = sese->high;
3608 hi_child = node_child.second;
3609 if (node_child.first)
3610 hi_child += BB_GET_SESE (node_child.first)->node;
3611 num_children++;
3612 }
3613
3614 /* Examine each edge.
3615 - if it is a child (a) append its bracket list and (b) record
3616 whether it is the child with the highest reaching bracket.
3617 - if it is an edge to ancestor, record whether it's the highest
3618 reaching backlink. */
3619 FOR_EACH_EDGE (e, ei, edges)
3620 {
3621 basic_block target = *(basic_block *)((char *)e + offset);
3622
3623 if (bb_sese *t_sese = BB_GET_SESE (target))
3624 {
3625 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3626 {
3627 /* Child node. Append its bracket list. */
3628 num_children++;
3629 sese->append (t_sese);
3630
3631 /* Compare its hi value. */
3632 int t_hi = t_sese->high.second;
3633
3634 if (basic_block child_hi_block = t_sese->high.first)
3635 t_hi += BB_GET_SESE (child_hi_block)->node;
3636
3637 if (hi_child > t_hi)
3638 {
3639 hi_child = t_hi;
3640 node_child = t_sese->high;
3641 child = target;
3642 }
3643 }
3644 else if (t_sese->node < sese->node + dir
3645 && !(dir < 0 && sese->parent == t_sese->node))
3646 {
3647 /* Non-parental ancestor node -- a backlink. */
3648 int d = usd * t_sese->dir;
3649 int back = t_sese->node + d;
3650
3651 if (hi_back > back)
3652 {
3653 hi_back = back;
3654 node_back = pseudo_node_t (target, d);
3655 }
3656 }
3657 }
3658 else
3659 { /* Fallen off graph, backlink to entry node. */
3660 hi_back = 0;
3661 node_back = pseudo_node_t (0, 0);
3662 }
3663 }
3664
3665 /* Remove any brackets that terminate at this pseudo node. */
3666 sese->remove (pseudo_node_t (me, dir));
3667
3668 /* Now push any backlinks from this pseudo node. */
3669 FOR_EACH_EDGE (e, ei, edges)
3670 {
3671 basic_block target = *(basic_block *)((char *)e + offset);
3672 if (bb_sese *t_sese = BB_GET_SESE (target))
3673 {
3674 if (t_sese->node < sese->node + dir
3675 && !(dir < 0 && sese->parent == t_sese->node))
3676 /* Non-parental ancestor node - backedge from me. */
3677 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3678 }
3679 else
3680 {
3681 /* Back edge to the entry node. */
3682 sese->push (pseudo_node_t (0, 0));
3683 }
3684 }
3685
3686 /* If this node leads directly or indirectly to a no-return region of
3687 the graph, then fake a backedge to the entry node. */
3688 if (!sese->brackets.length () || !edges || !edges->length ())
3689 {
3690 hi_back = 0;
3691 node_back = pseudo_node_t (0, 0);
3692 sese->push (node_back);
3693 }
3694
3695 /* Record the highest reaching backedge from us or a descendant. */
3696 sese->high = hi_back < hi_child ? node_back : node_child;
3697
3698 if (num_children > 1)
3699 {
3700 /* There is more than one child -- this is a Y shaped piece of
3701 spanning tree. We have to insert a fake backedge from this
3702 node to the highest ancestor reached by not-the-highest
3703 reaching child. Note that there may be multiple children
3704 with backedges to the same highest node. That's ok and we
3705 insert the edge to that highest node. */
3706 hi_child = depth;
3707 if (dir < 0 && child)
3708 {
3709 node_child = sese->high;
3710 hi_child = node_child.second;
3711 if (node_child.first)
3712 hi_child += BB_GET_SESE (node_child.first)->node;
3713 }
3714
3715 FOR_EACH_EDGE (e, ei, edges)
3716 {
3717 basic_block target = *(basic_block *)((char *)e + offset);
3718
3719 if (target == child)
3720 /* Ignore the highest child. */
3721 continue;
3722
3723 bb_sese *t_sese = BB_GET_SESE (target);
3724 if (!t_sese)
3725 continue;
3726 if (t_sese->parent != sese->node)
3727 /* Not a child. */
3728 continue;
3729
3730 /* Compare its hi value. */
3731 int t_hi = t_sese->high.second;
3732
3733 if (basic_block child_hi_block = t_sese->high.first)
3734 t_hi += BB_GET_SESE (child_hi_block)->node;
3735
3736 if (hi_child > t_hi)
3737 {
3738 hi_child = t_hi;
3739 node_child = t_sese->high;
3740 }
3741 }
3742
3743 sese->push (node_child);
3744 }
3745 }
3746
3747
3748 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3749 proceed to successors. Set SESE entry and exit nodes of
3750 REGIONS. */
3751
3752 static void
3753 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3754 basic_block block, int coloring)
3755 {
3756 bb_sese *sese = BB_GET_SESE (block);
3757
3758 if (block->flags & BB_VISITED)
3759 {
3760 /* If we've already encountered this block, either we must not
3761 be coloring, or it must have been colored the current color. */
3762 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3763 return;
3764 }
3765
3766 block->flags |= BB_VISITED;
3767
3768 if (sese)
3769 {
3770 if (coloring < 0)
3771 {
3772 /* Start coloring a region. */
3773 regions[sese->color].first = block;
3774 coloring = sese->color;
3775 }
3776
3777 if (!--color_counts[sese->color] && sese->color == coloring)
3778 {
3779 /* Found final block of SESE region. */
3780 regions[sese->color].second = block;
3781 coloring = -1;
3782 }
3783 else
3784 /* Color the node, so we can assert on revisiting the node
3785 that the graph is indeed SESE. */
3786 sese->color = coloring;
3787 }
3788 else
3789 /* Fallen off the subgraph, we cannot be coloring. */
3790 gcc_assert (coloring < 0);
3791
3792 /* Walk each successor block. */
3793 if (block->succs && block->succs->length ())
3794 {
3795 edge e;
3796 edge_iterator ei;
3797
3798 FOR_EACH_EDGE (e, ei, block->succs)
3799 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3800 }
3801 else
3802 gcc_assert (coloring < 0);
3803 }
3804
3805 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3806 end up with NULL entries in it. */
3807
3808 static void
3809 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3810 {
3811 basic_block block;
3812 int ix;
3813
3814 /* First clear each BB of the whole function. */
3815 FOR_ALL_BB_FN (block, cfun)
3816 {
3817 block->flags &= ~BB_VISITED;
3818 BB_SET_SESE (block, 0);
3819 }
3820
3821 /* Mark blocks in the function that are in this graph. */
3822 for (ix = 0; blocks.iterate (ix, &block); ix++)
3823 block->flags |= BB_VISITED;
3824
3825 /* Counts of nodes assigned to each color. There cannot be more
3826 colors than blocks (and hopefully there will be fewer). */
3827 auto_vec<unsigned> color_counts;
3828 color_counts.reserve (blocks.length ());
3829
3830 /* Worklist of nodes in the spanning tree. Again, there cannot be
3831 more nodes in the tree than blocks (there will be fewer if the
3832 CFG of blocks is disjoint). */
3833 auto_vec<basic_block> spanlist;
3834 spanlist.reserve (blocks.length ());
3835
3836 /* Make sure every block has its cycle class determined. */
3837 for (ix = 0; blocks.iterate (ix, &block); ix++)
3838 {
3839 if (BB_GET_SESE (block))
3840 /* We already met this block in an earlier graph solve. */
3841 continue;
3842
3843 if (dump_file)
3844 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3845
3846 /* Number the nodes reachable from BLOCK in initial DFS order. */
3847 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3848
3849 /* Now walk in reverse DFS order to find cycle equivalents. */
3850 while (spanlist.length ())
3851 {
3852 block = spanlist.pop ();
3853 bb_sese *sese = BB_GET_SESE (block);
3854
3855 /* Do the pseudo node below. */
3856 nvptx_sese_pseudo (block, sese, depth, +1,
3857 sese->dir > 0 ? block->succs : block->preds,
3858 (sese->dir > 0 ? offsetof (edge_def, dest)
3859 : offsetof (edge_def, src)));
3860 sese->set_color (color_counts);
3861 /* Do the pseudo node above. */
3862 nvptx_sese_pseudo (block, sese, depth, -1,
3863 sese->dir < 0 ? block->succs : block->preds,
3864 (sese->dir < 0 ? offsetof (edge_def, dest)
3865 : offsetof (edge_def, src)));
3866 }
3867 if (dump_file)
3868 fprintf (dump_file, "\n");
3869 }
3870
3871 if (dump_file)
3872 {
3873 unsigned count;
3874 const char *comma = "";
3875
3876 fprintf (dump_file, "Found %d cycle equivalents\n",
3877 color_counts.length ());
3878 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3879 {
3880 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3881
3882 comma = "";
3883 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3884 if (BB_GET_SESE (block)->color == ix)
3885 {
3886 block->flags |= BB_VISITED;
3887 fprintf (dump_file, "%s%d", comma, block->index);
3888 comma=",";
3889 }
3890 fprintf (dump_file, "}");
3891 comma = ", ";
3892 }
3893 fprintf (dump_file, "\n");
3894 }
3895
3896 /* Now we've colored every block in the subgraph. We now need to
3897 determine the minimal set of SESE regions that cover that
3898 subgraph. Do this with a DFS walk of the complete function.
3899 During the walk we're either 'looking' or 'coloring'. When we
3900 reach the last node of a particular color, we stop coloring and
3901 return to looking. */
3902
3903 /* There cannot be more SESE regions than colors. */
3904 regions.reserve (color_counts.length ());
3905 for (ix = color_counts.length (); ix--;)
3906 regions.quick_push (bb_pair_t (0, 0));
3907
3908 for (ix = 0; blocks.iterate (ix, &block); ix++)
3909 block->flags &= ~BB_VISITED;
3910
3911 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3912
3913 if (dump_file)
3914 {
3915 const char *comma = "";
3916 int len = regions.length ();
3917
3918 fprintf (dump_file, "SESE regions:");
3919 for (ix = 0; ix != len; ix++)
3920 {
3921 basic_block from = regions[ix].first;
3922 basic_block to = regions[ix].second;
3923
3924 if (from)
3925 {
3926 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3927 if (to != from)
3928 fprintf (dump_file, "->%d", to->index);
3929
3930 int color = BB_GET_SESE (from)->color;
3931
3932 /* Print the blocks within the region (excluding ends). */
3933 FOR_EACH_BB_FN (block, cfun)
3934 {
3935 bb_sese *sese = BB_GET_SESE (block);
3936
3937 if (sese && sese->color == color
3938 && block != from && block != to)
3939 fprintf (dump_file, ".%d", block->index);
3940 }
3941 fprintf (dump_file, "}");
3942 }
3943 comma = ",";
3944 }
3945 fprintf (dump_file, "\n\n");
3946 }
3947
3948 for (ix = 0; blocks.iterate (ix, &block); ix++)
3949 delete BB_GET_SESE (block);
3950 }
3951
3952 #undef BB_SET_SESE
3953 #undef BB_GET_SESE
3954
3955 /* Propagate live state at the start of a partitioned region. IS_CALL
3956 indicates whether the propagation is for a (partitioned) call
3957 instruction. BLOCK provides the live register information, and
3958 might not contain INSN. Propagation is inserted just after INSN. RW
3959 indicates whether we are reading and/or writing state. This
3960 separation is needed for worker-level propagation where we
3961 essentially do a spill & fill. FN is the underlying worker
3962 function to generate the propagation instructions for a single
3963 register. DATA is user data.
3964
3965 Returns true if we didn't emit any instructions.
3966
3967 We propagate the live register set for non-calls and the entire
3968 frame for calls and non-calls. We could do better by (a)
3969 propagating just the live set that is used within the partitioned
3970 regions and (b) only propagating stack entries that are used. The
3971 latter might be quite hard to determine. */
3972
3973 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
3974
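/* A sketch of the frame-copy loop nvptx_propagate emits below, for a
   frame of FS DImode slots (illustrative pseudo code, not the literal
   RTL; %tmp, %ptr and %idx stand for the temporaries allocated there,
   and the loop is elided when FS == 1):

       %ptr = frame_pointer;
       %idx = FS;
     loop:
       %idx -= 1;
       %tmp = *%ptr;        // only if RW & PM_read
       FN (%tmp, RW, ...);  // propagate one value
       *%ptr = %tmp;        // only if RW & PM_write
       %ptr += 8;
       if (%idx != 0) goto loop;  */
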
3975 static bool
3976 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3977 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3978 {
3979 bitmap live = DF_LIVE_IN (block);
3980 bitmap_iterator iterator;
3981 unsigned ix;
3982 bool empty = true;
3983
3984 /* Copy the frame array. */
3985 HOST_WIDE_INT fs = get_frame_size ();
3986 if (fs)
3987 {
3988 rtx tmp = gen_reg_rtx (DImode);
3989 rtx idx = NULL_RTX;
3990 rtx ptr = gen_reg_rtx (Pmode);
3991 rtx pred = NULL_RTX;
3992 rtx_code_label *label = NULL;
3993
3994 empty = false;
3995 /* The frame size might not be DImode compatible, but the frame
3996 array's declaration will be. So it's ok to round up here. */
3997 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3998 /* Detect single iteration loop. */
3999 if (fs == 1)
4000 fs = 0;
4001
4002 start_sequence ();
4003 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4004 if (fs)
4005 {
4006 idx = gen_reg_rtx (SImode);
4007 pred = gen_reg_rtx (BImode);
4008 label = gen_label_rtx ();
4009
4010 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4011 /* Allow worker function to initialize anything needed. */
4012 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4013 if (init)
4014 emit_insn (init);
4015 emit_label (label);
4016 LABEL_NUSES (label)++;
4017 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4018 }
4019 if (rw & PM_read)
4020 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4021 emit_insn (fn (tmp, rw, fs, data, vector));
4022 if (rw & PM_write)
4023 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4024 if (fs)
4025 {
4026 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4027 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4028 emit_insn (gen_br_true_uni (pred, label));
4029 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4030 if (fini)
4031 emit_insn (fini);
4032 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4033 }
4034 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4035 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4036 rtx cpy = get_insns ();
4037 end_sequence ();
4038 insn = emit_insn_after (cpy, insn);
4039 }
4040
4041 if (!is_call)
4042 /* Copy live registers. */
4043 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4044 {
4045 rtx reg = regno_reg_rtx[ix];
4046
4047 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4048 {
4049 rtx bcast = fn (reg, rw, 0, data, vector);
4050
4051 insn = emit_insn_after (bcast, insn);
4052 empty = false;
4053 }
4054 }
4055 return empty;
4056 }
4057
4058 /* Worker for nvptx_warp_propagate. */
4059
4060 static rtx
4061 warp_prop_gen (rtx reg, propagate_mask pm,
4062 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4063 bool ARG_UNUSED (vector))
4064 {
4065 if (!(pm & PM_read_write))
4066 return 0;
4067
4068 return nvptx_gen_warp_bcast (reg);
4069 }
4070
4071 /* Propagate state that is live at start of BLOCK across the vectors
4072 of a single warp. Propagation is inserted just after INSN.
4073 IS_CALL and the return value are as for nvptx_propagate. */
4074
4075 static bool
4076 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4077 {
4078 return nvptx_propagate (is_call, block, insn, PM_read_write,
4079 warp_prop_gen, 0, false);
4080 }
4081
4082 /* Worker for nvptx_shared_propagate. */
4083
4084 static rtx
4085 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4086 bool vector)
4087 {
4088 broadcast_data_t *data = (broadcast_data_t *)data_;
4089
4090 if (pm & PM_loop_begin)
4091 {
4092 /* Starting a loop, initialize pointer. */
4093 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4094
4095 oacc_bcast_align = MAX (oacc_bcast_align, align);
4096 data->offset = ROUND_UP (data->offset, align);
4097
4098 data->ptr = gen_reg_rtx (Pmode);
4099
4100 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4101 }
4102 else if (pm & PM_loop_end)
4103 {
4104 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4105 data->ptr = NULL_RTX;
4106 return clobber;
4107 }
4108 else
4109 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4110 }
4111
4112 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4113 indicates if this is just before partitioned mode (do spill), or
4114 just after it starts (do fill). Sequence is inserted just after
4115 INSN. IS_CALL and the return value are as for nvptx_propagate. */
4116
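/* A sketch of the resulting spill/fill pairing for one live SImode
   register %r (illustrative; the actual instructions come from
   nvptx_gen_shared_bcast and use the broadcast buffer in .shared
   memory):

       st.shared.u32 [%buf], %r;   // PRE_P: spill just before
       ... enter partitioned mode ...
       ld.shared.u32 %r, [%buf];   // !PRE_P: fill just after  */
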
4117 static bool
4118 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4119 rtx_insn *insn, bool vector)
4120 {
4121 broadcast_data_t data;
4122
4123 data.base = gen_reg_rtx (Pmode);
4124 data.offset = 0;
4125 data.ptr = NULL_RTX;
4126
4127 bool empty = nvptx_propagate (is_call, block, insn,
4128 pre_p ? PM_read : PM_write, shared_prop_gen,
4129 &data, vector);
4130 gcc_assert (empty == !data.offset);
4131 if (data.offset)
4132 {
4133 rtx bcast_sym = oacc_bcast_sym;
4134
4135 /* Stuff was emitted, initialize the base pointer now. */
4136 if (vector && nvptx_mach_max_workers () > 1)
4137 {
4138 if (!cfun->machine->bcast_partition)
4139 {
4140 /* It would be nice to place this register in
4141 DATA_AREA_SHARED. */
4142 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4143 }
4144 if (!cfun->machine->sync_bar)
4145 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4146
4147 bcast_sym = cfun->machine->bcast_partition;
4148 }
4149
4150 rtx init = gen_rtx_SET (data.base, bcast_sym);
4151 emit_insn_after (init, insn);
4152
4153 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4154 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4155 ? nvptx_mach_max_workers () + 1
4156 : 1);
4157
4158 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4159 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4160 }
4161 return empty;
4162 }
4163
4164 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4165 which is an integer or a register. THREADS is the number of threads
4166 controlled by the barrier. */
4167
4168 static rtx
4169 nvptx_cta_sync (rtx lock, int threads)
4170 {
4171 return gen_nvptx_barsync (lock, GEN_INT (threads));
4172 }
4173
4174 #if WORKAROUND_PTXJIT_BUG
4175 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4176 real insns. */
4177
4178 static rtx_insn *
4179 bb_first_real_insn (basic_block bb)
4180 {
4181 rtx_insn *insn;
4182
4183 /* Find the first real insn in BB. */
4184 FOR_BB_INSNS (bb, insn)
4185 if (INSN_P (insn))
4186 return insn;
4187
4188 return 0;
4189 }
4190 #endif
4191
4192 /* Return true if INSN needs neutering. */
4193
4194 static bool
4195 needs_neutering_p (rtx_insn *insn)
4196 {
4197 if (!INSN_P (insn))
4198 return false;
4199
4200 switch (recog_memoized (insn))
4201 {
4202 case CODE_FOR_nvptx_fork:
4203 case CODE_FOR_nvptx_forked:
4204 case CODE_FOR_nvptx_joining:
4205 case CODE_FOR_nvptx_join:
4206 case CODE_FOR_nvptx_barsync:
4207 return false;
4208 default:
4209 return true;
4210 }
4211 }
4212
4213 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4214
4215 static bool
4216 verify_neutering_jumps (basic_block from,
4217 rtx_insn *vector_jump, rtx_insn *worker_jump,
4218 rtx_insn *vector_label, rtx_insn *worker_label)
4219 {
4220 basic_block bb = from;
4221 rtx_insn *insn = BB_HEAD (bb);
4222 bool seen_worker_jump = false;
4223 bool seen_vector_jump = false;
4224 bool seen_worker_label = false;
4225 bool seen_vector_label = false;
4226 bool worker_neutered = false;
4227 bool vector_neutered = false;
4228 while (true)
4229 {
4230 if (insn == worker_jump)
4231 {
4232 seen_worker_jump = true;
4233 worker_neutered = true;
4234 gcc_assert (!vector_neutered);
4235 }
4236 else if (insn == vector_jump)
4237 {
4238 seen_vector_jump = true;
4239 vector_neutered = true;
4240 }
4241 else if (insn == worker_label)
4242 {
4243 seen_worker_label = true;
4244 gcc_assert (worker_neutered);
4245 worker_neutered = false;
4246 }
4247 else if (insn == vector_label)
4248 {
4249 seen_vector_label = true;
4250 gcc_assert (vector_neutered);
4251 vector_neutered = false;
4252 }
4253 else if (INSN_P (insn))
4254 switch (recog_memoized (insn))
4255 {
4256 case CODE_FOR_nvptx_barsync:
4257 gcc_assert (!vector_neutered && !worker_neutered);
4258 break;
4259 default:
4260 break;
4261 }
4262
4263 if (insn != BB_END (bb))
4264 insn = NEXT_INSN (insn);
4265 else if (JUMP_P (insn) && single_succ_p (bb)
4266 && !seen_vector_jump && !seen_worker_jump)
4267 {
4268 bb = single_succ (bb);
4269 insn = BB_HEAD (bb);
4270 }
4271 else
4272 break;
4273 }
4274
4275 gcc_assert (!(vector_jump && !seen_vector_jump));
4276 gcc_assert (!(worker_jump && !seen_worker_jump));
4277
4278 if (seen_vector_label || seen_worker_label)
4279 {
4280 gcc_assert (!(vector_label && !seen_vector_label));
4281 gcc_assert (!(worker_label && !seen_worker_label));
4282
4283 return true;
4284 }
4285
4286 return false;
4287 }
4288
4289 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4290
4291 static void
4292 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4293 rtx_insn *worker_label)
4294 {
4295 basic_block bb = to;
4296 rtx_insn *insn = BB_END (bb);
4297 bool seen_worker_label = false;
4298 bool seen_vector_label = false;
4299 while (true)
4300 {
4301 if (insn == worker_label)
4302 {
4303 seen_worker_label = true;
4304 gcc_assert (!seen_vector_label);
4305 }
4306 else if (insn == vector_label)
4307 seen_vector_label = true;
4308 else if (INSN_P (insn))
4309 switch (recog_memoized (insn))
4310 {
4311 case CODE_FOR_nvptx_barsync:
4312 gcc_assert (!seen_vector_label && !seen_worker_label);
4313 break;
4314 }
4315
4316 if (insn != BB_HEAD (bb))
4317 insn = PREV_INSN (insn);
4318 else
4319 break;
4320 }
4321
4322 gcc_assert (!(vector_label && !seen_vector_label));
4323 gcc_assert (!(worker_label && !seen_worker_label));
4324 }
4325
4326 /* Single neutering according to MASK. FROM is the incoming block and
4327 TO is the outgoing block. These may be the same block. Insert at
4328 start of FROM:
4329
4330 if (tid.<axis>) goto end.
4331
4332 and insert before ending branch of TO (if there is such an insn):
4333
4334 end:
4335 <possibly-broadcast-cond>
4336 <branch>
4337
4338 We currently only use different FROM and TO when skipping an entire
4339 loop. We could do more if we detected superblocks. */
4340
4341 static void
4342 nvptx_single (unsigned mask, basic_block from, basic_block to)
4343 {
4344 rtx_insn *head = BB_HEAD (from);
4345 rtx_insn *tail = BB_END (to);
4346 unsigned skip_mask = mask;
4347
4348 while (true)
4349 {
4350 /* Find first insn of from block. */
4351 while (head != BB_END (from) && !needs_neutering_p (head))
4352 head = NEXT_INSN (head);
4353
4354 if (from == to)
4355 break;
4356
4357 if (!(JUMP_P (head) && single_succ_p (from)))
4358 break;
4359
4360 basic_block jump_target = single_succ (from);
4361 if (!single_pred_p (jump_target))
4362 break;
4363
4364 from = jump_target;
4365 head = BB_HEAD (from);
4366 }
4367
4368 /* Find the last insn of the TO block. */
4369 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4370 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4371 tail = PREV_INSN (tail);
4372
4373 /* Detect if tail is a branch. */
4374 rtx tail_branch = NULL_RTX;
4375 rtx cond_branch = NULL_RTX;
4376 if (tail && INSN_P (tail))
4377 {
4378 tail_branch = PATTERN (tail);
4379 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4380 tail_branch = NULL_RTX;
4381 else
4382 {
4383 cond_branch = SET_SRC (tail_branch);
4384 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4385 cond_branch = NULL_RTX;
4386 }
4387 }
4388
4389 if (tail == head)
4390 {
4391 /* If this is empty, do nothing. */
4392 if (!head || !needs_neutering_p (head))
4393 return;
4394
4395 if (cond_branch)
4396 {
4397 /* If we're only doing vector single, there's no need to
4398 emit skip code because we'll not insert anything. */
4399 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4400 skip_mask = 0;
4401 }
4402 else if (tail_branch)
4403 /* Block with only unconditional branch. Nothing to do. */
4404 return;
4405 }
4406
4407 /* Insert the vector test inside the worker test. */
4408 unsigned mode;
4409 rtx_insn *before = tail;
4410 rtx_insn *neuter_start = NULL;
4411 rtx_insn *worker_label = NULL, *vector_label = NULL;
4412 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4413 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4414 if (GOMP_DIM_MASK (mode) & skip_mask)
4415 {
4416 rtx_code_label *label = gen_label_rtx ();
4417 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4418 rtx_insn **mode_jump
4419 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4420 rtx_insn **mode_label
4421 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4422
4423 if (!pred)
4424 {
4425 pred = gen_reg_rtx (BImode);
4426 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4427 }
4428
4429 rtx br;
4430 if (mode == GOMP_DIM_VECTOR)
4431 br = gen_br_true (pred, label);
4432 else
4433 br = gen_br_true_uni (pred, label);
4434 if (neuter_start)
4435 neuter_start = emit_insn_after (br, neuter_start);
4436 else
4437 neuter_start = emit_insn_before (br, head);
4438 *mode_jump = neuter_start;
4439
4440 LABEL_NUSES (label)++;
4441 rtx_insn *label_insn;
4442 if (tail_branch)
4443 {
4444 label_insn = emit_label_before (label, before);
4445 before = label_insn;
4446 }
4447 else
4448 {
4449 label_insn = emit_label_after (label, tail);
4450 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4451 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4452 emit_insn_after (gen_exit (), label_insn);
4453 }
4454
4455 *mode_label = label_insn;
4456 }
4457
4458 /* Now deal with propagating the branch condition. */
4459 if (cond_branch)
4460 {
4461 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4462
4463 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4464 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4465 {
4466 /* Vector mode only, do a shuffle. */
4467 #if WORKAROUND_PTXJIT_BUG
4468 /* The branch condition %rcond is propagated like this:
4469
4470 {
4471 .reg .u32 %x;
4472 mov.u32 %x,%tid.x;
4473 setp.ne.u32 %rnotvzero,%x,0;
4474 }
4475
4476 @%rnotvzero bra Lskip;
4477 setp.<op>.<type> %rcond,op1,op2;
4478 Lskip:
4479 selp.u32 %rcondu32,1,0,%rcond;
4480 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4481 setp.ne.u32 %rcond,%rcondu32,0;
4482
4483 There seems to be a bug in the ptx JIT compiler (observed at driver
4484 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4485 unless %rcond is initialized to something before 'bra Lskip'. The
4486 bug is not observed with ptxas from cuda 8.0.61.
4487
4488 It is true that the code is non-trivial: at Lskip, %rcond is
4489 uninitialized in threads 1-31, and after the selp the same holds
4490 for %rcondu32. But shfl propagates the defined value in thread 0
4491 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4492 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4493
4494 There is nothing in the PTX spec to suggest that this is wrong, or
4495 to explain why the extra initialization is needed. So, we classify
4496 it as a JIT bug, and the extra initialization as workaround:
4497
4498 {
4499 .reg .u32 %x;
4500 mov.u32 %x,%tid.x;
4501 setp.ne.u32 %rnotvzero,%x,0;
4502 }
4503
4504 +.reg .pred %rcond2;
4505 +setp.eq.u32 %rcond2, 1, 0;
4506
4507 @%rnotvzero bra Lskip;
4508 setp.<op>.<type> %rcond,op1,op2;
4509 +mov.pred %rcond2, %rcond;
4510 Lskip:
4511 +mov.pred %rcond, %rcond2;
4512 selp.u32 %rcondu32,1,0,%rcond;
4513 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4514 setp.ne.u32 %rcond,%rcondu32,0;
4515 */
4516 rtx_insn *label = PREV_INSN (tail);
4517 gcc_assert (label && LABEL_P (label));
4518 rtx tmp = gen_reg_rtx (BImode);
4519 emit_insn_before (gen_movbi (tmp, const0_rtx),
4520 bb_first_real_insn (from));
4521 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4522 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4523 #endif
4524 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4525 }
4526 else
4527 {
4528 /* Includes worker mode, do spill & fill. By construction
4529 we should never have worker mode only. */
4530 broadcast_data_t data;
4531 unsigned size = GET_MODE_SIZE (SImode);
4532 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4533 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4534 rtx barrier = GEN_INT (0);
4535 int threads = 0;
4536
4537 data.base = oacc_bcast_sym;
4538 data.ptr = 0;
4539
4540 bool use_partitioning_p = (vector && !worker
4541 && nvptx_mach_max_workers () > 1
4542 && cfun->machine->bcast_partition);
4543 if (use_partitioning_p)
4544 {
4545 data.base = cfun->machine->bcast_partition;
4546 barrier = cfun->machine->sync_bar;
4547 threads = nvptx_mach_vector_length ();
4548 }
4549 gcc_assert (data.base != NULL);
4550 gcc_assert (barrier);
4551
4552 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4553 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4554 ? nvptx_mach_max_workers () + 1
4555 : 1);
4556
4557 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4558 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4559
4560 data.offset = 0;
4561 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4562 vector),
4563 before);
4564
4565 /* Barrier so other workers can see the write. */
4566 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4567 data.offset = 0;
4568 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4569 vector),
4570 tail);
4571 /* This barrier is needed to avoid worker zero clobbering
4572 the broadcast buffer before all the other workers have
4573 had a chance to read this instance of it. */
4574 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4575 }
4576
4577 extract_insn (tail);
4578 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4579 UNSPEC_BR_UNIFIED);
4580 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4581 }
4582
4583 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4584 vector_label, worker_label);
4585 if (!seen_label)
4586 verify_neutering_labels (to, vector_label, worker_label);
4587 }
4588
4589 /* PAR is a parallel that is being skipped in its entirety according to
4590 MASK. Treat this as skipping a superblock starting at forked
4591 and ending at joining. */
4592
4593 static void
4594 nvptx_skip_par (unsigned mask, parallel *par)
4595 {
4596 basic_block tail = par->join_block;
4597 gcc_assert (tail->preds->length () == 1);
4598
4599 basic_block pre_tail = (*tail->preds)[0]->src;
4600 gcc_assert (pre_tail->succs->length () == 1);
4601
4602 nvptx_single (mask, par->forked_block, pre_tail);
4603 }
4604
4605 /* If PAR has a single inner parallel and PAR itself only contains
4606 empty entry and exit blocks, swallow the inner PAR. */
4607
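/* For example (a sketch), a worker par whose only blocks are the head
   and tail of a vector inner par

       forked (worker)                 forked (worker|vector)
         forked (vector)        ==>      ... body ...
         ... body ...                  joining (worker|vector)
         joining (vector)
       joining (worker)

   is merged into a single worker|vector par covering the body.  */
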
4608 static void
4609 nvptx_optimize_inner (parallel *par)
4610 {
4611 parallel *inner = par->inner;
4612
4613 /* We mustn't be the outer dummy par. */
4614 if (!par->mask)
4615 return;
4616
4617 /* We must have a single inner par. */
4618 if (!inner || inner->next)
4619 return;
4620
4621 /* We must only contain 2 blocks ourselves -- the head and tail of
4622 the inner par. */
4623 if (par->blocks.length () != 2)
4624 return;
4625
4626 /* The two partitionings must be disjoint. As we only have vector
4627 and worker partitioning, this is sufficient to guarantee the
4628 pars have adjacent partitioning. */
4629 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4630 /* This indicates malformed code generation. */
4631 return;
4632
4633 /* The outer forked insn should be immediately followed by the inner
4634 fork insn. */
4635 rtx_insn *forked = par->forked_insn;
4636 rtx_insn *fork = BB_END (par->forked_block);
4637
4638 if (NEXT_INSN (forked) != fork)
4639 return;
4640 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4641
4642 /* The outer joining insn must immediately follow the inner join
4643 insn. */
4644 rtx_insn *joining = par->joining_insn;
4645 rtx_insn *join = inner->join_insn;
4646 if (NEXT_INSN (join) != joining)
4647 return;
4648
4649 /* Preconditions met. Swallow the inner par. */
4650 if (dump_file)
4651 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4652 inner->mask, inner->forked_block->index,
4653 inner->join_block->index,
4654 par->mask, par->forked_block->index, par->join_block->index);
4655
4656 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4657
4658 par->blocks.reserve (inner->blocks.length ());
4659 while (inner->blocks.length ())
4660 par->blocks.quick_push (inner->blocks.pop ());
4661
4662 par->inner = inner->inner;
4663 inner->inner = NULL;
4664
4665 delete inner;
4666 }
4667
4668 /* Process the parallel PAR and all its contained
4669 parallels. We do everything but the neutering. Return mask of
4670 partitioned modes used within this parallel. */
4671
4672 static unsigned
4673 nvptx_process_pars (parallel *par)
4674 {
4675 if (nvptx_optimize)
4676 nvptx_optimize_inner (par);
4677
4678 unsigned inner_mask = par->mask;
4679
4680 /* Do the inner parallels first. */
4681 if (par->inner)
4682 {
4683 par->inner_mask = nvptx_process_pars (par->inner);
4684 inner_mask |= par->inner_mask;
4685 }
4686
4687 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4688 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4689 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4690 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4691
4692 if (worker || large_vector)
4693 {
4694 nvptx_shared_propagate (false, is_call, par->forked_block,
4695 par->forked_insn, !worker);
4696 bool no_prop_p
4697 = nvptx_shared_propagate (true, is_call, par->forked_block,
4698 par->fork_insn, !worker);
4699 bool empty_loop_p
4700 = !is_call && (NEXT_INSN (par->forked_insn)
4701 && NEXT_INSN (par->forked_insn) == par->joining_insn);
4702 rtx barrier = GEN_INT (0);
4703 int threads = 0;
4704
4705 if (!worker && cfun->machine->sync_bar)
4706 {
4707 barrier = cfun->machine->sync_bar;
4708 threads = nvptx_mach_vector_length ();
4709 }
4710
4711 if (no_prop_p && empty_loop_p)
4712 ;
4713 else if (no_prop_p && is_call)
4714 ;
4715 else
4716 {
4717 /* Insert begin and end synchronizations. */
4718 emit_insn_before (nvptx_cta_sync (barrier, threads),
4719 par->forked_insn);
4720 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4721 }
4722 }
4723 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4724 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4725
4726 /* Now do siblings. */
4727 if (par->next)
4728 inner_mask |= nvptx_process_pars (par->next);
4729 return inner_mask;
4730 }
4731
4732 /* Neuter the parallel described by PAR. We recurse in depth-first
4733 order. MODES are the partitioning of the execution and OUTER is
4734 the partitioning of the parallels we are contained in. */
4735
4736 static void
4737 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4738 {
4739 unsigned me = (par->mask
4740 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4741 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4742 unsigned skip_mask = 0, neuter_mask = 0;
4743
4744 if (par->inner)
4745 nvptx_neuter_pars (par->inner, modes, outer | me);
4746
4747 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4748 {
4749 if ((outer | me) & GOMP_DIM_MASK (mode))
4750 {} /* Mode is partitioned: no neutering. */
4751 else if (!(modes & GOMP_DIM_MASK (mode)))
4752 {} /* Mode is not used: nothing to do. */
4753 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4754 || !par->forked_insn)
4755 /* Partitioned in inner parallels, or we're not partitioned
4756 at all: neuter individual blocks. */
4757 neuter_mask |= GOMP_DIM_MASK (mode);
4758 else if (!par->parent || !par->parent->forked_insn
4759 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4760 /* Parent isn't a parallel, or already contains this partitioning:
4761 skip the parallel at this level. */
4762 skip_mask |= GOMP_DIM_MASK (mode);
4763 else
4764 {} /* Parent will skip this parallel itself. */
4765 }
4766
4767 if (neuter_mask)
4768 {
4769 int ix, len;
4770
4771 if (nvptx_optimize)
4772 {
4773 /* Neuter whole SESE regions. */
4774 bb_pair_vec_t regions;
4775
4776 nvptx_find_sese (par->blocks, regions);
4777 len = regions.length ();
4778 for (ix = 0; ix != len; ix++)
4779 {
4780 basic_block from = regions[ix].first;
4781 basic_block to = regions[ix].second;
4782
4783 if (from)
4784 nvptx_single (neuter_mask, from, to);
4785 else
4786 gcc_assert (!to);
4787 }
4788 }
4789 else
4790 {
4791 /* Neuter each BB individually. */
4792 len = par->blocks.length ();
4793 for (ix = 0; ix != len; ix++)
4794 {
4795 basic_block block = par->blocks[ix];
4796
4797 nvptx_single (neuter_mask, block, block);
4798 }
4799 }
4800 }
4801
4802 if (skip_mask)
4803 nvptx_skip_par (skip_mask, par);
4804
4805 if (par->next)
4806 nvptx_neuter_pars (par->next, modes, outer);
4807 }
4808
4809 static void
4810 populate_offload_attrs (offload_attrs *oa)
4811 {
4812 tree attr = oacc_get_fn_attrib (current_function_decl);
4813 tree dims = TREE_VALUE (attr);
4814 unsigned ix;
4815
4816 oa->mask = 0;
4817
4818 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4819 {
4820 tree t = TREE_VALUE (dims);
4821 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4822 tree allowed = TREE_PURPOSE (dims);
4823
4824 if (size != 1 && !(allowed && integer_zerop (allowed)))
4825 oa->mask |= GOMP_DIM_MASK (ix);
4826
4827 switch (ix)
4828 {
4829 case GOMP_DIM_GANG:
4830 oa->num_gangs = size;
4831 break;
4832
4833 case GOMP_DIM_WORKER:
4834 oa->num_workers = size;
4835 break;
4836
4837 case GOMP_DIM_VECTOR:
4838 oa->vector_length = size;
4839 break;
4840 }
4841 }
4842 }
4843
4844 #if WORKAROUND_PTXJIT_BUG_2
4845 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4846 is needed in the nvptx target because the branches generated for
4847 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4848
4849 static rtx
4850 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4851 {
4852 rtx pat;
4853 if ((strict && !JUMP_P (insn))
4854 || (!strict && !INSN_P (insn)))
4855 return NULL_RTX;
4856 pat = PATTERN (insn);
4857
4858 /* The set is allowed to appear either as the insn pattern or
4859 the first set in a PARALLEL. */
4860 if (GET_CODE (pat) == PARALLEL)
4861 pat = XVECEXP (pat, 0, 0);
4862 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4863 return pat;
4864
4865 return NULL_RTX;
4866 }
4867
4868 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4869
4870 static rtx
4871 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4872 {
4873 rtx x = nvptx_pc_set (insn, strict);
4874
4875 if (!x)
4876 return NULL_RTX;
4877 x = SET_SRC (x);
4878 if (GET_CODE (x) == LABEL_REF)
4879 return x;
4880 if (GET_CODE (x) != IF_THEN_ELSE)
4881 return NULL_RTX;
4882 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4883 return XEXP (x, 1);
4884 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4885 return XEXP (x, 2);
4886 return NULL_RTX;
4887 }
4888
4889 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4890 insn in between the branch and the label. This works around a JIT bug
4891 observed at driver version 384.111, at -O0 for sm_50. */
4892
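/* A sketch of the transformation at the PTX level (label and predicate
   names are illustrative):

       @%p bra $L;            @%p bra $L;
     $L:              ==>     <fake nop>
       ...                  $L:
                              ...  */
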
4893 static void
4894 prevent_branch_around_nothing (void)
4895 {
4896 rtx_insn *seen_label = NULL;
4897 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4898 {
4899 if (INSN_P (insn) && condjump_p (insn))
4900 {
4901 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4902 continue;
4903 }
4904
4905 if (seen_label == NULL)
4906 continue;
4907
4908 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4909 continue;
4910
4911 if (INSN_P (insn))
4912 switch (recog_memoized (insn))
4913 {
4914 case CODE_FOR_nvptx_fork:
4915 case CODE_FOR_nvptx_forked:
4916 case CODE_FOR_nvptx_joining:
4917 case CODE_FOR_nvptx_join:
4918 continue;
4919 default:
4920 seen_label = NULL;
4921 continue;
4922 }
4923
4924 if (LABEL_P (insn) && insn == seen_label)
4925 emit_insn_before (gen_fake_nop (), insn);
4926
4927 seen_label = NULL;
4928 }
4929 }
4930 #endif
4931
4932 #if WORKAROUND_PTXJIT_BUG_3
4933 /* Insert two membar.cta insns in between two subsequent bar.sync insns. This
4934 works around a hang observed at driver version 390.48 for sm_50. */
4935
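/* A sketch of the transformation at the PTX level:

       bar.sync 0;            bar.sync 0;
       bar.sync 0;    ==>     membar.cta;
                              membar.cta;
                              bar.sync 0;  */
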
4936 static void
4937 workaround_barsyncs (void)
4938 {
4939 bool seen_barsync = false;
4940 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4941 {
4942 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4943 {
4944 if (seen_barsync)
4945 {
4946 emit_insn_before (gen_nvptx_membar_cta (), insn);
4947 emit_insn_before (gen_nvptx_membar_cta (), insn);
4948 }
4949
4950 seen_barsync = true;
4951 continue;
4952 }
4953
4954 if (!seen_barsync)
4955 continue;
4956
4957 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4958 continue;
4959 else if (INSN_P (insn))
4960 switch (recog_memoized (insn))
4961 {
4962 case CODE_FOR_nvptx_fork:
4963 case CODE_FOR_nvptx_forked:
4964 case CODE_FOR_nvptx_joining:
4965 case CODE_FOR_nvptx_join:
4966 continue;
4967 default:
4968 break;
4969 }
4970
4971 seen_barsync = false;
4972 }
4973 }
4974 #endif
4975
4976 /* PTX-specific reorganization
4977 - Split blocks at fork and join instructions
4978 - Compute live registers
4979 - Mark now-unused registers, so function begin doesn't declare
4980 unused registers.
4981 - Insert state propagation when entering partitioned mode
4982 - Insert neutering instructions when in single mode
4983 - Replace subregs with suitable sequences.
4984 */
4985
4986 static void
4987 nvptx_reorg (void)
4988 {
4989 /* We are freeing block_for_insn in the toplev to keep compatibility
4990 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4991 compute_bb_for_insn ();
4992
4993 thread_prologue_and_epilogue_insns ();
4994
4995 /* Split blocks and record interesting unspecs. */
4996 bb_insn_map_t bb_insn_map;
4997
4998 nvptx_split_blocks (&bb_insn_map);
4999
5000 /* Compute live regs */
5001 df_clear_flags (DF_LR_RUN_DCE);
5002 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5003 df_live_add_problem ();
5004 df_live_set_all_dirty ();
5005 df_analyze ();
5006 regstat_init_n_sets_and_refs ();
5007
5008 if (dump_file)
5009 df_dump (dump_file);
5010
5011 /* Mark unused regs as unused. */
5012 int max_regs = max_reg_num ();
5013 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5014 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5015 regno_reg_rtx[i] = const0_rtx;
5016
5017 /* Determine launch dimensions of the function. If it is not an
5018 offloaded function (i.e. this is a regular compilation), the
5019 function has no neutering. */
5020 tree attr = oacc_get_fn_attrib (current_function_decl);
5021 if (attr)
5022 {
5023 /* If we determined this mask before RTL expansion, we could
5024 elide emission of some levels of forks and joins. */
5025 offload_attrs oa;
5026
5027 populate_offload_attrs (&oa);
5028
5029 /* If there is worker neutering, there must be vector
5030 neutering. Otherwise the hardware will fail. */
5031 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5032 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5033
5034 /* Discover & process partitioned regions. */
5035 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5036 nvptx_process_pars (pars);
5037 nvptx_neuter_pars (pars, oa.mask, 0);
5038 delete pars;
5039 }
5040
5041 /* Replace subregs. */
5042 nvptx_reorg_subreg ();
5043
5044 if (TARGET_UNIFORM_SIMT)
5045 nvptx_reorg_uniform_simt ();
5046
5047 #if WORKAROUND_PTXJIT_BUG_2
5048 prevent_branch_around_nothing ();
5049 #endif
5050
5051 #if WORKAROUND_PTXJIT_BUG_3
5052 workaround_barsyncs ();
5053 #endif
5054
5055 regstat_free_n_sets_and_refs ();
5056
5057 df_finish_pass (true);
5058 }
5059 \f
5060 /* Handle a "kernel" attribute; arguments as in
5061 struct attribute_spec.handler. */
5062
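/* For reference, a minimal use of the attribute (a sketch; anything
   other than a function with a void return type is rejected below):

       __attribute__((kernel)) void entry (void *args) { ... }  */
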
5063 static tree
5064 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5065 int ARG_UNUSED (flags), bool *no_add_attrs)
5066 {
5067 tree decl = *node;
5068
5069 if (TREE_CODE (decl) != FUNCTION_DECL)
5070 {
5071 error ("%qE attribute only applies to functions", name);
5072 *no_add_attrs = true;
5073 }
5074 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5075 {
5076 error ("%qE attribute requires a void return type", name);
5077 *no_add_attrs = true;
5078 }
5079
5080 return NULL_TREE;
5081 }
5082
5083 /* Handle a "shared" attribute; arguments as in
5084 struct attribute_spec.handler. */
5085
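/* For reference, a minimal use of the attribute (a sketch; the variable
   must have static or external storage, as checked below):

       static int scratch[32] __attribute__((shared));  */
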
5086 static tree
5087 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5088 int ARG_UNUSED (flags), bool *no_add_attrs)
5089 {
5090 tree decl = *node;
5091
5092 if (TREE_CODE (decl) != VAR_DECL)
5093 {
5094 error ("%qE attribute only applies to variables", name);
5095 *no_add_attrs = true;
5096 }
5097 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5098 {
5099 error ("%qE attribute not allowed with auto storage class", name);
5100 *no_add_attrs = true;
5101 }
5102
5103 return NULL_TREE;
5104 }
5105
5106 /* Table of valid machine attributes. */
5107 static const struct attribute_spec nvptx_attribute_table[] =
5108 {
5109 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5110 affects_type_identity, handler, exclude } */
5111 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5112 NULL },
5113 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5114 NULL },
5115 { NULL, 0, 0, false, false, false, false, NULL, NULL }
5116 };
5117 \f
5118 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5119
5120 static HOST_WIDE_INT
5121 nvptx_vector_alignment (const_tree type)
5122 {
5123 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5124
5125 return MIN (align, BIGGEST_ALIGNMENT);
5126 }
5127
5128 /* Indicate that INSN cannot be duplicated. */
5129
5130 static bool
5131 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5132 {
5133 switch (recog_memoized (insn))
5134 {
5135 case CODE_FOR_nvptx_shufflesi:
5136 case CODE_FOR_nvptx_shufflesf:
5137 case CODE_FOR_nvptx_barsync:
5138 case CODE_FOR_nvptx_fork:
5139 case CODE_FOR_nvptx_forked:
5140 case CODE_FOR_nvptx_joining:
5141 case CODE_FOR_nvptx_join:
5142 return true;
5143 default:
5144 return false;
5145 }
5146 }
5147
5148 /* Section anchors do not work. Initialization for flag_section_anchor
5149 probes the existence of the anchoring target hooks and prevents
5150 anchoring if they don't exist. However, we may be used with
5151 a host-side compiler that does support anchoring, and hence see
5152 the anchor flag set (as it's not recalculated). So provide an
5153 implementation denying anchoring. */
5154
5155 static bool
5156 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5157 {
5158 return false;
5159 }
5160 \f
5161 /* Record a symbol for mkoffload to enter into the mapping table. */
5162
5163 static void
5164 nvptx_record_offload_symbol (tree decl)
5165 {
5166 switch (TREE_CODE (decl))
5167 {
5168 case VAR_DECL:
5169 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5170 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5171 break;
5172
5173 case FUNCTION_DECL:
5174 {
5175 tree attr = oacc_get_fn_attrib (decl);
5176 /* OpenMP offloading does not set this attribute. */
5177 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5178
5179 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5180 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5181
5182 for (; dims; dims = TREE_CHAIN (dims))
5183 {
5184 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5185
5186 gcc_assert (!TREE_PURPOSE (dims));
5187 fprintf (asm_out_file, ", %#x", size);
5188 }
5189
5190 fprintf (asm_out_file, "\n");
5191 }
5192 break;
5193
5194 default:
5195 gcc_unreachable ();
5196 }
5197 }
5198
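/* The records emitted above look like this (a sketch; the FUNC_MAP
   dimension values depend on the function's launch geometry, and are
   absent for OpenMP offloading):

       //:VAR_MAP "some_var"
       //:FUNC_MAP "some_func", 0x1, 0x20, 0x20  */
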
5199 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5200 at the start of a file. */
5201
5202 static void
5203 nvptx_file_start (void)
5204 {
5205 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5206 fputs ("\t.version\t3.1\n", asm_out_file);
5207 if (TARGET_SM35)
5208 fputs ("\t.target\tsm_35\n", asm_out_file);
5209 else
5210 fputs ("\t.target\tsm_30\n", asm_out_file);
5211 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5212 fputs ("// END PREAMBLE\n", asm_out_file);
5213 }
5214
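/* For a 64-bit compilation targeting sm_30, the preamble comes out as:

       // BEGIN PREAMBLE
               .version        3.1
               .target sm_30
               .address_size 64
       // END PREAMBLE  */
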
5215 /* Emit a declaration for a worker and vector-level buffer in .shared
5216 memory. */
5217
5218 static void
5219 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5220 {
5221 const char *name = XSTR (sym, 0);
5222
5223 write_var_marker (file, true, false, name);
5224 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5225 align, name, size);
5226 }
5227
5228 /* Write out the function declarations we've collected and declare storage
5229 for the broadcast buffer. */
5230
5231 static void
5232 nvptx_file_end (void)
5233 {
5234 hash_table<tree_hasher>::iterator iter;
5235 tree decl;
5236 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5237 nvptx_record_fndecl (decl);
5238 fputs (func_decls.str().c_str(), asm_out_file);
5239
5240 if (oacc_bcast_size)
5241 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5242 oacc_bcast_align, oacc_bcast_size);
5243
5244 if (worker_red_size)
5245 write_shared_buffer (asm_out_file, worker_red_sym,
5246 worker_red_align, worker_red_size);
5247
5248 if (vector_red_size)
5249 write_shared_buffer (asm_out_file, vector_red_sym,
5250 vector_red_align, vector_red_size);
5251
5252 if (need_softstack_decl)
5253 {
5254 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5255 /* 32 is the maximum number of warps in a block. Even though it's an
5256 external declaration, emit the array size explicitly; otherwise, it
5257 may fail at PTX JIT time if the definition is later in link order. */
5258 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5259 POINTER_SIZE);
5260 }
5261 if (need_unisimt_decl)
5262 {
5263 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5264 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5265 }
5266 }
5267
5268 /* Expander for the shuffle builtins. */
5269
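/* A typical use from offloaded code (a sketch; SHUFFLE_DOWN stands for
   one of the nvptx_shuffle_kind values):

       unsigned v = __builtin_nvptx_shuffle (x, 1, SHUFFLE_DOWN);  */
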
5270 static rtx
5271 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5272 {
5273 if (ignore)
5274 return target;
5275
5276 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5277 NULL_RTX, mode, EXPAND_NORMAL);
5278 if (!REG_P (src))
5279 src = copy_to_mode_reg (mode, src);
5280
5281 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5282 NULL_RTX, SImode, EXPAND_NORMAL);
5283 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5284 NULL_RTX, SImode, EXPAND_NORMAL);
5285
5286 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5287 idx = copy_to_mode_reg (SImode, idx);
5288
5289 rtx pat = nvptx_gen_shuffle (target, src, idx,
5290 (nvptx_shuffle_kind) INTVAL (op));
5291 if (pat)
5292 emit_insn (pat);
5293
5294 return target;
5295 }
5296
5297 const char *
5298 nvptx_output_red_partition (rtx dst, rtx offset)
5299 {
5300 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5301 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5302
5303 if (offset == const0_rtx)
5304 fprintf (asm_out_file, zero_offset, REGNO (dst),
5305 REGNO (cfun->machine->red_partition));
5306 else
5307 fprintf (asm_out_file, with_offset, REGNO (dst),
5308 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5309
5310 return "";
5311 }
5312
5313 /* Shared-memory reduction address expander. */
5314
5315 static rtx
5316 nvptx_expand_shared_addr (tree exp, rtx target,
5317 machine_mode ARG_UNUSED (mode), int ignore,
5318 int vector)
5319 {
5320 if (ignore)
5321 return target;
5322
5323 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5324 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5325 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5326 rtx addr = worker_red_sym;
5327
5328 if (vector)
5329 {
5330 offload_attrs oa;
5331
5332 populate_offload_attrs (&oa);
5333
5334 unsigned int psize = ROUND_UP (size + offset, align);
5335 unsigned int pnum = nvptx_mach_max_workers ();
5336 vector_red_partition = MAX (vector_red_partition, psize);
5337 vector_red_size = MAX (vector_red_size, psize * pnum);
5338 vector_red_align = MAX (vector_red_align, align);
5339
5340 if (cfun->machine->red_partition == NULL)
5341 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5342
5343 addr = gen_reg_rtx (Pmode);
5344 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5345 }
5346 else
5347 {
5348 worker_red_align = MAX (worker_red_align, align);
5349 worker_red_size = MAX (worker_red_size, size + offset);
5350
5351 if (offset)
5352 {
5353 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5354 addr = gen_rtx_CONST (Pmode, addr);
5355 }
5356 }
5357
5358 emit_move_insn (target, addr);
5359 return target;
5360 }
5361
5362 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5363 not require taking the address of any object, other than the memory
5364 cell being operated on. */
5365
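/* A typical use (a sketch): atomically replace *PTR with NEWVAL if it
   still equals OLDVAL, and return the previous contents:

       unsigned old = __builtin_nvptx_cmp_swap (ptr, oldval, newval);  */
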
5366 static rtx
5367 nvptx_expand_cmp_swap (tree exp, rtx target,
5368 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5369 {
5370 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5371
5372 if (!target)
5373 target = gen_reg_rtx (mode);
5374
5375 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5376 NULL_RTX, Pmode, EXPAND_NORMAL);
5377 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5378 NULL_RTX, mode, EXPAND_NORMAL);
5379 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5380 NULL_RTX, mode, EXPAND_NORMAL);
5381 rtx pat;
5382
5383 mem = gen_rtx_MEM (mode, mem);
5384 if (!REG_P (cmp))
5385 cmp = copy_to_mode_reg (mode, cmp);
5386 if (!REG_P (src))
5387 src = copy_to_mode_reg (mode, src);
5388
5389 if (mode == SImode)
5390 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5391 else
5392 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5393
5394 emit_insn (pat);
5395
5396 return target;
5397 }
5398
5399
5400 /* Codes for all the NVPTX builtins. */
5401 enum nvptx_builtins
5402 {
5403 NVPTX_BUILTIN_SHUFFLE,
5404 NVPTX_BUILTIN_SHUFFLELL,
5405 NVPTX_BUILTIN_WORKER_ADDR,
5406 NVPTX_BUILTIN_VECTOR_ADDR,
5407 NVPTX_BUILTIN_CMP_SWAP,
5408 NVPTX_BUILTIN_CMP_SWAPLL,
5409 NVPTX_BUILTIN_MAX
5410 };
5411
5412 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5413
5414 /* Return the NVPTX builtin for CODE. */
5415
5416 static tree
5417 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5418 {
5419 if (code >= NVPTX_BUILTIN_MAX)
5420 return error_mark_node;
5421
5422 return nvptx_builtin_decls[code];
5423 }
5424
5425 /* Set up all builtin functions for this target. */
5426
5427 static void
5428 nvptx_init_builtins (void)
5429 {
5430 #define DEF(ID, NAME, T) \
5431 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5432 = add_builtin_function ("__builtin_nvptx_" NAME, \
5433 build_function_type_list T, \
5434 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5435 #define ST sizetype
5436 #define UINT unsigned_type_node
5437 #define LLUINT long_long_unsigned_type_node
5438 #define PTRVOID ptr_type_node
5439
5440 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5441 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5442 DEF (WORKER_ADDR, "worker_addr",
5443 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5444 DEF (VECTOR_ADDR, "vector_addr",
5445 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5446 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5447 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5448
5449 #undef DEF
5450 #undef ST
5451 #undef UINT
5452 #undef LLUINT
5453 #undef PTRVOID
5454 }
5455
5456 /* Expand an expression EXP that calls a built-in function,
5457 with result going to TARGET if that's convenient
5458 (and in mode MODE if that's convenient).
5459 SUBTARGET may be used as the target for computing one of EXP's operands.
5460 IGNORE is nonzero if the value is to be ignored. */
5461
5462 static rtx
5463 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5464 machine_mode mode, int ignore)
5465 {
5466 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5467 switch (DECL_FUNCTION_CODE (fndecl))
5468 {
5469 case NVPTX_BUILTIN_SHUFFLE:
5470 case NVPTX_BUILTIN_SHUFFLELL:
5471 return nvptx_expand_shuffle (exp, target, mode, ignore);
5472
5473 case NVPTX_BUILTIN_WORKER_ADDR:
5474 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5475
5476 case NVPTX_BUILTIN_VECTOR_ADDR:
5477 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5478
5479 case NVPTX_BUILTIN_CMP_SWAP:
5480 case NVPTX_BUILTIN_CMP_SWAPLL:
5481 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5482
5483 default: gcc_unreachable ();
5484 }
5485 }
5486
5487 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5488
5489 static int
5490 nvptx_simt_vf ()
5491 {
5492 return PTX_WARP_SIZE;
5493 }
5494
5495 static bool
5496 nvptx_welformed_vector_length_p (int l)
5497 {
5498 gcc_assert (l > 0);
5499 return l % PTX_WARP_SIZE == 0;
5500 }
5501
5502 static void
5503 nvptx_apply_dim_limits (int dims[])
5504 {
5505 /* Check that the vector_length is not too large. */
5506 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5507 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5508
5509 /* Check that the number of workers is not too large. */
5510 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5511 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5512
5513 /* Ensure that num_worker * vector_length <= cta size. */
5514 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5515 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5516 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5517
5518 /* If we need a per-worker barrier ... */
5519 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5520 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5521 /* Don't use more barriers than available. */
5522 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5523 PTX_NUM_PER_WORKER_BARRIERS);
5524 }
5525
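/* A worked example (illustrative): with dims = { G, 64, 128 } the
   worker count is first clamped to PTX_WORKER_LENGTH (32); then, since
   32 * 128 > PTX_CTA_SIZE (1024), the vector length falls back to
   PTX_WARP_SIZE (32).  With dims = { G, 4, 128 } both products fit
   (4 * 128 <= 1024), but a vector length above PTX_WARP_SIZE requires
   per-worker barriers, so the worker count is capped at
   PTX_NUM_PER_WORKER_BARRIERS (15) -- here 4 already fits.  */
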
5526 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5527
5528 static bool
5529 has_vector_partitionable_routine_calls_p (tree fndecl)
5530 {
5531 if (!fndecl)
5532 return false;
5533
5534 basic_block bb;
5535 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5536 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5537 gsi_next_nondebug (&i))
5538 {
5539 gimple *stmt = gsi_stmt (i);
5540 if (gimple_code (stmt) != GIMPLE_CALL)
5541 continue;
5542
5543 tree callee = gimple_call_fndecl (stmt);
5544 if (!callee)
5545 continue;
5546
5547 tree attrs = oacc_get_fn_attrib (callee);
5548 if (attrs == NULL_TREE)
5549 return false;
5550
5551 int partition_level = oacc_fn_attrib_level (attrs);
5552 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5553 if (!seq_routine_p)
5554 return true;
5555 }
5556
5557 return false;
5558 }
5559
5560 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5561 DIMS has changed. */
5562
5563 static void
5564 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5565 {
5566 bool oacc_default_dims_p = false;
5567 bool oacc_min_dims_p = false;
5568 bool offload_region_p = false;
5569 bool routine_p = false;
5570 bool routine_seq_p = false;
5571 int default_vector_length = -1;
5572
5573 if (decl == NULL_TREE)
5574 {
5575 if (fn_level == -1)
5576 oacc_default_dims_p = true;
5577 else if (fn_level == -2)
5578 oacc_min_dims_p = true;
5579 else
5580 gcc_unreachable ();
5581 }
5582 else if (fn_level == -1)
5583 offload_region_p = true;
5584 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5585 {
5586 routine_p = true;
5587 routine_seq_p = fn_level == GOMP_DIM_MAX;
5588 }
5589 else
5590 gcc_unreachable ();
5591
5592 if (oacc_min_dims_p)
5593 {
5594 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5595 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5596 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5597
5598 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5599 return;
5600 }
5601
5602 if (routine_p)
5603 {
5604 if (!routine_seq_p)
5605 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5606
5607 return;
5608 }
5609
5610 if (oacc_default_dims_p)
5611 {
5612 /* -1 : not set
5613 0 : set at runtime, f.i. -fopenacc-dims=-
5614 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5615 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5616 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5617 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5618
5619 /* But -fopenacc-dims=- is not yet supported on trunk. */
5620 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5621 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5622 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5623 }
5624
5625 if (offload_region_p)
5626 {
5627 /* -1 : not set
5628 0 : set using variable, f.i. num_gangs (n)
5629 >= 1: set using constant, f.i. num_gangs (1). */
5630 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5631 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5632 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5633 }
5634
5635 if (offload_region_p)
5636 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5637 else
5638 /* oacc_default_dims_p. */
5639 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5640
5641 int old_dims[GOMP_DIM_MAX];
5642 unsigned int i;
5643 for (i = 0; i < GOMP_DIM_MAX; ++i)
5644 old_dims[i] = dims[i];
5645
5646 const char *vector_reason = NULL;
5647 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5648 {
5649 default_vector_length = PTX_WARP_SIZE;
5650
5651 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5652 {
5653 vector_reason = G_("using vector_length (%d) due to call to"
5654 " vector-partitionable routine, ignoring %d");
5655 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5656 }
5657 }
5658
5659 if (dims[GOMP_DIM_VECTOR] == 0)
5660 {
5661 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5662 dims[GOMP_DIM_VECTOR] = default_vector_length;
5663 }
5664
5665 if (dims[GOMP_DIM_VECTOR] > 0
5666 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5667 dims[GOMP_DIM_VECTOR] = default_vector_length;
5668
5669 nvptx_apply_dim_limits (dims);
5670
5671 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5672 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5673 vector_reason != NULL
5674 ? vector_reason
5675 : G_("using vector_length (%d), ignoring %d"),
5676 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5677
5678 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5679 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5680 G_("using num_workers (%d), ignoring %d"),
5681 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5682
5683 if (oacc_default_dims_p)
5684 {
5685 if (dims[GOMP_DIM_VECTOR] < 0)
5686 dims[GOMP_DIM_VECTOR] = default_vector_length;
5687 if (dims[GOMP_DIM_WORKER] < 0)
5688 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5689 if (dims[GOMP_DIM_GANG] < 0)
5690 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5691 nvptx_apply_dim_limits (dims);
5692 }
5693
5694 if (offload_region_p)
5695 {
5696 for (i = 0; i < GOMP_DIM_MAX; i++)
5697 {
5698 if (!(dims[i] < 0))
5699 continue;
5700
5701 if ((used & GOMP_DIM_MASK (i)) == 0)
5702 /* Function oacc_validate_dims will apply the minimal dimension. */
5703 continue;
5704
5705 dims[i] = (i == GOMP_DIM_VECTOR
5706 ? default_vector_length
5707 : oacc_get_default_dim (i));
5708 }
5709
5710 nvptx_apply_dim_limits (dims);
5711 }
5712 }
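
/* For illustration: an offload region using 'vector_length (n)' with a
   run-time N arrives here with dims[GOMP_DIM_VECTOR] == 0; the code
   above replaces it with the default vector length and emits the
   "using vector_length (%d), ignoring runtime setting" warning.  */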
5713
5714 /* Validate compute dimensions of an OpenACC offload or routine and
5715    fill in non-unity defaults.  FN_LEVEL is the level at which a routine
5716    might spawn a loop; it is negative for non-routines.  If DECL is null,
5717    we are validating the default dimensions.  Return true iff DIMS changed.  */
5718
5719 static bool
5720 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5721 {
5722 int old_dims[GOMP_DIM_MAX];
5723 unsigned int i;
5724
5725 for (i = 0; i < GOMP_DIM_MAX; ++i)
5726 old_dims[i] = dims[i];
5727
5728 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5729
5730 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5731 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5732 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5733
5734 for (i = 0; i < GOMP_DIM_MAX; ++i)
5735 if (old_dims[i] != dims[i])
5736 return true;
5737
5738 return false;
5739 }
5740
5741 /* Return maximum dimension size, or zero for unbounded. */
5742
5743 static int
5744 nvptx_dim_limit (int axis)
5745 {
5746 switch (axis)
5747 {
5748 case GOMP_DIM_VECTOR:
5749 return PTX_MAX_VECTOR_LENGTH;
5750
5751 default:
5752 break;
5753 }
5754 return 0;
5755 }
5756
5757 /* Determine whether fork & joins are needed. */
5758
5759 static bool
5760 nvptx_goacc_fork_join (gcall *call, const int dims[],
5761 bool ARG_UNUSED (is_fork))
5762 {
5763 tree arg = gimple_call_arg (call, 2);
5764 unsigned axis = TREE_INT_CST_LOW (arg);
5765
5766 /* We only care about worker and vector partitioning. */
5767 if (axis < GOMP_DIM_WORKER)
5768 return false;
5769
5770 /* If the size is 1, there's no partitioning. */
5771 if (dims[axis] == 1)
5772 return false;
5773
5774 return true;
5775 }
5776
5777 /* Generate a PTX builtin function call that returns the address in
5778    the worker (or, if VECTOR is true, vector) reduction buffer at
5779    OFFSET.  TYPE is the type of the data at that location.  */
5780
5781 static tree
5782 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5783 {
5784 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5785 if (vector)
5786 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5787 machine_mode mode = TYPE_MODE (type);
5788 tree fndecl = nvptx_builtin_decl (addr_dim, true);
5789 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5790 tree align = build_int_cst (unsigned_type_node,
5791 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5792 tree call = build_call_expr (fndecl, 3, offset, size, align);
5793
5794 return fold_convert (build_pointer_type (type), call);
5795 }
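
/* For illustration, for a 'double' at a (hypothetical) buffer offset of
   8 the call built above is roughly

     __builtin_nvptx_worker_addr (8, 8, 8)

   i.e. offset, size and alignment in bytes, with the result folded to
   'double *'.  (The user-level builtin name is assumed here from the
   WORKER_ADDR builtin definition.)  */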
5796
5797 /* Emit a SHFL.DOWN of VAR into DEST_VAR using shift amount SHIFT.  This
5798    function will cast the variable if necessary.  */
5799
5800 static void
5801 nvptx_generate_vector_shuffle (location_t loc,
5802 tree dest_var, tree var, unsigned shift,
5803 gimple_seq *seq)
5804 {
5805 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5806 tree_code code = NOP_EXPR;
5807 tree arg_type = unsigned_type_node;
5808 tree var_type = TREE_TYPE (var);
5809 tree dest_type = var_type;
5810
5811 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5812 var_type = TREE_TYPE (var_type);
5813
5814 if (TREE_CODE (var_type) == REAL_TYPE)
5815 code = VIEW_CONVERT_EXPR;
5816
5817 if (TYPE_SIZE (var_type)
5818 == TYPE_SIZE (long_long_unsigned_type_node))
5819 {
5820 fn = NVPTX_BUILTIN_SHUFFLELL;
5821 arg_type = long_long_unsigned_type_node;
5822 }
5823
5824 tree call = nvptx_builtin_decl (fn, true);
5825 tree bits = build_int_cst (unsigned_type_node, shift);
5826 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5827 tree expr;
5828
5829 if (var_type != dest_type)
5830 {
5831 /* Do real and imaginary parts separately. */
5832 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5833 real = fold_build1 (code, arg_type, real);
5834 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5835 real = fold_build1 (code, var_type, real);
5836
5837 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5838 imag = fold_build1 (code, arg_type, imag);
5839 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5840 imag = fold_build1 (code, var_type, imag);
5841
5842 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5843 }
5844 else
5845 {
5846 expr = fold_build1 (code, arg_type, var);
5847 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5848 expr = fold_build1 (code, dest_type, expr);
5849 }
5850
5851 gimplify_assign (dest_var, expr, seq);
5852 }
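
/* For illustration: for a 'float' VAR with SHIFT == 16 the sequence
   gimplifies to roughly

     u_1 = VIEW_CONVERT_EXPR<unsigned int>(var);
     u_2 = __builtin_nvptx_shuffle (u_1, 16, SHUFFLE_DOWN);
     dest_var = VIEW_CONVERT_EXPR<float>(u_2);

   the detour through 'unsigned int' is needed because the shuffle
   builtins only operate on integer values.  (The builtin name is
   assumed here from the SHUFFLE builtin definition.)  */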
5853
5854 /* Lazily generate the global lock var decl and return its address. */
5855
5856 static tree
5857 nvptx_global_lock_addr ()
5858 {
5859 tree v = global_lock_var;
5860
5861 if (!v)
5862 {
5863 tree name = get_identifier ("__reduction_lock");
5864 tree type = build_qualified_type (unsigned_type_node,
5865 TYPE_QUAL_VOLATILE);
5866 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5867 global_lock_var = v;
5868 DECL_ARTIFICIAL (v) = 1;
5869 DECL_EXTERNAL (v) = 1;
5870 TREE_STATIC (v) = 1;
5871 TREE_PUBLIC (v) = 1;
5872 TREE_USED (v) = 1;
5873 mark_addressable (v);
5874 mark_decl_referenced (v);
5875 }
5876
5877 return build_fold_addr_expr (v);
5878 }
5879
5880 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5881    GSI.  We use a lockless scheme for nearly all cases, which looks
5882 like:
5883 actual = initval(OP);
5884 do {
5885 guess = actual;
5886 write = guess OP myval;
5887 actual = cmp&swap (ptr, guess, write)
5888 } while (actual bit-different-to guess);
5889 return write;
5890
5891 This relies on a cmp&swap instruction, which is available for 32-
5892 and 64-bit types. Larger types must use a locking scheme. */
5893
5894 static tree
5895 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5896 tree ptr, tree var, tree_code op)
5897 {
5898 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5899 tree_code code = NOP_EXPR;
5900 tree arg_type = unsigned_type_node;
5901 tree var_type = TREE_TYPE (var);
5902
5903 if (TREE_CODE (var_type) == COMPLEX_TYPE
5904 || TREE_CODE (var_type) == REAL_TYPE)
5905 code = VIEW_CONVERT_EXPR;
5906
5907 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5908 {
5909 arg_type = long_long_unsigned_type_node;
5910 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5911 }
5912
5913 tree swap_fn = nvptx_builtin_decl (fn, true);
5914
5915 gimple_seq init_seq = NULL;
5916 tree init_var = make_ssa_name (arg_type);
5917 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5918 init_expr = fold_build1 (code, arg_type, init_expr);
5919 gimplify_assign (init_var, init_expr, &init_seq);
5920 gimple *init_end = gimple_seq_last (init_seq);
5921
5922 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5923
5924 /* Split the block just after the init stmts. */
5925 basic_block pre_bb = gsi_bb (*gsi);
5926 edge pre_edge = split_block (pre_bb, init_end);
5927 basic_block loop_bb = pre_edge->dest;
5928 pre_bb = pre_edge->src;
5929 /* Reset the iterator. */
5930 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5931
5932 tree expect_var = make_ssa_name (arg_type);
5933 tree actual_var = make_ssa_name (arg_type);
5934 tree write_var = make_ssa_name (arg_type);
5935
5936 /* Build and insert the reduction calculation. */
5937 gimple_seq red_seq = NULL;
5938 tree write_expr = fold_build1 (code, var_type, expect_var);
5939 write_expr = fold_build2 (op, var_type, write_expr, var);
5940 write_expr = fold_build1 (code, arg_type, write_expr);
5941 gimplify_assign (write_var, write_expr, &red_seq);
5942
5943 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5944
5945 /* Build & insert the cmp&swap sequence. */
5946 gimple_seq latch_seq = NULL;
5947 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5948 ptr, expect_var, write_var);
5949 gimplify_assign (actual_var, swap_expr, &latch_seq);
5950
5951 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5952 NULL_TREE, NULL_TREE);
5953 gimple_seq_add_stmt (&latch_seq, cond);
5954
5955 gimple *latch_end = gimple_seq_last (latch_seq);
5956 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5957
5958 /* Split the block just after the latch stmts. */
5959 edge post_edge = split_block (loop_bb, latch_end);
5960 basic_block post_bb = post_edge->dest;
5961 loop_bb = post_edge->src;
5962 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5963
5964 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5965 post_edge->probability = profile_probability::even ();
5966 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5967 loop_edge->probability = profile_probability::even ();
5968 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5969 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5970
5971 gphi *phi = create_phi_node (expect_var, loop_bb);
5972 add_phi_arg (phi, init_var, pre_edge, loc);
5973 add_phi_arg (phi, actual_var, loop_edge, loc);
5974
5975 loop *loop = alloc_loop ();
5976 loop->header = loop_bb;
5977 loop->latch = loop_bb;
5978 add_loop (loop, loop_bb->loop_father);
5979
5980 return fold_build1 (code, var_type, write_var);
5981 }
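
/* Schematically, the control flow constructed above is

     pre_bb:  init_var = initval (OP);
     loop_bb: expect_var = PHI <init_var (pre_bb), actual_var (loop_bb)>;
              write_var = expect_var OP var;
              actual_var = cmp&swap (ptr, expect_var, write_var);
              if (actual_var == expect_var) goto post_bb;
              else goto loop_bb;
     post_bb: result = (cast) write_var;  */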
5982
5983 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5984 GSI. This is necessary for types larger than 64 bits, where there
5985 is no cmp&swap instruction to implement a lockless scheme. We use
5986 a lock variable in global memory.
5987
5988 while (cmp&swap (&lock_var, 0, 1))
5989 continue;
5990 T accum = *ptr;
5991 accum = accum OP var;
5992 *ptr = accum;
5993 cmp&swap (&lock_var, 1, 0);
5994 return accum;
5995
5996 A lock in global memory is necessary to force execution engine
5997 descheduling and avoid resource starvation that can occur if the
5998 lock is in .shared memory. */
5999
6000 static tree
6001 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6002 tree ptr, tree var, tree_code op)
6003 {
6004 tree var_type = TREE_TYPE (var);
6005 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6006 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6007 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6008
6009 /* Split the block just before the gsi. Insert a gimple nop to make
6010 this easier. */
6011 gimple *nop = gimple_build_nop ();
6012 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6013 basic_block entry_bb = gsi_bb (*gsi);
6014 edge entry_edge = split_block (entry_bb, nop);
6015 basic_block lock_bb = entry_edge->dest;
6016 /* Reset the iterator. */
6017 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6018
6019 /* Build and insert the locking sequence. */
6020 gimple_seq lock_seq = NULL;
6021 tree lock_var = make_ssa_name (unsigned_type_node);
6022 tree lock_expr = nvptx_global_lock_addr ();
6023 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6024 uns_unlocked, uns_locked);
6025 gimplify_assign (lock_var, lock_expr, &lock_seq);
6026 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6027 NULL_TREE, NULL_TREE);
6028 gimple_seq_add_stmt (&lock_seq, cond);
6029 gimple *lock_end = gimple_seq_last (lock_seq);
6030 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6031
6032 /* Split the block just after the lock sequence. */
6033 edge locked_edge = split_block (lock_bb, lock_end);
6034 basic_block update_bb = locked_edge->dest;
6035 lock_bb = locked_edge->src;
6036 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6037
6038 /* Create the lock loop ... */
6039 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6040 locked_edge->probability = profile_probability::even ();
6041 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6042 loop_edge->probability = profile_probability::even ();
6043 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6044 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6045
6046 /* ... and the loop structure. */
6047 loop *lock_loop = alloc_loop ();
6048 lock_loop->header = lock_bb;
6049 lock_loop->latch = lock_bb;
6050 lock_loop->nb_iterations_estimate = 1;
6051 lock_loop->any_estimate = true;
6052 add_loop (lock_loop, entry_bb->loop_father);
6053
6054 /* Build and insert the reduction calculation. */
6055 gimple_seq red_seq = NULL;
6056 tree acc_in = make_ssa_name (var_type);
6057 tree ref_in = build_simple_mem_ref (ptr);
6058 TREE_THIS_VOLATILE (ref_in) = 1;
6059 gimplify_assign (acc_in, ref_in, &red_seq);
6060
6061 tree acc_out = make_ssa_name (var_type);
6062   tree update_expr = fold_build2 (op, var_type, acc_in, var);
6063 gimplify_assign (acc_out, update_expr, &red_seq);
6064
6065 tree ref_out = build_simple_mem_ref (ptr);
6066 TREE_THIS_VOLATILE (ref_out) = 1;
6067 gimplify_assign (ref_out, acc_out, &red_seq);
6068
6069 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6070
6071 /* Build & insert the unlock sequence. */
6072 gimple_seq unlock_seq = NULL;
6073 tree unlock_expr = nvptx_global_lock_addr ();
6074 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6075 uns_locked, uns_unlocked);
6076 gimplify_and_add (unlock_expr, &unlock_seq);
6077 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6078
6079 return acc_out;
6080 }
6081
6082 /* Emit a sequence to update a reduction accumulator at *PTR with the
6083 value held in VAR using operator OP. Return the updated value.
6084
6085    TODO: optimize for atomic ops and independent complex ops.  */
6086
6087 static tree
6088 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6089 tree ptr, tree var, tree_code op)
6090 {
6091 tree type = TREE_TYPE (var);
6092 tree size = TYPE_SIZE (type);
6093
6094 if (size == TYPE_SIZE (unsigned_type_node)
6095 || size == TYPE_SIZE (long_long_unsigned_type_node))
6096 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6097 else
6098 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6099 }
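
/* For illustration: 'int' (32 bits) and 'double' (64 bits) match the
   sizes of 'unsigned' and 'unsigned long long' above and take the
   lockless cmp&swap path, whereas '_Complex double' (128 bits) has no
   cmp&swap of that width and falls back to the global-lock scheme.  */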
6100
6101 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6102
6103 static void
6104 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6105 {
6106 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6107 tree lhs = gimple_call_lhs (call);
6108 tree var = gimple_call_arg (call, 2);
6109 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6110 gimple_seq seq = NULL;
6111
6112 push_gimplify_context (true);
6113
6114 if (level != GOMP_DIM_GANG)
6115 {
6116 /* Copy the receiver object. */
6117 tree ref_to_res = gimple_call_arg (call, 1);
6118
6119 if (!integer_zerop (ref_to_res))
6120 var = build_simple_mem_ref (ref_to_res);
6121 }
6122
6123 if (level == GOMP_DIM_WORKER
6124 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6125 {
6126 /* Store incoming value to worker reduction buffer. */
6127 tree offset = gimple_call_arg (call, 5);
6128 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6129 level == GOMP_DIM_VECTOR);
6130 tree ptr = make_ssa_name (TREE_TYPE (call));
6131
6132 gimplify_assign (ptr, call, &seq);
6133 tree ref = build_simple_mem_ref (ptr);
6134 TREE_THIS_VOLATILE (ref) = 1;
6135 gimplify_assign (ref, var, &seq);
6136 }
6137
6138 if (lhs)
6139 gimplify_assign (lhs, var, &seq);
6140
6141 pop_gimplify_context (NULL);
6142 gsi_replace_with_seq (&gsi, seq, true);
6143 }
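
/* For illustration: at worker level,

     lhs = GOACC_REDUCTION (SETUP, ref_to_res, var, worker, op, offset);

   becomes, roughly,

     ptr = __builtin_nvptx_worker_addr (offset, size, align);
     *(volatile T *) ptr = var;
     lhs = var;

   with VAR first replaced by *REF_TO_RES when REF_TO_RES is non-null.
   (Argument positions are taken from the gimple_call_arg indices used
   above; the builtin name is assumed.)  */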
6144
6145 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6146
6147 static void
6148 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6149 {
6150 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6151 tree lhs = gimple_call_lhs (call);
6152 tree var = gimple_call_arg (call, 2);
6153 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6154 enum tree_code rcode
6155 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6156 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6157 TREE_TYPE (var));
6158 gimple_seq seq = NULL;
6159
6160 push_gimplify_context (true);
6161
6162 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6163 {
6164       /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
6165 tree tid = make_ssa_name (integer_type_node);
6166 tree dim_vector = gimple_call_arg (call, 3);
6167 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6168 dim_vector);
6169 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6170 NULL_TREE, NULL_TREE);
6171
6172 gimple_call_set_lhs (tid_call, tid);
6173 gimple_seq_add_stmt (&seq, tid_call);
6174 gimple_seq_add_stmt (&seq, cond_stmt);
6175
6176 /* Split the block just after the call. */
6177 edge init_edge = split_block (gsi_bb (gsi), call);
6178 basic_block init_bb = init_edge->dest;
6179 basic_block call_bb = init_edge->src;
6180
6181 /* Fixup flags from call_bb to init_bb. */
6182 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6183 init_edge->probability = profile_probability::even ();
6184
6185 /* Set the initialization stmts. */
6186 gimple_seq init_seq = NULL;
6187 tree init_var = make_ssa_name (TREE_TYPE (var));
6188 gimplify_assign (init_var, init, &init_seq);
6189 gsi = gsi_start_bb (init_bb);
6190 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6191
6192 /* Split block just after the init stmt. */
6193 gsi_prev (&gsi);
6194 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6195 basic_block dst_bb = inited_edge->dest;
6196
6197 /* Create false edge from call_bb to dst_bb. */
6198 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6199 nop_edge->probability = profile_probability::even ();
6200
6201 /* Create phi node in dst block. */
6202 gphi *phi = create_phi_node (lhs, dst_bb);
6203 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6204 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6205
6206 /* Reset dominator of dst bb. */
6207 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6208
6209 /* Reset the gsi. */
6210 gsi = gsi_for_stmt (call);
6211 }
6212 else
6213 {
6214 if (level == GOMP_DIM_GANG)
6215 {
6216 /* If there's no receiver object, propagate the incoming VAR. */
6217 tree ref_to_res = gimple_call_arg (call, 1);
6218 if (integer_zerop (ref_to_res))
6219 init = var;
6220 }
6221
6222 if (lhs != NULL_TREE)
6223 gimplify_assign (lhs, init, &seq);
6224 }
6225
6226 pop_gimplify_context (NULL);
6227 gsi_replace_with_seq (&gsi, seq, true);
6228 }
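
/* For illustration: with vector_length == PTX_WARP_SIZE the INIT call
   is replaced by roughly

     tid = GOACC_DIM_POS (vector);
     if (tid != 0)
       init_var = OP's identity value (e.g. 0 for '+');
     lhs = PHI <var (tid == 0), init_var (tid != 0)>;

   so only lane 0 keeps the incoming value and all other lanes start
   from the reduction's neutral element.  */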
6229
6230 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6231
6232 static void
6233 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6234 {
6235 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6236 tree lhs = gimple_call_lhs (call);
6237 tree ref_to_res = gimple_call_arg (call, 1);
6238 tree var = gimple_call_arg (call, 2);
6239 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6240 enum tree_code op
6241 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6242 gimple_seq seq = NULL;
6243   tree r = NULL_TREE;
6244
6245 push_gimplify_context (true);
6246
6247 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6248 {
6249       /* Emit a binary shuffle tree.  TODO: emit this as an actual loop,
6250 	 but that requires a method of emitting a unified jump at the
6251 	 gimple level.  */
6252 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6253 {
6254 tree other_var = make_ssa_name (TREE_TYPE (var));
6255 nvptx_generate_vector_shuffle (gimple_location (call),
6256 other_var, var, shfl, &seq);
6257
6258 r = make_ssa_name (TREE_TYPE (var));
6259 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6260 var, other_var), &seq);
6261 var = r;
6262 }
6263 }
6264 else
6265 {
6266 tree accum = NULL_TREE;
6267
6268 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6269 {
6270 /* Get reduction buffer address. */
6271 tree offset = gimple_call_arg (call, 5);
6272 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6273 level == GOMP_DIM_VECTOR);
6274 tree ptr = make_ssa_name (TREE_TYPE (call));
6275
6276 gimplify_assign (ptr, call, &seq);
6277 accum = ptr;
6278 }
6279 else if (integer_zerop (ref_to_res))
6280 r = var;
6281 else
6282 accum = ref_to_res;
6283
6284 if (accum)
6285 {
6286 	  /* Update the accumulator.  */
6287 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6288 seq = NULL;
6289 r = nvptx_reduction_update (gimple_location (call), &gsi,
6290 accum, var, op);
6291 }
6292 }
6293
6294 if (lhs)
6295 gimplify_assign (lhs, r, &seq);
6296 pop_gimplify_context (NULL);
6297
6298 gsi_replace_with_seq (&gsi, seq, true);
6299 }
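
/* For illustration: at warp level a '+' reduction of VAR expands into
   five shuffle/accumulate steps, with shifts 16, 8, 4, 2, 1:

     other = shuffle.down (var, 16);  var = var + other;
     other = shuffle.down (var, 8);   var = var + other;
     ...
     other = shuffle.down (var, 1);   var = var + other;

   after which lane 0 holds the reduction over all 32 lanes.  */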
6300
6301 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6302
6303 static void
6304 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6305 {
6306 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6307 tree lhs = gimple_call_lhs (call);
6308 tree var = gimple_call_arg (call, 2);
6309 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6310 gimple_seq seq = NULL;
6311
6312 push_gimplify_context (true);
6313 if (level == GOMP_DIM_WORKER
6314 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6315 {
6316 /* Read the worker reduction buffer. */
6317 tree offset = gimple_call_arg (call, 5);
6318 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6319 level == GOMP_DIM_VECTOR);
6320 tree ptr = make_ssa_name (TREE_TYPE (call));
6321
6322 gimplify_assign (ptr, call, &seq);
6323 var = build_simple_mem_ref (ptr);
6324 TREE_THIS_VOLATILE (var) = 1;
6325 }
6326
6327 if (level != GOMP_DIM_GANG)
6328 {
6329 /* Write to the receiver object. */
6330 tree ref_to_res = gimple_call_arg (call, 1);
6331
6332 if (!integer_zerop (ref_to_res))
6333 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6334 }
6335
6336 if (lhs)
6337 gimplify_assign (lhs, var, &seq);
6338
6339 pop_gimplify_context (NULL);
6340
6341 gsi_replace_with_seq (&gsi, seq, true);
6342 }
6343
6344 /* NVPTX reduction expander. */
6345
6346 static void
6347 nvptx_goacc_reduction (gcall *call)
6348 {
6349 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6350 offload_attrs oa;
6351
6352 populate_offload_attrs (&oa);
6353
6354 switch (code)
6355 {
6356 case IFN_GOACC_REDUCTION_SETUP:
6357 nvptx_goacc_reduction_setup (call, &oa);
6358 break;
6359
6360 case IFN_GOACC_REDUCTION_INIT:
6361 nvptx_goacc_reduction_init (call, &oa);
6362 break;
6363
6364 case IFN_GOACC_REDUCTION_FINI:
6365 nvptx_goacc_reduction_fini (call, &oa);
6366 break;
6367
6368 case IFN_GOACC_REDUCTION_TEARDOWN:
6369 nvptx_goacc_reduction_teardown (call, &oa);
6370 break;
6371
6372 default:
6373 gcc_unreachable ();
6374 }
6375 }
6376
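/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  PTX does not use a
   constant pool, so constants are never forced into memory.  */
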
6377 static bool
6378 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6379 rtx x ATTRIBUTE_UNUSED)
6380 {
6381 return true;
6382 }
6383
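/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and
   V2DImode vectors are supported.  */
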
6384 static bool
6385 nvptx_vector_mode_supported (machine_mode mode)
6386 {
6387 return (mode == V2SImode
6388 || mode == V2DImode);
6389 }
6390
6391 /* Return the preferred mode for vectorizing scalar MODE. */
6392
6393 static machine_mode
6394 nvptx_preferred_simd_mode (scalar_mode mode)
6395 {
6396 switch (mode)
6397 {
6398 case E_DImode:
6399 return V2DImode;
6400 case E_SImode:
6401 return V2SImode;
6402
6403 default:
6404 return default_preferred_simd_mode (mode);
6405 }
6406 }
6407
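/* Increase the alignment of 128-bit integer objects to the alignment
   of the mode they are split into (presumably referenced from the
   back end's data-alignment macro, which is not visible in this file).  */
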
6408 unsigned int
6409 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6410 {
6411 if (TREE_CODE (type) == INTEGER_TYPE)
6412 {
6413 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6414 if (size == GET_MODE_SIZE (TImode))
6415 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6416 }
6417
6418 return basic_align;
6419 }
6420
6421 /* Implement TARGET_MODES_TIEABLE_P. */
6422
6423 static bool
6424 nvptx_modes_tieable_p (machine_mode, machine_mode)
6425 {
6426 return false;
6427 }
6428
6429 /* Implement TARGET_HARD_REGNO_NREGS. */
6430
6431 static unsigned int
6432 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6433 {
6434 return 1;
6435 }
6436
6437 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6438
6439 static bool
6440 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6441 {
6442 return false;
6443 }
6444
6445 static GTY(()) tree nvptx_previous_fndecl;
6446
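/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function
   shared-memory partition state when switching functions.  */
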
6447 static void
6448 nvptx_set_current_function (tree fndecl)
6449 {
6450 if (!fndecl || fndecl == nvptx_previous_fndecl)
6451 return;
6452
6453 nvptx_previous_fndecl = fndecl;
6454 vector_red_partition = 0;
6455 oacc_bcast_partition = 0;
6456 }
6457
6458 #undef TARGET_OPTION_OVERRIDE
6459 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6460
6461 #undef TARGET_ATTRIBUTE_TABLE
6462 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6463
6464 #undef TARGET_LRA_P
6465 #define TARGET_LRA_P hook_bool_void_false
6466
6467 #undef TARGET_LEGITIMATE_ADDRESS_P
6468 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6469
6470 #undef TARGET_PROMOTE_FUNCTION_MODE
6471 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6472
6473 #undef TARGET_FUNCTION_ARG
6474 #define TARGET_FUNCTION_ARG nvptx_function_arg
6475 #undef TARGET_FUNCTION_INCOMING_ARG
6476 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6477 #undef TARGET_FUNCTION_ARG_ADVANCE
6478 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6479 #undef TARGET_FUNCTION_ARG_BOUNDARY
6480 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6481 #undef TARGET_PASS_BY_REFERENCE
6482 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6483 #undef TARGET_FUNCTION_VALUE_REGNO_P
6484 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6485 #undef TARGET_FUNCTION_VALUE
6486 #define TARGET_FUNCTION_VALUE nvptx_function_value
6487 #undef TARGET_LIBCALL_VALUE
6488 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6489 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6490 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6491 #undef TARGET_GET_DRAP_RTX
6492 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6493 #undef TARGET_SPLIT_COMPLEX_ARG
6494 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6495 #undef TARGET_RETURN_IN_MEMORY
6496 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6497 #undef TARGET_OMIT_STRUCT_RETURN_REG
6498 #define TARGET_OMIT_STRUCT_RETURN_REG true
6499 #undef TARGET_STRICT_ARGUMENT_NAMING
6500 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6501 #undef TARGET_CALL_ARGS
6502 #define TARGET_CALL_ARGS nvptx_call_args
6503 #undef TARGET_END_CALL_ARGS
6504 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6505
6506 #undef TARGET_ASM_FILE_START
6507 #define TARGET_ASM_FILE_START nvptx_file_start
6508 #undef TARGET_ASM_FILE_END
6509 #define TARGET_ASM_FILE_END nvptx_file_end
6510 #undef TARGET_ASM_GLOBALIZE_LABEL
6511 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6512 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6513 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6514 #undef TARGET_PRINT_OPERAND
6515 #define TARGET_PRINT_OPERAND nvptx_print_operand
6516 #undef TARGET_PRINT_OPERAND_ADDRESS
6517 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6518 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6519 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6520 #undef TARGET_ASM_INTEGER
6521 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6522 #undef TARGET_ASM_DECL_END
6523 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6524 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6525 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6526 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6527 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6528 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6529 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6530
6531 #undef TARGET_MACHINE_DEPENDENT_REORG
6532 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6533 #undef TARGET_NO_REGISTER_ALLOCATION
6534 #define TARGET_NO_REGISTER_ALLOCATION true
6535
6536 #undef TARGET_ENCODE_SECTION_INFO
6537 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6538 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6539 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6540
6541 #undef TARGET_VECTOR_ALIGNMENT
6542 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6543
6544 #undef TARGET_CANNOT_COPY_INSN_P
6545 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6546
6547 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6548 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6549
6550 #undef TARGET_INIT_BUILTINS
6551 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6552 #undef TARGET_EXPAND_BUILTIN
6553 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6554 #undef TARGET_BUILTIN_DECL
6555 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6556
6557 #undef TARGET_SIMT_VF
6558 #define TARGET_SIMT_VF nvptx_simt_vf
6559
6560 #undef TARGET_GOACC_VALIDATE_DIMS
6561 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6562
6563 #undef TARGET_GOACC_DIM_LIMIT
6564 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6565
6566 #undef TARGET_GOACC_FORK_JOIN
6567 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6568
6569 #undef TARGET_GOACC_REDUCTION
6570 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6571
6572 #undef TARGET_CANNOT_FORCE_CONST_MEM
6573 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6574
6575 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6576 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6577
6578 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6579 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6580 nvptx_preferred_simd_mode
6581
6582 #undef TARGET_MODES_TIEABLE_P
6583 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6584
6585 #undef TARGET_HARD_REGNO_NREGS
6586 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6587
6588 #undef TARGET_CAN_CHANGE_MODE_CLASS
6589 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6590
6591 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6592 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6593
6594 #undef TARGET_SET_CURRENT_FUNCTION
6595 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6596
6597 struct gcc_target targetm = TARGET_INITIALIZER;
6598
6599 #include "gt-nvptx.h"