1 /* Target code for NVPTX.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
64 #include "dumpfile.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
68 #include "attribs.h"
69 #include "tree-vrp.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
72 #include "gimplify.h"
73 #include "tree-phinodes.h"
74 #include "cfgloop.h"
75 #include "fold-const.h"
76 #include "intl.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
84
85 /* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept
86 of a thread block, which has had a maximum number of threads of 1024 since
87 CUDA version 2.x. */
88 #define PTX_CTA_SIZE 1024
89
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
92
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
102
103 /* The various PTX memory areas an object might reside in. */
104 enum nvptx_data_area
105 {
106 DATA_AREA_GENERIC,
107 DATA_AREA_GLOBAL,
108 DATA_AREA_SHARED,
109 DATA_AREA_LOCAL,
110 DATA_AREA_CONST,
111 DATA_AREA_PARAM,
112 DATA_AREA_MAX
113 };
114
115 /* We record the data area in the target symbol flags. */
116 #define SYMBOL_DATA_AREA(SYM) \
117 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
118 & 7)
119 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
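
/* A minimal sketch of the round trip (illustrative only; "__foo" is a
   made-up symbol name):

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__foo");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   The area lives in the three flag bits above SYMBOL_FLAG_MACH_DEP_SHIFT,
   which is enough for all DATA_AREA_* values.  */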
121
122 /* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124 static std::stringstream func_decls;
125
126 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
127 {
128 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
129 static bool equal (rtx a, rtx b) { return a == b; }
130 };
131
132 static GTY((cache))
133 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
134
135 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
136 {
137 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
138 static bool equal (tree a, tree b) { return a == b; }
139 };
140
141 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
142 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
143
144 /* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150 static unsigned oacc_bcast_size;
151 static unsigned oacc_bcast_partition;
152 static unsigned oacc_bcast_align;
153 static GTY(()) rtx oacc_bcast_sym;
154
155 /* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157 static unsigned worker_red_size;
158 static unsigned worker_red_align;
159 static GTY(()) rtx worker_red_sym;
160
161 /* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164 static unsigned vector_red_size;
165 static unsigned vector_red_align;
166 static unsigned vector_red_partition;
167 static GTY(()) rtx vector_red_sym;
168
169 /* Global lock variable, needed for 128-bit worker & gang reductions. */
170 static GTY(()) tree global_lock_var;
171
172 /* True if any function references __nvptx_stacks. */
173 static bool need_softstack_decl;
174
175 /* True if any function references __nvptx_uni. */
176 static bool need_unisimt_decl;
177
178 static int nvptx_mach_max_workers ();
179
180 /* Allocate a new, cleared machine_function structure. */
181
182 static struct machine_function *
183 nvptx_init_machine_status (void)
184 {
185 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
186 p->return_mode = VOIDmode;
187 return p;
188 }
189
190 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
192
193 static void
194 diagnose_openacc_conflict (bool optval, const char *optname)
195 {
196 if (flag_openacc && optval)
197 error ("option %s is not supported together with %<-fopenacc%>", optname);
198 }
199
200 /* Implement TARGET_OPTION_OVERRIDE. */
201
202 static void
203 nvptx_option_override (void)
204 {
205 init_machine_status = nvptx_init_machine_status;
206
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set.x_flag_toplevel_reorder)
211 flag_toplevel_reorder = 1;
212
213 debug_nonbind_markers_p = 0;
214
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
217 unless forced. */
218 if (!global_options_set.x_flag_no_common)
219 flag_no_common = 1;
220
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size > 0)
223 sorry ("not generating patch area, nops not supported");
224
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking = 0;
227
228 if (nvptx_optimize < 0)
229 nvptx_optimize = optimize > 0;
230
231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
233 declared_libfuncs_htab
234 = hash_table<declared_libfunc_hasher>::create_ggc (17);
235
236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
239 oacc_bcast_partition = 0;
240
241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
244
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248 vector_red_partition = 0;
249
250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
253
254 if (TARGET_GOMP)
255 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
256 }
257
258 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx idiosyncrasies. */
260
261 const char *
262 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
263 {
264 switch (mode)
265 {
266 case E_BLKmode:
267 return ".b8";
268 case E_BImode:
269 return ".pred";
270 case E_QImode:
271 if (promote)
272 return ".u32";
273 else
274 return ".u8";
275 case E_HImode:
276 return ".u16";
277 case E_SImode:
278 return ".u32";
279 case E_DImode:
280 return ".u64";
281
282 case E_SFmode:
283 return ".f32";
284 case E_DFmode:
285 return ".f64";
286
287 case E_V2SImode:
288 return ".v2.u32";
289 case E_V2DImode:
290 return ".v2.u64";
291
292 default:
293 gcc_unreachable ();
294 }
295 }
296
297 /* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
299
300 static void
301 nvptx_encode_section_info (tree decl, rtx rtl, int first)
302 {
303 default_encode_section_info (decl, rtl, first);
304 if (first && MEM_P (rtl))
305 {
306 nvptx_data_area area = DATA_AREA_GENERIC;
307
308 if (TREE_CONSTANT (decl))
309 area = DATA_AREA_CONST;
310 else if (TREE_CODE (decl) == VAR_DECL)
311 {
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
313 {
314 area = DATA_AREA_SHARED;
315 if (DECL_INITIAL (decl))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl);
318 }
319 else
320 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
321 }
322
323 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
324 }
325 }
326
327 /* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_section_info, or equivalent. */
330
331 static const char *
332 section_for_sym (rtx sym)
333 {
334 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
338
339 return areas[area];
340 }
341
342 /* Similarly for a decl. */
343
344 static const char *
345 section_for_decl (const_tree decl)
346 {
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
348 }
349
350 /* Check NAME for special function names and redirect them by returning a
351 replacement. This applies to malloc, free and realloc, for which we
352 want to use libgcc wrappers, and call, which triggers a bug in
353 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
354 not active in an offload compiler -- the names are all set by the
355 host-side compiler. */
356
357 static const char *
358 nvptx_name_replacement (const char *name)
359 {
360 if (strcmp (name, "call") == 0)
361 return "__nvptx_call";
362 if (strcmp (name, "malloc") == 0)
363 return "__nvptx_malloc";
364 if (strcmp (name, "free") == 0)
365 return "__nvptx_free";
366 if (strcmp (name, "realloc") == 0)
367 return "__nvptx_realloc";
368 return name;
369 }
370
371 /* If MODE should be treated as two registers of an inner mode, return
372 that inner mode. Otherwise return VOIDmode. */
373
374 static machine_mode
375 maybe_split_mode (machine_mode mode)
376 {
377 if (COMPLEX_MODE_P (mode))
378 return GET_MODE_INNER (mode);
379
380 if (mode == TImode)
381 return DImode;
382
383 return VOIDmode;
384 }
385
386 /* Return true if mode should be treated as two registers. */
387
388 static bool
389 split_mode_p (machine_mode mode)
390 {
391 return maybe_split_mode (mode) != VOIDmode;
392 }
393
394 /* Output a register, subreg, or register pair (with optional
395 enclosing braces). */
396
397 static void
398 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
399 int subreg_offset = -1)
400 {
401 if (inner_mode == VOIDmode)
402 {
403 if (HARD_REGISTER_NUM_P (regno))
404 fprintf (file, "%s", reg_names[regno]);
405 else
406 fprintf (file, "%%r%d", regno);
407 }
408 else if (subreg_offset >= 0)
409 {
410 output_reg (file, regno, VOIDmode);
411 fprintf (file, "$%d", subreg_offset);
412 }
413 else
414 {
415 if (subreg_offset == -1)
416 fprintf (file, "{");
417 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
418 fprintf (file, ",");
419 output_reg (file, regno, inner_mode, 0);
420 if (subreg_offset == -1)
421 fprintf (file, "}");
422 }
423 }
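
/* To illustrate the SUBREG_OFFSET convention above (the register number is
   made up), for pseudo 23 split into two DImode halves this prints roughly:

     output_reg (file, 23, VOIDmode);     -> %r23
     output_reg (file, 23, DImode, 8);    -> %r23$8
     output_reg (file, 23, DImode);       -> {%r23$8,%r23$0}
     output_reg (file, 23, DImode, -2);   -> %r23$8,%r23$0   (no braces)  */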
424
425 /* Emit forking instructions for MASK. */
426
427 static void
428 nvptx_emit_forking (unsigned mask, bool is_call)
429 {
430 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
431 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
432 if (mask)
433 {
434 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
435
436 /* Emit fork at all levels. This helps form SESE regions, as
437 it creates a block with a single successor before entering a
438 partitioned region. That is a good candidate for the end of
439 an SESE region. */
440 emit_insn (gen_nvptx_fork (op));
441 emit_insn (gen_nvptx_forked (op));
442 }
443 }
444
445 /* Emit joining instructions for MASK. */
446
447 static void
448 nvptx_emit_joining (unsigned mask, bool is_call)
449 {
450 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
452 if (mask)
453 {
454 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
455
456 /* Emit joining for all non-call pars to ensure there's a single
457 predecessor for the block the join insn ends up in. This is
458 needed for skipping entire loops. */
459 emit_insn (gen_nvptx_joining (op));
460 emit_insn (gen_nvptx_join (op));
461 }
462 }
463
464 \f
465 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
466 returned in memory. Integer and floating types supported by the
467 machine are passed in registers, everything else is passed in
468 memory. Complex types are split. */
469
470 static bool
471 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
472 {
473 if (type)
474 {
475 if (AGGREGATE_TYPE_P (type))
476 return true;
477 if (TREE_CODE (type) == VECTOR_TYPE)
478 return true;
479 }
480
481 if (!for_return && COMPLEX_MODE_P (mode))
482 /* Complex types are passed as two underlying args. */
483 mode = GET_MODE_INNER (mode);
484
485 if (GET_MODE_CLASS (mode) != MODE_INT
486 && GET_MODE_CLASS (mode) != MODE_FLOAT)
487 return true;
488
489 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
490 return true;
491
492 return false;
493 }
494
495 /* A non-memory argument of mode MODE is being passed; determine the mode it
496 should be promoted to. This is also used for determining return
497 type promotion. */
498
499 static machine_mode
500 promote_arg (machine_mode mode, bool prototyped)
501 {
502 if (!prototyped && mode == SFmode)
503 /* K&R float promotion for unprototyped functions. */
504 mode = DFmode;
505 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
506 mode = SImode;
507
508 return mode;
509 }
510
511 /* A non-memory return type of MODE is being returned. Determine the
512 mode it should be promoted to. */
513
514 static machine_mode
515 promote_return (machine_mode mode)
516 {
517 return promote_arg (mode, true);
518 }
519
520 /* Implement TARGET_FUNCTION_ARG. */
521
522 static rtx
523 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
524 const_tree, bool named)
525 {
526 if (mode == VOIDmode || !named)
527 return NULL_RTX;
528
529 return gen_reg_rtx (mode);
530 }
531
532 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
533
534 static rtx
535 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
536 const_tree, bool named)
537 {
538 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
539
540 if (mode == VOIDmode || !named)
541 return NULL_RTX;
542
543 /* No need to deal with split modes here, the only case that can
544 happen is complex modes and those are dealt with by
545 TARGET_SPLIT_COMPLEX_ARG. */
546 return gen_rtx_UNSPEC (mode,
547 gen_rtvec (1, GEN_INT (cum->count)),
548 UNSPEC_ARG_REG);
549 }
550
551 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
552
553 static void
554 nvptx_function_arg_advance (cumulative_args_t cum_v,
555 machine_mode ARG_UNUSED (mode),
556 const_tree ARG_UNUSED (type),
557 bool ARG_UNUSED (named))
558 {
559 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
560
561 cum->count++;
562 }
563
564 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
565
566 For nvptx this is only used for variadic args. The type has already
567 been promoted and/or converted to an invisible reference. */
568
569 static unsigned
570 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
571 {
572 return GET_MODE_ALIGNMENT (mode);
573 }
574
575 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
576
577 For nvptx, we know how to handle functions declared as stdarg: by
578 passing an extra pointer to the unnamed arguments. However, the
579 Fortran frontend can produce a different situation, where a
580 function pointer is declared with no arguments, but the actual
581 function and calls to it take more arguments. In that case, we
582 want to ensure the call matches the definition of the function. */
583
584 static bool
585 nvptx_strict_argument_naming (cumulative_args_t cum_v)
586 {
587 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
588
589 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
590 }
591
592 /* Implement TARGET_LIBCALL_VALUE. */
593
594 static rtx
595 nvptx_libcall_value (machine_mode mode, const_rtx)
596 {
597 if (!cfun || !cfun->machine->doing_call)
598 /* Pretend to return in a hard reg for early uses before pseudos can be
599 generated. */
600 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
601
602 return gen_reg_rtx (mode);
603 }
604
605 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
606 where function FUNC returns or receives a value of data type TYPE. */
607
608 static rtx
609 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
610 bool outgoing)
611 {
612 machine_mode mode = promote_return (TYPE_MODE (type));
613
614 if (outgoing)
615 {
616 gcc_assert (cfun);
617 cfun->machine->return_mode = mode;
618 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
619 }
620
621 return nvptx_libcall_value (mode, NULL_RTX);
622 }
623
624 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
625
626 static bool
627 nvptx_function_value_regno_p (const unsigned int regno)
628 {
629 return regno == NVPTX_RETURN_REGNUM;
630 }
631
632 /* Types with a mode other than those supported by the machine are passed by
633 reference in memory. */
634
635 static bool
636 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
637 machine_mode mode, const_tree type,
638 bool ARG_UNUSED (named))
639 {
640 return pass_in_memory (mode, type, false);
641 }
642
643 /* Implement TARGET_RETURN_IN_MEMORY. */
644
645 static bool
646 nvptx_return_in_memory (const_tree type, const_tree)
647 {
648 return pass_in_memory (TYPE_MODE (type), type, true);
649 }
650
651 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
652
653 static machine_mode
654 nvptx_promote_function_mode (const_tree type, machine_mode mode,
655 int *ARG_UNUSED (punsignedp),
656 const_tree funtype, int for_return)
657 {
658 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
659 }
660
661 /* Helper for write_arg. Emit a single PTX argument of MODE, either
662 in a prototype, or as copy in a function prologue. ARGNO is the
663 index of this argument in the PTX function. FOR_REG is negative
664 if we're emitting the PTX prototype. It is zero if we're copying
665 to an argument register and it is greater than zero if we're
666 copying to a specific hard register. */
667
668 static int
669 write_arg_mode (std::stringstream &s, int for_reg, int argno,
670 machine_mode mode)
671 {
672 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
673
674 if (for_reg < 0)
675 {
676 /* Writing PTX prototype. */
677 s << (argno ? ", " : " (");
678 s << ".param" << ptx_type << " %in_ar" << argno;
679 }
680 else
681 {
682 s << "\t.reg" << ptx_type << " ";
683 if (for_reg)
684 s << reg_names[for_reg];
685 else
686 s << "%ar" << argno;
687 s << ";\n";
688 if (argno >= 0)
689 {
690 s << "\tld.param" << ptx_type << " ";
691 if (for_reg)
692 s << reg_names[for_reg];
693 else
694 s << "%ar" << argno;
695 s << ", [%in_ar" << argno << "];\n";
696 }
697 }
698 return argno + 1;
699 }
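
/* For instance, a .u32 argument with ARGNO 0 contributes roughly:

     FOR_REG < 0 (prototype):   " (.param.u32 %in_ar0"
     FOR_REG == 0 (prologue):   "\t.reg.u32 %ar0;\n"
                                "\tld.param.u32 %ar0, [%in_ar0];\n"  */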
700
701 /* Process function parameter TYPE to emit one or more PTX
702 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
703 is true if this is a prototyped function rather than an old-style
704 C declaration. Returns the next argument number to use.
705
706 The promotion behavior here must match the regular GCC function
707 parameter marshalling machinery. */
708
709 static int
710 write_arg_type (std::stringstream &s, int for_reg, int argno,
711 tree type, bool prototyped)
712 {
713 machine_mode mode = TYPE_MODE (type);
714
715 if (mode == VOIDmode)
716 return argno;
717
718 if (pass_in_memory (mode, type, false))
719 mode = Pmode;
720 else
721 {
722 bool split = TREE_CODE (type) == COMPLEX_TYPE;
723
724 if (split)
725 {
726 /* Complex types are sent as two separate args. */
727 type = TREE_TYPE (type);
728 mode = TYPE_MODE (type);
729 prototyped = true;
730 }
731
732 mode = promote_arg (mode, prototyped);
733 if (split)
734 argno = write_arg_mode (s, for_reg, argno, mode);
735 }
736
737 return write_arg_mode (s, for_reg, argno, mode);
738 }
739
740 /* Emit a PTX return as a prototype or function prologue declaration
741 for MODE. */
742
743 static void
744 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
745 {
746 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
747 const char *pfx = "\t.reg";
748 const char *sfx = ";\n";
749
750 if (for_proto)
751 pfx = "(.param", sfx = "_out) ";
752
753 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
754 }
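
/* E.g. for an SImode return value, and writing %value for
   reg_names[NVPTX_RETURN_REGNUM], this emits roughly:

     FOR_PROTO:   "(.param.u32 %value_out) "
     otherwise:   "\t.reg.u32 %value;\n"  */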
755
756 /* Process a function return TYPE to emit a PTX return as a prototype
757 or function prologue declaration. Returns true if return is via an
758 additional pointer parameter. The promotion behavior here must
759 match the regular GCC function return marshalling. */
760
761 static bool
762 write_return_type (std::stringstream &s, bool for_proto, tree type)
763 {
764 machine_mode mode = TYPE_MODE (type);
765
766 if (mode == VOIDmode)
767 return false;
768
769 bool return_in_mem = pass_in_memory (mode, type, true);
770
771 if (return_in_mem)
772 {
773 if (for_proto)
774 return return_in_mem;
775
776 /* Named return values can cause us to return a pointer as well
777 as expect an argument for the return location. This is
778 optimization-level specific, so no caller can make use of
779 this data, but more importantly for us, we must ensure it
780 doesn't change the PTX prototype. */
781 mode = (machine_mode) cfun->machine->return_mode;
782
783 if (mode == VOIDmode)
784 return return_in_mem;
785
786 /* Clear return_mode to inhibit copy of retval to non-existent
787 retval parameter. */
788 cfun->machine->return_mode = VOIDmode;
789 }
790 else
791 mode = promote_return (mode);
792
793 write_return_mode (s, for_proto, mode);
794
795 return return_in_mem;
796 }
797
798 /* Look for attributes in ATTRS that would indicate we must write a function
799 as a .entry kernel rather than a .func. Return true if one is found. */
800
801 static bool
802 write_as_kernel (tree attrs)
803 {
804 return (lookup_attribute ("kernel", attrs) != NULL_TREE
805 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
806 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
807 /* For OpenMP target regions, the corresponding kernel entry is emitted from
808 write_omp_entry as a separate function. */
809 }
810
811 /* Emit a linker marker for a function decl or defn. */
812
813 static void
814 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
815 const char *name)
816 {
817 s << "\n// BEGIN";
818 if (globalize)
819 s << " GLOBAL";
820 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
821 s << name << "\n";
822 }
823
824 /* Emit a linker marker for a variable decl or defn. */
825
826 static void
827 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
828 {
829 fprintf (file, "\n// BEGIN%s VAR %s: ",
830 globalize ? " GLOBAL" : "",
831 is_defn ? "DEF" : "DECL");
832 assemble_name_raw (file, name);
833 fputs ("\n", file);
834 }
835
836 /* Write a .func or .kernel declaration or definition along with
837 a helper comment for use by ld. S is the stream to write to, DECL
838 the decl for the function with name NAME. For definitions, emit
839 a declaration too. */
840
841 static const char *
842 write_fn_proto (std::stringstream &s, bool is_defn,
843 const char *name, const_tree decl)
844 {
845 if (is_defn)
846 /* Emit a declaration. The PTX assembler gets upset without it. */
847 name = write_fn_proto (s, false, name, decl);
848 else
849 {
850 /* Avoid repeating the name replacement. */
851 name = nvptx_name_replacement (name);
852 if (name[0] == '*')
853 name++;
854 }
855
856 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
857
858 /* PTX declaration. */
859 if (DECL_EXTERNAL (decl))
860 s << ".extern ";
861 else if (TREE_PUBLIC (decl))
862 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
863 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
864
865 tree fntype = TREE_TYPE (decl);
866 tree result_type = TREE_TYPE (fntype);
867
868 /* atomic_compare_exchange_$n builtins have an exceptional calling
869 convention. */
870 int not_atomic_weak_arg = -1;
871 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
872 switch (DECL_FUNCTION_CODE (decl))
873 {
874 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
875 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
876 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
877 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
878 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
879 /* These atomics skip the 'weak' parm in an actual library
880 call. We must skip it in the prototype too. */
881 not_atomic_weak_arg = 3;
882 break;
883
884 default:
885 break;
886 }
887
888 /* Declare the result. */
889 bool return_in_mem = write_return_type (s, true, result_type);
890
891 s << name;
892
893 int argno = 0;
894
895 /* Emit argument list. */
896 if (return_in_mem)
897 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
898
899 /* We get:
900 NULL in TYPE_ARG_TYPES, for old-style functions
901 NULL in DECL_ARGUMENTS, for builtin functions without another
902 declaration.
903 So we have to pick the best one we have. */
904 tree args = TYPE_ARG_TYPES (fntype);
905 bool prototyped = true;
906 if (!args)
907 {
908 args = DECL_ARGUMENTS (decl);
909 prototyped = false;
910 }
911
912 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
913 {
914 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
915
916 if (not_atomic_weak_arg)
917 argno = write_arg_type (s, -1, argno, type, prototyped);
918 else
919 gcc_assert (type == boolean_type_node);
920 }
921
922 if (stdarg_p (fntype))
923 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
924
925 if (DECL_STATIC_CHAIN (decl))
926 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
927
928 if (!argno && strcmp (name, "main") == 0)
929 {
930 argno = write_arg_type (s, -1, argno, integer_type_node, true);
931 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
932 }
933
934 if (argno)
935 s << ")";
936
937 s << (is_defn ? "\n" : ";\n");
938
939 return name;
940 }
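
/* Putting the pieces together, a declaration such as 'extern int foo (int);'
   comes out roughly as (again writing %value for the return register name):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .extern .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */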
941
942 /* Construct a function declaration from a call insn. This can be
943 necessary for two reasons - either we have an indirect call which
944 requires a .callprototype declaration, or we have a libcall
945 generated by emit_library_call for which no decl exists. */
946
947 static void
948 write_fn_proto_from_insn (std::stringstream &s, const char *name,
949 rtx result, rtx pat)
950 {
951 if (!name)
952 {
953 s << "\t.callprototype ";
954 name = "_";
955 }
956 else
957 {
958 name = nvptx_name_replacement (name);
959 write_fn_marker (s, false, true, name);
960 s << "\t.extern .func ";
961 }
962
963 if (result != NULL_RTX)
964 write_return_mode (s, true, GET_MODE (result));
965
966 s << name;
967
968 int arg_end = XVECLEN (pat, 0);
969 for (int i = 1; i < arg_end; i++)
970 {
971 /* We don't have to deal with mode splitting & promotion here,
972 as that was already done when generating the call
973 sequence. */
974 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
975
976 write_arg_mode (s, -1, i - 1, mode);
977 }
978 if (arg_end != 1)
979 s << ")";
980 s << ";\n";
981 }
982
983 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
984 table and write a ptx prototype. These are emitted at end of
985 compilation. */
986
987 static void
988 nvptx_record_fndecl (tree decl)
989 {
990 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
991 if (*slot == NULL)
992 {
993 *slot = decl;
994 const char *name = get_fnname_from_decl (decl);
995 write_fn_proto (func_decls, false, name, decl);
996 }
997 }
998
999 /* Record a libcall or unprototyped external function. CALLEE is the
1000 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1001 declaration for it. */
1002
1003 static void
1004 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
1005 {
1006 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1007 if (*slot == NULL)
1008 {
1009 *slot = callee;
1010
1011 const char *name = XSTR (callee, 0);
1012 write_fn_proto_from_insn (func_decls, name, retval, pat);
1013 }
1014 }
1015
1016 /* DECL is an external FUNCTION_DECL that we're referencing. If it
1017 is prototyped, record it now. Otherwise record it as needed at end
1018 of compilation, when we might have more information about it. */
1019
1020 void
1021 nvptx_record_needed_fndecl (tree decl)
1022 {
1023 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1024 {
1025 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1026 if (*slot == NULL)
1027 *slot = decl;
1028 }
1029 else
1030 nvptx_record_fndecl (decl);
1031 }
1032
1033 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1034 it as needed. */
1035
1036 static void
1037 nvptx_maybe_record_fnsym (rtx sym)
1038 {
1039 tree decl = SYMBOL_REF_DECL (sym);
1040
1041 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1042 nvptx_record_needed_fndecl (decl);
1043 }
1044
1045 /* Emit a local array to hold some part of a conventional stack frame
1046 and initialize REGNO to point to it. If the size is zero, it'll
1047 never be valid to dereference, so we can simply initialize to
1048 zero. */
1049
1050 static void
1051 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1052 {
1053 if (size)
1054 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1055 align, reg_names[regno], size);
1056 fprintf (file, "\t.reg.u%d %s;\n",
1057 POINTER_SIZE, reg_names[regno]);
1058 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1059 : "\tmov.u%d %s, 0;\n"),
1060 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1061 }
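
/* E.g. init_frame (file, FRAME_POINTER_REGNUM, 8, 16) on a 64-bit target
   emits something like (writing %frame for that register's name):

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   With a zero SIZE the array is omitted and the pointer is simply set to 0
   via "mov.u64 %frame, 0;".  */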
1062
1063 /* Emit soft stack frame setup sequence. */
1064
1065 static void
1066 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1067 {
1068 /* Maintain 64-bit stack alignment. */
1069 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1070 size = ROUND_UP (size, keep_align);
1071 int bits = POINTER_SIZE;
1072 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1073 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1074 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1075 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1076 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1077 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1078 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1079 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1080 fprintf (file, "\t{\n");
1081 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1082 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1083 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1084 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1085 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1086 bits == 64 ? ".wide" : ".lo", bits / 8);
1087 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1088
1089 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1090 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1091
1092 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1093 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1094 bits, reg_sspprev, reg_sspslot);
1095
1096 /* Initialize %frame = %sspprev - size. */
1097 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1098 bits, reg_frame, reg_sspprev, size);
1099
1100 /* Apply alignment, if larger than 64. */
1101 if (alignment > keep_align)
1102 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1103 bits, reg_frame, reg_frame, -alignment);
1104
1105 size = crtl->outgoing_args_size;
1106 gcc_assert (size % keep_align == 0);
1107
1108 /* Initialize %stack. */
1109 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1110 bits, reg_stack, reg_frame, size);
1111
1112 if (!crtl->is_leaf)
1113 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1114 bits, reg_sspslot, reg_stack);
1115 fprintf (file, "\t}\n");
1116 cfun->machine->has_softstack = true;
1117 need_softstack_decl = true;
1118 }
1119
1120 /* Emit code to initialize the REGNO predicate register to indicate
1121 whether we are not lane zero on the NAME axis. */
1122
1123 static void
1124 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1125 {
1126 fprintf (file, "\t{\n");
1127 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1128 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1129 {
1130 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1131 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1132 }
1133 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1134 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1135 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1136 {
1137 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1138 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1139 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1140 "// vector reduction buffer\n",
1141 REGNO (cfun->machine->red_partition),
1142 vector_red_partition);
1143 }
1144 /* Verify vector_red_size. */
1145 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1146 <= vector_red_size);
1147 fprintf (file, "\t}\n");
1148 }
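
/* Without a vector reduction partition this boils down to, for the "y" axis
   and an illustrative predicate register %r99:

     {
         .reg.u32 %y;
         mov.u32 %y, %tid.y;
         setp.ne.u32 %r99, %y, 0;
     }
*/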
1149
1150 /* Emit code to initialize OpenACC worker broadcast and synchronization
1151 registers. */
1152
1153 static void
1154 nvptx_init_oacc_workers (FILE *file)
1155 {
1156 fprintf (file, "\t{\n");
1157 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1158 if (cfun->machine->bcast_partition)
1159 {
1160 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1161 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1162 }
1163 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1164 if (cfun->machine->bcast_partition)
1165 {
1166 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1167 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1168 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1169 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1170 "// vector broadcast offset\n",
1171 REGNO (cfun->machine->bcast_partition),
1172 oacc_bcast_partition);
1173 }
1174 /* Verify oacc_bcast_size. */
1175 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1176 <= oacc_bcast_size);
1177 if (cfun->machine->sync_bar)
1178 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1179 "// vector synchronization barrier\n",
1180 REGNO (cfun->machine->sync_bar));
1181 fprintf (file, "\t}\n");
1182 }
1183
1184 /* Emit code to initialize predicate and master lane index registers for
1185 -muniform-simt code generation variant. */
1186
1187 static void
1188 nvptx_init_unisimt_predicate (FILE *file)
1189 {
1190 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1191 int loc = REGNO (cfun->machine->unisimt_location);
1192 int bits = POINTER_SIZE;
1193 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1194 fprintf (file, "\t{\n");
1195 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1196 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1197 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1198 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1199 bits == 64 ? ".wide" : ".lo");
1200 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1201 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1202 if (cfun->machine->unisimt_predicate)
1203 {
1204 int master = REGNO (cfun->machine->unisimt_master);
1205 int pred = REGNO (cfun->machine->unisimt_predicate);
1206 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1207 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1208 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1209 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1210 /* Compute predicate as 'tid.x == master'. */
1211 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1212 }
1213 fprintf (file, "\t}\n");
1214 need_unisimt_decl = true;
1215 }
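
/* For a 64-bit target the address setup above expands to roughly (with an
   illustrative location register %r42):

     .reg.u64 %r42;
     {
         .reg.u32 %ustmp0;
         .reg.u64 %ustmp1;
         mov.u32 %ustmp0, %tid.y;
         mul.wide.u32 %ustmp1, %ustmp0, 4;
         mov.u64 %r42, __nvptx_uni;
         add.u64 %r42, %r42, %ustmp1;
         ... (master/predicate setup, if used) ...
     }
*/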
1216
1217 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1218
1219 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1220 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1221 {
1222 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1223 __nvptx_uni[tid.y] = 0;
1224 gomp_nvptx_main (ORIG, arg);
1225 }
1226 ORIG itself should not be emitted as a PTX .entry function. */
1227
1228 static void
1229 write_omp_entry (FILE *file, const char *name, const char *orig)
1230 {
1231 static bool gomp_nvptx_main_declared;
1232 if (!gomp_nvptx_main_declared)
1233 {
1234 gomp_nvptx_main_declared = true;
1235 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1236 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1237 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1238 }
1239 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1240 #define NTID_Y "%ntid.y"
1241 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1242 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1243 {\n\
1244 .reg.u32 %r<3>;\n\
1245 .reg.u" PS " %R<4>;\n\
1246 mov.u32 %r0, %tid.y;\n\
1247 mov.u32 %r1, " NTID_Y ";\n\
1248 mov.u32 %r2, %ctaid.x;\n\
1249 cvt.u" PS ".u32 %R1, %r0;\n\
1250 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1251 mov.u" PS " %R0, __nvptx_stacks;\n\
1252 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1253 ld.param.u" PS " %R2, [%stack];\n\
1254 ld.param.u" PS " %R3, [%sz];\n\
1255 add.u" PS " %R2, %R2, %R3;\n\
1256 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1257 st.shared.u" PS " [%R0], %R2;\n\
1258 mov.u" PS " %R0, __nvptx_uni;\n\
1259 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1260 mov.u32 %r0, 0;\n\
1261 st.shared.u32 [%R0], %r0;\n\
1262 mov.u" PS " %R0, \0;\n\
1263 ld.param.u" PS " %R1, [%arg];\n\
1264 {\n\
1265 .param.u" PS " %P<2>;\n\
1266 st.param.u" PS " [%P0], %R0;\n\
1267 st.param.u" PS " [%P1], %R1;\n\
1268 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1269 }\n\
1270 ret.uni;\n\
1271 }\n"
1272 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1273 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1274 #undef ENTRY_TEMPLATE
1275 #undef NTID_Y
1276 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1277 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1278 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1279 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1280 need_softstack_decl = need_unisimt_decl = true;
1281 }
1282
1283 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1284 function, including local var decls and copies from the arguments to
1285 local regs. */
1286
1287 void
1288 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1289 {
1290 tree fntype = TREE_TYPE (decl);
1291 tree result_type = TREE_TYPE (fntype);
1292 int argno = 0;
1293
1294 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1295 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1296 {
1297 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1298 sprintf (buf, "%s$impl", name);
1299 write_omp_entry (file, name, buf);
1300 name = buf;
1301 }
1302 /* We construct the initial part of the function into a string
1303 stream, in order to share the prototype writing code. */
1304 std::stringstream s;
1305 write_fn_proto (s, true, name, decl);
1306 s << "{\n";
1307
1308 bool return_in_mem = write_return_type (s, false, result_type);
1309 if (return_in_mem)
1310 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1311
1312 /* Declare and initialize incoming arguments. */
1313 tree args = TYPE_ARG_TYPES (fntype);
1314 bool prototyped = true;
1315 if (!args)
1316 {
1317 args = DECL_ARGUMENTS (decl);
1318 prototyped = false;
1319 }
1320
1321 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1322 {
1323 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1324
1325 argno = write_arg_type (s, 0, argno, type, prototyped);
1326 }
1327
1328 if (stdarg_p (fntype))
1329 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1330 true);
1331
1332 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1333 write_arg_type (s, STATIC_CHAIN_REGNUM,
1334 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1335 true);
1336
1337 fprintf (file, "%s", s.str().c_str());
1338
1339 /* Usually 'crtl->is_leaf' is computed during register allocator
1340 initialization (which is not done on NVPTX) or for pressure-sensitive
1341 optimizations. Initialize it here, except if already set. */
1342 if (!crtl->is_leaf)
1343 crtl->is_leaf = leaf_function_p ();
1344
1345 HOST_WIDE_INT sz = get_frame_size ();
1346 bool need_frameptr = sz || cfun->machine->has_chain;
1347 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1348 if (!TARGET_SOFT_STACK)
1349 {
1350 /* Declare a local var for outgoing varargs. */
1351 if (cfun->machine->has_varadic)
1352 init_frame (file, STACK_POINTER_REGNUM,
1353 UNITS_PER_WORD, crtl->outgoing_args_size);
1354
1355 /* Declare a local variable for the frame. Force its size to be
1356 DImode-compatible. */
1357 if (need_frameptr)
1358 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1359 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1360 }
1361 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1362 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1363 init_softstack_frame (file, alignment, sz);
1364
1365 if (cfun->machine->has_simtreg)
1366 {
1367 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1368 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1369 align = MAX (align, GET_MODE_SIZE (DImode));
1370 if (!crtl->is_leaf || cfun->calls_alloca)
1371 simtsz = HOST_WIDE_INT_M1U;
1372 if (simtsz == HOST_WIDE_INT_M1U)
1373 simtsz = nvptx_softstack_size;
1374 if (cfun->machine->has_softstack)
1375 simtsz += POINTER_SIZE / 8;
1376 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1377 if (align > GET_MODE_SIZE (DImode))
1378 simtsz += align - GET_MODE_SIZE (DImode);
1379 if (simtsz)
1380 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1381 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1382 }
1383
1384 /* Restore the vector reduction partition register, if necessary.
1385 FIXME: Find out when and why this is necessary, and fix it. */
1386 if (cfun->machine->red_partition)
1387 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1388 = cfun->machine->red_partition;
1389
1390 /* Declare the pseudos we have as ptx registers. */
1391 int maxregs = max_reg_num ();
1392 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1393 {
1394 if (regno_reg_rtx[i] != const0_rtx)
1395 {
1396 machine_mode mode = PSEUDO_REGNO_MODE (i);
1397 machine_mode split = maybe_split_mode (mode);
1398
1399 if (split_mode_p (mode))
1400 mode = split;
1401 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1402 output_reg (file, i, split, -2);
1403 fprintf (file, ";\n");
1404 }
1405 }
1406
1407 /* Emit axis predicates. */
1408 if (cfun->machine->axis_predicate[0])
1409 nvptx_init_axis_predicate (file,
1410 REGNO (cfun->machine->axis_predicate[0]), "y");
1411 if (cfun->machine->axis_predicate[1])
1412 nvptx_init_axis_predicate (file,
1413 REGNO (cfun->machine->axis_predicate[1]), "x");
1414 if (cfun->machine->unisimt_predicate
1415 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1416 nvptx_init_unisimt_predicate (file);
1417 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1418 nvptx_init_oacc_workers (file);
1419 }
1420
1421 /* Output code for switching uniform-simt state. ENTERING indicates whether
1422 we are entering or leaving a non-uniform execution region. */
1423
1424 static void
1425 nvptx_output_unisimt_switch (FILE *file, bool entering)
1426 {
1427 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1428 return;
1429 fprintf (file, "\t{\n");
1430 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1431 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1432 if (!crtl->is_leaf)
1433 {
1434 int loc = REGNO (cfun->machine->unisimt_location);
1435 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1436 }
1437 if (cfun->machine->unisimt_predicate)
1438 {
1439 int master = REGNO (cfun->machine->unisimt_master);
1440 int pred = REGNO (cfun->machine->unisimt_predicate);
1441 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1442 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1443 master, entering ? "%ustmp2" : "0");
1444 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1445 }
1446 fprintf (file, "\t}\n");
1447 }
1448
1449 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1450 ENTERING indicates whether we are entering or leaving non-uniform execution.
1451 PTR is the register pointing to allocated storage, it is assigned to on
1452 entering and used to restore state on leaving. SIZE and ALIGN are used only
1453 on entering. */
1454
1455 static void
1456 nvptx_output_softstack_switch (FILE *file, bool entering,
1457 rtx ptr, rtx size, rtx align)
1458 {
1459 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1460 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1461 return;
1462 int bits = POINTER_SIZE, regno = REGNO (ptr);
1463 fprintf (file, "\t{\n");
1464 if (entering)
1465 {
1466 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1467 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1468 cfun->machine->simt_stack_size);
1469 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1470 if (CONST_INT_P (size))
1471 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1472 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1473 else
1474 output_reg (file, REGNO (size), VOIDmode);
1475 fputs (";\n", file);
1476 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1477 fprintf (file,
1478 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1479 bits, regno, regno, UINTVAL (align));
1480 }
1481 if (cfun->machine->has_softstack)
1482 {
1483 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1484 if (entering)
1485 {
1486 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1487 bits, regno, bits / 8, reg_stack);
1488 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1489 bits, reg_stack, regno, bits / 8);
1490 }
1491 else
1492 {
1493 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1494 bits, reg_stack, regno, bits / 8);
1495 }
1496 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1497 }
1498 fprintf (file, "\t}\n");
1499 }
1500
1501 /* Output code to enter non-uniform execution region. DEST is a register
1502 to hold a per-lane allocation given by SIZE and ALIGN. */
1503
1504 const char *
1505 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1506 {
1507 nvptx_output_unisimt_switch (asm_out_file, true);
1508 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1509 return "";
1510 }
1511
1512 /* Output code to leave non-uniform execution region. SRC is the register
1513 holding per-lane storage previously allocated by omp_simt_enter insn. */
1514
1515 const char *
1516 nvptx_output_simt_exit (rtx src)
1517 {
1518 nvptx_output_unisimt_switch (asm_out_file, false);
1519 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1520 return "";
1521 }
1522
1523 /* Output instruction that sets soft stack pointer in shared memory to the
1524 value in register given by SRC_REGNO. */
1525
1526 const char *
1527 nvptx_output_set_softstack (unsigned src_regno)
1528 {
1529 if (cfun->machine->has_softstack && !crtl->is_leaf)
1530 {
1531 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1532 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1533 output_reg (asm_out_file, src_regno, VOIDmode);
1534 fprintf (asm_out_file, ";\n");
1535 }
1536 return "";
1537 }
1538 /* Output a return instruction. Also copy the return value to its outgoing
1539 location. */
1540
1541 const char *
1542 nvptx_output_return (void)
1543 {
1544 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1545
1546 if (mode != VOIDmode)
1547 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1548 nvptx_ptx_type_from_mode (mode, false),
1549 reg_names[NVPTX_RETURN_REGNUM],
1550 reg_names[NVPTX_RETURN_REGNUM]);
1551
1552 return "ret;";
1553 }
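
/* So a function returning an SImode value ends with something like (again
   writing %value for reg_names[NVPTX_RETURN_REGNUM]):

     st.param.u32 [%value_out], %value;
     ret;
*/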
1554
1555 /* Terminate a function by writing a closing brace to FILE. */
1556
1557 void
1558 nvptx_function_end (FILE *file)
1559 {
1560 fprintf (file, "}\n");
1561 }
1562 \f
1563 /* Decide whether we can make a sibling call to a function. For ptx, we
1564 can't. */
1565
1566 static bool
1567 nvptx_function_ok_for_sibcall (tree, tree)
1568 {
1569 return false;
1570 }
1571
1572 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1573
1574 static rtx
1575 nvptx_get_drap_rtx (void)
1576 {
1577 if (TARGET_SOFT_STACK && stack_realign_drap)
1578 return arg_pointer_rtx;
1579 return NULL_RTX;
1580 }
1581
1582 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1583 argument to the next call. */
1584
1585 static void
1586 nvptx_call_args (rtx arg, tree fntype)
1587 {
1588 if (!cfun->machine->doing_call)
1589 {
1590 cfun->machine->doing_call = true;
1591 cfun->machine->is_varadic = false;
1592 cfun->machine->num_args = 0;
1593
1594 if (fntype && stdarg_p (fntype))
1595 {
1596 cfun->machine->is_varadic = true;
1597 cfun->machine->has_varadic = true;
1598 cfun->machine->num_args++;
1599 }
1600 }
1601
1602 if (REG_P (arg) && arg != pc_rtx)
1603 {
1604 cfun->machine->num_args++;
1605 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1606 cfun->machine->call_args);
1607 }
1608 }
1609
1610 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1611 information we recorded. */
1612
1613 static void
1614 nvptx_end_call_args (void)
1615 {
1616 cfun->machine->doing_call = false;
1617 free_EXPR_LIST_list (&cfun->machine->call_args);
1618 }
1619
1620 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1621 track of whether calls involving static chains or varargs were seen
1622 in the current function.
1623 For libcalls, maintain a hash table of decls we have seen, and
1624 record a function decl for later when encountering a new one. */
1625
1626 void
1627 nvptx_expand_call (rtx retval, rtx address)
1628 {
1629 rtx callee = XEXP (address, 0);
1630 rtx varargs = NULL_RTX;
1631 unsigned parallel = 0;
1632
1633 if (!call_insn_operand (callee, Pmode))
1634 {
1635 callee = force_reg (Pmode, callee);
1636 address = change_address (address, QImode, callee);
1637 }
1638
1639 if (GET_CODE (callee) == SYMBOL_REF)
1640 {
1641 tree decl = SYMBOL_REF_DECL (callee);
1642 if (decl != NULL_TREE)
1643 {
1644 if (DECL_STATIC_CHAIN (decl))
1645 cfun->machine->has_chain = true;
1646
1647 tree attr = oacc_get_fn_attrib (decl);
1648 if (attr)
1649 {
1650 tree dims = TREE_VALUE (attr);
1651
1652 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1653 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1654 {
1655 if (TREE_PURPOSE (dims)
1656 && !integer_zerop (TREE_PURPOSE (dims)))
1657 break;
1658 /* Not on this axis. */
1659 parallel ^= GOMP_DIM_MASK (ix);
1660 dims = TREE_CHAIN (dims);
1661 }
1662 }
1663 }
1664 }
1665
1666 unsigned nargs = cfun->machine->num_args;
1667 if (cfun->machine->is_varadic)
1668 {
1669 varargs = gen_reg_rtx (Pmode);
1670 emit_move_insn (varargs, stack_pointer_rtx);
1671 }
1672
1673 rtvec vec = rtvec_alloc (nargs + 1);
1674 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1675 int vec_pos = 0;
1676
1677 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1678 rtx tmp_retval = retval;
1679 if (retval)
1680 {
1681 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1682 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1683 call = gen_rtx_SET (tmp_retval, call);
1684 }
1685 XVECEXP (pat, 0, vec_pos++) = call;
1686
1687 /* Construct the call insn, including a USE for each argument pseudo
1688 register. These will be used when printing the insn. */
1689 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1690 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1691
1692 if (varargs)
1693 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1694
1695 gcc_assert (vec_pos == XVECLEN (pat, 0));
1696
1697 nvptx_emit_forking (parallel, true);
1698 emit_call_insn (pat);
1699 nvptx_emit_joining (parallel, true);
1700
1701 if (tmp_retval != retval)
1702 emit_move_insn (retval, tmp_retval);
1703 }
1704
1705 /* Emit a comparison COMPARE, and return the new test to be used in the
1706 jump. */
1707
1708 rtx
1709 nvptx_expand_compare (rtx compare)
1710 {
1711 rtx pred = gen_reg_rtx (BImode);
1712 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1713 XEXP (compare, 0), XEXP (compare, 1));
1714 emit_insn (gen_rtx_SET (pred, cmp));
1715 return gen_rtx_NE (BImode, pred, const0_rtx);
1716 }
1717
1718 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1719
1720 void
1721 nvptx_expand_oacc_fork (unsigned mode)
1722 {
1723 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1724 }
1725
1726 void
1727 nvptx_expand_oacc_join (unsigned mode)
1728 {
1729 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1730 }
1731
1732 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1733 objects. */
1734
1735 static rtx
1736 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1737 {
1738 rtx res;
1739
1740 switch (GET_MODE (src))
1741 {
1742 case E_DImode:
1743 res = gen_unpackdisi2 (dst0, dst1, src);
1744 break;
1745 case E_DFmode:
1746 res = gen_unpackdfsi2 (dst0, dst1, src);
1747 break;
1748 default: gcc_unreachable ();
1749 }
1750 return res;
1751 }
1752
1753 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1754 object. */
1755
1756 static rtx
1757 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1758 {
1759 rtx res;
1760
1761 switch (GET_MODE (dst))
1762 {
1763 case E_DImode:
1764 res = gen_packsidi2 (dst, src0, src1);
1765 break;
1766 case E_DFmode:
1767 res = gen_packsidf2 (dst, src0, src1);
1768 break;
1769 default: gcc_unreachable ();
1770 }
1771 return res;
1772 }
1773
1774 /* Generate an instruction or sequence to shuffle register SRC into DST
1775 across the lanes of a single warp, according to IDX and KIND. */
1776
1777 rtx
1778 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1779 {
1780 rtx res;
1781
1782 switch (GET_MODE (dst))
1783 {
1784 case E_SImode:
1785 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1786 break;
1787 case E_SFmode:
1788 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1789 break;
1790 case E_DImode:
1791 case E_DFmode:
1792 {
1793 rtx tmp0 = gen_reg_rtx (SImode);
1794 rtx tmp1 = gen_reg_rtx (SImode);
1795
1796 start_sequence ();
1797 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1798 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1799 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1800 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1801 res = get_insns ();
1802 end_sequence ();
1803 }
1804 break;
1805 case E_BImode:
1806 {
1807 rtx tmp = gen_reg_rtx (SImode);
1808
1809 start_sequence ();
1810 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1811 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1812 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1813 res = get_insns ();
1814 end_sequence ();
1815 }
1816 break;
1817 case E_QImode:
1818 case E_HImode:
1819 {
1820 rtx tmp = gen_reg_rtx (SImode);
1821
1822 start_sequence ();
1823 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1824 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1825 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1826 tmp)));
1827 res = get_insns ();
1828 end_sequence ();
1829 }
1830 break;
1831
1832 default:
1833 gcc_unreachable ();
1834 }
1835 return res;
1836 }
1837
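/* Illustrative note (not from the original source): for a 32-bit
   operand the shuffle insn created above is ultimately printed as a
   PTX warp shuffle along the lines of "shfl.idx.b32 %r0, %r1, %r2, 31;"
   (or the "shfl.sync" form on newer PTX ISAs); wider, narrower and
   BImode values are first packed, extended or converted to 32 bits,
   as the cases above show.  */
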
1838 /* Generate an instruction or sequence to broadcast register REG
1839 across the vectors of a single warp. */
1840
1841 static rtx
1842 nvptx_gen_warp_bcast (rtx reg)
1843 {
1844 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1845 }
1846
1847 /* Structure used when generating a worker-level spill or fill. */
1848
1849 struct broadcast_data_t
1850 {
1851 rtx base; /* Register holding base addr of buffer. */
1852 rtx ptr; /* Iteration var, if needed. */
1853 unsigned offset; /* Offset into worker buffer. */
1854 };
1855
1856 /* Direction of the spill/fill and looping setup/teardown indicator. */
1857
1858 enum propagate_mask
1859 {
1860 PM_read = 1 << 0,
1861 PM_write = 1 << 1,
1862 PM_loop_begin = 1 << 2,
1863 PM_loop_end = 1 << 3,
1864
1865 PM_read_write = PM_read | PM_write
1866 };
1867
1868 /* Generate instruction(s) to spill or fill register REG to/from the
1869 worker broadcast array. PM indicates what is to be done, REP
1870 how many loop iterations will be executed (0 for not a loop). */
1871
1872 static rtx
1873 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1874 broadcast_data_t *data, bool vector)
1875 {
1876 rtx res;
1877 machine_mode mode = GET_MODE (reg);
1878
1879 switch (mode)
1880 {
1881 case E_BImode:
1882 {
1883 rtx tmp = gen_reg_rtx (SImode);
1884
1885 start_sequence ();
1886 if (pm & PM_read)
1887 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1888 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1889 if (pm & PM_write)
1890 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1891 res = get_insns ();
1892 end_sequence ();
1893 }
1894 break;
1895
1896 default:
1897 {
1898 rtx addr = data->ptr;
1899
1900 if (!addr)
1901 {
1902 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1903
1904 oacc_bcast_align = MAX (oacc_bcast_align, align);
1905 data->offset = ROUND_UP (data->offset, align);
1906 addr = data->base;
1907 gcc_assert (data->base != NULL);
1908 if (data->offset)
1909 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1910 }
1911
1912 addr = gen_rtx_MEM (mode, addr);
1913 if (pm == PM_read)
1914 res = gen_rtx_SET (addr, reg);
1915 else if (pm == PM_write)
1916 res = gen_rtx_SET (reg, addr);
1917 else
1918 gcc_unreachable ();
1919
1920 if (data->ptr)
1921 {
1922 /* We're using a ptr, increment it. */
1923 start_sequence ();
1924
1925 emit_insn (res);
1926 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1927 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1928 res = get_insns ();
1929 end_sequence ();
1930 }
1931 else
1932 rep = 1;
1933 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1934 }
1935 break;
1936 }
1937 return res;
1938 }
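
/* Illustrative sketch (assumed values, not from the original source):
   propagating an SImode register that lands at buffer offset 8, with
   no loop pointer, yields a plain store or load through
     (mem:SI (plus:DI (reg:DI base) (const_int 8)))
   into the shared broadcast buffer, and advances data->offset by 4.  */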
1939 \f
1940 /* Returns true if X is a valid address for use in a memory reference. */
1941
1942 static bool
1943 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1944 {
1945 enum rtx_code code = GET_CODE (x);
1946
1947 switch (code)
1948 {
1949 case REG:
1950 return true;
1951
1952 case PLUS:
1953 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1954 return true;
1955 return false;
1956
1957 case CONST:
1958 case SYMBOL_REF:
1959 case LABEL_REF:
1960 return true;
1961
1962 default:
1963 return false;
1964 }
1965 }
1966 \f
1967 /* Machinery to output constant initializers. When beginning an
1968 initializer, we decide on a fragment size (which is visible in ptx
1969 in the type used), and then all initializer data is buffered until
1970 a fragment is filled and ready to be written out. */
1971
1972 static struct
1973 {
1974 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1975 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1976 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1977 out. */
1978 unsigned size; /* Fragment size to accumulate. */
1979 unsigned offset; /* Offset within current fragment. */
1980 bool started; /* Whether we've output any initializer. */
1981 } init_frag;
1982
1983 /* The current fragment is full, write it out. SYM may provide a
1984 symbolic reference we should output, in which case the fragment
1985 value is the addend. */
1986
1987 static void
1988 output_init_frag (rtx sym)
1989 {
1990 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1991 unsigned HOST_WIDE_INT val = init_frag.val;
1992
1993 init_frag.started = true;
1994 init_frag.val = 0;
1995 init_frag.offset = 0;
1996 init_frag.remaining--;
1997
1998 if (sym)
1999 {
2000 bool function = (SYMBOL_REF_DECL (sym)
2001 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2002 if (!function)
2003 fprintf (asm_out_file, "generic(");
2004 output_address (VOIDmode, sym);
2005 if (!function)
2006 fprintf (asm_out_file, ")");
2007 if (val)
2008 fprintf (asm_out_file, " + ");
2009 }
2010
2011 if (!sym || val)
2012 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2013 }
2014
2015 /* Add value VAL of size SIZE to the data we're emitting, and keep
2016 writing out chunks as they fill up. */
2017
2018 static void
2019 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2020 {
2021 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2022
2023 for (unsigned part = 0; size; size -= part)
2024 {
2025 val >>= part * BITS_PER_UNIT;
2026 part = init_frag.size - init_frag.offset;
2027 part = MIN (part, size);
2028
2029 unsigned HOST_WIDE_INT partial
2030 = val << (init_frag.offset * BITS_PER_UNIT);
2031 init_frag.val |= partial & init_frag.mask;
2032 init_frag.offset += part;
2033
2034 if (init_frag.offset == init_frag.size)
2035 output_init_frag (NULL);
2036 }
2037 }
2038
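/* Illustrative example (assumed values, not from the original source):
   with a fragment size of 4, assembling the 2-byte value 0x1234 and
   then the 2-byte value 0x5678 first buffers 0x1234 at offset 0, then
   ORs in 0x5678 << 16, giving the fragment value 0x56781234; the full
   fragment is then flushed through output_init_frag as part of the
   " = { ... }" initializer.  */
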
2039 /* Target hook for assembling integer object X of size SIZE. */
2040
2041 static bool
2042 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2043 {
2044 HOST_WIDE_INT val = 0;
2045
2046 switch (GET_CODE (x))
2047 {
2048 default:
2049 /* Let the generic machinery figure it out, usually for a
2050 CONST_WIDE_INT. */
2051 return false;
2052
2053 case CONST_INT:
2054 nvptx_assemble_value (INTVAL (x), size);
2055 break;
2056
2057 case CONST:
2058 x = XEXP (x, 0);
2059 gcc_assert (GET_CODE (x) == PLUS);
2060 val = INTVAL (XEXP (x, 1));
2061 x = XEXP (x, 0);
2062 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2063 /* FALLTHROUGH */
2064
2065 case SYMBOL_REF:
2066 gcc_assert (size == init_frag.size);
2067 if (init_frag.offset)
2068 sorry ("cannot emit unaligned pointers in ptx assembly");
2069
2070 nvptx_maybe_record_fnsym (x);
2071 init_frag.val = val;
2072 output_init_frag (x);
2073 break;
2074 }
2075
2076 return true;
2077 }
2078
2079 /* Output SIZE zero bytes. We ignore the FILE argument since the
2080 functions we're calling to perform the output just use
2081 asm_out_file. */
2082
2083 void
2084 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2085 {
2086 /* Finish the current fragment, if it's started. */
2087 if (init_frag.offset)
2088 {
2089 unsigned part = init_frag.size - init_frag.offset;
2090 part = MIN (part, (unsigned)size);
2091 size -= part;
2092 nvptx_assemble_value (0, part);
2093 }
2094
2095 /* If this skip doesn't terminate the initializer, write as many
2096 remaining pieces as possible directly. */
2097 if (size < init_frag.remaining * init_frag.size)
2098 {
2099 while (size >= init_frag.size)
2100 {
2101 size -= init_frag.size;
2102 output_init_frag (NULL_RTX);
2103 }
2104 if (size)
2105 nvptx_assemble_value (0, size);
2106 }
2107 }
2108
2109 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2110 ignore the FILE arg. */
2111
2112 void
2113 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2114 {
2115 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2116 nvptx_assemble_value (str[i], 1);
2117 }
2118
2119 /* Return true if TYPE is a record type whose last field is an array
2120 without a given dimension, i.e. a flexible array member. */
2121
2122 static bool
2123 flexible_array_member_type_p (const_tree type)
2124 {
2125 if (TREE_CODE (type) != RECORD_TYPE)
2126 return false;
2127
2128 const_tree last_field = NULL_TREE;
2129 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2130 last_field = f;
2131
2132 if (!last_field)
2133 return false;
2134
2135 const_tree last_field_type = TREE_TYPE (last_field);
2136 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2137 return false;
2138
2139 return (! TYPE_DOMAIN (last_field_type)
2140 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2141 }
2142
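/* For example (illustrative, not from the original source), a type such
   as "struct s { int n; char data[]; };" satisfies this predicate,
   whereas a record whose trailing array has a known upper bound does
   not.  */
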
2143 /* Emit a PTX variable decl and prepare for emission of its
2144 initializer. NAME is the symbol name and SECTION the PTX data
2145 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2146 The caller has already emitted any indentation and linkage
2147 specifier. It is responsible for any initializer, terminating ;
2148 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2149 this is the opposite way round from what PTX wants! */
2150
2151 static void
2152 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2153 const_tree type, HOST_WIDE_INT size, unsigned align,
2154 bool undefined = false)
2155 {
2156 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2157 && (TYPE_DOMAIN (type) == NULL_TREE);
2158
2159 if (undefined && flexible_array_member_type_p (type))
2160 {
2161 size = 0;
2162 atype = true;
2163 }
2164
2165 while (TREE_CODE (type) == ARRAY_TYPE)
2166 type = TREE_TYPE (type);
2167
2168 if (TREE_CODE (type) == VECTOR_TYPE
2169 || TREE_CODE (type) == COMPLEX_TYPE)
2170 /* Neither vector nor complex types can contain the other. */
2171 type = TREE_TYPE (type);
2172
2173 unsigned elt_size = int_size_in_bytes (type);
2174
2175 /* Largest mode we're prepared to accept. For BLKmode types we
2176 don't know if it'll contain pointer constants, so have to choose
2177 pointer size, otherwise we can choose DImode. */
2178 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2179
2180 elt_size |= GET_MODE_SIZE (elt_mode);
2181 elt_size &= -elt_size; /* Extract LSB set. */
2182
2183 init_frag.size = elt_size;
2184 /* Avoid undefined shift behavior by using '2'. */
2185 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2186 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2187 init_frag.val = 0;
2188 init_frag.offset = 0;
2189 init_frag.started = false;
2190 /* Size might not be a multiple of elt size, if there's an
2191 initialized trailing struct array with smaller type than
2192 elt_size. */
2193 init_frag.remaining = (size + elt_size - 1) / elt_size;
2194
2195 fprintf (file, "%s .align %d .u%d ",
2196 section, align / BITS_PER_UNIT,
2197 elt_size * BITS_PER_UNIT);
2198 assemble_name (file, name);
2199
2200 if (size)
2201 /* We make everything an array, to simplify any initialization
2202 emission. */
2203 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2204 else if (atype)
2205 fprintf (file, "[]");
2206 }
2207
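/* Illustrative example (assumed data area and name, not from the
   original source): for a 12-byte array of 32-bit ints in the global
   area, aligned to 4 bytes, the code above prints something like
     .global .align 4 .u32 name[3]
   leaving the caller to append any initializer and the final ';'.  */
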
2208 /* Called when the initializer for a decl has been completely output through
2209 combinations of the three functions above. */
2210
2211 static void
2212 nvptx_assemble_decl_end (void)
2213 {
2214 if (init_frag.offset)
2215 /* This can happen with a packed struct with trailing array member. */
2216 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2217 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2218 }
2219
2220 /* Output an uninitialized common or file-scope variable. */
2221
2222 void
2223 nvptx_output_aligned_decl (FILE *file, const char *name,
2224 const_tree decl, HOST_WIDE_INT size, unsigned align)
2225 {
2226 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2227
2228 /* If this is public, it is common. The nearest thing we have to
2229 common is weak. */
2230 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2231
2232 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2233 TREE_TYPE (decl), size, align);
2234 nvptx_assemble_decl_end ();
2235 }
2236
2237 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2238 writing a constant variable EXP with NAME and SIZE and its
2239 initializer to FILE. */
2240
2241 static void
2242 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2243 const_tree exp, HOST_WIDE_INT obj_size)
2244 {
2245 write_var_marker (file, true, false, name);
2246
2247 fprintf (file, "\t");
2248
2249 tree type = TREE_TYPE (exp);
2250 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2251 TYPE_ALIGN (type));
2252 }
2253
2254 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2255 a variable DECL with NAME to FILE. */
2256
2257 void
2258 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2259 {
2260 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2261
2262 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2263 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2264
2265 tree type = TREE_TYPE (decl);
2266 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2267 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2268 type, obj_size, DECL_ALIGN (decl));
2269 }
2270
2271 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2272
2273 static void
2274 nvptx_globalize_label (FILE *, const char *)
2275 {
2276 }
2277
2278 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2279 declaration only for variable DECL with NAME to FILE. */
2280
2281 static void
2282 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2283 {
2284 /* The middle end can place constant pool decls into the varpool as
2285 undefined. Until that is fixed, catch the problem here. */
2286 if (DECL_IN_CONSTANT_POOL (decl))
2287 return;
2288
2289 /* We support weak definitions, and hence have the right
2290 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2291 if (DECL_WEAK (decl))
2292 error_at (DECL_SOURCE_LOCATION (decl),
2293 "PTX does not support weak declarations"
2294 " (only weak definitions)");
2295 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2296
2297 fprintf (file, "\t.extern ");
2298 tree size = DECL_SIZE_UNIT (decl);
2299 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2300 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2301 DECL_ALIGN (decl), true);
2302 nvptx_assemble_decl_end ();
2303 }
2304
2305 /* Output a pattern for a move instruction. */
2306
2307 const char *
2308 nvptx_output_mov_insn (rtx dst, rtx src)
2309 {
2310 machine_mode dst_mode = GET_MODE (dst);
2311 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2312 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2313 machine_mode src_inner = (GET_CODE (src) == SUBREG
2314 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2315
2316 rtx sym = src;
2317 if (GET_CODE (sym) == CONST)
2318 sym = XEXP (XEXP (sym, 0), 0);
2319 if (SYMBOL_REF_P (sym))
2320 {
2321 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2322 return "%.\tcvta%D1%t0\t%0, %1;";
2323 nvptx_maybe_record_fnsym (sym);
2324 }
2325
2326 if (src_inner == dst_inner)
2327 return "%.\tmov%t0\t%0, %1;";
2328
2329 if (CONSTANT_P (src))
2330 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2331 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2332 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2333
2334 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2335 {
2336 if (GET_MODE_BITSIZE (dst_mode) == 128
2337 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2338 {
2339 /* mov.b128 is not supported. */
2340 if (dst_inner == V2DImode && src_inner == TImode)
2341 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2342 else if (dst_inner == TImode && src_inner == V2DImode)
2343 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2344
2345 gcc_unreachable ();
2346 }
2347 return "%.\tmov.b%T0\t%0, %1;";
2348 }
2349
2350 return "%.\tcvt%t0%t1\t%0, %1;";
2351 }
2352
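/* Illustrative note (assumed register numbers, not from the original
   source): a same-mode SImode register move prints from the
   "%.\tmov%t0\t%0, %1;" template as something like "mov.u32 %r22, %r23;",
   while a same-size move between different modes (say SFmode and
   SImode) takes the "mov.b%T0" form, e.g. "mov.b32 ...".  */
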
2353 static void nvptx_print_operand (FILE *, rtx, int);
2354
2355 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2356 involves writing .param declarations and in/out copies into them. For
2357 indirect calls, also write the .callprototype. */
2358
2359 const char *
2360 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2361 {
2362 char buf[16];
2363 static int labelno;
2364 bool needs_tgt = register_operand (callee, Pmode);
2365 rtx pat = PATTERN (insn);
2366 if (GET_CODE (pat) == COND_EXEC)
2367 pat = COND_EXEC_CODE (pat);
2368 int arg_end = XVECLEN (pat, 0);
2369 tree decl = NULL_TREE;
2370
2371 fprintf (asm_out_file, "\t{\n");
2372 if (result != NULL)
2373 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2374 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2375 reg_names[NVPTX_RETURN_REGNUM]);
2376
2377 /* Ensure we have a ptx declaration in the output if necessary. */
2378 if (GET_CODE (callee) == SYMBOL_REF)
2379 {
2380 decl = SYMBOL_REF_DECL (callee);
2381 if (!decl
2382 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2383 nvptx_record_libfunc (callee, result, pat);
2384 else if (DECL_EXTERNAL (decl))
2385 nvptx_record_fndecl (decl);
2386 }
2387
2388 if (needs_tgt)
2389 {
2390 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2391 labelno++;
2392 ASM_OUTPUT_LABEL (asm_out_file, buf);
2393 std::stringstream s;
2394 write_fn_proto_from_insn (s, NULL, result, pat);
2395 fputs (s.str().c_str(), asm_out_file);
2396 }
2397
2398 for (int argno = 1; argno < arg_end; argno++)
2399 {
2400 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2401 machine_mode mode = GET_MODE (t);
2402 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2403
2404 /* Mode splitting has already been done. */
2405 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2406 "\t\tst.param%s [%%out_arg%d], ",
2407 ptx_type, argno, ptx_type, argno);
2408 output_reg (asm_out_file, REGNO (t), VOIDmode);
2409 fprintf (asm_out_file, ";\n");
2410 }
2411
2412 /* The '.' stands for the call's predicate, if any. */
2413 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2414 fprintf (asm_out_file, "\t\tcall ");
2415 if (result != NULL_RTX)
2416 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2417
2418 if (decl)
2419 {
2420 const char *name = get_fnname_from_decl (decl);
2421 name = nvptx_name_replacement (name);
2422 assemble_name (asm_out_file, name);
2423 }
2424 else
2425 output_address (VOIDmode, callee);
2426
2427 const char *open = "(";
2428 for (int argno = 1; argno < arg_end; argno++)
2429 {
2430 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2431 open = "";
2432 }
2433 if (decl && DECL_STATIC_CHAIN (decl))
2434 {
2435 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2436 open = "";
2437 }
2438 if (!open[0])
2439 fprintf (asm_out_file, ")");
2440
2441 if (needs_tgt)
2442 {
2443 fprintf (asm_out_file, ", ");
2444 assemble_name (asm_out_file, buf);
2445 }
2446 fprintf (asm_out_file, ";\n");
2447
2448 if (find_reg_note (insn, REG_NORETURN, NULL))
2449 {
2450 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2451 the flow control barrier they imply. It can seg fault if it
2452 encounters what looks like an unexitable loop. Emit a trailing
2453 trap and exit, which it does grok. */
2454 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2455 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2456 }
2457
2458 if (result)
2459 {
2460 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2461
2462 if (!rval[0])
2463 /* We must escape the '%' that starts RETURN_REGNUM. */
2464 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2465 reg_names[NVPTX_RETURN_REGNUM]);
2466 return rval;
2467 }
2468
2469 return "}";
2470 }
2471
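/* Illustrative sketch (assumed names and registers, not from the
   original source) of the text printed for a direct call returning a
   32-bit value and taking one 32-bit argument:
	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r25;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r26, [%value_in];
	}
*/
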
2472 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2473
2474 static bool
2475 nvptx_print_operand_punct_valid_p (unsigned char c)
2476 {
2477 return c == '.' || c == '#';
2478 }
2479
2480 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2481
2482 static void
2483 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2484 {
2485 rtx off;
2486 if (GET_CODE (x) == CONST)
2487 x = XEXP (x, 0);
2488 switch (GET_CODE (x))
2489 {
2490 case PLUS:
2491 off = XEXP (x, 1);
2492 output_address (VOIDmode, XEXP (x, 0));
2493 fprintf (file, "+");
2494 output_address (VOIDmode, off);
2495 break;
2496
2497 case SYMBOL_REF:
2498 case LABEL_REF:
2499 output_addr_const (file, x);
2500 break;
2501
2502 default:
2503 gcc_assert (GET_CODE (x) != MEM);
2504 nvptx_print_operand (file, x, 0);
2505 break;
2506 }
2507 }
2508
2509 /* Write assembly language output for the address ADDR to FILE. */
2510
2511 static void
2512 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2513 {
2514 nvptx_print_address_operand (file, addr, mode);
2515 }
2516
2517 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2518
2519 Meaning of CODE:
2520 . -- print the predicate for the instruction or an empty string for an
2521 unconditional one.
2522 # -- print a rounding mode for the instruction
2523
2524 A -- print a data area for a MEM
2525 c -- print an opcode suffix for a comparison operator, including a type code
2526 D -- print a data area for a MEM operand
2527 S -- print a shuffle kind specified by CONST_INT
2528 t -- print a type opcode suffix, promoting QImode to 32 bits
2529 T -- print a type size in bits
2530 u -- print a type opcode suffix without promotions. */
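/* Illustrative examples (assumed operand modes, not from the original
   source): with an SImode operand, "%t0" typically prints ".u32" and
   "%T0" prints "32"; with a signed SImode comparison such as (lt ...),
   "%c" prints ".lt.s32", matching the PTX "setp.lt.s32" style.  */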
2531
2532 static void
2533 nvptx_print_operand (FILE *file, rtx x, int code)
2534 {
2535 if (code == '.')
2536 {
2537 x = current_insn_predicate;
2538 if (x)
2539 {
2540 fputs ("@", file);
2541 if (GET_CODE (x) == EQ)
2542 fputs ("!", file);
2543 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2544 }
2545 return;
2546 }
2547 else if (code == '#')
2548 {
2549 fputs (".rn", file);
2550 return;
2551 }
2552
2553 enum rtx_code x_code = GET_CODE (x);
2554 machine_mode mode = GET_MODE (x);
2555
2556 switch (code)
2557 {
2558 case 'A':
2559 x = XEXP (x, 0);
2560 /* FALLTHROUGH. */
2561
2562 case 'D':
2563 if (GET_CODE (x) == CONST)
2564 x = XEXP (x, 0);
2565 if (GET_CODE (x) == PLUS)
2566 x = XEXP (x, 0);
2567
2568 if (GET_CODE (x) == SYMBOL_REF)
2569 fputs (section_for_sym (x), file);
2570 break;
2571
2572 case 't':
2573 case 'u':
2574 if (x_code == SUBREG)
2575 {
2576 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2577 if (VECTOR_MODE_P (inner_mode)
2578 && (GET_MODE_SIZE (mode)
2579 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2580 mode = GET_MODE_INNER (inner_mode);
2581 else if (split_mode_p (inner_mode))
2582 mode = maybe_split_mode (inner_mode);
2583 else
2584 mode = inner_mode;
2585 }
2586 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2587 break;
2588
2589 case 'H':
2590 case 'L':
2591 {
2592 rtx inner_x = SUBREG_REG (x);
2593 machine_mode inner_mode = GET_MODE (inner_x);
2594 machine_mode split = maybe_split_mode (inner_mode);
2595
2596 output_reg (file, REGNO (inner_x), split,
2597 (code == 'H'
2598 ? GET_MODE_SIZE (inner_mode) / 2
2599 : 0));
2600 }
2601 break;
2602
2603 case 'S':
2604 {
2605 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2606 /* Same order as nvptx_shuffle_kind. */
2607 static const char *const kinds[] =
2608 {".up", ".down", ".bfly", ".idx"};
2609 fputs (kinds[kind], file);
2610 }
2611 break;
2612
2613 case 'T':
2614 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2615 break;
2616
2617 case 'j':
2618 fprintf (file, "@");
2619 goto common;
2620
2621 case 'J':
2622 fprintf (file, "@!");
2623 goto common;
2624
2625 case 'c':
2626 mode = GET_MODE (XEXP (x, 0));
2627 switch (x_code)
2628 {
2629 case EQ:
2630 fputs (".eq", file);
2631 break;
2632 case NE:
2633 if (FLOAT_MODE_P (mode))
2634 fputs (".neu", file);
2635 else
2636 fputs (".ne", file);
2637 break;
2638 case LE:
2639 case LEU:
2640 fputs (".le", file);
2641 break;
2642 case GE:
2643 case GEU:
2644 fputs (".ge", file);
2645 break;
2646 case LT:
2647 case LTU:
2648 fputs (".lt", file);
2649 break;
2650 case GT:
2651 case GTU:
2652 fputs (".gt", file);
2653 break;
2654 case LTGT:
2655 fputs (".ne", file);
2656 break;
2657 case UNEQ:
2658 fputs (".equ", file);
2659 break;
2660 case UNLE:
2661 fputs (".leu", file);
2662 break;
2663 case UNGE:
2664 fputs (".geu", file);
2665 break;
2666 case UNLT:
2667 fputs (".ltu", file);
2668 break;
2669 case UNGT:
2670 fputs (".gtu", file);
2671 break;
2672 case UNORDERED:
2673 fputs (".nan", file);
2674 break;
2675 case ORDERED:
2676 fputs (".num", file);
2677 break;
2678 default:
2679 gcc_unreachable ();
2680 }
2681 if (FLOAT_MODE_P (mode)
2682 || x_code == EQ || x_code == NE
2683 || x_code == GEU || x_code == GTU
2684 || x_code == LEU || x_code == LTU)
2685 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2686 else
2687 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2688 break;
2689 default:
2690 common:
2691 switch (x_code)
2692 {
2693 case SUBREG:
2694 {
2695 rtx inner_x = SUBREG_REG (x);
2696 machine_mode inner_mode = GET_MODE (inner_x);
2697 machine_mode split = maybe_split_mode (inner_mode);
2698
2699 if (VECTOR_MODE_P (inner_mode)
2700 && (GET_MODE_SIZE (mode)
2701 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2702 {
2703 output_reg (file, REGNO (inner_x), VOIDmode);
2704 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2705 }
2706 else if (split_mode_p (inner_mode)
2707 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2708 output_reg (file, REGNO (inner_x), split);
2709 else
2710 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2711 }
2712 break;
2713
2714 case REG:
2715 output_reg (file, REGNO (x), maybe_split_mode (mode));
2716 break;
2717
2718 case MEM:
2719 fputc ('[', file);
2720 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2721 fputc (']', file);
2722 break;
2723
2724 case CONST_INT:
2725 output_addr_const (file, x);
2726 break;
2727
2728 case CONST:
2729 case SYMBOL_REF:
2730 case LABEL_REF:
2731 /* We could use output_addr_const, but that can print things like
2732 "x-8", which breaks ptxas. Need to ensure it is output as
2733 "x+-8". */
2734 nvptx_print_address_operand (file, x, VOIDmode);
2735 break;
2736
2737 case CONST_DOUBLE:
2738 long vals[2];
2739 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2740 vals[0] &= 0xffffffff;
2741 vals[1] &= 0xffffffff;
2742 if (mode == SFmode)
2743 fprintf (file, "0f%08lx", vals[0]);
2744 else
2745 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2746 break;
2747
2748 case CONST_VECTOR:
2749 {
2750 unsigned n = CONST_VECTOR_NUNITS (x);
2751 fprintf (file, "{ ");
2752 for (unsigned i = 0; i < n; ++i)
2753 {
2754 if (i != 0)
2755 fprintf (file, ", ");
2756
2757 rtx elem = CONST_VECTOR_ELT (x, i);
2758 output_addr_const (file, elem);
2759 }
2760 fprintf (file, " }");
2761 }
2762 break;
2763
2764 default:
2765 output_addr_const (file, x);
2766 }
2767 }
2768 }
2769 \f
2770 /* Record replacement regs used to deal with subreg operands. */
2771 struct reg_replace
2772 {
2773 rtx replacement[MAX_RECOG_OPERANDS];
2774 machine_mode mode;
2775 int n_allocated;
2776 int n_in_use;
2777 };
2778
2779 /* Allocate or reuse a replacement in R and return the rtx. */
2780
2781 static rtx
2782 get_replacement (struct reg_replace *r)
2783 {
2784 if (r->n_allocated == r->n_in_use)
2785 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2786 return r->replacement[r->n_in_use++];
2787 }
2788
2789 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2790 the presence of subregs would break the rules for most instructions.
2791 Replace them with a suitable new register of the right size, plus
2792 conversion copyin/copyout instructions. */
2793
2794 static void
2795 nvptx_reorg_subreg (void)
2796 {
2797 struct reg_replace qiregs, hiregs, siregs, diregs;
2798 rtx_insn *insn, *next;
2799
2800 qiregs.n_allocated = 0;
2801 hiregs.n_allocated = 0;
2802 siregs.n_allocated = 0;
2803 diregs.n_allocated = 0;
2804 qiregs.mode = QImode;
2805 hiregs.mode = HImode;
2806 siregs.mode = SImode;
2807 diregs.mode = DImode;
2808
2809 for (insn = get_insns (); insn; insn = next)
2810 {
2811 next = NEXT_INSN (insn);
2812 if (!NONDEBUG_INSN_P (insn)
2813 || asm_noperands (PATTERN (insn)) >= 0
2814 || GET_CODE (PATTERN (insn)) == USE
2815 || GET_CODE (PATTERN (insn)) == CLOBBER)
2816 continue;
2817
2818 qiregs.n_in_use = 0;
2819 hiregs.n_in_use = 0;
2820 siregs.n_in_use = 0;
2821 diregs.n_in_use = 0;
2822 extract_insn (insn);
2823 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2824
2825 for (int i = 0; i < recog_data.n_operands; i++)
2826 {
2827 rtx op = recog_data.operand[i];
2828 if (GET_CODE (op) != SUBREG)
2829 continue;
2830
2831 rtx inner = SUBREG_REG (op);
2832
2833 machine_mode outer_mode = GET_MODE (op);
2834 machine_mode inner_mode = GET_MODE (inner);
2835 gcc_assert (s_ok);
2836 if (s_ok
2837 && (GET_MODE_PRECISION (inner_mode)
2838 >= GET_MODE_PRECISION (outer_mode)))
2839 continue;
2840 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2841 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2842 : outer_mode == HImode ? &hiregs
2843 : outer_mode == SImode ? &siregs
2844 : &diregs);
2845 rtx new_reg = get_replacement (r);
2846
2847 if (recog_data.operand_type[i] != OP_OUT)
2848 {
2849 enum rtx_code code;
2850 if (GET_MODE_PRECISION (inner_mode)
2851 < GET_MODE_PRECISION (outer_mode))
2852 code = ZERO_EXTEND;
2853 else
2854 code = TRUNCATE;
2855
2856 rtx pat = gen_rtx_SET (new_reg,
2857 gen_rtx_fmt_e (code, outer_mode, inner));
2858 emit_insn_before (pat, insn);
2859 }
2860
2861 if (recog_data.operand_type[i] != OP_IN)
2862 {
2863 enum rtx_code code;
2864 if (GET_MODE_PRECISION (inner_mode)
2865 < GET_MODE_PRECISION (outer_mode))
2866 code = TRUNCATE;
2867 else
2868 code = ZERO_EXTEND;
2869
2870 rtx pat = gen_rtx_SET (inner,
2871 gen_rtx_fmt_e (code, inner_mode, new_reg));
2872 emit_insn_after (pat, insn);
2873 }
2874 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2875 }
2876 }
2877 }
2878
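/* Illustrative sketch (hypothetical pseudo numbers, not from the
   original source): an input operand (subreg:SI (reg:QI 23) 0) is
   replaced by a fresh SImode pseudo, say (reg:SI 40), preceded by the
   copy-in
     (set (reg:SI 40) (zero_extend:SI (reg:QI 23)))
   so that the instruction itself only ever sees plain, correctly
   typed registers.  */
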
2879 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2880 first use. */
2881
2882 static rtx
2883 nvptx_get_unisimt_master ()
2884 {
2885 rtx &master = cfun->machine->unisimt_master;
2886 return master ? master : master = gen_reg_rtx (SImode);
2887 }
2888
2889 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2890
2891 static rtx
2892 nvptx_get_unisimt_predicate ()
2893 {
2894 rtx &pred = cfun->machine->unisimt_predicate;
2895 return pred ? pred : pred = gen_reg_rtx (BImode);
2896 }
2897
2898 /* Return true if given call insn references one of the functions provided by
2899 the CUDA runtime: malloc, free, vprintf. */
2900
2901 static bool
2902 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2903 {
2904 rtx pat = PATTERN (insn);
2905 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2906 pat = XVECEXP (pat, 0, 0);
2907 if (GET_CODE (pat) == SET)
2908 pat = SET_SRC (pat);
2909 gcc_checking_assert (GET_CODE (pat) == CALL
2910 && GET_CODE (XEXP (pat, 0)) == MEM);
2911 rtx addr = XEXP (XEXP (pat, 0), 0);
2912 if (GET_CODE (addr) != SYMBOL_REF)
2913 return false;
2914 const char *name = XSTR (addr, 0);
2915 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2916 references with forced assembler name refer to PTX syscalls. For vprintf,
2917 accept both normal and forced-assembler-name references. */
2918 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2919 || !strcmp (name, "*malloc")
2920 || !strcmp (name, "*free"));
2921 }
2922
2923 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2924 propagate its value from lane MASTER to current lane. */
2925
2926 static void
2927 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2928 {
2929 rtx reg;
2930 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2931 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2932 }
2933
2934 /* Adjust code for uniform-simt code generation variant by making atomics and
2935 "syscalls" conditionally executed, and inserting shuffle-based propagation
2936 for registers being set. */
2937
2938 static void
2939 nvptx_reorg_uniform_simt ()
2940 {
2941 rtx_insn *insn, *next;
2942
2943 for (insn = get_insns (); insn; insn = next)
2944 {
2945 next = NEXT_INSN (insn);
2946 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2947 && !(NONJUMP_INSN_P (insn)
2948 && GET_CODE (PATTERN (insn)) == PARALLEL
2949 && get_attr_atomic (insn)))
2950 continue;
2951 rtx pat = PATTERN (insn);
2952 rtx master = nvptx_get_unisimt_master ();
2953 for (int i = 0; i < XVECLEN (pat, 0); i++)
2954 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2955 rtx pred = nvptx_get_unisimt_predicate ();
2956 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2957 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2958 validate_change (insn, &PATTERN (insn), pat, false);
2959 }
2960 }
2961
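/* Illustrative sketch (not from the original source): an atomic or
   "syscall" insn with pattern PAT becomes
     (cond_exec (ne %pred 0) PAT)
   and every register set inside PAT is followed by an idx-shuffle
   from the master lane, so all lanes of the warp observe the same
   value even though only one lane executed the operation.  */
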
2962 /* Offloading function attributes. */
2963
2964 struct offload_attrs
2965 {
2966 unsigned mask;
2967 int num_gangs;
2968 int num_workers;
2969 int vector_length;
2970 };
2971
2972 /* Define entries for cfun->machine->axis_dim. */
2973
2974 #define MACH_VECTOR_LENGTH 0
2975 #define MACH_MAX_WORKERS 1
2976
2977 static void populate_offload_attrs (offload_attrs *oa);
2978
2979 static void
2980 init_axis_dim (void)
2981 {
2982 offload_attrs oa;
2983 int max_workers;
2984
2985 populate_offload_attrs (&oa);
2986
2987 if (oa.num_workers == 0)
2988 max_workers = PTX_CTA_SIZE / oa.vector_length;
2989 else
2990 max_workers = oa.num_workers;
2991
2992 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2993 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2994 cfun->machine->axis_dim_init_p = true;
2995 }
2996
2997 static int ATTRIBUTE_UNUSED
2998 nvptx_mach_max_workers ()
2999 {
3000 if (!cfun->machine->axis_dim_init_p)
3001 init_axis_dim ();
3002 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3003 }
3004
3005 static int ATTRIBUTE_UNUSED
3006 nvptx_mach_vector_length ()
3007 {
3008 if (!cfun->machine->axis_dim_init_p)
3009 init_axis_dim ();
3010 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3011 }
3012
3013 /* Loop structure of the function. The entire function is described as
3014 a NULL loop. */
3015
3016 struct parallel
3017 {
3018 /* Parent parallel. */
3019 parallel *parent;
3020
3021 /* Next sibling parallel. */
3022 parallel *next;
3023
3024 /* First child parallel. */
3025 parallel *inner;
3026
3027 /* Partitioning mask of the parallel. */
3028 unsigned mask;
3029
3030 /* Partitioning used within inner parallels. */
3031 unsigned inner_mask;
3032
3033 /* Location of the parallel's forked and join blocks. The forked block
3034 is the first block in the parallel and the join block is the first
3035 block after the partition. */
3036 basic_block forked_block;
3037 basic_block join_block;
3038
3039 rtx_insn *forked_insn;
3040 rtx_insn *join_insn;
3041
3042 rtx_insn *fork_insn;
3043 rtx_insn *joining_insn;
3044
3045 /* Basic blocks in this parallel, but not in child parallels. The
3046 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3047 blocks are not. */
3048 auto_vec<basic_block> blocks;
3049
3050 public:
3051 parallel (parallel *parent, unsigned mode);
3052 ~parallel ();
3053 };
3054
3055 /* Constructor links the new parallel into its parent's chain of
3056 children. */
3057
3058 parallel::parallel (parallel *parent_, unsigned mask_)
3059 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3060 {
3061 forked_block = join_block = 0;
3062 forked_insn = join_insn = 0;
3063 fork_insn = joining_insn = 0;
3064
3065 if (parent)
3066 {
3067 next = parent->inner;
3068 parent->inner = this;
3069 }
3070 }
3071
3072 parallel::~parallel ()
3073 {
3074 delete inner;
3075 delete next;
3076 }
3077
3078 /* Map of basic blocks to insns */
3079 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3080
3081 /* A tuple of an insn of interest and the BB in which it resides. */
3082 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3083 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3084
3085 /* Split basic blocks such that each forked and join unspec is at
3086 the start of its basic block. Thus afterwards each block will
3087 have a single partitioning mode. We also do the same for return
3088 insns, as they are executed by every thread. Populate MAP with head
3089 and tail blocks, for use when the partitioning structure is
3090 discovered later by walking the split blocks. We also clear the BB
3091 visited flag, which is used when finding partitions. */
3092
3093 static void
3094 nvptx_split_blocks (bb_insn_map_t *map)
3095 {
3096 insn_bb_vec_t worklist;
3097 basic_block block;
3098 rtx_insn *insn;
3099
3100 /* Locate all the reorg instructions of interest. */
3101 FOR_ALL_BB_FN (block, cfun)
3102 {
3103 bool seen_insn = false;
3104
3105 /* Clear visited flag, for use by parallel locator */
3106 block->flags &= ~BB_VISITED;
3107
3108 FOR_BB_INSNS (block, insn)
3109 {
3110 if (!INSN_P (insn))
3111 continue;
3112 switch (recog_memoized (insn))
3113 {
3114 default:
3115 seen_insn = true;
3116 continue;
3117 case CODE_FOR_nvptx_forked:
3118 case CODE_FOR_nvptx_join:
3119 break;
3120
3121 case CODE_FOR_return:
3122 /* We also need to split just before return insns, as
3123 that insn needs executing by all threads, but the
3124 block it is in probably does not. */
3125 break;
3126 }
3127
3128 if (seen_insn)
3129 /* We've found an instruction that must be at the start of
3130 a block, but isn't. Add it to the worklist. */
3131 worklist.safe_push (insn_bb_t (insn, block));
3132 else
3133 /* It was already the first instruction. Just add it to
3134 the map. */
3135 map->get_or_insert (block) = insn;
3136 seen_insn = true;
3137 }
3138 }
3139
3140 /* Split blocks on the worklist. */
3141 unsigned ix;
3142 insn_bb_t *elt;
3143 basic_block remap = 0;
3144 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3145 {
3146 if (remap != elt->second)
3147 {
3148 block = elt->second;
3149 remap = block;
3150 }
3151
3152 /* Split block before insn. The insn is in the new block. */
3153 edge e = split_block (block, PREV_INSN (elt->first));
3154
3155 block = e->dest;
3156 map->get_or_insert (block) = elt->first;
3157 }
3158 }
3159
3160 /* Return true if MASK contains parallelism that requires shared
3161 memory to broadcast. */
3162
3163 static bool
3164 nvptx_needs_shared_bcast (unsigned mask)
3165 {
3166 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3167 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3168 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3169
3170 return worker || large_vector;
3171 }
3172
3173 /* BLOCK is a basic block containing a head or tail instruction.
3174 Locate the associated prehead or pretail instruction, which must be
3175 in the single predecessor block. */
3176
3177 static rtx_insn *
3178 nvptx_discover_pre (basic_block block, int expected)
3179 {
3180 gcc_assert (block->preds->length () == 1);
3181 basic_block pre_block = (*block->preds)[0]->src;
3182 rtx_insn *pre_insn;
3183
3184 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3185 pre_insn = PREV_INSN (pre_insn))
3186 gcc_assert (pre_insn != BB_HEAD (pre_block));
3187
3188 gcc_assert (recog_memoized (pre_insn) == expected);
3189 return pre_insn;
3190 }
3191
3192 /* Dump this parallel and all its inner parallels. */
3193
3194 static void
3195 nvptx_dump_pars (parallel *par, unsigned depth)
3196 {
3197 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3198 depth, par->mask,
3199 par->forked_block ? par->forked_block->index : -1,
3200 par->join_block ? par->join_block->index : -1);
3201
3202 fprintf (dump_file, " blocks:");
3203
3204 basic_block block;
3205 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3206 fprintf (dump_file, " %d", block->index);
3207 fprintf (dump_file, "\n");
3208 if (par->inner)
3209 nvptx_dump_pars (par->inner, depth + 1);
3210
3211 if (par->next)
3212 nvptx_dump_pars (par->next, depth);
3213 }
3214
3215 /* If BLOCK contains a fork/join marker, process it to create or
3216 terminate a loop structure. Add this block to the current loop,
3217 and then walk successor blocks. */
3218
3219 static parallel *
3220 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3221 {
3222 if (block->flags & BB_VISITED)
3223 return par;
3224 block->flags |= BB_VISITED;
3225
3226 if (rtx_insn **endp = map->get (block))
3227 {
3228 rtx_insn *end = *endp;
3229
3230 /* This is a block head or tail, or return instruction. */
3231 switch (recog_memoized (end))
3232 {
3233 case CODE_FOR_return:
3234 /* Return instructions are in their own block, and we
3235 don't need to do anything more. */
3236 return par;
3237
3238 case CODE_FOR_nvptx_forked:
3239 /* Loop head, create a new inner loop and add it into
3240 our parent's child list. */
3241 {
3242 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3243
3244 gcc_assert (mask);
3245 par = new parallel (par, mask);
3246 par->forked_block = block;
3247 par->forked_insn = end;
3248 if (nvptx_needs_shared_bcast (mask))
3249 par->fork_insn
3250 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3251 }
3252 break;
3253
3254 case CODE_FOR_nvptx_join:
3255 /* A loop tail. Finish the current loop and return to
3256 parent. */
3257 {
3258 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3259
3260 gcc_assert (par->mask == mask);
3261 gcc_assert (par->join_block == NULL);
3262 par->join_block = block;
3263 par->join_insn = end;
3264 if (nvptx_needs_shared_bcast (mask))
3265 par->joining_insn
3266 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3267 par = par->parent;
3268 }
3269 break;
3270
3271 default:
3272 gcc_unreachable ();
3273 }
3274 }
3275
3276 if (par)
3277 /* Add this block onto the current loop's list of blocks. */
3278 par->blocks.safe_push (block);
3279 else
3280 /* This must be the entry block. Create a NULL parallel. */
3281 par = new parallel (0, 0);
3282
3283 /* Walk successor blocks. */
3284 edge e;
3285 edge_iterator ei;
3286
3287 FOR_EACH_EDGE (e, ei, block->succs)
3288 nvptx_find_par (map, par, e->dest);
3289
3290 return par;
3291 }
3292
3293 /* DFS walk the CFG looking for fork & join markers. Construct
3294 loop structures as we go. MAP is a mapping of basic blocks
3295 to head & tail markers, discovered when splitting blocks. This
3296 speeds up the discovery. We rely on the BB visited flag having
3297 been cleared when splitting blocks. */
3298
3299 static parallel *
3300 nvptx_discover_pars (bb_insn_map_t *map)
3301 {
3302 basic_block block;
3303
3304 /* Mark exit blocks as visited. */
3305 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3306 block->flags |= BB_VISITED;
3307
3308 /* And entry block as not. */
3309 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3310 block->flags &= ~BB_VISITED;
3311
3312 parallel *par = nvptx_find_par (map, 0, block);
3313
3314 if (dump_file)
3315 {
3316 fprintf (dump_file, "\nLoops\n");
3317 nvptx_dump_pars (par, 0);
3318 fprintf (dump_file, "\n");
3319 }
3320
3321 return par;
3322 }
3323
3324 /* Analyse a group of BBs within a partitioned region and create N
3325 Single-Entry-Single-Exit regions. Some of those regions will be
3326 trivial ones consisting of a single BB. The blocks of a
3327 partitioned region might form a set of disjoint graphs -- because
3328 the region encloses a differently partitioned sub region.
3329
3330 We use the linear time algorithm described in 'Finding Regions Fast:
3331 Single Entry Single Exit and control Regions in Linear Time'
3332 Johnson, Pearson & Pingali. That algorithm deals with complete
3333 CFGs, where a back edge is inserted from END to START, and thus the
3334 problem becomes one of finding equivalent loops.
3335
3336 In this case we have a partial CFG. We complete it by redirecting
3337 any incoming edge to the graph to be from an arbitrary external BB,
3338 and similarly redirecting any outgoing edge to be to that BB.
3339 Thus we end up with a closed graph.
3340
3341 The algorithm works by building a spanning tree of an undirected
3342 graph and keeping track of back edges from nodes further from the
3343 root in the tree to nodes nearer to the root in the tree. In the
3344 description below, the root is up and the tree grows downwards.
3345
3346 We avoid having to deal with degenerate back-edges to the same
3347 block, by splitting each BB into 3 -- one for input edges, one for
3348 the node itself and one for the output edges. Such back edges are
3349 referred to as 'Brackets'. Cycle equivalent nodes will have the
3350 same set of brackets.
3351
3352 Determining bracket equivalency is done by maintaining a list of
3353 brackets in such a manner that the list length and final bracket
3354 uniquely identify the set.
3355
3356 We use coloring to mark all BBs with cycle equivalency with the
3357 same color. This is the output of the 'Finding Regions Fast'
3358 algorithm. Notice it doesn't actually find the set of nodes within
3359 a particular region, just unordered sets of nodes that are the
3360 entries and exits of SESE regions.
3361
3362 After determining cycle equivalency, we need to find the minimal
3363 set of SESE regions. Do this with a DFS coloring walk of the
3364 complete graph. We're either 'looking' or 'coloring'. When
3365 looking, and we're in the subgraph, we start coloring the color of
3366 the current node, and remember that node as the start of the
3367 current color's SESE region. Every time we go to a new node, we
3368 decrement the count of nodes with that color. If it reaches zero,
3369 we remember that node as the end of the current color's SESE region
3370 and return to 'looking'. Otherwise we color the node the current
3371 color.
3372
3373 This way we end up with coloring the inside of non-trivial SESE
3374 regions with the color of that region. */
3375
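/* As a small illustrative example (not part of the original comment):
   in a diamond CFG A -> {B, C} -> D with no other edges, A and D end
   up with identical bracket sets, hence the same color, and the DFS
   coloring walk records (A, D) as one non-trivial SESE region, while
   B and C remain trivial single-block regions.  */
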
3376 /* A pair of BBs. We use this to represent SESE regions. */
3377 typedef std::pair<basic_block, basic_block> bb_pair_t;
3378 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3379
3380 /* A node in the undirected CFG. The discriminator SECOND indicates just
3381 above or just below the BB indicated by FIRST. */
3382 typedef std::pair<basic_block, int> pseudo_node_t;
3383
3384 /* A bracket indicates an edge towards the root of the spanning tree of the
3385 undirected graph. Each bracket has a color, determined
3386 from the current set of brackets. */
3387 struct bracket
3388 {
3389 pseudo_node_t back; /* Back target */
3390
3391 /* Current color and size of set. */
3392 unsigned color;
3393 unsigned size;
3394
3395 bracket (pseudo_node_t back_)
3396 : back (back_), color (~0u), size (~0u)
3397 {
3398 }
3399
3400 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3401 {
3402 if (length != size)
3403 {
3404 size = length;
3405 color = color_counts.length ();
3406 color_counts.quick_push (0);
3407 }
3408 color_counts[color]++;
3409 return color;
3410 }
3411 };
3412
3413 typedef auto_vec<bracket> bracket_vec_t;
3414
3415 /* Basic block info for finding SESE regions. */
3416
3417 struct bb_sese
3418 {
3419 int node; /* Node number in spanning tree. */
3420 int parent; /* Parent node number. */
3421
3422 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3423 edges arrive at pseudo-node Ai and the outgoing edges leave at
3424 pseudo-node Ao. We have to remember which way we arrived at a
3425 particular node when generating the spanning tree. dir > 0 means
3426 we arrived at Ai, dir < 0 means we arrived at Ao. */
3427 int dir;
3428
3429 /* Lowest numbered pseudo-node reached via a backedge from this
3430 node, or any descendant. */
3431 pseudo_node_t high;
3432
3433 int color; /* Cycle-equivalence color */
3434
3435 /* Stack of brackets for this node. */
3436 bracket_vec_t brackets;
3437
3438 bb_sese (unsigned node_, unsigned p, int dir_)
3439 :node (node_), parent (p), dir (dir_)
3440 {
3441 }
3442 ~bb_sese ();
3443
3444 /* Push a bracket ending at BACK. */
3445 void push (const pseudo_node_t &back)
3446 {
3447 if (dump_file)
3448 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3449 back.first ? back.first->index : 0, back.second);
3450 brackets.safe_push (bracket (back));
3451 }
3452
3453 void append (bb_sese *child);
3454 void remove (const pseudo_node_t &);
3455
3456 /* Set node's color. */
3457 void set_color (auto_vec<unsigned> &color_counts)
3458 {
3459 color = brackets.last ().get_color (color_counts, brackets.length ());
3460 }
3461 };
3462
3463 bb_sese::~bb_sese ()
3464 {
3465 }
3466
3467 /* Destructively append CHILD's brackets. */
3468
3469 void
3470 bb_sese::append (bb_sese *child)
3471 {
3472 if (int len = child->brackets.length ())
3473 {
3474 int ix;
3475
3476 if (dump_file)
3477 {
3478 for (ix = 0; ix < len; ix++)
3479 {
3480 const pseudo_node_t &pseudo = child->brackets[ix].back;
3481 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3482 child->node, pseudo.first ? pseudo.first->index : 0,
3483 pseudo.second);
3484 }
3485 }
3486 if (!brackets.length ())
3487 std::swap (brackets, child->brackets);
3488 else
3489 {
3490 brackets.reserve (len);
3491 for (ix = 0; ix < len; ix++)
3492 brackets.quick_push (child->brackets[ix]);
3493 }
3494 }
3495 }
3496
3497 /* Remove brackets that terminate at PSEUDO. */
3498
3499 void
3500 bb_sese::remove (const pseudo_node_t &pseudo)
3501 {
3502 unsigned removed = 0;
3503 int len = brackets.length ();
3504
3505 for (int ix = 0; ix < len; ix++)
3506 {
3507 if (brackets[ix].back == pseudo)
3508 {
3509 if (dump_file)
3510 fprintf (dump_file, "Removing backedge %d:%+d\n",
3511 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3512 removed++;
3513 }
3514 else if (removed)
3515 brackets[ix-removed] = brackets[ix];
3516 }
3517 while (removed--)
3518 brackets.pop ();
3519 }
3520
3521 /* Accessors for BB's aux pointer. */
3522 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3523 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3524
3525 /* DFS walk creating SESE data structures. Only cover nodes with
3526 BB_VISITED set. Append discovered blocks to LIST. We number in
3527 increments of 3 so that the above and below pseudo nodes can be
3528 implicitly numbered too. */
3529
3530 static int
3531 nvptx_sese_number (int n, int p, int dir, basic_block b,
3532 auto_vec<basic_block> *list)
3533 {
3534 if (BB_GET_SESE (b))
3535 return n;
3536
3537 if (dump_file)
3538 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3539 b->index, n, p, dir);
3540
3541 BB_SET_SESE (b, new bb_sese (n, p, dir));
3542 p = n;
3543
3544 n += 3;
3545 list->quick_push (b);
3546
3547 /* First walk the nodes on the 'other side' of this node, then walk
3548 the nodes on the same side. */
3549 for (unsigned ix = 2; ix; ix--)
3550 {
3551 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3552 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3553 : offsetof (edge_def, src));
3554 edge e;
3555 edge_iterator ei;
3556
3557 FOR_EACH_EDGE (e, ei, edges)
3558 {
3559 basic_block target = *(basic_block *)((char *)e + offset);
3560
3561 if (target->flags & BB_VISITED)
3562 n = nvptx_sese_number (n, p, dir, target, list);
3563 }
3564 dir = -dir;
3565 }
3566 return n;
3567 }
3568
3569 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3570 EDGES are the outgoing edges and OFFSET is the offset to the src
3571 or dst block on the edges. */
3572
3573 static void
3574 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3575 vec<edge, va_gc> *edges, size_t offset)
3576 {
3577 edge e;
3578 edge_iterator ei;
3579 int hi_back = depth;
3580 pseudo_node_t node_back (0, depth);
3581 int hi_child = depth;
3582 pseudo_node_t node_child (0, depth);
3583 basic_block child = NULL;
3584 unsigned num_children = 0;
3585 int usd = -dir * sese->dir;
3586
3587 if (dump_file)
3588 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3589 me->index, sese->node, dir);
3590
3591 if (dir < 0)
3592 {
3593 /* This is the above pseudo-child. It has the BB itself as an
3594 additional child node. */
3595 node_child = sese->high;
3596 hi_child = node_child.second;
3597 if (node_child.first)
3598 hi_child += BB_GET_SESE (node_child.first)->node;
3599 num_children++;
3600 }
3601
3602 /* Examine each edge.
3603 - if it is a child (a) append its bracket list and (b) record
3604 whether it is the child with the highest reaching bracket.
3605 - if it is an edge to ancestor, record whether it's the highest
3606 reaching backlink. */
3607 FOR_EACH_EDGE (e, ei, edges)
3608 {
3609 basic_block target = *(basic_block *)((char *)e + offset);
3610
3611 if (bb_sese *t_sese = BB_GET_SESE (target))
3612 {
3613 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3614 {
3615 /* Child node. Append its bracket list. */
3616 num_children++;
3617 sese->append (t_sese);
3618
3619 /* Compare its hi value. */
3620 int t_hi = t_sese->high.second;
3621
3622 if (basic_block child_hi_block = t_sese->high.first)
3623 t_hi += BB_GET_SESE (child_hi_block)->node;
3624
3625 if (hi_child > t_hi)
3626 {
3627 hi_child = t_hi;
3628 node_child = t_sese->high;
3629 child = target;
3630 }
3631 }
3632 else if (t_sese->node < sese->node + dir
3633 && !(dir < 0 && sese->parent == t_sese->node))
3634 {
3635 /* Non-parental ancestor node -- a backlink. */
3636 int d = usd * t_sese->dir;
3637 int back = t_sese->node + d;
3638
3639 if (hi_back > back)
3640 {
3641 hi_back = back;
3642 node_back = pseudo_node_t (target, d);
3643 }
3644 }
3645 }
3646 else
3647 { /* Fallen off graph, backlink to entry node. */
3648 hi_back = 0;
3649 node_back = pseudo_node_t (0, 0);
3650 }
3651 }
3652
3653 /* Remove any brackets that terminate at this pseudo node. */
3654 sese->remove (pseudo_node_t (me, dir));
3655
3656 /* Now push any backlinks from this pseudo node. */
3657 FOR_EACH_EDGE (e, ei, edges)
3658 {
3659 basic_block target = *(basic_block *)((char *)e + offset);
3660 if (bb_sese *t_sese = BB_GET_SESE (target))
3661 {
3662 if (t_sese->node < sese->node + dir
3663 && !(dir < 0 && sese->parent == t_sese->node))
3664 /* Non-parental ancestor node - backedge from me. */
3665 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3666 }
3667 else
3668 {
3669 /* Back edge to entry node. */
3670 sese->push (pseudo_node_t (0, 0));
3671 }
3672 }
3673
3674 /* If this node leads directly or indirectly to a no-return region of
3675 the graph, then fake a backedge to entry node. */
3676 if (!sese->brackets.length () || !edges || !edges->length ())
3677 {
3678 hi_back = 0;
3679 node_back = pseudo_node_t (0, 0);
3680 sese->push (node_back);
3681 }
3682
3683 /* Record the highest reaching backedge from us or a descendant. */
3684 sese->high = hi_back < hi_child ? node_back : node_child;
3685
3686 if (num_children > 1)
3687 {
3688 /* There is more than one child -- this is a Y shaped piece of
3689 spanning tree. We have to insert a fake backedge from this
3690 node to the highest ancestor reached by not-the-highest
3691 reaching child. Note that there may be multiple children
3692 with backedges to the same highest node. That's ok and we
3693 insert the edge to that highest node. */
3694 hi_child = depth;
3695 if (dir < 0 && child)
3696 {
3697 node_child = sese->high;
3698 hi_child = node_child.second;
3699 if (node_child.first)
3700 hi_child += BB_GET_SESE (node_child.first)->node;
3701 }
3702
3703 FOR_EACH_EDGE (e, ei, edges)
3704 {
3705 basic_block target = *(basic_block *)((char *)e + offset);
3706
3707 if (target == child)
3708 /* Ignore the highest child. */
3709 continue;
3710
3711 bb_sese *t_sese = BB_GET_SESE (target);
3712 if (!t_sese)
3713 continue;
3714 if (t_sese->parent != sese->node)
3715 /* Not a child. */
3716 continue;
3717
3718 /* Compare its hi value. */
3719 int t_hi = t_sese->high.second;
3720
3721 if (basic_block child_hi_block = t_sese->high.first)
3722 t_hi += BB_GET_SESE (child_hi_block)->node;
3723
3724 if (hi_child > t_hi)
3725 {
3726 hi_child = t_hi;
3727 node_child = t_sese->high;
3728 }
3729 }
3730
3731 sese->push (node_child);
3732 }
3733 }
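/* To make the bracket-set idea above concrete, here is an illustrative
   sketch (an example, not part of the algorithm's specification): in a
   simple diamond CFG

	   A
	  / \
	 B   C
	  \ /
	   D

   the spanning-tree walk leaves A and D with the same set of open
   brackets, so they receive the same color and bound a single-entry
   single-exit region A..D, while B and C each end up in trivial
   regions of their own.  */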
3734
3735
3736 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3737 proceed to successors. Set SESE entry and exit nodes of
3738 REGIONS. */
3739
3740 static void
3741 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3742 basic_block block, int coloring)
3743 {
3744 bb_sese *sese = BB_GET_SESE (block);
3745
3746 if (block->flags & BB_VISITED)
3747 {
3748 /* If we've already encountered this block, either we must not
3749 be coloring, or it must have been colored the current color. */
3750 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3751 return;
3752 }
3753
3754 block->flags |= BB_VISITED;
3755
3756 if (sese)
3757 {
3758 if (coloring < 0)
3759 {
3760 /* Start coloring a region. */
3761 regions[sese->color].first = block;
3762 coloring = sese->color;
3763 }
3764
3765 if (!--color_counts[sese->color] && sese->color == coloring)
3766 {
3767 /* Found final block of SESE region. */
3768 regions[sese->color].second = block;
3769 coloring = -1;
3770 }
3771 else
3772 /* Color the node, so we can assert on revisiting the node
3773 that the graph is indeed SESE. */
3774 sese->color = coloring;
3775 }
3776 else
3777 /* Fallen off the subgraph, we cannot be coloring. */
3778 gcc_assert (coloring < 0);
3779
3780 /* Walk each successor block. */
3781 if (block->succs && block->succs->length ())
3782 {
3783 edge e;
3784 edge_iterator ei;
3785
3786 FOR_EACH_EDGE (e, ei, block->succs)
3787 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3788 }
3789 else
3790 gcc_assert (coloring < 0);
3791 }
3792
3793 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3794 end up with NULL entries in it. */
3795
3796 static void
3797 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3798 {
3799 basic_block block;
3800 int ix;
3801
3802 /* First clear each BB of the whole function. */
3803 FOR_ALL_BB_FN (block, cfun)
3804 {
3805 block->flags &= ~BB_VISITED;
3806 BB_SET_SESE (block, 0);
3807 }
3808
3809 /* Mark blocks in the function that are in this graph. */
3810 for (ix = 0; blocks.iterate (ix, &block); ix++)
3811 block->flags |= BB_VISITED;
3812
3813 /* Counts of nodes assigned to each color. There cannot be more
3814 colors than blocks (and hopefully there will be fewer). */
3815 auto_vec<unsigned> color_counts;
3816 color_counts.reserve (blocks.length ());
3817
3818 /* Worklist of nodes in the spanning tree. Again, there cannot be
3819 more nodes in the tree than blocks (there will be fewer if the
3820 CFG of blocks is disjoint). */
3821 auto_vec<basic_block> spanlist;
3822 spanlist.reserve (blocks.length ());
3823
3824 /* Make sure every block has its cycle class determined. */
3825 for (ix = 0; blocks.iterate (ix, &block); ix++)
3826 {
3827 if (BB_GET_SESE (block))
3828 /* We already met this block in an earlier graph solve. */
3829 continue;
3830
3831 if (dump_file)
3832 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3833
3834 /* Number the nodes reachable from block in initial DFS order. */
3835 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3836
3837 /* Now walk in reverse DFS order to find cycle equivalents. */
3838 while (spanlist.length ())
3839 {
3840 block = spanlist.pop ();
3841 bb_sese *sese = BB_GET_SESE (block);
3842
3843 /* Do the pseudo node below. */
3844 nvptx_sese_pseudo (block, sese, depth, +1,
3845 sese->dir > 0 ? block->succs : block->preds,
3846 (sese->dir > 0 ? offsetof (edge_def, dest)
3847 : offsetof (edge_def, src)));
3848 sese->set_color (color_counts);
3849 /* Do the pseudo node above. */
3850 nvptx_sese_pseudo (block, sese, depth, -1,
3851 sese->dir < 0 ? block->succs : block->preds,
3852 (sese->dir < 0 ? offsetof (edge_def, dest)
3853 : offsetof (edge_def, src)));
3854 }
3855 if (dump_file)
3856 fprintf (dump_file, "\n");
3857 }
3858
3859 if (dump_file)
3860 {
3861 unsigned count;
3862 const char *comma = "";
3863
3864 fprintf (dump_file, "Found %d cycle equivalents\n",
3865 color_counts.length ());
3866 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3867 {
3868 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3869
3870 comma = "";
3871 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3872 if (BB_GET_SESE (block)->color == ix)
3873 {
3874 block->flags |= BB_VISITED;
3875 fprintf (dump_file, "%s%d", comma, block->index);
3876 comma=",";
3877 }
3878 fprintf (dump_file, "}");
3879 comma = ", ";
3880 }
3881 fprintf (dump_file, "\n");
3882 }
3883
3884 /* Now we've colored every block in the subgraph. We now need to
3885 determine the minimal set of SESE regions that cover that
3886 subgraph. Do this with a DFS walk of the complete function.
3887 During the walk we're either 'looking' or 'coloring'. When we
3888 reach the last node of a particular color, we stop coloring and
3889 return to looking. */
3890
3891 /* There cannot be more SESE regions than colors. */
3892 regions.reserve (color_counts.length ());
3893 for (ix = color_counts.length (); ix--;)
3894 regions.quick_push (bb_pair_t (0, 0));
3895
3896 for (ix = 0; blocks.iterate (ix, &block); ix++)
3897 block->flags &= ~BB_VISITED;
3898
3899 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3900
3901 if (dump_file)
3902 {
3903 const char *comma = "";
3904 int len = regions.length ();
3905
3906 fprintf (dump_file, "SESE regions:");
3907 for (ix = 0; ix != len; ix++)
3908 {
3909 basic_block from = regions[ix].first;
3910 basic_block to = regions[ix].second;
3911
3912 if (from)
3913 {
3914 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3915 if (to != from)
3916 fprintf (dump_file, "->%d", to->index);
3917
3918 int color = BB_GET_SESE (from)->color;
3919
3920 /* Print the blocks within the region (excluding ends). */
3921 FOR_EACH_BB_FN (block, cfun)
3922 {
3923 bb_sese *sese = BB_GET_SESE (block);
3924
3925 if (sese && sese->color == color
3926 && block != from && block != to)
3927 fprintf (dump_file, ".%d", block->index);
3928 }
3929 fprintf (dump_file, "}");
3930 }
3931 comma = ",";
3932 }
3933 fprintf (dump_file, "\n\n");
3934 }
3935
3936 for (ix = 0; blocks.iterate (ix, &block); ix++)
3937 delete BB_GET_SESE (block);
3938 }
3939
3940 #undef BB_SET_SESE
3941 #undef BB_GET_SESE
3942
3943 /* Propagate live state at the start of a partitioned region. IS_CALL
3944 indicates whether the propagation is for a (partitioned) call
3945 instruction. BLOCK provides the live register information, and
3946 might not contain INSN. Propagation is inserted just after INSN. RW
3947 indicates whether we are reading and/or writing state. This
3948 separation is needed for worker-level propagation where we
3949 essentially do a spill & fill. FN is the underlying worker
3950 function to generate the propagation instructions for a single
3951 register. DATA is user data.
3952
3953 Returns true if we didn't emit any instructions.
3954
3955 We propagate the live register set for non-calls and the entire
3956 frame for calls and non-calls. We could do better by (a)
3957 propagating just the live set that is used within the partitioned
3958 regions and (b) only propagating stack entries that are used. The
3959 latter might be quite hard to determine. */
3960
3961 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
3962
3963 static bool
3964 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3965 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3966 {
3967 bitmap live = DF_LIVE_IN (block);
3968 bitmap_iterator iterator;
3969 unsigned ix;
3970 bool empty = true;
3971
3972 /* Copy the frame array. */
3973 HOST_WIDE_INT fs = get_frame_size ();
3974 if (fs)
3975 {
3976 rtx tmp = gen_reg_rtx (DImode);
3977 rtx idx = NULL_RTX;
3978 rtx ptr = gen_reg_rtx (Pmode);
3979 rtx pred = NULL_RTX;
3980 rtx_code_label *label = NULL;
3981
3982 empty = false;
3983 /* The frame size might not be DImode compatible, but the frame
3984 array's declaration will be. So it's ok to round up here. */
3985 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3986 /* Detect single iteration loop. */
3987 if (fs == 1)
3988 fs = 0;
3989
3990 start_sequence ();
3991 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3992 if (fs)
3993 {
3994 idx = gen_reg_rtx (SImode);
3995 pred = gen_reg_rtx (BImode);
3996 label = gen_label_rtx ();
3997
3998 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3999 /* Allow worker function to initialize anything needed. */
4000 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4001 if (init)
4002 emit_insn (init);
4003 emit_label (label);
4004 LABEL_NUSES (label)++;
4005 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4006 }
4007 if (rw & PM_read)
4008 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4009 emit_insn (fn (tmp, rw, fs, data, vector));
4010 if (rw & PM_write)
4011 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4012 if (fs)
4013 {
4014 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4015 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4016 emit_insn (gen_br_true_uni (pred, label));
4017 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4018 if (fini)
4019 emit_insn (fini);
4020 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4021 }
4022 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4023 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4024 rtx cpy = get_insns ();
4025 end_sequence ();
4026 insn = emit_insn_after (cpy, insn);
4027 }
4028
4029 if (!is_call)
4030 /* Copy live registers. */
4031 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4032 {
4033 rtx reg = regno_reg_rtx[ix];
4034
4035 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4036 {
4037 rtx bcast = fn (reg, rw, 0, data, vector);
4038
4039 insn = emit_insn_after (bcast, insn);
4040 empty = false;
4041 }
4042 }
4043 return empty;
4044 }
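/* For reference, an illustrative summary (not a separate interface) of how
   nvptx_propagate drives its propagator_fn callback, as can be seen above:
     - fn (tmp, PM_loop_begin, fs, data, vector) once before the frame-copy
       loop, to emit any setup (may return NULL);
     - fn (tmp, rw, fs, data, vector) in the loop body, for each DImode
       element of the frame;
     - fn (tmp, PM_loop_end, fs, data, vector) once after the loop;
     - fn (reg, rw, 0, data, vector) once per live pseudo register (for
       non-calls only).
   warp_prop_gen and shared_prop_gen below are the two implementations.  */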
4045
4046 /* Worker for nvptx_warp_propagate. */
4047
4048 static rtx
4049 warp_prop_gen (rtx reg, propagate_mask pm,
4050 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4051 bool ARG_UNUSED (vector))
4052 {
4053 if (!(pm & PM_read_write))
4054 return 0;
4055
4056 return nvptx_gen_warp_bcast (reg);
4057 }
4058
4059 /* Propagate state that is live at start of BLOCK across the vectors
4060 of a single warp. Propagation is inserted just after INSN.
4061 IS_CALL and return as for nvptx_propagate. */
4062
4063 static bool
4064 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4065 {
4066 return nvptx_propagate (is_call, block, insn, PM_read_write,
4067 warp_prop_gen, 0, false);
4068 }
4069
4070 /* Worker for nvptx_shared_propagate. */
4071
4072 static rtx
4073 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4074 bool vector)
4075 {
4076 broadcast_data_t *data = (broadcast_data_t *)data_;
4077
4078 if (pm & PM_loop_begin)
4079 {
4080 /* Starting a loop, initialize pointer. */
4081 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4082
4083 oacc_bcast_align = MAX (oacc_bcast_align, align);
4084 data->offset = ROUND_UP (data->offset, align);
4085
4086 data->ptr = gen_reg_rtx (Pmode);
4087
4088 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4089 }
4090 else if (pm & PM_loop_end)
4091 {
4092 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4093 data->ptr = NULL_RTX;
4094 return clobber;
4095 }
4096 else
4097 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4098 }
4099
4100 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4101 indicates if this is just before partitioned mode (do spill), or
4102 just after it starts (do fill). Sequence is inserted just after
4103 INSN. IS_CALL and return as for nvptx_propagate. */
4104
4105 static bool
4106 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4107 rtx_insn *insn, bool vector)
4108 {
4109 broadcast_data_t data;
4110
4111 data.base = gen_reg_rtx (Pmode);
4112 data.offset = 0;
4113 data.ptr = NULL_RTX;
4114
4115 bool empty = nvptx_propagate (is_call, block, insn,
4116 pre_p ? PM_read : PM_write, shared_prop_gen,
4117 &data, vector);
4118 gcc_assert (empty == !data.offset);
4119 if (data.offset)
4120 {
4121 rtx bcast_sym = oacc_bcast_sym;
4122
4123 /* Stuff was emitted, initialize the base pointer now. */
4124 if (vector && nvptx_mach_max_workers () > 1)
4125 {
4126 if (!cfun->machine->bcast_partition)
4127 {
4128 /* It would be nice to place this register in
4129 DATA_AREA_SHARED. */
4130 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4131 }
4132 if (!cfun->machine->sync_bar)
4133 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4134
4135 bcast_sym = cfun->machine->bcast_partition;
4136 }
4137
4138 rtx init = gen_rtx_SET (data.base, bcast_sym);
4139 emit_insn_after (init, insn);
4140
4141 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4142 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4143 ? nvptx_mach_max_workers () + 1
4144 : 1);
4145
4146 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4147 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4148 }
4149 return empty;
4150 }
4151
4152 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4153 which is an integer or a register. THREADS is the number of threads
4154 controlled by the barrier. */
4155
4156 static rtx
4157 nvptx_cta_sync (rtx lock, int threads)
4158 {
4159 return gen_nvptx_barsync (lock, GEN_INT (threads));
4160 }
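/* Purely for illustration (the exact text comes from the nvptx_barsync
   pattern in nvptx.md): with LOCK 0 and THREADS 0 this amounts to a plain
   "bar.sync 0;" synchronizing the whole CTA, while a non-zero THREADS
   count yields "bar.sync <lock>, <threads>;", synchronizing only that many
   threads on the given named barrier.  */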
4161
4162 #if WORKAROUND_PTXJIT_BUG
4163 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4164 real insns. */
4165
4166 static rtx_insn *
4167 bb_first_real_insn (basic_block bb)
4168 {
4169 rtx_insn *insn;
4170
4171 /* Find first real insn of BB. */
4172 FOR_BB_INSNS (bb, insn)
4173 if (INSN_P (insn))
4174 return insn;
4175
4176 return 0;
4177 }
4178 #endif
4179
4180 /* Return true if INSN needs neutering. */
4181
4182 static bool
4183 needs_neutering_p (rtx_insn *insn)
4184 {
4185 if (!INSN_P (insn))
4186 return false;
4187
4188 switch (recog_memoized (insn))
4189 {
4190 case CODE_FOR_nvptx_fork:
4191 case CODE_FOR_nvptx_forked:
4192 case CODE_FOR_nvptx_joining:
4193 case CODE_FOR_nvptx_join:
4194 case CODE_FOR_nvptx_barsync:
4195 return false;
4196 default:
4197 return true;
4198 }
4199 }
4200
4201 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4202
4203 static bool
4204 verify_neutering_jumps (basic_block from,
4205 rtx_insn *vector_jump, rtx_insn *worker_jump,
4206 rtx_insn *vector_label, rtx_insn *worker_label)
4207 {
4208 basic_block bb = from;
4209 rtx_insn *insn = BB_HEAD (bb);
4210 bool seen_worker_jump = false;
4211 bool seen_vector_jump = false;
4212 bool seen_worker_label = false;
4213 bool seen_vector_label = false;
4214 bool worker_neutered = false;
4215 bool vector_neutered = false;
4216 while (true)
4217 {
4218 if (insn == worker_jump)
4219 {
4220 seen_worker_jump = true;
4221 worker_neutered = true;
4222 gcc_assert (!vector_neutered);
4223 }
4224 else if (insn == vector_jump)
4225 {
4226 seen_vector_jump = true;
4227 vector_neutered = true;
4228 }
4229 else if (insn == worker_label)
4230 {
4231 seen_worker_label = true;
4232 gcc_assert (worker_neutered);
4233 worker_neutered = false;
4234 }
4235 else if (insn == vector_label)
4236 {
4237 seen_vector_label = true;
4238 gcc_assert (vector_neutered);
4239 vector_neutered = false;
4240 }
4241 else if (INSN_P (insn))
4242 switch (recog_memoized (insn))
4243 {
4244 case CODE_FOR_nvptx_barsync:
4245 gcc_assert (!vector_neutered && !worker_neutered);
4246 break;
4247 default:
4248 break;
4249 }
4250
4251 if (insn != BB_END (bb))
4252 insn = NEXT_INSN (insn);
4253 else if (JUMP_P (insn) && single_succ_p (bb)
4254 && !seen_vector_jump && !seen_worker_jump)
4255 {
4256 bb = single_succ (bb);
4257 insn = BB_HEAD (bb);
4258 }
4259 else
4260 break;
4261 }
4262
4263 gcc_assert (!(vector_jump && !seen_vector_jump));
4264 gcc_assert (!(worker_jump && !seen_worker_jump));
4265
4266 if (seen_vector_label || seen_worker_label)
4267 {
4268 gcc_assert (!(vector_label && !seen_vector_label));
4269 gcc_assert (!(worker_label && !seen_worker_label));
4270
4271 return true;
4272 }
4273
4274 return false;
4275 }
4276
4277 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4278
4279 static void
4280 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4281 rtx_insn *worker_label)
4282 {
4283 basic_block bb = to;
4284 rtx_insn *insn = BB_END (bb);
4285 bool seen_worker_label = false;
4286 bool seen_vector_label = false;
4287 while (true)
4288 {
4289 if (insn == worker_label)
4290 {
4291 seen_worker_label = true;
4292 gcc_assert (!seen_vector_label);
4293 }
4294 else if (insn == vector_label)
4295 seen_vector_label = true;
4296 else if (INSN_P (insn))
4297 switch (recog_memoized (insn))
4298 {
4299 case CODE_FOR_nvptx_barsync:
4300 gcc_assert (!seen_vector_label && !seen_worker_label);
4301 break;
4302 }
4303
4304 if (insn != BB_HEAD (bb))
4305 insn = PREV_INSN (insn);
4306 else
4307 break;
4308 }
4309
4310 gcc_assert (!(vector_label && !seen_vector_label));
4311 gcc_assert (!(worker_label && !seen_worker_label));
4312 }
4313
4314 /* Single neutering according to MASK. FROM is the incoming block and
4315 TO is the outgoing block. These may be the same block. Insert at
4316 start of FROM:
4317
4318 if (tid.<axis>) goto end.
4319
4320 and insert before ending branch of TO (if there is such an insn):
4321
4322 end:
4323 <possibly-broadcast-cond>
4324 <branch>
4325
4326 We currently only use different FROM and TO when skipping an entire
4327 loop. We could do more if we detected superblocks. */
4328
4329 static void
4330 nvptx_single (unsigned mask, basic_block from, basic_block to)
4331 {
4332 rtx_insn *head = BB_HEAD (from);
4333 rtx_insn *tail = BB_END (to);
4334 unsigned skip_mask = mask;
4335
4336 while (true)
4337 {
4338 /* Find first insn of from block. */
4339 while (head != BB_END (from) && !needs_neutering_p (head))
4340 head = NEXT_INSN (head);
4341
4342 if (from == to)
4343 break;
4344
4345 if (!(JUMP_P (head) && single_succ_p (from)))
4346 break;
4347
4348 basic_block jump_target = single_succ (from);
4349 if (!single_pred_p (jump_target))
4350 break;
4351
4352 from = jump_target;
4353 head = BB_HEAD (from);
4354 }
4355
4356 /* Find last insn of TO block. */
4357 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4358 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4359 tail = PREV_INSN (tail);
4360
4361 /* Detect if tail is a branch. */
4362 rtx tail_branch = NULL_RTX;
4363 rtx cond_branch = NULL_RTX;
4364 if (tail && INSN_P (tail))
4365 {
4366 tail_branch = PATTERN (tail);
4367 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4368 tail_branch = NULL_RTX;
4369 else
4370 {
4371 cond_branch = SET_SRC (tail_branch);
4372 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4373 cond_branch = NULL_RTX;
4374 }
4375 }
4376
4377 if (tail == head)
4378 {
4379 /* If this is empty, do nothing. */
4380 if (!head || !needs_neutering_p (head))
4381 return;
4382
4383 if (cond_branch)
4384 {
4385 /* If we're only doing vector single, there's no need to
4386 emit skip code because we'll not insert anything. */
4387 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4388 skip_mask = 0;
4389 }
4390 else if (tail_branch)
4391 /* Block with only unconditional branch. Nothing to do. */
4392 return;
4393 }
4394
4395 /* Insert the vector test inside the worker test. */
4396 unsigned mode;
4397 rtx_insn *before = tail;
4398 rtx_insn *neuter_start = NULL;
4399 rtx_insn *worker_label = NULL, *vector_label = NULL;
4400 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4401 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4402 if (GOMP_DIM_MASK (mode) & skip_mask)
4403 {
4404 rtx_code_label *label = gen_label_rtx ();
4405 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4406 rtx_insn **mode_jump
4407 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4408 rtx_insn **mode_label
4409 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4410
4411 if (!pred)
4412 {
4413 pred = gen_reg_rtx (BImode);
4414 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4415 }
4416
4417 rtx br;
4418 if (mode == GOMP_DIM_VECTOR)
4419 br = gen_br_true (pred, label);
4420 else
4421 br = gen_br_true_uni (pred, label);
4422 if (neuter_start)
4423 neuter_start = emit_insn_after (br, neuter_start);
4424 else
4425 neuter_start = emit_insn_before (br, head);
4426 *mode_jump = neuter_start;
4427
4428 LABEL_NUSES (label)++;
4429 rtx_insn *label_insn;
4430 if (tail_branch)
4431 {
4432 label_insn = emit_label_before (label, before);
4433 before = label_insn;
4434 }
4435 else
4436 {
4437 label_insn = emit_label_after (label, tail);
4438 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4439 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4440 emit_insn_after (gen_exit (), label_insn);
4441 }
4442
4443 *mode_label = label_insn;
4444 }
4445
4446 /* Now deal with propagating the branch condition. */
4447 if (cond_branch)
4448 {
4449 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4450
4451 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4452 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4453 {
4454 /* Vector mode only, do a shuffle. */
4455 #if WORKAROUND_PTXJIT_BUG
4456 /* The branch condition %rcond is propagated like this:
4457
4458 {
4459 .reg .u32 %x;
4460 mov.u32 %x,%tid.x;
4461 setp.ne.u32 %rnotvzero,%x,0;
4462 }
4463
4464 @%rnotvzero bra Lskip;
4465 setp.<op>.<type> %rcond,op1,op2;
4466 Lskip:
4467 selp.u32 %rcondu32,1,0,%rcond;
4468 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4469 setp.ne.u32 %rcond,%rcondu32,0;
4470
4471 There seems to be a bug in the ptx JIT compiler (observed at driver
4472 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4473 unless %rcond is initialized to something before 'bra Lskip'. The
4474 bug is not observed with ptxas from cuda 8.0.61.
4475
4476 It is true that the code is non-trivial: at Lskip, %rcond is
4477 uninitialized in threads 1-31, and after the selp the same holds
4478 for %rcondu32. But shfl propagates the defined value in thread 0
4479 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4480 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4481
4482 There is nothing in the PTX spec to suggest that this is wrong, or
4483 to explain why the extra initialization is needed. So, we classify
4484 it as a JIT bug, and the extra initialization as workaround:
4485
4486 {
4487 .reg .u32 %x;
4488 mov.u32 %x,%tid.x;
4489 setp.ne.u32 %rnotvzero,%x,0;
4490 }
4491
4492 +.reg .pred %rcond2;
4493 +setp.eq.u32 %rcond2, 1, 0;
4494
4495 @%rnotvzero bra Lskip;
4496 setp.<op>.<type> %rcond,op1,op2;
4497 +mov.pred %rcond2, %rcond;
4498 Lskip:
4499 +mov.pred %rcond, %rcond2;
4500 selp.u32 %rcondu32,1,0,%rcond;
4501 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4502 setp.ne.u32 %rcond,%rcondu32,0;
4503 */
4504 rtx_insn *label = PREV_INSN (tail);
4505 gcc_assert (label && LABEL_P (label));
4506 rtx tmp = gen_reg_rtx (BImode);
4507 emit_insn_before (gen_movbi (tmp, const0_rtx),
4508 bb_first_real_insn (from));
4509 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4510 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4511 #endif
4512 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4513 }
4514 else
4515 {
4516 /* Includes worker mode, do spill & fill. By construction
4517 we should never have worker mode only. */
4518 broadcast_data_t data;
4519 unsigned size = GET_MODE_SIZE (SImode);
4520 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4521 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4522 rtx barrier = GEN_INT (0);
4523 int threads = 0;
4524
4525 data.base = oacc_bcast_sym;
4526 data.ptr = 0;
4527
4528 bool use_partitioning_p = (vector && !worker
4529 && nvptx_mach_max_workers () > 1
4530 && cfun->machine->bcast_partition);
4531 if (use_partitioning_p)
4532 {
4533 data.base = cfun->machine->bcast_partition;
4534 barrier = cfun->machine->sync_bar;
4535 threads = nvptx_mach_vector_length ();
4536 }
4537 gcc_assert (data.base != NULL);
4538 gcc_assert (barrier);
4539
4540 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4541 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4542 ? nvptx_mach_max_workers () + 1
4543 : 1);
4544
4545 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4546 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4547
4548 data.offset = 0;
4549 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4550 vector),
4551 before);
4552
4553 /* Barrier so other workers can see the write. */
4554 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4555 data.offset = 0;
4556 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4557 vector),
4558 tail);
4559 /* This barrier is needed to avoid worker zero clobbering
4560 the broadcast buffer before all the other workers have
4561 had a chance to read this instance of it. */
4562 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4563 }
4564
4565 extract_insn (tail);
4566 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4567 UNSPEC_BR_UNIFIED);
4568 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4569 }
4570
4571 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4572 vector_label, worker_label);
4573 if (!seen_label)
4574 verify_neutering_labels (to, vector_label, worker_label);
4575 }
4576
4577 /* PAR is a parallel that is being skipped in its entirety according to
4578 MASK. Treat this as skipping a superblock starting at forked
4579 and ending at joining. */
4580
4581 static void
4582 nvptx_skip_par (unsigned mask, parallel *par)
4583 {
4584 basic_block tail = par->join_block;
4585 gcc_assert (tail->preds->length () == 1);
4586
4587 basic_block pre_tail = (*tail->preds)[0]->src;
4588 gcc_assert (pre_tail->succs->length () == 1);
4589
4590 nvptx_single (mask, par->forked_block, pre_tail);
4591 }
4592
4593 /* If PAR has a single inner parallel and PAR itself only contains
4594 empty entry and exit blocks, swallow the inner PAR. */
4595
4596 static void
4597 nvptx_optimize_inner (parallel *par)
4598 {
4599 parallel *inner = par->inner;
4600
4601 /* We mustn't be the outer dummy par. */
4602 if (!par->mask)
4603 return;
4604
4605 /* We must have a single inner par. */
4606 if (!inner || inner->next)
4607 return;
4608
4609 /* We must only contain 2 blocks ourselves -- the head and tail of
4610 the inner par. */
4611 if (par->blocks.length () != 2)
4612 return;
4613
4614 /* Our partitioning must be disjoint from the inner par's. As we only
4615 have vector and worker partitioning, this is sufficient to guarantee
4616 the pars have adjacent partitioning. */
4617 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4618 /* This indicates malformed code generation. */
4619 return;
4620
4621 /* The outer forked insn should be immediately followed by the inner
4622 fork insn. */
4623 rtx_insn *forked = par->forked_insn;
4624 rtx_insn *fork = BB_END (par->forked_block);
4625
4626 if (NEXT_INSN (forked) != fork)
4627 return;
4628 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4629
4630 /* The outer joining insn must immediately follow the inner join
4631 insn. */
4632 rtx_insn *joining = par->joining_insn;
4633 rtx_insn *join = inner->join_insn;
4634 if (NEXT_INSN (join) != joining)
4635 return;
4636
4637 /* Preconditions met. Swallow the inner par. */
4638 if (dump_file)
4639 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4640 inner->mask, inner->forked_block->index,
4641 inner->join_block->index,
4642 par->mask, par->forked_block->index, par->join_block->index);
4643
4644 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4645
4646 par->blocks.reserve (inner->blocks.length ());
4647 while (inner->blocks.length ())
4648 par->blocks.quick_push (inner->blocks.pop ());
4649
4650 par->inner = inner->inner;
4651 inner->inner = NULL;
4652
4653 delete inner;
4654 }
4655
4656 /* Process the parallel PAR and all its contained
4657 parallels. We do everything but the neutering. Return mask of
4658 partitioned modes used within this parallel. */
4659
4660 static unsigned
4661 nvptx_process_pars (parallel *par)
4662 {
4663 if (nvptx_optimize)
4664 nvptx_optimize_inner (par);
4665
4666 unsigned inner_mask = par->mask;
4667
4668 /* Do the inner parallels first. */
4669 if (par->inner)
4670 {
4671 par->inner_mask = nvptx_process_pars (par->inner);
4672 inner_mask |= par->inner_mask;
4673 }
4674
4675 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4676 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4677 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4678 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4679
4680 if (worker || large_vector)
4681 {
4682 nvptx_shared_propagate (false, is_call, par->forked_block,
4683 par->forked_insn, !worker);
4684 bool no_prop_p
4685 = nvptx_shared_propagate (true, is_call, par->forked_block,
4686 par->fork_insn, !worker);
4687 bool empty_loop_p
4688 = !is_call && (NEXT_INSN (par->forked_insn)
4689 && NEXT_INSN (par->forked_insn) == par->joining_insn);
4690 rtx barrier = GEN_INT (0);
4691 int threads = 0;
4692
4693 if (!worker && cfun->machine->sync_bar)
4694 {
4695 barrier = cfun->machine->sync_bar;
4696 threads = nvptx_mach_vector_length ();
4697 }
4698
4699 if (no_prop_p && empty_loop_p)
4700 ;
4701 else if (no_prop_p && is_call)
4702 ;
4703 else
4704 {
4705 /* Insert begin and end synchronizations. */
4706 emit_insn_before (nvptx_cta_sync (barrier, threads),
4707 par->forked_insn);
4708 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4709 }
4710 }
4711 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4712 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4713
4714 /* Now do siblings. */
4715 if (par->next)
4716 inner_mask |= nvptx_process_pars (par->next);
4717 return inner_mask;
4718 }
4719
4720 /* Neuter the parallel described by PAR. We recurse in depth-first
4721 order. MODES are the partitioning of the execution and OUTER is
4722 the partitioning of the parallels we are contained in. */
4723
4724 static void
4725 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4726 {
4727 unsigned me = (par->mask
4728 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4729 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4730 unsigned skip_mask = 0, neuter_mask = 0;
4731
4732 if (par->inner)
4733 nvptx_neuter_pars (par->inner, modes, outer | me);
4734
4735 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4736 {
4737 if ((outer | me) & GOMP_DIM_MASK (mode))
4738 {} /* Mode is partitioned: no neutering. */
4739 else if (!(modes & GOMP_DIM_MASK (mode)))
4740 {} /* Mode is not used: nothing to do. */
4741 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4742 || !par->forked_insn)
4743 /* Partitioned in inner parallels, or we're not partitioned
4744 at all: neuter individual blocks. */
4745 neuter_mask |= GOMP_DIM_MASK (mode);
4746 else if (!par->parent || !par->parent->forked_insn
4747 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4748 /* Parent isn't a parallel, or already contains this partitioning:
4749 skip the parallel at this level. */
4750 skip_mask |= GOMP_DIM_MASK (mode);
4751 else
4752 {} /* Parent will skip this parallel itself. */
4753 }
4754
4755 if (neuter_mask)
4756 {
4757 int ix, len;
4758
4759 if (nvptx_optimize)
4760 {
4761 /* Neuter whole SESE regions. */
4762 bb_pair_vec_t regions;
4763
4764 nvptx_find_sese (par->blocks, regions);
4765 len = regions.length ();
4766 for (ix = 0; ix != len; ix++)
4767 {
4768 basic_block from = regions[ix].first;
4769 basic_block to = regions[ix].second;
4770
4771 if (from)
4772 nvptx_single (neuter_mask, from, to);
4773 else
4774 gcc_assert (!to);
4775 }
4776 }
4777 else
4778 {
4779 /* Neuter each BB individually. */
4780 len = par->blocks.length ();
4781 for (ix = 0; ix != len; ix++)
4782 {
4783 basic_block block = par->blocks[ix];
4784
4785 nvptx_single (neuter_mask, block, block);
4786 }
4787 }
4788 }
4789
4790 if (skip_mask)
4791 nvptx_skip_par (skip_mask, par);
4792
4793 if (par->next)
4794 nvptx_neuter_pars (par->next, modes, outer);
4795 }
4796
4797 static void
4798 populate_offload_attrs (offload_attrs *oa)
4799 {
4800 tree attr = oacc_get_fn_attrib (current_function_decl);
4801 tree dims = TREE_VALUE (attr);
4802 unsigned ix;
4803
4804 oa->mask = 0;
4805
4806 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4807 {
4808 tree t = TREE_VALUE (dims);
4809 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4810 tree allowed = TREE_PURPOSE (dims);
4811
4812 if (size != 1 && !(allowed && integer_zerop (allowed)))
4813 oa->mask |= GOMP_DIM_MASK (ix);
4814
4815 switch (ix)
4816 {
4817 case GOMP_DIM_GANG:
4818 oa->num_gangs = size;
4819 break;
4820
4821 case GOMP_DIM_WORKER:
4822 oa->num_workers = size;
4823 break;
4824
4825 case GOMP_DIM_VECTOR:
4826 oa->vector_length = size;
4827 break;
4828 }
4829 }
4830 }
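/* Worked example (illustrative): for an offload region compiled with
   num_gangs (32) num_workers (4) vector_length (128), the loop above
   yields oa->num_gangs = 32, oa->num_workers = 4, oa->vector_length = 128,
   and since no size is 1 and no axis is explicitly disallowed, oa->mask
   ends up with the GANG, WORKER and VECTOR bits all set.  */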
4831
4832 #if WORKAROUND_PTXJIT_BUG_2
4833 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4834 is needed in the nvptx target because the branches generated for
4835 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4836
4837 static rtx
4838 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4839 {
4840 rtx pat;
4841 if ((strict && !JUMP_P (insn))
4842 || (!strict && !INSN_P (insn)))
4843 return NULL_RTX;
4844 pat = PATTERN (insn);
4845
4846 /* The set is allowed to appear either as the insn pattern or
4847 the first set in a PARALLEL. */
4848 if (GET_CODE (pat) == PARALLEL)
4849 pat = XVECEXP (pat, 0, 0);
4850 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4851 return pat;
4852
4853 return NULL_RTX;
4854 }
4855
4856 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4857
4858 static rtx
4859 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4860 {
4861 rtx x = nvptx_pc_set (insn, strict);
4862
4863 if (!x)
4864 return NULL_RTX;
4865 x = SET_SRC (x);
4866 if (GET_CODE (x) == LABEL_REF)
4867 return x;
4868 if (GET_CODE (x) != IF_THEN_ELSE)
4869 return NULL_RTX;
4870 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4871 return XEXP (x, 1);
4872 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4873 return XEXP (x, 2);
4874 return NULL_RTX;
4875 }
4876
4877 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4878 insn between the branch and the label. This works around a JIT bug
4879 observed at driver version 384.111, at -O0 for sm_50. */
4880
4881 static void
4882 prevent_branch_around_nothing (void)
4883 {
4884 rtx_insn *seen_label = NULL;
4885 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4886 {
4887 if (INSN_P (insn) && condjump_p (insn))
4888 {
4889 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4890 continue;
4891 }
4892
4893 if (seen_label == NULL)
4894 continue;
4895
4896 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4897 continue;
4898
4899 if (INSN_P (insn))
4900 switch (recog_memoized (insn))
4901 {
4902 case CODE_FOR_nvptx_fork:
4903 case CODE_FOR_nvptx_forked:
4904 case CODE_FOR_nvptx_joining:
4905 case CODE_FOR_nvptx_join:
4906 continue;
4907 default:
4908 seen_label = NULL;
4909 continue;
4910 }
4911
4912 if (LABEL_P (insn) && insn == seen_label)
4913 emit_insn_before (gen_fake_nop (), insn);
4914
4915 seen_label = NULL;
4916 }
4917 }
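/* The shape being worked around looks roughly like this in the emitted
   PTX (an illustrative sketch, register and label names invented):

	@%p1	bra	$L5;
   $L5:

   i.e. a conditional branch whose target label follows with no real insn
   in between; prevent_branch_around_nothing places a dummy insn before
   such a label.  */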
4918 #endif
4919
4920 #ifdef WORKAROUND_PTXJIT_BUG_3
4921 /* Insert two membar.cta insns between two subsequent bar.sync insns. This
4922 works around a hang observed at driver version 390.48 for sm_50. */
4923
4924 static void
4925 workaround_barsyncs (void)
4926 {
4927 bool seen_barsync = false;
4928 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4929 {
4930 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4931 {
4932 if (seen_barsync)
4933 {
4934 emit_insn_before (gen_nvptx_membar_cta (), insn);
4935 emit_insn_before (gen_nvptx_membar_cta (), insn);
4936 }
4937
4938 seen_barsync = true;
4939 continue;
4940 }
4941
4942 if (!seen_barsync)
4943 continue;
4944
4945 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4946 continue;
4947 else if (INSN_P (insn))
4948 switch (recog_memoized (insn))
4949 {
4950 case CODE_FOR_nvptx_fork:
4951 case CODE_FOR_nvptx_forked:
4952 case CODE_FOR_nvptx_joining:
4953 case CODE_FOR_nvptx_join:
4954 continue;
4955 default:
4956 break;
4957 }
4958
4959 seen_barsync = false;
4960 }
4961 }
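/* After this pass, two back-to-back CTA barriers end up separated by
   memory barriers, roughly (illustrative operands):

	bar.sync	0;
	membar.cta;
	membar.cta;
	bar.sync	0;
*/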
4962 #endif
4963
4964 /* PTX-specific reorganization
4965 - Split blocks at fork and join instructions
4966 - Compute live registers
4967 - Mark now-unused registers, so function begin doesn't declare
4968 unused registers.
4969 - Insert state propagation when entering partitioned mode
4970 - Insert neutering instructions when in single mode
4971 - Replace subregs with suitable sequences.
4972 */
4973
4974 static void
4975 nvptx_reorg (void)
4976 {
4977 /* We are freeing block_for_insn in the toplev to keep compatibility
4978 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4979 compute_bb_for_insn ();
4980
4981 thread_prologue_and_epilogue_insns ();
4982
4983 /* Split blocks and record interesting unspecs. */
4984 bb_insn_map_t bb_insn_map;
4985
4986 nvptx_split_blocks (&bb_insn_map);
4987
4988 /* Compute live regs */
4989 df_clear_flags (DF_LR_RUN_DCE);
4990 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4991 df_live_add_problem ();
4992 df_live_set_all_dirty ();
4993 df_analyze ();
4994 regstat_init_n_sets_and_refs ();
4995
4996 if (dump_file)
4997 df_dump (dump_file);
4998
4999 /* Mark unused regs as unused. */
5000 int max_regs = max_reg_num ();
5001 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5002 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5003 regno_reg_rtx[i] = const0_rtx;
5004
5005 /* Determine launch dimensions of the function. If it is not an
5006 offloaded function (i.e. this is a regular compiler), the
5007 function has no neutering. */
5008 tree attr = oacc_get_fn_attrib (current_function_decl);
5009 if (attr)
5010 {
5011 /* If we determined this mask before RTL expansion, we could
5012 elide emission of some levels of forks and joins. */
5013 offload_attrs oa;
5014
5015 populate_offload_attrs (&oa);
5016
5017 /* If there is worker neutering, there must be vector
5018 neutering. Otherwise the hardware will fail. */
5019 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5020 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5021
5022 /* Discover & process partitioned regions. */
5023 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5024 nvptx_process_pars (pars);
5025 nvptx_neuter_pars (pars, oa.mask, 0);
5026 delete pars;
5027 }
5028
5029 /* Replace subregs. */
5030 nvptx_reorg_subreg ();
5031
5032 if (TARGET_UNIFORM_SIMT)
5033 nvptx_reorg_uniform_simt ();
5034
5035 #if WORKAROUND_PTXJIT_BUG_2
5036 prevent_branch_around_nothing ();
5037 #endif
5038
5039 #ifdef WORKAROUND_PTXJIT_BUG_3
5040 workaround_barsyncs ();
5041 #endif
5042
5043 regstat_free_n_sets_and_refs ();
5044
5045 df_finish_pass (true);
5046 }
5047 \f
5048 /* Handle a "kernel" attribute; arguments as in
5049 struct attribute_spec.handler. */
5050
5051 static tree
5052 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5053 int ARG_UNUSED (flags), bool *no_add_attrs)
5054 {
5055 tree decl = *node;
5056
5057 if (TREE_CODE (decl) != FUNCTION_DECL)
5058 {
5059 error ("%qE attribute only applies to functions", name);
5060 *no_add_attrs = true;
5061 }
5062 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5063 {
5064 error ("%qE attribute requires a void return type", name);
5065 *no_add_attrs = true;
5066 }
5067
5068 return NULL_TREE;
5069 }
5070
5071 /* Handle a "shared" attribute; arguments as in
5072 struct attribute_spec.handler. */
5073
5074 static tree
5075 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5076 int ARG_UNUSED (flags), bool *no_add_attrs)
5077 {
5078 tree decl = *node;
5079
5080 if (TREE_CODE (decl) != VAR_DECL)
5081 {
5082 error ("%qE attribute only applies to variables", name);
5083 *no_add_attrs = true;
5084 }
5085 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5086 {
5087 error ("%qE attribute not allowed with auto storage class", name);
5088 *no_add_attrs = true;
5089 }
5090
5091 return NULL_TREE;
5092 }
5093
5094 /* Table of valid machine attributes. */
5095 static const struct attribute_spec nvptx_attribute_table[] =
5096 {
5097 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5098 affects_type_identity, handler, exclude } */
5099 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5100 NULL },
5101 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5102 NULL },
5103 { NULL, 0, 0, false, false, false, false, NULL, NULL }
5104 };
5105 \f
5106 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5107
5108 static HOST_WIDE_INT
5109 nvptx_vector_alignment (const_tree type)
5110 {
5111 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5112
5113 return MIN (align, BIGGEST_ALIGNMENT);
5114 }
5115
5116 /* Indicate that INSN cannot be duplicated. */
5117
5118 static bool
5119 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5120 {
5121 switch (recog_memoized (insn))
5122 {
5123 case CODE_FOR_nvptx_shufflesi:
5124 case CODE_FOR_nvptx_shufflesf:
5125 case CODE_FOR_nvptx_barsync:
5126 case CODE_FOR_nvptx_fork:
5127 case CODE_FOR_nvptx_forked:
5128 case CODE_FOR_nvptx_joining:
5129 case CODE_FOR_nvptx_join:
5130 return true;
5131 default:
5132 return false;
5133 }
5134 }
5135
5136 /* Section anchors do not work. Initialization for flag_section_anchor
5137 probes the existence of the anchoring target hooks and prevents
5138 anchoring if they don't exist. However, we may be used with
5139 a host-side compiler that does support anchoring, and hence see
5140 the anchor flag set (as it's not recalculated). So provide an
5141 implementation denying anchoring. */
5142
5143 static bool
5144 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5145 {
5146 return false;
5147 }
5148 \f
5149 /* Record a symbol for mkoffload to enter into the mapping table. */
5150
5151 static void
5152 nvptx_record_offload_symbol (tree decl)
5153 {
5154 switch (TREE_CODE (decl))
5155 {
5156 case VAR_DECL:
5157 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5158 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5159 break;
5160
5161 case FUNCTION_DECL:
5162 {
5163 tree attr = oacc_get_fn_attrib (decl);
5164 /* OpenMP offloading does not set this attribute. */
5165 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5166
5167 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5168 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5169
5170 for (; dims; dims = TREE_CHAIN (dims))
5171 {
5172 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5173
5174 gcc_assert (!TREE_PURPOSE (dims));
5175 fprintf (asm_out_file, ", %#x", size);
5176 }
5177
5178 fprintf (asm_out_file, "\n");
5179 }
5180 break;
5181
5182 default:
5183 gcc_unreachable ();
5184 }
5185 }
5186
5187 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5188 at the start of a file. */
5189
5190 static void
5191 nvptx_file_start (void)
5192 {
5193 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5194 fputs ("\t.version\t3.1\n", asm_out_file);
5195 if (TARGET_SM35)
5196 fputs ("\t.target\tsm_35\n", asm_out_file);
5197 else
5198 fputs ("\t.target\tsm_30\n", asm_out_file);
5199 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5200 fputs ("// END PREAMBLE\n", asm_out_file);
5201 }
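/* When sm_35 is not selected and Pmode is 64 bits wide, the preamble above
   comes out roughly as (illustrative):

	// BEGIN PREAMBLE
		.version	3.1
		.target	sm_30
		.address_size 64
	// END PREAMBLE
*/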
5202
5203 /* Emit a declaration for a worker and vector-level buffer in .shared
5204 memory. */
5205
5206 static void
5207 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5208 {
5209 const char *name = XSTR (sym, 0);
5210
5211 write_var_marker (file, true, false, name);
5212 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5213 align, name, size);
5214 }
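/* For example, a 128-byte buffer with 8-byte alignment is declared as
   (illustrative; the name is whatever SYM carries):

	.shared .align 8 .u8 __oacc_bcast[128];
*/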
5215
5216 /* Write out the function declarations we've collected and declare storage
5217 for the broadcast buffer. */
5218
5219 static void
5220 nvptx_file_end (void)
5221 {
5222 hash_table<tree_hasher>::iterator iter;
5223 tree decl;
5224 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5225 nvptx_record_fndecl (decl);
5226 fputs (func_decls.str().c_str(), asm_out_file);
5227
5228 if (oacc_bcast_size)
5229 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5230 oacc_bcast_align, oacc_bcast_size);
5231
5232 if (worker_red_size)
5233 write_shared_buffer (asm_out_file, worker_red_sym,
5234 worker_red_align, worker_red_size);
5235
5236 if (vector_red_size)
5237 write_shared_buffer (asm_out_file, vector_red_sym,
5238 vector_red_align, vector_red_size);
5239
5240 if (need_softstack_decl)
5241 {
5242 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5243 /* 32 is the maximum number of warps in a block. Even though it's an
5244 external declaration, emit the array size explicitly; otherwise, it
5245 may fail at PTX JIT time if the definition is later in link order. */
5246 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5247 POINTER_SIZE);
5248 }
5249 if (need_unisimt_decl)
5250 {
5251 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5252 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5253 }
5254 }
5255
5256 /* Expander for the shuffle builtins. */
5257
5258 static rtx
5259 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5260 {
5261 if (ignore)
5262 return target;
5263
5264 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5265 NULL_RTX, mode, EXPAND_NORMAL);
5266 if (!REG_P (src))
5267 src = copy_to_mode_reg (mode, src);
5268
5269 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5270 NULL_RTX, SImode, EXPAND_NORMAL);
5271 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5272 NULL_RTX, SImode, EXPAND_NORMAL);
5273
5274 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5275 idx = copy_to_mode_reg (SImode, idx);
5276
5277 rtx pat = nvptx_gen_shuffle (target, src, idx,
5278 (nvptx_shuffle_kind) INTVAL (op));
5279 if (pat)
5280 emit_insn (pat);
5281
5282 return target;
5283 }
5284
5285 const char *
5286 nvptx_output_red_partition (rtx dst, rtx offset)
5287 {
5288 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5289 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5290
5291 if (offset == const0_rtx)
5292 fprintf (asm_out_file, zero_offset, REGNO (dst),
5293 REGNO (cfun->machine->red_partition));
5294 else
5295 fprintf (asm_out_file, with_offset, REGNO (dst),
5296 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5297
5298 return "";
5299 }
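/* E.g. with DST being %r36, the partition base in %r35 and OFFSET 16,
   this prints (illustrative register numbers):

		add.u64	%r36, %r35, 16; // vred buffer
*/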
5300
5301 /* Shared-memory reduction address expander. */
5302
5303 static rtx
5304 nvptx_expand_shared_addr (tree exp, rtx target,
5305 machine_mode ARG_UNUSED (mode), int ignore,
5306 int vector)
5307 {
5308 if (ignore)
5309 return target;
5310
5311 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5312 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5313 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5314 rtx addr = worker_red_sym;
5315
5316 if (vector)
5317 {
5318 offload_attrs oa;
5319
5320 populate_offload_attrs (&oa);
5321
5322 unsigned int psize = ROUND_UP (size + offset, align);
5323 unsigned int pnum = nvptx_mach_max_workers ();
5324 vector_red_partition = MAX (vector_red_partition, psize);
5325 vector_red_size = MAX (vector_red_size, psize * pnum);
5326 vector_red_align = MAX (vector_red_align, align);
5327
5328 if (cfun->machine->red_partition == NULL)
5329 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5330
5331 addr = gen_reg_rtx (Pmode);
5332 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5333 }
5334 else
5335 {
5336 worker_red_align = MAX (worker_red_align, align);
5337 worker_red_size = MAX (worker_red_size, size + offset);
5338
5339 if (offset)
5340 {
5341 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5342 addr = gen_rtx_CONST (Pmode, addr);
5343 }
5344 }
5345
5346 emit_move_insn (target, addr);
5347 return target;
5348 }
5349
5350 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5351 not require taking the address of any object, other than the memory
5352 cell being operated on. */
5353
5354 static rtx
5355 nvptx_expand_cmp_swap (tree exp, rtx target,
5356 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5357 {
5358 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5359
5360 if (!target)
5361 target = gen_reg_rtx (mode);
5362
5363 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5364 NULL_RTX, Pmode, EXPAND_NORMAL);
5365 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5366 NULL_RTX, mode, EXPAND_NORMAL);
5367 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5368 NULL_RTX, mode, EXPAND_NORMAL);
5369 rtx pat;
5370
5371 mem = gen_rtx_MEM (mode, mem);
5372 if (!REG_P (cmp))
5373 cmp = copy_to_mode_reg (mode, cmp);
5374 if (!REG_P (src))
5375 src = copy_to_mode_reg (mode, src);
5376
5377 if (mode == SImode)
5378 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5379 else
5380 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5381
5382 emit_insn (pat);
5383
5384 return target;
5385 }
5386
5387
5388 /* Codes for all the NVPTX builtins. */
5389 enum nvptx_builtins
5390 {
5391 NVPTX_BUILTIN_SHUFFLE,
5392 NVPTX_BUILTIN_SHUFFLELL,
5393 NVPTX_BUILTIN_WORKER_ADDR,
5394 NVPTX_BUILTIN_VECTOR_ADDR,
5395 NVPTX_BUILTIN_CMP_SWAP,
5396 NVPTX_BUILTIN_CMP_SWAPLL,
5397 NVPTX_BUILTIN_MAX
5398 };
5399
5400 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5401
5402 /* Return the NVPTX builtin for CODE. */
5403
5404 static tree
5405 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5406 {
5407 if (code >= NVPTX_BUILTIN_MAX)
5408 return error_mark_node;
5409
5410 return nvptx_builtin_decls[code];
5411 }
5412
5413 /* Set up all builtin functions for this target. */
5414
5415 static void
5416 nvptx_init_builtins (void)
5417 {
5418 #define DEF(ID, NAME, T) \
5419 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5420 = add_builtin_function ("__builtin_nvptx_" NAME, \
5421 build_function_type_list T, \
5422 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5423 #define ST sizetype
5424 #define UINT unsigned_type_node
5425 #define LLUINT long_long_unsigned_type_node
5426 #define PTRVOID ptr_type_node
5427
5428 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5429 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5430 DEF (WORKER_ADDR, "worker_addr",
5431 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5432 DEF (VECTOR_ADDR, "vector_addr",
5433 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5434 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5435 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5436
5437 #undef DEF
5438 #undef ST
5439 #undef UINT
5440 #undef LLUINT
5441 #undef PTRVOID
5442 }
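/* These builtins are consumed by the OpenACC reduction and worker/vector
   address expanders in this file rather than written by users.  Purely as
   an illustration, a call such as

     v = __builtin_nvptx_shuffle (v, 0, kind);

   shuffles V across the warp according to the nvptx_shuffle_kind passed
   as KIND (an index shuffle with lane 0 broadcasts lane 0's value).  */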
5443
5444 /* Expand an expression EXP that calls a built-in function,
5445 with result going to TARGET if that's convenient
5446 (and in mode MODE if that's convenient).
5447 SUBTARGET may be used as the target for computing one of EXP's operands.
5448 IGNORE is nonzero if the value is to be ignored. */
5449
5450 static rtx
5451 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5452 machine_mode mode, int ignore)
5453 {
5454 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5455 switch (DECL_FUNCTION_CODE (fndecl))
5456 {
5457 case NVPTX_BUILTIN_SHUFFLE:
5458 case NVPTX_BUILTIN_SHUFFLELL:
5459 return nvptx_expand_shuffle (exp, target, mode, ignore);
5460
5461 case NVPTX_BUILTIN_WORKER_ADDR:
5462 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5463
5464 case NVPTX_BUILTIN_VECTOR_ADDR:
5465 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5466
5467 case NVPTX_BUILTIN_CMP_SWAP:
5468 case NVPTX_BUILTIN_CMP_SWAPLL:
5469 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5470
5471 default: gcc_unreachable ();
5472 }
5473 }
5474
5475 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5476
5477 static int
5478 nvptx_simt_vf ()
5479 {
5480 return PTX_WARP_SIZE;
5481 }
5482
5483 static bool
5484 nvptx_welformed_vector_length_p (int l)
5485 {
5486 gcc_assert (l > 0);
5487 return l % PTX_WARP_SIZE == 0;
5488 }
5489
5490 static void
5491 nvptx_apply_dim_limits (int dims[])
5492 {
5493 /* Check that the vector_length is not too large. */
5494 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5495 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5496
5497 /* Check that the number of workers is not too large. */
5498 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5499 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5500
5501 /* Ensure that num_worker * vector_length <= cta size. */
5502 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5503 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5504 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5505
5506 /* If we need a per-worker barrier ... */
5507 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5508 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5509 /* Don't use more barriers than available. */
5510 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5511 PTX_NUM_PER_WORKER_BARRIERS);
5512 }
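/* Worked example (using the PTX_* limits above): a request of
   num_workers (40) vector_length (64) is first clipped to 32 workers
   (PTX_WORKER_LENGTH); 32 * 64 then exceeds PTX_CTA_SIZE (1024), so
   vector_length is forced back to PTX_WARP_SIZE (32), after which the
   per-worker-barrier clamp no longer applies.  */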
5513
5514 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5515
5516 static bool
5517 has_vector_partitionable_routine_calls_p (tree fndecl)
5518 {
5519 if (!fndecl)
5520 return false;
5521
5522 basic_block bb;
5523 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5524 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5525 gsi_next_nondebug (&i))
5526 {
5527 gimple *stmt = gsi_stmt (i);
5528 if (gimple_code (stmt) != GIMPLE_CALL)
5529 continue;
5530
5531 tree callee = gimple_call_fndecl (stmt);
5532 if (!callee)
5533 continue;
5534
5535 tree attrs = oacc_get_fn_attrib (callee);
5536 if (attrs == NULL_TREE)
5537 return false;
5538
5539 int partition_level = oacc_fn_attrib_level (attrs);
5540 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5541 if (!seq_routine_p)
5542 return true;
5543 }
5544
5545 return false;
5546 }
5547
5548 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5549 DIMS has changed. */
5550
5551 static void
5552 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5553 {
5554 bool oacc_default_dims_p = false;
5555 bool oacc_min_dims_p = false;
5556 bool offload_region_p = false;
5557 bool routine_p = false;
5558 bool routine_seq_p = false;
5559 int default_vector_length = -1;
5560
5561 if (decl == NULL_TREE)
5562 {
5563 if (fn_level == -1)
5564 oacc_default_dims_p = true;
5565 else if (fn_level == -2)
5566 oacc_min_dims_p = true;
5567 else
5568 gcc_unreachable ();
5569 }
5570 else if (fn_level == -1)
5571 offload_region_p = true;
5572 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5573 {
5574 routine_p = true;
5575 routine_seq_p = fn_level == GOMP_DIM_MAX;
5576 }
5577 else
5578 gcc_unreachable ();
5579
5580 if (oacc_min_dims_p)
5581 {
5582 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5583 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5584 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5585
5586 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5587 return;
5588 }
5589
5590 if (routine_p)
5591 {
5592 if (!routine_seq_p)
5593 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5594
5595 return;
5596 }
5597
5598 if (oacc_default_dims_p)
5599 {
5600 /* -1 : not set
5601 0 : set at runtime, f.i. -fopenacc-dims=-
5602 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5603 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5604 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5605 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5606
5607 /* But -fopenacc-dims=- is not yet supported on trunk. */
5608 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5609 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5610 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5611 }
5612
5613 if (offload_region_p)
5614 {
5615 /* -1 : not set
5616 0 : set using variable, f.i. num_gangs (n)
5617 >= 1: set using constant, f.i. num_gangs (1). */
5618 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5619 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5620 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5621 }
5622
5623 if (offload_region_p)
5624 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5625 else
5626 /* oacc_default_dims_p. */
5627 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5628
5629 int old_dims[GOMP_DIM_MAX];
5630 unsigned int i;
5631 for (i = 0; i < GOMP_DIM_MAX; ++i)
5632 old_dims[i] = dims[i];
5633
5634 const char *vector_reason = NULL;
5635 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5636 {
5637 default_vector_length = PTX_WARP_SIZE;
5638
5639 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5640 {
5641 vector_reason = G_("using vector_length (%d) due to call to"
5642 " vector-partitionable routine, ignoring %d");
5643 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5644 }
5645 }
5646
5647 if (dims[GOMP_DIM_VECTOR] == 0)
5648 {
5649 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5650 dims[GOMP_DIM_VECTOR] = default_vector_length;
5651 }
5652
5653 if (dims[GOMP_DIM_VECTOR] > 0
5654 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5655 dims[GOMP_DIM_VECTOR] = default_vector_length;
5656
5657 nvptx_apply_dim_limits (dims);
5658
5659 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5660 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5661 vector_reason != NULL
5662 ? vector_reason
5663 : G_("using vector_length (%d), ignoring %d"),
5664 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5665
5666 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5667 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5668 G_("using num_workers (%d), ignoring %d"),
5669 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5670
5671 if (oacc_default_dims_p)
5672 {
5673 if (dims[GOMP_DIM_VECTOR] < 0)
5674 dims[GOMP_DIM_VECTOR] = default_vector_length;
5675 if (dims[GOMP_DIM_WORKER] < 0)
5676 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5677 if (dims[GOMP_DIM_GANG] < 0)
5678 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5679 nvptx_apply_dim_limits (dims);
5680 }
5681
5682 if (offload_region_p)
5683 {
5684 for (i = 0; i < GOMP_DIM_MAX; i++)
5685 {
5686 if (!(dims[i] < 0))
5687 continue;
5688
5689 if ((used & GOMP_DIM_MASK (i)) == 0)
5690 /* Function oacc_validate_dims will apply the minimal dimension. */
5691 continue;
5692
5693 dims[i] = (i == GOMP_DIM_VECTOR
5694 ? default_vector_length
5695 : oacc_get_default_dim (i));
5696 }
5697
5698 nvptx_apply_dim_limits (dims);
5699 }
5700 }
5701
5702 /* Validate compute dimensions of an OpenACC offload or routine, fill
5703 in non-unity defaults. FN_LEVEL indicates the level at which a
5704 routine might spawn a loop. It is negative for non-routines. If
5705 DECL is null, we are validating the default dimensions. */
5706
5707 static bool
5708 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5709 {
5710 int old_dims[GOMP_DIM_MAX];
5711 unsigned int i;
5712
5713 for (i = 0; i < GOMP_DIM_MAX; ++i)
5714 old_dims[i] = dims[i];
5715
5716 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5717
5718 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5719 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5720 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5721
5722 for (i = 0; i < GOMP_DIM_MAX; ++i)
5723 if (old_dims[i] != dims[i])
5724 return true;
5725
5726 return false;
5727 }
5728
5729 /* Return maximum dimension size, or zero for unbounded. */
5730
5731 static int
5732 nvptx_dim_limit (int axis)
5733 {
5734 switch (axis)
5735 {
5736 case GOMP_DIM_VECTOR:
5737 return PTX_MAX_VECTOR_LENGTH;
5738
5739 default:
5740 break;
5741 }
5742 return 0;
5743 }
5744
5745 /* Determine whether fork & joins are needed. */
5746
5747 static bool
5748 nvptx_goacc_fork_join (gcall *call, const int dims[],
5749 bool ARG_UNUSED (is_fork))
5750 {
5751 tree arg = gimple_call_arg (call, 2);
5752 unsigned axis = TREE_INT_CST_LOW (arg);
5753
5754 /* We only care about worker and vector partitioning. */
5755 if (axis < GOMP_DIM_WORKER)
5756 return false;
5757
5758 /* If the size is 1, there's no partitioning. */
5759 if (dims[axis] == 1)
5760 return false;
5761
5762 return true;
5763 }
5764
5765 /* Generate a PTX builtin function call that returns the address at
5766 OFFSET in the worker (or, if VECTOR, the vector) reduction buffer.
5767 TYPE is the type of the data at that location. */
5768
5769 static tree
5770 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5771 {
5772 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5773 if (vector)
5774 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5775 machine_mode mode = TYPE_MODE (type);
5776 tree fndecl = nvptx_builtin_decl (addr_dim, true);
5777 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5778 tree align = build_int_cst (unsigned_type_node,
5779 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5780 tree call = build_call_expr (fndecl, 3, offset, size, align);
5781
5782 return fold_convert (build_pointer_type (type), call);
5783 }
5784
5785 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR. This function
5786 will cast the variable if necessary. */
5787
5788 static void
5789 nvptx_generate_vector_shuffle (location_t loc,
5790 tree dest_var, tree var, unsigned shift,
5791 gimple_seq *seq)
5792 {
5793 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5794 tree_code code = NOP_EXPR;
5795 tree arg_type = unsigned_type_node;
5796 tree var_type = TREE_TYPE (var);
5797 tree dest_type = var_type;
5798
5799 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5800 var_type = TREE_TYPE (var_type);
5801
5802 if (TREE_CODE (var_type) == REAL_TYPE)
5803 code = VIEW_CONVERT_EXPR;
5804
5805 if (TYPE_SIZE (var_type)
5806 == TYPE_SIZE (long_long_unsigned_type_node))
5807 {
5808 fn = NVPTX_BUILTIN_SHUFFLELL;
5809 arg_type = long_long_unsigned_type_node;
5810 }
5811
5812 tree call = nvptx_builtin_decl (fn, true);
5813 tree bits = build_int_cst (unsigned_type_node, shift);
5814 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5815 tree expr;
5816
5817 if (var_type != dest_type)
5818 {
5819 /* Do real and imaginary parts separately. */
5820 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5821 real = fold_build1 (code, arg_type, real);
5822 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5823 real = fold_build1 (code, var_type, real);
5824
5825 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5826 imag = fold_build1 (code, arg_type, imag);
5827 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5828 imag = fold_build1 (code, var_type, imag);
5829
5830 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5831 }
5832 else
5833 {
5834 expr = fold_build1 (code, arg_type, var);
5835 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5836 expr = fold_build1 (code, dest_type, expr);
5837 }
5838
5839 gimplify_assign (dest_var, expr, seq);
5840 }
5841
5842 /* Lazily generate the global lock var decl and return its address. */
5843
5844 static tree
5845 nvptx_global_lock_addr ()
5846 {
5847 tree v = global_lock_var;
5848
5849 if (!v)
5850 {
5851 tree name = get_identifier ("__reduction_lock");
5852 tree type = build_qualified_type (unsigned_type_node,
5853 TYPE_QUAL_VOLATILE);
5854 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5855 global_lock_var = v;
5856 DECL_ARTIFICIAL (v) = 1;
5857 DECL_EXTERNAL (v) = 1;
5858 TREE_STATIC (v) = 1;
5859 TREE_PUBLIC (v) = 1;
5860 TREE_USED (v) = 1;
5861 mark_addressable (v);
5862 mark_decl_referenced (v);
5863 }
5864
5865 return build_fold_addr_expr (v);
5866 }
5867
5868 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5869 GSI. We use a lockless scheme for nearly all cases, which looks
5870 like:
5871 actual = initval(OP);
5872 do {
5873 guess = actual;
5874 write = guess OP myval;
5875 actual = cmp&swap (ptr, guess, write)
5876 } while (actual bit-different-to guess);
5877 return write;
5878
5879 This relies on a cmp&swap instruction, which is available for 32-
5880 and 64-bit types. Larger types must use a locking scheme. */
5881
5882 static tree
5883 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5884 tree ptr, tree var, tree_code op)
5885 {
5886 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5887 tree_code code = NOP_EXPR;
5888 tree arg_type = unsigned_type_node;
5889 tree var_type = TREE_TYPE (var);
5890
5891 if (TREE_CODE (var_type) == COMPLEX_TYPE
5892 || TREE_CODE (var_type) == REAL_TYPE)
5893 code = VIEW_CONVERT_EXPR;
5894
5895 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5896 {
5897 arg_type = long_long_unsigned_type_node;
5898 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5899 }
5900
5901 tree swap_fn = nvptx_builtin_decl (fn, true);
5902
5903 gimple_seq init_seq = NULL;
5904 tree init_var = make_ssa_name (arg_type);
5905 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5906 init_expr = fold_build1 (code, arg_type, init_expr);
5907 gimplify_assign (init_var, init_expr, &init_seq);
5908 gimple *init_end = gimple_seq_last (init_seq);
5909
5910 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5911
5912 /* Split the block just after the init stmts. */
5913 basic_block pre_bb = gsi_bb (*gsi);
5914 edge pre_edge = split_block (pre_bb, init_end);
5915 basic_block loop_bb = pre_edge->dest;
5916 pre_bb = pre_edge->src;
5917 /* Reset the iterator. */
5918 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5919
5920 tree expect_var = make_ssa_name (arg_type);
5921 tree actual_var = make_ssa_name (arg_type);
5922 tree write_var = make_ssa_name (arg_type);
5923
5924 /* Build and insert the reduction calculation. */
5925 gimple_seq red_seq = NULL;
5926 tree write_expr = fold_build1 (code, var_type, expect_var);
5927 write_expr = fold_build2 (op, var_type, write_expr, var);
5928 write_expr = fold_build1 (code, arg_type, write_expr);
5929 gimplify_assign (write_var, write_expr, &red_seq);
5930
5931 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5932
5933 /* Build & insert the cmp&swap sequence. */
5934 gimple_seq latch_seq = NULL;
5935 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5936 ptr, expect_var, write_var);
5937 gimplify_assign (actual_var, swap_expr, &latch_seq);
5938
5939 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5940 NULL_TREE, NULL_TREE);
5941 gimple_seq_add_stmt (&latch_seq, cond);
5942
5943 gimple *latch_end = gimple_seq_last (latch_seq);
5944 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5945
5946 /* Split the block just after the latch stmts. */
5947 edge post_edge = split_block (loop_bb, latch_end);
5948 basic_block post_bb = post_edge->dest;
5949 loop_bb = post_edge->src;
5950 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5951
5952 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5953 post_edge->probability = profile_probability::even ();
5954 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5955 loop_edge->probability = profile_probability::even ();
5956 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5957 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5958
5959 gphi *phi = create_phi_node (expect_var, loop_bb);
5960 add_phi_arg (phi, init_var, pre_edge, loc);
5961 add_phi_arg (phi, actual_var, loop_edge, loc);
5962
5963 loop *loop = alloc_loop ();
5964 loop->header = loop_bb;
5965 loop->latch = loop_bb;
5966 add_loop (loop, loop_bb->loop_father);
5967
5968 return fold_build1 (code, var_type, write_var);
5969 }
5970
5971 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5972 GSI. This is necessary for types larger than 64 bits, where there
5973 is no cmp&swap instruction to implement a lockless scheme. We use
5974 a lock variable in global memory.
5975
5976 while (cmp&swap (&lock_var, 0, 1))
5977 continue;
5978 T accum = *ptr;
5979 accum = accum OP var;
5980 *ptr = accum;
5981 cmp&swap (&lock_var, 1, 0);
5982 return accum;
5983
5984 A lock in global memory is necessary to force execution engine
5985 descheduling and avoid resource starvation that can occur if the
5986 lock is in .shared memory. */
5987
5988 static tree
5989 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5990 tree ptr, tree var, tree_code op)
5991 {
5992 tree var_type = TREE_TYPE (var);
5993 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5994 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5995 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5996
5997 /* Split the block just before the gsi. Insert a gimple nop to make
5998 this easier. */
5999 gimple *nop = gimple_build_nop ();
6000 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6001 basic_block entry_bb = gsi_bb (*gsi);
6002 edge entry_edge = split_block (entry_bb, nop);
6003 basic_block lock_bb = entry_edge->dest;
6004 /* Reset the iterator. */
6005 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6006
6007 /* Build and insert the locking sequence. */
6008 gimple_seq lock_seq = NULL;
6009 tree lock_var = make_ssa_name (unsigned_type_node);
6010 tree lock_expr = nvptx_global_lock_addr ();
6011 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6012 uns_unlocked, uns_locked);
6013 gimplify_assign (lock_var, lock_expr, &lock_seq);
6014 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6015 NULL_TREE, NULL_TREE);
6016 gimple_seq_add_stmt (&lock_seq, cond);
6017 gimple *lock_end = gimple_seq_last (lock_seq);
6018 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6019
6020 /* Split the block just after the lock sequence. */
6021 edge locked_edge = split_block (lock_bb, lock_end);
6022 basic_block update_bb = locked_edge->dest;
6023 lock_bb = locked_edge->src;
6024 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6025
6026 /* Create the lock loop ... */
6027 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6028 locked_edge->probability = profile_probability::even ();
6029 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6030 loop_edge->probability = profile_probability::even ();
6031 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6032 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6033
6034 /* ... and the loop structure. */
6035 loop *lock_loop = alloc_loop ();
6036 lock_loop->header = lock_bb;
6037 lock_loop->latch = lock_bb;
6038 lock_loop->nb_iterations_estimate = 1;
6039 lock_loop->any_estimate = true;
6040 add_loop (lock_loop, entry_bb->loop_father);
6041
6042 /* Build and insert the reduction calculation. */
6043 gimple_seq red_seq = NULL;
6044 tree acc_in = make_ssa_name (var_type);
6045 tree ref_in = build_simple_mem_ref (ptr);
6046 TREE_THIS_VOLATILE (ref_in) = 1;
6047 gimplify_assign (acc_in, ref_in, &red_seq);
6048
6049 tree acc_out = make_ssa_name (var_type);
6050 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6051 gimplify_assign (acc_out, update_expr, &red_seq);
6052
6053 tree ref_out = build_simple_mem_ref (ptr);
6054 TREE_THIS_VOLATILE (ref_out) = 1;
6055 gimplify_assign (ref_out, acc_out, &red_seq);
6056
6057 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6058
6059 /* Build & insert the unlock sequence. */
6060 gimple_seq unlock_seq = NULL;
6061 tree unlock_expr = nvptx_global_lock_addr ();
6062 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6063 uns_locked, uns_unlocked);
6064 gimplify_and_add (unlock_expr, &unlock_seq);
6065 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6066
6067 return acc_out;
6068 }
6069
6070 /* Emit a sequence to update a reduction accumulator at *PTR with the
6071 value held in VAR using operator OP. Return the updated value.
6072
6073 TODO: optimize for atomic ops and independent complex ops. */
6074
6075 static tree
6076 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6077 tree ptr, tree var, tree_code op)
6078 {
6079 tree type = TREE_TYPE (var);
6080 tree size = TYPE_SIZE (type);
6081
6082 if (size == TYPE_SIZE (unsigned_type_node)
6083 || size == TYPE_SIZE (long_long_unsigned_type_node))
6084 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6085 else
6086 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6087 }
6088
6089 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6090
6091 static void
6092 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6093 {
6094 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6095 tree lhs = gimple_call_lhs (call);
6096 tree var = gimple_call_arg (call, 2);
6097 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6098 gimple_seq seq = NULL;
6099
6100 push_gimplify_context (true);
6101
6102 if (level != GOMP_DIM_GANG)
6103 {
6104 /* Copy the receiver object. */
6105 tree ref_to_res = gimple_call_arg (call, 1);
6106
6107 if (!integer_zerop (ref_to_res))
6108 var = build_simple_mem_ref (ref_to_res);
6109 }
6110
6111 if (level == GOMP_DIM_WORKER
6112 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6113 {
6114 /* Store incoming value to worker reduction buffer. */
6115 tree offset = gimple_call_arg (call, 5);
6116 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6117 level == GOMP_DIM_VECTOR);
6118 tree ptr = make_ssa_name (TREE_TYPE (call));
6119
6120 gimplify_assign (ptr, call, &seq);
6121 tree ref = build_simple_mem_ref (ptr);
6122 TREE_THIS_VOLATILE (ref) = 1;
6123 gimplify_assign (ref, var, &seq);
6124 }
6125
6126 if (lhs)
6127 gimplify_assign (lhs, var, &seq);
6128
6129 pop_gimplify_context (NULL);
6130 gsi_replace_with_seq (&gsi, seq, true);
6131 }
6132
6133 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6134
6135 static void
6136 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6137 {
6138 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6139 tree lhs = gimple_call_lhs (call);
6140 tree var = gimple_call_arg (call, 2);
6141 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6142 enum tree_code rcode
6143 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6144 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6145 TREE_TYPE (var));
6146 gimple_seq seq = NULL;
6147
6148 push_gimplify_context (true);
6149
6150 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6151 {
6152 /* Initialize non-zero vector lanes to INIT_VAL (OP). */
6153 tree tid = make_ssa_name (integer_type_node);
6154 tree dim_vector = gimple_call_arg (call, 3);
6155 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6156 dim_vector);
6157 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6158 NULL_TREE, NULL_TREE);
6159
6160 gimple_call_set_lhs (tid_call, tid);
6161 gimple_seq_add_stmt (&seq, tid_call);
6162 gimple_seq_add_stmt (&seq, cond_stmt);
6163
6164 /* Split the block just after the call. */
6165 edge init_edge = split_block (gsi_bb (gsi), call);
6166 basic_block init_bb = init_edge->dest;
6167 basic_block call_bb = init_edge->src;
6168
6169 /* Fixup flags from call_bb to init_bb. */
6170 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6171 init_edge->probability = profile_probability::even ();
6172
6173 /* Set the initialization stmts. */
6174 gimple_seq init_seq = NULL;
6175 tree init_var = make_ssa_name (TREE_TYPE (var));
6176 gimplify_assign (init_var, init, &init_seq);
6177 gsi = gsi_start_bb (init_bb);
6178 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6179
6180 /* Split block just after the init stmt. */
6181 gsi_prev (&gsi);
6182 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6183 basic_block dst_bb = inited_edge->dest;
6184
6185 /* Create false edge from call_bb to dst_bb. */
6186 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6187 nop_edge->probability = profile_probability::even ();
6188
6189 /* Create phi node in dst block. */
6190 gphi *phi = create_phi_node (lhs, dst_bb);
6191 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6192 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6193
6194 /* Reset dominator of dst bb. */
6195 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6196
6197 /* Reset the gsi. */
6198 gsi = gsi_for_stmt (call);
6199 }
6200 else
6201 {
6202 if (level == GOMP_DIM_GANG)
6203 {
6204 /* If there's no receiver object, propagate the incoming VAR. */
6205 tree ref_to_res = gimple_call_arg (call, 1);
6206 if (integer_zerop (ref_to_res))
6207 init = var;
6208 }
6209
6210 if (lhs != NULL_TREE)
6211 gimplify_assign (lhs, init, &seq);
6212 }
6213
6214 pop_gimplify_context (NULL);
6215 gsi_replace_with_seq (&gsi, seq, true);
6216 }
6217
6218 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6219
6220 static void
6221 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6222 {
6223 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6224 tree lhs = gimple_call_lhs (call);
6225 tree ref_to_res = gimple_call_arg (call, 1);
6226 tree var = gimple_call_arg (call, 2);
6227 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6228 enum tree_code op
6229 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6230 gimple_seq seq = NULL;
6231 tree r = NULL_TREE;
6232
6233 push_gimplify_context (true);
6234
6235 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6236 {
6237 /* Emit binary shuffle tree. TODO: Emit this as an actual loop,
6238 but that requires a method of emitting a unified jump at the
6239 gimple level. */
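/* With PTX_WARP_SIZE of 32, for example, this emits shuffles with
offsets 16, 8, 4, 2 and 1, after which lane 0 holds the reduction
over the whole warp. */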
6240 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6241 {
6242 tree other_var = make_ssa_name (TREE_TYPE (var));
6243 nvptx_generate_vector_shuffle (gimple_location (call),
6244 other_var, var, shfl, &seq);
6245
6246 r = make_ssa_name (TREE_TYPE (var));
6247 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6248 var, other_var), &seq);
6249 var = r;
6250 }
6251 }
6252 else
6253 {
6254 tree accum = NULL_TREE;
6255
6256 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6257 {
6258 /* Get reduction buffer address. */
6259 tree offset = gimple_call_arg (call, 5);
6260 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6261 level == GOMP_DIM_VECTOR);
6262 tree ptr = make_ssa_name (TREE_TYPE (call));
6263
6264 gimplify_assign (ptr, call, &seq);
6265 accum = ptr;
6266 }
6267 else if (integer_zerop (ref_to_res))
6268 r = var;
6269 else
6270 accum = ref_to_res;
6271
6272 if (accum)
6273 {
6274 /* UPDATE the accumulator. */
6275 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6276 seq = NULL;
6277 r = nvptx_reduction_update (gimple_location (call), &gsi,
6278 accum, var, op);
6279 }
6280 }
6281
6282 if (lhs)
6283 gimplify_assign (lhs, r, &seq);
6284 pop_gimplify_context (NULL);
6285
6286 gsi_replace_with_seq (&gsi, seq, true);
6287 }
6288
6289 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6290
6291 static void
6292 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6293 {
6294 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6295 tree lhs = gimple_call_lhs (call);
6296 tree var = gimple_call_arg (call, 2);
6297 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6298 gimple_seq seq = NULL;
6299
6300 push_gimplify_context (true);
6301 if (level == GOMP_DIM_WORKER
6302 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6303 {
6304 /* Read the worker reduction buffer. */
6305 tree offset = gimple_call_arg (call, 5);
6306 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6307 level == GOMP_DIM_VECTOR);
6308 tree ptr = make_ssa_name (TREE_TYPE (call));
6309
6310 gimplify_assign (ptr, call, &seq);
6311 var = build_simple_mem_ref (ptr);
6312 TREE_THIS_VOLATILE (var) = 1;
6313 }
6314
6315 if (level != GOMP_DIM_GANG)
6316 {
6317 /* Write to the receiver object. */
6318 tree ref_to_res = gimple_call_arg (call, 1);
6319
6320 if (!integer_zerop (ref_to_res))
6321 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6322 }
6323
6324 if (lhs)
6325 gimplify_assign (lhs, var, &seq);
6326
6327 pop_gimplify_context (NULL);
6328
6329 gsi_replace_with_seq (&gsi, seq, true);
6330 }
6331
6332 /* NVPTX reduction expander. */
6333
6334 static void
6335 nvptx_goacc_reduction (gcall *call)
6336 {
6337 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6338 offload_attrs oa;
6339
6340 populate_offload_attrs (&oa);
6341
6342 switch (code)
6343 {
6344 case IFN_GOACC_REDUCTION_SETUP:
6345 nvptx_goacc_reduction_setup (call, &oa);
6346 break;
6347
6348 case IFN_GOACC_REDUCTION_INIT:
6349 nvptx_goacc_reduction_init (call, &oa);
6350 break;
6351
6352 case IFN_GOACC_REDUCTION_FINI:
6353 nvptx_goacc_reduction_fini (call, &oa);
6354 break;
6355
6356 case IFN_GOACC_REDUCTION_TEARDOWN:
6357 nvptx_goacc_reduction_teardown (call, &oa);
6358 break;
6359
6360 default:
6361 gcc_unreachable ();
6362 }
6363 }
6364
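/* Implement TARGET_CANNOT_FORCE_CONST_MEM. Returning true for every
constant prevents constants from being forced into the constant pool. */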
6365 static bool
6366 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6367 rtx x ATTRIBUTE_UNUSED)
6368 {
6369 return true;
6370 }
6371
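/* Implement TARGET_VECTOR_MODE_SUPPORTED_P. Only V2SImode and
V2DImode vectors are supported. */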
6372 static bool
6373 nvptx_vector_mode_supported (machine_mode mode)
6374 {
6375 return (mode == V2SImode
6376 || mode == V2DImode);
6377 }
6378
6379 /* Return the preferred mode for vectorizing scalar MODE. */
6380
6381 static machine_mode
6382 nvptx_preferred_simd_mode (scalar_mode mode)
6383 {
6384 switch (mode)
6385 {
6386 case E_DImode:
6387 return V2DImode;
6388 case E_SImode:
6389 return V2SImode;
6390
6391 default:
6392 return default_preferred_simd_mode (mode);
6393 }
6394 }
6395
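/* Return the alignment to use for an object of TYPE with suggested
alignment BASIC_ALIGN: integer types of TImode size get the bit size
of the mode TImode would be split into; everything else keeps
BASIC_ALIGN. (Given its external linkage, this helper is presumably
referenced from the DATA_ALIGNMENT definition in nvptx.h.) */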
6396 unsigned int
6397 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6398 {
6399 if (TREE_CODE (type) == INTEGER_TYPE)
6400 {
6401 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6402 if (size == GET_MODE_SIZE (TImode))
6403 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6404 }
6405
6406 return basic_align;
6407 }
6408
6409 /* Implement TARGET_MODES_TIEABLE_P. */
6410
6411 static bool
6412 nvptx_modes_tieable_p (machine_mode, machine_mode)
6413 {
6414 return false;
6415 }
6416
6417 /* Implement TARGET_HARD_REGNO_NREGS. */
6418
6419 static unsigned int
6420 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6421 {
6422 return 1;
6423 }
6424
6425 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6426
6427 static bool
6428 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6429 {
6430 return false;
6431 }
6432
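/* The decl most recently passed to nvptx_set_current_function, used to
avoid repeating the per-function reset below. */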
6433 static GTY(()) tree nvptx_previous_fndecl;
6434
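/* Implement TARGET_SET_CURRENT_FUNCTION. Reset the vector_red_partition
and oacc_bcast_partition state when compilation moves to a new
function. */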
6435 static void
6436 nvptx_set_current_function (tree fndecl)
6437 {
6438 if (!fndecl || fndecl == nvptx_previous_fndecl)
6439 return;
6440
6441 nvptx_previous_fndecl = fndecl;
6442 vector_red_partition = 0;
6443 oacc_bcast_partition = 0;
6444 }
6445
6446 #undef TARGET_OPTION_OVERRIDE
6447 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6448
6449 #undef TARGET_ATTRIBUTE_TABLE
6450 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6451
6452 #undef TARGET_LRA_P
6453 #define TARGET_LRA_P hook_bool_void_false
6454
6455 #undef TARGET_LEGITIMATE_ADDRESS_P
6456 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6457
6458 #undef TARGET_PROMOTE_FUNCTION_MODE
6459 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6460
6461 #undef TARGET_FUNCTION_ARG
6462 #define TARGET_FUNCTION_ARG nvptx_function_arg
6463 #undef TARGET_FUNCTION_INCOMING_ARG
6464 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6465 #undef TARGET_FUNCTION_ARG_ADVANCE
6466 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6467 #undef TARGET_FUNCTION_ARG_BOUNDARY
6468 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6469 #undef TARGET_PASS_BY_REFERENCE
6470 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6471 #undef TARGET_FUNCTION_VALUE_REGNO_P
6472 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6473 #undef TARGET_FUNCTION_VALUE
6474 #define TARGET_FUNCTION_VALUE nvptx_function_value
6475 #undef TARGET_LIBCALL_VALUE
6476 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6477 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6478 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6479 #undef TARGET_GET_DRAP_RTX
6480 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6481 #undef TARGET_SPLIT_COMPLEX_ARG
6482 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6483 #undef TARGET_RETURN_IN_MEMORY
6484 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6485 #undef TARGET_OMIT_STRUCT_RETURN_REG
6486 #define TARGET_OMIT_STRUCT_RETURN_REG true
6487 #undef TARGET_STRICT_ARGUMENT_NAMING
6488 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6489 #undef TARGET_CALL_ARGS
6490 #define TARGET_CALL_ARGS nvptx_call_args
6491 #undef TARGET_END_CALL_ARGS
6492 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6493
6494 #undef TARGET_ASM_FILE_START
6495 #define TARGET_ASM_FILE_START nvptx_file_start
6496 #undef TARGET_ASM_FILE_END
6497 #define TARGET_ASM_FILE_END nvptx_file_end
6498 #undef TARGET_ASM_GLOBALIZE_LABEL
6499 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6500 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6501 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6502 #undef TARGET_PRINT_OPERAND
6503 #define TARGET_PRINT_OPERAND nvptx_print_operand
6504 #undef TARGET_PRINT_OPERAND_ADDRESS
6505 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6506 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6507 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6508 #undef TARGET_ASM_INTEGER
6509 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6510 #undef TARGET_ASM_DECL_END
6511 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6512 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6513 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6514 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6515 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6516 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6517 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6518
6519 #undef TARGET_MACHINE_DEPENDENT_REORG
6520 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6521 #undef TARGET_NO_REGISTER_ALLOCATION
6522 #define TARGET_NO_REGISTER_ALLOCATION true
6523
6524 #undef TARGET_ENCODE_SECTION_INFO
6525 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6526 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6527 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6528
6529 #undef TARGET_VECTOR_ALIGNMENT
6530 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6531
6532 #undef TARGET_CANNOT_COPY_INSN_P
6533 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6534
6535 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6536 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6537
6538 #undef TARGET_INIT_BUILTINS
6539 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6540 #undef TARGET_EXPAND_BUILTIN
6541 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6542 #undef TARGET_BUILTIN_DECL
6543 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6544
6545 #undef TARGET_SIMT_VF
6546 #define TARGET_SIMT_VF nvptx_simt_vf
6547
6548 #undef TARGET_GOACC_VALIDATE_DIMS
6549 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6550
6551 #undef TARGET_GOACC_DIM_LIMIT
6552 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6553
6554 #undef TARGET_GOACC_FORK_JOIN
6555 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6556
6557 #undef TARGET_GOACC_REDUCTION
6558 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6559
6560 #undef TARGET_CANNOT_FORCE_CONST_MEM
6561 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6562
6563 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6564 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6565
6566 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6567 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6568 nvptx_preferred_simd_mode
6569
6570 #undef TARGET_MODES_TIEABLE_P
6571 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6572
6573 #undef TARGET_HARD_REGNO_NREGS
6574 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6575
6576 #undef TARGET_CAN_CHANGE_MODE_CLASS
6577 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6578
6579 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6580 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6581
6582 #undef TARGET_SET_CURRENT_FUNCTION
6583 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6584
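/* Instantiate the target hook vector from the TARGET_* overrides above. */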
6585 struct gcc_target targetm = TARGET_INITIALIZER;
6586
6587 #include "gt-nvptx.h"