]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
Skip gcc.dg/gimplefe-26.c on non-c99 runtime targets
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
cbe34bb5 2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21#include "config.h"
3a4d1cb1 22#include <sstream>
738f2522
BS
23#include "system.h"
24#include "coretypes.h"
c7131fb2 25#include "backend.h"
e11c4407 26#include "target.h"
738f2522 27#include "rtl.h"
e11c4407
AM
28#include "tree.h"
29#include "cfghooks.h"
c7131fb2 30#include "df.h"
4d0cdd0c 31#include "memmodel.h"
e11c4407
AM
32#include "tm_p.h"
33#include "expmed.h"
34#include "optabs.h"
35#include "regs.h"
36#include "emit-rtl.h"
37#include "recog.h"
38#include "diagnostic.h"
40e23961 39#include "alias.h"
738f2522
BS
40#include "insn-flags.h"
41#include "output.h"
42#include "insn-attr.h"
36566b39 43#include "flags.h"
36566b39
PK
44#include "dojump.h"
45#include "explow.h"
46#include "calls.h"
36566b39
PK
47#include "varasm.h"
48#include "stmt.h"
738f2522 49#include "expr.h"
738f2522
BS
50#include "tm-preds.h"
51#include "tm-constrs.h"
738f2522
BS
52#include "langhooks.h"
53#include "dbxout.h"
738f2522 54#include "cfgrtl.h"
d88cd9c4 55#include "gimple.h"
738f2522 56#include "stor-layout.h"
738f2522 57#include "builtins.h"
629b3d75 58#include "omp-general.h"
3e32ee19
NS
59#include "omp-low.h"
60#include "gomp-constants.h"
d88cd9c4 61#include "dumpfile.h"
f3552158
NS
62#include "internal-fn.h"
63#include "gimple-iterator.h"
64#include "stringpool.h"
f2d4d839 65#include "tree-vrp.h"
f3552158
NS
66#include "tree-ssa-operands.h"
67#include "tree-ssanames.h"
68#include "gimplify.h"
69#include "tree-phinodes.h"
70#include "cfgloop.h"
71#include "fold-const.h"
738f2522 72
994c5d85 73/* This file should be included last. */
d58627a0
RS
74#include "target-def.h"
75
9a863523
NS
/* The various PTX memory areas an object might reside in.  The
   enumerators are used as indices into the area-name table in
   section_for_sym, so the two must be kept in the same order.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,	/* No explicit area name ("").  */
  DATA_AREA_GLOBAL,	/* ".global".  */
  DATA_AREA_SHARED,	/* ".shared".  */
  DATA_AREA_LOCAL,	/* ".local".  */
  DATA_AREA_CONST,	/* ".const".  */
  DATA_AREA_PARAM,	/* ".param".  */
  DATA_AREA_MAX		/* Number of areas; not itself an area.  */
};
87
/* We record the data area in the target symbol flags, in the three
   bits starting at SYMBOL_FLAG_MACH_DEP_SHIFT.  The whole expansion is
   parenthesized so that it behaves as a single primary expression at
   every use site.  */
#define SYMBOL_DATA_AREA(SYM) \
  ((nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		      & 7))
/* Record AREA for SYM.  The value is ORed into the flags without
   clearing a previously recorded area, so this must be applied at most
   once to any given symbol.  */
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
94
738f2522
BS
95/* Record the function decls we've written, and the libfuncs and function
96 decls corresponding to them. */
97static std::stringstream func_decls;
f3dba894 98
6c907cff 99struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
100{
101 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
102 static bool equal (rtx a, rtx b) { return a == b; }
103};
104
105static GTY((cache))
106 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
107
6c907cff 108struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
109{
110 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
111 static bool equal (tree a, tree b) { return a == b; }
112};
113
114static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
115static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 116
f3552158
NS
117/* Buffer needed to broadcast across workers. This is used for both
118 worker-neutering and worker broadcasting. It is shared by all
119 functions emitted. The buffer is placed in shared memory. It'd be
120 nice if PTX supported common blocks, because then this could be
121 shared across TUs (taking the largest size). */
d88cd9c4
NS
122static unsigned worker_bcast_size;
123static unsigned worker_bcast_align;
d88cd9c4
NS
124static GTY(()) rtx worker_bcast_sym;
125
f3552158
NS
126/* Buffer needed for worker reductions. This has to be distinct from
127 the worker broadcast array, as both may be live concurrently. */
128static unsigned worker_red_size;
129static unsigned worker_red_align;
f3552158
NS
130static GTY(()) rtx worker_red_sym;
131
33f47f42
NS
132/* Global lock variable, needed for 128bit worker & gang reductions. */
133static GTY(()) tree global_lock_var;
134
5012919d
AM
135/* True if any function references __nvptx_stacks. */
136static bool need_softstack_decl;
137
138/* True if any function references __nvptx_uni. */
139static bool need_unisimt_decl;
140
738f2522
BS
141/* Allocate a new, cleared machine_function structure. */
142
143static struct machine_function *
144nvptx_init_machine_status (void)
145{
146 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 147 p->return_mode = VOIDmode;
738f2522
BS
148 return p;
149}
150
5012919d
AM
151/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
152 and -fopenacc is also enabled. */
153
154static void
155diagnose_openacc_conflict (bool optval, const char *optname)
156{
157 if (flag_openacc && optval)
158 error ("option %s is not supported together with -fopenacc", optname);
159}
160
738f2522
BS
161/* Implement TARGET_OPTION_OVERRIDE. */
162
163static void
164nvptx_option_override (void)
165{
166 init_machine_status = nvptx_init_machine_status;
ae578943 167
3d339d5e
AM
168 /* Set toplevel_reorder, unless explicitly disabled. We need
169 reordering so that we emit necessary assembler decls of
170 undeclared variables. */
171 if (!global_options_set.x_flag_toplevel_reorder)
172 flag_toplevel_reorder = 1;
173
ae578943
NS
174 /* Set flag_no_common, unless explicitly disabled. We fake common
175 using .weak, and that's not entirely accurate, so avoid it
176 unless forced. */
177 if (!global_options_set.x_flag_no_common)
178 flag_no_common = 1;
179
738f2522
BS
180 /* Assumes that it will see only hard registers. */
181 flag_var_tracking = 0;
1e5154e7 182
dba619f3
NS
183 if (nvptx_optimize < 0)
184 nvptx_optimize = optimize > 0;
185
f3dba894
TS
186 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
187 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 188 declared_libfuncs_htab
f3dba894 189 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 190
15ab6f00 191 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
9a863523 192 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
d88cd9c4 193 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
f3552158 194
15ab6f00 195 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 196 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 197 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d
AM
198
199 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
200 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
201 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
202
203 if (TARGET_GOMP)
204 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
205}
206
738f2522
BS
207/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
208 deal with ptx ideosyncracies. */
209
210const char *
211nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
212{
213 switch (mode)
214 {
215 case BLKmode:
216 return ".b8";
217 case BImode:
218 return ".pred";
219 case QImode:
220 if (promote)
221 return ".u32";
222 else
223 return ".u8";
224 case HImode:
225 return ".u16";
226 case SImode:
227 return ".u32";
228 case DImode:
229 return ".u64";
230
231 case SFmode:
232 return ".f32";
233 case DFmode:
234 return ".f64";
235
236 default:
237 gcc_unreachable ();
238 }
239}
240
9a863523
NS
241/* Encode the PTX data area that DECL (which might not actually be a
242 _DECL) should reside in. */
7b8edc29 243
9a863523
NS
244static void
245nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 246{
9a863523
NS
247 default_encode_section_info (decl, rtl, first);
248 if (first && MEM_P (rtl))
249 {
250 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 251
9a863523
NS
252 if (TREE_CONSTANT (decl))
253 area = DATA_AREA_CONST;
254 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
255 {
256 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
257 {
258 area = DATA_AREA_SHARED;
259 if (DECL_INITIAL (decl))
260 error ("static initialization of variable %q+D in %<.shared%>"
261 " memory is not supported", decl);
262 }
263 else
264 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
265 }
7b8edc29 266
9a863523
NS
267 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
268 }
269}
270
271/* Return the PTX name of the data area in which SYM should be
272 placed. The symbol must have already been processed by
273 nvptx_encode_seciton_info, or equivalent. */
274
275static const char *
276section_for_sym (rtx sym)
277{
278 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
279 /* Same order as nvptx_data_area enum. */
280 static char const *const areas[] =
281 {"", ".global", ".shared", ".local", ".const", ".param"};
282
283 return areas[area];
284}
285
286/* Similarly for a decl. */
287
288static const char *
289section_for_decl (const_tree decl)
290{
291 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
292}
293
b4346107
NS
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  Any other NAME is returned unchanged (the very
   same pointer).  */

static const char *
nvptx_name_replacement (const char *name)
{
  /* Each row is { original name, replacement }.  */
  static const char *const renames[][2] =
    {
      { "call", "__nvptx_call" },
      { "malloc", "__nvptx_malloc" },
      { "free", "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (unsigned ix = 0; ix < sizeof renames / sizeof renames[0]; ix++)
    if (strcmp (name, renames[ix][0]) == 0)
      return renames[ix][1];

  return name;
}
314
d7479262
NS
315/* If MODE should be treated as two registers of an inner mode, return
316 that inner mode. Otherwise return VOIDmode. */
738f2522 317
d7479262
NS
318static machine_mode
319maybe_split_mode (machine_mode mode)
738f2522 320{
738f2522 321 if (COMPLEX_MODE_P (mode))
d7479262 322 return GET_MODE_INNER (mode);
738f2522 323
738f2522 324 if (mode == TImode)
d7479262
NS
325 return DImode;
326
327 return VOIDmode;
738f2522
BS
328}
329
f313d112
NS
330/* Output a register, subreg, or register pair (with optional
331 enclosing braces). */
332
333static void
334output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
335 int subreg_offset = -1)
336{
337 if (inner_mode == VOIDmode)
338 {
339 if (HARD_REGISTER_NUM_P (regno))
340 fprintf (file, "%s", reg_names[regno]);
341 else
342 fprintf (file, "%%r%d", regno);
343 }
344 else if (subreg_offset >= 0)
345 {
346 output_reg (file, regno, VOIDmode);
347 fprintf (file, "$%d", subreg_offset);
348 }
349 else
350 {
351 if (subreg_offset == -1)
352 fprintf (file, "{");
353 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
354 fprintf (file, ",");
355 output_reg (file, regno, inner_mode, 0);
356 if (subreg_offset == -1)
357 fprintf (file, "}");
358 }
359}
360
d88cd9c4
NS
361/* Emit forking instructions for MASK. */
362
363static void
364nvptx_emit_forking (unsigned mask, bool is_call)
365{
366 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
367 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
368 if (mask)
369 {
370 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
371
372 /* Emit fork at all levels. This helps form SESE regions, as
373 it creates a block with a single successor before entering a
374 partitooned region. That is a good candidate for the end of
375 an SESE region. */
376 if (!is_call)
377 emit_insn (gen_nvptx_fork (op));
378 emit_insn (gen_nvptx_forked (op));
379 }
380}
381
382/* Emit joining instructions for MASK. */
383
384static void
385nvptx_emit_joining (unsigned mask, bool is_call)
386{
387 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
388 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
389 if (mask)
390 {
391 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
392
393 /* Emit joining for all non-call pars to ensure there's a single
394 predecessor for the block the join insn ends up in. This is
395 needed for skipping entire loops. */
396 if (!is_call)
397 emit_insn (gen_nvptx_joining (op));
398 emit_insn (gen_nvptx_join (op));
399 }
400}
401
738f2522 402\f
44eba92d
NS
403/* Determine whether MODE and TYPE (possibly NULL) should be passed or
404 returned in memory. Integer and floating types supported by the
405 machine are passed in registers, everything else is passed in
406 memory. Complex types are split. */
407
408static bool
409pass_in_memory (machine_mode mode, const_tree type, bool for_return)
410{
411 if (type)
412 {
413 if (AGGREGATE_TYPE_P (type))
414 return true;
415 if (TREE_CODE (type) == VECTOR_TYPE)
416 return true;
417 }
418
419 if (!for_return && COMPLEX_MODE_P (mode))
420 /* Complex types are passed as two underlying args. */
421 mode = GET_MODE_INNER (mode);
422
423 if (GET_MODE_CLASS (mode) != MODE_INT
424 && GET_MODE_CLASS (mode) != MODE_FLOAT)
425 return true;
426
427 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
428 return true;
429
430 return false;
431}
432
433/* A non-memory argument of mode MODE is being passed, determine the mode it
434 should be promoted to. This is also used for determining return
435 type promotion. */
436
437static machine_mode
438promote_arg (machine_mode mode, bool prototyped)
439{
440 if (!prototyped && mode == SFmode)
441 /* K&R float promotion for unprototyped functions. */
442 mode = DFmode;
443 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
444 mode = SImode;
445
446 return mode;
447}
448
449/* A non-memory return type of MODE is being returned. Determine the
450 mode it should be promoted to. */
451
452static machine_mode
453promote_return (machine_mode mode)
454{
455 return promote_arg (mode, true);
456}
457
e74f7152 458/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 459
e74f7152 460static rtx
b49e35a9 461nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
e74f7152
NS
462 const_tree, bool named)
463{
b49e35a9 464 if (mode == VOIDmode || !named)
e74f7152 465 return NULL_RTX;
738f2522 466
b49e35a9 467 return gen_reg_rtx (mode);
e74f7152
NS
468}
469
470/* Implement TARGET_FUNCTION_INCOMING_ARG. */
471
472static rtx
473nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
474 const_tree, bool named)
738f2522 475{
e74f7152 476 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 477
b49e35a9 478 if (mode == VOIDmode || !named)
e74f7152 479 return NULL_RTX;
738f2522 480
e74f7152
NS
481 /* No need to deal with split modes here, the only case that can
482 happen is complex modes and those are dealt with by
483 TARGET_SPLIT_COMPLEX_ARG. */
484 return gen_rtx_UNSPEC (mode,
485 gen_rtvec (1, GEN_INT (cum->count)),
486 UNSPEC_ARG_REG);
487}
488
489/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
490
491static void
492nvptx_function_arg_advance (cumulative_args_t cum_v,
493 machine_mode ARG_UNUSED (mode),
494 const_tree ARG_UNUSED (type),
495 bool ARG_UNUSED (named))
496{
497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 498
e74f7152
NS
499 cum->count++;
500}
501
1988b2c2
NS
502/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
503
504 For nvptx This is only used for varadic args. The type has already
505 been promoted and/or converted to invisible reference. */
506
507static unsigned
508nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
509{
510 return GET_MODE_ALIGNMENT (mode);
511}
512
e74f7152
NS
513/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
514
515 For nvptx, we know how to handle functions declared as stdarg: by
516 passing an extra pointer to the unnamed arguments. However, the
517 Fortran frontend can produce a different situation, where a
518 function pointer is declared with no arguments, but the actual
519 function and calls to it take more arguments. In that case, we
520 want to ensure the call matches the definition of the function. */
521
522static bool
523nvptx_strict_argument_naming (cumulative_args_t cum_v)
524{
525 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 526
e74f7152
NS
527 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
528}
529
e74f7152
NS
530/* Implement TARGET_LIBCALL_VALUE. */
531
532static rtx
533nvptx_libcall_value (machine_mode mode, const_rtx)
534{
e91cacae 535 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
536 /* Pretend to return in a hard reg for early uses before pseudos can be
537 generated. */
538 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 539
e74f7152
NS
540 return gen_reg_rtx (mode);
541}
542
543/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
544 where function FUNC returns or receives a value of data type TYPE. */
545
546static rtx
44eba92d 547nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
548 bool outgoing)
549{
44eba92d
NS
550 machine_mode mode = promote_return (TYPE_MODE (type));
551
e74f7152 552 if (outgoing)
5c036f3f 553 {
e91cacae 554 gcc_assert (cfun);
44c068ae 555 cfun->machine->return_mode = mode;
5c036f3f
NS
556 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
557 }
b49e35a9
NS
558
559 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
560}
561
562/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
563
564static bool
565nvptx_function_value_regno_p (const unsigned int regno)
566{
567 return regno == NVPTX_RETURN_REGNUM;
568}
569
570/* Types with a mode other than those supported by the machine are passed by
571 reference in memory. */
572
573static bool
5563d5c0
NS
574nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
575 machine_mode mode, const_tree type,
576 bool ARG_UNUSED (named))
e74f7152 577{
44eba92d 578 return pass_in_memory (mode, type, false);
e74f7152
NS
579}
580
581/* Implement TARGET_RETURN_IN_MEMORY. */
582
583static bool
584nvptx_return_in_memory (const_tree type, const_tree)
585{
44eba92d 586 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
587}
588
589/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
590
591static machine_mode
592nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 593 int *ARG_UNUSED (punsignedp),
e74f7152
NS
594 const_tree funtype, int for_return)
595{
44eba92d 596 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
597}
598
e74f7152
NS
599/* Helper for write_arg. Emit a single PTX argument of MODE, either
600 in a prototype, or as copy in a function prologue. ARGNO is the
601 index of this argument in the PTX function. FOR_REG is negative,
602 if we're emitting the PTX prototype. It is zero if we're copying
603 to an argument register and it is greater than zero if we're
604 copying to a specific hard register. */
605
606static int
1f065954
NS
607write_arg_mode (std::stringstream &s, int for_reg, int argno,
608 machine_mode mode)
e74f7152
NS
609{
610 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
611
dc3d2aeb
NS
612 if (for_reg < 0)
613 {
614 /* Writing PTX prototype. */
615 s << (argno ? ", " : " (");
e74f7152 616 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
617 }
618 else
619 {
e74f7152 620 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
621 if (for_reg)
622 s << reg_names[for_reg];
623 else
624 s << "%ar" << argno;
625 s << ";\n";
5563d5c0
NS
626 if (argno >= 0)
627 {
628 s << "\tld.param" << ptx_type << " ";
629 if (for_reg)
630 s << reg_names[for_reg];
631 else
632 s << "%ar" << argno;
633 s << ", [%in_ar" << argno << "];\n";
634 }
dc3d2aeb
NS
635 }
636 return argno + 1;
738f2522
BS
637}
638
e74f7152 639/* Process function parameter TYPE to emit one or more PTX
1f065954 640 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
641 is true, if this is a prototyped function, rather than an old-style
642 C declaration. Returns the next argument number to use.
e74f7152 643
9c582551 644 The promotion behavior here must match the regular GCC function
e74f7152
NS
645 parameter marshalling machinery. */
646
647static int
1f065954
NS
648write_arg_type (std::stringstream &s, int for_reg, int argno,
649 tree type, bool prototyped)
e74f7152
NS
650{
651 machine_mode mode = TYPE_MODE (type);
652
653 if (mode == VOIDmode)
654 return argno;
655
44eba92d 656 if (pass_in_memory (mode, type, false))
e74f7152 657 mode = Pmode;
44eba92d
NS
658 else
659 {
660 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 661
44eba92d
NS
662 if (split)
663 {
664 /* Complex types are sent as two separate args. */
665 type = TREE_TYPE (type);
5563d5c0 666 mode = TYPE_MODE (type);
44eba92d
NS
667 prototyped = true;
668 }
e74f7152 669
44eba92d
NS
670 mode = promote_arg (mode, prototyped);
671 if (split)
1f065954 672 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 673 }
e74f7152 674
1f065954
NS
675 return write_arg_mode (s, for_reg, argno, mode);
676}
677
678/* Emit a PTX return as a prototype or function prologue declaration
679 for MODE. */
680
681static void
682write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
683{
684 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
685 const char *pfx = "\t.reg";
686 const char *sfx = ";\n";
687
688 if (for_proto)
689 pfx = "(.param", sfx = "_out) ";
690
691 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
692}
693
44eba92d 694/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 695 or function prologue declaration. Returns true if return is via an
9c582551 696 additional pointer parameter. The promotion behavior here must
1f065954 697 match the regular GCC function return mashalling. */
44eba92d 698
0beb7c71 699static bool
1f065954 700write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
701{
702 machine_mode mode = TYPE_MODE (type);
0beb7c71 703
44eba92d
NS
704 if (mode == VOIDmode)
705 return false;
706
707 bool return_in_mem = pass_in_memory (mode, type, true);
708
709 if (return_in_mem)
0beb7c71 710 {
44eba92d
NS
711 if (for_proto)
712 return return_in_mem;
713
714 /* Named return values can cause us to return a pointer as well
715 as expect an argument for the return location. This is
716 optimization-level specific, so no caller can make use of
717 this data, but more importantly for us, we must ensure it
718 doesn't change the PTX prototype. */
44c068ae 719 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 720
44eba92d
NS
721 if (mode == VOIDmode)
722 return return_in_mem;
723
44c068ae 724 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 725 retval parameter. */
44c068ae 726 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
727 }
728 else
44eba92d
NS
729 mode = promote_return (mode);
730
1f065954 731 write_return_mode (s, for_proto, mode);
0beb7c71
NS
732
733 return return_in_mem;
734}
735
738f2522
BS
736/* Look for attributes in ATTRS that would indicate we must write a function
737 as a .entry kernel rather than a .func. Return true if one is found. */
738
739static bool
740write_as_kernel (tree attrs)
741{
742 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
743 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
744 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
745 /* For OpenMP target regions, the corresponding kernel entry is emitted from
746 write_omp_entry as a separate function. */
738f2522
BS
747}
748
69823d76
NS
/* Emit a linker marker for a function decl or defn to S.  The marker
   is a comment line of the form
   "// BEGIN [GLOBAL] FUNCTION {DEF|DECL}: NAME".  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN"
    << (globalize ? " GLOBAL" : "")
    << " FUNCTION "
    << (is_defn ? "DEF: " : "DECL: ")
    << name << "\n";
}
761
762/* Emit a linker marker for a variable decl or defn. */
763
764static void
765write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
766{
767 fprintf (file, "\n// BEGIN%s VAR %s: ",
768 globalize ? " GLOBAL" : "",
769 is_defn ? "DEF" : "DECL");
770 assemble_name_raw (file, name);
771 fputs ("\n", file);
772}
773
b699adcc
NS
774/* Write a .func or .kernel declaration or definition along with
775 a helper comment for use by ld. S is the stream to write to, DECL
776 the decl for the function with name NAME. For definitions, emit
777 a declaration too. */
738f2522 778
b699adcc
NS
779static const char *
780write_fn_proto (std::stringstream &s, bool is_defn,
781 const char *name, const_tree decl)
738f2522 782{
b699adcc
NS
783 if (is_defn)
784 /* Emit a declaration. The PTX assembler gets upset without it. */
785 name = write_fn_proto (s, false, name, decl);
b4346107
NS
786 else
787 {
788 /* Avoid repeating the name replacement. */
789 name = nvptx_name_replacement (name);
790 if (name[0] == '*')
791 name++;
792 }
738f2522 793
69823d76 794 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
795
796 /* PTX declaration. */
738f2522
BS
797 if (DECL_EXTERNAL (decl))
798 s << ".extern ";
799 else if (TREE_PUBLIC (decl))
0766660b 800 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 801 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 802
b699adcc
NS
803 tree fntype = TREE_TYPE (decl);
804 tree result_type = TREE_TYPE (fntype);
738f2522 805
ac3d02e2
NS
806 /* atomic_compare_exchange_$n builtins have an exceptional calling
807 convention. */
808 int not_atomic_weak_arg = -1;
809 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
810 switch (DECL_FUNCTION_CODE (decl))
811 {
812 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
813 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
814 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
815 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
816 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
817 /* These atomics skip the 'weak' parm in an actual library
818 call. We must skip it in the prototype too. */
819 not_atomic_weak_arg = 3;
820 break;
821
822 default:
823 break;
824 }
825
738f2522 826 /* Declare the result. */
1f065954 827 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 828
b699adcc
NS
829 s << name;
830
dc3d2aeb 831 int argno = 0;
b699adcc
NS
832
833 /* Emit argument list. */
834 if (return_in_mem)
1f065954 835 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 836
b699adcc
NS
837 /* We get:
838 NULL in TYPE_ARG_TYPES, for old-style functions
839 NULL in DECL_ARGUMENTS, for builtin functions without another
840 declaration.
841 So we have to pick the best one we have. */
842 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
843 bool prototyped = true;
844 if (!args)
845 {
846 args = DECL_ARGUMENTS (decl);
847 prototyped = false;
848 }
738f2522 849
ac3d02e2 850 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 851 {
dc3d2aeb 852 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
853
854 if (not_atomic_weak_arg)
855 argno = write_arg_type (s, -1, argno, type, prototyped);
856 else
857 gcc_assert (type == boolean_type_node);
738f2522 858 }
738f2522 859
b699adcc 860 if (stdarg_p (fntype))
1f065954 861 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 862
b699adcc 863 if (DECL_STATIC_CHAIN (decl))
1f065954 864 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 865
dc3d2aeb 866 if (!argno && strcmp (name, "main") == 0)
b699adcc 867 {
1f065954
NS
868 argno = write_arg_type (s, -1, argno, integer_type_node, true);
869 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
870 }
871
dc3d2aeb 872 if (argno)
b699adcc
NS
873 s << ")";
874
875 s << (is_defn ? "\n" : ";\n");
876
877 return name;
738f2522
BS
878}
879
00e52418
NS
880/* Construct a function declaration from a call insn. This can be
881 necessary for two reasons - either we have an indirect call which
882 requires a .callprototype declaration, or we have a libcall
883 generated by emit_library_call for which no decl exists. */
884
885static void
b699adcc
NS
886write_fn_proto_from_insn (std::stringstream &s, const char *name,
887 rtx result, rtx pat)
00e52418
NS
888{
889 if (!name)
890 {
891 s << "\t.callprototype ";
892 name = "_";
893 }
894 else
895 {
b4346107 896 name = nvptx_name_replacement (name);
69823d76 897 write_fn_marker (s, false, true, name);
00e52418
NS
898 s << "\t.extern .func ";
899 }
900
901 if (result != NULL_RTX)
1f065954 902 write_return_mode (s, true, GET_MODE (result));
00e52418
NS
903
904 s << name;
905
00e52418
NS
906 int arg_end = XVECLEN (pat, 0);
907 for (int i = 1; i < arg_end; i++)
908 {
1f065954
NS
909 /* We don't have to deal with mode splitting & promotion here,
910 as that was already done when generating the call
911 sequence. */
00e52418
NS
912 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
913
1f065954 914 write_arg_mode (s, -1, i - 1, mode);
00e52418
NS
915 }
916 if (arg_end != 1)
917 s << ")";
918 s << ";\n";
919}
920
00e52418
NS
921/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
922 table and and write a ptx prototype. These are emitted at end of
923 compilation. */
738f2522 924
00e52418
NS
925static void
926nvptx_record_fndecl (tree decl)
738f2522 927{
f3dba894 928 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
738f2522
BS
929 if (*slot == NULL)
930 {
931 *slot = decl;
932 const char *name = get_fnname_from_decl (decl);
b699adcc 933 write_fn_proto (func_decls, false, name, decl);
738f2522 934 }
738f2522
BS
935}
936
00e52418
NS
937/* Record a libcall or unprototyped external function. CALLEE is the
938 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
939 declaration for it. */
940
941static void
942nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
943{
944 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
945 if (*slot == NULL)
946 {
947 *slot = callee;
948
949 const char *name = XSTR (callee, 0);
b699adcc 950 write_fn_proto_from_insn (func_decls, name, retval, pat);
00e52418
NS
951 }
952}
953
954/* DECL is an external FUNCTION_DECL, that we're referencing. If it
955 is prototyped, record it now. Otherwise record it as needed at end
956 of compilation, when we might have more information about it. */
738f2522
BS
957
958void
959nvptx_record_needed_fndecl (tree decl)
960{
00e52418
NS
961 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
962 {
963 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
964 if (*slot == NULL)
965 *slot = decl;
966 }
967 else
968 nvptx_record_fndecl (decl);
969}
738f2522 970
00e52418
NS
971/* SYM is a SYMBOL_REF. If it refers to an external function, record
972 it as needed. */
973
974static void
975nvptx_maybe_record_fnsym (rtx sym)
976{
977 tree decl = SYMBOL_REF_DECL (sym);
978
979 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
980 nvptx_record_needed_fndecl (decl);
738f2522
BS
981}
982
1f065954 983/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
984 and initialize REGNO to point to it. If the size is zero, it'll
985 never be valid to dereference, so we can simply initialize to
986 zero. */
1f065954
NS
987
988static void
989init_frame (FILE *file, int regno, unsigned align, unsigned size)
990{
5563d5c0
NS
991 if (size)
992 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
993 align, reg_names[regno], size);
994 fprintf (file, "\t.reg.u%d %s;\n",
995 POINTER_SIZE, reg_names[regno]);
996 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
997 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
998 POINTER_SIZE, reg_names[regno], reg_names[regno]);
999}
1000
5012919d
AM
1001/* Emit soft stack frame setup sequence. */
1002
1003static void
1004init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1005{
1006 /* Maintain 64-bit stack alignment. */
1007 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1008 size = ROUND_UP (size, keep_align);
1009 int bits = POINTER_SIZE;
1010 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1011 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1012 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1013 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1014 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1015 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1016 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1017 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1018 fprintf (file, "\t{\n");
1019 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1020 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1021 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1022 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1023 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1024 bits == 64 ? ".wide" : ".lo", bits / 8);
1025 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1026
1027 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1028 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1029
1030 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1031 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1032 bits, reg_sspprev, reg_sspslot);
1033
1034 /* Initialize %frame = %sspprev - size. */
1035 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1036 bits, reg_frame, reg_sspprev, size);
1037
1038 /* Apply alignment, if larger than 64. */
1039 if (alignment > keep_align)
1040 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1041 bits, reg_frame, reg_frame, -alignment);
1042
1043 size = crtl->outgoing_args_size;
1044 gcc_assert (size % keep_align == 0);
1045
1046 /* Initialize %stack. */
1047 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1048 bits, reg_stack, reg_frame, size);
1049
1050 /* Usually 'crtl->is_leaf' is computed during register allocator
40829cb8
AM
1051 initialization (which is not done on NVPTX) or for pressure-sensitive
1052 optimizations. Initialize it here, except if already set. */
1053 if (!crtl->is_leaf)
1054 crtl->is_leaf = leaf_function_p ();
5012919d
AM
1055 if (!crtl->is_leaf)
1056 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1057 bits, reg_sspslot, reg_stack);
1058 fprintf (file, "\t}\n");
1059 cfun->machine->has_softstack = true;
1060 need_softstack_decl = true;
1061}
1062
d88cd9c4
NS
/* Emit code to FILE that initializes predicate register %rREGNO to
   indicate whether we are not lane zero on the NAME axis.  NAME is
   used both as the name of a scratch register and as the %tid
   component, so the emitted block reads %tid.NAME and compares it
   against zero.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  /* One call, one scoped block: declare the scratch register, load
     the thread index for this axis, then set the predicate.  */
  fprintf (file,
           "\t{\n"
           "\t\t.reg.u32\t%%%s;\n"
           "\t\tmov.u32\t%%%s, %%tid.%s;\n"
           "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n"
           "\t}\n",
           name,
           name, name,
           regno, name);
}
1075
5012919d
AM
1076/* Emit code to initialize predicate and master lane index registers for
1077 -muniform-simt code generation variant. */
1078
1079static void
1080nvptx_init_unisimt_predicate (FILE *file)
1081{
1082 int bits = POINTER_SIZE;
1083 int master = REGNO (cfun->machine->unisimt_master);
1084 int pred = REGNO (cfun->machine->unisimt_predicate);
1085 fprintf (file, "\t{\n");
1086 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1087 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1088 fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
1089 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1090 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1091 bits == 64 ? ".wide" : ".lo");
1092 fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
1093 fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
1094 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
1095 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.x;\n");
1096 /* Compute 'master lane index' as 'tid.x & __nvptx_uni[tid.y]'. */
1097 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1098 /* Compute predicate as 'tid.x == master'. */
1099 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1100 fprintf (file, "\t}\n");
1101 need_unisimt_decl = true;
1102}
1103
1104/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1105
1106 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1107 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1108 {
1109 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1110 __nvptx_uni[tid.y] = 0;
1111 gomp_nvptx_main (ORIG, arg);
1112 }
1113 ORIG itself should not be emitted as a PTX .entry function. */
1114
/* Write to FILE a ".visible .entry" kernel called NAME whose body
   calls gomp_nvptx_main with the address of ORIG and the kernel's
   %arg parameter.  Also marks the soft stack and unisimt
   declarations as needed.  */
1115static void
1116write_omp_entry (FILE *file, const char *name, const char *orig)
1117{
/* Emit the extern declaration of gomp_nvptx_main into func_decls only
   the first time this function is called.  */
1118 static bool gomp_nvptx_main_declared;
1119 if (!gomp_nvptx_main_declared)
1120 {
1121 gomp_nvptx_main_declared = true;
1122 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1123 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1124 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1125 }
/* The template is instantiated once per pointer size.  PS is the
   pointer size in bits, PS_BYTES the same in bytes and MAD_PS_32 the
   multiply-add instruction taking 32-bit sources.  It contains an
   embedded "\0" at the place where the name of ORIG must appear, so
   each instantiation is really two consecutive C strings.  */
1126#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1127 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1128{\n\
1129 .reg.u32 %r<3>;\n\
1130 .reg.u" PS " %R<4>;\n\
1131 mov.u32 %r0, %tid.y;\n\
1132 mov.u32 %r1, %ntid.y;\n\
1133 mov.u32 %r2, %ctaid.x;\n\
1134 cvt.u" PS ".u32 %R1, %r0;\n\
1135 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1136 mov.u" PS " %R0, __nvptx_stacks;\n\
1137 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1138 ld.param.u" PS " %R2, [%stack];\n\
1139 ld.param.u" PS " %R3, [%sz];\n\
1140 add.u" PS " %R2, %R2, %R3;\n\
1141 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1142 st.shared.u" PS " [%R0], %R2;\n\
1143 mov.u" PS " %R0, __nvptx_uni;\n\
1144 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1145 mov.u32 %r0, 0;\n\
1146 st.shared.u32 [%R0], %r0;\n\
1147 mov.u" PS " %R0, \0;\n\
1148 ld.param.u" PS " %R1, [%arg];\n\
1149 {\n\
1150 .param.u" PS " %P<2>;\n\
1151 st.param.u" PS " [%P0], %R0;\n\
1152 st.param.u" PS " [%P1], %R1;\n\
1153 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1154 }\n\
1155 ret.uni;\n\
1156}\n"
/* The 32-bit MAD_PS_32 argument carries a trailing space, which makes
   it as long as the 64-bit one.  */
1157 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1158 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1159#undef ENTRY_TEMPLATE
1160 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1161 /* Position ENTRY_2 after the embedded nul using strlen of the prefix.
   NOTE(review): strlen is taken of entry64 even when entry32 is
   selected; this is only right while both instantiations have
   prefixes of equal length, which the equal-length template
   arguments above appear to guarantee -- confirm before editing the
   template.  */
1162 const char *entry_2 = entry_1 + strlen (entry64) + 1;
/* Output order: header, first half of the template, the name of
   ORIG, second half of the template.  */
1163 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1164 need_softstack_decl = need_unisimt_decl = true;
1165}
1166
738f2522
BS
1167/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1168 function, including local var decls and copies from the arguments to
1169 local regs. */
1170
1171void
1172nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1173{
1174 tree fntype = TREE_TYPE (decl);
1175 tree result_type = TREE_TYPE (fntype);
dc3d2aeb 1176 int argno = 0;
738f2522 1177
5012919d
AM
1178 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1179 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1180 {
1181 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1182 sprintf (buf, "%s$impl", name);
1183 write_omp_entry (file, name, buf);
1184 name = buf;
1185 }
dc3d2aeb
NS
1186 /* We construct the initial part of the function into a string
1187 stream, in order to share the prototype writing code. */
738f2522 1188 std::stringstream s;
b699adcc 1189 write_fn_proto (s, true, name, decl);
dc3d2aeb 1190 s << "{\n";
738f2522 1191
1f065954 1192 bool return_in_mem = write_return_type (s, false, result_type);
738f2522 1193 if (return_in_mem)
1f065954 1194 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
dc3d2aeb 1195
5ab662d5 1196 /* Declare and initialize incoming arguments. */
dc3d2aeb
NS
1197 tree args = TYPE_ARG_TYPES (fntype);
1198 bool prototyped = true;
1199 if (!args)
5ab662d5 1200 {
dc3d2aeb
NS
1201 args = DECL_ARGUMENTS (decl);
1202 prototyped = false;
5ab662d5
NS
1203 }
1204
1205 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1206 {
1207 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
5ab662d5 1208
1f065954 1209 argno = write_arg_type (s, 0, argno, type, prototyped);
dc3d2aeb 1210 }
5ab662d5 1211
dc3d2aeb 1212 if (stdarg_p (fntype))
5563d5c0 1213 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1f065954 1214 true);
5ab662d5 1215
5563d5c0
NS
1216 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1217 write_arg_type (s, STATIC_CHAIN_REGNUM,
1218 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1219 true);
1220
dc3d2aeb 1221 fprintf (file, "%s", s.str().c_str());
25662751 1222
44c068ae 1223 HOST_WIDE_INT sz = get_frame_size ();
5012919d
AM
1224 bool need_frameptr = sz || cfun->machine->has_chain;
1225 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1226 if (!TARGET_SOFT_STACK)
1227 {
1228 /* Declare a local var for outgoing varargs. */
1229 if (cfun->machine->has_varadic)
1230 init_frame (file, STACK_POINTER_REGNUM,
1231 UNITS_PER_WORD, crtl->outgoing_args_size);
1232
1233 /* Declare a local variable for the frame. Force its size to be
1234 DImode-compatible. */
1235 if (need_frameptr)
1236 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1237 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1238 }
1239 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca)
1240 init_softstack_frame (file, alignment, sz);
44c068ae 1241
738f2522
BS
1242 /* Declare the pseudos we have as ptx registers. */
1243 int maxregs = max_reg_num ();
1244 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1245 {
1246 if (regno_reg_rtx[i] != const0_rtx)
1247 {
1248 machine_mode mode = PSEUDO_REGNO_MODE (i);
d7479262 1249 machine_mode split = maybe_split_mode (mode);
f313d112 1250
d7479262 1251 if (split != VOIDmode)
f313d112
NS
1252 mode = split;
1253 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1254 output_reg (file, i, split, -2);
1255 fprintf (file, ";\n");
738f2522
BS
1256 }
1257 }
1258
d88cd9c4
NS
1259 /* Emit axis predicates. */
1260 if (cfun->machine->axis_predicate[0])
1261 nvptx_init_axis_predicate (file,
1262 REGNO (cfun->machine->axis_predicate[0]), "y");
1263 if (cfun->machine->axis_predicate[1])
1264 nvptx_init_axis_predicate (file,
1265 REGNO (cfun->machine->axis_predicate[1]), "x");
5012919d
AM
1266 if (cfun->machine->unisimt_predicate)
1267 nvptx_init_unisimt_predicate (file);
738f2522
BS
1268}
1269
5012919d
AM
1270/* Output instruction that sets soft stack pointer in shared memory to the
1271 value in register given by SRC_REGNO. */
1272
1273const char *
1274nvptx_output_set_softstack (unsigned src_regno)
1275{
1276 if (cfun->machine->has_softstack && !crtl->is_leaf)
1277 {
1278 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1279 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1280 output_reg (asm_out_file, src_regno, VOIDmode);
1281 fprintf (asm_out_file, ";\n");
1282 }
1283 return "";
1284}
738f2522
BS
1285/* Output a return instruction. Also copy the return value to its outgoing
1286 location. */
1287
1288const char *
1289nvptx_output_return (void)
1290{
44c068ae 1291 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1292
1293 if (mode != VOIDmode)
1f065954
NS
1294 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1295 nvptx_ptx_type_from_mode (mode, false),
1296 reg_names[NVPTX_RETURN_REGNUM],
1297 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1298
1299 return "ret;";
1300}
1301
738f2522
BS
/* Terminate a function by writing a closing brace and a newline to
   FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1309\f
1310/* Decide whether we can make a sibling call to a function.  Both
   arguments are ignored: for ptx we never can, so this always
   returns false.  */
1312
1313static bool
1314nvptx_function_ok_for_sibcall (tree, tree)
1315{
1316 return false;
1317}
1318
18c05628
NS
1319/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1320
1321static rtx
1322nvptx_get_drap_rtx (void)
1323{
5012919d
AM
1324 if (TARGET_SOFT_STACK && stack_realign_drap)
1325 return arg_pointer_rtx;
18c05628
NS
1326 return NULL_RTX;
1327}
1328
738f2522
BS
1329/* Implement the TARGET_CALL_ARGS hook. Record information about one
1330 argument to the next call. */
1331
1332static void
44c068ae 1333nvptx_call_args (rtx arg, tree fntype)
738f2522 1334{
44c068ae 1335 if (!cfun->machine->doing_call)
738f2522 1336 {
44c068ae
NS
1337 cfun->machine->doing_call = true;
1338 cfun->machine->is_varadic = false;
1339 cfun->machine->num_args = 0;
1340
1341 if (fntype && stdarg_p (fntype))
1342 {
1343 cfun->machine->is_varadic = true;
1344 cfun->machine->has_varadic = true;
1345 cfun->machine->num_args++;
1346 }
738f2522 1347 }
738f2522 1348
44c068ae
NS
1349 if (REG_P (arg) && arg != pc_rtx)
1350 {
1351 cfun->machine->num_args++;
1352 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1353 cfun->machine->call_args);
1354 }
738f2522
BS
1355}
1356
1357/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1358 information we recorded. */
1359
1360static void
1361nvptx_end_call_args (void)
1362{
44c068ae 1363 cfun->machine->doing_call = false;
738f2522
BS
1364 free_EXPR_LIST_list (&cfun->machine->call_args);
1365}
1366
ecf6e535
BS
1367/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1368 track of whether calls involving static chains or varargs were seen
1369 in the current function.
1370 For libcalls, maintain a hash table of decls we have seen, and
1371 record a function decl for later when encountering a new one. */
738f2522
BS
1372
1373void
1374nvptx_expand_call (rtx retval, rtx address)
1375{
738f2522 1376 rtx callee = XEXP (address, 0);
f324806d 1377 rtx varargs = NULL_RTX;
d88cd9c4 1378 unsigned parallel = 0;
738f2522 1379
738f2522
BS
1380 if (!call_insn_operand (callee, Pmode))
1381 {
1382 callee = force_reg (Pmode, callee);
1383 address = change_address (address, QImode, callee);
1384 }
1385
1386 if (GET_CODE (callee) == SYMBOL_REF)
1387 {
1388 tree decl = SYMBOL_REF_DECL (callee);
1389 if (decl != NULL_TREE)
1390 {
738f2522 1391 if (DECL_STATIC_CHAIN (decl))
44c068ae 1392 cfun->machine->has_chain = true;
00e52418 1393
629b3d75 1394 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1395 if (attr)
1396 {
1397 tree dims = TREE_VALUE (attr);
1398
1399 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1400 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1401 {
1402 if (TREE_PURPOSE (dims)
1403 && !integer_zerop (TREE_PURPOSE (dims)))
1404 break;
1405 /* Not on this axis. */
1406 parallel ^= GOMP_DIM_MASK (ix);
1407 dims = TREE_CHAIN (dims);
1408 }
1409 }
738f2522
BS
1410 }
1411 }
c38f0d8c 1412
44c068ae
NS
1413 unsigned nargs = cfun->machine->num_args;
1414 if (cfun->machine->is_varadic)
738f2522 1415 {
f324806d 1416 varargs = gen_reg_rtx (Pmode);
863af9a4 1417 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1418 }
1419
44c068ae
NS
1420 rtvec vec = rtvec_alloc (nargs + 1);
1421 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1422 int vec_pos = 0;
44c068ae
NS
1423
1424 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1425 rtx tmp_retval = retval;
44c068ae 1426 if (retval)
738f2522
BS
1427 {
1428 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1429 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1430 call = gen_rtx_SET (tmp_retval, call);
738f2522 1431 }
44c068ae 1432 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1433
1434 /* Construct the call insn, including a USE for each argument pseudo
1435 register. These will be used when printing the insn. */
1436 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1437 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1438
1439 if (varargs)
cf08c344 1440 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1441
1442 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1443
d88cd9c4 1444 nvptx_emit_forking (parallel, true);
738f2522 1445 emit_call_insn (pat);
d88cd9c4
NS
1446 nvptx_emit_joining (parallel, true);
1447
738f2522
BS
1448 if (tmp_retval != retval)
1449 emit_move_insn (retval, tmp_retval);
1450}
44eba92d 1451
738f2522
BS
1452/* Emit a comparison COMPARE, and return the new test to be used in the
1453 jump. */
1454
1455rtx
1456nvptx_expand_compare (rtx compare)
1457{
1458 rtx pred = gen_reg_rtx (BImode);
1459 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1460 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1461 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1462 return gen_rtx_NE (BImode, pred, const0_rtx);
1463}
1464
d88cd9c4
NS
1465/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1466
1467void
1468nvptx_expand_oacc_fork (unsigned mode)
1469{
1470 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1471}
1472
1473void
1474nvptx_expand_oacc_join (unsigned mode)
1475{
1476 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1477}
1478
1479/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1480 objects. */
1481
1482static rtx
1483nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1484{
1485 rtx res;
1486
1487 switch (GET_MODE (src))
1488 {
1489 case DImode:
1490 res = gen_unpackdisi2 (dst0, dst1, src);
1491 break;
1492 case DFmode:
1493 res = gen_unpackdfsi2 (dst0, dst1, src);
1494 break;
1495 default: gcc_unreachable ();
1496 }
1497 return res;
1498}
1499
1500/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1501 object. */
1502
1503static rtx
1504nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1505{
1506 rtx res;
1507
1508 switch (GET_MODE (dst))
1509 {
1510 case DImode:
1511 res = gen_packsidi2 (dst, src0, src1);
1512 break;
1513 case DFmode:
1514 res = gen_packsidf2 (dst, src0, src1);
1515 break;
1516 default: gcc_unreachable ();
1517 }
1518 return res;
1519}
1520
1521/* Generate an instruction or sequence to broadcast register REG
1522 across the vectors of a single warp. */
1523
5012919d 1524rtx
59263259 1525nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
d88cd9c4
NS
1526{
1527 rtx res;
1528
1529 switch (GET_MODE (dst))
1530 {
1531 case SImode:
1532 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1533 break;
1534 case SFmode:
1535 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1536 break;
1537 case DImode:
1538 case DFmode:
1539 {
1540 rtx tmp0 = gen_reg_rtx (SImode);
1541 rtx tmp1 = gen_reg_rtx (SImode);
1542
1543 start_sequence ();
1544 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1545 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1546 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1547 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1548 res = get_insns ();
1549 end_sequence ();
1550 }
1551 break;
1552 case BImode:
1553 {
1554 rtx tmp = gen_reg_rtx (SImode);
1555
1556 start_sequence ();
1557 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1558 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1559 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1560 res = get_insns ();
1561 end_sequence ();
1562 }
1563 break;
d5ace3b5
CP
1564 case QImode:
1565 case HImode:
1566 {
1567 rtx tmp = gen_reg_rtx (SImode);
1568
1569 start_sequence ();
1570 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1571 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1572 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1573 tmp)));
1574 res = get_insns ();
1575 end_sequence ();
1576 }
1577 break;
d88cd9c4
NS
1578
1579 default:
1580 gcc_unreachable ();
1581 }
1582 return res;
1583}
1584
1585/* Generate an instruction or sequence that shuffles REG in place
   using nvptx_gen_shuffle with lane index 0 and kind SHUFFLE_IDX,
   i.e. broadcasts register REG across the vectors of a single warp.  */
1587
1588static rtx
1589nvptx_gen_vcast (rtx reg)
1590{
1591 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1592}
1593
1594/* Structure used when generating a worker-level spill or fill. */
1595
1596struct wcast_data_t
1597{
1598 rtx base; /* Register holding base addr of buffer. */
1599 rtx ptr; /* Iteration var, if needed. */
1600 unsigned offset; /* Offset into worker buffer. */
1601};
1602
/* Bit flags giving the direction of a spill/fill, and marking the
   setup and teardown of a loop around it.  */

enum propagate_mask
  {
    PM_read = 0x1,
    PM_write = 0x2,
    PM_loop_begin = 0x4,
    PM_loop_end = 0x8,

    /* Both directions at once.  */
    PM_read_write = PM_read | PM_write
  };
1614
1615/* Generate instruction(s) to spill or fill register REG to/from the
1616 worker broadcast array. PM indicates what is to be done, REP
1617 how many loop iterations will be executed (0 for not a loop). */
1618
1619static rtx
1620nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1621{
1622 rtx res;
1623 machine_mode mode = GET_MODE (reg);
1624
1625 switch (mode)
1626 {
1627 case BImode:
1628 {
1629 rtx tmp = gen_reg_rtx (SImode);
1630
1631 start_sequence ();
1632 if (pm & PM_read)
1633 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1634 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1635 if (pm & PM_write)
1636 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1637 res = get_insns ();
1638 end_sequence ();
1639 }
1640 break;
1641
1642 default:
1643 {
1644 rtx addr = data->ptr;
1645
1646 if (!addr)
1647 {
1648 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1649
1650 if (align > worker_bcast_align)
1651 worker_bcast_align = align;
1652 data->offset = (data->offset + align - 1) & ~(align - 1);
1653 addr = data->base;
1654 if (data->offset)
1655 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1656 }
1657
1658 addr = gen_rtx_MEM (mode, addr);
d88cd9c4
NS
1659 if (pm == PM_read)
1660 res = gen_rtx_SET (addr, reg);
1661 else if (pm == PM_write)
1662 res = gen_rtx_SET (reg, addr);
1663 else
1664 gcc_unreachable ();
1665
1666 if (data->ptr)
1667 {
1668 /* We're using a ptr, increment it. */
1669 start_sequence ();
1670
1671 emit_insn (res);
1672 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1673 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1674 res = get_insns ();
1675 end_sequence ();
1676 }
1677 else
1678 rep = 1;
1679 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1680 }
1681 break;
1682 }
1683 return res;
1684}
738f2522
BS
1685\f
1686/* Returns true if X is a valid address for use in a memory reference. */
1687
1688static bool
1689nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1690{
1691 enum rtx_code code = GET_CODE (x);
1692
1693 switch (code)
1694 {
1695 case REG:
1696 return true;
1697
1698 case PLUS:
1699 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1700 return true;
1701 return false;
1702
1703 case CONST:
1704 case SYMBOL_REF:
1705 case LABEL_REF:
1706 return true;
1707
1708 default:
1709 return false;
1710 }
1711}
738f2522 1712\f
4d5438cd
NS
1713/* Machinery to output constant initializers. When beginning an
1714 initializer, we decide on a fragment size (which is visible in ptx
1715 in the type used), and then all initializer data is buffered until
1716 a fragment is filled and ready to be written out. */
1717
1718static struct
1719{
1720 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1721 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1722 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1723 out. */
1724 unsigned size; /* Fragment size to accumulate. */
1725 unsigned offset; /* Offset within current fragment. */
1726 bool started; /* Whether we've output any initializer. */
1727} init_frag;
1728
1729/* The current fragment is full, write it out. SYM may provide a
1730 symbolic reference we should output, in which case the fragment
1731 value is the addend. */
738f2522
BS
1732
1733static void
4d5438cd 1734output_init_frag (rtx sym)
738f2522 1735{
4d5438cd
NS
1736 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1737 unsigned HOST_WIDE_INT val = init_frag.val;
738f2522 1738
4d5438cd
NS
1739 init_frag.started = true;
1740 init_frag.val = 0;
1741 init_frag.offset = 0;
1742 init_frag.remaining--;
1743
1744 if (sym)
1745 {
1746 fprintf (asm_out_file, "generic(");
1747 output_address (VOIDmode, sym);
1748 fprintf (asm_out_file, val ? ") + " : ")");
1749 }
738f2522 1750
4d5438cd
NS
1751 if (!sym || val)
1752 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
738f2522
BS
1753}
1754
4d5438cd
NS
1755/* Add value VAL of size SIZE to the data we're emitting, and keep
1756 writing out chunks as they fill up. */
738f2522
BS
1757
1758static void
4d5438cd 1759nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 1760{
4d5438cd
NS
1761 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1762
1763 for (unsigned part = 0; size; size -= part)
738f2522 1764 {
4d5438cd
NS
1765 val >>= part * BITS_PER_UNIT;
1766 part = init_frag.size - init_frag.offset;
1767 if (part > size)
1768 part = size;
1769
1770 unsigned HOST_WIDE_INT partial
1771 = val << (init_frag.offset * BITS_PER_UNIT);
1772 init_frag.val |= partial & init_frag.mask;
1773 init_frag.offset += part;
1774
1775 if (init_frag.offset == init_frag.size)
1776 output_init_frag (NULL);
738f2522
BS
1777 }
1778}
1779
1780/* Target hook for assembling integer object X of size SIZE. */
1781
1782static bool
1783nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1784{
00e52418
NS
1785 HOST_WIDE_INT val = 0;
1786
1787 switch (GET_CODE (x))
738f2522 1788 {
00e52418 1789 default:
a9000e1e
NS
1790 /* Let the generic machinery figure it out, usually for a
1791 CONST_WIDE_INT. */
1792 return false;
00e52418
NS
1793
1794 case CONST_INT:
4d5438cd 1795 nvptx_assemble_value (INTVAL (x), size);
00e52418
NS
1796 break;
1797
1798 case CONST:
1799 x = XEXP (x, 0);
1800 gcc_assert (GET_CODE (x) == PLUS);
1801 val = INTVAL (XEXP (x, 1));
1802 x = XEXP (x, 0);
1803 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1804 /* FALLTHROUGH */
1805
1806 case SYMBOL_REF:
4d5438cd
NS
1807 gcc_assert (size == init_frag.size);
1808 if (init_frag.offset)
738f2522 1809 sorry ("cannot emit unaligned pointers in ptx assembly");
738f2522 1810
00e52418 1811 nvptx_maybe_record_fnsym (x);
4d5438cd
NS
1812 init_frag.val = val;
1813 output_init_frag (x);
738f2522 1814 break;
738f2522
BS
1815 }
1816
738f2522
BS
1817 return true;
1818}
1819
1820/* Output SIZE zero bytes. We ignore the FILE argument since the
1821 functions we're calling to perform the output just use
1822 asm_out_file. */
1823
1824void
1825nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1826{
4d5438cd
NS
1827 /* Finish the current fragment, if it's started. */
1828 if (init_frag.offset)
738f2522 1829 {
4d5438cd
NS
1830 unsigned part = init_frag.size - init_frag.offset;
1831 if (part > size)
1832 part = (unsigned) size;
1833 size -= part;
1834 nvptx_assemble_value (0, part);
738f2522
BS
1835 }
1836
4d5438cd
NS
1837 /* If this skip doesn't terminate the initializer, write as many
1838 remaining pieces as possible directly. */
1839 if (size < init_frag.remaining * init_frag.size)
738f2522 1840 {
4d5438cd
NS
1841 while (size >= init_frag.size)
1842 {
1843 size -= init_frag.size;
1844 output_init_frag (NULL_RTX);
1845 }
1846 if (size)
1847 nvptx_assemble_value (0, size);
738f2522 1848 }
738f2522
BS
1849}
1850
1851/* Output a string STR with length SIZE. As in nvptx_output_skip we
1852 ignore the FILE arg. */
1853
1854void
1855nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1856{
1857 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1858 nvptx_assemble_value (str[i], 1);
1859}
1860
4ff3145a
NS
1861/* Emit a PTX variable decl and prepare for emission of its
1862 initializer. NAME is the symbol name and SETION the PTX data
1863 area. The type is TYPE, object size SIZE and alignment is ALIGN.
1864 The caller has already emitted any indentation and linkage
1865 specifier. It is responsible for any initializer, terminating ;
1866 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
1867 this is the opposite way round that PTX wants them! */
1868
1869static void
1870nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
1871 const_tree type, HOST_WIDE_INT size, unsigned align)
1872{
1873 while (TREE_CODE (type) == ARRAY_TYPE)
1874 type = TREE_TYPE (type);
1875
fc0efeea
NS
1876 if (TREE_CODE (type) == VECTOR_TYPE
1877 || TREE_CODE (type) == COMPLEX_TYPE)
1878 /* Neither vector nor complex types can contain the other. */
1879 type = TREE_TYPE (type);
1880
4ff3145a 1881 unsigned elt_size = int_size_in_bytes (type);
fc0efeea
NS
1882
1883 /* Largest mode we're prepared to accept. For BLKmode types we
1884 don't know if it'll contain pointer constants, so have to choose
1885 pointer size, otherwise we can choose DImode. */
1886 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
1887
1888 elt_size |= GET_MODE_SIZE (elt_mode);
1889 elt_size &= -elt_size; /* Extract LSB set. */
4ff3145a 1890
4d5438cd 1891 init_frag.size = elt_size;
9c582551 1892 /* Avoid undefined shift behavior by using '2'. */
4d5438cd
NS
1893 init_frag.mask = ((unsigned HOST_WIDE_INT)2
1894 << (elt_size * BITS_PER_UNIT - 1)) - 1;
1895 init_frag.val = 0;
1896 init_frag.offset = 0;
1897 init_frag.started = false;
1898 /* Size might not be a multiple of elt size, if there's an
1899 initialized trailing struct array with smaller type than
1900 elt_size. */
1901 init_frag.remaining = (size + elt_size - 1) / elt_size;
4ff3145a
NS
1902
1903 fprintf (file, "%s .align %d .u%d ",
1904 section, align / BITS_PER_UNIT,
1905 elt_size * BITS_PER_UNIT);
1906 assemble_name (file, name);
1907
1908 if (size)
1909 /* We make everything an array, to simplify any initialization
1910 emission. */
4d5438cd 1911 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
4ff3145a
NS
1912}
1913
738f2522
BS
1914/* Called when the initializer for a decl has been completely output through
1915 combinations of the three functions above. */
1916
1917static void
1918nvptx_assemble_decl_end (void)
1919{
4d5438cd
NS
1920 if (init_frag.offset)
1921 /* This can happen with a packed struct with trailing array member. */
1922 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
1923 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
1924}
1925
69823d76
NS
1926/* Output an uninitialized common or file-scope variable. */
1927
1928void
1929nvptx_output_aligned_decl (FILE *file, const char *name,
1930 const_tree decl, HOST_WIDE_INT size, unsigned align)
1931{
1932 write_var_marker (file, true, TREE_PUBLIC (decl), name);
1933
1934 /* If this is public, it is common. The nearest thing we have to
1935 common is weak. */
4ff3145a
NS
1936 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
1937
1938 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1939 TREE_TYPE (decl), size, align);
4d5438cd 1940 nvptx_assemble_decl_end ();
69823d76
NS
1941}
1942
738f2522
BS
1943/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1944 writing a constant variable EXP with NAME and SIZE and its
1945 initializer to FILE. */
1946
1947static void
1948nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 1949 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 1950{
4ff3145a
NS
1951 write_var_marker (file, true, false, name);
1952
1953 fprintf (file, "\t");
1954
738f2522 1955 tree type = TREE_TYPE (exp);
4ff3145a
NS
1956 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
1957 TYPE_ALIGN (type));
738f2522
BS
1958}
1959
1960/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
1961 a variable DECL with NAME to FILE. */
1962
1963void
1964nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
1965{
4ff3145a 1966 write_var_marker (file, true, TREE_PUBLIC (decl), name);
9a863523 1967
4ff3145a
NS
1968 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
1969 : DECL_WEAK (decl) ? ".weak " : ".visible "));
9a863523 1970
4ff3145a
NS
1971 tree type = TREE_TYPE (decl);
1972 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
1973 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
1974 type, obj_size, DECL_ALIGN (decl));
738f2522
BS
1975}
1976
/* Implement TARGET_ASM_GLOBALIZE_LABEL.  This hook intentionally emits
   nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
  /* Nothing to do.  */
}
1983
1984/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
1985 declaration only for variable DECL with NAME to FILE. */
f313d112 1986
738f2522
BS
1987static void
1988nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
1989{
0a0f74aa
NS
1990 /* The middle end can place constant pool decls into the varpool as
1991 undefined. Until that is fixed, catch the problem here. */
1992 if (DECL_IN_CONSTANT_POOL (decl))
1993 return;
1994
721547cd
NS
1995 /* We support weak defintions, and hence have the right
1996 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
1997 if (DECL_WEAK (decl))
1998 error_at (DECL_SOURCE_LOCATION (decl),
1999 "PTX does not support weak declarations"
2000 " (only weak definitions)");
69823d76
NS
2001 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2002
4ff3145a
NS
2003 fprintf (file, "\t.extern ");
2004 tree size = DECL_SIZE_UNIT (decl);
2005 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2006 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2007 DECL_ALIGN (decl));
1e5154e7 2008 nvptx_assemble_decl_end ();
738f2522
BS
2009}
2010
f313d112
NS
2011/* Output a pattern for a move instruction. */
2012
2013const char *
2014nvptx_output_mov_insn (rtx dst, rtx src)
2015{
2016 machine_mode dst_mode = GET_MODE (dst);
2017 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2018 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2019 machine_mode src_inner = (GET_CODE (src) == SUBREG
2020 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2021
15113b03
NS
2022 rtx sym = src;
2023 if (GET_CODE (sym) == CONST)
2024 sym = XEXP (XEXP (sym, 0), 0);
bd602b7f
NS
2025 if (SYMBOL_REF_P (sym))
2026 {
2027 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2028 return "%.\tcvta%D1%t0\t%0, %1;";
2029 nvptx_maybe_record_fnsym (sym);
2030 }
15113b03 2031
f313d112
NS
2032 if (src_inner == dst_inner)
2033 return "%.\tmov%t0\t%0, %1;";
2034
2035 if (CONSTANT_P (src))
2036 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2037 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2038 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2039
2040 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2041 return "%.\tmov.b%T0\t%0, %1;";
2042
2043 return "%.\tcvt%t0%t1\t%0, %1;";
2044}
2045
5012919d
AM
2046static void nvptx_print_operand (FILE *, rtx, int);
2047
738f2522 2048/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2049 involves writing .param declarations and in/out copies into them. For
2050 indirect calls, also write the .callprototype. */
738f2522
BS
2051
2052const char *
2053nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2054{
863af9a4 2055 char buf[16];
738f2522
BS
2056 static int labelno;
2057 bool needs_tgt = register_operand (callee, Pmode);
2058 rtx pat = PATTERN (insn);
5012919d
AM
2059 if (GET_CODE (pat) == COND_EXEC)
2060 pat = COND_EXEC_CODE (pat);
f324806d 2061 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2062 tree decl = NULL_TREE;
2063
2064 fprintf (asm_out_file, "\t{\n");
2065 if (result != NULL)
1f065954
NS
2066 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2067 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2068 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2069
ecf6e535 2070 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2071 if (GET_CODE (callee) == SYMBOL_REF)
2072 {
2073 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2074 if (!decl
2075 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2076 nvptx_record_libfunc (callee, result, pat);
2077 else if (DECL_EXTERNAL (decl))
738f2522
BS
2078 nvptx_record_fndecl (decl);
2079 }
2080
2081 if (needs_tgt)
2082 {
2083 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2084 labelno++;
2085 ASM_OUTPUT_LABEL (asm_out_file, buf);
2086 std::stringstream s;
b699adcc 2087 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2088 fputs (s.str().c_str(), asm_out_file);
2089 }
2090
863af9a4 2091 for (int argno = 1; argno < arg_end; argno++)
738f2522 2092 {
863af9a4 2093 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2094 machine_mode mode = GET_MODE (t);
a02d84b6 2095 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2096
863af9a4 2097 /* Mode splitting has already been done. */
a02d84b6
NS
2098 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2099 "\t\tst.param%s [%%out_arg%d], ",
2100 ptx_type, argno, ptx_type, argno);
2101 output_reg (asm_out_file, REGNO (t), VOIDmode);
2102 fprintf (asm_out_file, ";\n");
738f2522
BS
2103 }
2104
5012919d
AM
2105 /* The '.' stands for the call's predicate, if any. */
2106 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2107 fprintf (asm_out_file, "\t\tcall ");
2108 if (result != NULL_RTX)
1f065954
NS
2109 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2110
738f2522
BS
2111 if (decl)
2112 {
2113 const char *name = get_fnname_from_decl (decl);
b4346107 2114 name = nvptx_name_replacement (name);
738f2522
BS
2115 assemble_name (asm_out_file, name);
2116 }
2117 else
cc8ca59e 2118 output_address (VOIDmode, callee);
738f2522 2119
863af9a4
NS
2120 const char *open = "(";
2121 for (int argno = 1; argno < arg_end; argno++)
738f2522 2122 {
863af9a4
NS
2123 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2124 open = "";
738f2522 2125 }
863af9a4
NS
2126 if (decl && DECL_STATIC_CHAIN (decl))
2127 {
5563d5c0 2128 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2129 open = "";
2130 }
2131 if (!open[0])
2132 fprintf (asm_out_file, ")");
f324806d 2133
738f2522
BS
2134 if (needs_tgt)
2135 {
2136 fprintf (asm_out_file, ", ");
2137 assemble_name (asm_out_file, buf);
2138 }
2139 fprintf (asm_out_file, ";\n");
738f2522 2140
51baf85a
NS
2141 if (find_reg_note (insn, REG_NORETURN, NULL))
2142 /* No return functions confuse the PTX JIT, as it doesn't realize
2143 the flow control barrier they imply. It can seg fault if it
2144 encounters what looks like an unexitable loop. Emit a trailing
2145 trap, which it does grok. */
2146 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2147
1f065954
NS
2148 if (result)
2149 {
2150 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2151
2152 if (!rval[0])
2153 /* We must escape the '%' that starts RETURN_REGNUM. */
2154 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2155 reg_names[NVPTX_RETURN_REGNUM]);
2156 return rval;
2157 }
2158
2159 return "}";
738f2522
BS
2160}
2161
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' and '#' are
   accepted as punctuation codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;

    default:
      return false;
    }
}
2169
738f2522
BS
2170/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2171
2172static void
2173nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2174{
2175 rtx off;
2176 if (GET_CODE (x) == CONST)
2177 x = XEXP (x, 0);
2178 switch (GET_CODE (x))
2179 {
2180 case PLUS:
2181 off = XEXP (x, 1);
cc8ca59e 2182 output_address (VOIDmode, XEXP (x, 0));
738f2522 2183 fprintf (file, "+");
cc8ca59e 2184 output_address (VOIDmode, off);
738f2522
BS
2185 break;
2186
2187 case SYMBOL_REF:
2188 case LABEL_REF:
2189 output_addr_const (file, x);
2190 break;
2191
2192 default:
2193 gcc_assert (GET_CODE (x) != MEM);
2194 nvptx_print_operand (file, x, 0);
2195 break;
2196 }
2197}
2198
2199/* Write assembly language output for the address ADDR to FILE. */
2200
2201static void
cc8ca59e 2202nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 2203{
cc8ca59e 2204 nvptx_print_address_operand (file, addr, mode);
738f2522
BS
2205}
2206
2207/* Print an operand, X, to FILE, with an optional modifier in CODE.
2208
2209 Meaning of CODE:
2210 . -- print the predicate for the instruction or an emptry string for an
2211 unconditional one.
2212 # -- print a rounding mode for the instruction
2213
9a863523 2214 A -- print a data area for a MEM
738f2522 2215 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2216 D -- print a data area for a MEM operand
d88cd9c4 2217 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
2218 t -- print a type opcode suffix, promoting QImode to 32 bits
2219 T -- print a type size in bits
2220 u -- print a type opcode suffix without promotions. */
2221
2222static void
2223nvptx_print_operand (FILE *file, rtx x, int code)
2224{
738f2522
BS
2225 if (code == '.')
2226 {
2227 x = current_insn_predicate;
2228 if (x)
2229 {
5012919d 2230 fputs ("@", file);
738f2522
BS
2231 if (GET_CODE (x) == EQ)
2232 fputs ("!", file);
5012919d 2233 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2234 }
2235 return;
2236 }
2237 else if (code == '#')
2238 {
2239 fputs (".rn", file);
2240 return;
2241 }
2242
2243 enum rtx_code x_code = GET_CODE (x);
f313d112 2244 machine_mode mode = GET_MODE (x);
738f2522
BS
2245
2246 switch (code)
2247 {
2248 case 'A':
9a863523
NS
2249 x = XEXP (x, 0);
2250 /* FALLTHROUGH. */
7b8edc29 2251
9a863523
NS
2252 case 'D':
2253 if (GET_CODE (x) == CONST)
2254 x = XEXP (x, 0);
2255 if (GET_CODE (x) == PLUS)
2256 x = XEXP (x, 0);
7b8edc29 2257
9a863523
NS
2258 if (GET_CODE (x) == SYMBOL_REF)
2259 fputs (section_for_sym (x), file);
738f2522
BS
2260 break;
2261
738f2522 2262 case 't':
738f2522 2263 case 'u':
f313d112
NS
2264 if (x_code == SUBREG)
2265 {
2266 mode = GET_MODE (SUBREG_REG (x));
2267 if (mode == TImode)
2268 mode = DImode;
2269 else if (COMPLEX_MODE_P (mode))
2270 mode = GET_MODE_INNER (mode);
2271 }
2272 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2273 break;
2274
d88cd9c4
NS
2275 case 'S':
2276 {
59263259
NS
2277 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2278 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2279 static const char *const kinds[] =
59263259
NS
2280 {".up", ".down", ".bfly", ".idx"};
2281 fputs (kinds[kind], file);
d88cd9c4
NS
2282 }
2283 break;
2284
738f2522 2285 case 'T':
f313d112 2286 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2287 break;
2288
2289 case 'j':
2290 fprintf (file, "@");
2291 goto common;
2292
2293 case 'J':
2294 fprintf (file, "@!");
2295 goto common;
2296
2297 case 'c':
f313d112 2298 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2299 switch (x_code)
2300 {
2301 case EQ:
2302 fputs (".eq", file);
2303 break;
2304 case NE:
f313d112 2305 if (FLOAT_MODE_P (mode))
738f2522
BS
2306 fputs (".neu", file);
2307 else
2308 fputs (".ne", file);
2309 break;
2310 case LE:
578fb225 2311 case LEU:
738f2522
BS
2312 fputs (".le", file);
2313 break;
2314 case GE:
578fb225 2315 case GEU:
738f2522
BS
2316 fputs (".ge", file);
2317 break;
2318 case LT:
578fb225 2319 case LTU:
738f2522
BS
2320 fputs (".lt", file);
2321 break;
2322 case GT:
738f2522 2323 case GTU:
578fb225 2324 fputs (".gt", file);
738f2522
BS
2325 break;
2326 case LTGT:
2327 fputs (".ne", file);
2328 break;
2329 case UNEQ:
2330 fputs (".equ", file);
2331 break;
2332 case UNLE:
2333 fputs (".leu", file);
2334 break;
2335 case UNGE:
2336 fputs (".geu", file);
2337 break;
2338 case UNLT:
2339 fputs (".ltu", file);
2340 break;
2341 case UNGT:
2342 fputs (".gtu", file);
2343 break;
2344 case UNORDERED:
2345 fputs (".nan", file);
2346 break;
2347 case ORDERED:
2348 fputs (".num", file);
2349 break;
2350 default:
2351 gcc_unreachable ();
2352 }
f313d112 2353 if (FLOAT_MODE_P (mode)
738f2522
BS
2354 || x_code == EQ || x_code == NE
2355 || x_code == GEU || x_code == GTU
2356 || x_code == LEU || x_code == LTU)
f313d112 2357 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2358 else
f313d112 2359 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2360 break;
2361 default:
2362 common:
2363 switch (x_code)
2364 {
2365 case SUBREG:
f313d112
NS
2366 {
2367 rtx inner_x = SUBREG_REG (x);
2368 machine_mode inner_mode = GET_MODE (inner_x);
2369 machine_mode split = maybe_split_mode (inner_mode);
2370
2371 if (split != VOIDmode
2372 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2373 output_reg (file, REGNO (inner_x), split);
2374 else
2375 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2376 }
2377 break;
738f2522
BS
2378
2379 case REG:
f313d112 2380 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2381 break;
2382
2383 case MEM:
2384 fputc ('[', file);
f313d112 2385 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2386 fputc (']', file);
2387 break;
2388
2389 case CONST_INT:
2390 output_addr_const (file, x);
2391 break;
2392
2393 case CONST:
2394 case SYMBOL_REF:
2395 case LABEL_REF:
2396 /* We could use output_addr_const, but that can print things like
2397 "x-8", which breaks ptxas. Need to ensure it is output as
2398 "x+-8". */
2399 nvptx_print_address_operand (file, x, VOIDmode);
2400 break;
2401
2402 case CONST_DOUBLE:
2403 long vals[2];
f313d112 2404 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2405 vals[0] &= 0xffffffff;
2406 vals[1] &= 0xffffffff;
f313d112 2407 if (mode == SFmode)
738f2522
BS
2408 fprintf (file, "0f%08lx", vals[0]);
2409 else
2410 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2411 break;
2412
2413 default:
2414 output_addr_const (file, x);
2415 }
2416 }
2417}
2418\f
2419/* Record replacement regs used to deal with subreg operands. */
2420struct reg_replace
2421{
2422 rtx replacement[MAX_RECOG_OPERANDS];
2423 machine_mode mode;
2424 int n_allocated;
2425 int n_in_use;
2426};
2427
2428/* Allocate or reuse a replacement in R and return the rtx. */
2429
2430static rtx
2431get_replacement (struct reg_replace *r)
2432{
2433 if (r->n_allocated == r->n_in_use)
2434 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2435 return r->replacement[r->n_in_use++];
2436}
2437
2438/* Clean up subreg operands. In ptx assembly, everything is typed, and
2439 the presence of subregs would break the rules for most instructions.
2440 Replace them with a suitable new register of the right size, plus
2441 conversion copyin/copyout instructions. */
2442
2443static void
517665b3 2444nvptx_reorg_subreg (void)
738f2522
BS
2445{
2446 struct reg_replace qiregs, hiregs, siregs, diregs;
2447 rtx_insn *insn, *next;
2448
738f2522
BS
2449 qiregs.n_allocated = 0;
2450 hiregs.n_allocated = 0;
2451 siregs.n_allocated = 0;
2452 diregs.n_allocated = 0;
2453 qiregs.mode = QImode;
2454 hiregs.mode = HImode;
2455 siregs.mode = SImode;
2456 diregs.mode = DImode;
2457
2458 for (insn = get_insns (); insn; insn = next)
2459 {
2460 next = NEXT_INSN (insn);
2461 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2462 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2463 || GET_CODE (PATTERN (insn)) == USE
2464 || GET_CODE (PATTERN (insn)) == CLOBBER)
2465 continue;
f324806d 2466
738f2522
BS
2467 qiregs.n_in_use = 0;
2468 hiregs.n_in_use = 0;
2469 siregs.n_in_use = 0;
2470 diregs.n_in_use = 0;
2471 extract_insn (insn);
2472 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2473
738f2522
BS
2474 for (int i = 0; i < recog_data.n_operands; i++)
2475 {
2476 rtx op = recog_data.operand[i];
2477 if (GET_CODE (op) != SUBREG)
2478 continue;
2479
2480 rtx inner = SUBREG_REG (op);
2481
2482 machine_mode outer_mode = GET_MODE (op);
2483 machine_mode inner_mode = GET_MODE (inner);
2484 gcc_assert (s_ok);
2485 if (s_ok
2486 && (GET_MODE_PRECISION (inner_mode)
2487 >= GET_MODE_PRECISION (outer_mode)))
2488 continue;
2489 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2490 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2491 : outer_mode == HImode ? &hiregs
2492 : outer_mode == SImode ? &siregs
2493 : &diregs);
2494 rtx new_reg = get_replacement (r);
2495
2496 if (recog_data.operand_type[i] != OP_OUT)
2497 {
2498 enum rtx_code code;
2499 if (GET_MODE_PRECISION (inner_mode)
2500 < GET_MODE_PRECISION (outer_mode))
2501 code = ZERO_EXTEND;
2502 else
2503 code = TRUNCATE;
2504
f7df4a84 2505 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2506 gen_rtx_fmt_e (code, outer_mode, inner));
2507 emit_insn_before (pat, insn);
2508 }
2509
2510 if (recog_data.operand_type[i] != OP_IN)
2511 {
2512 enum rtx_code code;
2513 if (GET_MODE_PRECISION (inner_mode)
2514 < GET_MODE_PRECISION (outer_mode))
2515 code = TRUNCATE;
2516 else
2517 code = ZERO_EXTEND;
2518
f7df4a84 2519 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2520 gen_rtx_fmt_e (code, inner_mode, new_reg));
2521 emit_insn_after (pat, insn);
2522 }
2523 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2524 }
2525 }
517665b3 2526}
738f2522 2527
5012919d
AM
2528/* Return a SImode "master lane index" register for uniform-simt, allocating on
2529 first use. */
2530
2531static rtx
2532nvptx_get_unisimt_master ()
2533{
2534 rtx &master = cfun->machine->unisimt_master;
2535 return master ? master : master = gen_reg_rtx (SImode);
2536}
2537
2538/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2539
2540static rtx
2541nvptx_get_unisimt_predicate ()
2542{
2543 rtx &pred = cfun->machine->unisimt_predicate;
2544 return pred ? pred : pred = gen_reg_rtx (BImode);
2545}
2546
2547/* Return true if given call insn references one of the functions provided by
2548 the CUDA runtime: malloc, free, vprintf. */
2549
2550static bool
2551nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2552{
2553 rtx pat = PATTERN (insn);
2554 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2555 pat = XVECEXP (pat, 0, 0);
2556 if (GET_CODE (pat) == SET)
2557 pat = SET_SRC (pat);
2558 gcc_checking_assert (GET_CODE (pat) == CALL
2559 && GET_CODE (XEXP (pat, 0)) == MEM);
2560 rtx addr = XEXP (XEXP (pat, 0), 0);
2561 if (GET_CODE (addr) != SYMBOL_REF)
2562 return false;
2563 const char *name = XSTR (addr, 0);
2564 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2565 references with forced assembler name refer to PTX syscalls. For vprintf,
2566 accept both normal and forced-assembler-name references. */
2567 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2568 || !strcmp (name, "*malloc")
2569 || !strcmp (name, "*free"));
2570}
2571
2572/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2573 propagate its value from lane MASTER to current lane. */
2574
2575static void
2576nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2577{
2578 rtx reg;
2579 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2580 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2581}
2582
2583/* Adjust code for uniform-simt code generation variant by making atomics and
2584 "syscalls" conditionally executed, and inserting shuffle-based propagation
2585 for registers being set. */
2586
2587static void
2588nvptx_reorg_uniform_simt ()
2589{
2590 rtx_insn *insn, *next;
2591
2592 for (insn = get_insns (); insn; insn = next)
2593 {
2594 next = NEXT_INSN (insn);
2595 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2596 && !(NONJUMP_INSN_P (insn)
2597 && GET_CODE (PATTERN (insn)) == PARALLEL
2598 && get_attr_atomic (insn)))
2599 continue;
2600 rtx pat = PATTERN (insn);
2601 rtx master = nvptx_get_unisimt_master ();
2602 for (int i = 0; i < XVECLEN (pat, 0); i++)
2603 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2604 rtx pred = nvptx_get_unisimt_predicate ();
2605 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2606 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2607 validate_change (insn, &PATTERN (insn), pat, false);
2608 }
2609}
2610
d2d47a28
NS
2611/* Loop structure of the function. The entire function is described as
2612 a NULL loop. */
d88cd9c4
NS
2613
2614struct parallel
2615{
2616 /* Parent parallel. */
2617 parallel *parent;
2618
2619 /* Next sibling parallel. */
2620 parallel *next;
2621
2622 /* First child parallel. */
2623 parallel *inner;
2624
2625 /* Partitioning mask of the parallel. */
2626 unsigned mask;
2627
2628 /* Partitioning used within inner parallels. */
2629 unsigned inner_mask;
2630
2631 /* Location of parallel forked and join. The forked is the first
2632 block in the parallel and the join is the first block after of
2633 the partition. */
2634 basic_block forked_block;
2635 basic_block join_block;
2636
2637 rtx_insn *forked_insn;
2638 rtx_insn *join_insn;
2639
2640 rtx_insn *fork_insn;
2641 rtx_insn *joining_insn;
2642
2643 /* Basic blocks in this parallel, but not in child parallels. The
2644 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2645 blocks are not. */
2646 auto_vec<basic_block> blocks;
2647
2648public:
2649 parallel (parallel *parent, unsigned mode);
2650 ~parallel ();
2651};
2652
2653/* Constructor links the new parallel into it's parent's chain of
2654 children. */
2655
2656parallel::parallel (parallel *parent_, unsigned mask_)
2657 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2658{
2659 forked_block = join_block = 0;
2660 forked_insn = join_insn = 0;
2661 fork_insn = joining_insn = 0;
2662
2663 if (parent)
2664 {
2665 next = parent->inner;
2666 parent->inner = this;
2667 }
2668}
2669
2670parallel::~parallel ()
2671{
2672 delete inner;
2673 delete next;
2674}
2675
2676/* Map of basic blocks to insns */
2677typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2678
2679/* A tuple of an insn of interest and the BB in which it resides. */
2680typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2681typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2682
2683/* Split basic blocks such that each forked and join unspecs are at
2684 the start of their basic blocks. Thus afterwards each block will
2685 have a single partitioning mode. We also do the same for return
2686 insns, as they are executed by every thread. Return the
2687 partitioning mode of the function as a whole. Populate MAP with
2688 head and tail blocks. We also clear the BB visited flag, which is
2689 used when finding partitions. */
2690
2691static void
2692nvptx_split_blocks (bb_insn_map_t *map)
2693{
2694 insn_bb_vec_t worklist;
2695 basic_block block;
2696 rtx_insn *insn;
2697
2698 /* Locate all the reorg instructions of interest. */
2699 FOR_ALL_BB_FN (block, cfun)
2700 {
2701 bool seen_insn = false;
2702
2703 /* Clear visited flag, for use by parallel locator */
2704 block->flags &= ~BB_VISITED;
2705
2706 FOR_BB_INSNS (block, insn)
2707 {
2708 if (!INSN_P (insn))
2709 continue;
2710 switch (recog_memoized (insn))
2711 {
2712 default:
2713 seen_insn = true;
2714 continue;
2715 case CODE_FOR_nvptx_forked:
2716 case CODE_FOR_nvptx_join:
2717 break;
2718
2719 case CODE_FOR_return:
2720 /* We also need to split just before return insns, as
2721 that insn needs executing by all threads, but the
2722 block it is in probably does not. */
2723 break;
2724 }
2725
2726 if (seen_insn)
2727 /* We've found an instruction that must be at the start of
2728 a block, but isn't. Add it to the worklist. */
2729 worklist.safe_push (insn_bb_t (insn, block));
2730 else
2731 /* It was already the first instruction. Just add it to
2732 the map. */
2733 map->get_or_insert (block) = insn;
2734 seen_insn = true;
2735 }
2736 }
2737
2738 /* Split blocks on the worklist. */
2739 unsigned ix;
2740 insn_bb_t *elt;
2741 basic_block remap = 0;
2742 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2743 {
2744 if (remap != elt->second)
2745 {
2746 block = elt->second;
2747 remap = block;
2748 }
2749
2750 /* Split block before insn. The insn is in the new block */
2751 edge e = split_block (block, PREV_INSN (elt->first));
2752
2753 block = e->dest;
2754 map->get_or_insert (block) = elt->first;
2755 }
2756}
2757
2758/* BLOCK is a basic block containing a head or tail instruction.
2759 Locate the associated prehead or pretail instruction, which must be
2760 in the single predecessor block. */
2761
2762static rtx_insn *
2763nvptx_discover_pre (basic_block block, int expected)
2764{
2765 gcc_assert (block->preds->length () == 1);
2766 basic_block pre_block = (*block->preds)[0]->src;
2767 rtx_insn *pre_insn;
2768
2769 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2770 pre_insn = PREV_INSN (pre_insn))
2771 gcc_assert (pre_insn != BB_HEAD (pre_block));
2772
2773 gcc_assert (recog_memoized (pre_insn) == expected);
2774 return pre_insn;
2775}
2776
2777/* Dump this parallel and all its inner parallels. */
2778
2779static void
2780nvptx_dump_pars (parallel *par, unsigned depth)
2781{
2782 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2783 depth, par->mask,
2784 par->forked_block ? par->forked_block->index : -1,
2785 par->join_block ? par->join_block->index : -1);
2786
2787 fprintf (dump_file, " blocks:");
2788
2789 basic_block block;
2790 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2791 fprintf (dump_file, " %d", block->index);
2792 fprintf (dump_file, "\n");
2793 if (par->inner)
2794 nvptx_dump_pars (par->inner, depth + 1);
2795
2796 if (par->next)
2797 nvptx_dump_pars (par->next, depth);
2798}
2799
2800/* If BLOCK contains a fork/join marker, process it to create or
2801 terminate a loop structure. Add this block to the current loop,
2802 and then walk successor blocks. */
2803
2804static parallel *
2805nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2806{
2807 if (block->flags & BB_VISITED)
2808 return par;
2809 block->flags |= BB_VISITED;
2810
2811 if (rtx_insn **endp = map->get (block))
2812 {
2813 rtx_insn *end = *endp;
2814
2815 /* This is a block head or tail, or return instruction. */
2816 switch (recog_memoized (end))
2817 {
2818 case CODE_FOR_return:
2819 /* Return instructions are in their own block, and we
2820 don't need to do anything more. */
2821 return par;
2822
2823 case CODE_FOR_nvptx_forked:
2824 /* Loop head, create a new inner loop and add it into
2825 our parent's child list. */
2826 {
2827 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2828
2829 gcc_assert (mask);
2830 par = new parallel (par, mask);
2831 par->forked_block = block;
2832 par->forked_insn = end;
2833 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2834 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2835 par->fork_insn
2836 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2837 }
2838 break;
2839
2840 case CODE_FOR_nvptx_join:
2841 /* A loop tail. Finish the current loop and return to
2842 parent. */
2843 {
2844 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2845
2846 gcc_assert (par->mask == mask);
2847 par->join_block = block;
2848 par->join_insn = end;
2849 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2850 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2851 par->joining_insn
2852 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2853 par = par->parent;
2854 }
2855 break;
2856
2857 default:
2858 gcc_unreachable ();
2859 }
2860 }
2861
2862 if (par)
2863 /* Add this block onto the current loop's list of blocks. */
2864 par->blocks.safe_push (block);
2865 else
2866 /* This must be the entry block. Create a NULL parallel. */
2867 par = new parallel (0, 0);
2868
2869 /* Walk successor blocks. */
2870 edge e;
2871 edge_iterator ei;
2872
2873 FOR_EACH_EDGE (e, ei, block->succs)
2874 nvptx_find_par (map, par, e->dest);
2875
2876 return par;
2877}
2878
2879/* DFS walk the CFG looking for fork & join markers. Construct
2880 loop structures as we go. MAP is a mapping of basic blocks
2881 to head & tail markers, discovered when splitting blocks. This
2882 speeds up the discovery. We rely on the BB visited flag having
2883 been cleared when splitting blocks. */
2884
2885static parallel *
2886nvptx_discover_pars (bb_insn_map_t *map)
2887{
2888 basic_block block;
2889
2890 /* Mark exit blocks as visited. */
2891 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2892 block->flags |= BB_VISITED;
2893
2894 /* And entry block as not. */
2895 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2896 block->flags &= ~BB_VISITED;
2897
2898 parallel *par = nvptx_find_par (map, 0, block);
2899
2900 if (dump_file)
2901 {
2902 fprintf (dump_file, "\nLoops\n");
2903 nvptx_dump_pars (par, 0);
2904 fprintf (dump_file, "\n");
2905 }
2906
2907 return par;
2908}
2909
912442c2
NS
2910/* Analyse a group of BBs within a partitioned region and create N
2911 Single-Entry-Single-Exit regions. Some of those regions will be
2912 trivial ones consisting of a single BB. The blocks of a
2913 partitioned region might form a set of disjoint graphs -- because
2914 the region encloses a differently partitioned sub region.
2915
2916 We use the linear time algorithm described in 'Finding Regions Fast:
2917 Single Entry Single Exit and control Regions in Linear Time'
2918 Johnson, Pearson & Pingali. That algorithm deals with complete
2919 CFGs, where a back edge is inserted from END to START, and thus the
2920 problem becomes one of finding equivalent loops.
2921
2922 In this case we have a partial CFG. We complete it by redirecting
2923 any incoming edge to the graph to be from an arbitrary external BB,
2924 and similarly redirecting any outgoing edge to be to that BB.
2925 Thus we end up with a closed graph.
2926
2927 The algorithm works by building a spanning tree of an undirected
2928 graph and keeping track of back edges from nodes further from the
2929 root in the tree to nodes nearer to the root in the tree. In the
2930 description below, the root is up and the tree grows downwards.
2931
2932 We avoid having to deal with degenerate back-edges to the same
2933 block, by splitting each BB into 3 -- one for input edges, one for
2934 the node itself and one for the output edges. Such back edges are
2935 referred to as 'Brackets'. Cycle equivalent nodes will have the
2936 same set of brackets.
2937
2938 Determining bracket equivalency is done by maintaining a list of
2939 brackets in such a manner that the list length and final bracket
2940 uniquely identify the set.
2941
2942 We use coloring to mark all BBs with cycle equivalency with the
2943 same color. This is the output of the 'Finding Regions Fast'
2944 algorithm. Notice it doesn't actually find the set of nodes within
2945 a particular region, just unordered sets of nodes that are the
2946 entries and exits of SESE regions.
2947
2948 After determining cycle equivalency, we need to find the minimal
2949 set of SESE regions. Do this with a DFS coloring walk of the
2950 complete graph. We're either 'looking' or 'coloring'. When
2951 looking, and we're in the subgraph, we start coloring the color of
2952 the current node, and remember that node as the start of the
2953 current color's SESE region. Every time we go to a new node, we
2954 decrement the count of nodes with that color. If it reaches zero,
2955 we remember that node as the end of the current color's SESE region
2956 and return to 'looking'. Otherwise we color the node the current
2957 color.
2958
2959 This way we end up with coloring the inside of non-trivial SESE
2960 regions with the color of that region. */
2961
2962/* A pair of BBs. We use this to represent SESE regions: FIRST is the
 block that starts the region and SECOND is the block that ends it. */
2963typedef std::pair<basic_block, basic_block> bb_pair_t;
2964typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2965
2966/* A node in the undirected CFG. The discriminator SECOND indicates just
2967 above or just below the BB indicated by FIRST. A null FIRST stands
 for the entry node, which is how fake backedges are represented. */
2968typedef std::pair<basic_block, int> pseudo_node_t;
2969
2970/* A bracket indicates an edge towards the root of the spanning tree of the
2971 undirected graph. Each bracket has a color, determined
2972 from the currrent set of brackets. */
2973struct bracket
2974{
2975 pseudo_node_t back; /* Back target */
2976
2977 /* Current color and size of set. */
2978 unsigned color;
2979 unsigned size;
2980
2981 bracket (pseudo_node_t back_)
2982 : back (back_), color (~0u), size (~0u)
2983 {
2984 }
2985
2986 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2987 {
2988 if (length != size)
2989 {
2990 size = length;
2991 color = color_counts.length ();
2992 color_counts.quick_push (0);
2993 }
2994 color_counts[color]++;
2995 return color;
2996 }
2997};
2998
2999typedef auto_vec<bracket> bracket_vec_t;
3000
3001/* Basic block info for finding SESE regions. */
3002
3003struct bb_sese
3004{
3005 int node; /* Node number in spanning tree. */
3006 int parent; /* Parent node number. */
3007
3008 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3009 edges arrive at pseudo-node Ai and the outgoing edges leave at
3010 pseudo-node Ao. We have to remember which way we arrived at a
3011 particular node when generating the spanning tree. dir > 0 means
3012 we arrived at Ai, dir < 0 means we arrived at Ao. */
3013 int dir;
3014
3015 /* Lowest numbered pseudo-node reached via a backedge from thsis
3016 node, or any descendant. */
3017 pseudo_node_t high;
3018
3019 int color; /* Cycle-equivalence color */
3020
3021 /* Stack of brackets for this node. */
3022 bracket_vec_t brackets;
3023
3024 bb_sese (unsigned node_, unsigned p, int dir_)
3025 :node (node_), parent (p), dir (dir_)
3026 {
3027 }
3028 ~bb_sese ();
3029
3030 /* Push a bracket ending at BACK. */
3031 void push (const pseudo_node_t &back)
3032 {
3033 if (dump_file)
3034 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3035 back.first ? back.first->index : 0, back.second);
3036 brackets.safe_push (bracket (back));
3037 }
3038
3039 void append (bb_sese *child);
3040 void remove (const pseudo_node_t &);
3041
3042 /* Set node's color. */
3043 void set_color (auto_vec<unsigned> &color_counts)
3044 {
3045 color = brackets.last ().get_color (color_counts, brackets.length ());
3046 }
3047};
3048
3049bb_sese::~bb_sese ()
3050{
3051}
3052
3053/* Destructively append CHILD's brackets. */
3054
3055void
3056bb_sese::append (bb_sese *child)
3057{
3058 if (int len = child->brackets.length ())
3059 {
3060 int ix;
3061
3062 if (dump_file)
3063 {
3064 for (ix = 0; ix < len; ix++)
3065 {
3066 const pseudo_node_t &pseudo = child->brackets[ix].back;
3067 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3068 child->node, pseudo.first ? pseudo.first->index : 0,
3069 pseudo.second);
3070 }
3071 }
3072 if (!brackets.length ())
3073 std::swap (brackets, child->brackets);
3074 else
3075 {
3076 brackets.reserve (len);
3077 for (ix = 0; ix < len; ix++)
3078 brackets.quick_push (child->brackets[ix]);
3079 }
3080 }
3081}
3082
3083/* Remove brackets that terminate at PSEUDO. */
3084
3085void
3086bb_sese::remove (const pseudo_node_t &pseudo)
3087{
3088 unsigned removed = 0;
3089 int len = brackets.length ();
3090
3091 for (int ix = 0; ix < len; ix++)
3092 {
3093 if (brackets[ix].back == pseudo)
3094 {
3095 if (dump_file)
3096 fprintf (dump_file, "Removing backedge %d:%+d\n",
3097 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3098 removed++;
3099 }
3100 else if (removed)
3101 brackets[ix-removed] = brackets[ix];
3102 }
3103 while (removed--)
3104 brackets.pop ();
3105}
3106
3107/* Accessors for BB's aux pointer, which carries the bb_sese record of
 a block while SESE regions are being searched for. BB_SET_SESE stores
 S in B's aux field; BB_GET_SESE reads the field back as a bb_sese
 pointer, which is null when no record has been attached. */
3108#define BB_SET_SESE(B, S) ((B)->aux = (S))
3109#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3110
3111/* DFS walk creating SESE data structures. Only cover nodes with
3112 BB_VISITED set. Append discovered blocks to LIST. We number in
3113 increments of 3 so that the above and below pseudo nodes can be
3114 implicitly numbered too. */
3115
3116static int
3117nvptx_sese_number (int n, int p, int dir, basic_block b,
3118 auto_vec<basic_block> *list)
3119{
3120 if (BB_GET_SESE (b))
3121 return n;
3122
3123 if (dump_file)
3124 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3125 b->index, n, p, dir);
3126
3127 BB_SET_SESE (b, new bb_sese (n, p, dir));
3128 p = n;
3129
3130 n += 3;
3131 list->quick_push (b);
3132
3133 /* First walk the nodes on the 'other side' of this node, then walk
3134 the nodes on the same side. */
3135 for (unsigned ix = 2; ix; ix--)
3136 {
3137 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3138 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3139 : offsetof (edge_def, src));
3140 edge e;
3141 edge_iterator (ei);
3142
3143 FOR_EACH_EDGE (e, ei, edges)
3144 {
3145 basic_block target = *(basic_block *)((char *)e + offset);
3146
3147 if (target->flags & BB_VISITED)
3148 n = nvptx_sese_number (n, p, dir, target, list);
3149 }
3150 dir = -dir;
3151 }
3152 return n;
3153}
3154
3155/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3156 EDGES are the outgoing edges and OFFSET is the offset to the src
3157 or dst block on the edges. */
3158
3159static void
3160nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3161 vec<edge, va_gc> *edges, size_t offset)
3162{
3163 edge e;
3164 edge_iterator (ei);
3165 int hi_back = depth;
3166 pseudo_node_t node_back (0, depth);
3167 int hi_child = depth;
3168 pseudo_node_t node_child (0, depth);
3169 basic_block child = NULL;
3170 unsigned num_children = 0;
3171 int usd = -dir * sese->dir;
3172
3173 if (dump_file)
3174 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3175 me->index, sese->node, dir);
3176
3177 if (dir < 0)
3178 {
3179 /* This is the above pseudo-child. It has the BB itself as an
3180 additional child node. */
3181 node_child = sese->high;
3182 hi_child = node_child.second;
3183 if (node_child.first)
3184 hi_child += BB_GET_SESE (node_child.first)->node;
3185 num_children++;
3186 }
3187
3188 /* Examine each edge.
3189 - if it is a child (a) append its bracket list and (b) record
3190 whether it is the child with the highest reaching bracket.
3191 - if it is an edge to ancestor, record whether it's the highest
3192 reaching backlink. */
3193 FOR_EACH_EDGE (e, ei, edges)
3194 {
3195 basic_block target = *(basic_block *)((char *)e + offset);
3196
3197 if (bb_sese *t_sese = BB_GET_SESE (target))
3198 {
3199 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3200 {
3201 /* Child node. Append its bracket list. */
3202 num_children++;
3203 sese->append (t_sese);
3204
3205 /* Compare it's hi value. */
3206 int t_hi = t_sese->high.second;
3207
3208 if (basic_block child_hi_block = t_sese->high.first)
3209 t_hi += BB_GET_SESE (child_hi_block)->node;
3210
3211 if (hi_child > t_hi)
3212 {
3213 hi_child = t_hi;
3214 node_child = t_sese->high;
3215 child = target;
3216 }
3217 }
3218 else if (t_sese->node < sese->node + dir
3219 && !(dir < 0 && sese->parent == t_sese->node))
3220 {
3221 /* Non-parental ancestor node -- a backlink. */
3222 int d = usd * t_sese->dir;
3223 int back = t_sese->node + d;
3224
3225 if (hi_back > back)
3226 {
3227 hi_back = back;
3228 node_back = pseudo_node_t (target, d);
3229 }
3230 }
3231 }
3232 else
3233 { /* Fallen off graph, backlink to entry node. */
3234 hi_back = 0;
3235 node_back = pseudo_node_t (0, 0);
3236 }
3237 }
3238
3239 /* Remove any brackets that terminate at this pseudo node. */
3240 sese->remove (pseudo_node_t (me, dir));
3241
3242 /* Now push any backlinks from this pseudo node. */
3243 FOR_EACH_EDGE (e, ei, edges)
3244 {
3245 basic_block target = *(basic_block *)((char *)e + offset);
3246 if (bb_sese *t_sese = BB_GET_SESE (target))
3247 {
3248 if (t_sese->node < sese->node + dir
3249 && !(dir < 0 && sese->parent == t_sese->node))
3250 /* Non-parental ancestor node - backedge from me. */
3251 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3252 }
3253 else
3254 {
3255 /* back edge to entry node */
3256 sese->push (pseudo_node_t (0, 0));
3257 }
3258 }
3259
3260 /* If this node leads directly or indirectly to a no-return region of
3261 the graph, then fake a backedge to entry node. */
3262 if (!sese->brackets.length () || !edges || !edges->length ())
3263 {
3264 hi_back = 0;
3265 node_back = pseudo_node_t (0, 0);
3266 sese->push (node_back);
3267 }
3268
3269 /* Record the highest reaching backedge from us or a descendant. */
3270 sese->high = hi_back < hi_child ? node_back : node_child;
3271
3272 if (num_children > 1)
3273 {
3274 /* There is more than one child -- this is a Y shaped piece of
3275 spanning tree. We have to insert a fake backedge from this
3276 node to the highest ancestor reached by not-the-highest
3277 reaching child. Note that there may be multiple children
3278 with backedges to the same highest node. That's ok and we
3279 insert the edge to that highest node. */
3280 hi_child = depth;
3281 if (dir < 0 && child)
3282 {
3283 node_child = sese->high;
3284 hi_child = node_child.second;
3285 if (node_child.first)
3286 hi_child += BB_GET_SESE (node_child.first)->node;
3287 }
3288
3289 FOR_EACH_EDGE (e, ei, edges)
3290 {
3291 basic_block target = *(basic_block *)((char *)e + offset);
3292
3293 if (target == child)
3294 /* Ignore the highest child. */
3295 continue;
3296
3297 bb_sese *t_sese = BB_GET_SESE (target);
3298 if (!t_sese)
3299 continue;
3300 if (t_sese->parent != sese->node)
3301 /* Not a child. */
3302 continue;
3303
3304 /* Compare its hi value. */
3305 int t_hi = t_sese->high.second;
3306
3307 if (basic_block child_hi_block = t_sese->high.first)
3308 t_hi += BB_GET_SESE (child_hi_block)->node;
3309
3310 if (hi_child > t_hi)
3311 {
3312 hi_child = t_hi;
3313 node_child = t_sese->high;
3314 }
3315 }
3316
3317 sese->push (node_child);
3318 }
3319}
3320
3321
3322/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3323 proceed to successors. Set SESE entry and exit nodes of
3324 REGIONS. */
3325
3326static void
3327nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3328 basic_block block, int coloring)
3329{
3330 bb_sese *sese = BB_GET_SESE (block);
3331
3332 if (block->flags & BB_VISITED)
3333 {
3334 /* If we've already encountered this block, either we must not
3335 be coloring, or it must have been colored the current color. */
3336 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3337 return;
3338 }
3339
3340 block->flags |= BB_VISITED;
3341
3342 if (sese)
3343 {
3344 if (coloring < 0)
3345 {
3346 /* Start coloring a region. */
3347 regions[sese->color].first = block;
3348 coloring = sese->color;
3349 }
3350
3351 if (!--color_counts[sese->color] && sese->color == coloring)
3352 {
3353 /* Found final block of SESE region. */
3354 regions[sese->color].second = block;
3355 coloring = -1;
3356 }
3357 else
3358 /* Color the node, so we can assert on revisiting the node
3359 that the graph is indeed SESE. */
3360 sese->color = coloring;
3361 }
3362 else
3363 /* Fallen off the subgraph, we cannot be coloring. */
3364 gcc_assert (coloring < 0);
3365
3366 /* Walk each successor block. */
3367 if (block->succs && block->succs->length ())
3368 {
3369 edge e;
3370 edge_iterator ei;
3371
3372 FOR_EACH_EDGE (e, ei, block->succs)
3373 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3374 }
3375 else
3376 gcc_assert (coloring < 0);
3377}
3378
3379/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3380 end up with NULL entries in it. */
3381
3382static void
3383nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3384{
3385 basic_block block;
3386 int ix;
3387
3388 /* First clear each BB of the whole function. */
2300c332 3389 FOR_ALL_BB_FN (block, cfun)
912442c2
NS
3390 {
3391 block->flags &= ~BB_VISITED;
3392 BB_SET_SESE (block, 0);
3393 }
912442c2
NS
3394
3395 /* Mark blocks in the function that are in this graph. */
3396 for (ix = 0; blocks.iterate (ix, &block); ix++)
3397 block->flags |= BB_VISITED;
3398
3399 /* Counts of nodes assigned to each color. There cannot be more
3400 colors than blocks (and hopefully there will be fewer). */
3401 auto_vec<unsigned> color_counts;
3402 color_counts.reserve (blocks.length ());
3403
3404 /* Worklist of nodes in the spanning tree. Again, there cannot be
3405 more nodes in the tree than blocks (there will be fewer if the
3406 CFG of blocks is disjoint). */
3407 auto_vec<basic_block> spanlist;
3408 spanlist.reserve (blocks.length ());
3409
3410 /* Make sure every block has its cycle class determined. */
3411 for (ix = 0; blocks.iterate (ix, &block); ix++)
3412 {
3413 if (BB_GET_SESE (block))
3414 /* We already met this block in an earlier graph solve. */
3415 continue;
3416
3417 if (dump_file)
3418 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3419
3420 /* Number the nodes reachable from block initial DFS order. */
3421 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3422
3423 /* Now walk in reverse DFS order to find cycle equivalents. */
3424 while (spanlist.length ())
3425 {
3426 block = spanlist.pop ();
3427 bb_sese *sese = BB_GET_SESE (block);
3428
3429 /* Do the pseudo node below. */
3430 nvptx_sese_pseudo (block, sese, depth, +1,
3431 sese->dir > 0 ? block->succs : block->preds,
3432 (sese->dir > 0 ? offsetof (edge_def, dest)
3433 : offsetof (edge_def, src)));
3434 sese->set_color (color_counts);
3435 /* Do the pseudo node above. */
3436 nvptx_sese_pseudo (block, sese, depth, -1,
3437 sese->dir < 0 ? block->succs : block->preds,
3438 (sese->dir < 0 ? offsetof (edge_def, dest)
3439 : offsetof (edge_def, src)));
3440 }
3441 if (dump_file)
3442 fprintf (dump_file, "\n");
3443 }
3444
3445 if (dump_file)
3446 {
3447 unsigned count;
3448 const char *comma = "";
3449
3450 fprintf (dump_file, "Found %d cycle equivalents\n",
3451 color_counts.length ());
3452 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3453 {
3454 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3455
3456 comma = "";
3457 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3458 if (BB_GET_SESE (block)->color == ix)
3459 {
3460 block->flags |= BB_VISITED;
3461 fprintf (dump_file, "%s%d", comma, block->index);
3462 comma=",";
3463 }
3464 fprintf (dump_file, "}");
3465 comma = ", ";
3466 }
3467 fprintf (dump_file, "\n");
3468 }
3469
3470 /* Now we've colored every block in the subgraph. We now need to
3471 determine the minimal set of SESE regions that cover that
3472 subgraph. Do this with a DFS walk of the complete function.
3473 During the walk we're either 'looking' or 'coloring'. When we
3474 reach the last node of a particular color, we stop coloring and
3475 return to looking. */
3476
3477 /* There cannot be more SESE regions than colors. */
3478 regions.reserve (color_counts.length ());
3479 for (ix = color_counts.length (); ix--;)
3480 regions.quick_push (bb_pair_t (0, 0));
3481
3482 for (ix = 0; blocks.iterate (ix, &block); ix++)
3483 block->flags &= ~BB_VISITED;
3484
3485 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3486
3487 if (dump_file)
3488 {
3489 const char *comma = "";
3490 int len = regions.length ();
3491
3492 fprintf (dump_file, "SESE regions:");
3493 for (ix = 0; ix != len; ix++)
3494 {
3495 basic_block from = regions[ix].first;
3496 basic_block to = regions[ix].second;
3497
3498 if (from)
3499 {
3500 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3501 if (to != from)
3502 fprintf (dump_file, "->%d", to->index);
3503
3504 int color = BB_GET_SESE (from)->color;
3505
3506 /* Print the blocks within the region (excluding ends). */
3507 FOR_EACH_BB_FN (block, cfun)
3508 {
3509 bb_sese *sese = BB_GET_SESE (block);
3510
3511 if (sese && sese->color == color
3512 && block != from && block != to)
3513 fprintf (dump_file, ".%d", block->index);
3514 }
3515 fprintf (dump_file, "}");
3516 }
3517 comma = ",";
3518 }
3519 fprintf (dump_file, "\n\n");
3520 }
3521
3522 for (ix = 0; blocks.iterate (ix, &block); ix++)
3523 delete BB_GET_SESE (block);
3524}
3525
3526#undef BB_SET_SESE
3527#undef BB_GET_SESE
3528
d88cd9c4
NS
3529/* Propagate live state at the start of a partitioned region. BLOCK
3530 provides the live register information, and might not contain
3531 INSN. Propagation is inserted just after INSN. RW indicates whether
3532 we are reading and/or writing state. This
3533 separation is needed for worker-level proppagation where we
3534 essentially do a spill & fill. FN is the underlying worker
3535 function to generate the propagation instructions for single
3536 register. DATA is user data.
3537
3538 We propagate the live register set and the entire frame. We could
3539 do better by (a) propagating just the live set that is used within
3540 the partitioned regions and (b) only propagating stack entries that
3541 are used. The latter might be quite hard to determine. */
3542
3543typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3544
3545static void
3546nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3547 propagator_fn fn, void *data)
3548{
3549 bitmap live = DF_LIVE_IN (block);
3550 bitmap_iterator iterator;
3551 unsigned ix;
3552
3553 /* Copy the frame array. */
3554 HOST_WIDE_INT fs = get_frame_size ();
3555 if (fs)
3556 {
3557 rtx tmp = gen_reg_rtx (DImode);
3558 rtx idx = NULL_RTX;
3559 rtx ptr = gen_reg_rtx (Pmode);
3560 rtx pred = NULL_RTX;
3561 rtx_code_label *label = NULL;
3562
c1311c86
NS
3563 /* The frame size might not be DImode compatible, but the frame
3564 array's declaration will be. So it's ok to round up here. */
3565 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
d88cd9c4
NS
3566 /* Detect single iteration loop. */
3567 if (fs == 1)
3568 fs = 0;
3569
3570 start_sequence ();
3571 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3572 if (fs)
3573 {
3574 idx = gen_reg_rtx (SImode);
3575 pred = gen_reg_rtx (BImode);
3576 label = gen_label_rtx ();
3577
3578 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3579 /* Allow worker function to initialize anything needed. */
3580 rtx init = fn (tmp, PM_loop_begin, fs, data);
3581 if (init)
3582 emit_insn (init);
3583 emit_label (label);
3584 LABEL_NUSES (label)++;
3585 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3586 }
3587 if (rw & PM_read)
3588 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3589 emit_insn (fn (tmp, rw, fs, data));
3590 if (rw & PM_write)
3591 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3592 if (fs)
3593 {
3594 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3595 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3596 emit_insn (gen_br_true_uni (pred, label));
3597 rtx fini = fn (tmp, PM_loop_end, fs, data);
3598 if (fini)
3599 emit_insn (fini);
3600 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3601 }
3602 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3603 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3604 rtx cpy = get_insns ();
3605 end_sequence ();
3606 insn = emit_insn_after (cpy, insn);
3607 }
3608
3609 /* Copy live registers. */
3610 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3611 {
3612 rtx reg = regno_reg_rtx[ix];
3613
3614 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3615 {
3616 rtx bcast = fn (reg, rw, 0, data);
3617
3618 insn = emit_insn_after (bcast, insn);
3619 }
3620 }
3621}
3622
3623/* Worker for nvptx_vpropagate. */
3624
3625static rtx
3626vprop_gen (rtx reg, propagate_mask pm,
3627 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3628{
3629 if (!(pm & PM_read_write))
3630 return 0;
3631
3632 return nvptx_gen_vcast (reg);
3633}
3634
3635/* Propagate state that is live at start of BLOCK across the vectors
3636 of a single warp. Propagation is inserted just after INSN. */
3637
3638static void
3639nvptx_vpropagate (basic_block block, rtx_insn *insn)
3640{
3641 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3642}
3643
3644/* Worker for nvptx_wpropagate. */
3645
3646static rtx
3647wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3648{
3649 wcast_data_t *data = (wcast_data_t *)data_;
3650
3651 if (pm & PM_loop_begin)
3652 {
3653 /* Starting a loop, initialize pointer. */
3654 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3655
3656 if (align > worker_bcast_align)
3657 worker_bcast_align = align;
3658 data->offset = (data->offset + align - 1) & ~(align - 1);
3659
3660 data->ptr = gen_reg_rtx (Pmode);
3661
3662 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3663 }
3664 else if (pm & PM_loop_end)
3665 {
3666 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3667 data->ptr = NULL_RTX;
3668 return clobber;
3669 }
3670 else
3671 return nvptx_gen_wcast (reg, pm, rep, data);
3672}
3673
3674/* Spill or fill live state that is live at start of BLOCK. PRE_P
3675 indicates if this is just before partitioned mode (do spill), or
3676 just after it starts (do fill). Sequence is inserted just after
3677 INSN. */
3678
3679static void
3680nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3681{
3682 wcast_data_t data;
3683
3684 data.base = gen_reg_rtx (Pmode);
3685 data.offset = 0;
3686 data.ptr = NULL_RTX;
3687
3688 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3689 if (data.offset)
3690 {
3691 /* Stuff was emitted, initialize the base pointer now. */
bd602b7f 3692 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
d88cd9c4 3693 emit_insn_after (init, insn);
15ab6f00 3694
d88cd9c4
NS
3695 if (worker_bcast_size < data.offset)
3696 worker_bcast_size = data.offset;
3697 }
3698}
3699
3700/* Emit a worker-level synchronization barrier. We use different
3701 markers for before and after synchronizations. */
3702
3703static rtx
3704nvptx_wsync (bool after)
3705{
3706 return gen_nvptx_barsync (GEN_INT (after));
3707}
3708
3709/* Single neutering according to MASK. FROM is the incoming block and
3710 TO is the outgoing block. These may be the same block. Insert at
3711 start of FROM:
3712
3713 if (tid.<axis>) goto end.
3714
3715 and insert before ending branch of TO (if there is such an insn):
3716
3717 end:
3718 <possibly-broadcast-cond>
3719 <branch>
3720
3721 We currently only use differnt FROM and TO when skipping an entire
3722 loop. We could do more if we detected superblocks. */
3723
3724static void
3725nvptx_single (unsigned mask, basic_block from, basic_block to)
3726{
3727 rtx_insn *head = BB_HEAD (from);
3728 rtx_insn *tail = BB_END (to);
3729 unsigned skip_mask = mask;
3730
3731 /* Find first insn of from block */
3732 while (head != BB_END (from) && !INSN_P (head))
3733 head = NEXT_INSN (head);
3734
3735 /* Find last insn of to block */
3736 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3737 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3738 tail = PREV_INSN (tail);
3739
3740 /* Detect if tail is a branch. */
3741 rtx tail_branch = NULL_RTX;
3742 rtx cond_branch = NULL_RTX;
3743 if (tail && INSN_P (tail))
3744 {
3745 tail_branch = PATTERN (tail);
3746 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3747 tail_branch = NULL_RTX;
3748 else
3749 {
3750 cond_branch = SET_SRC (tail_branch);
3751 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3752 cond_branch = NULL_RTX;
3753 }
3754 }
3755
3756 if (tail == head)
3757 {
3758 /* If this is empty, do nothing. */
3759 if (!head || !INSN_P (head))
3760 return;
3761
3762 /* If this is a dummy insn, do nothing. */
3763 switch (recog_memoized (head))
3764 {
3765 default:
3766 break;
3767 case CODE_FOR_nvptx_fork:
3768 case CODE_FOR_nvptx_forked:
3769 case CODE_FOR_nvptx_joining:
3770 case CODE_FOR_nvptx_join:
3771 return;
3772 }
3773
3774 if (cond_branch)
3775 {
3776 /* If we're only doing vector single, there's no need to
3777 emit skip code because we'll not insert anything. */
3778 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3779 skip_mask = 0;
3780 }
3781 else if (tail_branch)
3782 /* Block with only unconditional branch. Nothing to do. */
3783 return;
3784 }
3785
3786 /* Insert the vector test inside the worker test. */
3787 unsigned mode;
3788 rtx_insn *before = tail;
3789 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3790 if (GOMP_DIM_MASK (mode) & skip_mask)
3791 {
3792 rtx_code_label *label = gen_label_rtx ();
3793 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3794
3795 if (!pred)
3796 {
3797 pred = gen_reg_rtx (BImode);
3798 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3799 }
3800
3801 rtx br;
3802 if (mode == GOMP_DIM_VECTOR)
3803 br = gen_br_true (pred, label);
3804 else
3805 br = gen_br_true_uni (pred, label);
3806 emit_insn_before (br, head);
3807
3808 LABEL_NUSES (label)++;
3809 if (tail_branch)
3810 before = emit_label_before (label, before);
3811 else
3812 emit_label_after (label, tail);
3813 }
3814
3815 /* Now deal with propagating the branch condition. */
3816 if (cond_branch)
3817 {
3818 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3819
3820 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3821 {
3822 /* Vector mode only, do a shuffle. */
3823 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3824 }
3825 else
3826 {
3827 /* Includes worker mode, do spill & fill. By construction
3828 we should never have worker mode only. */
3829 wcast_data_t data;
3830
3831 data.base = worker_bcast_sym;
3832 data.ptr = 0;
3833
3834 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3835 worker_bcast_size = GET_MODE_SIZE (SImode);
3836
3837 data.offset = 0;
3838 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3839 before);
3840 /* Barrier so other workers can see the write. */
3841 emit_insn_before (nvptx_wsync (false), tail);
3842 data.offset = 0;
3843 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3844 /* This barrier is needed to avoid worker zero clobbering
3845 the broadcast buffer before all the other workers have
3846 had a chance to read this instance of it. */
3847 emit_insn_before (nvptx_wsync (true), tail);
3848 }
3849
3850 extract_insn (tail);
3851 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3852 UNSPEC_BR_UNIFIED);
3853 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3854 }
3855}
3856
3857/* PAR is a parallel that is being skipped in its entirety according to
3858 MASK. Treat this as skipping a superblock starting at forked
3859 and ending at joining. */
3860
3861static void
3862nvptx_skip_par (unsigned mask, parallel *par)
3863{
3864 basic_block tail = par->join_block;
3865 gcc_assert (tail->preds->length () == 1);
3866
3867 basic_block pre_tail = (*tail->preds)[0]->src;
3868 gcc_assert (pre_tail->succs->length () == 1);
3869
3870 nvptx_single (mask, par->forked_block, pre_tail);
3871}
3872
dba619f3
NS
3873/* If PAR has a single inner parallel and PAR itself only contains
3874 empty entry and exit blocks, swallow the inner PAR. */
3875
3876static void
3877nvptx_optimize_inner (parallel *par)
3878{
3879 parallel *inner = par->inner;
3880
3881 /* We mustn't be the outer dummy par. */
3882 if (!par->mask)
3883 return;
3884
3885 /* We must have a single inner par. */
3886 if (!inner || inner->next)
3887 return;
3888
3889 /* We must only contain 2 blocks ourselves -- the head and tail of
3890 the inner par. */
3891 if (par->blocks.length () != 2)
3892 return;
3893
3894 /* We must be disjoint partitioning. As we only have vector and
3895 worker partitioning, this is sufficient to guarantee the pars
3896 have adjacent partitioning. */
3897 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3898 /* This indicates malformed code generation. */
3899 return;
3900
3901 /* The outer forked insn should be immediately followed by the inner
3902 fork insn. */
3903 rtx_insn *forked = par->forked_insn;
3904 rtx_insn *fork = BB_END (par->forked_block);
3905
3906 if (NEXT_INSN (forked) != fork)
3907 return;
3908 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3909
3910 /* The outer joining insn must immediately follow the inner join
3911 insn. */
3912 rtx_insn *joining = par->joining_insn;
3913 rtx_insn *join = inner->join_insn;
3914 if (NEXT_INSN (join) != joining)
3915 return;
3916
3917 /* Preconditions met. Swallow the inner par. */
3918 if (dump_file)
3919 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3920 inner->mask, inner->forked_block->index,
3921 inner->join_block->index,
3922 par->mask, par->forked_block->index, par->join_block->index);
3923
3924 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3925
3926 par->blocks.reserve (inner->blocks.length ());
3927 while (inner->blocks.length ())
3928 par->blocks.quick_push (inner->blocks.pop ());
3929
3930 par->inner = inner->inner;
3931 inner->inner = NULL;
3932
3933 delete inner;
3934}
3935
d88cd9c4
NS
3936/* Process the parallel PAR and all its contained
3937 parallels. We do everything but the neutering. Return mask of
3938 partitioned modes used within this parallel. */
3939
3940static unsigned
3941nvptx_process_pars (parallel *par)
3942{
dba619f3
NS
3943 if (nvptx_optimize)
3944 nvptx_optimize_inner (par);
3945
d88cd9c4
NS
3946 unsigned inner_mask = par->mask;
3947
3948 /* Do the inner parallels first. */
3949 if (par->inner)
3950 {
3951 par->inner_mask = nvptx_process_pars (par->inner);
3952 inner_mask |= par->inner_mask;
3953 }
3954
3955 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3956 /* No propagation needed for a call. */;
5d306e55 3957 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3958 {
3959 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3960 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3961 /* Insert begin and end synchronizations. */
3962 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3963 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3964 }
3965 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3966 nvptx_vpropagate (par->forked_block, par->forked_insn);
3967
3968 /* Now do siblings. */
3969 if (par->next)
3970 inner_mask |= nvptx_process_pars (par->next);
3971 return inner_mask;
3972}
3973
3974/* Neuter the parallel described by PAR. We recurse in depth-first
3975 order. MODES are the partitioning of the execution and OUTER is
3976 the partitioning of the parallels we are contained in. */
3977
3978static void
3979nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3980{
3981 unsigned me = (par->mask
3982 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3983 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3984 unsigned skip_mask = 0, neuter_mask = 0;
3985
3986 if (par->inner)
3987 nvptx_neuter_pars (par->inner, modes, outer | me);
3988
3989 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3990 {
3991 if ((outer | me) & GOMP_DIM_MASK (mode))
3992 {} /* Mode is partitioned: no neutering. */
3993 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 3994 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
3995 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3996 || !par->forked_insn)
3997 /* Partitioned in inner parallels, or we're not a partitioned
3998 at all: neuter individual blocks. */
3999 neuter_mask |= GOMP_DIM_MASK (mode);
4000 else if (!par->parent || !par->parent->forked_insn
4001 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4002 /* Parent isn't a parallel or contains this paralleling: skip
4003 parallel at this level. */
4004 skip_mask |= GOMP_DIM_MASK (mode);
4005 else
4006 {} /* Parent will skip this parallel itself. */
4007 }
4008
4009 if (neuter_mask)
4010 {
912442c2 4011 int ix, len;
d88cd9c4 4012
912442c2
NS
4013 if (nvptx_optimize)
4014 {
4015 /* Neuter whole SESE regions. */
4016 bb_pair_vec_t regions;
4017
4018 nvptx_find_sese (par->blocks, regions);
4019 len = regions.length ();
4020 for (ix = 0; ix != len; ix++)
4021 {
4022 basic_block from = regions[ix].first;
4023 basic_block to = regions[ix].second;
4024
4025 if (from)
4026 nvptx_single (neuter_mask, from, to);
4027 else
4028 gcc_assert (!to);
4029 }
4030 }
4031 else
d88cd9c4 4032 {
912442c2
NS
4033 /* Neuter each BB individually. */
4034 len = par->blocks.length ();
4035 for (ix = 0; ix != len; ix++)
4036 {
4037 basic_block block = par->blocks[ix];
d88cd9c4 4038
912442c2
NS
4039 nvptx_single (neuter_mask, block, block);
4040 }
d88cd9c4
NS
4041 }
4042 }
4043
4044 if (skip_mask)
4045 nvptx_skip_par (skip_mask, par);
4046
4047 if (par->next)
4048 nvptx_neuter_pars (par->next, modes, outer);
4049}
4050
517665b3 4051/* PTX-specific reorganization
d88cd9c4 4052 - Split blocks at fork and join instructions
c38f0d8c
NS
4053 - Compute live registers
4054 - Mark now-unused registers, so function begin doesn't declare
517665b3 4055 unused registers.
d88cd9c4
NS
4056 - Insert state propagation when entering partitioned mode
4057 - Insert neutering instructions when in single mode
c38f0d8c 4058 - Replace subregs with suitable sequences.
517665b3
NS
4059*/
4060
4061static void
4062nvptx_reorg (void)
4063{
517665b3
NS
4064 /* We are freeing block_for_insn in the toplev to keep compatibility
4065 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4066 compute_bb_for_insn ();
4067
4068 thread_prologue_and_epilogue_insns ();
4069
d88cd9c4
NS
4070 /* Split blocks and record interesting unspecs. */
4071 bb_insn_map_t bb_insn_map;
4072
4073 nvptx_split_blocks (&bb_insn_map);
4074
c38f0d8c 4075 /* Compute live regs */
517665b3
NS
4076 df_clear_flags (DF_LR_RUN_DCE);
4077 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
d88cd9c4
NS
4078 df_live_add_problem ();
4079 df_live_set_all_dirty ();
517665b3 4080 df_analyze ();
738f2522
BS
4081 regstat_init_n_sets_and_refs ();
4082
d88cd9c4
NS
4083 if (dump_file)
4084 df_dump (dump_file);
4085
517665b3 4086 /* Mark unused regs as unused. */
d88cd9c4 4087 int max_regs = max_reg_num ();
44c068ae 4088 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
738f2522
BS
4089 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4090 regno_reg_rtx[i] = const0_rtx;
517665b3 4091
d88cd9c4
NS
4092 /* Determine launch dimensions of the function. If it is not an
4093 offloaded function (i.e. this is a regular compiler), the
4094 function has no neutering. */
629b3d75 4095 tree attr = oacc_get_fn_attrib (current_function_decl);
d88cd9c4
NS
4096 if (attr)
4097 {
4098 /* If we determined this mask before RTL expansion, we could
4099 elide emission of some levels of forks and joins. */
4100 unsigned mask = 0;
4101 tree dims = TREE_VALUE (attr);
4102 unsigned ix;
4103
4104 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4105 {
4106 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4107 tree allowed = TREE_PURPOSE (dims);
4108
4109 if (size != 1 && !(allowed && integer_zerop (allowed)))
4110 mask |= GOMP_DIM_MASK (ix);
4111 }
4112 /* If there is worker neutering, there must be vector
4113 neutering. Otherwise the hardware will fail. */
4114 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4115 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4116
4117 /* Discover & process partitioned regions. */
4118 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4119 nvptx_process_pars (pars);
4120 nvptx_neuter_pars (pars, mask, 0);
4121 delete pars;
4122 }
4123
517665b3 4124 /* Replace subregs. */
c03b0416 4125 nvptx_reorg_subreg ();
517665b3 4126
5012919d
AM
4127 if (TARGET_UNIFORM_SIMT)
4128 nvptx_reorg_uniform_simt ();
4129
738f2522 4130 regstat_free_n_sets_and_refs ();
517665b3
NS
4131
4132 df_finish_pass (true);
738f2522
BS
4133}
4134\f
4135/* Handle a "kernel" attribute; arguments as in
4136 struct attribute_spec.handler. */
4137
4138static tree
4139nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4140 int ARG_UNUSED (flags), bool *no_add_attrs)
4141{
4142 tree decl = *node;
4143
4144 if (TREE_CODE (decl) != FUNCTION_DECL)
4145 {
4146 error ("%qE attribute only applies to functions", name);
4147 *no_add_attrs = true;
4148 }
b49e35a9 4149 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
4150 {
4151 error ("%qE attribute requires a void return type", name);
4152 *no_add_attrs = true;
4153 }
4154
4155 return NULL_TREE;
4156}
4157
5012919d
AM
4158/* Handle a "shared" attribute; arguments as in
4159 struct attribute_spec.handler. */
4160
4161static tree
4162nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4163 int ARG_UNUSED (flags), bool *no_add_attrs)
4164{
4165 tree decl = *node;
4166
4167 if (TREE_CODE (decl) != VAR_DECL)
4168 {
4169 error ("%qE attribute only applies to variables", name);
4170 *no_add_attrs = true;
4171 }
4172 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4173 {
4174 error ("%qE attribute not allowed with auto storage class", name);
4175 *no_add_attrs = true;
4176 }
4177
4178 return NULL_TREE;
4179}
4180
738f2522
BS
/* Table of valid machine attributes.  Both attributes take no
   arguments (min_len == max_len == 0) and require a declaration
   (decl_req is true).  The trailing entry with a NULL name is
   presumably the end-of-table marker -- confirm against the
   attribute_spec consumer.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  /* "kernel" is validated by nvptx_handle_kernel_attribute.  */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  /* "shared" is validated by nvptx_handle_shared_attribute.  */
  { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
4190\f
4191/* Limit vector alignments to BIGGEST_ALIGNMENT. */
4192
4193static HOST_WIDE_INT
4194nvptx_vector_alignment (const_tree type)
4195{
4196 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4197
4198 return MIN (align, BIGGEST_ALIGNMENT);
4199}
d88cd9c4
NS
4200
4201/* Indicate that INSN cannot be duplicated. */
4202
4203static bool
4204nvptx_cannot_copy_insn_p (rtx_insn *insn)
4205{
4206 switch (recog_memoized (insn))
4207 {
4208 case CODE_FOR_nvptx_shufflesi:
4209 case CODE_FOR_nvptx_shufflesf:
4210 case CODE_FOR_nvptx_barsync:
4211 case CODE_FOR_nvptx_fork:
4212 case CODE_FOR_nvptx_forked:
4213 case CODE_FOR_nvptx_joining:
4214 case CODE_FOR_nvptx_join:
4215 return true;
4216 default:
4217 return false;
4218 }
4219}
a794bd20
NS
4220
4221/* Section anchors do not work. Initialization for flag_section_anchor
4222 probes the existence of the anchoring target hooks and prevents
4223 anchoring if they don't exist. However, we may be being used with
4224 a host-side compiler that does support anchoring, and hence see
4225 the anchor flag set (as it's not recalculated). So provide an
4226 implementation denying anchoring. */
4227
4228static bool
4229nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4230{
4231 return false;
4232}
738f2522 4233\f
1f83528e
TS
4234/* Record a symbol for mkoffload to enter into the mapping table. */
4235
4236static void
4237nvptx_record_offload_symbol (tree decl)
4238{
3e32ee19
NS
4239 switch (TREE_CODE (decl))
4240 {
4241 case VAR_DECL:
4242 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4243 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4244 break;
4245
4246 case FUNCTION_DECL:
4247 {
629b3d75 4248 tree attr = oacc_get_fn_attrib (decl);
5012919d
AM
4249 /* OpenMP offloading does not set this attribute. */
4250 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 4251
3e32ee19
NS
4252 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4253 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4254
5012919d 4255 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 4256 {
5d306e55 4257 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 4258
5d306e55 4259 gcc_assert (!TREE_PURPOSE (dims));
3e32ee19
NS
4260 fprintf (asm_out_file, ", %#x", size);
4261 }
d2d47a28 4262
3e32ee19
NS
4263 fprintf (asm_out_file, "\n");
4264 }
4265 break;
d2d47a28 4266
3e32ee19
NS
4267 default:
4268 gcc_unreachable ();
4269 }
1f83528e
TS
4270}
4271
738f2522
BS
4272/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4273 at the start of a file. */
4274
4275static void
4276nvptx_file_start (void)
4277{
4278 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4279 fputs ("\t.version\t3.1\n", asm_out_file);
4280 fputs ("\t.target\tsm_30\n", asm_out_file);
4281 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4282 fputs ("// END PREAMBLE\n", asm_out_file);
4283}
4284
15ab6f00
NS
4285/* Emit a declaration for a worker-level buffer in .shared memory. */
4286
4287static void
4288write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4289{
4290 const char *name = XSTR (sym, 0);
4291
4292 write_var_marker (file, true, false, name);
4293 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4294 align, name, size);
4295}
4296
ecf6e535
BS
4297/* Write out the function declarations we've collected and declare storage
4298 for the broadcast buffer. */
738f2522
BS
4299
4300static void
4301nvptx_file_end (void)
4302{
f3dba894
TS
4303 hash_table<tree_hasher>::iterator iter;
4304 tree decl;
4305 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
00e52418 4306 nvptx_record_fndecl (decl);
738f2522 4307 fputs (func_decls.str().c_str(), asm_out_file);
d88cd9c4
NS
4308
4309 if (worker_bcast_size)
15ab6f00
NS
4310 write_worker_buffer (asm_out_file, worker_bcast_sym,
4311 worker_bcast_align, worker_bcast_size);
f3552158
NS
4312
4313 if (worker_red_size)
15ab6f00
NS
4314 write_worker_buffer (asm_out_file, worker_red_sym,
4315 worker_red_align, worker_red_size);
5012919d
AM
4316
4317 if (need_softstack_decl)
4318 {
4319 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4320 /* 32 is the maximum number of warps in a block. Even though it's an
4321 external declaration, emit the array size explicitly; otherwise, it
4322 may fail at PTX JIT time if the definition is later in link order. */
4323 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4324 POINTER_SIZE);
4325 }
4326 if (need_unisimt_decl)
4327 {
4328 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4329 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4330 }
f3552158
NS
4331}
4332
4333/* Expander for the shuffle builtins. */
4334
4335static rtx
4336nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4337{
4338 if (ignore)
4339 return target;
4340
4341 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4342 NULL_RTX, mode, EXPAND_NORMAL);
4343 if (!REG_P (src))
4344 src = copy_to_mode_reg (mode, src);
4345
4346 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4347 NULL_RTX, SImode, EXPAND_NORMAL);
4348 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4349 NULL_RTX, SImode, EXPAND_NORMAL);
4350
4351 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4352 idx = copy_to_mode_reg (SImode, idx);
4353
59263259
NS
4354 rtx pat = nvptx_gen_shuffle (target, src, idx,
4355 (nvptx_shuffle_kind) INTVAL (op));
f3552158
NS
4356 if (pat)
4357 emit_insn (pat);
4358
4359 return target;
4360}
4361
4362/* Worker reduction address expander. */
4363
4364static rtx
4365nvptx_expand_worker_addr (tree exp, rtx target,
4366 machine_mode ARG_UNUSED (mode), int ignore)
4367{
4368 if (ignore)
4369 return target;
4370
4371 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4372 if (align > worker_red_align)
4373 worker_red_align = align;
4374
4375 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4376 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4377 if (size + offset > worker_red_size)
4378 worker_red_size = size + offset;
4379
9a863523 4380 rtx addr = worker_red_sym;
f3552158 4381 if (offset)
9a863523
NS
4382 {
4383 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4384 addr = gen_rtx_CONST (Pmode, addr);
4385 }
f3552158 4386
9a863523 4387 emit_move_insn (target, addr);
f3552158
NS
4388
4389 return target;
4390}
4391
4392/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4393 not require taking the address of any object, other than the memory
4394 cell being operated on. */
4395
4396static rtx
4397nvptx_expand_cmp_swap (tree exp, rtx target,
4398 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4399{
4400 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4401
4402 if (!target)
4403 target = gen_reg_rtx (mode);
4404
4405 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4406 NULL_RTX, Pmode, EXPAND_NORMAL);
4407 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4408 NULL_RTX, mode, EXPAND_NORMAL);
4409 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4410 NULL_RTX, mode, EXPAND_NORMAL);
4411 rtx pat;
4412
4413 mem = gen_rtx_MEM (mode, mem);
4414 if (!REG_P (cmp))
4415 cmp = copy_to_mode_reg (mode, cmp);
4416 if (!REG_P (src))
4417 src = copy_to_mode_reg (mode, src);
4418
4419 if (mode == SImode)
4420 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4421 else
4422 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4423
4424 emit_insn (pat);
4425
4426 return target;
4427}
4428
4429
/* Codes for all the NVPTX builtins.  Each code is the index of the
   corresponding declaration in nvptx_builtin_decls, and
   NVPTX_BUILTIN_MAX is the number of codes.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* __builtin_nvptx_shuffle.  */
  NVPTX_BUILTIN_SHUFFLELL,	/* __builtin_nvptx_shufflell.  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* __builtin_nvptx_worker_addr.  */
  NVPTX_BUILTIN_CMP_SWAP,	/* __builtin_nvptx_cmp_swap.  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* __builtin_nvptx_cmp_swapll.  */
  NVPTX_BUILTIN_MAX
};

/* Declarations of the builtins, indexed by the codes above and
   filled in by nvptx_init_builtins.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4442
4443/* Return the NVPTX builtin for CODE. */
4444
4445static tree
4446nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4447{
4448 if (code >= NVPTX_BUILTIN_MAX)
4449 return error_mark_node;
4450
4451 return nvptx_builtin_decls[code];
4452}
4453
4454/* Set up all builtin functions for this target. */
4455
4456static void
4457nvptx_init_builtins (void)
4458{
4459#define DEF(ID, NAME, T) \
4460 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4461 = add_builtin_function ("__builtin_nvptx_" NAME, \
4462 build_function_type_list T, \
4463 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4464#define ST sizetype
4465#define UINT unsigned_type_node
4466#define LLUINT long_long_unsigned_type_node
4467#define PTRVOID ptr_type_node
4468
4469 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4470 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4471 DEF (WORKER_ADDR, "worker_addr",
4472 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4473 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4474 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4475
4476#undef DEF
4477#undef ST
4478#undef UINT
4479#undef LLUINT
4480#undef PTRVOID
4481}
4482
4483/* Expand an expression EXP that calls a built-in function,
4484 with result going to TARGET if that's convenient
4485 (and in mode MODE if that's convenient).
4486 SUBTARGET may be used as the target for computing one of EXP's operands.
4487 IGNORE is nonzero if the value is to be ignored. */
4488
4489static rtx
4490nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4491 machine_mode mode, int ignore)
4492{
4493 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4494 switch (DECL_FUNCTION_CODE (fndecl))
4495 {
4496 case NVPTX_BUILTIN_SHUFFLE:
4497 case NVPTX_BUILTIN_SHUFFLELL:
4498 return nvptx_expand_shuffle (exp, target, mode, ignore);
4499
4500 case NVPTX_BUILTIN_WORKER_ADDR:
4501 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4502
4503 case NVPTX_BUILTIN_CMP_SWAP:
4504 case NVPTX_BUILTIN_CMP_SWAPLL:
4505 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4506
4507 default: gcc_unreachable ();
4508 }
738f2522
BS
4509}
4510\f
f3552158
NS
/* Define dimension sizes for known hardware.  PTX_VECTOR_LENGTH and
   PTX_WORKER_LENGTH are the values enforced for the vector and worker
   axes; PTX_GANG_DEFAULT is the default used for the gang axis.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 0 /* Defer to runtime.  */
f3552158 4515
5012919d
AM
4516/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4517
4518static int
4519nvptx_simt_vf ()
4520{
4521 return PTX_VECTOR_LENGTH;
4522}
4523
94829f87
NS
4524/* Validate compute dimensions of an OpenACC offload or routine, fill
4525 in non-unity defaults. FN_LEVEL indicates the level at which a
b6adbb9f
NS
4526 routine might spawn a loop. It is negative for non-routines. If
4527 DECL is null, we are validating the default dimensions. */
94829f87
NS
4528
4529static bool
5d306e55 4530nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
94829f87
NS
4531{
4532 bool changed = false;
4533
ccc8282b 4534 /* The vector size must be 32, unless this is a SEQ routine. */
b6adbb9f
NS
4535 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4536 && dims[GOMP_DIM_VECTOR] >= 0
ccc8282b
NS
4537 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4538 {
b6adbb9f
NS
4539 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4540 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
ccc8282b
NS
4541 dims[GOMP_DIM_VECTOR]
4542 ? "using vector_length (%d), ignoring %d"
4543 : "using vector_length (%d), ignoring runtime setting",
4544 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4545 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4546 changed = true;
4547 }
4548
4549 /* Check the num workers is not too large. */
4550 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4551 {
b6adbb9f 4552 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
ccc8282b
NS
4553 "using num_workers (%d), ignoring %d",
4554 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4555 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4556 changed = true;
4557 }
94829f87 4558
b6adbb9f
NS
4559 if (!decl)
4560 {
4561 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4562 if (dims[GOMP_DIM_WORKER] < 0)
4563 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4564 if (dims[GOMP_DIM_GANG] < 0)
4565 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4566 changed = true;
4567 }
4568
94829f87
NS
4569 return changed;
4570}
d88cd9c4 4571
bd751975
NS
4572/* Return maximum dimension size, or zero for unbounded. */
4573
4574static int
4575nvptx_dim_limit (int axis)
4576{
4577 switch (axis)
4578 {
4579 case GOMP_DIM_WORKER:
4580 return PTX_WORKER_LENGTH;
4581
4582 case GOMP_DIM_VECTOR:
4583 return PTX_VECTOR_LENGTH;
4584
4585 default:
4586 break;
4587 }
4588 return 0;
4589}
4590
d88cd9c4
NS
4591/* Determine whether fork & joins are needed. */
4592
4593static bool
4594nvptx_goacc_fork_join (gcall *call, const int dims[],
4595 bool ARG_UNUSED (is_fork))
4596{
4597 tree arg = gimple_call_arg (call, 2);
4598 unsigned axis = TREE_INT_CST_LOW (arg);
4599
4600 /* We only care about worker and vector partitioning. */
4601 if (axis < GOMP_DIM_WORKER)
4602 return false;
4603
4604 /* If the size is 1, there's no partitioning. */
4605 if (dims[axis] == 1)
4606 return false;
4607
4608 return true;
4609}
4610
f3552158
NS
4611/* Generate a PTX builtin function call that returns the address in
4612 the worker reduction buffer at OFFSET. TYPE is the type of the
4613 data at that location. */
4614
4615static tree
4616nvptx_get_worker_red_addr (tree type, tree offset)
4617{
4618 machine_mode mode = TYPE_MODE (type);
4619 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4620 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4621 tree align = build_int_cst (unsigned_type_node,
4622 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4623 tree call = build_call_expr (fndecl, 3, offset, size, align);
4624
4625 return fold_convert (build_pointer_type (type), call);
4626}
4627
4628/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
4629 will cast the variable if necessary. */
4630
4631static void
4632nvptx_generate_vector_shuffle (location_t loc,
4633 tree dest_var, tree var, unsigned shift,
4634 gimple_seq *seq)
4635{
4636 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4637 tree_code code = NOP_EXPR;
dd3c1b14
NS
4638 tree arg_type = unsigned_type_node;
4639 tree var_type = TREE_TYPE (var);
4640 tree dest_type = var_type;
f3552158 4641
dd3c1b14
NS
4642 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4643 var_type = TREE_TYPE (var_type);
4644
4645 if (TREE_CODE (var_type) == REAL_TYPE)
f3552158 4646 code = VIEW_CONVERT_EXPR;
dd3c1b14
NS
4647
4648 if (TYPE_SIZE (var_type)
4649 == TYPE_SIZE (long_long_unsigned_type_node))
f3552158
NS
4650 {
4651 fn = NVPTX_BUILTIN_SHUFFLELL;
dd3c1b14 4652 arg_type = long_long_unsigned_type_node;
f3552158 4653 }
dd3c1b14 4654
f3552158 4655 tree call = nvptx_builtin_decl (fn, true);
dd3c1b14
NS
4656 tree bits = build_int_cst (unsigned_type_node, shift);
4657 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4658 tree expr;
4659
4660 if (var_type != dest_type)
4661 {
4662 /* Do real and imaginary parts separately. */
4663 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4664 real = fold_build1 (code, arg_type, real);
4665 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4666 real = fold_build1 (code, var_type, real);
f3552158 4667
dd3c1b14
NS
4668 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4669 imag = fold_build1 (code, arg_type, imag);
4670 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4671 imag = fold_build1 (code, var_type, imag);
4672
4673 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4674 }
4675 else
4676 {
4677 expr = fold_build1 (code, arg_type, var);
4678 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4679 expr = fold_build1 (code, dest_type, expr);
4680 }
f3552158 4681
dd3c1b14 4682 gimplify_assign (dest_var, expr, seq);
f3552158
NS
4683}
4684
33f47f42
NS
4685/* Lazily generate the global lock var decl and return its address. */
4686
4687static tree
4688nvptx_global_lock_addr ()
4689{
4690 tree v = global_lock_var;
4691
4692 if (!v)
4693 {
4694 tree name = get_identifier ("__reduction_lock");
4695 tree type = build_qualified_type (unsigned_type_node,
4696 TYPE_QUAL_VOLATILE);
4697 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4698 global_lock_var = v;
4699 DECL_ARTIFICIAL (v) = 1;
4700 DECL_EXTERNAL (v) = 1;
4701 TREE_STATIC (v) = 1;
4702 TREE_PUBLIC (v) = 1;
4703 TREE_USED (v) = 1;
4704 mark_addressable (v);
4705 mark_decl_referenced (v);
4706 }
4707
4708 return build_fold_addr_expr (v);
4709}
4710
4711/* Insert code to locklessly update *PTR with *PTR OP VAR just before
4712 GSI. We use a lockless scheme for nearly all case, which looks
4713 like:
4714 actual = initval(OP);
4715 do {
4716 guess = actual;
4717 write = guess OP myval;
4718 actual = cmp&swap (ptr, guess, write)
4719 } while (actual bit-different-to guess);
4720 return write;
4721
4722 This relies on a cmp&swap instruction, which is available for 32-
4723 and 64-bit types. Larger types must use a locking scheme. */
f3552158
NS
4724
4725static tree
4726nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
4727 tree ptr, tree var, tree_code op)
4728{
4729 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
4730 tree_code code = NOP_EXPR;
33f47f42
NS
4731 tree arg_type = unsigned_type_node;
4732 tree var_type = TREE_TYPE (var);
f3552158 4733
33f47f42
NS
4734 if (TREE_CODE (var_type) == COMPLEX_TYPE
4735 || TREE_CODE (var_type) == REAL_TYPE)
f3552158 4736 code = VIEW_CONVERT_EXPR;
33f47f42
NS
4737
4738 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
f3552158 4739 {
33f47f42 4740 arg_type = long_long_unsigned_type_node;
f3552158 4741 fn = NVPTX_BUILTIN_CMP_SWAPLL;
f3552158
NS
4742 }
4743
33f47f42
NS
4744 tree swap_fn = nvptx_builtin_decl (fn, true);
4745
f3552158 4746 gimple_seq init_seq = NULL;
33f47f42
NS
4747 tree init_var = make_ssa_name (arg_type);
4748 tree init_expr = omp_reduction_init_op (loc, op, var_type);
4749 init_expr = fold_build1 (code, arg_type, init_expr);
f3552158
NS
4750 gimplify_assign (init_var, init_expr, &init_seq);
4751 gimple *init_end = gimple_seq_last (init_seq);
4752
4753 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
4754
f3552158
NS
4755 /* Split the block just after the init stmts. */
4756 basic_block pre_bb = gsi_bb (*gsi);
4757 edge pre_edge = split_block (pre_bb, init_end);
4758 basic_block loop_bb = pre_edge->dest;
4759 pre_bb = pre_edge->src;
4760 /* Reset the iterator. */
4761 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4762
33f47f42
NS
4763 tree expect_var = make_ssa_name (arg_type);
4764 tree actual_var = make_ssa_name (arg_type);
4765 tree write_var = make_ssa_name (arg_type);
4766
4767 /* Build and insert the reduction calculation. */
4768 gimple_seq red_seq = NULL;
4769 tree write_expr = fold_build1 (code, var_type, expect_var);
4770 write_expr = fold_build2 (op, var_type, write_expr, var);
4771 write_expr = fold_build1 (code, arg_type, write_expr);
4772 gimplify_assign (write_var, write_expr, &red_seq);
4773
4774 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4775
4776 /* Build & insert the cmp&swap sequence. */
4777 gimple_seq latch_seq = NULL;
4778 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
4779 ptr, expect_var, write_var);
4780 gimplify_assign (actual_var, swap_expr, &latch_seq);
4781
4782 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
4783 NULL_TREE, NULL_TREE);
4784 gimple_seq_add_stmt (&latch_seq, cond);
4785
4786 gimple *latch_end = gimple_seq_last (latch_seq);
4787 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
f3552158 4788
33f47f42
NS
4789 /* Split the block just after the latch stmts. */
4790 edge post_edge = split_block (loop_bb, latch_end);
f3552158
NS
4791 basic_block post_bb = post_edge->dest;
4792 loop_bb = post_edge->src;
4793 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4794
4795 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4796 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
4797 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
4798 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
4799
4800 gphi *phi = create_phi_node (expect_var, loop_bb);
4801 add_phi_arg (phi, init_var, pre_edge, loc);
4802 add_phi_arg (phi, actual_var, loop_edge, loc);
4803
4804 loop *loop = alloc_loop ();
4805 loop->header = loop_bb;
4806 loop->latch = loop_bb;
4807 add_loop (loop, loop_bb->loop_father);
4808
33f47f42
NS
4809 return fold_build1 (code, var_type, write_var);
4810}
4811
4812/* Insert code to lockfully update *PTR with *PTR OP VAR just before
4813 GSI. This is necessary for types larger than 64 bits, where there
4814 is no cmp&swap instruction to implement a lockless scheme. We use
4815 a lock variable in global memory.
4816
4817 while (cmp&swap (&lock_var, 0, 1))
4818 continue;
4819 T accum = *ptr;
4820 accum = accum OP var;
4821 *ptr = accum;
4822 cmp&swap (&lock_var, 1, 0);
4823 return accum;
4824
4825 A lock in global memory is necessary to force execution engine
4826 descheduling and avoid resource starvation that can occur if the
4827 lock is in .shared memory. */
4828
4829static tree
4830nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
4831 tree ptr, tree var, tree_code op)
4832{
4833 tree var_type = TREE_TYPE (var);
4834 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
4835 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
4836 tree uns_locked = build_int_cst (unsigned_type_node, 1);
4837
4838 /* Split the block just before the gsi. Insert a gimple nop to make
4839 this easier. */
4840 gimple *nop = gimple_build_nop ();
4841 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
4842 basic_block entry_bb = gsi_bb (*gsi);
4843 edge entry_edge = split_block (entry_bb, nop);
4844 basic_block lock_bb = entry_edge->dest;
4845 /* Reset the iterator. */
4846 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4847
4848 /* Build and insert the locking sequence. */
4849 gimple_seq lock_seq = NULL;
4850 tree lock_var = make_ssa_name (unsigned_type_node);
4851 tree lock_expr = nvptx_global_lock_addr ();
4852 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
4853 uns_unlocked, uns_locked);
4854 gimplify_assign (lock_var, lock_expr, &lock_seq);
4855 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
4856 NULL_TREE, NULL_TREE);
4857 gimple_seq_add_stmt (&lock_seq, cond);
4858 gimple *lock_end = gimple_seq_last (lock_seq);
4859 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
4860
4861 /* Split the block just after the lock sequence. */
4862 edge locked_edge = split_block (lock_bb, lock_end);
4863 basic_block update_bb = locked_edge->dest;
4864 lock_bb = locked_edge->src;
4865 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
4866
4867 /* Create the lock loop ... */
4868 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
4869 make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
4870 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
4871 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
4872
4873 /* ... and the loop structure. */
4874 loop *lock_loop = alloc_loop ();
4875 lock_loop->header = lock_bb;
4876 lock_loop->latch = lock_bb;
4877 lock_loop->nb_iterations_estimate = 1;
4878 lock_loop->any_estimate = true;
4879 add_loop (lock_loop, entry_bb->loop_father);
4880
4881 /* Build and insert the reduction calculation. */
4882 gimple_seq red_seq = NULL;
4883 tree acc_in = make_ssa_name (var_type);
4884 tree ref_in = build_simple_mem_ref (ptr);
4885 TREE_THIS_VOLATILE (ref_in) = 1;
4886 gimplify_assign (acc_in, ref_in, &red_seq);
4887
4888 tree acc_out = make_ssa_name (var_type);
4889 tree update_expr = fold_build2 (op, var_type, ref_in, var);
4890 gimplify_assign (acc_out, update_expr, &red_seq);
4891
4892 tree ref_out = build_simple_mem_ref (ptr);
4893 TREE_THIS_VOLATILE (ref_out) = 1;
4894 gimplify_assign (ref_out, acc_out, &red_seq);
4895
4896 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
4897
4898 /* Build & insert the unlock sequence. */
4899 gimple_seq unlock_seq = NULL;
4900 tree unlock_expr = nvptx_global_lock_addr ();
4901 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
4902 uns_locked, uns_unlocked);
4903 gimplify_and_add (unlock_expr, &unlock_seq);
4904 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
4905
4906 return acc_out;
4907}
4908
4909/* Emit a sequence to update a reduction accumlator at *PTR with the
4910 value held in VAR using operator OP. Return the updated value.
4911
4912 TODO: optimize for atomic ops and indepedent complex ops. */
4913
4914static tree
4915nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
4916 tree ptr, tree var, tree_code op)
4917{
4918 tree type = TREE_TYPE (var);
4919 tree size = TYPE_SIZE (type);
4920
4921 if (size == TYPE_SIZE (unsigned_type_node)
4922 || size == TYPE_SIZE (long_long_unsigned_type_node))
4923 return nvptx_lockless_update (loc, gsi, ptr, var, op);
4924 else
4925 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
4926}
4927
4928/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
4929
4930static void
4931nvptx_goacc_reduction_setup (gcall *call)
4932{
4933 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4934 tree lhs = gimple_call_lhs (call);
4935 tree var = gimple_call_arg (call, 2);
4936 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4937 gimple_seq seq = NULL;
4938
4939 push_gimplify_context (true);
4940
4941 if (level != GOMP_DIM_GANG)
4942 {
4943 /* Copy the receiver object. */
4944 tree ref_to_res = gimple_call_arg (call, 1);
4945
4946 if (!integer_zerop (ref_to_res))
4947 var = build_simple_mem_ref (ref_to_res);
4948 }
4949
4950 if (level == GOMP_DIM_WORKER)
4951 {
4952 /* Store incoming value to worker reduction buffer. */
4953 tree offset = gimple_call_arg (call, 5);
4954 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
4955 tree ptr = make_ssa_name (TREE_TYPE (call));
4956
4957 gimplify_assign (ptr, call, &seq);
4958 tree ref = build_simple_mem_ref (ptr);
4959 TREE_THIS_VOLATILE (ref) = 1;
4960 gimplify_assign (ref, var, &seq);
4961 }
4962
4963 if (lhs)
4964 gimplify_assign (lhs, var, &seq);
4965
4966 pop_gimplify_context (NULL);
4967 gsi_replace_with_seq (&gsi, seq, true);
4968}
4969
4970/* NVPTX implementation of GOACC_REDUCTION_INIT. */
4971
4972static void
4973nvptx_goacc_reduction_init (gcall *call)
4974{
4975 gimple_stmt_iterator gsi = gsi_for_stmt (call);
4976 tree lhs = gimple_call_lhs (call);
4977 tree var = gimple_call_arg (call, 2);
4978 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
4979 enum tree_code rcode
4980 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
4981 tree init = omp_reduction_init_op (gimple_location (call), rcode,
4982 TREE_TYPE (var));
4983 gimple_seq seq = NULL;
4984
4985 push_gimplify_context (true);
4986
4987 if (level == GOMP_DIM_VECTOR)
4988 {
4989 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
4990 tree tid = make_ssa_name (integer_type_node);
4991 tree dim_vector = gimple_call_arg (call, 3);
4992 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
4993 dim_vector);
4994 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
4995 NULL_TREE, NULL_TREE);
4996
4997 gimple_call_set_lhs (tid_call, tid);
4998 gimple_seq_add_stmt (&seq, tid_call);
4999 gimple_seq_add_stmt (&seq, cond_stmt);
5000
5001 /* Split the block just after the call. */
5002 edge init_edge = split_block (gsi_bb (gsi), call);
5003 basic_block init_bb = init_edge->dest;
5004 basic_block call_bb = init_edge->src;
5005
5006 /* Fixup flags from call_bb to init_bb. */
5007 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5008
5009 /* Set the initialization stmts. */
5010 gimple_seq init_seq = NULL;
5011 tree init_var = make_ssa_name (TREE_TYPE (var));
5012 gimplify_assign (init_var, init, &init_seq);
5013 gsi = gsi_start_bb (init_bb);
5014 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5015
5016 /* Split block just after the init stmt. */
5017 gsi_prev (&gsi);
5018 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5019 basic_block dst_bb = inited_edge->dest;
5020
5021 /* Create false edge from call_bb to dst_bb. */
5022 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5023
5024 /* Create phi node in dst block. */
5025 gphi *phi = create_phi_node (lhs, dst_bb);
5026 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5027 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5028
5029 /* Reset dominator of dst bb. */
5030 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5031
5032 /* Reset the gsi. */
5033 gsi = gsi_for_stmt (call);
5034 }
5035 else
5036 {
5037 if (level == GOMP_DIM_GANG)
5038 {
5039 /* If there's no receiver object, propagate the incoming VAR. */
5040 tree ref_to_res = gimple_call_arg (call, 1);
5041 if (integer_zerop (ref_to_res))
5042 init = var;
5043 }
5044
5045 gimplify_assign (lhs, init, &seq);
5046 }
5047
5048 pop_gimplify_context (NULL);
5049 gsi_replace_with_seq (&gsi, seq, true);
5050}
5051
5052/* NVPTX implementation of GOACC_REDUCTION_FINI. */
5053
5054static void
5055nvptx_goacc_reduction_fini (gcall *call)
5056{
5057 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5058 tree lhs = gimple_call_lhs (call);
5059 tree ref_to_res = gimple_call_arg (call, 1);
5060 tree var = gimple_call_arg (call, 2);
5061 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5062 enum tree_code op
5063 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5064 gimple_seq seq = NULL;
5065 tree r = NULL_TREE;;
5066
5067 push_gimplify_context (true);
5068
5069 if (level == GOMP_DIM_VECTOR)
5070 {
5071 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5072 but that requires a method of emitting a unified jump at the
5073 gimple level. */
5074 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5075 {
5076 tree other_var = make_ssa_name (TREE_TYPE (var));
5077 nvptx_generate_vector_shuffle (gimple_location (call),
5078 other_var, var, shfl, &seq);
5079
5080 r = make_ssa_name (TREE_TYPE (var));
5081 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5082 var, other_var), &seq);
5083 var = r;
5084 }
5085 }
5086 else
5087 {
5088 tree accum = NULL_TREE;
5089
5090 if (level == GOMP_DIM_WORKER)
5091 {
5092 /* Get reduction buffer address. */
5093 tree offset = gimple_call_arg (call, 5);
5094 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5095 tree ptr = make_ssa_name (TREE_TYPE (call));
5096
5097 gimplify_assign (ptr, call, &seq);
5098 accum = ptr;
5099 }
5100 else if (integer_zerop (ref_to_res))
5101 r = var;
5102 else
5103 accum = ref_to_res;
5104
5105 if (accum)
5106 {
33f47f42 5107 /* UPDATE the accumulator. */
f3552158
NS
5108 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5109 seq = NULL;
33f47f42
NS
5110 r = nvptx_reduction_update (gimple_location (call), &gsi,
5111 accum, var, op);
f3552158
NS
5112 }
5113 }
5114
5115 if (lhs)
5116 gimplify_assign (lhs, r, &seq);
5117 pop_gimplify_context (NULL);
5118
5119 gsi_replace_with_seq (&gsi, seq, true);
5120}
5121
5122/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5123
5124static void
5125nvptx_goacc_reduction_teardown (gcall *call)
5126{
5127 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5128 tree lhs = gimple_call_lhs (call);
5129 tree var = gimple_call_arg (call, 2);
5130 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5131 gimple_seq seq = NULL;
5132
5133 push_gimplify_context (true);
5134 if (level == GOMP_DIM_WORKER)
5135 {
5136 /* Read the worker reduction buffer. */
5137 tree offset = gimple_call_arg (call, 5);
5138 tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
5139 tree ptr = make_ssa_name (TREE_TYPE (call));
5140
5141 gimplify_assign (ptr, call, &seq);
5142 var = build_simple_mem_ref (ptr);
5143 TREE_THIS_VOLATILE (var) = 1;
5144 }
5145
5146 if (level != GOMP_DIM_GANG)
5147 {
5148 /* Write to the receiver object. */
5149 tree ref_to_res = gimple_call_arg (call, 1);
5150
5151 if (!integer_zerop (ref_to_res))
5152 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5153 }
5154
5155 if (lhs)
5156 gimplify_assign (lhs, var, &seq);
5157
5158 pop_gimplify_context (NULL);
5159
5160 gsi_replace_with_seq (&gsi, seq, true);
5161}
5162
5163/* NVPTX reduction expander. */
5164
5563d5c0 5165static void
f3552158
NS
5166nvptx_goacc_reduction (gcall *call)
5167{
5168 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5169
5170 switch (code)
5171 {
5172 case IFN_GOACC_REDUCTION_SETUP:
5173 nvptx_goacc_reduction_setup (call);
5174 break;
5175
5176 case IFN_GOACC_REDUCTION_INIT:
5177 nvptx_goacc_reduction_init (call);
5178 break;
5179
5180 case IFN_GOACC_REDUCTION_FINI:
5181 nvptx_goacc_reduction_fini (call);
5182 break;
5183
5184 case IFN_GOACC_REDUCTION_TEARDOWN:
5185 nvptx_goacc_reduction_teardown (call);
5186 break;
5187
5188 default:
5189 gcc_unreachable ();
5190 }
5191}
5192
738f2522
BS
5193#undef TARGET_OPTION_OVERRIDE
5194#define TARGET_OPTION_OVERRIDE nvptx_option_override
5195
5196#undef TARGET_ATTRIBUTE_TABLE
5197#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5198
d81db636
SB
5199#undef TARGET_LRA_P
5200#define TARGET_LRA_P hook_bool_void_false
5201
738f2522
BS
5202#undef TARGET_LEGITIMATE_ADDRESS_P
5203#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5204
5205#undef TARGET_PROMOTE_FUNCTION_MODE
5206#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5207
5208#undef TARGET_FUNCTION_ARG
5209#define TARGET_FUNCTION_ARG nvptx_function_arg
5210#undef TARGET_FUNCTION_INCOMING_ARG
5211#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5212#undef TARGET_FUNCTION_ARG_ADVANCE
5213#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
5214#undef TARGET_FUNCTION_ARG_BOUNDARY
5215#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
5216#undef TARGET_PASS_BY_REFERENCE
5217#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5218#undef TARGET_FUNCTION_VALUE_REGNO_P
5219#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5220#undef TARGET_FUNCTION_VALUE
5221#define TARGET_FUNCTION_VALUE nvptx_function_value
5222#undef TARGET_LIBCALL_VALUE
5223#define TARGET_LIBCALL_VALUE nvptx_libcall_value
5224#undef TARGET_FUNCTION_OK_FOR_SIBCALL
5225#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
5226#undef TARGET_GET_DRAP_RTX
5227#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
5228#undef TARGET_SPLIT_COMPLEX_ARG
5229#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5230#undef TARGET_RETURN_IN_MEMORY
5231#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5232#undef TARGET_OMIT_STRUCT_RETURN_REG
5233#define TARGET_OMIT_STRUCT_RETURN_REG true
5234#undef TARGET_STRICT_ARGUMENT_NAMING
5235#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
5236#undef TARGET_CALL_ARGS
5237#define TARGET_CALL_ARGS nvptx_call_args
5238#undef TARGET_END_CALL_ARGS
5239#define TARGET_END_CALL_ARGS nvptx_end_call_args
5240
5241#undef TARGET_ASM_FILE_START
5242#define TARGET_ASM_FILE_START nvptx_file_start
5243#undef TARGET_ASM_FILE_END
5244#define TARGET_ASM_FILE_END nvptx_file_end
5245#undef TARGET_ASM_GLOBALIZE_LABEL
5246#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5247#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5248#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5249#undef TARGET_PRINT_OPERAND
5250#define TARGET_PRINT_OPERAND nvptx_print_operand
5251#undef TARGET_PRINT_OPERAND_ADDRESS
5252#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5253#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5254#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5255#undef TARGET_ASM_INTEGER
5256#define TARGET_ASM_INTEGER nvptx_assemble_integer
5257#undef TARGET_ASM_DECL_END
5258#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5259#undef TARGET_ASM_DECLARE_CONSTANT_NAME
5260#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5261#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5262#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5263#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5264#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5265
5266#undef TARGET_MACHINE_DEPENDENT_REORG
5267#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5268#undef TARGET_NO_REGISTER_ALLOCATION
5269#define TARGET_NO_REGISTER_ALLOCATION true
5270
9a863523
NS
5271#undef TARGET_ENCODE_SECTION_INFO
5272#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
5273#undef TARGET_RECORD_OFFLOAD_SYMBOL
5274#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5275
738f2522
BS
5276#undef TARGET_VECTOR_ALIGNMENT
5277#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5278
d88cd9c4
NS
5279#undef TARGET_CANNOT_COPY_INSN_P
5280#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5281
a794bd20
NS
5282#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5283#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5284
f3552158
NS
5285#undef TARGET_INIT_BUILTINS
5286#define TARGET_INIT_BUILTINS nvptx_init_builtins
5287#undef TARGET_EXPAND_BUILTIN
5288#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5289#undef TARGET_BUILTIN_DECL
5290#define TARGET_BUILTIN_DECL nvptx_builtin_decl
5291
5012919d
AM
5292#undef TARGET_SIMT_VF
5293#define TARGET_SIMT_VF nvptx_simt_vf
5294
94829f87
NS
5295#undef TARGET_GOACC_VALIDATE_DIMS
5296#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5297
bd751975
NS
5298#undef TARGET_GOACC_DIM_LIMIT
5299#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5300
d88cd9c4
NS
5301#undef TARGET_GOACC_FORK_JOIN
5302#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5303
f3552158
NS
5304#undef TARGET_GOACC_REDUCTION
5305#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5306
738f2522
BS
5307struct gcc_target targetm = TARGET_INITIALIZER;
5308
5309#include "gt-nvptx.h"