]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
[nvptx] Generalize bar.sync instruction
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
85ec4feb 2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19
NS
61#include "omp-low.h"
62#include "gomp-constants.h"
d88cd9c4 63#include "dumpfile.h"
f3552158
NS
64#include "internal-fn.h"
65#include "gimple-iterator.h"
66#include "stringpool.h"
314e6352 67#include "attribs.h"
f2d4d839 68#include "tree-vrp.h"
f3552158
NS
69#include "tree-ssa-operands.h"
70#include "tree-ssanames.h"
71#include "gimplify.h"
72#include "tree-phinodes.h"
73#include "cfgloop.h"
74#include "fold-const.h"
50c620ca 75#include "intl.h"
738f2522 76
994c5d85 77/* This file should be included last. */
d58627a0
RS
78#include "target-def.h"
79
/* Switches enabling workarounds for PTX JIT bugs.
   NOTE(review): the code guarded by these switches is not visible in
   this part of the file.  */
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* PTX launch geometry constants.  */
#define PTX_WARP_SIZE 32
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
5d17a476 88
9a863523
NS
/* The PTX memory areas in which an object might reside.  The numeric
   values are stored in symbol flags and used to index a table of PTX
   area names, so the order must not change.  DATA_AREA_MAX counts the
   real areas and is not itself an area.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};
100
/* The data area of a symbol is kept in the machine-dependent part of
   its SYMBOL_REF flags, in a three-bit field.  SYMBOL_DATA_AREA reads
   the field; SET_SYMBOL_DATA_AREA ORs AREA into it and does not clear
   any value stored there previously.  */
#define SYMBOL_DATA_AREA(SYM)						\
  ((nvptx_data_area)							\
   ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) & 7))
#define SET_SYMBOL_DATA_AREA(SYM,AREA)					\
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
107
738f2522
BS
108/* Record the function decls we've written, and the libfuncs and function
109 decls corresponding to them. */
110static std::stringstream func_decls;
f3dba894 111
6c907cff 112struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
113{
114 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
115 static bool equal (rtx a, rtx b) { return a == b; }
116};
117
118static GTY((cache))
119 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
120
6c907cff 121struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
122{
123 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
124 static bool equal (tree a, tree b) { return a == b; }
125};
126
127static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
128static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 129
f3552158
NS
130/* Buffer needed to broadcast across workers. This is used for both
131 worker-neutering and worker broadcasting. It is shared by all
132 functions emitted. The buffer is placed in shared memory. It'd be
133 nice if PTX supported common blocks, because then this could be
134 shared across TUs (taking the largest size). */
d88cd9c4
NS
135static unsigned worker_bcast_size;
136static unsigned worker_bcast_align;
d88cd9c4
NS
137static GTY(()) rtx worker_bcast_sym;
138
f3552158
NS
139/* Buffer needed for worker reductions. This has to be distinct from
140 the worker broadcast array, as both may be live concurrently. */
141static unsigned worker_red_size;
142static unsigned worker_red_align;
f3552158
NS
143static GTY(()) rtx worker_red_sym;
144
33f47f42
NS
145/* Global lock variable, needed for 128bit worker & gang reductions. */
146static GTY(()) tree global_lock_var;
147
5012919d
AM
148/* True if any function references __nvptx_stacks. */
149static bool need_softstack_decl;
150
151/* True if any function references __nvptx_uni. */
152static bool need_unisimt_decl;
153
738f2522
BS
154/* Allocate a new, cleared machine_function structure. */
155
156static struct machine_function *
157nvptx_init_machine_status (void)
158{
159 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 160 p->return_mode = VOIDmode;
738f2522
BS
161 return p;
162}
163
5012919d
AM
164/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
165 and -fopenacc is also enabled. */
166
167static void
168diagnose_openacc_conflict (bool optval, const char *optname)
169{
170 if (flag_openacc && optval)
171 error ("option %s is not supported together with -fopenacc", optname);
172}
173
738f2522
BS
174/* Implement TARGET_OPTION_OVERRIDE. */
175
176static void
177nvptx_option_override (void)
178{
179 init_machine_status = nvptx_init_machine_status;
ae578943 180
3d339d5e
AM
181 /* Set toplevel_reorder, unless explicitly disabled. We need
182 reordering so that we emit necessary assembler decls of
183 undeclared variables. */
184 if (!global_options_set.x_flag_toplevel_reorder)
185 flag_toplevel_reorder = 1;
186
fa9afdc3
TV
187 debug_nonbind_markers_p = 0;
188
ae578943
NS
189 /* Set flag_no_common, unless explicitly disabled. We fake common
190 using .weak, and that's not entirely accurate, so avoid it
191 unless forced. */
192 if (!global_options_set.x_flag_no_common)
193 flag_no_common = 1;
194
cad136a4
TV
195 /* The patch area requires nops, which we don't have. */
196 if (function_entry_patch_area_size > 0)
197 sorry ("not generating patch area, nops not supported");
198
738f2522
BS
199 /* Assumes that it will see only hard registers. */
200 flag_var_tracking = 0;
1e5154e7 201
dba619f3
NS
202 if (nvptx_optimize < 0)
203 nvptx_optimize = optimize > 0;
204
f3dba894
TS
205 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
206 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 207 declared_libfuncs_htab
f3dba894 208 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 209
15ab6f00 210 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
9a863523 211 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
d88cd9c4 212 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
f3552158 213
15ab6f00 214 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 215 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 216 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d
AM
217
218 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
219 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
220 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
221
222 if (TARGET_GOMP)
223 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
224}
225
738f2522
BS
226/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
227 deal with ptx ideosyncracies. */
228
229const char *
230nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
231{
232 switch (mode)
233 {
4e10a5a7 234 case E_BLKmode:
738f2522 235 return ".b8";
4e10a5a7 236 case E_BImode:
738f2522 237 return ".pred";
4e10a5a7 238 case E_QImode:
738f2522
BS
239 if (promote)
240 return ".u32";
241 else
242 return ".u8";
4e10a5a7 243 case E_HImode:
738f2522 244 return ".u16";
4e10a5a7 245 case E_SImode:
738f2522 246 return ".u32";
4e10a5a7 247 case E_DImode:
738f2522
BS
248 return ".u64";
249
4e10a5a7 250 case E_SFmode:
738f2522 251 return ".f32";
4e10a5a7 252 case E_DFmode:
738f2522
BS
253 return ".f64";
254
4e10a5a7 255 case E_V2SImode:
b98b34b7 256 return ".v2.u32";
4e10a5a7 257 case E_V2DImode:
3717fbe3 258 return ".v2.u64";
b98b34b7 259
738f2522
BS
260 default:
261 gcc_unreachable ();
262 }
263}
264
9a863523
NS
265/* Encode the PTX data area that DECL (which might not actually be a
266 _DECL) should reside in. */
7b8edc29 267
9a863523
NS
268static void
269nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 270{
9a863523
NS
271 default_encode_section_info (decl, rtl, first);
272 if (first && MEM_P (rtl))
273 {
274 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 275
9a863523
NS
276 if (TREE_CONSTANT (decl))
277 area = DATA_AREA_CONST;
278 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
279 {
280 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
281 {
282 area = DATA_AREA_SHARED;
283 if (DECL_INITIAL (decl))
284 error ("static initialization of variable %q+D in %<.shared%>"
285 " memory is not supported", decl);
286 }
287 else
288 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
289 }
7b8edc29 290
9a863523
NS
291 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
292 }
293}
294
295/* Return the PTX name of the data area in which SYM should be
296 placed. The symbol must have already been processed by
297 nvptx_encode_seciton_info, or equivalent. */
298
299static const char *
300section_for_sym (rtx sym)
301{
302 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
303 /* Same order as nvptx_data_area enum. */
304 static char const *const areas[] =
305 {"", ".global", ".shared", ".local", ".const", ".param"};
306
307 return areas[area];
308}
309
310/* Similarly for a decl. */
311
312static const char *
313section_for_decl (const_tree decl)
314{
315 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
316}
317
b4346107
NS
/* Return the name to use in place of NAME.  The special names malloc,
   free and realloc are redirected to libgcc wrappers, and call is
   redirected because it triggers a bug in ptxas; any other NAME is
   returned unchanged.  TARGET_MANGLE_DECL_ASSEMBLER_NAME cannot be used
   for this, as it is not active in an offload compiler -- the names are
   all set by the host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct
  {
    const char *from;
    const char *to;
  } redirects[] =
    {
      { "call", "__nvptx_call" },
      { "malloc", "__nvptx_malloc" },
      { "free", "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (unsigned i = 0; i < sizeof redirects / sizeof redirects[0]; i++)
    if (strcmp (name, redirects[i].from) == 0)
      return redirects[i].to;

  return name;
}
338
d7479262
NS
339/* If MODE should be treated as two registers of an inner mode, return
340 that inner mode. Otherwise return VOIDmode. */
738f2522 341
d7479262
NS
342static machine_mode
343maybe_split_mode (machine_mode mode)
738f2522 344{
738f2522 345 if (COMPLEX_MODE_P (mode))
d7479262 346 return GET_MODE_INNER (mode);
738f2522 347
738f2522 348 if (mode == TImode)
d7479262
NS
349 return DImode;
350
351 return VOIDmode;
738f2522
BS
352}
353
38827811
TV
354/* Return true if mode should be treated as two registers. */
355
356static bool
357split_mode_p (machine_mode mode)
358{
359 return maybe_split_mode (mode) != VOIDmode;
360}
361
f313d112
NS
362/* Output a register, subreg, or register pair (with optional
363 enclosing braces). */
364
365static void
366output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
367 int subreg_offset = -1)
368{
369 if (inner_mode == VOIDmode)
370 {
371 if (HARD_REGISTER_NUM_P (regno))
372 fprintf (file, "%s", reg_names[regno]);
373 else
374 fprintf (file, "%%r%d", regno);
375 }
376 else if (subreg_offset >= 0)
377 {
378 output_reg (file, regno, VOIDmode);
379 fprintf (file, "$%d", subreg_offset);
380 }
381 else
382 {
383 if (subreg_offset == -1)
384 fprintf (file, "{");
385 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
386 fprintf (file, ",");
387 output_reg (file, regno, inner_mode, 0);
388 if (subreg_offset == -1)
389 fprintf (file, "}");
390 }
391}
392
d88cd9c4
NS
393/* Emit forking instructions for MASK. */
394
395static void
396nvptx_emit_forking (unsigned mask, bool is_call)
397{
398 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
399 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
400 if (mask)
401 {
402 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
403
404 /* Emit fork at all levels. This helps form SESE regions, as
405 it creates a block with a single successor before entering a
406 partitooned region. That is a good candidate for the end of
407 an SESE region. */
e91eba31 408 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
409 emit_insn (gen_nvptx_forked (op));
410 }
411}
412
413/* Emit joining instructions for MASK. */
414
415static void
416nvptx_emit_joining (unsigned mask, bool is_call)
417{
418 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
419 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
420 if (mask)
421 {
422 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
423
424 /* Emit joining for all non-call pars to ensure there's a single
425 predecessor for the block the join insn ends up in. This is
426 needed for skipping entire loops. */
e91eba31 427 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
428 emit_insn (gen_nvptx_join (op));
429 }
430}
431
738f2522 432\f
44eba92d
NS
433/* Determine whether MODE and TYPE (possibly NULL) should be passed or
434 returned in memory. Integer and floating types supported by the
435 machine are passed in registers, everything else is passed in
436 memory. Complex types are split. */
437
438static bool
439pass_in_memory (machine_mode mode, const_tree type, bool for_return)
440{
441 if (type)
442 {
443 if (AGGREGATE_TYPE_P (type))
444 return true;
445 if (TREE_CODE (type) == VECTOR_TYPE)
446 return true;
447 }
448
449 if (!for_return && COMPLEX_MODE_P (mode))
450 /* Complex types are passed as two underlying args. */
451 mode = GET_MODE_INNER (mode);
452
453 if (GET_MODE_CLASS (mode) != MODE_INT
454 && GET_MODE_CLASS (mode) != MODE_FLOAT)
455 return true;
456
457 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
458 return true;
459
460 return false;
461}
462
463/* A non-memory argument of mode MODE is being passed, determine the mode it
464 should be promoted to. This is also used for determining return
465 type promotion. */
466
467static machine_mode
468promote_arg (machine_mode mode, bool prototyped)
469{
470 if (!prototyped && mode == SFmode)
471 /* K&R float promotion for unprototyped functions. */
472 mode = DFmode;
473 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
474 mode = SImode;
475
476 return mode;
477}
478
479/* A non-memory return type of MODE is being returned. Determine the
480 mode it should be promoted to. */
481
482static machine_mode
483promote_return (machine_mode mode)
484{
485 return promote_arg (mode, true);
486}
487
e74f7152 488/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 489
e74f7152 490static rtx
b49e35a9 491nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
e74f7152
NS
492 const_tree, bool named)
493{
b49e35a9 494 if (mode == VOIDmode || !named)
e74f7152 495 return NULL_RTX;
738f2522 496
b49e35a9 497 return gen_reg_rtx (mode);
e74f7152
NS
498}
499
500/* Implement TARGET_FUNCTION_INCOMING_ARG. */
501
502static rtx
503nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
504 const_tree, bool named)
738f2522 505{
e74f7152 506 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 507
b49e35a9 508 if (mode == VOIDmode || !named)
e74f7152 509 return NULL_RTX;
738f2522 510
e74f7152
NS
511 /* No need to deal with split modes here, the only case that can
512 happen is complex modes and those are dealt with by
513 TARGET_SPLIT_COMPLEX_ARG. */
514 return gen_rtx_UNSPEC (mode,
515 gen_rtvec (1, GEN_INT (cum->count)),
516 UNSPEC_ARG_REG);
517}
518
519/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
520
521static void
522nvptx_function_arg_advance (cumulative_args_t cum_v,
523 machine_mode ARG_UNUSED (mode),
524 const_tree ARG_UNUSED (type),
525 bool ARG_UNUSED (named))
526{
527 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 528
e74f7152
NS
529 cum->count++;
530}
531
1988b2c2
NS
532/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
533
534 For nvptx This is only used for varadic args. The type has already
535 been promoted and/or converted to invisible reference. */
536
537static unsigned
538nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
539{
540 return GET_MODE_ALIGNMENT (mode);
541}
542
e74f7152
NS
543/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
544
545 For nvptx, we know how to handle functions declared as stdarg: by
546 passing an extra pointer to the unnamed arguments. However, the
547 Fortran frontend can produce a different situation, where a
548 function pointer is declared with no arguments, but the actual
549 function and calls to it take more arguments. In that case, we
550 want to ensure the call matches the definition of the function. */
551
552static bool
553nvptx_strict_argument_naming (cumulative_args_t cum_v)
554{
555 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 556
e74f7152
NS
557 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
558}
559
e74f7152
NS
560/* Implement TARGET_LIBCALL_VALUE. */
561
562static rtx
563nvptx_libcall_value (machine_mode mode, const_rtx)
564{
e91cacae 565 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
566 /* Pretend to return in a hard reg for early uses before pseudos can be
567 generated. */
568 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 569
e74f7152
NS
570 return gen_reg_rtx (mode);
571}
572
573/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
574 where function FUNC returns or receives a value of data type TYPE. */
575
576static rtx
44eba92d 577nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
578 bool outgoing)
579{
44eba92d
NS
580 machine_mode mode = promote_return (TYPE_MODE (type));
581
e74f7152 582 if (outgoing)
5c036f3f 583 {
e91cacae 584 gcc_assert (cfun);
44c068ae 585 cfun->machine->return_mode = mode;
5c036f3f
NS
586 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
587 }
b49e35a9
NS
588
589 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
590}
591
592/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
593
594static bool
595nvptx_function_value_regno_p (const unsigned int regno)
596{
597 return regno == NVPTX_RETURN_REGNUM;
598}
599
600/* Types with a mode other than those supported by the machine are passed by
601 reference in memory. */
602
603static bool
5563d5c0
NS
604nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
605 machine_mode mode, const_tree type,
606 bool ARG_UNUSED (named))
e74f7152 607{
44eba92d 608 return pass_in_memory (mode, type, false);
e74f7152
NS
609}
610
611/* Implement TARGET_RETURN_IN_MEMORY. */
612
613static bool
614nvptx_return_in_memory (const_tree type, const_tree)
615{
44eba92d 616 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
617}
618
619/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
620
621static machine_mode
622nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 623 int *ARG_UNUSED (punsignedp),
e74f7152
NS
624 const_tree funtype, int for_return)
625{
44eba92d 626 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
627}
628
e74f7152
NS
629/* Helper for write_arg. Emit a single PTX argument of MODE, either
630 in a prototype, or as copy in a function prologue. ARGNO is the
631 index of this argument in the PTX function. FOR_REG is negative,
632 if we're emitting the PTX prototype. It is zero if we're copying
633 to an argument register and it is greater than zero if we're
634 copying to a specific hard register. */
635
636static int
1f065954
NS
637write_arg_mode (std::stringstream &s, int for_reg, int argno,
638 machine_mode mode)
e74f7152
NS
639{
640 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
641
dc3d2aeb
NS
642 if (for_reg < 0)
643 {
644 /* Writing PTX prototype. */
645 s << (argno ? ", " : " (");
e74f7152 646 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
647 }
648 else
649 {
e74f7152 650 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
651 if (for_reg)
652 s << reg_names[for_reg];
653 else
654 s << "%ar" << argno;
655 s << ";\n";
5563d5c0
NS
656 if (argno >= 0)
657 {
658 s << "\tld.param" << ptx_type << " ";
659 if (for_reg)
660 s << reg_names[for_reg];
661 else
662 s << "%ar" << argno;
663 s << ", [%in_ar" << argno << "];\n";
664 }
dc3d2aeb
NS
665 }
666 return argno + 1;
738f2522
BS
667}
668
e74f7152 669/* Process function parameter TYPE to emit one or more PTX
1f065954 670 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
671 is true, if this is a prototyped function, rather than an old-style
672 C declaration. Returns the next argument number to use.
e74f7152 673
9c582551 674 The promotion behavior here must match the regular GCC function
e74f7152
NS
675 parameter marshalling machinery. */
676
677static int
1f065954
NS
678write_arg_type (std::stringstream &s, int for_reg, int argno,
679 tree type, bool prototyped)
e74f7152
NS
680{
681 machine_mode mode = TYPE_MODE (type);
682
683 if (mode == VOIDmode)
684 return argno;
685
44eba92d 686 if (pass_in_memory (mode, type, false))
e74f7152 687 mode = Pmode;
44eba92d
NS
688 else
689 {
690 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 691
44eba92d
NS
692 if (split)
693 {
694 /* Complex types are sent as two separate args. */
695 type = TREE_TYPE (type);
5563d5c0 696 mode = TYPE_MODE (type);
44eba92d
NS
697 prototyped = true;
698 }
e74f7152 699
44eba92d
NS
700 mode = promote_arg (mode, prototyped);
701 if (split)
1f065954 702 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 703 }
e74f7152 704
1f065954
NS
705 return write_arg_mode (s, for_reg, argno, mode);
706}
707
708/* Emit a PTX return as a prototype or function prologue declaration
709 for MODE. */
710
711static void
712write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
713{
714 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
715 const char *pfx = "\t.reg";
716 const char *sfx = ";\n";
717
718 if (for_proto)
719 pfx = "(.param", sfx = "_out) ";
720
721 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
722}
723
44eba92d 724/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 725 or function prologue declaration. Returns true if return is via an
9c582551 726 additional pointer parameter. The promotion behavior here must
1f065954 727 match the regular GCC function return mashalling. */
44eba92d 728
0beb7c71 729static bool
1f065954 730write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
731{
732 machine_mode mode = TYPE_MODE (type);
0beb7c71 733
44eba92d
NS
734 if (mode == VOIDmode)
735 return false;
736
737 bool return_in_mem = pass_in_memory (mode, type, true);
738
739 if (return_in_mem)
0beb7c71 740 {
44eba92d
NS
741 if (for_proto)
742 return return_in_mem;
743
744 /* Named return values can cause us to return a pointer as well
745 as expect an argument for the return location. This is
746 optimization-level specific, so no caller can make use of
747 this data, but more importantly for us, we must ensure it
748 doesn't change the PTX prototype. */
44c068ae 749 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 750
44eba92d
NS
751 if (mode == VOIDmode)
752 return return_in_mem;
753
44c068ae 754 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 755 retval parameter. */
44c068ae 756 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
757 }
758 else
44eba92d
NS
759 mode = promote_return (mode);
760
1f065954 761 write_return_mode (s, for_proto, mode);
0beb7c71
NS
762
763 return return_in_mem;
764}
765
738f2522
BS
766/* Look for attributes in ATTRS that would indicate we must write a function
767 as a .entry kernel rather than a .func. Return true if one is found. */
768
769static bool
770write_as_kernel (tree attrs)
771{
772 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
773 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
774 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
775 /* For OpenMP target regions, the corresponding kernel entry is emitted from
776 write_omp_entry as a separate function. */
738f2522
BS
777}
778
69823d76
NS
/* Emit to S the linker marker comment for the function NAME.  IS_DEFN
   selects between a definition (DEF) and a declaration (DECL) marker;
   GLOBALIZE adds the GLOBAL keyword.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  const char *scope = globalize ? " GLOBAL" : "";
  const char *kind = is_defn ? "DEF: " : "DECL: ";

  s << "\n// BEGIN" << scope << " FUNCTION " << kind << name << "\n";
}
791
792/* Emit a linker marker for a variable decl or defn. */
793
794static void
795write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
796{
797 fprintf (file, "\n// BEGIN%s VAR %s: ",
798 globalize ? " GLOBAL" : "",
799 is_defn ? "DEF" : "DECL");
800 assemble_name_raw (file, name);
801 fputs ("\n", file);
802}
803
b699adcc
NS
804/* Write a .func or .kernel declaration or definition along with
805 a helper comment for use by ld. S is the stream to write to, DECL
806 the decl for the function with name NAME. For definitions, emit
807 a declaration too. */
738f2522 808
b699adcc
NS
809static const char *
810write_fn_proto (std::stringstream &s, bool is_defn,
811 const char *name, const_tree decl)
738f2522 812{
b699adcc
NS
813 if (is_defn)
814 /* Emit a declaration. The PTX assembler gets upset without it. */
815 name = write_fn_proto (s, false, name, decl);
b4346107
NS
816 else
817 {
818 /* Avoid repeating the name replacement. */
819 name = nvptx_name_replacement (name);
820 if (name[0] == '*')
821 name++;
822 }
738f2522 823
69823d76 824 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
825
826 /* PTX declaration. */
738f2522
BS
827 if (DECL_EXTERNAL (decl))
828 s << ".extern ";
829 else if (TREE_PUBLIC (decl))
0766660b 830 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 831 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 832
b699adcc
NS
833 tree fntype = TREE_TYPE (decl);
834 tree result_type = TREE_TYPE (fntype);
738f2522 835
ac3d02e2
NS
836 /* atomic_compare_exchange_$n builtins have an exceptional calling
837 convention. */
838 int not_atomic_weak_arg = -1;
839 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
840 switch (DECL_FUNCTION_CODE (decl))
841 {
842 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
843 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
844 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
845 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
846 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
847 /* These atomics skip the 'weak' parm in an actual library
848 call. We must skip it in the prototype too. */
849 not_atomic_weak_arg = 3;
850 break;
851
852 default:
853 break;
854 }
855
738f2522 856 /* Declare the result. */
1f065954 857 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 858
b699adcc
NS
859 s << name;
860
dc3d2aeb 861 int argno = 0;
b699adcc
NS
862
863 /* Emit argument list. */
864 if (return_in_mem)
1f065954 865 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 866
b699adcc
NS
867 /* We get:
868 NULL in TYPE_ARG_TYPES, for old-style functions
869 NULL in DECL_ARGUMENTS, for builtin functions without another
870 declaration.
871 So we have to pick the best one we have. */
872 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
873 bool prototyped = true;
874 if (!args)
875 {
876 args = DECL_ARGUMENTS (decl);
877 prototyped = false;
878 }
738f2522 879
ac3d02e2 880 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 881 {
dc3d2aeb 882 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
883
884 if (not_atomic_weak_arg)
885 argno = write_arg_type (s, -1, argno, type, prototyped);
886 else
887 gcc_assert (type == boolean_type_node);
738f2522 888 }
738f2522 889
b699adcc 890 if (stdarg_p (fntype))
1f065954 891 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 892
b699adcc 893 if (DECL_STATIC_CHAIN (decl))
1f065954 894 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 895
dc3d2aeb 896 if (!argno && strcmp (name, "main") == 0)
b699adcc 897 {
1f065954
NS
898 argno = write_arg_type (s, -1, argno, integer_type_node, true);
899 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
900 }
901
dc3d2aeb 902 if (argno)
b699adcc
NS
903 s << ")";
904
905 s << (is_defn ? "\n" : ";\n");
906
907 return name;
738f2522
BS
908}
909
00e52418
NS
910/* Construct a function declaration from a call insn. This can be
911 necessary for two reasons - either we have an indirect call which
912 requires a .callprototype declaration, or we have a libcall
913 generated by emit_library_call for which no decl exists. */
914
915static void
b699adcc
NS
916write_fn_proto_from_insn (std::stringstream &s, const char *name,
917 rtx result, rtx pat)
00e52418
NS
918{
919 if (!name)
920 {
921 s << "\t.callprototype ";
922 name = "_";
923 }
924 else
925 {
b4346107 926 name = nvptx_name_replacement (name);
69823d76 927 write_fn_marker (s, false, true, name);
00e52418
NS
928 s << "\t.extern .func ";
929 }
930
931 if (result != NULL_RTX)
1f065954 932 write_return_mode (s, true, GET_MODE (result));
00e52418
NS
933
934 s << name;
935
00e52418
NS
936 int arg_end = XVECLEN (pat, 0);
937 for (int i = 1; i < arg_end; i++)
938 {
1f065954
NS
939 /* We don't have to deal with mode splitting & promotion here,
940 as that was already done when generating the call
941 sequence. */
00e52418
NS
942 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
943
1f065954 944 write_arg_mode (s, -1, i - 1, mode);
00e52418
NS
945 }
946 if (arg_end != 1)
947 s << ")";
948 s << ";\n";
949}
950
00e52418
NS
951/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
952 table and and write a ptx prototype. These are emitted at end of
953 compilation. */
738f2522 954
00e52418
NS
955static void
956nvptx_record_fndecl (tree decl)
738f2522 957{
f3dba894 958 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
738f2522
BS
959 if (*slot == NULL)
960 {
961 *slot = decl;
962 const char *name = get_fnname_from_decl (decl);
b699adcc 963 write_fn_proto (func_decls, false, name, decl);
738f2522 964 }
738f2522
BS
965}
966
00e52418
NS
967/* Record a libcall or unprototyped external function. CALLEE is the
968 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
969 declaration for it. */
970
971static void
972nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
973{
974 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
975 if (*slot == NULL)
976 {
977 *slot = callee;
978
979 const char *name = XSTR (callee, 0);
b699adcc 980 write_fn_proto_from_insn (func_decls, name, retval, pat);
00e52418
NS
981 }
982}
983
984/* DECL is an external FUNCTION_DECL, that we're referencing. If it
985 is prototyped, record it now. Otherwise record it as needed at end
986 of compilation, when we might have more information about it. */
738f2522
BS
987
988void
989nvptx_record_needed_fndecl (tree decl)
990{
00e52418
NS
991 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
992 {
993 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
994 if (*slot == NULL)
995 *slot = decl;
996 }
997 else
998 nvptx_record_fndecl (decl);
999}
738f2522 1000
00e52418
NS
1001/* SYM is a SYMBOL_REF. If it refers to an external function, record
1002 it as needed. */
1003
1004static void
1005nvptx_maybe_record_fnsym (rtx sym)
1006{
1007 tree decl = SYMBOL_REF_DECL (sym);
1008
1009 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1010 nvptx_record_needed_fndecl (decl);
738f2522
BS
1011}
1012
1f065954 1013/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
1014 and initialize REGNO to point to it. If the size is zero, it'll
1015 never be valid to dereference, so we can simply initialize to
1016 zero. */
1f065954
NS
1017
1018static void
1019init_frame (FILE *file, int regno, unsigned align, unsigned size)
1020{
5563d5c0
NS
1021 if (size)
1022 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1023 align, reg_names[regno], size);
1024 fprintf (file, "\t.reg.u%d %s;\n",
1025 POINTER_SIZE, reg_names[regno]);
1026 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1027 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
1028 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1029}
1030
5012919d
AM
1031/* Emit soft stack frame setup sequence. */
1032
1033static void
1034init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1035{
1036 /* Maintain 64-bit stack alignment. */
1037 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1038 size = ROUND_UP (size, keep_align);
1039 int bits = POINTER_SIZE;
1040 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1041 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1042 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1043 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1044 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1045 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1046 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1047 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1048 fprintf (file, "\t{\n");
1049 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1050 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1051 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1052 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1053 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1054 bits == 64 ? ".wide" : ".lo", bits / 8);
1055 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1056
1057 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1058 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1059
1060 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1061 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1062 bits, reg_sspprev, reg_sspslot);
1063
1064 /* Initialize %frame = %sspprev - size. */
1065 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1066 bits, reg_frame, reg_sspprev, size);
1067
1068 /* Apply alignment, if larger than 64. */
1069 if (alignment > keep_align)
1070 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1071 bits, reg_frame, reg_frame, -alignment);
1072
1073 size = crtl->outgoing_args_size;
1074 gcc_assert (size % keep_align == 0);
1075
1076 /* Initialize %stack. */
1077 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1078 bits, reg_stack, reg_frame, size);
1079
5012919d
AM
1080 if (!crtl->is_leaf)
1081 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1082 bits, reg_sspslot, reg_stack);
1083 fprintf (file, "\t}\n");
1084 cfun->machine->has_softstack = true;
1085 need_softstack_decl = true;
1086}
1087
d88cd9c4
NS
/* Write to FILE code that sets predicate register %r<REGNO> to
   "%tid.<NAME> != 0", using a block-local temporary named %<NAME>.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  const char *axis = name;

  fputs ("\t{\n", file);
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", axis);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", axis, axis);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, axis);
  fputs ("\t}\n", file);
}
1100
5012919d
AM
1101/* Emit code to initialize predicate and master lane index registers for
1102 -muniform-simt code generation variant. */
1103
1104static void
1105nvptx_init_unisimt_predicate (FILE *file)
1106{
0c6b03b5
AM
1107 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1108 int loc = REGNO (cfun->machine->unisimt_location);
5012919d 1109 int bits = POINTER_SIZE;
0c6b03b5 1110 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
5012919d
AM
1111 fprintf (file, "\t{\n");
1112 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1113 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
5012919d
AM
1114 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1115 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1116 bits == 64 ? ".wide" : ".lo");
0c6b03b5
AM
1117 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1118 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1119 if (cfun->machine->unisimt_predicate)
1120 {
1121 int master = REGNO (cfun->machine->unisimt_master);
1122 int pred = REGNO (cfun->machine->unisimt_predicate);
1123 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1124 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1125 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1126 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1127 /* Compute predicate as 'tid.x == master'. */
1128 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1129 }
5012919d
AM
1130 fprintf (file, "\t}\n");
1131 need_unisimt_decl = true;
1132}
1133
1134/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1135
1136 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1137 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1138 {
1139 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1140 __nvptx_uni[tid.y] = 0;
1141 gomp_nvptx_main (ORIG, arg);
1142 }
1143 ORIG itself should not be emitted as a PTX .entry function. */
1144
1145static void
1146write_omp_entry (FILE *file, const char *name, const char *orig)
1147{
1148 static bool gomp_nvptx_main_declared;
1149 if (!gomp_nvptx_main_declared)
1150 {
1151 gomp_nvptx_main_declared = true;
1152 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1153 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1154 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1155 }
3c5b4c6d
TS
1156 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1157#define NTID_Y "%ntid.y"
5012919d
AM
1158#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1159 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1160{\n\
1161 .reg.u32 %r<3>;\n\
1162 .reg.u" PS " %R<4>;\n\
1163 mov.u32 %r0, %tid.y;\n\
3c5b4c6d 1164 mov.u32 %r1, " NTID_Y ";\n\
5012919d
AM
1165 mov.u32 %r2, %ctaid.x;\n\
1166 cvt.u" PS ".u32 %R1, %r0;\n\
1167 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1168 mov.u" PS " %R0, __nvptx_stacks;\n\
1169 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1170 ld.param.u" PS " %R2, [%stack];\n\
1171 ld.param.u" PS " %R3, [%sz];\n\
1172 add.u" PS " %R2, %R2, %R3;\n\
1173 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1174 st.shared.u" PS " [%R0], %R2;\n\
1175 mov.u" PS " %R0, __nvptx_uni;\n\
1176 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1177 mov.u32 %r0, 0;\n\
1178 st.shared.u32 [%R0], %r0;\n\
1179 mov.u" PS " %R0, \0;\n\
1180 ld.param.u" PS " %R1, [%arg];\n\
1181 {\n\
1182 .param.u" PS " %P<2>;\n\
1183 st.param.u" PS " [%P0], %R0;\n\
1184 st.param.u" PS " [%P1], %R1;\n\
1185 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1186 }\n\
1187 ret.uni;\n\
1188}\n"
1189 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1190 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1191#undef ENTRY_TEMPLATE
3c5b4c6d 1192#undef NTID_Y
5012919d
AM
1193 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1194 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1195 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1196 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1197 need_softstack_decl = need_unisimt_decl = true;
1198}
1199
738f2522
BS
1200/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1201 function, including local var decls and copies from the arguments to
1202 local regs. */
1203
1204void
1205nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1206{
1207 tree fntype = TREE_TYPE (decl);
1208 tree result_type = TREE_TYPE (fntype);
dc3d2aeb 1209 int argno = 0;
738f2522 1210
5012919d
AM
1211 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1212 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1213 {
1214 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1215 sprintf (buf, "%s$impl", name);
1216 write_omp_entry (file, name, buf);
1217 name = buf;
1218 }
dc3d2aeb
NS
1219 /* We construct the initial part of the function into a string
1220 stream, in order to share the prototype writing code. */
738f2522 1221 std::stringstream s;
b699adcc 1222 write_fn_proto (s, true, name, decl);
dc3d2aeb 1223 s << "{\n";
738f2522 1224
1f065954 1225 bool return_in_mem = write_return_type (s, false, result_type);
738f2522 1226 if (return_in_mem)
1f065954 1227 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
dc3d2aeb 1228
5ab662d5 1229 /* Declare and initialize incoming arguments. */
dc3d2aeb
NS
1230 tree args = TYPE_ARG_TYPES (fntype);
1231 bool prototyped = true;
1232 if (!args)
5ab662d5 1233 {
dc3d2aeb
NS
1234 args = DECL_ARGUMENTS (decl);
1235 prototyped = false;
5ab662d5
NS
1236 }
1237
1238 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1239 {
1240 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
5ab662d5 1241
1f065954 1242 argno = write_arg_type (s, 0, argno, type, prototyped);
dc3d2aeb 1243 }
5ab662d5 1244
dc3d2aeb 1245 if (stdarg_p (fntype))
5563d5c0 1246 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1f065954 1247 true);
5ab662d5 1248
5563d5c0
NS
1249 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1250 write_arg_type (s, STATIC_CHAIN_REGNUM,
1251 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1252 true);
1253
dc3d2aeb 1254 fprintf (file, "%s", s.str().c_str());
25662751 1255
0c6b03b5
AM
1256 /* Usually 'crtl->is_leaf' is computed during register allocator
1257 initialization (which is not done on NVPTX) or for pressure-sensitive
1258 optimizations. Initialize it here, except if already set. */
1259 if (!crtl->is_leaf)
1260 crtl->is_leaf = leaf_function_p ();
1261
44c068ae 1262 HOST_WIDE_INT sz = get_frame_size ();
5012919d
AM
1263 bool need_frameptr = sz || cfun->machine->has_chain;
1264 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1265 if (!TARGET_SOFT_STACK)
1266 {
1267 /* Declare a local var for outgoing varargs. */
1268 if (cfun->machine->has_varadic)
1269 init_frame (file, STACK_POINTER_REGNUM,
1270 UNITS_PER_WORD, crtl->outgoing_args_size);
1271
1272 /* Declare a local variable for the frame. Force its size to be
1273 DImode-compatible. */
1274 if (need_frameptr)
1275 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1276 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1277 }
0c6b03b5
AM
1278 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1279 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1280 init_softstack_frame (file, alignment, sz);
44c068ae 1281
0c6b03b5
AM
1282 if (cfun->machine->has_simtreg)
1283 {
1284 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1285 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1286 align = MAX (align, GET_MODE_SIZE (DImode));
1287 if (!crtl->is_leaf || cfun->calls_alloca)
1288 simtsz = HOST_WIDE_INT_M1U;
1289 if (simtsz == HOST_WIDE_INT_M1U)
1290 simtsz = nvptx_softstack_size;
1291 if (cfun->machine->has_softstack)
1292 simtsz += POINTER_SIZE / 8;
1293 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1294 if (align > GET_MODE_SIZE (DImode))
1295 simtsz += align - GET_MODE_SIZE (DImode);
1296 if (simtsz)
1297 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1298 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1299 }
738f2522
BS
1300 /* Declare the pseudos we have as ptx registers. */
1301 int maxregs = max_reg_num ();
1302 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1303 {
1304 if (regno_reg_rtx[i] != const0_rtx)
1305 {
1306 machine_mode mode = PSEUDO_REGNO_MODE (i);
d7479262 1307 machine_mode split = maybe_split_mode (mode);
f313d112 1308
38827811 1309 if (split_mode_p (mode))
f313d112
NS
1310 mode = split;
1311 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1312 output_reg (file, i, split, -2);
1313 fprintf (file, ";\n");
738f2522
BS
1314 }
1315 }
1316
d88cd9c4
NS
1317 /* Emit axis predicates. */
1318 if (cfun->machine->axis_predicate[0])
1319 nvptx_init_axis_predicate (file,
1320 REGNO (cfun->machine->axis_predicate[0]), "y");
1321 if (cfun->machine->axis_predicate[1])
1322 nvptx_init_axis_predicate (file,
1323 REGNO (cfun->machine->axis_predicate[1]), "x");
0c6b03b5
AM
1324 if (cfun->machine->unisimt_predicate
1325 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1326 nvptx_init_unisimt_predicate (file);
738f2522
BS
1327}
1328
0c6b03b5
AM
1329/* Output code for switching uniform-simt state. ENTERING indicates whether
1330 we are entering or leaving non-uniform execution region. */
1331
1332static void
1333nvptx_output_unisimt_switch (FILE *file, bool entering)
1334{
1335 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1336 return;
1337 fprintf (file, "\t{\n");
1338 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1339 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1340 if (!crtl->is_leaf)
1341 {
1342 int loc = REGNO (cfun->machine->unisimt_location);
1343 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1344 }
1345 if (cfun->machine->unisimt_predicate)
1346 {
1347 int master = REGNO (cfun->machine->unisimt_master);
1348 int pred = REGNO (cfun->machine->unisimt_predicate);
1349 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1350 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1351 master, entering ? "%ustmp2" : "0");
1352 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1353 }
1354 fprintf (file, "\t}\n");
1355}
1356
1357/* Output code for allocating per-lane storage and switching soft-stack pointer.
1358 ENTERING indicates whether we are entering or leaving non-uniform execution.
1359 PTR is the register pointing to allocated storage, it is assigned to on
1360 entering and used to restore state on leaving. SIZE and ALIGN are used only
1361 on entering. */
1362
1363static void
1364nvptx_output_softstack_switch (FILE *file, bool entering,
1365 rtx ptr, rtx size, rtx align)
1366{
1367 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1368 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1369 return;
1370 int bits = POINTER_SIZE, regno = REGNO (ptr);
1371 fprintf (file, "\t{\n");
1372 if (entering)
1373 {
1374 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1375 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1376 cfun->machine->simt_stack_size);
1377 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1378 if (CONST_INT_P (size))
1379 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1380 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1381 else
1382 output_reg (file, REGNO (size), VOIDmode);
1383 fputs (";\n", file);
1384 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
77734296
AM
1385 fprintf (file,
1386 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
0c6b03b5
AM
1387 bits, regno, regno, UINTVAL (align));
1388 }
1389 if (cfun->machine->has_softstack)
1390 {
1391 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1392 if (entering)
1393 {
1394 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1395 bits, regno, bits / 8, reg_stack);
1396 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1397 bits, reg_stack, regno, bits / 8);
1398 }
1399 else
1400 {
1401 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1402 bits, reg_stack, regno, bits / 8);
1403 }
1404 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1405 }
1406 fprintf (file, "\t}\n");
1407}
1408
1409/* Output code to enter non-uniform execution region. DEST is a register
1410 to hold a per-lane allocation given by SIZE and ALIGN. */
1411
1412const char *
1413nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1414{
1415 nvptx_output_unisimt_switch (asm_out_file, true);
1416 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1417 return "";
1418}
1419
1420/* Output code to leave non-uniform execution region. SRC is the register
1421 holding per-lane storage previously allocated by omp_simt_enter insn. */
1422
1423const char *
1424nvptx_output_simt_exit (rtx src)
1425{
1426 nvptx_output_unisimt_switch (asm_out_file, false);
1427 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1428 return "";
1429}
1430
5012919d
AM
1431/* Output instruction that sets soft stack pointer in shared memory to the
1432 value in register given by SRC_REGNO. */
1433
1434const char *
1435nvptx_output_set_softstack (unsigned src_regno)
1436{
1437 if (cfun->machine->has_softstack && !crtl->is_leaf)
1438 {
1439 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1440 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1441 output_reg (asm_out_file, src_regno, VOIDmode);
1442 fprintf (asm_out_file, ";\n");
1443 }
1444 return "";
1445}
738f2522
BS
1446/* Output a return instruction. Also copy the return value to its outgoing
1447 location. */
1448
1449const char *
1450nvptx_output_return (void)
1451{
44c068ae 1452 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1453
1454 if (mode != VOIDmode)
1f065954
NS
1455 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1456 nvptx_ptx_type_from_mode (mode, false),
1457 reg_names[NVPTX_RETURN_REGNUM],
1458 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1459
1460 return "ret;";
1461}
1462
738f2522
BS
/* Finish a function body by writing its closing brace and a newline
   to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1470\f
1471/* Decide whether we can make a sibling call to a function. For ptx, we
1472 can't. */
1473
1474static bool
1475nvptx_function_ok_for_sibcall (tree, tree)
1476{
1477 return false;
1478}
1479
18c05628
NS
1480/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1481
1482static rtx
1483nvptx_get_drap_rtx (void)
1484{
5012919d
AM
1485 if (TARGET_SOFT_STACK && stack_realign_drap)
1486 return arg_pointer_rtx;
18c05628
NS
1487 return NULL_RTX;
1488}
1489
738f2522
BS
1490/* Implement the TARGET_CALL_ARGS hook. Record information about one
1491 argument to the next call. */
1492
1493static void
44c068ae 1494nvptx_call_args (rtx arg, tree fntype)
738f2522 1495{
44c068ae 1496 if (!cfun->machine->doing_call)
738f2522 1497 {
44c068ae
NS
1498 cfun->machine->doing_call = true;
1499 cfun->machine->is_varadic = false;
1500 cfun->machine->num_args = 0;
1501
1502 if (fntype && stdarg_p (fntype))
1503 {
1504 cfun->machine->is_varadic = true;
1505 cfun->machine->has_varadic = true;
1506 cfun->machine->num_args++;
1507 }
738f2522 1508 }
738f2522 1509
44c068ae
NS
1510 if (REG_P (arg) && arg != pc_rtx)
1511 {
1512 cfun->machine->num_args++;
1513 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1514 cfun->machine->call_args);
1515 }
738f2522
BS
1516}
1517
1518/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1519 information we recorded. */
1520
1521static void
1522nvptx_end_call_args (void)
1523{
44c068ae 1524 cfun->machine->doing_call = false;
738f2522
BS
1525 free_EXPR_LIST_list (&cfun->machine->call_args);
1526}
1527
ecf6e535
BS
1528/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1529 track of whether calls involving static chains or varargs were seen
1530 in the current function.
1531 For libcalls, maintain a hash table of decls we have seen, and
1532 record a function decl for later when encountering a new one. */
738f2522
BS
1533
1534void
1535nvptx_expand_call (rtx retval, rtx address)
1536{
738f2522 1537 rtx callee = XEXP (address, 0);
f324806d 1538 rtx varargs = NULL_RTX;
d88cd9c4 1539 unsigned parallel = 0;
738f2522 1540
738f2522
BS
1541 if (!call_insn_operand (callee, Pmode))
1542 {
1543 callee = force_reg (Pmode, callee);
1544 address = change_address (address, QImode, callee);
1545 }
1546
1547 if (GET_CODE (callee) == SYMBOL_REF)
1548 {
1549 tree decl = SYMBOL_REF_DECL (callee);
1550 if (decl != NULL_TREE)
1551 {
738f2522 1552 if (DECL_STATIC_CHAIN (decl))
44c068ae 1553 cfun->machine->has_chain = true;
00e52418 1554
629b3d75 1555 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1556 if (attr)
1557 {
1558 tree dims = TREE_VALUE (attr);
1559
1560 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1561 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1562 {
1563 if (TREE_PURPOSE (dims)
1564 && !integer_zerop (TREE_PURPOSE (dims)))
1565 break;
1566 /* Not on this axis. */
1567 parallel ^= GOMP_DIM_MASK (ix);
1568 dims = TREE_CHAIN (dims);
1569 }
1570 }
738f2522
BS
1571 }
1572 }
c38f0d8c 1573
44c068ae
NS
1574 unsigned nargs = cfun->machine->num_args;
1575 if (cfun->machine->is_varadic)
738f2522 1576 {
f324806d 1577 varargs = gen_reg_rtx (Pmode);
863af9a4 1578 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1579 }
1580
44c068ae
NS
1581 rtvec vec = rtvec_alloc (nargs + 1);
1582 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1583 int vec_pos = 0;
44c068ae
NS
1584
1585 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1586 rtx tmp_retval = retval;
44c068ae 1587 if (retval)
738f2522
BS
1588 {
1589 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1590 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1591 call = gen_rtx_SET (tmp_retval, call);
738f2522 1592 }
44c068ae 1593 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1594
1595 /* Construct the call insn, including a USE for each argument pseudo
1596 register. These will be used when printing the insn. */
1597 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1598 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1599
1600 if (varargs)
cf08c344 1601 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1602
1603 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1604
d88cd9c4 1605 nvptx_emit_forking (parallel, true);
738f2522 1606 emit_call_insn (pat);
d88cd9c4
NS
1607 nvptx_emit_joining (parallel, true);
1608
738f2522
BS
1609 if (tmp_retval != retval)
1610 emit_move_insn (retval, tmp_retval);
1611}
44eba92d 1612
738f2522
BS
1613/* Emit a comparison COMPARE, and return the new test to be used in the
1614 jump. */
1615
1616rtx
1617nvptx_expand_compare (rtx compare)
1618{
1619 rtx pred = gen_reg_rtx (BImode);
1620 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1621 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1622 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1623 return gen_rtx_NE (BImode, pred, const0_rtx);
1624}
1625
d88cd9c4
NS
1626/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1627
1628void
1629nvptx_expand_oacc_fork (unsigned mode)
1630{
1631 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1632}
1633
1634void
1635nvptx_expand_oacc_join (unsigned mode)
1636{
1637 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1638}
1639
1640/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1641 objects. */
1642
1643static rtx
1644nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1645{
1646 rtx res;
1647
1648 switch (GET_MODE (src))
1649 {
4e10a5a7 1650 case E_DImode:
d88cd9c4
NS
1651 res = gen_unpackdisi2 (dst0, dst1, src);
1652 break;
4e10a5a7 1653 case E_DFmode:
d88cd9c4
NS
1654 res = gen_unpackdfsi2 (dst0, dst1, src);
1655 break;
1656 default: gcc_unreachable ();
1657 }
1658 return res;
1659}
1660
1661/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1662 object. */
1663
1664static rtx
1665nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1666{
1667 rtx res;
1668
1669 switch (GET_MODE (dst))
1670 {
4e10a5a7 1671 case E_DImode:
d88cd9c4
NS
1672 res = gen_packsidi2 (dst, src0, src1);
1673 break;
4e10a5a7 1674 case E_DFmode:
d88cd9c4
NS
1675 res = gen_packsidf2 (dst, src0, src1);
1676 break;
1677 default: gcc_unreachable ();
1678 }
1679 return res;
1680}
1681
1682/* Generate an instruction or sequence to broadcast register REG
1683 across the vectors of a single warp. */
1684
5012919d 1685rtx
59263259 1686nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
d88cd9c4
NS
1687{
1688 rtx res;
1689
1690 switch (GET_MODE (dst))
1691 {
4e10a5a7 1692 case E_SImode:
d88cd9c4
NS
1693 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1694 break;
4e10a5a7 1695 case E_SFmode:
d88cd9c4
NS
1696 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1697 break;
4e10a5a7
RS
1698 case E_DImode:
1699 case E_DFmode:
d88cd9c4
NS
1700 {
1701 rtx tmp0 = gen_reg_rtx (SImode);
1702 rtx tmp1 = gen_reg_rtx (SImode);
1703
1704 start_sequence ();
1705 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1706 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1707 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1708 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1709 res = get_insns ();
1710 end_sequence ();
1711 }
1712 break;
4e10a5a7 1713 case E_BImode:
d88cd9c4
NS
1714 {
1715 rtx tmp = gen_reg_rtx (SImode);
1716
1717 start_sequence ();
1718 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1719 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1720 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1721 res = get_insns ();
1722 end_sequence ();
1723 }
1724 break;
4e10a5a7
RS
1725 case E_QImode:
1726 case E_HImode:
d5ace3b5
CP
1727 {
1728 rtx tmp = gen_reg_rtx (SImode);
1729
1730 start_sequence ();
1731 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1732 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1733 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1734 tmp)));
1735 res = get_insns ();
1736 end_sequence ();
1737 }
1738 break;
d88cd9c4
NS
1739
1740 default:
1741 gcc_unreachable ();
1742 }
1743 return res;
1744}
1745
1746/* Generate an instruction or sequence to broadcast register REG
1747 across the vectors of a single warp. */
1748
1749static rtx
1750nvptx_gen_vcast (rtx reg)
1751{
1752 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1753}
1754
1755/* Structure used when generating a worker-level spill or fill. */
1756
1757struct wcast_data_t
1758{
1759 rtx base; /* Register holding base addr of buffer. */
1760 rtx ptr; /* Iteration var, if needed. */
1761 unsigned offset; /* Offset into worker buffer. */
1762};
1763
/* Bit flags giving the direction of a spill/fill and marking loop
   setup and teardown.  */

enum propagate_mask
  {
    PM_read = 0x1,
    PM_write = 0x2,
    PM_loop_begin = 0x4,
    PM_loop_end = 0x8,

    /* Both directions at once.  */
    PM_read_write = PM_read | PM_write
  };
1775
1776/* Generate instruction(s) to spill or fill register REG to/from the
1777 worker broadcast array. PM indicates what is to be done, REP
1778 how many loop iterations will be executed (0 for not a loop). */
1779
1780static rtx
1781nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1782{
1783 rtx res;
1784 machine_mode mode = GET_MODE (reg);
1785
1786 switch (mode)
1787 {
4e10a5a7 1788 case E_BImode:
d88cd9c4
NS
1789 {
1790 rtx tmp = gen_reg_rtx (SImode);
1791
1792 start_sequence ();
1793 if (pm & PM_read)
1794 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1795 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1796 if (pm & PM_write)
1797 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1798 res = get_insns ();
1799 end_sequence ();
1800 }
1801 break;
1802
1803 default:
1804 {
1805 rtx addr = data->ptr;
1806
1807 if (!addr)
1808 {
1809 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1810
1811 if (align > worker_bcast_align)
1812 worker_bcast_align = align;
1813 data->offset = (data->offset + align - 1) & ~(align - 1);
1814 addr = data->base;
1815 if (data->offset)
1816 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1817 }
1818
1819 addr = gen_rtx_MEM (mode, addr);
d88cd9c4
NS
1820 if (pm == PM_read)
1821 res = gen_rtx_SET (addr, reg);
1822 else if (pm == PM_write)
1823 res = gen_rtx_SET (reg, addr);
1824 else
1825 gcc_unreachable ();
1826
1827 if (data->ptr)
1828 {
1829 /* We're using a ptr, increment it. */
1830 start_sequence ();
1831
1832 emit_insn (res);
1833 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1834 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1835 res = get_insns ();
1836 end_sequence ();
1837 }
1838 else
1839 rep = 1;
1840 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1841 }
1842 break;
1843 }
1844 return res;
1845}
738f2522
BS
1846\f
1847/* Returns true if X is a valid address for use in a memory reference. */
1848
1849static bool
1850nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1851{
1852 enum rtx_code code = GET_CODE (x);
1853
1854 switch (code)
1855 {
1856 case REG:
1857 return true;
1858
1859 case PLUS:
1860 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1861 return true;
1862 return false;
1863
1864 case CONST:
1865 case SYMBOL_REF:
1866 case LABEL_REF:
1867 return true;
1868
1869 default:
1870 return false;
1871 }
1872}
738f2522 1873\f
4d5438cd
NS
1874/* Machinery to output constant initializers. When beginning an
1875 initializer, we decide on a fragment size (which is visible in ptx
1876 in the type used), and then all initializer data is buffered until
1877 a fragment is filled and ready to be written out. */
1878
1879static struct
1880{
1881 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1882 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1883 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1884 out. */
1885 unsigned size; /* Fragment size to accumulate. */
1886 unsigned offset; /* Offset within current fragment. */
1887 bool started; /* Whether we've output any initializer. */
1888} init_frag;
1889
1890/* The current fragment is full, write it out. SYM may provide a
1891 symbolic reference we should output, in which case the fragment
1892 value is the addend. */
738f2522
BS
1893
1894static void
4d5438cd 1895output_init_frag (rtx sym)
738f2522 1896{
4d5438cd
NS
1897 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1898 unsigned HOST_WIDE_INT val = init_frag.val;
738f2522 1899
4d5438cd
NS
1900 init_frag.started = true;
1901 init_frag.val = 0;
1902 init_frag.offset = 0;
1903 init_frag.remaining--;
1904
1905 if (sym)
1906 {
6c7c4708
CP
1907 bool function = (SYMBOL_REF_DECL (sym)
1908 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1909 if (!function)
1910 fprintf (asm_out_file, "generic(");
4d5438cd 1911 output_address (VOIDmode, sym);
6c7c4708
CP
1912 if (!function)
1913 fprintf (asm_out_file, ")");
1914 if (val)
1915 fprintf (asm_out_file, " + ");
4d5438cd 1916 }
738f2522 1917
4d5438cd
NS
1918 if (!sym || val)
1919 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
738f2522
BS
1920}
1921
4d5438cd
NS
1922/* Add value VAL of size SIZE to the data we're emitting, and keep
1923 writing out chunks as they fill up. */
738f2522
BS
1924
1925static void
4d5438cd 1926nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 1927{
4d5438cd
NS
1928 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1929
1930 for (unsigned part = 0; size; size -= part)
738f2522 1931 {
4d5438cd
NS
1932 val >>= part * BITS_PER_UNIT;
1933 part = init_frag.size - init_frag.offset;
1934 if (part > size)
1935 part = size;
1936
1937 unsigned HOST_WIDE_INT partial
1938 = val << (init_frag.offset * BITS_PER_UNIT);
1939 init_frag.val |= partial & init_frag.mask;
1940 init_frag.offset += part;
1941
1942 if (init_frag.offset == init_frag.size)
1943 output_init_frag (NULL);
738f2522
BS
1944 }
1945}
1946
1947/* Target hook for assembling integer object X of size SIZE. */
1948
1949static bool
1950nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1951{
00e52418
NS
1952 HOST_WIDE_INT val = 0;
1953
1954 switch (GET_CODE (x))
738f2522 1955 {
00e52418 1956 default:
a9000e1e
NS
1957 /* Let the generic machinery figure it out, usually for a
1958 CONST_WIDE_INT. */
1959 return false;
00e52418
NS
1960
1961 case CONST_INT:
4d5438cd 1962 nvptx_assemble_value (INTVAL (x), size);
00e52418
NS
1963 break;
1964
1965 case CONST:
1966 x = XEXP (x, 0);
1967 gcc_assert (GET_CODE (x) == PLUS);
1968 val = INTVAL (XEXP (x, 1));
1969 x = XEXP (x, 0);
1970 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1971 /* FALLTHROUGH */
1972
1973 case SYMBOL_REF:
4d5438cd
NS
1974 gcc_assert (size == init_frag.size);
1975 if (init_frag.offset)
738f2522 1976 sorry ("cannot emit unaligned pointers in ptx assembly");
738f2522 1977
00e52418 1978 nvptx_maybe_record_fnsym (x);
4d5438cd
NS
1979 init_frag.val = val;
1980 output_init_frag (x);
738f2522 1981 break;
738f2522
BS
1982 }
1983
738f2522
BS
1984 return true;
1985}
1986
1987/* Output SIZE zero bytes. We ignore the FILE argument since the
1988 functions we're calling to perform the output just use
1989 asm_out_file. */
1990
1991void
1992nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1993{
4d5438cd
NS
1994 /* Finish the current fragment, if it's started. */
1995 if (init_frag.offset)
738f2522 1996 {
4d5438cd
NS
1997 unsigned part = init_frag.size - init_frag.offset;
1998 if (part > size)
1999 part = (unsigned) size;
2000 size -= part;
2001 nvptx_assemble_value (0, part);
738f2522
BS
2002 }
2003
4d5438cd
NS
2004 /* If this skip doesn't terminate the initializer, write as many
2005 remaining pieces as possible directly. */
2006 if (size < init_frag.remaining * init_frag.size)
738f2522 2007 {
4d5438cd
NS
2008 while (size >= init_frag.size)
2009 {
2010 size -= init_frag.size;
2011 output_init_frag (NULL_RTX);
2012 }
2013 if (size)
2014 nvptx_assemble_value (0, size);
738f2522 2015 }
738f2522
BS
2016}
2017
2018/* Output a string STR with length SIZE. As in nvptx_output_skip we
2019 ignore the FILE arg. */
2020
2021void
2022nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2023{
2024 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2025 nvptx_assemble_value (str[i], 1);
2026}
2027
bf398920
TV
2028/* Return true if TYPE is a record type where the last field is an array without
2029 given dimension. */
2030
2031static bool
2032flexible_array_member_type_p (const_tree type)
2033{
2034 if (TREE_CODE (type) != RECORD_TYPE)
2035 return false;
2036
2037 const_tree last_field = NULL_TREE;
2038 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2039 last_field = f;
2040
2041 if (!last_field)
2042 return false;
2043
2044 const_tree last_field_type = TREE_TYPE (last_field);
2045 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2046 return false;
2047
2048 return (! TYPE_DOMAIN (last_field_type)
2049 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2050}
2051
4ff3145a
NS
2052/* Emit a PTX variable decl and prepare for emission of its
2053 initializer. NAME is the symbol name and SETION the PTX data
2054 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2055 The caller has already emitted any indentation and linkage
2056 specifier. It is responsible for any initializer, terminating ;
2057 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2058 this is the opposite way round that PTX wants them! */
2059
2060static void
2061nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
bf398920
TV
2062 const_tree type, HOST_WIDE_INT size, unsigned align,
2063 bool undefined = false)
4ff3145a 2064{
59d2d238
CP
2065 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2066 && (TYPE_DOMAIN (type) == NULL_TREE);
2067
bf398920
TV
2068 if (undefined && flexible_array_member_type_p (type))
2069 {
2070 size = 0;
2071 atype = true;
2072 }
2073
4ff3145a
NS
2074 while (TREE_CODE (type) == ARRAY_TYPE)
2075 type = TREE_TYPE (type);
2076
fc0efeea
NS
2077 if (TREE_CODE (type) == VECTOR_TYPE
2078 || TREE_CODE (type) == COMPLEX_TYPE)
2079 /* Neither vector nor complex types can contain the other. */
2080 type = TREE_TYPE (type);
2081
4ff3145a 2082 unsigned elt_size = int_size_in_bytes (type);
fc0efeea
NS
2083
2084 /* Largest mode we're prepared to accept. For BLKmode types we
2085 don't know if it'll contain pointer constants, so have to choose
2086 pointer size, otherwise we can choose DImode. */
2087 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2088
2089 elt_size |= GET_MODE_SIZE (elt_mode);
2090 elt_size &= -elt_size; /* Extract LSB set. */
4ff3145a 2091
4d5438cd 2092 init_frag.size = elt_size;
9c582551 2093 /* Avoid undefined shift behavior by using '2'. */
4d5438cd
NS
2094 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2095 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2096 init_frag.val = 0;
2097 init_frag.offset = 0;
2098 init_frag.started = false;
2099 /* Size might not be a multiple of elt size, if there's an
2100 initialized trailing struct array with smaller type than
2101 elt_size. */
2102 init_frag.remaining = (size + elt_size - 1) / elt_size;
4ff3145a
NS
2103
2104 fprintf (file, "%s .align %d .u%d ",
2105 section, align / BITS_PER_UNIT,
2106 elt_size * BITS_PER_UNIT);
2107 assemble_name (file, name);
2108
2109 if (size)
2110 /* We make everything an array, to simplify any initialization
2111 emission. */
4d5438cd 2112 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
59d2d238
CP
2113 else if (atype)
2114 fprintf (file, "[]");
4ff3145a
NS
2115}
2116
738f2522
BS
2117/* Called when the initializer for a decl has been completely output through
2118 combinations of the three functions above. */
2119
2120static void
2121nvptx_assemble_decl_end (void)
2122{
4d5438cd
NS
2123 if (init_frag.offset)
2124 /* This can happen with a packed struct with trailing array member. */
2125 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2126 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2127}
2128
69823d76
NS
2129/* Output an uninitialized common or file-scope variable. */
2130
2131void
2132nvptx_output_aligned_decl (FILE *file, const char *name,
2133 const_tree decl, HOST_WIDE_INT size, unsigned align)
2134{
2135 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2136
2137 /* If this is public, it is common. The nearest thing we have to
2138 common is weak. */
4ff3145a
NS
2139 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2140
2141 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2142 TREE_TYPE (decl), size, align);
4d5438cd 2143 nvptx_assemble_decl_end ();
69823d76
NS
2144}
2145
738f2522
BS
2146/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2147 writing a constant variable EXP with NAME and SIZE and its
2148 initializer to FILE. */
2149
2150static void
2151nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 2152 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 2153{
4ff3145a
NS
2154 write_var_marker (file, true, false, name);
2155
2156 fprintf (file, "\t");
2157
738f2522 2158 tree type = TREE_TYPE (exp);
4ff3145a
NS
2159 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2160 TYPE_ALIGN (type));
738f2522
BS
2161}
2162
2163/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2164 a variable DECL with NAME to FILE. */
2165
2166void
2167nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2168{
4ff3145a 2169 write_var_marker (file, true, TREE_PUBLIC (decl), name);
9a863523 2170
4ff3145a
NS
2171 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2172 : DECL_WEAK (decl) ? ".weak " : ".visible "));
9a863523 2173
4ff3145a
NS
2174 tree type = TREE_TYPE (decl);
2175 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2176 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2177 type, obj_size, DECL_ALIGN (decl));
738f2522
BS
2178}
2179
/* Implement TARGET_ASM_GLOBALIZE_LABEL.  Both arguments are ignored
   and nothing is written.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
  /* Intentionally empty.  */
}
2186
2187/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2188 declaration only for variable DECL with NAME to FILE. */
f313d112 2189
738f2522
BS
2190static void
2191nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2192{
0a0f74aa
NS
2193 /* The middle end can place constant pool decls into the varpool as
2194 undefined. Until that is fixed, catch the problem here. */
2195 if (DECL_IN_CONSTANT_POOL (decl))
2196 return;
2197
721547cd
NS
2198 /* We support weak defintions, and hence have the right
2199 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2200 if (DECL_WEAK (decl))
2201 error_at (DECL_SOURCE_LOCATION (decl),
2202 "PTX does not support weak declarations"
2203 " (only weak definitions)");
69823d76
NS
2204 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2205
4ff3145a
NS
2206 fprintf (file, "\t.extern ");
2207 tree size = DECL_SIZE_UNIT (decl);
2208 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2209 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
bf398920 2210 DECL_ALIGN (decl), true);
1e5154e7 2211 nvptx_assemble_decl_end ();
738f2522
BS
2212}
2213
f313d112
NS
2214/* Output a pattern for a move instruction. */
2215
2216const char *
2217nvptx_output_mov_insn (rtx dst, rtx src)
2218{
2219 machine_mode dst_mode = GET_MODE (dst);
2220 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2221 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2222 machine_mode src_inner = (GET_CODE (src) == SUBREG
2223 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2224
15113b03
NS
2225 rtx sym = src;
2226 if (GET_CODE (sym) == CONST)
2227 sym = XEXP (XEXP (sym, 0), 0);
bd602b7f
NS
2228 if (SYMBOL_REF_P (sym))
2229 {
2230 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2231 return "%.\tcvta%D1%t0\t%0, %1;";
2232 nvptx_maybe_record_fnsym (sym);
2233 }
15113b03 2234
f313d112
NS
2235 if (src_inner == dst_inner)
2236 return "%.\tmov%t0\t%0, %1;";
2237
2238 if (CONSTANT_P (src))
2239 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2240 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2241 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2242
2243 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
3717fbe3
TV
2244 {
2245 if (GET_MODE_BITSIZE (dst_mode) == 128
2246 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2247 {
2248 /* mov.b128 is not supported. */
2249 if (dst_inner == V2DImode && src_inner == TImode)
2250 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2251 else if (dst_inner == TImode && src_inner == V2DImode)
2252 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2253
2254 gcc_unreachable ();
2255 }
2256 return "%.\tmov.b%T0\t%0, %1;";
2257 }
f313d112
NS
2258
2259 return "%.\tcvt%t0%t1\t%0, %1;";
2260}
2261
5012919d
AM
2262static void nvptx_print_operand (FILE *, rtx, int);
2263
738f2522 2264/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2265 involves writing .param declarations and in/out copies into them. For
2266 indirect calls, also write the .callprototype. */
738f2522
BS
2267
2268const char *
2269nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2270{
863af9a4 2271 char buf[16];
738f2522
BS
2272 static int labelno;
2273 bool needs_tgt = register_operand (callee, Pmode);
2274 rtx pat = PATTERN (insn);
5012919d
AM
2275 if (GET_CODE (pat) == COND_EXEC)
2276 pat = COND_EXEC_CODE (pat);
f324806d 2277 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2278 tree decl = NULL_TREE;
2279
2280 fprintf (asm_out_file, "\t{\n");
2281 if (result != NULL)
1f065954
NS
2282 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2283 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2284 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2285
ecf6e535 2286 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2287 if (GET_CODE (callee) == SYMBOL_REF)
2288 {
2289 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2290 if (!decl
2291 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2292 nvptx_record_libfunc (callee, result, pat);
2293 else if (DECL_EXTERNAL (decl))
738f2522
BS
2294 nvptx_record_fndecl (decl);
2295 }
2296
2297 if (needs_tgt)
2298 {
2299 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2300 labelno++;
2301 ASM_OUTPUT_LABEL (asm_out_file, buf);
2302 std::stringstream s;
b699adcc 2303 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2304 fputs (s.str().c_str(), asm_out_file);
2305 }
2306
863af9a4 2307 for (int argno = 1; argno < arg_end; argno++)
738f2522 2308 {
863af9a4 2309 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2310 machine_mode mode = GET_MODE (t);
a02d84b6 2311 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2312
863af9a4 2313 /* Mode splitting has already been done. */
a02d84b6
NS
2314 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2315 "\t\tst.param%s [%%out_arg%d], ",
2316 ptx_type, argno, ptx_type, argno);
2317 output_reg (asm_out_file, REGNO (t), VOIDmode);
2318 fprintf (asm_out_file, ";\n");
738f2522
BS
2319 }
2320
5012919d
AM
2321 /* The '.' stands for the call's predicate, if any. */
2322 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2323 fprintf (asm_out_file, "\t\tcall ");
2324 if (result != NULL_RTX)
1f065954
NS
2325 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2326
738f2522
BS
2327 if (decl)
2328 {
2329 const char *name = get_fnname_from_decl (decl);
b4346107 2330 name = nvptx_name_replacement (name);
738f2522
BS
2331 assemble_name (asm_out_file, name);
2332 }
2333 else
cc8ca59e 2334 output_address (VOIDmode, callee);
738f2522 2335
863af9a4
NS
2336 const char *open = "(";
2337 for (int argno = 1; argno < arg_end; argno++)
738f2522 2338 {
863af9a4
NS
2339 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2340 open = "";
738f2522 2341 }
863af9a4
NS
2342 if (decl && DECL_STATIC_CHAIN (decl))
2343 {
5563d5c0 2344 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2345 open = "";
2346 }
2347 if (!open[0])
2348 fprintf (asm_out_file, ")");
f324806d 2349
738f2522
BS
2350 if (needs_tgt)
2351 {
2352 fprintf (asm_out_file, ", ");
2353 assemble_name (asm_out_file, buf);
2354 }
2355 fprintf (asm_out_file, ";\n");
738f2522 2356
51baf85a 2357 if (find_reg_note (insn, REG_NORETURN, NULL))
fb202364
TV
2358 {
2359 /* No return functions confuse the PTX JIT, as it doesn't realize
2360 the flow control barrier they imply. It can seg fault if it
2361 encounters what looks like an unexitable loop. Emit a trailing
2362 trap and exit, which it does grok. */
2363 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2364 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2365 }
51baf85a 2366
1f065954
NS
2367 if (result)
2368 {
2369 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2370
2371 if (!rval[0])
2372 /* We must escape the '%' that starts RETURN_REGNUM. */
2373 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2374 reg_names[NVPTX_RETURN_REGNUM]);
2375 return rval;
2376 }
2377
2378 return "}";
738f2522
BS
2379}
2380
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Return true if C is
   one of the punctuation characters '.' and '#'.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;

    default:
      return false;
    }
}
2388
738f2522
BS
2389/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2390
2391static void
2392nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2393{
2394 rtx off;
2395 if (GET_CODE (x) == CONST)
2396 x = XEXP (x, 0);
2397 switch (GET_CODE (x))
2398 {
2399 case PLUS:
2400 off = XEXP (x, 1);
cc8ca59e 2401 output_address (VOIDmode, XEXP (x, 0));
738f2522 2402 fprintf (file, "+");
cc8ca59e 2403 output_address (VOIDmode, off);
738f2522
BS
2404 break;
2405
2406 case SYMBOL_REF:
2407 case LABEL_REF:
2408 output_addr_const (file, x);
2409 break;
2410
2411 default:
2412 gcc_assert (GET_CODE (x) != MEM);
2413 nvptx_print_operand (file, x, 0);
2414 break;
2415 }
2416}
2417
2418/* Write assembly language output for the address ADDR to FILE. */
2419
2420static void
cc8ca59e 2421nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 2422{
cc8ca59e 2423 nvptx_print_address_operand (file, addr, mode);
738f2522
BS
2424}
2425
2426/* Print an operand, X, to FILE, with an optional modifier in CODE.
2427
2428 Meaning of CODE:
2429 . -- print the predicate for the instruction or an emptry string for an
2430 unconditional one.
2431 # -- print a rounding mode for the instruction
2432
9a863523 2433 A -- print a data area for a MEM
738f2522 2434 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2435 D -- print a data area for a MEM operand
d88cd9c4 2436 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
2437 t -- print a type opcode suffix, promoting QImode to 32 bits
2438 T -- print a type size in bits
2439 u -- print a type opcode suffix without promotions. */
2440
2441static void
2442nvptx_print_operand (FILE *file, rtx x, int code)
2443{
738f2522
BS
2444 if (code == '.')
2445 {
2446 x = current_insn_predicate;
2447 if (x)
2448 {
5012919d 2449 fputs ("@", file);
738f2522
BS
2450 if (GET_CODE (x) == EQ)
2451 fputs ("!", file);
5012919d 2452 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2453 }
2454 return;
2455 }
2456 else if (code == '#')
2457 {
2458 fputs (".rn", file);
2459 return;
2460 }
2461
2462 enum rtx_code x_code = GET_CODE (x);
f313d112 2463 machine_mode mode = GET_MODE (x);
738f2522
BS
2464
2465 switch (code)
2466 {
2467 case 'A':
9a863523
NS
2468 x = XEXP (x, 0);
2469 /* FALLTHROUGH. */
7b8edc29 2470
9a863523
NS
2471 case 'D':
2472 if (GET_CODE (x) == CONST)
2473 x = XEXP (x, 0);
2474 if (GET_CODE (x) == PLUS)
2475 x = XEXP (x, 0);
7b8edc29 2476
9a863523
NS
2477 if (GET_CODE (x) == SYMBOL_REF)
2478 fputs (section_for_sym (x), file);
738f2522
BS
2479 break;
2480
738f2522 2481 case 't':
738f2522 2482 case 'u':
f313d112
NS
2483 if (x_code == SUBREG)
2484 {
1310ff03
TV
2485 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2486 if (VECTOR_MODE_P (inner_mode)
2487 && (GET_MODE_SIZE (mode)
2488 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2489 mode = GET_MODE_INNER (inner_mode);
2490 else if (split_mode_p (inner_mode))
2491 mode = maybe_split_mode (inner_mode);
2492 else
2493 mode = inner_mode;
f313d112
NS
2494 }
2495 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2496 break;
2497
3717fbe3
TV
2498 case 'H':
2499 case 'L':
2500 {
2501 rtx inner_x = SUBREG_REG (x);
2502 machine_mode inner_mode = GET_MODE (inner_x);
2503 machine_mode split = maybe_split_mode (inner_mode);
2504
2505 output_reg (file, REGNO (inner_x), split,
2506 (code == 'H'
2507 ? GET_MODE_SIZE (inner_mode) / 2
2508 : 0));
2509 }
2510 break;
2511
d88cd9c4
NS
2512 case 'S':
2513 {
59263259
NS
2514 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2515 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2516 static const char *const kinds[] =
59263259
NS
2517 {".up", ".down", ".bfly", ".idx"};
2518 fputs (kinds[kind], file);
d88cd9c4
NS
2519 }
2520 break;
2521
738f2522 2522 case 'T':
f313d112 2523 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2524 break;
2525
2526 case 'j':
2527 fprintf (file, "@");
2528 goto common;
2529
2530 case 'J':
2531 fprintf (file, "@!");
2532 goto common;
2533
2534 case 'c':
f313d112 2535 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2536 switch (x_code)
2537 {
2538 case EQ:
2539 fputs (".eq", file);
2540 break;
2541 case NE:
f313d112 2542 if (FLOAT_MODE_P (mode))
738f2522
BS
2543 fputs (".neu", file);
2544 else
2545 fputs (".ne", file);
2546 break;
2547 case LE:
578fb225 2548 case LEU:
738f2522
BS
2549 fputs (".le", file);
2550 break;
2551 case GE:
578fb225 2552 case GEU:
738f2522
BS
2553 fputs (".ge", file);
2554 break;
2555 case LT:
578fb225 2556 case LTU:
738f2522
BS
2557 fputs (".lt", file);
2558 break;
2559 case GT:
738f2522 2560 case GTU:
578fb225 2561 fputs (".gt", file);
738f2522
BS
2562 break;
2563 case LTGT:
2564 fputs (".ne", file);
2565 break;
2566 case UNEQ:
2567 fputs (".equ", file);
2568 break;
2569 case UNLE:
2570 fputs (".leu", file);
2571 break;
2572 case UNGE:
2573 fputs (".geu", file);
2574 break;
2575 case UNLT:
2576 fputs (".ltu", file);
2577 break;
2578 case UNGT:
2579 fputs (".gtu", file);
2580 break;
2581 case UNORDERED:
2582 fputs (".nan", file);
2583 break;
2584 case ORDERED:
2585 fputs (".num", file);
2586 break;
2587 default:
2588 gcc_unreachable ();
2589 }
f313d112 2590 if (FLOAT_MODE_P (mode)
738f2522
BS
2591 || x_code == EQ || x_code == NE
2592 || x_code == GEU || x_code == GTU
2593 || x_code == LEU || x_code == LTU)
f313d112 2594 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2595 else
f313d112 2596 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2597 break;
2598 default:
2599 common:
2600 switch (x_code)
2601 {
2602 case SUBREG:
f313d112
NS
2603 {
2604 rtx inner_x = SUBREG_REG (x);
2605 machine_mode inner_mode = GET_MODE (inner_x);
2606 machine_mode split = maybe_split_mode (inner_mode);
2607
1310ff03
TV
2608 if (VECTOR_MODE_P (inner_mode)
2609 && (GET_MODE_SIZE (mode)
2610 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2611 {
2612 output_reg (file, REGNO (inner_x), VOIDmode);
2613 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2614 }
2615 else if (split_mode_p (inner_mode)
f313d112
NS
2616 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2617 output_reg (file, REGNO (inner_x), split);
2618 else
2619 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2620 }
2621 break;
738f2522
BS
2622
2623 case REG:
f313d112 2624 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2625 break;
2626
2627 case MEM:
2628 fputc ('[', file);
f313d112 2629 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2630 fputc (']', file);
2631 break;
2632
2633 case CONST_INT:
2634 output_addr_const (file, x);
2635 break;
2636
2637 case CONST:
2638 case SYMBOL_REF:
2639 case LABEL_REF:
2640 /* We could use output_addr_const, but that can print things like
2641 "x-8", which breaks ptxas. Need to ensure it is output as
2642 "x+-8". */
2643 nvptx_print_address_operand (file, x, VOIDmode);
2644 break;
2645
2646 case CONST_DOUBLE:
2647 long vals[2];
f313d112 2648 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2649 vals[0] &= 0xffffffff;
2650 vals[1] &= 0xffffffff;
f313d112 2651 if (mode == SFmode)
738f2522
BS
2652 fprintf (file, "0f%08lx", vals[0]);
2653 else
2654 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2655 break;
2656
1310ff03
TV
2657 case CONST_VECTOR:
2658 {
2659 unsigned n = CONST_VECTOR_NUNITS (x);
2660 fprintf (file, "{ ");
2661 for (unsigned i = 0; i < n; ++i)
2662 {
2663 if (i != 0)
2664 fprintf (file, ", ");
2665
2666 rtx elem = CONST_VECTOR_ELT (x, i);
2667 output_addr_const (file, elem);
2668 }
2669 fprintf (file, " }");
2670 }
2671 break;
2672
738f2522
BS
2673 default:
2674 output_addr_const (file, x);
2675 }
2676 }
2677}
2678\f
2679/* Record replacement regs used to deal with subreg operands. */
2680struct reg_replace
2681{
2682 rtx replacement[MAX_RECOG_OPERANDS];
2683 machine_mode mode;
2684 int n_allocated;
2685 int n_in_use;
2686};
2687
2688/* Allocate or reuse a replacement in R and return the rtx. */
2689
2690static rtx
2691get_replacement (struct reg_replace *r)
2692{
2693 if (r->n_allocated == r->n_in_use)
2694 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2695 return r->replacement[r->n_in_use++];
2696}
2697
2698/* Clean up subreg operands. In ptx assembly, everything is typed, and
2699 the presence of subregs would break the rules for most instructions.
2700 Replace them with a suitable new register of the right size, plus
2701 conversion copyin/copyout instructions. */
2702
2703static void
517665b3 2704nvptx_reorg_subreg (void)
738f2522
BS
2705{
2706 struct reg_replace qiregs, hiregs, siregs, diregs;
2707 rtx_insn *insn, *next;
2708
738f2522
BS
2709 qiregs.n_allocated = 0;
2710 hiregs.n_allocated = 0;
2711 siregs.n_allocated = 0;
2712 diregs.n_allocated = 0;
2713 qiregs.mode = QImode;
2714 hiregs.mode = HImode;
2715 siregs.mode = SImode;
2716 diregs.mode = DImode;
2717
2718 for (insn = get_insns (); insn; insn = next)
2719 {
2720 next = NEXT_INSN (insn);
2721 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2722 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2723 || GET_CODE (PATTERN (insn)) == USE
2724 || GET_CODE (PATTERN (insn)) == CLOBBER)
2725 continue;
f324806d 2726
738f2522
BS
2727 qiregs.n_in_use = 0;
2728 hiregs.n_in_use = 0;
2729 siregs.n_in_use = 0;
2730 diregs.n_in_use = 0;
2731 extract_insn (insn);
2732 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2733
738f2522
BS
2734 for (int i = 0; i < recog_data.n_operands; i++)
2735 {
2736 rtx op = recog_data.operand[i];
2737 if (GET_CODE (op) != SUBREG)
2738 continue;
2739
2740 rtx inner = SUBREG_REG (op);
2741
2742 machine_mode outer_mode = GET_MODE (op);
2743 machine_mode inner_mode = GET_MODE (inner);
2744 gcc_assert (s_ok);
2745 if (s_ok
2746 && (GET_MODE_PRECISION (inner_mode)
2747 >= GET_MODE_PRECISION (outer_mode)))
2748 continue;
2749 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2750 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2751 : outer_mode == HImode ? &hiregs
2752 : outer_mode == SImode ? &siregs
2753 : &diregs);
2754 rtx new_reg = get_replacement (r);
2755
2756 if (recog_data.operand_type[i] != OP_OUT)
2757 {
2758 enum rtx_code code;
2759 if (GET_MODE_PRECISION (inner_mode)
2760 < GET_MODE_PRECISION (outer_mode))
2761 code = ZERO_EXTEND;
2762 else
2763 code = TRUNCATE;
2764
f7df4a84 2765 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2766 gen_rtx_fmt_e (code, outer_mode, inner));
2767 emit_insn_before (pat, insn);
2768 }
2769
2770 if (recog_data.operand_type[i] != OP_IN)
2771 {
2772 enum rtx_code code;
2773 if (GET_MODE_PRECISION (inner_mode)
2774 < GET_MODE_PRECISION (outer_mode))
2775 code = TRUNCATE;
2776 else
2777 code = ZERO_EXTEND;
2778
f7df4a84 2779 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2780 gen_rtx_fmt_e (code, inner_mode, new_reg));
2781 emit_insn_after (pat, insn);
2782 }
2783 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2784 }
2785 }
517665b3 2786}
738f2522 2787
5012919d
AM
2788/* Return a SImode "master lane index" register for uniform-simt, allocating on
2789 first use. */
2790
2791static rtx
2792nvptx_get_unisimt_master ()
2793{
2794 rtx &master = cfun->machine->unisimt_master;
2795 return master ? master : master = gen_reg_rtx (SImode);
2796}
2797
2798/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2799
2800static rtx
2801nvptx_get_unisimt_predicate ()
2802{
2803 rtx &pred = cfun->machine->unisimt_predicate;
2804 return pred ? pred : pred = gen_reg_rtx (BImode);
2805}
2806
2807/* Return true if given call insn references one of the functions provided by
2808 the CUDA runtime: malloc, free, vprintf. */
2809
2810static bool
2811nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2812{
2813 rtx pat = PATTERN (insn);
2814 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2815 pat = XVECEXP (pat, 0, 0);
2816 if (GET_CODE (pat) == SET)
2817 pat = SET_SRC (pat);
2818 gcc_checking_assert (GET_CODE (pat) == CALL
2819 && GET_CODE (XEXP (pat, 0)) == MEM);
2820 rtx addr = XEXP (XEXP (pat, 0), 0);
2821 if (GET_CODE (addr) != SYMBOL_REF)
2822 return false;
2823 const char *name = XSTR (addr, 0);
2824 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2825 references with forced assembler name refer to PTX syscalls. For vprintf,
2826 accept both normal and forced-assembler-name references. */
2827 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2828 || !strcmp (name, "*malloc")
2829 || !strcmp (name, "*free"));
2830}
2831
2832/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2833 propagate its value from lane MASTER to current lane. */
2834
2835static void
2836nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2837{
2838 rtx reg;
2839 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2840 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2841}
2842
2843/* Adjust code for uniform-simt code generation variant by making atomics and
2844 "syscalls" conditionally executed, and inserting shuffle-based propagation
2845 for registers being set. */
2846
2847static void
2848nvptx_reorg_uniform_simt ()
2849{
2850 rtx_insn *insn, *next;
2851
2852 for (insn = get_insns (); insn; insn = next)
2853 {
2854 next = NEXT_INSN (insn);
2855 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2856 && !(NONJUMP_INSN_P (insn)
2857 && GET_CODE (PATTERN (insn)) == PARALLEL
2858 && get_attr_atomic (insn)))
2859 continue;
2860 rtx pat = PATTERN (insn);
2861 rtx master = nvptx_get_unisimt_master ();
2862 for (int i = 0; i < XVECLEN (pat, 0); i++)
2863 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2864 rtx pred = nvptx_get_unisimt_predicate ();
2865 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2866 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2867 validate_change (insn, &PATTERN (insn), pat, false);
2868 }
2869}
2870
d2d47a28
NS
2871/* Loop structure of the function. The entire function is described as
2872 a NULL loop. */
d88cd9c4
NS
2873
2874struct parallel
2875{
2876 /* Parent parallel. */
2877 parallel *parent;
2878
2879 /* Next sibling parallel. */
2880 parallel *next;
2881
2882 /* First child parallel. */
2883 parallel *inner;
2884
2885 /* Partitioning mask of the parallel. */
2886 unsigned mask;
2887
2888 /* Partitioning used within inner parallels. */
2889 unsigned inner_mask;
2890
2891 /* Location of parallel forked and join. The forked is the first
2892 block in the parallel and the join is the first block after of
2893 the partition. */
2894 basic_block forked_block;
2895 basic_block join_block;
2896
2897 rtx_insn *forked_insn;
2898 rtx_insn *join_insn;
2899
2900 rtx_insn *fork_insn;
2901 rtx_insn *joining_insn;
2902
2903 /* Basic blocks in this parallel, but not in child parallels. The
2904 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2905 blocks are not. */
2906 auto_vec<basic_block> blocks;
2907
2908public:
2909 parallel (parallel *parent, unsigned mode);
2910 ~parallel ();
2911};
2912
2913/* Constructor links the new parallel into it's parent's chain of
2914 children. */
2915
2916parallel::parallel (parallel *parent_, unsigned mask_)
2917 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2918{
2919 forked_block = join_block = 0;
2920 forked_insn = join_insn = 0;
2921 fork_insn = joining_insn = 0;
2922
2923 if (parent)
2924 {
2925 next = parent->inner;
2926 parent->inner = this;
2927 }
2928}
2929
2930parallel::~parallel ()
2931{
2932 delete inner;
2933 delete next;
2934}
2935
2936/* Map of basic blocks to insns */
2937typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2938
2939/* A tuple of an insn of interest and the BB in which it resides. */
2940typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2941typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2942
2943/* Split basic blocks such that each forked and join unspecs are at
2944 the start of their basic blocks. Thus afterwards each block will
2945 have a single partitioning mode. We also do the same for return
2946 insns, as they are executed by every thread. Return the
2947 partitioning mode of the function as a whole. Populate MAP with
2948 head and tail blocks. We also clear the BB visited flag, which is
2949 used when finding partitions. */
2950
2951static void
2952nvptx_split_blocks (bb_insn_map_t *map)
2953{
2954 insn_bb_vec_t worklist;
2955 basic_block block;
2956 rtx_insn *insn;
2957
2958 /* Locate all the reorg instructions of interest. */
2959 FOR_ALL_BB_FN (block, cfun)
2960 {
2961 bool seen_insn = false;
2962
2963 /* Clear visited flag, for use by parallel locator */
2964 block->flags &= ~BB_VISITED;
2965
2966 FOR_BB_INSNS (block, insn)
2967 {
2968 if (!INSN_P (insn))
2969 continue;
2970 switch (recog_memoized (insn))
2971 {
2972 default:
2973 seen_insn = true;
2974 continue;
2975 case CODE_FOR_nvptx_forked:
2976 case CODE_FOR_nvptx_join:
2977 break;
2978
2979 case CODE_FOR_return:
2980 /* We also need to split just before return insns, as
2981 that insn needs executing by all threads, but the
2982 block it is in probably does not. */
2983 break;
2984 }
2985
2986 if (seen_insn)
2987 /* We've found an instruction that must be at the start of
2988 a block, but isn't. Add it to the worklist. */
2989 worklist.safe_push (insn_bb_t (insn, block));
2990 else
2991 /* It was already the first instruction. Just add it to
2992 the map. */
2993 map->get_or_insert (block) = insn;
2994 seen_insn = true;
2995 }
2996 }
2997
2998 /* Split blocks on the worklist. */
2999 unsigned ix;
3000 insn_bb_t *elt;
3001 basic_block remap = 0;
3002 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3003 {
3004 if (remap != elt->second)
3005 {
3006 block = elt->second;
3007 remap = block;
3008 }
3009
3010 /* Split block before insn. The insn is in the new block */
3011 edge e = split_block (block, PREV_INSN (elt->first));
3012
3013 block = e->dest;
3014 map->get_or_insert (block) = elt->first;
3015 }
3016}
3017
3018/* BLOCK is a basic block containing a head or tail instruction.
3019 Locate the associated prehead or pretail instruction, which must be
3020 in the single predecessor block. */
3021
3022static rtx_insn *
3023nvptx_discover_pre (basic_block block, int expected)
3024{
3025 gcc_assert (block->preds->length () == 1);
3026 basic_block pre_block = (*block->preds)[0]->src;
3027 rtx_insn *pre_insn;
3028
3029 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3030 pre_insn = PREV_INSN (pre_insn))
3031 gcc_assert (pre_insn != BB_HEAD (pre_block));
3032
3033 gcc_assert (recog_memoized (pre_insn) == expected);
3034 return pre_insn;
3035}
3036
3037/* Dump this parallel and all its inner parallels. */
3038
3039static void
3040nvptx_dump_pars (parallel *par, unsigned depth)
3041{
3042 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3043 depth, par->mask,
3044 par->forked_block ? par->forked_block->index : -1,
3045 par->join_block ? par->join_block->index : -1);
3046
3047 fprintf (dump_file, " blocks:");
3048
3049 basic_block block;
3050 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3051 fprintf (dump_file, " %d", block->index);
3052 fprintf (dump_file, "\n");
3053 if (par->inner)
3054 nvptx_dump_pars (par->inner, depth + 1);
3055
3056 if (par->next)
3057 nvptx_dump_pars (par->next, depth);
3058}
3059
3060/* If BLOCK contains a fork/join marker, process it to create or
3061 terminate a loop structure. Add this block to the current loop,
3062 and then walk successor blocks. */
3063
3064static parallel *
3065nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3066{
3067 if (block->flags & BB_VISITED)
3068 return par;
3069 block->flags |= BB_VISITED;
3070
3071 if (rtx_insn **endp = map->get (block))
3072 {
3073 rtx_insn *end = *endp;
3074
3075 /* This is a block head or tail, or return instruction. */
3076 switch (recog_memoized (end))
3077 {
3078 case CODE_FOR_return:
3079 /* Return instructions are in their own block, and we
3080 don't need to do anything more. */
3081 return par;
3082
3083 case CODE_FOR_nvptx_forked:
3084 /* Loop head, create a new inner loop and add it into
3085 our parent's child list. */
3086 {
3087 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3088
3089 gcc_assert (mask);
3090 par = new parallel (par, mask);
3091 par->forked_block = block;
3092 par->forked_insn = end;
e91eba31 3093 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3094 par->fork_insn
3095 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3096 }
3097 break;
3098
3099 case CODE_FOR_nvptx_join:
3100 /* A loop tail. Finish the current loop and return to
3101 parent. */
3102 {
3103 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3104
3105 gcc_assert (par->mask == mask);
3106 par->join_block = block;
3107 par->join_insn = end;
e91eba31 3108 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3109 par->joining_insn
3110 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3111 par = par->parent;
3112 }
3113 break;
3114
3115 default:
3116 gcc_unreachable ();
3117 }
3118 }
3119
3120 if (par)
3121 /* Add this block onto the current loop's list of blocks. */
3122 par->blocks.safe_push (block);
3123 else
3124 /* This must be the entry block. Create a NULL parallel. */
3125 par = new parallel (0, 0);
3126
3127 /* Walk successor blocks. */
3128 edge e;
3129 edge_iterator ei;
3130
3131 FOR_EACH_EDGE (e, ei, block->succs)
3132 nvptx_find_par (map, par, e->dest);
3133
3134 return par;
3135}
3136
3137/* DFS walk the CFG looking for fork & join markers. Construct
3138 loop structures as we go. MAP is a mapping of basic blocks
3139 to head & tail markers, discovered when splitting blocks. This
3140 speeds up the discovery. We rely on the BB visited flag having
3141 been cleared when splitting blocks. */
3142
3143static parallel *
3144nvptx_discover_pars (bb_insn_map_t *map)
3145{
3146 basic_block block;
3147
3148 /* Mark exit blocks as visited. */
3149 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3150 block->flags |= BB_VISITED;
3151
3152 /* And entry block as not. */
3153 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3154 block->flags &= ~BB_VISITED;
3155
3156 parallel *par = nvptx_find_par (map, 0, block);
3157
3158 if (dump_file)
3159 {
3160 fprintf (dump_file, "\nLoops\n");
3161 nvptx_dump_pars (par, 0);
3162 fprintf (dump_file, "\n");
3163 }
3164
3165 return par;
3166}
3167
912442c2
NS
3168/* Analyse a group of BBs within a partitioned region and create N
3169 Single-Entry-Single-Exit regions. Some of those regions will be
3170 trivial ones consisting of a single BB. The blocks of a
3171 partitioned region might form a set of disjoint graphs -- because
3172 the region encloses a differently partitioned sub region.
3173
3174 We use the linear time algorithm described in 'Finding Regions Fast:
3175 Single Entry Single Exit and control Regions in Linear Time'
3176 Johnson, Pearson & Pingali. That algorithm deals with complete
3177 CFGs, where a back edge is inserted from END to START, and thus the
3178 problem becomes one of finding equivalent loops.
3179
3180 In this case we have a partial CFG. We complete it by redirecting
3181 any incoming edge to the graph to be from an arbitrary external BB,
3182 and similarly redirecting any outgoing edge to be to that BB.
3183 Thus we end up with a closed graph.
3184
3185 The algorithm works by building a spanning tree of an undirected
3186 graph and keeping track of back edges from nodes further from the
3187 root in the tree to nodes nearer to the root in the tree. In the
3188 description below, the root is up and the tree grows downwards.
3189
3190 We avoid having to deal with degenerate back-edges to the same
3191 block, by splitting each BB into 3 -- one for input edges, one for
3192 the node itself and one for the output edges. Such back edges are
3193 referred to as 'Brackets'. Cycle equivalent nodes will have the
3194 same set of brackets.
3195
3196 Determining bracket equivalency is done by maintaining a list of
3197 brackets in such a manner that the list length and final bracket
3198 uniquely identify the set.
3199
3200 We use coloring to mark all BBs with cycle equivalency with the
3201 same color. This is the output of the 'Finding Regions Fast'
3202 algorithm. Notice it doesn't actually find the set of nodes within
3203 a particular region, just unordered sets of nodes that are the
3204 entries and exits of SESE regions.
3205
3206 After determining cycle equivalency, we need to find the minimal
3207 set of SESE regions. Do this with a DFS coloring walk of the
3208 complete graph. We're either 'looking' or 'coloring'. When
3209 looking, and we're in the subgraph, we start coloring the color of
3210 the current node, and remember that node as the start of the
3211 current color's SESE region. Every time we go to a new node, we
3212 decrement the count of nodes with that color. If it reaches zero,
3213 we remember that node as the end of the current color's SESE region
3214 and return to 'looking'. Otherwise we color the node the current
3215 color.
3216
3217 This way we end up with coloring the inside of non-trivial SESE
3218 regions with the color of that region. */
3219
3220/* A pair of BBs. We use this to represent SESE regions. */
3221typedef std::pair<basic_block, basic_block> bb_pair_t;
3222typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3223
3224/* A node in the undirected CFG. The discriminator SECOND indicates just
3225 above or just below the BB indicated by FIRST. */
3226typedef std::pair<basic_block, int> pseudo_node_t;
3227
3228/* A bracket indicates an edge towards the root of the spanning tree of the
3229 undirected graph. Each bracket has a color, determined
3230 from the currrent set of brackets. */
3231struct bracket
3232{
3233 pseudo_node_t back; /* Back target */
3234
3235 /* Current color and size of set. */
3236 unsigned color;
3237 unsigned size;
3238
3239 bracket (pseudo_node_t back_)
3240 : back (back_), color (~0u), size (~0u)
3241 {
3242 }
3243
3244 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3245 {
3246 if (length != size)
3247 {
3248 size = length;
3249 color = color_counts.length ();
3250 color_counts.quick_push (0);
3251 }
3252 color_counts[color]++;
3253 return color;
3254 }
3255};
3256
3257typedef auto_vec<bracket> bracket_vec_t;
3258
3259/* Basic block info for finding SESE regions. */
3260
3261struct bb_sese
3262{
3263 int node; /* Node number in spanning tree. */
3264 int parent; /* Parent node number. */
3265
3266 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3267 edges arrive at pseudo-node Ai and the outgoing edges leave at
3268 pseudo-node Ao. We have to remember which way we arrived at a
3269 particular node when generating the spanning tree. dir > 0 means
3270 we arrived at Ai, dir < 0 means we arrived at Ao. */
3271 int dir;
3272
3273 /* Lowest numbered pseudo-node reached via a backedge from this
3274 node, or any descendant. */
3275 pseudo_node_t high;
3276
3277 int color; /* Cycle-equivalence color; not set by the constructor, assigned by set_color. */
3278
3279 /* Stack of brackets for this node. */
3280 bracket_vec_t brackets;
3281
3282 bb_sese (unsigned node_, unsigned p, int dir_)
3283 :node (node_), parent (p), dir (dir_)
3284 { /* HIGH and BRACKETS are default-constructed; COLOR is left unset. */
3285 }
3286 ~bb_sese ();
3287
3288 /* Push a bracket ending at BACK. A null block in BACK is dumped as index 0. */
3289 void push (const pseudo_node_t &back)
3290 {
3291 if (dump_file)
3292 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3293 back.first ? back.first->index : 0, back.second);
3294 brackets.safe_push (bracket (back));
3295 }
3296
3297 void append (bb_sese *child);
3298 void remove (const pseudo_node_t &);
3299
3300 /* Set node's color from the last bracket on the stack, which must be non-empty. */
3301 void set_color (auto_vec<unsigned> &color_counts)
3302 {
3303 color = brackets.last ().get_color (color_counts, brackets.length ());
3304 }
3305};
3306
3307bb_sese::~bb_sese ()
3308{ /* Nothing to do explicitly; the members' own destructors run. */
3309}
3310
3311/* Destructively append CHILD's brackets. */
3312
3313void
3314bb_sese::append (bb_sese *child)
3315{
3316 if (int len = child->brackets.length ())
3317 {
3318 int ix;
3319
3320 if (dump_file)
3321 {
3322 for (ix = 0; ix < len; ix++)
3323 {
3324 const pseudo_node_t &pseudo = child->brackets[ix].back;
3325 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3326 child->node, pseudo.first ? pseudo.first->index : 0,
3327 pseudo.second);
3328 }
3329 }
3330 if (!brackets.length ())
3331 std::swap (brackets, child->brackets);
3332 else
3333 {
3334 brackets.reserve (len);
3335 for (ix = 0; ix < len; ix++)
3336 brackets.quick_push (child->brackets[ix]);
3337 }
3338 }
3339}
3340
3341/* Remove brackets that terminate at PSEUDO. */
3342
3343void
3344bb_sese::remove (const pseudo_node_t &pseudo)
3345{
3346 unsigned removed = 0;
3347 int len = brackets.length ();
3348
3349 for (int ix = 0; ix < len; ix++)
3350 {
3351 if (brackets[ix].back == pseudo)
3352 {
3353 if (dump_file)
3354 fprintf (dump_file, "Removing backedge %d:%+d\n",
3355 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3356 removed++;
3357 }
3358 else if (removed)
3359 brackets[ix-removed] = brackets[ix];
3360 }
3361 while (removed--)
3362 brackets.pop ();
3363}
3364
3365/* Accessors for BB's aux pointer. */
3366#define BB_SET_SESE(B, S) ((B)->aux = (S))
3367#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3368
3369/* DFS walk creating SESE data structures. Only cover nodes with
3370 BB_VISITED set. Append discovered blocks to LIST. We number in
3371 increments of 3 so that the above and below pseudo nodes can be
3372 implicitly numbered too. */
3373
3374static int
3375nvptx_sese_number (int n, int p, int dir, basic_block b,
3376 auto_vec<basic_block> *list)
3377{
3378 if (BB_GET_SESE (b))
3379 return n;
3380
3381 if (dump_file)
3382 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3383 b->index, n, p, dir);
3384
3385 BB_SET_SESE (b, new bb_sese (n, p, dir));
3386 p = n;
3387
3388 n += 3;
3389 list->quick_push (b);
3390
3391 /* First walk the nodes on the 'other side' of this node, then walk
3392 the nodes on the same side. */
3393 for (unsigned ix = 2; ix; ix--)
3394 {
3395 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3396 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3397 : offsetof (edge_def, src));
3398 edge e;
3399 edge_iterator (ei);
3400
3401 FOR_EACH_EDGE (e, ei, edges)
3402 {
3403 basic_block target = *(basic_block *)((char *)e + offset);
3404
3405 if (target->flags & BB_VISITED)
3406 n = nvptx_sese_number (n, p, dir, target, list);
3407 }
3408 dir = -dir;
3409 }
3410 return n;
3411}
3412
3413/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME, whose spanning-tree data is SESE.
3414 EDGES are the outgoing edges and OFFSET is the offset to the src
3415 or dst block on the edges. DEPTH is the starting value for the highest-reaching-backedge searches, i.e. 'nothing found yet'. */
3416
3417static void
3418nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3419 vec<edge, va_gc> *edges, size_t offset)
3420{
3421 edge e;
3422 edge_iterator (ei);
3423 int hi_back = depth;
3424 pseudo_node_t node_back (0, depth);
3425 int hi_child = depth;
3426 pseudo_node_t node_child (0, depth);
3427 basic_block child = NULL;
3428 unsigned num_children = 0;
3429 int usd = -dir * sese->dir;
3430
3431 if (dump_file)
3432 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3433 me->index, sese->node, dir);
3434
3435 if (dir < 0)
3436 {
3437 /* This is the above pseudo-child. It has the BB itself as an
3438 additional child node. */
3439 node_child = sese->high;
3440 hi_child = node_child.second;
3441 if (node_child.first)
3442 hi_child += BB_GET_SESE (node_child.first)->node;
3443 num_children++;
3444 }
3445
3446 /* Examine each edge.
3447 - if it is a child (a) append its bracket list and (b) record
3448 whether it is the child with the highest reaching bracket.
3449 - if it is an edge to ancestor, record whether it's the highest
3450 reaching backlink. */
3451 FOR_EACH_EDGE (e, ei, edges)
3452 {
3453 basic_block target = *(basic_block *)((char *)e + offset);
3454
3455 if (bb_sese *t_sese = BB_GET_SESE (target))
3456 {
3457 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3458 {
3459 /* Child node. Append its bracket list. */
3460 num_children++;
3461 sese->append (t_sese);
3462
3463 /* Compare its hi value (a lower number reaches higher up the tree). */
3464 int t_hi = t_sese->high.second;
3465
3466 if (basic_block child_hi_block = t_sese->high.first)
3467 t_hi += BB_GET_SESE (child_hi_block)->node;
3468
3469 if (hi_child > t_hi)
3470 {
3471 hi_child = t_hi;
3472 node_child = t_sese->high;
3473 child = target;
3474 }
3475 }
3476 else if (t_sese->node < sese->node + dir
3477 && !(dir < 0 && sese->parent == t_sese->node))
3478 {
3479 /* Non-parental ancestor node -- a backlink. */
3480 int d = usd * t_sese->dir;
3481 int back = t_sese->node + d;
3482
3483 if (hi_back > back)
3484 {
3485 hi_back = back;
3486 node_back = pseudo_node_t (target, d);
3487 }
3488 }
3489 }
3490 else
3491 { /* Fallen off graph, backlink to entry node. */
3492 hi_back = 0;
3493 node_back = pseudo_node_t (0, 0);
3494 }
3495 }
3496
3497 /* Remove any brackets that terminate at this pseudo node. */
3498 sese->remove (pseudo_node_t (me, dir));
3499
3500 /* Now push any backlinks from this pseudo node. */
3501 FOR_EACH_EDGE (e, ei, edges)
3502 {
3503 basic_block target = *(basic_block *)((char *)e + offset);
3504 if (bb_sese *t_sese = BB_GET_SESE (target))
3505 {
3506 if (t_sese->node < sese->node + dir
3507 && !(dir < 0 && sese->parent == t_sese->node))
3508 /* Non-parental ancestor node - backedge from me. */
3509 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3510 }
3511 else
3512 {
3513 /* back edge to entry node */
3514 sese->push (pseudo_node_t (0, 0));
3515 }
3516 }
3517
3518 /* If this node leads directly or indirectly to a no-return region of
3519 the graph, then fake a backedge to entry node. */
3520 if (!sese->brackets.length () || !edges || !edges->length ())
3521 {
3522 hi_back = 0;
3523 node_back = pseudo_node_t (0, 0);
3524 sese->push (node_back);
3525 }
3526
3527 /* Record the highest reaching backedge from us or a descendant. */
3528 sese->high = hi_back < hi_child ? node_back : node_child;
3529
3530 if (num_children > 1)
3531 {
3532 /* There is more than one child -- this is a Y shaped piece of
3533 spanning tree. We have to insert a fake backedge from this
3534 node to the highest ancestor reached by not-the-highest
3535 reaching child. Note that there may be multiple children
3536 with backedges to the same highest node. That's ok and we
3537 insert the edge to that highest node. */
3538 hi_child = depth;
3539 if (dir < 0 && child)
3540 {
3541 node_child = sese->high;
3542 hi_child = node_child.second;
3543 if (node_child.first)
3544 hi_child += BB_GET_SESE (node_child.first)->node;
3545 }
3546
3547 FOR_EACH_EDGE (e, ei, edges)
3548 {
3549 basic_block target = *(basic_block *)((char *)e + offset);
3550
3551 if (target == child)
3552 /* Ignore the highest child. */
3553 continue;
3554
3555 bb_sese *t_sese = BB_GET_SESE (target);
3556 if (!t_sese)
3557 continue;
3558 if (t_sese->parent != sese->node)
3559 /* Not a child. */
3560 continue;
3561
3562 /* Compare its hi value. */
3563 int t_hi = t_sese->high.second;
3564
3565 if (basic_block child_hi_block = t_sese->high.first)
3566 t_hi += BB_GET_SESE (child_hi_block)->node;
3567
3568 if (hi_child > t_hi)
3569 {
3570 hi_child = t_hi;
3571 node_child = t_sese->high;
3572 }
3573 }
3574
3575 sese->push (node_child);
3576 }
3577}
3578
3579
3580/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3581 proceed to successors. Set SESE entry and exit nodes of
3582 REGIONS. */
3583
3584static void
3585nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3586 basic_block block, int coloring)
3587{
3588 bb_sese *sese = BB_GET_SESE (block);
3589
3590 if (block->flags & BB_VISITED)
3591 {
3592 /* If we've already encountered this block, either we must not
3593 be coloring, or it must have been colored the current color. */
3594 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3595 return;
3596 }
3597
3598 block->flags |= BB_VISITED;
3599
3600 if (sese)
3601 {
3602 if (coloring < 0)
3603 {
3604 /* Start coloring a region. */
3605 regions[sese->color].first = block;
3606 coloring = sese->color;
3607 }
3608
3609 if (!--color_counts[sese->color] && sese->color == coloring)
3610 {
3611 /* Found final block of SESE region. */
3612 regions[sese->color].second = block;
3613 coloring = -1;
3614 }
3615 else
3616 /* Color the node, so we can assert on revisiting the node
3617 that the graph is indeed SESE. */
3618 sese->color = coloring;
3619 }
3620 else
3621 /* Fallen off the subgraph, we cannot be coloring. */
3622 gcc_assert (coloring < 0);
3623
3624 /* Walk each successor block. */
3625 if (block->succs && block->succs->length ())
3626 {
3627 edge e;
3628 edge_iterator ei;
3629
3630 FOR_EACH_EDGE (e, ei, block->succs)
3631 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3632 }
3633 else
3634 gcc_assert (coloring < 0);
3635}
3636
3637/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3638 end up with NULL entries in it. */
3639
3640static void
3641nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3642{
3643 basic_block block;
3644 int ix;
3645
3646 /* First clear each BB of the whole function. */
2300c332 3647 FOR_ALL_BB_FN (block, cfun)
912442c2
NS
3648 {
3649 block->flags &= ~BB_VISITED;
3650 BB_SET_SESE (block, 0);
3651 }
912442c2
NS
3652
3653 /* Mark blocks in the function that are in this graph. */
3654 for (ix = 0; blocks.iterate (ix, &block); ix++)
3655 block->flags |= BB_VISITED;
3656
3657 /* Counts of nodes assigned to each color. There cannot be more
3658 colors than blocks (and hopefully there will be fewer). */
3659 auto_vec<unsigned> color_counts;
3660 color_counts.reserve (blocks.length ());
3661
3662 /* Worklist of nodes in the spanning tree. Again, there cannot be
3663 more nodes in the tree than blocks (there will be fewer if the
3664 CFG of blocks is disjoint). */
3665 auto_vec<basic_block> spanlist;
3666 spanlist.reserve (blocks.length ());
3667
3668 /* Make sure every block has its cycle class determined. */
3669 for (ix = 0; blocks.iterate (ix, &block); ix++)
3670 {
3671 if (BB_GET_SESE (block))
3672 /* We already met this block in an earlier graph solve. */
3673 continue;
3674
3675 if (dump_file)
3676 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3677
3678 /* Number the nodes reachable from block initial DFS order. */
3679 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3680
3681 /* Now walk in reverse DFS order to find cycle equivalents. */
3682 while (spanlist.length ())
3683 {
3684 block = spanlist.pop ();
3685 bb_sese *sese = BB_GET_SESE (block);
3686
3687 /* Do the pseudo node below. */
3688 nvptx_sese_pseudo (block, sese, depth, +1,
3689 sese->dir > 0 ? block->succs : block->preds,
3690 (sese->dir > 0 ? offsetof (edge_def, dest)
3691 : offsetof (edge_def, src)));
3692 sese->set_color (color_counts);
3693 /* Do the pseudo node above. */
3694 nvptx_sese_pseudo (block, sese, depth, -1,
3695 sese->dir < 0 ? block->succs : block->preds,
3696 (sese->dir < 0 ? offsetof (edge_def, dest)
3697 : offsetof (edge_def, src)));
3698 }
3699 if (dump_file)
3700 fprintf (dump_file, "\n");
3701 }
3702
3703 if (dump_file)
3704 {
3705 unsigned count;
3706 const char *comma = "";
3707
3708 fprintf (dump_file, "Found %d cycle equivalents\n",
3709 color_counts.length ());
3710 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3711 {
3712 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3713
3714 comma = "";
3715 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3716 if (BB_GET_SESE (block)->color == ix)
3717 {
3718 block->flags |= BB_VISITED;
3719 fprintf (dump_file, "%s%d", comma, block->index);
3720 comma=",";
3721 }
3722 fprintf (dump_file, "}");
3723 comma = ", ";
3724 }
3725 fprintf (dump_file, "\n");
3726 }
3727
3728 /* Now we've colored every block in the subgraph. We now need to
3729 determine the minimal set of SESE regions that cover that
3730 subgraph. Do this with a DFS walk of the complete function.
3731 During the walk we're either 'looking' or 'coloring'. When we
3732 reach the last node of a particular color, we stop coloring and
3733 return to looking. */
3734
3735 /* There cannot be more SESE regions than colors. */
3736 regions.reserve (color_counts.length ());
3737 for (ix = color_counts.length (); ix--;)
3738 regions.quick_push (bb_pair_t (0, 0));
3739
3740 for (ix = 0; blocks.iterate (ix, &block); ix++)
3741 block->flags &= ~BB_VISITED;
3742
3743 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3744
3745 if (dump_file)
3746 {
3747 const char *comma = "";
3748 int len = regions.length ();
3749
3750 fprintf (dump_file, "SESE regions:");
3751 for (ix = 0; ix != len; ix++)
3752 {
3753 basic_block from = regions[ix].first;
3754 basic_block to = regions[ix].second;
3755
3756 if (from)
3757 {
3758 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3759 if (to != from)
3760 fprintf (dump_file, "->%d", to->index);
3761
3762 int color = BB_GET_SESE (from)->color;
3763
3764 /* Print the blocks within the region (excluding ends). */
3765 FOR_EACH_BB_FN (block, cfun)
3766 {
3767 bb_sese *sese = BB_GET_SESE (block);
3768
3769 if (sese && sese->color == color
3770 && block != from && block != to)
3771 fprintf (dump_file, ".%d", block->index);
3772 }
3773 fprintf (dump_file, "}");
3774 }
3775 comma = ",";
3776 }
3777 fprintf (dump_file, "\n\n");
3778 }
3779
3780 for (ix = 0; blocks.iterate (ix, &block); ix++)
3781 delete BB_GET_SESE (block);
3782}
3783
3784#undef BB_SET_SESE
3785#undef BB_GET_SESE
3786
e91eba31
NS
3787/* Propagate live state at the start of a partitioned region. IS_CALL
3788 indicates whether the propagation is for a (partitioned) call
3789 instruction. BLOCK provides the live register information, and
3790 might not contain INSN. Propagation is inserted just after INSN. RW
3791 indicates whether we are reading and/or writing state. This
d88cd9c4
NS
3792 separation is needed for worker-level propagation where we
3793 essentially do a spill & fill. FN is the underlying worker
3794 function to generate the propagation instructions for single
3795 register. DATA is user data.
3796
e91eba31
NS
3797 Returns true if we didn't emit any instructions.
3798
3799 We propagate the live register set for non-calls and the entire
3800 frame for calls and non-calls. We could do better by (a)
3801 propagating just the live set that is used within the partitioned
3802 regions and (b) only propagating stack entries that are used. The
3803 latter might be quite hard to determine. */
d88cd9c4
NS
3804
3805typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3806
e91eba31
NS
3807static bool
3808nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3809 propagate_mask rw, propagator_fn fn, void *data)
d88cd9c4
NS
3810{
3811 bitmap live = DF_LIVE_IN (block);
3812 bitmap_iterator iterator;
3813 unsigned ix;
e91eba31 3814 bool empty = true;
d88cd9c4
NS
3815
3816 /* Copy the frame array, one DImode element at a time. */
3817 HOST_WIDE_INT fs = get_frame_size ();
3818 if (fs)
3819 {
3820 rtx tmp = gen_reg_rtx (DImode);
3821 rtx idx = NULL_RTX;
3822 rtx ptr = gen_reg_rtx (Pmode);
3823 rtx pred = NULL_RTX;
3824 rtx_code_label *label = NULL;
3825
e91eba31 3826 empty = false;
c1311c86
NS
3827 /* The frame size might not be DImode compatible, but the frame
3828 array's declaration will be. So it's ok to round up here. */
3829 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
d88cd9c4
NS
3830 /* Detect single iteration loop; FS == 0 from here on means no loop is emitted around the copy. */
3831 if (fs == 1)
3832 fs = 0;
3833
3834 start_sequence ();
3835 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3836 if (fs)
3837 {
3838 idx = gen_reg_rtx (SImode);
3839 pred = gen_reg_rtx (BImode);
3840 label = gen_label_rtx ();
3841
3842 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3843 /* Allow worker function to initialize anything needed. */
3844 rtx init = fn (tmp, PM_loop_begin, fs, data);
3845 if (init)
3846 emit_insn (init);
3847 emit_label (label);
3848 LABEL_NUSES (label)++;
3849 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3850 }
3851 if (rw & PM_read)
3852 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3853 emit_insn (fn (tmp, rw, fs, data));
3854 if (rw & PM_write)
3855 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3856 if (fs)
3857 {
3858 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3859 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3860 emit_insn (gen_br_true_uni (pred, label));
3861 rtx fini = fn (tmp, PM_loop_end, fs, data);
3862 if (fini)
3863 emit_insn (fini);
3864 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3865 }
3866 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3867 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3868 rtx cpy = get_insns ();
3869 end_sequence ();
3870 insn = emit_insn_after (cpy, insn);
3871 }
3872
e91eba31
NS
3873 if (!is_call)
3874 /* Copy live registers; only pseudo registers are propagated. */
3875 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3876 {
3877 rtx reg = regno_reg_rtx[ix];
d88cd9c4 3878
e91eba31
NS
3879 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3880 {
3881 rtx bcast = fn (reg, rw, 0, data);
d88cd9c4 3882
e91eba31
NS
3883 insn = emit_insn_after (bcast, insn);
3884 empty = false;
3885 }
3886 }
3887 return empty;
d88cd9c4
NS
3888}
3889
3890/* Worker for nvptx_vpropagate. */
3891
3892static rtx
3893vprop_gen (rtx reg, propagate_mask pm,
3894 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3895{
3896 if (!(pm & PM_read_write))
3897 return 0;
3898
3899 return nvptx_gen_vcast (reg);
3900}
3901
3902/* Propagate state that is live at start of BLOCK across the vectors
e91eba31
NS
3903 of a single warp. Propagation is inserted just after INSN.
3904 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 3905
e91eba31
NS
3906static bool
3907nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 3908{
e91eba31 3909 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
d88cd9c4
NS
3910}
3911
3912/* Worker for nvptx_wpropagate. */
3913
3914static rtx
3915wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3916{
3917 wcast_data_t *data = (wcast_data_t *)data_;
3918
3919 if (pm & PM_loop_begin)
3920 {
3921 /* Starting a loop, initialize pointer. */
3922 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3923
3924 if (align > worker_bcast_align)
3925 worker_bcast_align = align;
3926 data->offset = (data->offset + align - 1) & ~(align - 1);
3927
3928 data->ptr = gen_reg_rtx (Pmode);
3929
3930 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3931 }
3932 else if (pm & PM_loop_end)
3933 {
3934 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3935 data->ptr = NULL_RTX;
3936 return clobber;
3937 }
3938 else
3939 return nvptx_gen_wcast (reg, pm, rep, data);
3940}
3941
3942/* Spill or fill live state that is live at start of BLOCK. PRE_P
3943 indicates if this is just before partitioned mode (do spill), or
3944 just after it starts (do fill). Sequence is inserted just after
e91eba31 3945 INSN. IS_CALL and return as for nvptx_propagate. */
d88cd9c4 3946
e91eba31
NS
3947static bool
3948nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4
NS
3949{
3950 wcast_data_t data;
3951
3952 data.base = gen_reg_rtx (Pmode);
3953 data.offset = 0;
3954 data.ptr = NULL_RTX;
3955
e91eba31
NS
3956 bool empty = nvptx_propagate (is_call, block, insn,
3957 pre_p ? PM_read : PM_write, wprop_gen, &data);
3958 gcc_assert (empty == !data.offset);
d88cd9c4
NS
3959 if (data.offset)
3960 {
3961 /* Stuff was emitted, initialize the base pointer now. */
bd602b7f 3962 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
d88cd9c4 3963 emit_insn_after (init, insn);
15ab6f00 3964
d88cd9c4
NS
3965 if (worker_bcast_size < data.offset)
3966 worker_bcast_size = data.offset;
3967 }
e91eba31 3968 return empty;
d88cd9c4
NS
3969}
3970
3971/* Emit a worker-level synchronization barrier. We use different
3972 markers for before and after synchronizations. */
3973
3974static rtx
3975nvptx_wsync (bool after)
3976{
1dcf2688 3977 return gen_nvptx_barsync (GEN_INT (after), GEN_INT (0));
d88cd9c4
NS
3978}
3979
43c371e8
TV
3980#if WORKAROUND_PTXJIT_BUG
3981/* Return first real insn in BB, or return NULL_RTX if BB does not contain
3982 real insns. */
3983
3984static rtx_insn *
3985bb_first_real_insn (basic_block bb)
3986{
3987 rtx_insn *insn;
3988
3989 /* Find first insn of from block. */
3990 FOR_BB_INSNS (bb, insn)
3991 if (INSN_P (insn))
3992 return insn;
3993
3994 return 0;
3995}
3996#endif
3997
6beefdbd
TV
3998/* Return true if INSN needs neutering. */
3999
4000static bool
4001needs_neutering_p (rtx_insn *insn)
4002{
4003 if (!INSN_P (insn))
4004 return false;
4005
4006 switch (recog_memoized (insn))
4007 {
4008 case CODE_FOR_nvptx_fork:
4009 case CODE_FOR_nvptx_forked:
4010 case CODE_FOR_nvptx_joining:
4011 case CODE_FOR_nvptx_join:
4012 case CODE_FOR_nvptx_barsync:
4013 return false;
4014 default:
4015 return true;
4016 }
4017}
4018
a874808c
TV
4019/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4020
4021static bool
4022verify_neutering_jumps (basic_block from,
4023 rtx_insn *vector_jump, rtx_insn *worker_jump,
4024 rtx_insn *vector_label, rtx_insn *worker_label)
4025{
4026 basic_block bb = from;
4027 rtx_insn *insn = BB_HEAD (bb);
4028 bool seen_worker_jump = false;
4029 bool seen_vector_jump = false;
4030 bool seen_worker_label = false;
4031 bool seen_vector_label = false;
4032 bool worker_neutered = false;
4033 bool vector_neutered = false;
4034 while (true)
4035 {
4036 if (insn == worker_jump)
4037 {
4038 seen_worker_jump = true;
4039 worker_neutered = true;
4040 gcc_assert (!vector_neutered);
4041 }
4042 else if (insn == vector_jump)
4043 {
4044 seen_vector_jump = true;
4045 vector_neutered = true;
4046 }
4047 else if (insn == worker_label)
4048 {
4049 seen_worker_label = true;
4050 gcc_assert (worker_neutered);
4051 worker_neutered = false;
4052 }
4053 else if (insn == vector_label)
4054 {
4055 seen_vector_label = true;
4056 gcc_assert (vector_neutered);
4057 vector_neutered = false;
4058 }
4059 else if (INSN_P (insn))
4060 switch (recog_memoized (insn))
4061 {
4062 case CODE_FOR_nvptx_barsync:
4063 gcc_assert (!vector_neutered && !worker_neutered);
4064 break;
4065 default:
4066 break;
4067 }
4068
4069 if (insn != BB_END (bb))
4070 insn = NEXT_INSN (insn);
4071 else if (JUMP_P (insn) && single_succ_p (bb)
4072 && !seen_vector_jump && !seen_worker_jump)
4073 {
4074 bb = single_succ (bb);
4075 insn = BB_HEAD (bb);
4076 }
4077 else
4078 break;
4079 }
4080
4081 gcc_assert (!(vector_jump && !seen_vector_jump));
4082 gcc_assert (!(worker_jump && !seen_worker_jump));
4083
4084 if (seen_vector_label || seen_worker_label)
4085 {
4086 gcc_assert (!(vector_label && !seen_vector_label));
4087 gcc_assert (!(worker_label && !seen_worker_label));
4088
4089 return true;
4090 }
4091
4092 return false;
4093}
4094
4095/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4096
4097static void
4098verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4099 rtx_insn *worker_label)
4100{
4101 basic_block bb = to;
4102 rtx_insn *insn = BB_END (bb);
4103 bool seen_worker_label = false;
4104 bool seen_vector_label = false;
4105 while (true)
4106 {
4107 if (insn == worker_label)
4108 {
4109 seen_worker_label = true;
4110 gcc_assert (!seen_vector_label);
4111 }
4112 else if (insn == vector_label)
4113 seen_vector_label = true;
4114 else if (INSN_P (insn))
4115 switch (recog_memoized (insn))
4116 {
4117 case CODE_FOR_nvptx_barsync:
4118 gcc_assert (!seen_vector_label && !seen_worker_label);
4119 break;
4120 }
4121
4122 if (insn != BB_HEAD (bb))
4123 insn = PREV_INSN (insn);
4124 else
4125 break;
4126 }
4127
4128 gcc_assert (!(vector_label && !seen_vector_label));
4129 gcc_assert (!(worker_label && !seen_worker_label));
4130}
4131
d88cd9c4
NS
4132/* Single neutering according to MASK. FROM is the incoming block and
4133 TO is the outgoing block. These may be the same block. Insert at
4134 start of FROM:
4135
4136 if (tid.<axis>) goto end.
4137
4138 and insert before ending branch of TO (if there is such an insn):
4139
4140 end:
4141 <possibly-broadcast-cond>
4142 <branch>
4143
4144 We currently only use differnt FROM and TO when skipping an entire
4145 loop. We could do more if we detected superblocks. */
4146
4147static void
4148nvptx_single (unsigned mask, basic_block from, basic_block to)
4149{
4150 rtx_insn *head = BB_HEAD (from);
4151 rtx_insn *tail = BB_END (to);
4152 unsigned skip_mask = mask;
4153
ed9091b2
TV
4154 while (true)
4155 {
4156 /* Find first insn of from block. */
6beefdbd 4157 while (head != BB_END (from) && !needs_neutering_p (head))
ed9091b2
TV
4158 head = NEXT_INSN (head);
4159
4160 if (from == to)
4161 break;
4162
4163 if (!(JUMP_P (head) && single_succ_p (from)))
4164 break;
4165
4166 basic_block jump_target = single_succ (from);
4167 if (!single_pred_p (jump_target))
4168 break;
4169
4170 from = jump_target;
4171 head = BB_HEAD (from);
4172 }
d88cd9c4
NS
4173
4174 /* Find last insn of to block */
4175 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4176 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4177 tail = PREV_INSN (tail);
4178
4179 /* Detect if tail is a branch. */
4180 rtx tail_branch = NULL_RTX;
4181 rtx cond_branch = NULL_RTX;
4182 if (tail && INSN_P (tail))
4183 {
4184 tail_branch = PATTERN (tail);
4185 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4186 tail_branch = NULL_RTX;
4187 else
4188 {
4189 cond_branch = SET_SRC (tail_branch);
4190 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4191 cond_branch = NULL_RTX;
4192 }
4193 }
4194
4195 if (tail == head)
4196 {
4197 /* If this is empty, do nothing. */
6beefdbd 4198 if (!head || !needs_neutering_p (head))
d88cd9c4
NS
4199 return;
4200
d88cd9c4
NS
4201 if (cond_branch)
4202 {
4203 /* If we're only doing vector single, there's no need to
4204 emit skip code because we'll not insert anything. */
4205 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4206 skip_mask = 0;
4207 }
4208 else if (tail_branch)
4209 /* Block with only unconditional branch. Nothing to do. */
4210 return;
4211 }
4212
4213 /* Insert the vector test inside the worker test. */
4214 unsigned mode;
4215 rtx_insn *before = tail;
2ba16fd2 4216 rtx_insn *neuter_start = NULL;
a874808c
TV
4217 rtx_insn *worker_label = NULL, *vector_label = NULL;
4218 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
d88cd9c4
NS
4219 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4220 if (GOMP_DIM_MASK (mode) & skip_mask)
4221 {
4222 rtx_code_label *label = gen_label_rtx ();
4223 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
a874808c
TV
4224 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4225 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
d88cd9c4
NS
4226
4227 if (!pred)
4228 {
4229 pred = gen_reg_rtx (BImode);
4230 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4231 }
7820b298 4232
d88cd9c4
NS
4233 rtx br;
4234 if (mode == GOMP_DIM_VECTOR)
4235 br = gen_br_true (pred, label);
4236 else
4237 br = gen_br_true_uni (pred, label);
2ba16fd2
TV
4238 if (neuter_start)
4239 neuter_start = emit_insn_after (br, neuter_start);
4240 else
4241 neuter_start = emit_insn_before (br, head);
a874808c 4242 *mode_jump = neuter_start;
d88cd9c4
NS
4243
4244 LABEL_NUSES (label)++;
a874808c 4245 rtx_insn *label_insn;
d88cd9c4 4246 if (tail_branch)
a874808c
TV
4247 {
4248 label_insn = emit_label_before (label, before);
4249 before = label_insn;
4250 }
d88cd9c4 4251 else
be606483 4252 {
a874808c 4253 label_insn = emit_label_after (label, tail);
e77a1236
TV
4254 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4255 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
be606483
TV
4256 emit_insn_after (gen_exit (), label_insn);
4257 }
a874808c
TV
4258
4259 if (mode == GOMP_DIM_VECTOR)
4260 vector_label = label_insn;
4261 else
4262 worker_label = label_insn;
d88cd9c4
NS
4263 }
4264
4265 /* Now deal with propagating the branch condition. */
4266 if (cond_branch)
4267 {
4268 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4269
4270 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4271 {
4272 /* Vector mode only, do a shuffle. */
43c371e8
TV
4273#if WORKAROUND_PTXJIT_BUG
4274 /* The branch condition %rcond is propagated like this:
4275
4276 {
4277 .reg .u32 %x;
4278 mov.u32 %x,%tid.x;
4279 setp.ne.u32 %rnotvzero,%x,0;
4280 }
4281
4282 @%rnotvzero bra Lskip;
4283 setp.<op>.<type> %rcond,op1,op2;
4284 Lskip:
4285 selp.u32 %rcondu32,1,0,%rcond;
4286 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4287 setp.ne.u32 %rcond,%rcondu32,0;
4288
4289 There seems to be a bug in the ptx JIT compiler (observed at driver
4290 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4291 unless %rcond is initialized to something before 'bra Lskip'. The
4292 bug is not observed with ptxas from cuda 8.0.61.
4293
4294 It is true that the code is non-trivial: at Lskip, %rcond is
4295 uninitialized in threads 1-31, and after the selp the same holds
4296 for %rcondu32. But shfl propagates the defined value in thread 0
4297 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4298 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4299
4300 There is nothing in the PTX spec to suggest that this is wrong, or
4301 to explain why the extra initialization is needed. So, we classify
8c8e9a6b
TV
4302 it as a JIT bug, and the extra initialization as workaround:
4303
4304 {
4305 .reg .u32 %x;
4306 mov.u32 %x,%tid.x;
4307 setp.ne.u32 %rnotvzero,%x,0;
4308 }
4309
4310 +.reg .pred %rcond2;
4311 +setp.eq.u32 %rcond2, 1, 0;
4312
4313 @%rnotvzero bra Lskip;
4314 setp.<op>.<type> %rcond,op1,op2;
4315 +mov.pred %rcond2, %rcond;
4316 Lskip:
4317 +mov.pred %rcond, %rcond2;
4318 selp.u32 %rcondu32,1,0,%rcond;
4319 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4320 setp.ne.u32 %rcond,%rcondu32,0;
4321 */
4322 rtx_insn *label = PREV_INSN (tail);
4323 gcc_assert (label && LABEL_P (label));
4324 rtx tmp = gen_reg_rtx (BImode);
4325 emit_insn_before (gen_movbi (tmp, const0_rtx),
43c371e8 4326 bb_first_real_insn (from));
8c8e9a6b
TV
4327 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4328 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
43c371e8 4329#endif
d88cd9c4
NS
4330 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4331 }
4332 else
4333 {
4334 /* Includes worker mode, do spill & fill. By construction
4335 we should never have worker mode only. */
4336 wcast_data_t data;
4337
4338 data.base = worker_bcast_sym;
4339 data.ptr = 0;
4340
4341 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4342 worker_bcast_size = GET_MODE_SIZE (SImode);
4343
4344 data.offset = 0;
4345 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4346 before);
4347 /* Barrier so other workers can see the write. */
4348 emit_insn_before (nvptx_wsync (false), tail);
4349 data.offset = 0;
4350 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4351 /* This barrier is needed to avoid worker zero clobbering
4352 the broadcast buffer before all the other workers have
4353 had a chance to read this instance of it. */
22aa0613 4354 emit_insn_before (nvptx_wsync (false), tail);
d88cd9c4
NS
4355 }
4356
4357 extract_insn (tail);
4358 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4359 UNSPEC_BR_UNIFIED);
4360 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4361 }
a874808c
TV
4362
4363 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4364 vector_label, worker_label);
4365 if (!seen_label)
4366 verify_neutering_labels (to, vector_label, worker_label);
d88cd9c4
NS
4367}
4368
4369/* PAR is a parallel that is being skipped in its entirety according to
4370 MASK. Treat this as skipping a superblock starting at forked
4371 and ending at joining. */
4372
4373static void
4374nvptx_skip_par (unsigned mask, parallel *par)
4375{
4376 basic_block tail = par->join_block;
4377 gcc_assert (tail->preds->length () == 1);
4378
4379 basic_block pre_tail = (*tail->preds)[0]->src;
4380 gcc_assert (pre_tail->succs->length () == 1);
4381
4382 nvptx_single (mask, par->forked_block, pre_tail);
4383}
4384
dba619f3
NS
4385/* If PAR has a single inner parallel and PAR itself only contains
4386 empty entry and exit blocks, swallow the inner PAR. */
4387
4388static void
4389nvptx_optimize_inner (parallel *par)
4390{
4391 parallel *inner = par->inner;
4392
4393 /* We mustn't be the outer dummy par. */
4394 if (!par->mask)
4395 return;
4396
4397 /* We must have a single inner par. */
4398 if (!inner || inner->next)
4399 return;
4400
4401 /* We must only contain 2 blocks ourselves -- the head and tail of
4402 the inner par. */
4403 if (par->blocks.length () != 2)
4404 return;
4405
4406 /* We must be disjoint partitioning. As we only have vector and
4407 worker partitioning, this is sufficient to guarantee the pars
4408 have adjacent partitioning. */
4409 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4410 /* This indicates malformed code generation. */
4411 return;
4412
4413 /* The outer forked insn should be immediately followed by the inner
4414 fork insn. */
4415 rtx_insn *forked = par->forked_insn;
4416 rtx_insn *fork = BB_END (par->forked_block);
4417
4418 if (NEXT_INSN (forked) != fork)
4419 return;
4420 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4421
4422 /* The outer joining insn must immediately follow the inner join
4423 insn. */
4424 rtx_insn *joining = par->joining_insn;
4425 rtx_insn *join = inner->join_insn;
4426 if (NEXT_INSN (join) != joining)
4427 return;
4428
4429 /* Preconditions met. Swallow the inner par. */
4430 if (dump_file)
4431 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4432 inner->mask, inner->forked_block->index,
4433 inner->join_block->index,
4434 par->mask, par->forked_block->index, par->join_block->index);
4435
4436 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4437
4438 par->blocks.reserve (inner->blocks.length ());
4439 while (inner->blocks.length ())
4440 par->blocks.quick_push (inner->blocks.pop ());
4441
4442 par->inner = inner->inner;
4443 inner->inner = NULL;
4444
4445 delete inner;
4446}
4447
d88cd9c4
NS
4448/* Process the parallel PAR and all its contained
4449 parallels. We do everything but the neutering. Return mask of
4450 partitioned modes used within this parallel. */
4451
4452static unsigned
4453nvptx_process_pars (parallel *par)
4454{
dba619f3
NS
4455 if (nvptx_optimize)
4456 nvptx_optimize_inner (par);
4457
d88cd9c4
NS
4458 unsigned inner_mask = par->mask;
4459
4460 /* Do the inner parallels first. */
4461 if (par->inner)
4462 {
4463 par->inner_mask = nvptx_process_pars (par->inner);
4464 inner_mask |= par->inner_mask;
4465 }
4466
e91eba31
NS
4467 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4468
4469 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4 4470 {
e91eba31
NS
4471 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4472 bool empty = nvptx_wpropagate (true, is_call,
4473 par->forked_block, par->fork_insn);
4474
4475 if (!empty || !is_call)
4476 {
4477 /* Insert begin and end synchronizations. */
4478 emit_insn_before (nvptx_wsync (false), par->forked_insn);
22aa0613 4479 emit_insn_before (nvptx_wsync (false), par->join_insn);
e91eba31 4480 }
d88cd9c4
NS
4481 }
4482 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
e91eba31 4483 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
d88cd9c4
NS
4484
4485 /* Now do siblings. */
4486 if (par->next)
4487 inner_mask |= nvptx_process_pars (par->next);
4488 return inner_mask;
4489}
4490
4491/* Neuter the parallel described by PAR. We recurse in depth-first
4492 order. MODES are the partitioning of the execution and OUTER is
4493 the partitioning of the parallels we are contained in. */
4494
4495static void
4496nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4497{
4498 unsigned me = (par->mask
4499 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4500 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4501 unsigned skip_mask = 0, neuter_mask = 0;
4502
4503 if (par->inner)
4504 nvptx_neuter_pars (par->inner, modes, outer | me);
4505
4506 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4507 {
4508 if ((outer | me) & GOMP_DIM_MASK (mode))
4509 {} /* Mode is partitioned: no neutering. */
4510 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 4511 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
4512 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4513 || !par->forked_insn)
4514 /* Partitioned in inner parallels, or we're not a partitioned
4515 at all: neuter individual blocks. */
4516 neuter_mask |= GOMP_DIM_MASK (mode);
4517 else if (!par->parent || !par->parent->forked_insn
4518 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4519 /* Parent isn't a parallel or contains this paralleling: skip
4520 parallel at this level. */
4521 skip_mask |= GOMP_DIM_MASK (mode);
4522 else
4523 {} /* Parent will skip this parallel itself. */
4524 }
4525
4526 if (neuter_mask)
4527 {
912442c2 4528 int ix, len;
d88cd9c4 4529
912442c2
NS
4530 if (nvptx_optimize)
4531 {
4532 /* Neuter whole SESE regions. */
4533 bb_pair_vec_t regions;
4534
4535 nvptx_find_sese (par->blocks, regions);
4536 len = regions.length ();
4537 for (ix = 0; ix != len; ix++)
4538 {
4539 basic_block from = regions[ix].first;
4540 basic_block to = regions[ix].second;
4541
4542 if (from)
4543 nvptx_single (neuter_mask, from, to);
4544 else
4545 gcc_assert (!to);
4546 }
4547 }
4548 else
d88cd9c4 4549 {
912442c2
NS
4550 /* Neuter each BB individually. */
4551 len = par->blocks.length ();
4552 for (ix = 0; ix != len; ix++)
4553 {
4554 basic_block block = par->blocks[ix];
d88cd9c4 4555
912442c2
NS
4556 nvptx_single (neuter_mask, block, block);
4557 }
d88cd9c4
NS
4558 }
4559 }
4560
4561 if (skip_mask)
7820b298 4562 nvptx_skip_par (skip_mask, par);
d88cd9c4
NS
4563
4564 if (par->next)
4565 nvptx_neuter_pars (par->next, modes, outer);
4566}
4567
3dede32b
TV
4568#if WORKAROUND_PTXJIT_BUG_2
4569/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4570 is needed in the nvptx target because the branches generated for
4571 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4572
4573static rtx
4574nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4575{
4576 rtx pat;
4577 if ((strict && !JUMP_P (insn))
4578 || (!strict && !INSN_P (insn)))
4579 return NULL_RTX;
4580 pat = PATTERN (insn);
4581
4582 /* The set is allowed to appear either as the insn pattern or
4583 the first set in a PARALLEL. */
4584 if (GET_CODE (pat) == PARALLEL)
4585 pat = XVECEXP (pat, 0, 0);
4586 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4587 return pat;
4588
4589 return NULL_RTX;
4590}
4591
4592/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4593
4594static rtx
4595nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4596{
4597 rtx x = nvptx_pc_set (insn, strict);
4598
4599 if (!x)
4600 return NULL_RTX;
4601 x = SET_SRC (x);
4602 if (GET_CODE (x) == LABEL_REF)
4603 return x;
4604 if (GET_CODE (x) != IF_THEN_ELSE)
4605 return NULL_RTX;
4606 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4607 return XEXP (x, 1);
4608 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4609 return XEXP (x, 2);
4610 return NULL_RTX;
4611}
4612
4613/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4614 insn inbetween the branch and the label. This works around a JIT bug
4615 observed at driver version 384.111, at -O0 for sm_50. */
4616
4617static void
4618prevent_branch_around_nothing (void)
4619{
4620 rtx_insn *seen_label = NULL;
4621 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4622 {
b0b592fc 4623 if (INSN_P (insn) && condjump_p (insn))
3dede32b 4624 {
b0b592fc 4625 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
3dede32b
TV
4626 continue;
4627 }
4628
b0b592fc
TV
4629 if (seen_label == NULL)
4630 continue;
4631
3dede32b
TV
4632 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4633 continue;
4634
4635 if (INSN_P (insn))
4636 switch (recog_memoized (insn))
4637 {
4638 case CODE_FOR_nvptx_fork:
4639 case CODE_FOR_nvptx_forked:
4640 case CODE_FOR_nvptx_joining:
4641 case CODE_FOR_nvptx_join:
4642 continue;
4643 default:
4644 seen_label = NULL;
4645 continue;
4646 }
4647
4648 if (LABEL_P (insn) && insn == seen_label)
4649 emit_insn_before (gen_fake_nop (), insn);
4650
4651 seen_label = NULL;
4652 }
4653 }
4654#endif
4655
21251395
TV
4656#ifdef WORKAROUND_PTXJIT_BUG_3
4657/* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
4658 works around a hang observed at driver version 390.48 for sm_50. */
4659
4660static void
4661workaround_barsyncs (void)
4662{
4663 bool seen_barsync = false;
4664 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4665 {
4666 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4667 {
4668 if (seen_barsync)
4669 {
4670 emit_insn_before (gen_nvptx_membar_cta (), insn);
4671 emit_insn_before (gen_nvptx_membar_cta (), insn);
4672 }
4673
4674 seen_barsync = true;
4675 continue;
4676 }
4677
4678 if (!seen_barsync)
4679 continue;
4680
4681 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4682 continue;
4683 else if (INSN_P (insn))
4684 switch (recog_memoized (insn))
4685 {
4686 case CODE_FOR_nvptx_fork:
4687 case CODE_FOR_nvptx_forked:
4688 case CODE_FOR_nvptx_joining:
4689 case CODE_FOR_nvptx_join:
4690 continue;
4691 default:
4692 break;
4693 }
4694
4695 seen_barsync = false;
4696 }
4697}
4698#endif
4699
517665b3 4700/* PTX-specific reorganization
d88cd9c4 4701 - Split blocks at fork and join instructions
c38f0d8c
NS
4702 - Compute live registers
4703 - Mark now-unused registers, so function begin doesn't declare
517665b3 4704 unused registers.
d88cd9c4
NS
4705 - Insert state propagation when entering partitioned mode
4706 - Insert neutering instructions when in single mode
c38f0d8c 4707 - Replace subregs with suitable sequences.
517665b3
NS
4708*/
4709
4710static void
4711nvptx_reorg (void)
4712{
517665b3
NS
4713 /* We are freeing block_for_insn in the toplev to keep compatibility
4714 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4715 compute_bb_for_insn ();
4716
4717 thread_prologue_and_epilogue_insns ();
4718
d88cd9c4
NS
4719 /* Split blocks and record interesting unspecs. */
4720 bb_insn_map_t bb_insn_map;
4721
4722 nvptx_split_blocks (&bb_insn_map);
4723
c38f0d8c 4724 /* Compute live regs */
517665b3
NS
4725 df_clear_flags (DF_LR_RUN_DCE);
4726 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
d88cd9c4
NS
4727 df_live_add_problem ();
4728 df_live_set_all_dirty ();
517665b3 4729 df_analyze ();
738f2522
BS
4730 regstat_init_n_sets_and_refs ();
4731
d88cd9c4
NS
4732 if (dump_file)
4733 df_dump (dump_file);
4734
517665b3 4735 /* Mark unused regs as unused. */
d88cd9c4 4736 int max_regs = max_reg_num ();
44c068ae 4737 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
738f2522
BS
4738 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4739 regno_reg_rtx[i] = const0_rtx;
517665b3 4740
d88cd9c4
NS
4741 /* Determine launch dimensions of the function. If it is not an
4742 offloaded function (i.e. this is a regular compiler), the
4743 function has no neutering. */
629b3d75 4744 tree attr = oacc_get_fn_attrib (current_function_decl);
d88cd9c4
NS
4745 if (attr)
4746 {
4747 /* If we determined this mask before RTL expansion, we could
4748 elide emission of some levels of forks and joins. */
4749 unsigned mask = 0;
4750 tree dims = TREE_VALUE (attr);
4751 unsigned ix;
4752
4753 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4754 {
4755 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4756 tree allowed = TREE_PURPOSE (dims);
4757
4758 if (size != 1 && !(allowed && integer_zerop (allowed)))
4759 mask |= GOMP_DIM_MASK (ix);
4760 }
4761 /* If there is worker neutering, there must be vector
4762 neutering. Otherwise the hardware will fail. */
4763 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4764 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4765
4766 /* Discover & process partitioned regions. */
4767 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4768 nvptx_process_pars (pars);
4769 nvptx_neuter_pars (pars, mask, 0);
4770 delete pars;
4771 }
4772
517665b3 4773 /* Replace subregs. */
c03b0416 4774 nvptx_reorg_subreg ();
517665b3 4775
5012919d
AM
4776 if (TARGET_UNIFORM_SIMT)
4777 nvptx_reorg_uniform_simt ();
4778
3dede32b
TV
4779#if WORKAROUND_PTXJIT_BUG_2
4780 prevent_branch_around_nothing ();
4781#endif
4782
21251395
TV
4783#ifdef WORKAROUND_PTXJIT_BUG_3
4784 workaround_barsyncs ();
4785#endif
4786
738f2522 4787 regstat_free_n_sets_and_refs ();
517665b3
NS
4788
4789 df_finish_pass (true);
738f2522
BS
4790}
4791\f
4792/* Handle a "kernel" attribute; arguments as in
4793 struct attribute_spec.handler. */
4794
4795static tree
4796nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4797 int ARG_UNUSED (flags), bool *no_add_attrs)
4798{
4799 tree decl = *node;
4800
4801 if (TREE_CODE (decl) != FUNCTION_DECL)
4802 {
4803 error ("%qE attribute only applies to functions", name);
4804 *no_add_attrs = true;
4805 }
b49e35a9 4806 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
4807 {
4808 error ("%qE attribute requires a void return type", name);
4809 *no_add_attrs = true;
4810 }
4811
4812 return NULL_TREE;
4813}
4814
5012919d
AM
4815/* Handle a "shared" attribute; arguments as in
4816 struct attribute_spec.handler. */
4817
4818static tree
4819nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4820 int ARG_UNUSED (flags), bool *no_add_attrs)
4821{
4822 tree decl = *node;
4823
4824 if (TREE_CODE (decl) != VAR_DECL)
4825 {
4826 error ("%qE attribute only applies to variables", name);
4827 *no_add_attrs = true;
4828 }
4829 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4830 {
4831 error ("%qE attribute not allowed with auto storage class", name);
4832 *no_add_attrs = true;
4833 }
4834
4835 return NULL_TREE;
4836}
4837
738f2522
BS
4838/* Table of valid machine attributes. */
4839static const struct attribute_spec nvptx_attribute_table[] =
4840{
4849deb1
JJ
4841 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4842 affects_type_identity, handler, exclude } */
4843 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5d9ae53d 4844 NULL },
4849deb1 4845 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5d9ae53d 4846 NULL },
4849deb1 4847 { NULL, 0, 0, false, false, false, false, NULL, NULL }
738f2522
BS
4848};
4849\f
4850/* Limit vector alignments to BIGGEST_ALIGNMENT. */
4851
4852static HOST_WIDE_INT
4853nvptx_vector_alignment (const_tree type)
4854{
4855 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4856
4857 return MIN (align, BIGGEST_ALIGNMENT);
4858}
d88cd9c4
NS
4859
4860/* Indicate that INSN cannot be duplicated. */
4861
4862static bool
4863nvptx_cannot_copy_insn_p (rtx_insn *insn)
4864{
4865 switch (recog_memoized (insn))
4866 {
4867 case CODE_FOR_nvptx_shufflesi:
4868 case CODE_FOR_nvptx_shufflesf:
4869 case CODE_FOR_nvptx_barsync:
4870 case CODE_FOR_nvptx_fork:
4871 case CODE_FOR_nvptx_forked:
4872 case CODE_FOR_nvptx_joining:
4873 case CODE_FOR_nvptx_join:
4874 return true;
4875 default:
4876 return false;
4877 }
4878}
a794bd20
NS
4879
4880/* Section anchors do not work. Initialization for flag_section_anchor
4881 probes the existence of the anchoring target hooks and prevents
4882 anchoring if they don't exist. However, we may be being used with
4883 a host-side compiler that does support anchoring, and hence see
4884 the anchor flag set (as it's not recalculated). So provide an
4885 implementation denying anchoring. */
4886
4887static bool
4888nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4889{
4890 return false;
4891}
738f2522 4892\f
1f83528e
TS
4893/* Record a symbol for mkoffload to enter into the mapping table. */
4894
4895static void
4896nvptx_record_offload_symbol (tree decl)
4897{
3e32ee19
NS
4898 switch (TREE_CODE (decl))
4899 {
4900 case VAR_DECL:
4901 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4902 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4903 break;
4904
4905 case FUNCTION_DECL:
4906 {
629b3d75 4907 tree attr = oacc_get_fn_attrib (decl);
5012919d
AM
4908 /* OpenMP offloading does not set this attribute. */
4909 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 4910
3e32ee19
NS
4911 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4912 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4913
5012919d 4914 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 4915 {
5d306e55 4916 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 4917
5d306e55 4918 gcc_assert (!TREE_PURPOSE (dims));
3e32ee19
NS
4919 fprintf (asm_out_file, ", %#x", size);
4920 }
d2d47a28 4921
3e32ee19
NS
4922 fprintf (asm_out_file, "\n");
4923 }
4924 break;
d2d47a28 4925
3e32ee19
NS
4926 default:
4927 gcc_unreachable ();
4928 }
1f83528e
TS
4929}
4930
738f2522
BS
4931/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4932 at the start of a file. */
4933
4934static void
4935nvptx_file_start (void)
4936{
4937 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4938 fputs ("\t.version\t3.1\n", asm_out_file);
53ceb8b5
CP
4939 if (TARGET_SM35)
4940 fputs ("\t.target\tsm_35\n", asm_out_file);
4941 else
4942 fputs ("\t.target\tsm_30\n", asm_out_file);
738f2522
BS
4943 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4944 fputs ("// END PREAMBLE\n", asm_out_file);
4945}
4946
15ab6f00
NS
4947/* Emit a declaration for a worker-level buffer in .shared memory. */
4948
4949static void
4950write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4951{
4952 const char *name = XSTR (sym, 0);
4953
4954 write_var_marker (file, true, false, name);
4955 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4956 align, name, size);
4957}
4958
ecf6e535
BS
4959/* Write out the function declarations we've collected and declare storage
4960 for the broadcast buffer. */
738f2522
BS
4961
4962static void
4963nvptx_file_end (void)
4964{
f3dba894
TS
4965 hash_table<tree_hasher>::iterator iter;
4966 tree decl;
4967 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
00e52418 4968 nvptx_record_fndecl (decl);
738f2522 4969 fputs (func_decls.str().c_str(), asm_out_file);
d88cd9c4
NS
4970
4971 if (worker_bcast_size)
15ab6f00
NS
4972 write_worker_buffer (asm_out_file, worker_bcast_sym,
4973 worker_bcast_align, worker_bcast_size);
f3552158
NS
4974
4975 if (worker_red_size)
15ab6f00
NS
4976 write_worker_buffer (asm_out_file, worker_red_sym,
4977 worker_red_align, worker_red_size);
5012919d
AM
4978
4979 if (need_softstack_decl)
4980 {
4981 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4982 /* 32 is the maximum number of warps in a block. Even though it's an
4983 external declaration, emit the array size explicitly; otherwise, it
4984 may fail at PTX JIT time if the definition is later in link order. */
4985 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4986 POINTER_SIZE);
4987 }
4988 if (need_unisimt_decl)
4989 {
4990 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4991 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4992 }
f3552158
NS
4993}
4994
4995/* Expander for the shuffle builtins. */
4996
4997static rtx
4998nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4999{
5000 if (ignore)
5001 return target;
5002
5003 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5004 NULL_RTX, mode, EXPAND_NORMAL);
5005 if (!REG_P (src))
5006 src = copy_to_mode_reg (mode, src);
5007
5008 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5009 NULL_RTX, SImode, EXPAND_NORMAL);
5010 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5011 NULL_RTX, SImode, EXPAND_NORMAL);
5012
5013 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5014 idx = copy_to_mode_reg (SImode, idx);
5015
59263259
NS
5016 rtx pat = nvptx_gen_shuffle (target, src, idx,
5017 (nvptx_shuffle_kind) INTVAL (op));
f3552158
NS
5018 if (pat)
5019 emit_insn (pat);
5020
5021 return target;
5022}
5023
5024/* Worker reduction address expander. */
5025
5026static rtx
5027nvptx_expand_worker_addr (tree exp, rtx target,
5028 machine_mode ARG_UNUSED (mode), int ignore)
5029{
5030 if (ignore)
5031 return target;
5032
5033 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5034 if (align > worker_red_align)
5035 worker_red_align = align;
5036
5037 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5038 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5039 if (size + offset > worker_red_size)
5040 worker_red_size = size + offset;
5041
9a863523 5042 rtx addr = worker_red_sym;
f3552158 5043 if (offset)
9a863523
NS
5044 {
5045 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5046 addr = gen_rtx_CONST (Pmode, addr);
5047 }
f3552158 5048
9a863523 5049 emit_move_insn (target, addr);
f3552158
NS
5050
5051 return target;
5052}
5053
5054/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5055 not require taking the address of any object, other than the memory
5056 cell being operated on. */
5057
5058static rtx
5059nvptx_expand_cmp_swap (tree exp, rtx target,
5060 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5061{
5062 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5063
5064 if (!target)
5065 target = gen_reg_rtx (mode);
5066
5067 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5068 NULL_RTX, Pmode, EXPAND_NORMAL);
5069 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5070 NULL_RTX, mode, EXPAND_NORMAL);
5071 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5072 NULL_RTX, mode, EXPAND_NORMAL);
5073 rtx pat;
5074
5075 mem = gen_rtx_MEM (mode, mem);
5076 if (!REG_P (cmp))
5077 cmp = copy_to_mode_reg (mode, cmp);
5078 if (!REG_P (src))
5079 src = copy_to_mode_reg (mode, src);
5080
5081 if (mode == SImode)
5082 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5083 else
5084 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5085
5086 emit_insn (pat);
5087
5088 return target;
5089}
5090
5091
5092/* Codes for all the NVPTX builtins. */
5093enum nvptx_builtins
5094{
5095 NVPTX_BUILTIN_SHUFFLE,
5096 NVPTX_BUILTIN_SHUFFLELL,
5097 NVPTX_BUILTIN_WORKER_ADDR,
5098 NVPTX_BUILTIN_CMP_SWAP,
5099 NVPTX_BUILTIN_CMP_SWAPLL,
5100 NVPTX_BUILTIN_MAX
5101};
5102
5103static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5104
5105/* Return the NVPTX builtin for CODE. */
5106
5107static tree
5108nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5109{
5110 if (code >= NVPTX_BUILTIN_MAX)
5111 return error_mark_node;
5112
5113 return nvptx_builtin_decls[code];
5114}
5115
5116/* Set up all builtin functions for this target. */
5117
5118static void
5119nvptx_init_builtins (void)
5120{
5121#define DEF(ID, NAME, T) \
5122 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5123 = add_builtin_function ("__builtin_nvptx_" NAME, \
5124 build_function_type_list T, \
5125 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5126#define ST sizetype
5127#define UINT unsigned_type_node
5128#define LLUINT long_long_unsigned_type_node
5129#define PTRVOID ptr_type_node
5130
5131 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5132 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5133 DEF (WORKER_ADDR, "worker_addr",
5134 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5135 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5136 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5137
5138#undef DEF
5139#undef ST
5140#undef UINT
5141#undef LLUINT
5142#undef PTRVOID
5143}
5144
5145/* Expand an expression EXP that calls a built-in function,
5146 with result going to TARGET if that's convenient
5147 (and in mode MODE if that's convenient).
5148 SUBTARGET may be used as the target for computing one of EXP's operands.
5149 IGNORE is nonzero if the value is to be ignored. */
5150
5151static rtx
5152nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5153 machine_mode mode, int ignore)
5154{
5155 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5156 switch (DECL_FUNCTION_CODE (fndecl))
5157 {
5158 case NVPTX_BUILTIN_SHUFFLE:
5159 case NVPTX_BUILTIN_SHUFFLELL:
5160 return nvptx_expand_shuffle (exp, target, mode, ignore);
5161
5162 case NVPTX_BUILTIN_WORKER_ADDR:
5163 return nvptx_expand_worker_addr (exp, target, mode, ignore);
5164
5165 case NVPTX_BUILTIN_CMP_SWAP:
5166 case NVPTX_BUILTIN_CMP_SWAPLL:
5167 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5168
5169 default: gcc_unreachable ();
5170 }
738f2522 5171}
f3552158 5172
5012919d
AM
5173/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5174
5175static int
5176nvptx_simt_vf ()
5177{
5d17a476 5178 return PTX_WARP_SIZE;
5012919d
AM
5179}
5180
94829f87
NS
5181/* Validate compute dimensions of an OpenACC offload or routine, fill
5182 in non-unity defaults. FN_LEVEL indicates the level at which a
b6adbb9f
NS
5183 routine might spawn a loop. It is negative for non-routines. If
5184 DECL is null, we are validating the default dimensions. */
94829f87
NS
5185
5186static bool
5d306e55 5187nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
94829f87
NS
5188{
5189 bool changed = false;
ec6c865c
TV
5190 bool oacc_default_dims_p = false;
5191 bool oacc_min_dims_p = false;
5192 bool offload_region_p = false;
5193 bool routine_p = false;
5194 bool routine_seq_p = false;
5195
5196 if (decl == NULL_TREE)
5197 {
5198 if (fn_level == -1)
5199 oacc_default_dims_p = true;
5200 else if (fn_level == -2)
5201 oacc_min_dims_p = true;
5202 else
5203 gcc_unreachable ();
5204 }
5205 else if (fn_level == -1)
5206 offload_region_p = true;
5207 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5208 {
5209 routine_p = true;
5210 routine_seq_p = fn_level == GOMP_DIM_MAX;
5211 }
5212 else
5213 gcc_unreachable ();
94829f87 5214
207e7fea
TV
5215 if (routine_p)
5216 {
5217 /* OpenACC routines in C arrive here with the following attributes
5218 (omitting the 'omp declare target'):
5219 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5220 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5221 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5222 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5223
5224 If we take f.i. the oacc function attribute of the worker routine
5225 (0 1, 1 0, 1 0), then:
5226 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5227 meaning: worker routine, that is:
5228 - can't contain gang loop (0),
5229 - can contain worker loop (1),
5230 - can contain vector loop (1).
5231 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5232 dimensions: gang: 1, worker: 0, vector: 0.
5233
5234 OTOH, routines in Fortran arrive here with these attributes:
5235 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5236 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5237 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5238 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5239 that is, the same as for C but with the dimensions set to 0.
5240
5241 This is due to a bug in the Fortran front-end: PR72741. Work around
5242 this bug by forcing the dimensions to be the same in Fortran as for C,
5243 to be able to handle C and Fortran routines uniformly in this
5244 function. */
5245 dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
5246 dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
5247 dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
5248 changed = true;
5249 }
5250
ccc8282b 5251 /* The vector size must be 32, unless this is a SEQ routine. */
ec6c865c
TV
5252 if ((offload_region_p || oacc_default_dims_p
5253 || (routine_p && !routine_seq_p))
b6adbb9f 5254 && dims[GOMP_DIM_VECTOR] >= 0
ccc8282b
NS
5255 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5256 {
ec6c865c
TV
5257 if ((offload_region_p || oacc_default_dims_p)
5258 && dims[GOMP_DIM_VECTOR] >= 0)
b6adbb9f 5259 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
ccc8282b 5260 dims[GOMP_DIM_VECTOR]
324ff1a0
JJ
5261 ? G_("using vector_length (%d), ignoring %d")
5262 : G_("using vector_length (%d), ignoring runtime setting"),
ccc8282b
NS
5263 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5264 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5265 changed = true;
5266 }
5267
5268 /* Check the num workers is not too large. */
5269 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5270 {
b6adbb9f 5271 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
ccc8282b
NS
5272 "using num_workers (%d), ignoring %d",
5273 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5274 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5275 changed = true;
5276 }
94829f87 5277
ec6c865c 5278 if (oacc_default_dims_p || oacc_min_dims_p)
b6adbb9f
NS
5279 {
5280 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5281 if (dims[GOMP_DIM_WORKER] < 0)
605219e7 5282 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f 5283 if (dims[GOMP_DIM_GANG] < 0)
605219e7 5284 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f
NS
5285 changed = true;
5286 }
5287
94829f87
NS
5288 return changed;
5289}
d88cd9c4 5290
bd751975
NS
5291/* Return maximum dimension size, or zero for unbounded. */
5292
5293static int
5294nvptx_dim_limit (int axis)
5295{
5296 switch (axis)
5297 {
bd751975
NS
5298 case GOMP_DIM_VECTOR:
5299 return PTX_VECTOR_LENGTH;
5300
5301 default:
5302 break;
5303 }
5304 return 0;
5305}
5306
d88cd9c4
NS
5307/* Determine whether fork & joins are needed. */
5308
5309static bool
5310nvptx_goacc_fork_join (gcall *call, const int dims[],
5311 bool ARG_UNUSED (is_fork))
5312{
5313 tree arg = gimple_call_arg (call, 2);
5314 unsigned axis = TREE_INT_CST_LOW (arg);
5315
5316 /* We only care about worker and vector partitioning. */
5317 if (axis < GOMP_DIM_WORKER)
5318 return false;
5319
5320 /* If the size is 1, there's no partitioning. */
5321 if (dims[axis] == 1)
5322 return false;
5323
5324 return true;
5325}
5326
f3552158
NS
5327/* Generate a PTX builtin function call that returns the address in
5328 the worker reduction buffer at OFFSET. TYPE is the type of the
5329 data at that location. */
5330
5331static tree
5332nvptx_get_worker_red_addr (tree type, tree offset)
5333{
5334 machine_mode mode = TYPE_MODE (type);
5335 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5336 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5337 tree align = build_int_cst (unsigned_type_node,
5338 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5339 tree call = build_call_expr (fndecl, 3, offset, size, align);
5340
5341 return fold_convert (build_pointer_type (type), call);
5342}
5343
5344/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5345 will cast the variable if necessary. */
5346
5347static void
5348nvptx_generate_vector_shuffle (location_t loc,
5349 tree dest_var, tree var, unsigned shift,
5350 gimple_seq *seq)
5351{
5352 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5353 tree_code code = NOP_EXPR;
dd3c1b14
NS
5354 tree arg_type = unsigned_type_node;
5355 tree var_type = TREE_TYPE (var);
5356 tree dest_type = var_type;
f3552158 5357
dd3c1b14
NS
5358 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5359 var_type = TREE_TYPE (var_type);
5360
5361 if (TREE_CODE (var_type) == REAL_TYPE)
f3552158 5362 code = VIEW_CONVERT_EXPR;
dd3c1b14
NS
5363
5364 if (TYPE_SIZE (var_type)
5365 == TYPE_SIZE (long_long_unsigned_type_node))
f3552158
NS
5366 {
5367 fn = NVPTX_BUILTIN_SHUFFLELL;
dd3c1b14 5368 arg_type = long_long_unsigned_type_node;
f3552158 5369 }
dd3c1b14 5370
f3552158 5371 tree call = nvptx_builtin_decl (fn, true);
dd3c1b14
NS
5372 tree bits = build_int_cst (unsigned_type_node, shift);
5373 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5374 tree expr;
5375
5376 if (var_type != dest_type)
5377 {
5378 /* Do real and imaginary parts separately. */
5379 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5380 real = fold_build1 (code, arg_type, real);
5381 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5382 real = fold_build1 (code, var_type, real);
f3552158 5383
dd3c1b14
NS
5384 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5385 imag = fold_build1 (code, arg_type, imag);
5386 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5387 imag = fold_build1 (code, var_type, imag);
5388
5389 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5390 }
5391 else
5392 {
5393 expr = fold_build1 (code, arg_type, var);
5394 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5395 expr = fold_build1 (code, dest_type, expr);
5396 }
f3552158 5397
dd3c1b14 5398 gimplify_assign (dest_var, expr, seq);
f3552158
NS
5399}
5400
33f47f42
NS
5401/* Lazily generate the global lock var decl and return its address. */
5402
5403static tree
5404nvptx_global_lock_addr ()
5405{
5406 tree v = global_lock_var;
5407
5408 if (!v)
5409 {
5410 tree name = get_identifier ("__reduction_lock");
5411 tree type = build_qualified_type (unsigned_type_node,
5412 TYPE_QUAL_VOLATILE);
5413 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5414 global_lock_var = v;
5415 DECL_ARTIFICIAL (v) = 1;
5416 DECL_EXTERNAL (v) = 1;
5417 TREE_STATIC (v) = 1;
5418 TREE_PUBLIC (v) = 1;
5419 TREE_USED (v) = 1;
5420 mark_addressable (v);
5421 mark_decl_referenced (v);
5422 }
5423
5424 return build_fold_addr_expr (v);
5425}
5426
5427/* Insert code to locklessly update *PTR with *PTR OP VAR just before
5428 GSI. We use a lockless scheme for nearly all case, which looks
5429 like:
5430 actual = initval(OP);
5431 do {
5432 guess = actual;
5433 write = guess OP myval;
5434 actual = cmp&swap (ptr, guess, write)
5435 } while (actual bit-different-to guess);
5436 return write;
5437
5438 This relies on a cmp&swap instruction, which is available for 32-
5439 and 64-bit types. Larger types must use a locking scheme. */
f3552158
NS
5440
5441static tree
5442nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5443 tree ptr, tree var, tree_code op)
5444{
5445 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5446 tree_code code = NOP_EXPR;
33f47f42
NS
5447 tree arg_type = unsigned_type_node;
5448 tree var_type = TREE_TYPE (var);
f3552158 5449
33f47f42
NS
5450 if (TREE_CODE (var_type) == COMPLEX_TYPE
5451 || TREE_CODE (var_type) == REAL_TYPE)
f3552158 5452 code = VIEW_CONVERT_EXPR;
33f47f42
NS
5453
5454 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
f3552158 5455 {
33f47f42 5456 arg_type = long_long_unsigned_type_node;
f3552158 5457 fn = NVPTX_BUILTIN_CMP_SWAPLL;
f3552158
NS
5458 }
5459
33f47f42
NS
5460 tree swap_fn = nvptx_builtin_decl (fn, true);
5461
f3552158 5462 gimple_seq init_seq = NULL;
33f47f42
NS
5463 tree init_var = make_ssa_name (arg_type);
5464 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5465 init_expr = fold_build1 (code, arg_type, init_expr);
f3552158
NS
5466 gimplify_assign (init_var, init_expr, &init_seq);
5467 gimple *init_end = gimple_seq_last (init_seq);
5468
5469 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5470
f3552158
NS
5471 /* Split the block just after the init stmts. */
5472 basic_block pre_bb = gsi_bb (*gsi);
5473 edge pre_edge = split_block (pre_bb, init_end);
5474 basic_block loop_bb = pre_edge->dest;
5475 pre_bb = pre_edge->src;
5476 /* Reset the iterator. */
5477 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5478
33f47f42
NS
5479 tree expect_var = make_ssa_name (arg_type);
5480 tree actual_var = make_ssa_name (arg_type);
5481 tree write_var = make_ssa_name (arg_type);
5482
5483 /* Build and insert the reduction calculation. */
5484 gimple_seq red_seq = NULL;
5485 tree write_expr = fold_build1 (code, var_type, expect_var);
5486 write_expr = fold_build2 (op, var_type, write_expr, var);
5487 write_expr = fold_build1 (code, arg_type, write_expr);
5488 gimplify_assign (write_var, write_expr, &red_seq);
5489
5490 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5491
5492 /* Build & insert the cmp&swap sequence. */
5493 gimple_seq latch_seq = NULL;
5494 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5495 ptr, expect_var, write_var);
5496 gimplify_assign (actual_var, swap_expr, &latch_seq);
5497
5498 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5499 NULL_TREE, NULL_TREE);
5500 gimple_seq_add_stmt (&latch_seq, cond);
5501
5502 gimple *latch_end = gimple_seq_last (latch_seq);
5503 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
f3552158 5504
33f47f42
NS
5505 /* Split the block just after the latch stmts. */
5506 edge post_edge = split_block (loop_bb, latch_end);
f3552158
NS
5507 basic_block post_bb = post_edge->dest;
5508 loop_bb = post_edge->src;
5509 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5510
5511 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5 5512 post_edge->probability = profile_probability::even ();
f3552158 5513 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
72270bb5 5514 loop_edge->probability = profile_probability::even ();
f3552158
NS
5515 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5516 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5517
5518 gphi *phi = create_phi_node (expect_var, loop_bb);
5519 add_phi_arg (phi, init_var, pre_edge, loc);
5520 add_phi_arg (phi, actual_var, loop_edge, loc);
5521
5522 loop *loop = alloc_loop ();
5523 loop->header = loop_bb;
5524 loop->latch = loop_bb;
5525 add_loop (loop, loop_bb->loop_father);
5526
33f47f42
NS
5527 return fold_build1 (code, var_type, write_var);
5528}
5529
5530/* Insert code to lockfully update *PTR with *PTR OP VAR just before
5531 GSI. This is necessary for types larger than 64 bits, where there
5532 is no cmp&swap instruction to implement a lockless scheme. We use
5533 a lock variable in global memory.
5534
5535 while (cmp&swap (&lock_var, 0, 1))
5536 continue;
5537 T accum = *ptr;
5538 accum = accum OP var;
5539 *ptr = accum;
5540 cmp&swap (&lock_var, 1, 0);
5541 return accum;
5542
5543 A lock in global memory is necessary to force execution engine
5544 descheduling and avoid resource starvation that can occur if the
5545 lock is in .shared memory. */
5546
5547static tree
5548nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5549 tree ptr, tree var, tree_code op)
5550{
5551 tree var_type = TREE_TYPE (var);
5552 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5553 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5554 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5555
5556 /* Split the block just before the gsi. Insert a gimple nop to make
5557 this easier. */
5558 gimple *nop = gimple_build_nop ();
5559 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5560 basic_block entry_bb = gsi_bb (*gsi);
5561 edge entry_edge = split_block (entry_bb, nop);
5562 basic_block lock_bb = entry_edge->dest;
5563 /* Reset the iterator. */
5564 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5565
5566 /* Build and insert the locking sequence. */
5567 gimple_seq lock_seq = NULL;
5568 tree lock_var = make_ssa_name (unsigned_type_node);
5569 tree lock_expr = nvptx_global_lock_addr ();
5570 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5571 uns_unlocked, uns_locked);
5572 gimplify_assign (lock_var, lock_expr, &lock_seq);
5573 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5574 NULL_TREE, NULL_TREE);
5575 gimple_seq_add_stmt (&lock_seq, cond);
5576 gimple *lock_end = gimple_seq_last (lock_seq);
5577 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5578
5579 /* Split the block just after the lock sequence. */
5580 edge locked_edge = split_block (lock_bb, lock_end);
5581 basic_block update_bb = locked_edge->dest;
5582 lock_bb = locked_edge->src;
5583 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5584
5585 /* Create the lock loop ... */
5586 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5
TV
5587 locked_edge->probability = profile_probability::even ();
5588 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5589 loop_edge->probability = profile_probability::even ();
33f47f42
NS
5590 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5591 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5592
5593 /* ... and the loop structure. */
5594 loop *lock_loop = alloc_loop ();
5595 lock_loop->header = lock_bb;
5596 lock_loop->latch = lock_bb;
5597 lock_loop->nb_iterations_estimate = 1;
5598 lock_loop->any_estimate = true;
5599 add_loop (lock_loop, entry_bb->loop_father);
5600
5601 /* Build and insert the reduction calculation. */
5602 gimple_seq red_seq = NULL;
5603 tree acc_in = make_ssa_name (var_type);
5604 tree ref_in = build_simple_mem_ref (ptr);
5605 TREE_THIS_VOLATILE (ref_in) = 1;
5606 gimplify_assign (acc_in, ref_in, &red_seq);
5607
5608 tree acc_out = make_ssa_name (var_type);
5609 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5610 gimplify_assign (acc_out, update_expr, &red_seq);
5611
5612 tree ref_out = build_simple_mem_ref (ptr);
5613 TREE_THIS_VOLATILE (ref_out) = 1;
5614 gimplify_assign (ref_out, acc_out, &red_seq);
5615
5616 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5617
5618 /* Build & insert the unlock sequence. */
5619 gimple_seq unlock_seq = NULL;
5620 tree unlock_expr = nvptx_global_lock_addr ();
5621 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5622 uns_locked, uns_unlocked);
5623 gimplify_and_add (unlock_expr, &unlock_seq);
5624 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5625
5626 return acc_out;
5627}
5628
5629/* Emit a sequence to update a reduction accumlator at *PTR with the
5630 value held in VAR using operator OP. Return the updated value.
5631
5632 TODO: optimize for atomic ops and indepedent complex ops. */
5633
5634static tree
5635nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5636 tree ptr, tree var, tree_code op)
5637{
5638 tree type = TREE_TYPE (var);
5639 tree size = TYPE_SIZE (type);
5640
5641 if (size == TYPE_SIZE (unsigned_type_node)
5642 || size == TYPE_SIZE (long_long_unsigned_type_node))
5643 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5644 else
5645 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
5646}
5647
5648/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5649
5650static void
5651nvptx_goacc_reduction_setup (gcall *call)
5652{
5653 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5654 tree lhs = gimple_call_lhs (call);
5655 tree var = gimple_call_arg (call, 2);
5656 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5657 gimple_seq seq = NULL;
5658
5659 push_gimplify_context (true);
5660
5661 if (level != GOMP_DIM_GANG)
5662 {
5663 /* Copy the receiver object. */
5664 tree ref_to_res = gimple_call_arg (call, 1);
5665
5666 if (!integer_zerop (ref_to_res))
5667 var = build_simple_mem_ref (ref_to_res);
5668 }
5669
5670 if (level == GOMP_DIM_WORKER)
5671 {
5672 /* Store incoming value to worker reduction buffer. */
5673 tree offset = gimple_call_arg (call, 5);
5674 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5675 tree ptr = make_ssa_name (TREE_TYPE (call));
5676
5677 gimplify_assign (ptr, call, &seq);
5678 tree ref = build_simple_mem_ref (ptr);
5679 TREE_THIS_VOLATILE (ref) = 1;
5680 gimplify_assign (ref, var, &seq);
5681 }
5682
5683 if (lhs)
5684 gimplify_assign (lhs, var, &seq);
5685
5686 pop_gimplify_context (NULL);
5687 gsi_replace_with_seq (&gsi, seq, true);
5688}
5689
5690/* NVPTX implementation of GOACC_REDUCTION_INIT. */
5691
5692static void
5693nvptx_goacc_reduction_init (gcall *call)
5694{
5695 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5696 tree lhs = gimple_call_lhs (call);
5697 tree var = gimple_call_arg (call, 2);
5698 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5699 enum tree_code rcode
5700 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5701 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5702 TREE_TYPE (var));
5703 gimple_seq seq = NULL;
5704
5705 push_gimplify_context (true);
5706
5707 if (level == GOMP_DIM_VECTOR)
5708 {
5709 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
5710 tree tid = make_ssa_name (integer_type_node);
5711 tree dim_vector = gimple_call_arg (call, 3);
5712 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5713 dim_vector);
5714 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5715 NULL_TREE, NULL_TREE);
5716
5717 gimple_call_set_lhs (tid_call, tid);
5718 gimple_seq_add_stmt (&seq, tid_call);
5719 gimple_seq_add_stmt (&seq, cond_stmt);
5720
5721 /* Split the block just after the call. */
5722 edge init_edge = split_block (gsi_bb (gsi), call);
5723 basic_block init_bb = init_edge->dest;
5724 basic_block call_bb = init_edge->src;
5725
5726 /* Fixup flags from call_bb to init_bb. */
5727 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
8c38aed7 5728 init_edge->probability = profile_probability::even ();
f3552158
NS
5729
5730 /* Set the initialization stmts. */
5731 gimple_seq init_seq = NULL;
5732 tree init_var = make_ssa_name (TREE_TYPE (var));
5733 gimplify_assign (init_var, init, &init_seq);
5734 gsi = gsi_start_bb (init_bb);
5735 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5736
5737 /* Split block just after the init stmt. */
5738 gsi_prev (&gsi);
5739 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5740 basic_block dst_bb = inited_edge->dest;
5741
5742 /* Create false edge from call_bb to dst_bb. */
5743 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
8c38aed7 5744 nop_edge->probability = profile_probability::even ();
f3552158
NS
5745
5746 /* Create phi node in dst block. */
5747 gphi *phi = create_phi_node (lhs, dst_bb);
5748 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5749 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5750
5751 /* Reset dominator of dst bb. */
5752 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5753
5754 /* Reset the gsi. */
5755 gsi = gsi_for_stmt (call);
5756 }
5757 else
5758 {
5759 if (level == GOMP_DIM_GANG)
5760 {
5761 /* If there's no receiver object, propagate the incoming VAR. */
5762 tree ref_to_res = gimple_call_arg (call, 1);
5763 if (integer_zerop (ref_to_res))
5764 init = var;
5765 }
5766
5767 gimplify_assign (lhs, init, &seq);
5768 }
5769
5770 pop_gimplify_context (NULL);
5771 gsi_replace_with_seq (&gsi, seq, true);
5772}
5773
5774/* NVPTX implementation of GOACC_REDUCTION_FINI. */
5775
5776static void
5777nvptx_goacc_reduction_fini (gcall *call)
5778{
5779 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5780 tree lhs = gimple_call_lhs (call);
5781 tree ref_to_res = gimple_call_arg (call, 1);
5782 tree var = gimple_call_arg (call, 2);
5783 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5784 enum tree_code op
5785 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5786 gimple_seq seq = NULL;
5787 tree r = NULL_TREE;;
5788
5789 push_gimplify_context (true);
5790
5791 if (level == GOMP_DIM_VECTOR)
5792 {
5793 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5794 but that requires a method of emitting a unified jump at the
5795 gimple level. */
5796 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5797 {
5798 tree other_var = make_ssa_name (TREE_TYPE (var));
5799 nvptx_generate_vector_shuffle (gimple_location (call),
5800 other_var, var, shfl, &seq);
5801
5802 r = make_ssa_name (TREE_TYPE (var));
5803 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5804 var, other_var), &seq);
5805 var = r;
5806 }
5807 }
5808 else
5809 {
5810 tree accum = NULL_TREE;
5811
5812 if (level == GOMP_DIM_WORKER)
5813 {
5814 /* Get reduction buffer address. */
5815 tree offset = gimple_call_arg (call, 5);
5816 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5817 tree ptr = make_ssa_name (TREE_TYPE (call));
5818
5819 gimplify_assign (ptr, call, &seq);
5820 accum = ptr;
5821 }
5822 else if (integer_zerop (ref_to_res))
5823 r = var;
5824 else
5825 accum = ref_to_res;
5826
5827 if (accum)
5828 {
33f47f42 5829 /* UPDATE the accumulator. */
f3552158
NS
5830 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5831 seq = NULL;
33f47f42
NS
5832 r = nvptx_reduction_update (gimple_location (call), &gsi,
5833 accum, var, op);
f3552158
NS
5834 }
5835 }
5836
5837 if (lhs)
5838 gimplify_assign (lhs, r, &seq);
5839 pop_gimplify_context (NULL);
5840
5841 gsi_replace_with_seq (&gsi, seq, true);
5842}
5843
5844/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5845
5846static void
5847nvptx_goacc_reduction_teardown (gcall *call)
5848{
5849 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5850 tree lhs = gimple_call_lhs (call);
5851 tree var = gimple_call_arg (call, 2);
5852 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5853 gimple_seq seq = NULL;
5854
5855 push_gimplify_context (true);
5856 if (level == GOMP_DIM_WORKER)
5857 {
5858 /* Read the worker reduction buffer. */
5859 tree offset = gimple_call_arg (call, 5);
5860 tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
5861 tree ptr = make_ssa_name (TREE_TYPE (call));
5862
5863 gimplify_assign (ptr, call, &seq);
5864 var = build_simple_mem_ref (ptr);
5865 TREE_THIS_VOLATILE (var) = 1;
5866 }
5867
5868 if (level != GOMP_DIM_GANG)
5869 {
5870 /* Write to the receiver object. */
5871 tree ref_to_res = gimple_call_arg (call, 1);
5872
5873 if (!integer_zerop (ref_to_res))
5874 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5875 }
5876
5877 if (lhs)
5878 gimplify_assign (lhs, var, &seq);
5879
5880 pop_gimplify_context (NULL);
5881
5882 gsi_replace_with_seq (&gsi, seq, true);
5883}
5884
5885/* NVPTX reduction expander. */
5886
5563d5c0 5887static void
f3552158
NS
5888nvptx_goacc_reduction (gcall *call)
5889{
5890 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5891
5892 switch (code)
5893 {
5894 case IFN_GOACC_REDUCTION_SETUP:
5895 nvptx_goacc_reduction_setup (call);
5896 break;
5897
5898 case IFN_GOACC_REDUCTION_INIT:
5899 nvptx_goacc_reduction_init (call);
5900 break;
5901
5902 case IFN_GOACC_REDUCTION_FINI:
5903 nvptx_goacc_reduction_fini (call);
5904 break;
5905
5906 case IFN_GOACC_REDUCTION_TEARDOWN:
5907 nvptx_goacc_reduction_teardown (call);
5908 break;
5909
5910 default:
5911 gcc_unreachable ();
5912 }
5913}
5914
e053f710
TV
5915static bool
5916nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5917 rtx x ATTRIBUTE_UNUSED)
5918{
5919 return true;
5920}
5921
b98b34b7
TV
5922static bool
5923nvptx_vector_mode_supported (machine_mode mode)
5924{
3717fbe3
TV
5925 return (mode == V2SImode
5926 || mode == V2DImode);
5927}
5928
5929/* Return the preferred mode for vectorizing scalar MODE. */
5930
5931static machine_mode
005ba29c 5932nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
5933{
5934 switch (mode)
5935 {
4e10a5a7 5936 case E_DImode:
3717fbe3 5937 return V2DImode;
4e10a5a7 5938 case E_SImode:
3717fbe3
TV
5939 return V2SImode;
5940
5941 default:
5942 return default_preferred_simd_mode (mode);
5943 }
5944}
5945
5946unsigned int
5947nvptx_data_alignment (const_tree type, unsigned int basic_align)
5948{
5949 if (TREE_CODE (type) == INTEGER_TYPE)
5950 {
5951 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5952 if (size == GET_MODE_SIZE (TImode))
5953 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5954 }
5955
5956 return basic_align;
b98b34b7
TV
5957}
5958
99e1629f
RS
5959/* Implement TARGET_MODES_TIEABLE_P. */
5960
5961static bool
5962nvptx_modes_tieable_p (machine_mode, machine_mode)
5963{
5964 return false;
5965}
5966
c43f4279
RS
5967/* Implement TARGET_HARD_REGNO_NREGS. */
5968
5969static unsigned int
5970nvptx_hard_regno_nregs (unsigned int, machine_mode)
5971{
5972 return 1;
5973}
5974
0d803030
RS
5975/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5976
5977static bool
5978nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5979{
5980 return false;
5981}
5982
43be05f5
TV
5983static GTY(()) tree nvptx_previous_fndecl;
5984
5985static void
5986nvptx_set_current_function (tree fndecl)
5987{
5988 if (!fndecl || fndecl == nvptx_previous_fndecl)
5989 return;
5990
5991 nvptx_previous_fndecl = fndecl;
5992}
5993
738f2522
BS
5994#undef TARGET_OPTION_OVERRIDE
5995#define TARGET_OPTION_OVERRIDE nvptx_option_override
5996
5997#undef TARGET_ATTRIBUTE_TABLE
5998#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5999
d81db636
SB
6000#undef TARGET_LRA_P
6001#define TARGET_LRA_P hook_bool_void_false
6002
738f2522
BS
6003#undef TARGET_LEGITIMATE_ADDRESS_P
6004#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6005
6006#undef TARGET_PROMOTE_FUNCTION_MODE
6007#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6008
6009#undef TARGET_FUNCTION_ARG
6010#define TARGET_FUNCTION_ARG nvptx_function_arg
6011#undef TARGET_FUNCTION_INCOMING_ARG
6012#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6013#undef TARGET_FUNCTION_ARG_ADVANCE
6014#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
6015#undef TARGET_FUNCTION_ARG_BOUNDARY
6016#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
6017#undef TARGET_PASS_BY_REFERENCE
6018#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6019#undef TARGET_FUNCTION_VALUE_REGNO_P
6020#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6021#undef TARGET_FUNCTION_VALUE
6022#define TARGET_FUNCTION_VALUE nvptx_function_value
6023#undef TARGET_LIBCALL_VALUE
6024#define TARGET_LIBCALL_VALUE nvptx_libcall_value
6025#undef TARGET_FUNCTION_OK_FOR_SIBCALL
6026#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
6027#undef TARGET_GET_DRAP_RTX
6028#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
6029#undef TARGET_SPLIT_COMPLEX_ARG
6030#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6031#undef TARGET_RETURN_IN_MEMORY
6032#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6033#undef TARGET_OMIT_STRUCT_RETURN_REG
6034#define TARGET_OMIT_STRUCT_RETURN_REG true
6035#undef TARGET_STRICT_ARGUMENT_NAMING
6036#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
6037#undef TARGET_CALL_ARGS
6038#define TARGET_CALL_ARGS nvptx_call_args
6039#undef TARGET_END_CALL_ARGS
6040#define TARGET_END_CALL_ARGS nvptx_end_call_args
6041
6042#undef TARGET_ASM_FILE_START
6043#define TARGET_ASM_FILE_START nvptx_file_start
6044#undef TARGET_ASM_FILE_END
6045#define TARGET_ASM_FILE_END nvptx_file_end
6046#undef TARGET_ASM_GLOBALIZE_LABEL
6047#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6048#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6049#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6050#undef TARGET_PRINT_OPERAND
6051#define TARGET_PRINT_OPERAND nvptx_print_operand
6052#undef TARGET_PRINT_OPERAND_ADDRESS
6053#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6054#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6055#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6056#undef TARGET_ASM_INTEGER
6057#define TARGET_ASM_INTEGER nvptx_assemble_integer
6058#undef TARGET_ASM_DECL_END
6059#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6060#undef TARGET_ASM_DECLARE_CONSTANT_NAME
6061#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6062#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6063#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6064#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6065#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6066
6067#undef TARGET_MACHINE_DEPENDENT_REORG
6068#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6069#undef TARGET_NO_REGISTER_ALLOCATION
6070#define TARGET_NO_REGISTER_ALLOCATION true
6071
9a863523
NS
6072#undef TARGET_ENCODE_SECTION_INFO
6073#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
6074#undef TARGET_RECORD_OFFLOAD_SYMBOL
6075#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6076
738f2522
BS
6077#undef TARGET_VECTOR_ALIGNMENT
6078#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6079
d88cd9c4
NS
6080#undef TARGET_CANNOT_COPY_INSN_P
6081#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6082
a794bd20
NS
6083#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6084#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6085
f3552158
NS
6086#undef TARGET_INIT_BUILTINS
6087#define TARGET_INIT_BUILTINS nvptx_init_builtins
6088#undef TARGET_EXPAND_BUILTIN
6089#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6090#undef TARGET_BUILTIN_DECL
6091#define TARGET_BUILTIN_DECL nvptx_builtin_decl
6092
5012919d
AM
6093#undef TARGET_SIMT_VF
6094#define TARGET_SIMT_VF nvptx_simt_vf
6095
94829f87
NS
6096#undef TARGET_GOACC_VALIDATE_DIMS
6097#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6098
bd751975
NS
6099#undef TARGET_GOACC_DIM_LIMIT
6100#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6101
d88cd9c4
NS
6102#undef TARGET_GOACC_FORK_JOIN
6103#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6104
f3552158
NS
6105#undef TARGET_GOACC_REDUCTION
6106#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6107
e053f710
TV
6108#undef TARGET_CANNOT_FORCE_CONST_MEM
6109#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6110
b98b34b7
TV
6111#undef TARGET_VECTOR_MODE_SUPPORTED_P
6112#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6113
3717fbe3
TV
6114#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6115#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6116 nvptx_preferred_simd_mode
6117
99e1629f
RS
6118#undef TARGET_MODES_TIEABLE_P
6119#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6120
c43f4279
RS
6121#undef TARGET_HARD_REGNO_NREGS
6122#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6123
0d803030
RS
6124#undef TARGET_CAN_CHANGE_MODE_CLASS
6125#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6126
e335138d
TV
6127#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6128#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6129
43be05f5
TV
6130#undef TARGET_SET_CURRENT_FUNCTION
6131#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6132
738f2522
BS
6133struct gcc_target targetm = TARGET_INITIALIZER;
6134
6135#include "gt-nvptx.h"