]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
re PR libfortran/88807 (misleading indentation warnings building libgfortran)
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
a5544970 2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19 61#include "omp-low.h"
6e723923 62#include "omp-offload.h"
3e32ee19 63#include "gomp-constants.h"
d88cd9c4 64#include "dumpfile.h"
f3552158
NS
65#include "internal-fn.h"
66#include "gimple-iterator.h"
67#include "stringpool.h"
314e6352 68#include "attribs.h"
f2d4d839 69#include "tree-vrp.h"
f3552158
NS
70#include "tree-ssa-operands.h"
71#include "tree-ssanames.h"
72#include "gimplify.h"
73#include "tree-phinodes.h"
74#include "cfgloop.h"
75#include "fold-const.h"
50c620ca 76#include "intl.h"
738f2522 77
994c5d85 78/* This file should be included last. */
d58627a0
RS
79#include "target-def.h"
80
43c371e8 81#define WORKAROUND_PTXJIT_BUG 1
3dede32b 82#define WORKAROUND_PTXJIT_BUG_2 1
21251395 83#define WORKAROUND_PTXJIT_BUG_3 1
43c371e8 84
69b09a58
TV
85/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
87 2.x. */
88#define PTX_CTA_SIZE 1024
89
052aaace 90#define PTX_CTA_NUM_BARRIERS 16
5d17a476 91#define PTX_WARP_SIZE 32
69b09a58 92
052aaace
TV
93#define PTX_PER_CTA_BARRIER 0
94#define PTX_NUM_PER_CTA_BARRIERS 1
95#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97
764ecad4
TV
98#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
693ad66b
TV
100#define PTX_WORKER_LENGTH 32
101#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5d17a476 102
9a863523
NS
/* The distinct PTX memory spaces an object may reside in.  The order
   here must match the "areas" name table in section_for_sym.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};
114
115/* We record the data area in the target symbol flags. */
116#define SYMBOL_DATA_AREA(SYM) \
117 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
118 & 7)
119#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
121
738f2522
BS
122/* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124static std::stringstream func_decls;
f3dba894 125
6c907cff 126struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
127{
128 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
129 static bool equal (rtx a, rtx b) { return a == b; }
130};
131
132static GTY((cache))
133 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
134
6c907cff 135struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
136{
137 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
138 static bool equal (tree a, tree b) { return a == b; }
139};
140
141static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
142static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 143
1ed57fb8
TV
144/* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150static unsigned oacc_bcast_size;
0024c320 151static unsigned oacc_bcast_partition;
1ed57fb8
TV
152static unsigned oacc_bcast_align;
153static GTY(()) rtx oacc_bcast_sym;
d88cd9c4 154
f3552158
NS
155/* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157static unsigned worker_red_size;
158static unsigned worker_red_align;
f3552158
NS
159static GTY(()) rtx worker_red_sym;
160
f881693c
TV
161/* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164static unsigned vector_red_size;
165static unsigned vector_red_align;
166static unsigned vector_red_partition;
167static GTY(()) rtx vector_red_sym;
168
33f47f42
NS
169/* Global lock variable, needed for 128bit worker & gang reductions. */
170static GTY(()) tree global_lock_var;
171
5012919d
AM
172/* True if any function references __nvptx_stacks. */
173static bool need_softstack_decl;
174
175/* True if any function references __nvptx_uni. */
176static bool need_unisimt_decl;
177
0024c320
TV
178static int nvptx_mach_max_workers ();
179
738f2522
BS
180/* Allocate a new, cleared machine_function structure. */
181
182static struct machine_function *
183nvptx_init_machine_status (void)
184{
185 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 186 p->return_mode = VOIDmode;
738f2522
BS
187 return p;
188}
189
5012919d
AM
190/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
192
193static void
194diagnose_openacc_conflict (bool optval, const char *optname)
195{
196 if (flag_openacc && optval)
197 error ("option %s is not supported together with -fopenacc", optname);
198}
199
738f2522
BS
200/* Implement TARGET_OPTION_OVERRIDE. */
201
202static void
203nvptx_option_override (void)
204{
205 init_machine_status = nvptx_init_machine_status;
ae578943 206
3d339d5e
AM
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set.x_flag_toplevel_reorder)
211 flag_toplevel_reorder = 1;
212
fa9afdc3
TV
213 debug_nonbind_markers_p = 0;
214
ae578943
NS
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
217 unless forced. */
218 if (!global_options_set.x_flag_no_common)
219 flag_no_common = 1;
220
cad136a4
TV
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size > 0)
223 sorry ("not generating patch area, nops not supported");
224
738f2522
BS
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking = 0;
1e5154e7 227
dba619f3
NS
228 if (nvptx_optimize < 0)
229 nvptx_optimize = optimize > 0;
230
f3dba894
TS
231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 233 declared_libfuncs_htab
f3dba894 234 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 235
1ed57fb8
TV
236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
0024c320 239 oacc_bcast_partition = 0;
f3552158 240
15ab6f00 241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d 244
f881693c
TV
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248 vector_red_partition = 0;
249
5012919d
AM
250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
253
254 if (TARGET_GOMP)
255 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
256}
257
738f2522
BS
258/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx ideosyncracies. */
260
261const char *
262nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
263{
264 switch (mode)
265 {
4e10a5a7 266 case E_BLKmode:
738f2522 267 return ".b8";
4e10a5a7 268 case E_BImode:
738f2522 269 return ".pred";
4e10a5a7 270 case E_QImode:
738f2522
BS
271 if (promote)
272 return ".u32";
273 else
274 return ".u8";
4e10a5a7 275 case E_HImode:
738f2522 276 return ".u16";
4e10a5a7 277 case E_SImode:
738f2522 278 return ".u32";
4e10a5a7 279 case E_DImode:
738f2522
BS
280 return ".u64";
281
4e10a5a7 282 case E_SFmode:
738f2522 283 return ".f32";
4e10a5a7 284 case E_DFmode:
738f2522
BS
285 return ".f64";
286
4e10a5a7 287 case E_V2SImode:
b98b34b7 288 return ".v2.u32";
4e10a5a7 289 case E_V2DImode:
3717fbe3 290 return ".v2.u64";
b98b34b7 291
738f2522
BS
292 default:
293 gcc_unreachable ();
294 }
295}
296
9a863523
NS
297/* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
7b8edc29 299
9a863523
NS
300static void
301nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 302{
9a863523
NS
303 default_encode_section_info (decl, rtl, first);
304 if (first && MEM_P (rtl))
305 {
306 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 307
9a863523
NS
308 if (TREE_CONSTANT (decl))
309 area = DATA_AREA_CONST;
310 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
311 {
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
313 {
314 area = DATA_AREA_SHARED;
315 if (DECL_INITIAL (decl))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl);
318 }
319 else
320 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
321 }
7b8edc29 322
9a863523
NS
323 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
324 }
325}
326
327/* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_seciton_info, or equivalent. */
330
331static const char *
332section_for_sym (rtx sym)
333{
334 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
338
339 return areas[area];
340}
341
342/* Similarly for a decl. */
343
344static const char *
345section_for_decl (const_tree decl)
346{
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
348}
349
b4346107
NS
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct { const char *from, *to; } remap[]
    = {{"call", "__nvptx_call"},
       {"malloc", "__nvptx_malloc"},
       {"free", "__nvptx_free"},
       {"realloc", "__nvptx_realloc"}};

  for (unsigned ix = 0; ix < sizeof remap / sizeof remap[0]; ix++)
    if (strcmp (name, remap[ix].from) == 0)
      return remap[ix].to;

  return name;
}
370
d7479262
NS
371/* If MODE should be treated as two registers of an inner mode, return
372 that inner mode. Otherwise return VOIDmode. */
738f2522 373
d7479262
NS
374static machine_mode
375maybe_split_mode (machine_mode mode)
738f2522 376{
738f2522 377 if (COMPLEX_MODE_P (mode))
d7479262 378 return GET_MODE_INNER (mode);
738f2522 379
738f2522 380 if (mode == TImode)
d7479262
NS
381 return DImode;
382
383 return VOIDmode;
738f2522
BS
384}
385
38827811
TV
386/* Return true if mode should be treated as two registers. */
387
388static bool
389split_mode_p (machine_mode mode)
390{
391 return maybe_split_mode (mode) != VOIDmode;
392}
393
f313d112
NS
394/* Output a register, subreg, or register pair (with optional
395 enclosing braces). */
396
397static void
398output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
399 int subreg_offset = -1)
400{
401 if (inner_mode == VOIDmode)
402 {
403 if (HARD_REGISTER_NUM_P (regno))
404 fprintf (file, "%s", reg_names[regno]);
405 else
406 fprintf (file, "%%r%d", regno);
407 }
408 else if (subreg_offset >= 0)
409 {
410 output_reg (file, regno, VOIDmode);
411 fprintf (file, "$%d", subreg_offset);
412 }
413 else
414 {
415 if (subreg_offset == -1)
416 fprintf (file, "{");
417 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
418 fprintf (file, ",");
419 output_reg (file, regno, inner_mode, 0);
420 if (subreg_offset == -1)
421 fprintf (file, "}");
422 }
423}
424
d88cd9c4
NS
425/* Emit forking instructions for MASK. */
426
427static void
428nvptx_emit_forking (unsigned mask, bool is_call)
429{
430 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
431 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
432 if (mask)
433 {
434 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
435
436 /* Emit fork at all levels. This helps form SESE regions, as
437 it creates a block with a single successor before entering a
438 partitooned region. That is a good candidate for the end of
439 an SESE region. */
e91eba31 440 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
441 emit_insn (gen_nvptx_forked (op));
442 }
443}
444
445/* Emit joining instructions for MASK. */
446
447static void
448nvptx_emit_joining (unsigned mask, bool is_call)
449{
450 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
452 if (mask)
453 {
454 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
455
456 /* Emit joining for all non-call pars to ensure there's a single
457 predecessor for the block the join insn ends up in. This is
458 needed for skipping entire loops. */
e91eba31 459 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
460 emit_insn (gen_nvptx_join (op));
461 }
462}
463
738f2522 464\f
44eba92d
NS
465/* Determine whether MODE and TYPE (possibly NULL) should be passed or
466 returned in memory. Integer and floating types supported by the
467 machine are passed in registers, everything else is passed in
468 memory. Complex types are split. */
469
470static bool
471pass_in_memory (machine_mode mode, const_tree type, bool for_return)
472{
473 if (type)
474 {
475 if (AGGREGATE_TYPE_P (type))
476 return true;
477 if (TREE_CODE (type) == VECTOR_TYPE)
478 return true;
479 }
480
481 if (!for_return && COMPLEX_MODE_P (mode))
482 /* Complex types are passed as two underlying args. */
483 mode = GET_MODE_INNER (mode);
484
485 if (GET_MODE_CLASS (mode) != MODE_INT
486 && GET_MODE_CLASS (mode) != MODE_FLOAT)
487 return true;
488
489 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
490 return true;
491
492 return false;
493}
494
495/* A non-memory argument of mode MODE is being passed, determine the mode it
496 should be promoted to. This is also used for determining return
497 type promotion. */
498
499static machine_mode
500promote_arg (machine_mode mode, bool prototyped)
501{
502 if (!prototyped && mode == SFmode)
503 /* K&R float promotion for unprototyped functions. */
504 mode = DFmode;
505 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
506 mode = SImode;
507
508 return mode;
509}
510
511/* A non-memory return type of MODE is being returned. Determine the
512 mode it should be promoted to. */
513
514static machine_mode
515promote_return (machine_mode mode)
516{
517 return promote_arg (mode, true);
518}
519
e74f7152 520/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 521
e74f7152 522static rtx
b49e35a9 523nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
e74f7152
NS
524 const_tree, bool named)
525{
b49e35a9 526 if (mode == VOIDmode || !named)
e74f7152 527 return NULL_RTX;
738f2522 528
b49e35a9 529 return gen_reg_rtx (mode);
e74f7152
NS
530}
531
532/* Implement TARGET_FUNCTION_INCOMING_ARG. */
533
534static rtx
535nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
536 const_tree, bool named)
738f2522 537{
e74f7152 538 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 539
b49e35a9 540 if (mode == VOIDmode || !named)
e74f7152 541 return NULL_RTX;
738f2522 542
e74f7152
NS
543 /* No need to deal with split modes here, the only case that can
544 happen is complex modes and those are dealt with by
545 TARGET_SPLIT_COMPLEX_ARG. */
546 return gen_rtx_UNSPEC (mode,
547 gen_rtvec (1, GEN_INT (cum->count)),
548 UNSPEC_ARG_REG);
549}
550
551/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
552
553static void
554nvptx_function_arg_advance (cumulative_args_t cum_v,
555 machine_mode ARG_UNUSED (mode),
556 const_tree ARG_UNUSED (type),
557 bool ARG_UNUSED (named))
558{
559 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 560
e74f7152
NS
561 cum->count++;
562}
563
1988b2c2
NS
564/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
565
566 For nvptx This is only used for varadic args. The type has already
567 been promoted and/or converted to invisible reference. */
568
569static unsigned
570nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
571{
572 return GET_MODE_ALIGNMENT (mode);
573}
574
e74f7152
NS
575/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
576
577 For nvptx, we know how to handle functions declared as stdarg: by
578 passing an extra pointer to the unnamed arguments. However, the
579 Fortran frontend can produce a different situation, where a
580 function pointer is declared with no arguments, but the actual
581 function and calls to it take more arguments. In that case, we
582 want to ensure the call matches the definition of the function. */
583
584static bool
585nvptx_strict_argument_naming (cumulative_args_t cum_v)
586{
587 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 588
e74f7152
NS
589 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
590}
591
e74f7152
NS
592/* Implement TARGET_LIBCALL_VALUE. */
593
594static rtx
595nvptx_libcall_value (machine_mode mode, const_rtx)
596{
e91cacae 597 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
598 /* Pretend to return in a hard reg for early uses before pseudos can be
599 generated. */
600 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 601
e74f7152
NS
602 return gen_reg_rtx (mode);
603}
604
605/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
606 where function FUNC returns or receives a value of data type TYPE. */
607
608static rtx
44eba92d 609nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
610 bool outgoing)
611{
44eba92d
NS
612 machine_mode mode = promote_return (TYPE_MODE (type));
613
e74f7152 614 if (outgoing)
5c036f3f 615 {
e91cacae 616 gcc_assert (cfun);
44c068ae 617 cfun->machine->return_mode = mode;
5c036f3f
NS
618 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
619 }
b49e35a9
NS
620
621 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
622}
623
624/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
625
626static bool
627nvptx_function_value_regno_p (const unsigned int regno)
628{
629 return regno == NVPTX_RETURN_REGNUM;
630}
631
632/* Types with a mode other than those supported by the machine are passed by
633 reference in memory. */
634
635static bool
5563d5c0
NS
636nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
637 machine_mode mode, const_tree type,
638 bool ARG_UNUSED (named))
e74f7152 639{
44eba92d 640 return pass_in_memory (mode, type, false);
e74f7152
NS
641}
642
643/* Implement TARGET_RETURN_IN_MEMORY. */
644
645static bool
646nvptx_return_in_memory (const_tree type, const_tree)
647{
44eba92d 648 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
649}
650
651/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
652
653static machine_mode
654nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 655 int *ARG_UNUSED (punsignedp),
e74f7152
NS
656 const_tree funtype, int for_return)
657{
44eba92d 658 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
659}
660
e74f7152
NS
661/* Helper for write_arg. Emit a single PTX argument of MODE, either
662 in a prototype, or as copy in a function prologue. ARGNO is the
663 index of this argument in the PTX function. FOR_REG is negative,
664 if we're emitting the PTX prototype. It is zero if we're copying
665 to an argument register and it is greater than zero if we're
666 copying to a specific hard register. */
667
668static int
1f065954
NS
669write_arg_mode (std::stringstream &s, int for_reg, int argno,
670 machine_mode mode)
e74f7152
NS
671{
672 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
673
dc3d2aeb
NS
674 if (for_reg < 0)
675 {
676 /* Writing PTX prototype. */
677 s << (argno ? ", " : " (");
e74f7152 678 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
679 }
680 else
681 {
e74f7152 682 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
683 if (for_reg)
684 s << reg_names[for_reg];
685 else
686 s << "%ar" << argno;
687 s << ";\n";
5563d5c0
NS
688 if (argno >= 0)
689 {
690 s << "\tld.param" << ptx_type << " ";
691 if (for_reg)
692 s << reg_names[for_reg];
693 else
694 s << "%ar" << argno;
695 s << ", [%in_ar" << argno << "];\n";
696 }
dc3d2aeb
NS
697 }
698 return argno + 1;
738f2522
BS
699}
700
e74f7152 701/* Process function parameter TYPE to emit one or more PTX
1f065954 702 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
703 is true, if this is a prototyped function, rather than an old-style
704 C declaration. Returns the next argument number to use.
e74f7152 705
9c582551 706 The promotion behavior here must match the regular GCC function
e74f7152
NS
707 parameter marshalling machinery. */
708
709static int
1f065954
NS
710write_arg_type (std::stringstream &s, int for_reg, int argno,
711 tree type, bool prototyped)
e74f7152
NS
712{
713 machine_mode mode = TYPE_MODE (type);
714
715 if (mode == VOIDmode)
716 return argno;
717
44eba92d 718 if (pass_in_memory (mode, type, false))
e74f7152 719 mode = Pmode;
44eba92d
NS
720 else
721 {
722 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 723
44eba92d
NS
724 if (split)
725 {
726 /* Complex types are sent as two separate args. */
727 type = TREE_TYPE (type);
5563d5c0 728 mode = TYPE_MODE (type);
44eba92d
NS
729 prototyped = true;
730 }
e74f7152 731
44eba92d
NS
732 mode = promote_arg (mode, prototyped);
733 if (split)
1f065954 734 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 735 }
e74f7152 736
1f065954
NS
737 return write_arg_mode (s, for_reg, argno, mode);
738}
739
740/* Emit a PTX return as a prototype or function prologue declaration
741 for MODE. */
742
743static void
744write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
745{
746 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
747 const char *pfx = "\t.reg";
748 const char *sfx = ";\n";
749
750 if (for_proto)
751 pfx = "(.param", sfx = "_out) ";
752
753 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
754}
755
44eba92d 756/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 757 or function prologue declaration. Returns true if return is via an
9c582551 758 additional pointer parameter. The promotion behavior here must
1f065954 759 match the regular GCC function return mashalling. */
44eba92d 760
0beb7c71 761static bool
1f065954 762write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
763{
764 machine_mode mode = TYPE_MODE (type);
0beb7c71 765
44eba92d
NS
766 if (mode == VOIDmode)
767 return false;
768
769 bool return_in_mem = pass_in_memory (mode, type, true);
770
771 if (return_in_mem)
0beb7c71 772 {
44eba92d
NS
773 if (for_proto)
774 return return_in_mem;
775
776 /* Named return values can cause us to return a pointer as well
777 as expect an argument for the return location. This is
778 optimization-level specific, so no caller can make use of
779 this data, but more importantly for us, we must ensure it
780 doesn't change the PTX prototype. */
44c068ae 781 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 782
44eba92d
NS
783 if (mode == VOIDmode)
784 return return_in_mem;
785
44c068ae 786 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 787 retval parameter. */
44c068ae 788 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
789 }
790 else
44eba92d
NS
791 mode = promote_return (mode);
792
1f065954 793 write_return_mode (s, for_proto, mode);
0beb7c71
NS
794
795 return return_in_mem;
796}
797
738f2522
BS
798/* Look for attributes in ATTRS that would indicate we must write a function
799 as a .entry kernel rather than a .func. Return true if one is found. */
800
801static bool
802write_as_kernel (tree attrs)
803{
804 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
805 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
806 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
807 /* For OpenMP target regions, the corresponding kernel entry is emitted from
808 write_omp_entry as a separate function. */
738f2522
BS
809}
810
69823d76
NS
/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ") << name << "\n";
}
823
824/* Emit a linker marker for a variable decl or defn. */
825
826static void
827write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
828{
829 fprintf (file, "\n// BEGIN%s VAR %s: ",
830 globalize ? " GLOBAL" : "",
831 is_defn ? "DEF" : "DECL");
832 assemble_name_raw (file, name);
833 fputs ("\n", file);
834}
835
b699adcc
NS
836/* Write a .func or .kernel declaration or definition along with
837 a helper comment for use by ld. S is the stream to write to, DECL
838 the decl for the function with name NAME. For definitions, emit
839 a declaration too. */
738f2522 840
b699adcc
NS
841static const char *
842write_fn_proto (std::stringstream &s, bool is_defn,
843 const char *name, const_tree decl)
738f2522 844{
b699adcc
NS
845 if (is_defn)
846 /* Emit a declaration. The PTX assembler gets upset without it. */
847 name = write_fn_proto (s, false, name, decl);
b4346107
NS
848 else
849 {
850 /* Avoid repeating the name replacement. */
851 name = nvptx_name_replacement (name);
852 if (name[0] == '*')
853 name++;
854 }
738f2522 855
69823d76 856 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
857
858 /* PTX declaration. */
738f2522
BS
859 if (DECL_EXTERNAL (decl))
860 s << ".extern ";
861 else if (TREE_PUBLIC (decl))
0766660b 862 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 863 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 864
b699adcc
NS
865 tree fntype = TREE_TYPE (decl);
866 tree result_type = TREE_TYPE (fntype);
738f2522 867
ac3d02e2
NS
868 /* atomic_compare_exchange_$n builtins have an exceptional calling
869 convention. */
870 int not_atomic_weak_arg = -1;
871 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
872 switch (DECL_FUNCTION_CODE (decl))
873 {
874 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
875 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
876 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
877 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
878 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
879 /* These atomics skip the 'weak' parm in an actual library
880 call. We must skip it in the prototype too. */
881 not_atomic_weak_arg = 3;
882 break;
883
884 default:
885 break;
886 }
887
738f2522 888 /* Declare the result. */
1f065954 889 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 890
b699adcc
NS
891 s << name;
892
dc3d2aeb 893 int argno = 0;
b699adcc
NS
894
895 /* Emit argument list. */
896 if (return_in_mem)
1f065954 897 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 898
b699adcc
NS
899 /* We get:
900 NULL in TYPE_ARG_TYPES, for old-style functions
901 NULL in DECL_ARGUMENTS, for builtin functions without another
902 declaration.
903 So we have to pick the best one we have. */
904 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
905 bool prototyped = true;
906 if (!args)
907 {
908 args = DECL_ARGUMENTS (decl);
909 prototyped = false;
910 }
738f2522 911
ac3d02e2 912 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 913 {
dc3d2aeb 914 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
915
916 if (not_atomic_weak_arg)
917 argno = write_arg_type (s, -1, argno, type, prototyped);
918 else
919 gcc_assert (type == boolean_type_node);
738f2522 920 }
738f2522 921
b699adcc 922 if (stdarg_p (fntype))
1f065954 923 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 924
b699adcc 925 if (DECL_STATIC_CHAIN (decl))
1f065954 926 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 927
dc3d2aeb 928 if (!argno && strcmp (name, "main") == 0)
b699adcc 929 {
1f065954
NS
930 argno = write_arg_type (s, -1, argno, integer_type_node, true);
931 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
932 }
933
dc3d2aeb 934 if (argno)
b699adcc
NS
935 s << ")";
936
937 s << (is_defn ? "\n" : ";\n");
938
939 return name;
738f2522
BS
940}
941
00e52418
NS
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      /* Indirect call: emit an anonymous .callprototype.  */
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      /* Named libcall: remap the name if needed and declare it extern.  */
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  /* Element 0 of PAT is the call itself; the remaining elements wrap
     the argument registers.  */
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  /* NOTE(review): the matching '(' is presumably emitted by
     write_arg_mode for the first argument — confirm against its
     definition; hence ')' only when at least one arg was written.  */
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
982
00e52418
NS
/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      /* First time we see DECL: remember it and emit its prototype
	 into the deferred function-declarations stream.  */
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}
998
00e52418
NS
/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      /* First sighting: record it and derive the prototype from the
	 call insn PAT, since no decl exists for libcalls.  */
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}
1015
/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      /* Unprototyped: defer; the argument list may become known
	 later in compilation.  */
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}
738f2522 1032
00e52418
NS
1033/* SYM is a SYMBOL_REF. If it refers to an external function, record
1034 it as needed. */
1035
1036static void
1037nvptx_maybe_record_fnsym (rtx sym)
1038{
1039 tree decl = SYMBOL_REF_DECL (sym);
1040
1041 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1042 nvptx_record_needed_fndecl (decl);
738f2522
BS
1043}
1044
1f065954 1045/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
1046 and initialize REGNO to point to it. If the size is zero, it'll
1047 never be valid to dereference, so we can simply initialize to
1048 zero. */
1f065954
NS
1049
1050static void
1051init_frame (FILE *file, int regno, unsigned align, unsigned size)
1052{
5563d5c0
NS
1053 if (size)
1054 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1055 align, reg_names[regno], size);
1056 fprintf (file, "\t.reg.u%d %s;\n",
1057 POINTER_SIZE, reg_names[regno]);
1058 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1059 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
1060 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1061}
1062
5012919d
AM
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  /* Declare the four soft-stack registers, then compute them inside a
     scoped block using temporaries.  */
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  /* %fstmp1 = tid.y * sizeof (pointer) — byte offset into the
     per-warp stack-pointer array.  */
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Publish the new stack pointer so callees can find it; a leaf
     function makes no calls, so the store can be skipped.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
1119
d88cd9c4
NS
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  /* For the "x" axis, also set up the vector reduction buffer pointer
     when one was allocated.  */
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  /* Predicate is true iff tid.NAME != 0.  */
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      /* red_partition = __vector_red + tid.y * vector_red_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}
1149
0024c320
TV
/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      /* bcast_partition = __oacc_bcast
	 + (tid.y + 1) * oacc_bcast_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  /* Barrier number tid.y + 1; NOTE(review): presumably barrier 0 is
     reserved for CTA-wide use — confirm against barrier emission.  */
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}
1183
5012919d
AM
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* %rLOC = &__nvptx_uni[tid.y] (4 bytes per entry).  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
1216
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* Declare gomp_nvptx_main once per translation unit.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
  /* The template contains an embedded '\0' at the point where ORIG
     must be spliced in; it is split into two C strings at that nul
     below.  */
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  /* The trailing space in "mad.lo.u32 " keeps entry32 exactly the
     same length as entry64, so strlen (entry64) below locates the
     embedded nul in either template.  */
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
1282
738f2522
BS
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* An OpenMP target entrypoint gets a wrapper kernel; the real body
     is emitted under the NAME$impl alias.  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    /* Aggregate return: the pointer to the return slot is arg 0.  */
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  /* Variadic functions get a trailing pointer to the overflow area.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  /* Declare the static chain register if this function has one, or if
     any call in its body needed one.  */
  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  /* Per-lane SIMT stack storage; size is "unknown" (all of
     nvptx_softstack_size) unless this is a leaf without alloca.  */
  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
1420
0c6b03b5
AM
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* Nothing to do when neither the shared flag nor the local
     predicate is in use.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  /* -1 (all-ones mask) while inside the non-uniform region, 0 when
     uniform.  */
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      /* Publish the new state to __nvptx_uni[tid.y] for callees.  */
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* Recompute master lane and the tid.x == master predicate for
	 the new state.  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
1448
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* PTR = top of %simtstack_ar (generic address), minus the
	 DImode-rounded allocation size.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Mask PTR down when extra alignment is required, or always for
	 a runtime size (which was not pre-rounded).  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the old soft-stack pointer just below PTR, then make
	     the stack pointer point below that save slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Reload the saved soft-stack pointer from below PTR.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
1500
1501/* Output code to enter non-uniform execution region. DEST is a register
1502 to hold a per-lane allocation given by SIZE and ALIGN. */
1503
1504const char *
1505nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1506{
1507 nvptx_output_unisimt_switch (asm_out_file, true);
1508 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1509 return "";
1510}
1511
1512/* Output code to leave non-uniform execution region. SRC is the register
1513 holding per-lane storage previously allocated by omp_simt_enter insn. */
1514
1515const char *
1516nvptx_output_simt_exit (rtx src)
1517{
1518 nvptx_output_unisimt_switch (asm_out_file, false);
1519 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1520 return "";
1521}
1522
5012919d
AM
1523/* Output instruction that sets soft stack pointer in shared memory to the
1524 value in register given by SRC_REGNO. */
1525
1526const char *
1527nvptx_output_set_softstack (unsigned src_regno)
1528{
1529 if (cfun->machine->has_softstack && !crtl->is_leaf)
1530 {
1531 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1532 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1533 output_reg (asm_out_file, src_regno, VOIDmode);
1534 fprintf (asm_out_file, ";\n");
1535 }
1536 return "";
1537}
738f2522
BS
1538/* Output a return instruction. Also copy the return value to its outgoing
1539 location. */
1540
1541const char *
1542nvptx_output_return (void)
1543{
44c068ae 1544 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1545
1546 if (mode != VOIDmode)
1f065954
NS
1547 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1548 nvptx_ptx_type_from_mode (mode, false),
1549 reg_names[NVPTX_RETURN_REGNUM],
1550 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1551
1552 return "ret;";
1553}
1554
738f2522
BS
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1562\f
1563/* Decide whether we can make a sibling call to a function. For ptx, we
1564 can't. */
1565
1566static bool
1567nvptx_function_ok_for_sibcall (tree, tree)
1568{
1569 return false;
1570}
1571
18c05628
NS
1572/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1573
1574static rtx
1575nvptx_get_drap_rtx (void)
1576{
5012919d
AM
1577 if (TARGET_SOFT_STACK && stack_realign_drap)
1578 return arg_pointer_rtx;
18c05628
NS
1579 return NULL_RTX;
1580}
1581
738f2522
BS
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      /* First argument of a new call sequence: reset per-call state.  */
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      /* A variadic call passes one extra (varargs-area pointer)
	 argument; account for it up front.  */
      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  /* Only register arguments are recorded; pc_rtx is the hook's
     end-marker sentinel.  */
  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}
1609
1610/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1611 information we recorded. */
1612
1613static void
1614nvptx_end_call_args (void)
1615{
44c068ae 1616 cfun->machine->doing_call = false;
738f2522
BS
1617 free_EXPR_LIST_list (&cfun->machine->call_args);
1618}
1619
ecf6e535
BS
1620/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1621 track of whether calls involving static chains or varargs were seen
1622 in the current function.
1623 For libcalls, maintain a hash table of decls we have seen, and
1624 record a function decl for later when encountering a new one. */
738f2522
BS
1625
1626void
1627nvptx_expand_call (rtx retval, rtx address)
1628{
738f2522 1629 rtx callee = XEXP (address, 0);
f324806d 1630 rtx varargs = NULL_RTX;
d88cd9c4 1631 unsigned parallel = 0;
738f2522 1632
738f2522
BS
1633 if (!call_insn_operand (callee, Pmode))
1634 {
1635 callee = force_reg (Pmode, callee);
1636 address = change_address (address, QImode, callee);
1637 }
1638
1639 if (GET_CODE (callee) == SYMBOL_REF)
1640 {
1641 tree decl = SYMBOL_REF_DECL (callee);
1642 if (decl != NULL_TREE)
1643 {
738f2522 1644 if (DECL_STATIC_CHAIN (decl))
44c068ae 1645 cfun->machine->has_chain = true;
00e52418 1646
629b3d75 1647 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1648 if (attr)
1649 {
1650 tree dims = TREE_VALUE (attr);
1651
1652 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1653 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1654 {
1655 if (TREE_PURPOSE (dims)
1656 && !integer_zerop (TREE_PURPOSE (dims)))
1657 break;
1658 /* Not on this axis. */
1659 parallel ^= GOMP_DIM_MASK (ix);
1660 dims = TREE_CHAIN (dims);
1661 }
1662 }
738f2522
BS
1663 }
1664 }
c38f0d8c 1665
44c068ae
NS
1666 unsigned nargs = cfun->machine->num_args;
1667 if (cfun->machine->is_varadic)
738f2522 1668 {
f324806d 1669 varargs = gen_reg_rtx (Pmode);
863af9a4 1670 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1671 }
1672
44c068ae
NS
1673 rtvec vec = rtvec_alloc (nargs + 1);
1674 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1675 int vec_pos = 0;
44c068ae
NS
1676
1677 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1678 rtx tmp_retval = retval;
44c068ae 1679 if (retval)
738f2522
BS
1680 {
1681 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1682 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1683 call = gen_rtx_SET (tmp_retval, call);
738f2522 1684 }
44c068ae 1685 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1686
1687 /* Construct the call insn, including a USE for each argument pseudo
1688 register. These will be used when printing the insn. */
1689 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1690 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1691
1692 if (varargs)
cf08c344 1693 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1694
1695 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1696
d88cd9c4 1697 nvptx_emit_forking (parallel, true);
738f2522 1698 emit_call_insn (pat);
d88cd9c4
NS
1699 nvptx_emit_joining (parallel, true);
1700
738f2522
BS
1701 if (tmp_retval != retval)
1702 emit_move_insn (retval, tmp_retval);
1703}
44eba92d 1704
738f2522
BS
1705/* Emit a comparison COMPARE, and return the new test to be used in the
1706 jump. */
1707
1708rtx
1709nvptx_expand_compare (rtx compare)
1710{
1711 rtx pred = gen_reg_rtx (BImode);
1712 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1713 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1714 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1715 return gen_rtx_NE (BImode, pred, const0_rtx);
1716}
1717
d88cd9c4
NS
1718/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1719
1720void
1721nvptx_expand_oacc_fork (unsigned mode)
1722{
1723 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1724}
1725
1726void
1727nvptx_expand_oacc_join (unsigned mode)
1728{
1729 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1730}
1731
1732/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1733 objects. */
1734
1735static rtx
1736nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1737{
1738 rtx res;
1739
1740 switch (GET_MODE (src))
1741 {
4e10a5a7 1742 case E_DImode:
d88cd9c4
NS
1743 res = gen_unpackdisi2 (dst0, dst1, src);
1744 break;
4e10a5a7 1745 case E_DFmode:
d88cd9c4
NS
1746 res = gen_unpackdfsi2 (dst0, dst1, src);
1747 break;
1748 default: gcc_unreachable ();
1749 }
1750 return res;
1751}
1752
1753/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1754 object. */
1755
1756static rtx
1757nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1758{
1759 rtx res;
1760
1761 switch (GET_MODE (dst))
1762 {
4e10a5a7 1763 case E_DImode:
d88cd9c4
NS
1764 res = gen_packsidi2 (dst, src0, src1);
1765 break;
4e10a5a7 1766 case E_DFmode:
d88cd9c4
NS
1767 res = gen_packsidf2 (dst, src0, src1);
1768 break;
1769 default: gcc_unreachable ();
1770 }
1771 return res;
1772}
1773
/* Generate an instruction or sequence shuffling SRC into DST across
   the lanes of a warp, according to shuffle kind KIND and lane
   selector IDX.  Modes wider or narrower than SImode/SFmode are
   handled by recursing on 32-bit pieces.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	/* 64-bit value: unpack into two SImode halves, shuffle each,
	   repack.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	/* Predicate: widen to 0/1 in SImode, shuffle, compare back.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	/* Sub-word value: zero-extend to SImode, shuffle, truncate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
1837
1838/* Generate an instruction or sequence to broadcast register REG
1839 across the vectors of a single warp. */
1840
1841static rtx
a0b3b5c4 1842nvptx_gen_warp_bcast (rtx reg)
d88cd9c4
NS
1843{
1844 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1845}
1846
/* Structure used when generating a worker-level spill or fill.  */

struct broadcast_data_t
{
  rtx base; /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};
1855
/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Spill: store register to buffer.  */
    PM_write = 1 << 1,		/* Fill: load register from buffer.  */
    PM_loop_begin = 1 << 2,	/* Emit loop setup.  */
    PM_loop_end = 1 << 3,	/* Emit loop teardown.  */

    PM_read_write = PM_read | PM_write
  };
1867
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* Predicates can't be stored directly: widen to 0/1 in
	   SImode, recurse, and compare back on the fill side.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address is base + aligned static
	       offset; bump the buffer's required alignment.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	/* Advance the static offset past REP copies of REG.  */
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
738f2522
BS
1939\f
1940/* Returns true if X is a valid address for use in a memory reference. */
1941
1942static bool
1943nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1944{
1945 enum rtx_code code = GET_CODE (x);
1946
1947 switch (code)
1948 {
1949 case REG:
1950 return true;
1951
1952 case PLUS:
1953 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1954 return true;
1955 return false;
1956
1957 case CONST:
1958 case SYMBOL_REF:
1959 case LABEL_REF:
1960 return true;
1961
1962 default:
1963 return false;
1964 }
1965}
738f2522 1966\f
4d5438cd
NS
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /*  Remaining bytes to be written
				        out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
1982
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  Resets the fragment accumulator for the next
   piece and decrements the count of fragments still expected.  */

static void
output_init_frag (rtx sym)
{
  /* Open the initializer list on the first fragment, separate with
     commas thereafter.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      /* Data symbols need a generic() conversion; function symbols
	 are emitted bare.  */
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
2014
4d5438cd
NS
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  SIZE is in bytes and may span
   several fragments.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Mask off bits beyond SIZE bytes; the '2 << (bits - 1)' form avoids
     undefined behavior when SIZE is the full HOST_WIDE_INT width.  */
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  /* PART is how many bytes went into the fragment last iteration
     (0 on entry); shift consumed bytes off and pack the rest.  */
  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
2038
/* Target hook for assembling integer object X of size SIZE.  Handles
   plain integers and (symbol + offset) references; returns false to
   defer anything else to the generic machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* Peel (const (plus sym ofs)) down to the SYMBOL_REF, keeping
	 the addend in VAL for the fallthrough case.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      /* A pointer must exactly fill one fragment, aligned on a
	 fragment boundary.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
2078
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  (If it does terminate it,
     the trailing zeros are implicit and nothing need be emitted.)  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      if (size)
	nvptx_assemble_value (0, size);
    }
}
2108
2109/* Output a string STR with length SIZE. As in nvptx_output_skip we
2110 ignore the FILE arg. */
2111
2112void
2113nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2114{
2115 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2116 nvptx_assemble_value (str[i], 1);
2117}
2118
bf398920
TV
2119/* Return true if TYPE is a record type where the last field is an array without
2120 given dimension. */
2121
2122static bool
2123flexible_array_member_type_p (const_tree type)
2124{
2125 if (TREE_CODE (type) != RECORD_TYPE)
2126 return false;
2127
2128 const_tree last_field = NULL_TREE;
2129 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2130 last_field = f;
2131
2132 if (!last_field)
2133 return false;
2134
2135 const_tree last_field_type = TREE_TYPE (last_field);
2136 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2137 return false;
2138
2139 return (! TYPE_DOMAIN (last_field_type)
2140 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2141}
2142
4ff3145a
NS
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!
   UNDEFINED is true for a .extern declaration of an object defined
   elsewhere (flexible-array-member records then get size 0 and an
   open dimension).  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  /* Strip array dimensions down to the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
2207
738f2522
BS
2208/* Called when the initializer for a decl has been completely output through
2209 combinations of the three functions above. */
2210
2211static void
2212nvptx_assemble_decl_end (void)
2213{
4d5438cd
NS
2214 if (init_frag.offset)
2215 /* This can happen with a packed struct with trailing array member. */
2216 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2217 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2218}
2219
69823d76
NS
/* Output an uninitialized common or file-scope variable.  NAME/DECL
   identify it; SIZE is in bytes and ALIGN in bits.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  /* No initializer: close the declaration immediately.  */
  nvptx_assemble_decl_end ();
}
2236
738f2522
BS
2237/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2238 writing a constant variable EXP with NAME and SIZE and its
2239 initializer to FILE. */
2240
2241static void
2242nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 2243 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 2244{
4ff3145a
NS
2245 write_var_marker (file, true, false, name);
2246
2247 fprintf (file, "\t");
2248
738f2522 2249 tree type = TREE_TYPE (exp);
4ff3145a
NS
2250 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2251 TYPE_ALIGN (type));
738f2522
BS
2252}
2253
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE; the caller emits any initializer
   and the terminator.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* Linkage: static decls get nothing, weak decls .weak, other public
     decls .visible.  */
  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
2270
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Visibility
   is handled when the declaration itself is written out.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2277
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  /* Pass undefined=true so flexible-array-member records get an open
     dimension.  */
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
2304
f313d112
NS
/* Output a pattern for a move instruction.  Returns the asm template
   string (with %-operands) for moving SRC into DST.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* NOTE(review): the non-SUBREG fallback here is dst_mode, not
     GET_MODE (src); presumably intentional since both mov operands
     share a mode -- confirm.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Symbolic sources from a non-generic data area need an address
     space conversion (cvta).  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  /* Different sizes: emit a conversion.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2352
5012919d
AM
static void nvptx_print_operand (FILE *, rtx, int);

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  Returns the closing
   template (which loads the return value, if any).  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    /* Declare the .param that will receive the return value.  */
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      /* Indirect call: emit a .callprototype under a local label.  */
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Copy each argument into its own .param.  */
  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  /* Argument list; OPEN holds the "(" until the first argument is
     printed, so a call with no arguments gets no parens.  */
  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      /* Reference the prototype label emitted above.  */
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* No return functions confuse the PTX JIT, as it doesn't realize
	 the flow control barrier they imply.  It can seg fault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
2471
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' (predicate)
   and '#' (rounding mode) are recognized punctuation codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2479
738f2522
BS
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.
   Emits "base+offset" explicitly so negative addends print as "x+-8"
   rather than "x-8".  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      /* Addresses never contain a bare MEM.  */
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}
2508
/* Write assembly language output for the address ADDR to FILE.
   Thin wrapper forwarding to nvptx_print_address_operand.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
2516
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
	unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   H -- print the high half of a register holding a split mode
   j -- print "@" (predicate prefix), then the operand
   J -- print "@!" (negated predicate prefix), then the operand
   L -- print the low half of a register holding a split mode
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* Current insn predicate, if any: "@%r" or "@!%r".  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'A':
      x = XEXP (x, 0);
      /* FALLTHROUGH.  */

    case 'D':
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  /* Pick the effective mode for a subreg'd operand.  */
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      {
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Append the comparison's operand type suffix.  */
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		     && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i != 0)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
2769\f
/* Record replacement regs used to deal with subreg operands.
   One instance exists per integer mode; registers are reused
   across insns (n_in_use is reset, n_allocated only grows).  */
struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS]; /* Pool of replacement pseudos.  */
  machine_mode mode;		       /* Mode of the replacements.  */
  int n_allocated;		       /* Pseudos created so far.  */
  int n_in_use;			       /* Pseudos handed out for this insn.  */
};
2778
2779/* Allocate or reuse a replacement in R and return the rtx. */
2780
2781static rtx
2782get_replacement (struct reg_replace *r)
2783{
2784 if (r->n_allocated == r->n_in_use)
2785 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2786 return r->replacement[r->n_in_use++];
2787}
2788
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip non-insns and insns whose operands we must not touch.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Replacement pseudos are reused from insn to insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* Non-narrowing subregs are acceptable as-is.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* Copy-in before the insn for inputs and in-outs.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* Copy-out after the insn for outputs and in-outs.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
738f2522 2878
5012919d
AM
2879/* Return a SImode "master lane index" register for uniform-simt, allocating on
2880 first use. */
2881
2882static rtx
2883nvptx_get_unisimt_master ()
2884{
2885 rtx &master = cfun->machine->unisimt_master;
2886 return master ? master : master = gen_reg_rtx (SImode);
2887}
2888
2889/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2890
2891static rtx
2892nvptx_get_unisimt_predicate ()
2893{
2894 rtx &pred = cfun->machine->unisimt_predicate;
2895 return pred ? pred : pred = gen_reg_rtx (BImode);
2896}
2897
/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  /* Dig the callee address out of (parallel [(set ... (call (mem ...)))
     ...]) or (parallel [(call (mem ...)) ...]).  */
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
     references with forced assembler name refer to PTX syscalls.  For vprintf,
     accept both normal and forced-assembler-name references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}
2922
2923/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2924 propagate its value from lane MASTER to current lane. */
2925
2926static void
2927nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2928{
2929 rtx reg;
2930 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2931 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2932}
2933
/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      /* Remember the successor up front: shuffles are emitted after INSN
	 and must not be re-visited by this walk.  */
      next = NEXT_INSN (insn);
      /* Only transform runtime "syscalls" and insns carrying the 'atomic'
	 machine attribute; skip everything else.  */
      if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	  && !(NONJUMP_INSN_P (insn)
	       && GET_CODE (PATTERN (insn)) == PARALLEL
	       && get_attr_atomic (insn)))
	continue;
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      /* Propagate every register set by this insn from the master lane to
	 the other lanes via shuffles inserted after INSN.  */
      for (int i = 0; i < XVECLEN (pat, 0); i++)
	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
      rtx pred = nvptx_get_unisimt_predicate ();
      pred = gen_rtx_NE (BImode, pred, const0_rtx);
      /* Wrap the insn in a COND_EXEC so it runs only where the uniform-simt
	 predicate is set (presumably the master lane -- see the predicate's
	 initialization elsewhere in this file).  */
      pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
      validate_change (insn, &PATTERN (insn), pat, false);
    }
}
2961
2498a6ab
TV
/* Offloading function attributes.  */

struct offload_attrs
{
  /* Mask of GOMP_DIM_MASK bits for the partitioned execution axes.  */
  unsigned mask;
  /* Launch geometry; a value of 0 appears to mean "not fixed at compile
     time" (init_axis_dim treats num_workers == 0 that way).  */
  int num_gangs;
  int num_workers;
  int vector_length;
};
2971
0d06e8c7
TV
2972/* Define entries for cfun->machine->axis_dim. */
2973
2974#define MACH_VECTOR_LENGTH 0
2975#define MACH_MAX_WORKERS 1
2976
2977static void populate_offload_attrs (offload_attrs *oa);
2978
/* Populate cfun->machine->axis_dim from the function's offload attributes:
   the vector length and the maximum possible number of workers.  */

static void
init_axis_dim (void)
{
  offload_attrs oa;
  int max_workers;

  populate_offload_attrs (&oa);

  /* When the worker count is not fixed, the most workers a CTA can hold
     is its size divided by the vector length.  */
  if (oa.num_workers == 0)
    max_workers = PTX_CTA_SIZE / oa.vector_length;
  else
    max_workers = oa.num_workers;

  cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
  cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
  cfun->machine->axis_dim_init_p = true;
}
2996
/* Return the maximum number of workers for the current function,
   computing the cached axis dimensions on first use.  */

static int ATTRIBUTE_UNUSED
nvptx_mach_max_workers ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_MAX_WORKERS];
}
3004
/* Return the vector length for the current function, computing the
   cached axis dimensions on first use.  */

static int ATTRIBUTE_UNUSED
nvptx_mach_vector_length ()
{
  if (!cfun->machine->axis_dim_init_p)
    init_axis_dim ();
  return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
}
3012
d2d47a28
NS
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after the
     partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* The forked/join marker insns heading FORKED_BLOCK/JOIN_BLOCK.  */
  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  /* The preceding fork/joining marker insns, when discovered (only set
     for partitionings that need shared-memory broadcast).  */
  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
3054
/* Constructor links the new parallel into its parent's chain of
   children.  */

parallel::parallel (parallel *parent_, unsigned mask_)
  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
  forked_block = join_block = 0;
  forked_insn = join_insn = 0;
  fork_insn = joining_insn = 0;

  /* Push this parallel onto the front of the parent's child list.  */
  if (parent)
    {
      next = parent->inner;
      parent->inner = this;
    }
}
3071
/* Destructor recursively deletes all children and all following
   siblings.  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}
3077
3078/* Map of basic blocks to insns */
3079typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3080
3081/* A tuple of an insn of interest and the BB in which it resides. */
3082typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3083typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3084
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* Not an insn of interest; it merely means any later
		 marker in this block is not first.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that  must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      /* Several worklist entries can share a block; restart from the
	 original block only when it changes, so successive splits of
	 the same block chain correctly.  */
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn. The insn is in the new block  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
3159
0024c320
TV
3160/* Return true if MASK contains parallelism that requires shared
3161 memory to broadcast. */
3162
3163static bool
3164nvptx_needs_shared_bcast (unsigned mask)
3165{
3166 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3167 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3168 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3169
3170 return worker || large_vector;
3171}
3172
d88cd9c4
NS
/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  EXPECTED is the insn code the
   located instruction must have.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  /* Walk backwards from the end of the predecessor to its last real
     insn; it must exist before we reach the block head.  */
  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}
3191
/* Dump this parallel and all its inner parallels to dump_file.  DEPTH
   is the nesting depth, used only for labelling the output.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	   depth, par->mask,
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, " blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");
  /* Children are dumped one level deeper; siblings at the same depth.  */
  if (par->inner)
    nvptx_dump_pars (par->inner, depth + 1);

  if (par->next)
    nvptx_dump_pars (par->next, depth);
}
3214
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  MAP is the block->marker-insn map
   built by nvptx_split_blocks; PAR is the parallel currently open.
   Returns the (possibly new) current parallel.  */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  /* Depth-first walk; each block is processed exactly once.  */
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (par->mask == mask);
	    par->join_block = block;
	    par->join_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
3291
/* DFS walk the CFG looking for fork & join markers.  Construct
   loop structures as we go.  MAP is a mapping of basic blocks
   to head & tail markers, discovered when splitting blocks.  This
   speeds up the discovery.  We rely on the BB visited flag having
   been cleared when splitting blocks.  Returns the root (NULL-mask)
   parallel describing the whole function.  */

static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited, so the walk never descends into them.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  parallel *par = nvptx_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      nvptx_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}
3322
912442c2
NS
3323/* Analyse a group of BBs within a partitioned region and create N
3324 Single-Entry-Single-Exit regions. Some of those regions will be
3325 trivial ones consisting of a single BB. The blocks of a
3326 partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.
3328
3329 We use the linear time algorithm described in 'Finding Regions Fast:
3330 Single Entry Single Exit and control Regions in Linear Time'
3331 Johnson, Pearson & Pingali. That algorithm deals with complete
3332 CFGs, where a back edge is inserted from END to START, and thus the
3333 problem becomes one of finding equivalent loops.
3334
3335 In this case we have a partial CFG. We complete it by redirecting
3336 any incoming edge to the graph to be from an arbitrary external BB,
3337 and similarly redirecting any outgoing edge to be to that BB.
3338 Thus we end up with a closed graph.
3339
3340 The algorithm works by building a spanning tree of an undirected
3341 graph and keeping track of back edges from nodes further from the
3342 root in the tree to nodes nearer to the root in the tree. In the
3343 description below, the root is up and the tree grows downwards.
3344
3345 We avoid having to deal with degenerate back-edges to the same
3346 block, by splitting each BB into 3 -- one for input edges, one for
3347 the node itself and one for the output edges. Such back edges are
3348 referred to as 'Brackets'. Cycle equivalent nodes will have the
3349 same set of brackets.
3350
3351 Determining bracket equivalency is done by maintaining a list of
3352 brackets in such a manner that the list length and final bracket
3353 uniquely identify the set.
3354
3355 We use coloring to mark all BBs with cycle equivalency with the
3356 same color. This is the output of the 'Finding Regions Fast'
3357 algorithm. Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
3359 entries and exits of SESE regions.
3360
3361 After determining cycle equivalency, we need to find the minimal
3362 set of SESE regions. Do this with a DFS coloring walk of the
3363 complete graph. We're either 'looking' or 'coloring'. When
3364 looking, and we're in the subgraph, we start coloring the color of
3365 the current node, and remember that node as the start of the
3366 current color's SESE region. Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
3368 we remember that node as the end of the current color's SESE region
3369 and return to 'looking'. Otherwise we color the node the current
3370 color.
3371
3372 This way we end up with coloring the inside of non-trivial SESE
3373 regions with the color of that region. */
3374
3375/* A pair of BBs. We use this to represent SESE regions. */
3376typedef std::pair<basic_block, basic_block> bb_pair_t;
3377typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3378
/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
3381typedef std::pair<basic_block, int> pseudo_node_t;
3382
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back; /* Back target */

  /* Current color and size of set.  Both start as ~0u, i.e. "not yet
     assigned".  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color.  A fresh color is allocated (appended
     to COLOR_COUNTS) whenever the bracket-set length LENGTH differs from
     the length recorded at the last assignment; the count of nodes with
     the returned color is incremented either way.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};
3411
3412typedef auto_vec<bracket> bracket_vec_t;
3413
/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent; /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color from the top bracket and current bracket count.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
3461
/* Out-of-line destructor; the bracket auto_vec releases itself.  */

bb_sese::~bb_sese ()
{
}
3465
/* Destructively append CHILD's brackets.  */

void
bb_sese::append (bb_sese *child)
{
  if (int len = child->brackets.length ())
    {
      int ix;

      if (dump_file)
	{
	  for (ix = 0; ix < len; ix++)
	    {
	      const pseudo_node_t &pseudo = child->brackets[ix].back;
	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
		       child->node, pseudo.first ? pseudo.first->index : 0,
		       pseudo.second);
	    }
	}
      /* If our list is empty, simply steal the child's vector wholesale;
	 otherwise copy the child's brackets onto the end of ours.  */
      if (!brackets.length ())
	std::swap (brackets, child->brackets);
      else
	{
	  brackets.reserve (len);
	  for (ix = 0; ix < len; ix++)
	    brackets.quick_push (child->brackets[ix]);
	}
    }
}
3495
/* Remove brackets that terminate at PSEUDO.  */

void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  /* Single pass: count matches and slide survivors left over them,
     then pop the now-unused tail entries.  */
  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	  removed++;
	}
      else if (removed)
	brackets[ix-removed] = brackets[ix];
    }
  while (removed--)
    brackets.pop ();
}
3519
3520/* Accessors for BB's aux pointer. */
3521#define BB_SET_SESE(B, S) ((B)->aux = (S))
3522#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3523
/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  N is the next node number, P the parent's
   number, DIR the arrival orientation.  Returns the next free node
   number.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  /* Already numbered in an earlier visit.  */
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator (ei);

      FOR_EACH_EDGE (e, ei, edges)
	{
	  /* Pick out the src or dest block of E, per OFFSET.  */
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}
3567
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  SESE is ME's region info and DEPTH the
   total numbering depth of the graph.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator (ei);
  /* Highest (lowest-numbered) backedge target seen, and the child with
     the highest reaching bracket.  Both default to DEPTH, i.e. "none".  */
  int hi_back = depth;
  pseudo_node_t node_back (0, depth);
  int hi_child = depth;
  pseudo_node_t node_child (0, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (0, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* back edge to entry node */
	  sese->push (pseudo_node_t (0, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (0, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child. */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child. */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
3733
3734
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  COLORING < 0 means "looking" (not currently inside a
   region); >= 0 is the color currently being painted.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      /* Decrement the count of nodes left with this color; the last
	 such node (of the color being painted) ends the region.  */
      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    gcc_assert (coloring < 0);
}
3791
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE info allocated during numbering.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
3938
3939#undef BB_SET_SESE
3940#undef BB_GET_SESE
3941
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  The frame (if any) is propagated one
     DImode word at a time via a small emitted loop.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);	/* Holds the word being copied.  */
      rtx idx = NULL_RTX;		/* Loop counter (word count).  */
      rtx ptr = gen_reg_rtx (Pmode);	/* Walks through the frame.  */
      rtx pred = NULL_RTX;		/* Loop-back condition.  */
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  A frame of one word needs no
	 loop control at all; fs == 0 below means "no loop".  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      /* Loop body: optionally load the word, let FN propagate it,
	 optionally store it back.  */
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  /* Allow worker function to clean up after the loop.  */
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      /* Clobber the temporaries so later passes know their values
	 are dead after the copy loop.  */
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  Only pseudos are propagated; hard
       registers are skipped.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
4044
a0b3b5c4 4045/* Worker for nvptx_warp_propagate. */
d88cd9c4
NS
4046
4047static rtx
a0b3b5c4
TV
4048warp_prop_gen (rtx reg, propagate_mask pm,
4049 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4050 bool ARG_UNUSED (vector))
d88cd9c4
NS
4051{
4052 if (!(pm & PM_read_write))
4053 return 0;
4054
a0b3b5c4 4055 return nvptx_gen_warp_bcast (reg);
d88cd9c4
NS
4056}
4057
4058/* Propagate state that is live at start of BLOCK across the vectors
e91eba31
NS
4059 of a single warp. Propagation is inserted just after INSN.
4060 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4061
e91eba31 4062static bool
a0b3b5c4 4063nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 4064{
a0b3b5c4
TV
4065 return nvptx_propagate (is_call, block, insn, PM_read_write,
4066 warp_prop_gen, 0, false);
d88cd9c4
NS
4067}
4068
/* Worker for nvptx_shared_propagate.  DATA_ points at a
   broadcast_data_t tracking the shared-memory buffer position.  At
   PM_loop_begin, set up the buffer pointer; at PM_loop_end, clobber
   it; otherwise emit the actual shared-memory broadcast for REG.  */

static rtx
shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
		 bool vector)
{
  broadcast_data_t *data = (broadcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.  Align the buffer offset
	 for REG's mode and record the worst-case alignment in the
	 global oacc_bcast_align.  */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      oacc_bcast_align = MAX (oacc_bcast_align, align);
      data->offset = ROUND_UP (data->offset, align);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      /* Loop done: mark the walking pointer dead.  */
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
4098
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.

   On return, the global broadcast-buffer bookkeeping
   (oacc_bcast_partition / oacc_bcast_size) has been grown to cover
   whatever this propagation used.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Nothing emitted iff no buffer space was consumed.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  /* Vector-level propagation with multiple workers uses a
	     per-partition buffer and barrier instead of the global
	     broadcast symbol.  */
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Grow the recorded broadcast buffer requirements: one
	 partition of PSIZE bytes, replicated PNUM times when large
	 vectors are in use.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
4150
0024c320
TV
4151/* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4152 which is an integer or a register. THREADS is the number of threads
4153 controlled by the barrier. */
d88cd9c4
NS
4154
4155static rtx
0024c320 4156nvptx_cta_sync (rtx lock, int threads)
d88cd9c4 4157{
0024c320 4158 return gen_nvptx_barsync (lock, GEN_INT (threads));
d88cd9c4
NS
4159}
4160
43c371e8
TV
4161#if WORKAROUND_PTXJIT_BUG
4162/* Return first real insn in BB, or return NULL_RTX if BB does not contain
4163 real insns. */
4164
4165static rtx_insn *
4166bb_first_real_insn (basic_block bb)
4167{
4168 rtx_insn *insn;
4169
4170 /* Find first insn of from block. */
4171 FOR_BB_INSNS (bb, insn)
4172 if (INSN_P (insn))
4173 return insn;
4174
4175 return 0;
4176}
4177#endif
4178
6beefdbd
TV
4179/* Return true if INSN needs neutering. */
4180
4181static bool
4182needs_neutering_p (rtx_insn *insn)
4183{
4184 if (!INSN_P (insn))
4185 return false;
4186
4187 switch (recog_memoized (insn))
4188 {
4189 case CODE_FOR_nvptx_fork:
4190 case CODE_FOR_nvptx_forked:
4191 case CODE_FOR_nvptx_joining:
4192 case CODE_FOR_nvptx_join:
4193 case CODE_FOR_nvptx_barsync:
4194 return false;
4195 default:
4196 return true;
4197 }
4198}
4199
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.
   Walk the insn stream from FROM's head, tracking which neutering
   jumps/labels have been passed, and assert that no bar.sync insn
   falls inside a neutered region, that the vector jump does not
   precede the worker jump, and that every jump handed in is actually
   found.  Return true iff a neutering label was seen during the
   walk.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  /* The worker jump must come before the vector jump.  */
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier must not sit inside a neutered region.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Fall through an unconditional jump until the neutering
	     jumps have been located.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  /* Every jump that was passed in must have been found.  */
  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}
4275
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  Walk
   backwards from the end of TO, asserting that the vector label (the
   inner region's close) precedes the worker label (the outer one),
   that no bar.sync insn appears after either label, and that both
   labels, when supplied, are actually present.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  /* Walking backwards: the vector label must not yet have
	     been seen, i.e. it precedes the worker label.  */
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* No barrier may follow the neutering labels.  */
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}
4312
d88cd9c4
NS
4313/* Single neutering according to MASK. FROM is the incoming block and
4314 TO is the outgoing block. These may be the same block. Insert at
4315 start of FROM:
4316
4317 if (tid.<axis>) goto end.
4318
4319 and insert before ending branch of TO (if there is such an insn):
4320
4321 end:
4322 <possibly-broadcast-cond>
4323 <branch>
4324
4325 We currently only use differnt FROM and TO when skipping an entire
4326 loop. We could do more if we detected superblocks. */
4327
4328static void
4329nvptx_single (unsigned mask, basic_block from, basic_block to)
4330{
4331 rtx_insn *head = BB_HEAD (from);
4332 rtx_insn *tail = BB_END (to);
4333 unsigned skip_mask = mask;
4334
ed9091b2
TV
4335 while (true)
4336 {
4337 /* Find first insn of from block. */
6beefdbd 4338 while (head != BB_END (from) && !needs_neutering_p (head))
ed9091b2
TV
4339 head = NEXT_INSN (head);
4340
4341 if (from == to)
4342 break;
4343
4344 if (!(JUMP_P (head) && single_succ_p (from)))
4345 break;
4346
4347 basic_block jump_target = single_succ (from);
4348 if (!single_pred_p (jump_target))
4349 break;
4350
4351 from = jump_target;
4352 head = BB_HEAD (from);
4353 }
d88cd9c4
NS
4354
4355 /* Find last insn of to block */
4356 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4357 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4358 tail = PREV_INSN (tail);
4359
4360 /* Detect if tail is a branch. */
4361 rtx tail_branch = NULL_RTX;
4362 rtx cond_branch = NULL_RTX;
4363 if (tail && INSN_P (tail))
4364 {
4365 tail_branch = PATTERN (tail);
4366 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4367 tail_branch = NULL_RTX;
4368 else
4369 {
4370 cond_branch = SET_SRC (tail_branch);
4371 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4372 cond_branch = NULL_RTX;
4373 }
4374 }
4375
4376 if (tail == head)
4377 {
4378 /* If this is empty, do nothing. */
6beefdbd 4379 if (!head || !needs_neutering_p (head))
d88cd9c4
NS
4380 return;
4381
d88cd9c4
NS
4382 if (cond_branch)
4383 {
4384 /* If we're only doing vector single, there's no need to
4385 emit skip code because we'll not insert anything. */
4386 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4387 skip_mask = 0;
4388 }
4389 else if (tail_branch)
4390 /* Block with only unconditional branch. Nothing to do. */
4391 return;
4392 }
4393
4394 /* Insert the vector test inside the worker test. */
4395 unsigned mode;
4396 rtx_insn *before = tail;
2ba16fd2 4397 rtx_insn *neuter_start = NULL;
a874808c
TV
4398 rtx_insn *worker_label = NULL, *vector_label = NULL;
4399 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
d88cd9c4
NS
4400 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4401 if (GOMP_DIM_MASK (mode) & skip_mask)
4402 {
4403 rtx_code_label *label = gen_label_rtx ();
4404 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
a874808c
TV
4405 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4406 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
d88cd9c4
NS
4407
4408 if (!pred)
4409 {
4410 pred = gen_reg_rtx (BImode);
4411 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4412 }
7820b298 4413
d88cd9c4
NS
4414 rtx br;
4415 if (mode == GOMP_DIM_VECTOR)
4416 br = gen_br_true (pred, label);
4417 else
4418 br = gen_br_true_uni (pred, label);
2ba16fd2
TV
4419 if (neuter_start)
4420 neuter_start = emit_insn_after (br, neuter_start);
4421 else
4422 neuter_start = emit_insn_before (br, head);
a874808c 4423 *mode_jump = neuter_start;
d88cd9c4
NS
4424
4425 LABEL_NUSES (label)++;
a874808c 4426 rtx_insn *label_insn;
d88cd9c4 4427 if (tail_branch)
a874808c
TV
4428 {
4429 label_insn = emit_label_before (label, before);
4430 before = label_insn;
4431 }
d88cd9c4 4432 else
be606483 4433 {
a874808c 4434 label_insn = emit_label_after (label, tail);
e77a1236
TV
4435 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4436 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
be606483
TV
4437 emit_insn_after (gen_exit (), label_insn);
4438 }
a874808c
TV
4439
4440 if (mode == GOMP_DIM_VECTOR)
4441 vector_label = label_insn;
4442 else
4443 worker_label = label_insn;
d88cd9c4
NS
4444 }
4445
4446 /* Now deal with propagating the branch condition. */
4447 if (cond_branch)
4448 {
4449 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4450
0024c320
TV
4451 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4452 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
d88cd9c4
NS
4453 {
4454 /* Vector mode only, do a shuffle. */
43c371e8
TV
4455#if WORKAROUND_PTXJIT_BUG
4456 /* The branch condition %rcond is propagated like this:
4457
4458 {
4459 .reg .u32 %x;
4460 mov.u32 %x,%tid.x;
4461 setp.ne.u32 %rnotvzero,%x,0;
4462 }
4463
4464 @%rnotvzero bra Lskip;
4465 setp.<op>.<type> %rcond,op1,op2;
4466 Lskip:
4467 selp.u32 %rcondu32,1,0,%rcond;
4468 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4469 setp.ne.u32 %rcond,%rcondu32,0;
4470
4471 There seems to be a bug in the ptx JIT compiler (observed at driver
4472 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4473 unless %rcond is initialized to something before 'bra Lskip'. The
4474 bug is not observed with ptxas from cuda 8.0.61.
4475
4476 It is true that the code is non-trivial: at Lskip, %rcond is
4477 uninitialized in threads 1-31, and after the selp the same holds
4478 for %rcondu32. But shfl propagates the defined value in thread 0
4479 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4480 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4481
4482 There is nothing in the PTX spec to suggest that this is wrong, or
4483 to explain why the extra initialization is needed. So, we classify
8c8e9a6b
TV
4484 it as a JIT bug, and the extra initialization as workaround:
4485
4486 {
4487 .reg .u32 %x;
4488 mov.u32 %x,%tid.x;
4489 setp.ne.u32 %rnotvzero,%x,0;
4490 }
4491
4492 +.reg .pred %rcond2;
4493 +setp.eq.u32 %rcond2, 1, 0;
4494
4495 @%rnotvzero bra Lskip;
4496 setp.<op>.<type> %rcond,op1,op2;
4497 +mov.pred %rcond2, %rcond;
4498 Lskip:
4499 +mov.pred %rcond, %rcond2;
4500 selp.u32 %rcondu32,1,0,%rcond;
4501 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4502 setp.ne.u32 %rcond,%rcondu32,0;
4503 */
4504 rtx_insn *label = PREV_INSN (tail);
4505 gcc_assert (label && LABEL_P (label));
4506 rtx tmp = gen_reg_rtx (BImode);
4507 emit_insn_before (gen_movbi (tmp, const0_rtx),
43c371e8 4508 bb_first_real_insn (from));
8c8e9a6b
TV
4509 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4510 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
43c371e8 4511#endif
a0b3b5c4 4512 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
d88cd9c4
NS
4513 }
4514 else
4515 {
4516 /* Includes worker mode, do spill & fill. By construction
4517 we should never have worker mode only. */
1ed57fb8 4518 broadcast_data_t data;
0024c320
TV
4519 unsigned size = GET_MODE_SIZE (SImode);
4520 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4521 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4522 rtx barrier = GEN_INT (0);
4523 int threads = 0;
d88cd9c4 4524
1ed57fb8 4525 data.base = oacc_bcast_sym;
d88cd9c4
NS
4526 data.ptr = 0;
4527
0024c320
TV
4528 bool use_partitioning_p = (vector && !worker
4529 && nvptx_mach_max_workers () > 1
4530 && cfun->machine->bcast_partition);
4531 if (use_partitioning_p)
4532 {
4533 data.base = cfun->machine->bcast_partition;
4534 barrier = cfun->machine->sync_bar;
4535 threads = nvptx_mach_vector_length ();
4536 }
4537 gcc_assert (data.base != NULL);
4538 gcc_assert (barrier);
4539
4540 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4541 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4542 ? nvptx_mach_max_workers () + 1
4543 : 1);
4544
4545 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4546 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
d88cd9c4
NS
4547
4548 data.offset = 0;
a0b3b5c4 4549 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
0024c320 4550 vector),
d88cd9c4 4551 before);
0024c320 4552
d88cd9c4 4553 /* Barrier so other workers can see the write. */
0024c320 4554 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
d88cd9c4 4555 data.offset = 0;
a0b3b5c4 4556 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
0024c320
TV
4557 vector),
4558 tail);
d88cd9c4
NS
4559 /* This barrier is needed to avoid worker zero clobbering
4560 the broadcast buffer before all the other workers have
4561 had a chance to read this instance of it. */
0024c320 4562 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
d88cd9c4
NS
4563 }
4564
4565 extract_insn (tail);
4566 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4567 UNSPEC_BR_UNIFIED);
4568 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4569 }
a874808c
TV
4570
4571 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4572 vector_label, worker_label);
4573 if (!seen_label)
4574 verify_neutering_labels (to, vector_label, worker_label);
d88cd9c4
NS
4575}
4576
4577/* PAR is a parallel that is being skipped in its entirety according to
4578 MASK. Treat this as skipping a superblock starting at forked
4579 and ending at joining. */
4580
4581static void
4582nvptx_skip_par (unsigned mask, parallel *par)
4583{
4584 basic_block tail = par->join_block;
4585 gcc_assert (tail->preds->length () == 1);
4586
4587 basic_block pre_tail = (*tail->preds)[0]->src;
4588 gcc_assert (pre_tail->succs->length () == 1);
4589
4590 nvptx_single (mask, par->forked_block, pre_tail);
4591}
4592
/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  On success the
   inner parallel's mask and blocks are merged into PAR and the inner
   parallel object is deleted.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  /* Transfer the inner par's blocks (popped, hence in reverse order)
     into the outer par.  */
  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  /* Adopt the inner par's children, then free it.  */
  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
4655
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
		       && nvptx_mach_vector_length () > PTX_WARP_SIZE);

  if (worker || large_vector)
    {
      /* Propagation through shared memory: fill after the forked
	 insn, spill after the fork insn.  */
      nvptx_shared_propagate (false, is_call, par->forked_block,
			      par->forked_insn, !worker);
      bool no_prop_p
	= nvptx_shared_propagate (true, is_call, par->forked_block,
				  par->fork_insn, !worker);
      /* An empty partitioned region: forked insn directly followed by
	 the joining insn.  */
      bool empty_loop_p
	= !is_call && (NEXT_INSN (par->forked_insn)
		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
      rtx barrier = GEN_INT (0);
      int threads = 0;

      /* Large-vector regions use the function's dedicated barrier.  */
      if (!worker && cfun->machine->sync_bar)
	{
	  barrier = cfun->machine->sync_bar;
	  threads = nvptx_mach_vector_length ();
	}

      if (no_prop_p && empty_loop_p)
	;	/* Nothing propagated, nothing inside: no barriers.  */
      else if (no_prop_p && is_call)
	;	/* Call with no propagation: no barriers either.  */
      else
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads),
			    par->forked_insn);
	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    /* Warp-sized vector region: propagate via shuffles instead.  */
    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
4719
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  For each
   axis we either neuter individual blocks of PAR or skip PAR
   entirely, depending on where that axis is partitioned.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* Decide, per axis, between block-wise neutering and whole-parallel
     skipping.  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
4796
2498a6ab
TV
4797static void
4798populate_offload_attrs (offload_attrs *oa)
4799{
4800 tree attr = oacc_get_fn_attrib (current_function_decl);
4801 tree dims = TREE_VALUE (attr);
4802 unsigned ix;
4803
4804 oa->mask = 0;
4805
4806 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4807 {
4808 tree t = TREE_VALUE (dims);
4809 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4810 tree allowed = TREE_PURPOSE (dims);
4811
4812 if (size != 1 && !(allowed && integer_zerop (allowed)))
4813 oa->mask |= GOMP_DIM_MASK (ix);
4814
4815 switch (ix)
4816 {
4817 case GOMP_DIM_GANG:
4818 oa->num_gangs = size;
4819 break;
4820
4821 case GOMP_DIM_WORKER:
4822 oa->num_workers = size;
4823 break;
4824
4825 case GOMP_DIM_VECTOR:
4826 oa->vector_length = size;
4827 break;
4828 }
4829 }
4830}
4831
3dede32b
TV
4832#if WORKAROUND_PTXJIT_BUG_2
4833/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4834 is needed in the nvptx target because the branches generated for
4835 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4836
4837static rtx
4838nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4839{
4840 rtx pat;
4841 if ((strict && !JUMP_P (insn))
4842 || (!strict && !INSN_P (insn)))
4843 return NULL_RTX;
4844 pat = PATTERN (insn);
4845
4846 /* The set is allowed to appear either as the insn pattern or
4847 the first set in a PARALLEL. */
4848 if (GET_CODE (pat) == PARALLEL)
4849 pat = XVECEXP (pat, 0, 0);
4850 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4851 return pat;
4852
4853 return NULL_RTX;
4854}
4855
4856/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4857
4858static rtx
4859nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4860{
4861 rtx x = nvptx_pc_set (insn, strict);
4862
4863 if (!x)
4864 return NULL_RTX;
4865 x = SET_SRC (x);
4866 if (GET_CODE (x) == LABEL_REF)
4867 return x;
4868 if (GET_CODE (x) != IF_THEN_ELSE)
4869 return NULL_RTX;
4870 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4871 return XEXP (x, 1);
4872 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4873 return XEXP (x, 2);
4874 return NULL_RTX;
4875}
4876
/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   insn inbetween the branch and the label.  This works around a JIT bug
   observed at driver version 384.111, at -O0 for sm_50.  */

static void
prevent_branch_around_nothing (void)
{
  rtx_insn *seen_label = NULL;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && condjump_p (insn))
	{
	  /* Remember the target label of this conditional jump.
	     NOTE(review): nvptx_condjump_label can return NULL_RTX for
	     jump patterns without a LABEL_REF; presumably condjump_p
	     insns here always carry one -- confirm before relying on
	     label_ref_label never seeing NULL.  */
	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
	  continue;
	}

      if (seen_label == NULL)
	continue;

      /* Notes and debug insns don't count as ptx insns.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;

      if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    /* Partitioning markers are not counted as insns between
	       the branch and the label either; keep scanning.  */
	    continue;
	  default:
	    /* A real insn intervenes; no padding needed.  */
	    seen_label = NULL;
	    continue;
	  }

      /* Reached the branch target with nothing in between: pad.  */
      if (LABEL_P (insn) && insn == seen_label)
	emit_insn_before (gen_fake_nop (), insn);

      seen_label = NULL;
    }
}
4918#endif
4919
21251395
TV
4920#ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48 for sm_50.  */

static void
workaround_barsyncs (void)
{
  bool seen_barsync = false;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
	{
	  if (seen_barsync)
	    {
	      /* Second bar.sync in a row: separate the pair with two
		 membar.cta insns.  */
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	    }

	  seen_barsync = true;
	  continue;
	}

      if (!seen_barsync)
	continue;

      /* Notes, debug insns and partitioning markers do not break the
	 "subsequent" property of two bar.syncs.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    break;
	  }

      /* Anything else resets the search.  */
      seen_barsync = false;
    }
}
4962#endif
4963
517665b3 4964/* PTX-specific reorganization
d88cd9c4 4965 - Split blocks at fork and join instructions
c38f0d8c
NS
4966 - Compute live registers
4967 - Mark now-unused registers, so function begin doesn't declare
517665b3 4968 unused registers.
d88cd9c4
NS
4969 - Insert state propagation when entering partitioned mode
4970 - Insert neutering instructions when in single mode
c38f0d8c 4971 - Replace subregs with suitable sequences.
517665b3
NS
4972*/
4973
4974static void
4975nvptx_reorg (void)
4976{
517665b3
NS
4977 /* We are freeing block_for_insn in the toplev to keep compatibility
4978 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4979 compute_bb_for_insn ();
4980
4981 thread_prologue_and_epilogue_insns ();
4982
d88cd9c4
NS
4983 /* Split blocks and record interesting unspecs. */
4984 bb_insn_map_t bb_insn_map;
4985
4986 nvptx_split_blocks (&bb_insn_map);
4987
c38f0d8c 4988 /* Compute live regs */
517665b3
NS
4989 df_clear_flags (DF_LR_RUN_DCE);
4990 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
d88cd9c4
NS
4991 df_live_add_problem ();
4992 df_live_set_all_dirty ();
517665b3 4993 df_analyze ();
738f2522
BS
4994 regstat_init_n_sets_and_refs ();
4995
d88cd9c4
NS
4996 if (dump_file)
4997 df_dump (dump_file);
4998
517665b3 4999 /* Mark unused regs as unused. */
d88cd9c4 5000 int max_regs = max_reg_num ();
44c068ae 5001 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
738f2522
BS
5002 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5003 regno_reg_rtx[i] = const0_rtx;
517665b3 5004
d88cd9c4
NS
5005 /* Determine launch dimensions of the function. If it is not an
5006 offloaded function (i.e. this is a regular compiler), the
5007 function has no neutering. */
629b3d75 5008 tree attr = oacc_get_fn_attrib (current_function_decl);
d88cd9c4
NS
5009 if (attr)
5010 {
5011 /* If we determined this mask before RTL expansion, we could
5012 elide emission of some levels of forks and joins. */
2498a6ab 5013 offload_attrs oa;
d88cd9c4 5014
2498a6ab 5015 populate_offload_attrs (&oa);
d88cd9c4 5016
d88cd9c4
NS
5017 /* If there is worker neutering, there must be vector
5018 neutering. Otherwise the hardware will fail. */
2498a6ab
TV
5019 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5020 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
d88cd9c4
NS
5021
5022 /* Discover & process partitioned regions. */
5023 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5024 nvptx_process_pars (pars);
2498a6ab 5025 nvptx_neuter_pars (pars, oa.mask, 0);
d88cd9c4
NS
5026 delete pars;
5027 }
5028
517665b3 5029 /* Replace subregs. */
c03b0416 5030 nvptx_reorg_subreg ();
517665b3 5031
5012919d
AM
5032 if (TARGET_UNIFORM_SIMT)
5033 nvptx_reorg_uniform_simt ();
5034
3dede32b
TV
5035#if WORKAROUND_PTXJIT_BUG_2
5036 prevent_branch_around_nothing ();
5037#endif
5038
21251395
TV
5039#ifdef WORKAROUND_PTXJIT_BUG_3
5040 workaround_barsyncs ();
5041#endif
5042
738f2522 5043 regstat_free_n_sets_and_refs ();
517665b3
NS
5044
5045 df_finish_pass (true);
738f2522
BS
5046}
5047\f
5048/* Handle a "kernel" attribute; arguments as in
5049 struct attribute_spec.handler. */
5050
5051static tree
5052nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5053 int ARG_UNUSED (flags), bool *no_add_attrs)
5054{
5055 tree decl = *node;
5056
5057 if (TREE_CODE (decl) != FUNCTION_DECL)
5058 {
5059 error ("%qE attribute only applies to functions", name);
5060 *no_add_attrs = true;
5061 }
b49e35a9 5062 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
5063 {
5064 error ("%qE attribute requires a void return type", name);
5065 *no_add_attrs = true;
5066 }
5067
5068 return NULL_TREE;
5069}
5070
5012919d
AM
5071/* Handle a "shared" attribute; arguments as in
5072 struct attribute_spec.handler. */
5073
5074static tree
5075nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5076 int ARG_UNUSED (flags), bool *no_add_attrs)
5077{
5078 tree decl = *node;
5079
5080 if (TREE_CODE (decl) != VAR_DECL)
5081 {
5082 error ("%qE attribute only applies to variables", name);
5083 *no_add_attrs = true;
5084 }
5085 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5086 {
5087 error ("%qE attribute not allowed with auto storage class", name);
5088 *no_add_attrs = true;
5089 }
5090
5091 return NULL_TREE;
5092}
5093
738f2522
BS
5094/* Table of valid machine attributes. */
5095static const struct attribute_spec nvptx_attribute_table[] =
5096{
4849deb1
JJ
5097 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5098 affects_type_identity, handler, exclude } */
5099 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5d9ae53d 5100 NULL },
4849deb1 5101 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5d9ae53d 5102 NULL },
4849deb1 5103 { NULL, 0, 0, false, false, false, false, NULL, NULL }
738f2522
BS
5104};
5105\f
5106/* Limit vector alignments to BIGGEST_ALIGNMENT. */
5107
5108static HOST_WIDE_INT
5109nvptx_vector_alignment (const_tree type)
5110{
5111 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5112
5113 return MIN (align, BIGGEST_ALIGNMENT);
5114}
d88cd9c4
NS
5115
5116/* Indicate that INSN cannot be duplicated. */
5117
5118static bool
5119nvptx_cannot_copy_insn_p (rtx_insn *insn)
5120{
5121 switch (recog_memoized (insn))
5122 {
5123 case CODE_FOR_nvptx_shufflesi:
5124 case CODE_FOR_nvptx_shufflesf:
5125 case CODE_FOR_nvptx_barsync:
5126 case CODE_FOR_nvptx_fork:
5127 case CODE_FOR_nvptx_forked:
5128 case CODE_FOR_nvptx_joining:
5129 case CODE_FOR_nvptx_join:
5130 return true;
5131 default:
5132 return false;
5133 }
5134}
a794bd20
NS
5135
5136/* Section anchors do not work. Initialization for flag_section_anchor
5137 probes the existence of the anchoring target hooks and prevents
5138 anchoring if they don't exist. However, we may be being used with
5139 a host-side compiler that does support anchoring, and hence see
5140 the anchor flag set (as it's not recalculated). So provide an
5141 implementation denying anchoring. */
5142
5143static bool
5144nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5145{
5146 return false;
5147}
738f2522 5148\f
1f83528e
TS
5149/* Record a symbol for mkoffload to enter into the mapping table. */
5150
5151static void
5152nvptx_record_offload_symbol (tree decl)
5153{
3e32ee19
NS
5154 switch (TREE_CODE (decl))
5155 {
5156 case VAR_DECL:
5157 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5158 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5159 break;
5160
5161 case FUNCTION_DECL:
5162 {
629b3d75 5163 tree attr = oacc_get_fn_attrib (decl);
5012919d
AM
5164 /* OpenMP offloading does not set this attribute. */
5165 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 5166
3e32ee19
NS
5167 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5168 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5169
5012919d 5170 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 5171 {
5d306e55 5172 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 5173
5d306e55 5174 gcc_assert (!TREE_PURPOSE (dims));
3e32ee19
NS
5175 fprintf (asm_out_file, ", %#x", size);
5176 }
d2d47a28 5177
3e32ee19
NS
5178 fprintf (asm_out_file, "\n");
5179 }
5180 break;
d2d47a28 5181
3e32ee19
NS
5182 default:
5183 gcc_unreachable ();
5184 }
1f83528e
TS
5185}
5186
738f2522
BS
5187/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5188 at the start of a file. */
5189
5190static void
5191nvptx_file_start (void)
5192{
5193 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5194 fputs ("\t.version\t3.1\n", asm_out_file);
53ceb8b5
CP
5195 if (TARGET_SM35)
5196 fputs ("\t.target\tsm_35\n", asm_out_file);
5197 else
5198 fputs ("\t.target\tsm_30\n", asm_out_file);
738f2522
BS
5199 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5200 fputs ("// END PREAMBLE\n", asm_out_file);
5201}
5202
a0b3b5c4
TV
5203/* Emit a declaration for a worker and vector-level buffer in .shared
5204 memory. */
15ab6f00
NS
5205
5206static void
a0b3b5c4 5207write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
15ab6f00
NS
5208{
5209 const char *name = XSTR (sym, 0);
5210
5211 write_var_marker (file, true, false, name);
5212 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5213 align, name, size);
5214}
5215
ecf6e535
BS
5216/* Write out the function declarations we've collected and declare storage
5217 for the broadcast buffer. */
738f2522
BS
5218
5219static void
5220nvptx_file_end (void)
5221{
f3dba894
TS
5222 hash_table<tree_hasher>::iterator iter;
5223 tree decl;
5224 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
00e52418 5225 nvptx_record_fndecl (decl);
738f2522 5226 fputs (func_decls.str().c_str(), asm_out_file);
d88cd9c4 5227
1ed57fb8 5228 if (oacc_bcast_size)
a0b3b5c4 5229 write_shared_buffer (asm_out_file, oacc_bcast_sym,
1ed57fb8 5230 oacc_bcast_align, oacc_bcast_size);
f3552158
NS
5231
5232 if (worker_red_size)
a0b3b5c4 5233 write_shared_buffer (asm_out_file, worker_red_sym,
15ab6f00 5234 worker_red_align, worker_red_size);
5012919d 5235
f881693c
TV
5236 if (vector_red_size)
5237 write_shared_buffer (asm_out_file, vector_red_sym,
5238 vector_red_align, vector_red_size);
5239
5012919d
AM
5240 if (need_softstack_decl)
5241 {
5242 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5243 /* 32 is the maximum number of warps in a block. Even though it's an
5244 external declaration, emit the array size explicitly; otherwise, it
5245 may fail at PTX JIT time if the definition is later in link order. */
5246 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5247 POINTER_SIZE);
5248 }
5249 if (need_unisimt_decl)
5250 {
5251 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5252 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5253 }
f3552158
NS
5254}
5255
5256/* Expander for the shuffle builtins. */
5257
5258static rtx
5259nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5260{
5261 if (ignore)
5262 return target;
5263
5264 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5265 NULL_RTX, mode, EXPAND_NORMAL);
5266 if (!REG_P (src))
5267 src = copy_to_mode_reg (mode, src);
5268
5269 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5270 NULL_RTX, SImode, EXPAND_NORMAL);
5271 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5272 NULL_RTX, SImode, EXPAND_NORMAL);
5273
5274 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5275 idx = copy_to_mode_reg (SImode, idx);
5276
59263259
NS
5277 rtx pat = nvptx_gen_shuffle (target, src, idx,
5278 (nvptx_shuffle_kind) INTVAL (op));
f3552158
NS
5279 if (pat)
5280 emit_insn (pat);
5281
5282 return target;
5283}
5284
f881693c
TV
5285const char *
5286nvptx_output_red_partition (rtx dst, rtx offset)
5287{
5288 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5289 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5290
5291 if (offset == const0_rtx)
5292 fprintf (asm_out_file, zero_offset, REGNO (dst),
5293 REGNO (cfun->machine->red_partition));
5294 else
5295 fprintf (asm_out_file, with_offset, REGNO (dst),
5296 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5297
5298 return "";
5299}
5300
5301/* Shared-memory reduction address expander. */
f3552158
NS
5302
5303static rtx
a0b3b5c4 5304nvptx_expand_shared_addr (tree exp, rtx target,
f881693c
TV
5305 machine_mode ARG_UNUSED (mode), int ignore,
5306 int vector)
f3552158
NS
5307{
5308 if (ignore)
5309 return target;
5310
5311 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
f3552158
NS
5312 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5313 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
9a863523 5314 rtx addr = worker_red_sym;
f881693c
TV
5315
5316 if (vector)
9a863523 5317 {
f881693c
TV
5318 offload_attrs oa;
5319
5320 populate_offload_attrs (&oa);
5321
5322 unsigned int psize = ROUND_UP (size + offset, align);
5323 unsigned int pnum = nvptx_mach_max_workers ();
5324 vector_red_partition = MAX (vector_red_partition, psize);
5325 vector_red_size = MAX (vector_red_size, psize * pnum);
5326 vector_red_align = MAX (vector_red_align, align);
5327
5328 if (cfun->machine->red_partition == NULL)
5329 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5330
5331 addr = gen_reg_rtx (Pmode);
5332 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
9a863523 5333 }
f881693c
TV
5334 else
5335 {
5336 worker_red_align = MAX (worker_red_align, align);
5337 worker_red_size = MAX (worker_red_size, size + offset);
f3552158 5338
f881693c
TV
5339 if (offset)
5340 {
5341 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5342 addr = gen_rtx_CONST (Pmode, addr);
5343 }
5344 }
f3552158 5345
f881693c 5346 emit_move_insn (target, addr);
f3552158
NS
5347 return target;
5348}
5349
5350/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5351 not require taking the address of any object, other than the memory
5352 cell being operated on. */
5353
5354static rtx
5355nvptx_expand_cmp_swap (tree exp, rtx target,
5356 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5357{
5358 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5359
5360 if (!target)
5361 target = gen_reg_rtx (mode);
5362
5363 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5364 NULL_RTX, Pmode, EXPAND_NORMAL);
5365 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5366 NULL_RTX, mode, EXPAND_NORMAL);
5367 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5368 NULL_RTX, mode, EXPAND_NORMAL);
5369 rtx pat;
5370
5371 mem = gen_rtx_MEM (mode, mem);
5372 if (!REG_P (cmp))
5373 cmp = copy_to_mode_reg (mode, cmp);
5374 if (!REG_P (src))
5375 src = copy_to_mode_reg (mode, src);
5376
5377 if (mode == SImode)
5378 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5379 else
5380 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5381
5382 emit_insn (pat);
5383
5384 return target;
5385}
5386
5387
5388/* Codes for all the NVPTX builtins. */
5389enum nvptx_builtins
5390{
5391 NVPTX_BUILTIN_SHUFFLE,
5392 NVPTX_BUILTIN_SHUFFLELL,
5393 NVPTX_BUILTIN_WORKER_ADDR,
f881693c 5394 NVPTX_BUILTIN_VECTOR_ADDR,
f3552158
NS
5395 NVPTX_BUILTIN_CMP_SWAP,
5396 NVPTX_BUILTIN_CMP_SWAPLL,
5397 NVPTX_BUILTIN_MAX
5398};
5399
5400static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5401
5402/* Return the NVPTX builtin for CODE. */
5403
5404static tree
5405nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5406{
5407 if (code >= NVPTX_BUILTIN_MAX)
5408 return error_mark_node;
5409
5410 return nvptx_builtin_decls[code];
5411}
5412
5413/* Set up all builtin functions for this target. */
5414
5415static void
5416nvptx_init_builtins (void)
5417{
5418#define DEF(ID, NAME, T) \
5419 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5420 = add_builtin_function ("__builtin_nvptx_" NAME, \
5421 build_function_type_list T, \
5422 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5423#define ST sizetype
5424#define UINT unsigned_type_node
5425#define LLUINT long_long_unsigned_type_node
5426#define PTRVOID ptr_type_node
5427
5428 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5429 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5430 DEF (WORKER_ADDR, "worker_addr",
5431 (PTRVOID, ST, UINT, UINT, NULL_TREE));
f881693c
TV
5432 DEF (VECTOR_ADDR, "vector_addr",
5433 (PTRVOID, ST, UINT, UINT, NULL_TREE));
f3552158
NS
5434 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5435 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5436
5437#undef DEF
5438#undef ST
5439#undef UINT
5440#undef LLUINT
5441#undef PTRVOID
5442}
5443
5444/* Expand an expression EXP that calls a built-in function,
5445 with result going to TARGET if that's convenient
5446 (and in mode MODE if that's convenient).
5447 SUBTARGET may be used as the target for computing one of EXP's operands.
5448 IGNORE is nonzero if the value is to be ignored. */
5449
5450static rtx
5451nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5452 machine_mode mode, int ignore)
5453{
5454 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5455 switch (DECL_FUNCTION_CODE (fndecl))
5456 {
5457 case NVPTX_BUILTIN_SHUFFLE:
5458 case NVPTX_BUILTIN_SHUFFLELL:
5459 return nvptx_expand_shuffle (exp, target, mode, ignore);
5460
5461 case NVPTX_BUILTIN_WORKER_ADDR:
f881693c
TV
5462 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5463
5464 case NVPTX_BUILTIN_VECTOR_ADDR:
5465 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
f3552158
NS
5466
5467 case NVPTX_BUILTIN_CMP_SWAP:
5468 case NVPTX_BUILTIN_CMP_SWAPLL:
5469 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5470
5471 default: gcc_unreachable ();
5472 }
738f2522 5473}
f3552158 5474
5012919d
AM
5475/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5476
5477static int
5478nvptx_simt_vf ()
5479{
5d17a476 5480 return PTX_WARP_SIZE;
5012919d
AM
5481}
5482
ccc0d492
TV
5483static bool
5484nvptx_welformed_vector_length_p (int l)
5485{
5486 gcc_assert (l > 0);
5487 return l % PTX_WARP_SIZE == 0;
5488}
5489
5490static void
5491nvptx_apply_dim_limits (int dims[])
5492{
5493 /* Check that the vector_length is not too large. */
5494 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5495 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5496
5497 /* Check that the number of workers is not too large. */
5498 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5499 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5500
5501 /* Ensure that num_worker * vector_length <= cta size. */
5502 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5503 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5504 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
052aaace
TV
5505
5506 /* If we need a per-worker barrier ... . */
5507 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5508 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5509 /* Don't use more barriers than available. */
5510 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5511 PTX_NUM_PER_WORKER_BARRIERS);
ccc0d492
TV
5512}
5513
6e723923
TV
5514/* Return true if FNDECL contains calls to vector-partitionable routines. */
5515
5516static bool
5517has_vector_partitionable_routine_calls_p (tree fndecl)
5518{
5519 if (!fndecl)
5520 return false;
5521
5522 basic_block bb;
5523 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5524 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5525 gsi_next_nondebug (&i))
5526 {
5527 gimple *stmt = gsi_stmt (i);
5528 if (gimple_code (stmt) != GIMPLE_CALL)
5529 continue;
5530
5531 tree callee = gimple_call_fndecl (stmt);
5532 if (!callee)
5533 continue;
5534
5535 tree attrs = oacc_get_fn_attrib (callee);
5536 if (attrs == NULL_TREE)
5537 return false;
5538
5539 int partition_level = oacc_fn_attrib_level (attrs);
5540 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5541 if (!seq_routine_p)
5542 return true;
5543 }
5544
5545 return false;
5546}
5547
5f57c8f3
TV
5548/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5549 DIMS has changed. */
94829f87 5550
5f57c8f3
TV
5551static void
5552nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level)
94829f87 5553{
ec6c865c
TV
5554 bool oacc_default_dims_p = false;
5555 bool oacc_min_dims_p = false;
5556 bool offload_region_p = false;
5557 bool routine_p = false;
5558 bool routine_seq_p = false;
5559
5560 if (decl == NULL_TREE)
5561 {
5562 if (fn_level == -1)
5563 oacc_default_dims_p = true;
5564 else if (fn_level == -2)
5565 oacc_min_dims_p = true;
5566 else
5567 gcc_unreachable ();
5568 }
5569 else if (fn_level == -1)
5570 offload_region_p = true;
5571 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5572 {
5573 routine_p = true;
5574 routine_seq_p = fn_level == GOMP_DIM_MAX;
5575 }
5576 else
5577 gcc_unreachable ();
94829f87 5578
207e7fea
TV
5579 if (routine_p)
5580 {
5581 /* OpenACC routines in C arrive here with the following attributes
5582 (omitting the 'omp declare target'):
5583 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5584 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5585 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5586 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5587
5588 If we take f.i. the oacc function attribute of the worker routine
5589 (0 1, 1 0, 1 0), then:
5590 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5591 meaning: worker routine, that is:
5592 - can't contain gang loop (0),
5593 - can contain worker loop (1),
5594 - can contain vector loop (1).
5595 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5596 dimensions: gang: 1, worker: 0, vector: 0.
5597
5598 OTOH, routines in Fortran arrive here with these attributes:
5599 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5600 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5601 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5602 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5603 that is, the same as for C but with the dimensions set to 0.
5604
5605 This is due to a bug in the Fortran front-end: PR72741. Work around
5606 this bug by forcing the dimensions to be the same in Fortran as for C,
5607 to be able to handle C and Fortran routines uniformly in this
5608 function. */
5609 dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
5610 dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
5611 dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
207e7fea
TV
5612 }
5613
4b171607
TV
5614 if (oacc_min_dims_p)
5615 {
5616 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5617 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5618 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5619
5620 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5621 return;
5622 }
5623
5624 if (routine_p)
5625 {
5626 if (!routine_seq_p)
5627 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5628
5629 return;
5630 }
5631
5632 if (oacc_default_dims_p)
5633 {
5634 /* -1 : not set
5635 0 : set at runtime, f.i. -fopenacc-dims=-
5636 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5637 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5638 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5639 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5640
5641 /* But -fopenacc-dims=- is not yet supported on trunk. */
5642 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5643 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5644 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5645 }
5646
5647 if (offload_region_p)
5648 {
5649 /* -1 : not set
5650 0 : set using variable, f.i. num_gangs (n)
5651 >= 1: set using constant, f.i. num_gangs (1). */
5652 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5653 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5654 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5655 }
5656
d8ab4e54
TV
5657 int old_dims[GOMP_DIM_MAX];
5658 unsigned int i;
5659 for (i = 0; i < GOMP_DIM_MAX; ++i)
5660 old_dims[i] = dims[i];
5661
5662 const char *vector_reason = NULL;
6e723923
TV
5663 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5664 {
5665 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5666 {
5667 vector_reason = G_("using vector_length (%d) due to call to"
5668 " vector-partitionable routine, ignoring %d");
5669 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5670 }
5671 }
5672
d8ab4e54 5673 if (dims[GOMP_DIM_VECTOR] == 0)
ccc8282b 5674 {
d8ab4e54 5675 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
764ecad4 5676 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
ccc8282b
NS
5677 }
5678
d8ab4e54 5679 if (dims[GOMP_DIM_VECTOR] > 0
ccc0d492 5680 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
d8ab4e54
TV
5681 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
5682
ccc0d492 5683 nvptx_apply_dim_limits (dims);
d8ab4e54
TV
5684
5685 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5686 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5687 vector_reason != NULL
5688 ? vector_reason
5689 : G_("using vector_length (%d), ignoring %d"),
5690 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5691
5692 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5693 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5694 G_("using num_workers (%d), ignoring %d"),
5695 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
94829f87 5696
4b171607 5697 if (oacc_default_dims_p)
b6adbb9f 5698 {
764ecad4 5699 dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
b6adbb9f 5700 if (dims[GOMP_DIM_WORKER] < 0)
605219e7 5701 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f 5702 if (dims[GOMP_DIM_GANG] < 0)
605219e7 5703 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
ccc0d492 5704 nvptx_apply_dim_limits (dims);
b6adbb9f 5705 }
5f57c8f3
TV
5706}
5707
5708/* Validate compute dimensions of an OpenACC offload or routine, fill
5709 in non-unity defaults. FN_LEVEL indicates the level at which a
5710 routine might spawn a loop. It is negative for non-routines. If
5711 DECL is null, we are validating the default dimensions. */
5712
5713static bool
5714nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5715{
5716 int old_dims[GOMP_DIM_MAX];
5717 unsigned int i;
b6adbb9f 5718
5f57c8f3
TV
5719 for (i = 0; i < GOMP_DIM_MAX; ++i)
5720 old_dims[i] = dims[i];
5721
5722 nvptx_goacc_validate_dims_1 (decl, dims, fn_level);
5723
21fbea5c
TV
5724 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5725 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5726 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5727
5f57c8f3
TV
5728 for (i = 0; i < GOMP_DIM_MAX; ++i)
5729 if (old_dims[i] != dims[i])
5730 return true;
5731
5732 return false;
94829f87 5733}
d88cd9c4 5734
bd751975
NS
5735/* Return maximum dimension size, or zero for unbounded. */
5736
5737static int
5738nvptx_dim_limit (int axis)
5739{
5740 switch (axis)
5741 {
bd751975 5742 case GOMP_DIM_VECTOR:
764ecad4 5743 return PTX_MAX_VECTOR_LENGTH;
bd751975
NS
5744
5745 default:
5746 break;
5747 }
5748 return 0;
5749}
5750
d88cd9c4
NS
5751/* Determine whether fork & joins are needed. */
5752
5753static bool
5754nvptx_goacc_fork_join (gcall *call, const int dims[],
5755 bool ARG_UNUSED (is_fork))
5756{
5757 tree arg = gimple_call_arg (call, 2);
5758 unsigned axis = TREE_INT_CST_LOW (arg);
5759
5760 /* We only care about worker and vector partitioning. */
5761 if (axis < GOMP_DIM_WORKER)
5762 return false;
5763
5764 /* If the size is 1, there's no partitioning. */
5765 if (dims[axis] == 1)
5766 return false;
5767
5768 return true;
5769}
5770
f3552158
NS
5771/* Generate a PTX builtin function call that returns the address in
5772 the worker reduction buffer at OFFSET. TYPE is the type of the
5773 data at that location. */
5774
5775static tree
f881693c 5776nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
f3552158 5777{
f881693c
TV
5778 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5779 if (vector)
5780 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
f3552158 5781 machine_mode mode = TYPE_MODE (type);
f881693c 5782 tree fndecl = nvptx_builtin_decl (addr_dim, true);
f3552158
NS
5783 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5784 tree align = build_int_cst (unsigned_type_node,
5785 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5786 tree call = build_call_expr (fndecl, 3, offset, size, align);
5787
5788 return fold_convert (build_pointer_type (type), call);
5789}
5790
5791/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5792 will cast the variable if necessary. */
5793
5794static void
5795nvptx_generate_vector_shuffle (location_t loc,
5796 tree dest_var, tree var, unsigned shift,
5797 gimple_seq *seq)
5798{
5799 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5800 tree_code code = NOP_EXPR;
dd3c1b14
NS
5801 tree arg_type = unsigned_type_node;
5802 tree var_type = TREE_TYPE (var);
5803 tree dest_type = var_type;
f3552158 5804
dd3c1b14
NS
5805 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5806 var_type = TREE_TYPE (var_type);
5807
5808 if (TREE_CODE (var_type) == REAL_TYPE)
f3552158 5809 code = VIEW_CONVERT_EXPR;
dd3c1b14
NS
5810
5811 if (TYPE_SIZE (var_type)
5812 == TYPE_SIZE (long_long_unsigned_type_node))
f3552158
NS
5813 {
5814 fn = NVPTX_BUILTIN_SHUFFLELL;
dd3c1b14 5815 arg_type = long_long_unsigned_type_node;
f3552158 5816 }
dd3c1b14 5817
f3552158 5818 tree call = nvptx_builtin_decl (fn, true);
dd3c1b14
NS
5819 tree bits = build_int_cst (unsigned_type_node, shift);
5820 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5821 tree expr;
5822
5823 if (var_type != dest_type)
5824 {
5825 /* Do real and imaginary parts separately. */
5826 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5827 real = fold_build1 (code, arg_type, real);
5828 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5829 real = fold_build1 (code, var_type, real);
f3552158 5830
dd3c1b14
NS
5831 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5832 imag = fold_build1 (code, arg_type, imag);
5833 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5834 imag = fold_build1 (code, var_type, imag);
5835
5836 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5837 }
5838 else
5839 {
5840 expr = fold_build1 (code, arg_type, var);
5841 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5842 expr = fold_build1 (code, dest_type, expr);
5843 }
f3552158 5844
dd3c1b14 5845 gimplify_assign (dest_var, expr, seq);
f3552158
NS
5846}
5847
33f47f42
NS
5848/* Lazily generate the global lock var decl and return its address. */
5849
5850static tree
5851nvptx_global_lock_addr ()
5852{
5853 tree v = global_lock_var;
5854
5855 if (!v)
5856 {
5857 tree name = get_identifier ("__reduction_lock");
5858 tree type = build_qualified_type (unsigned_type_node,
5859 TYPE_QUAL_VOLATILE);
5860 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5861 global_lock_var = v;
5862 DECL_ARTIFICIAL (v) = 1;
5863 DECL_EXTERNAL (v) = 1;
5864 TREE_STATIC (v) = 1;
5865 TREE_PUBLIC (v) = 1;
5866 TREE_USED (v) = 1;
5867 mark_addressable (v);
5868 mark_decl_referenced (v);
5869 }
5870
5871 return build_fold_addr_expr (v);
5872}
5873
5874/* Insert code to locklessly update *PTR with *PTR OP VAR just before
5875 GSI. We use a lockless scheme for nearly all case, which looks
5876 like:
5877 actual = initval(OP);
5878 do {
5879 guess = actual;
5880 write = guess OP myval;
5881 actual = cmp&swap (ptr, guess, write)
5882 } while (actual bit-different-to guess);
5883 return write;
5884
5885 This relies on a cmp&swap instruction, which is available for 32-
5886 and 64-bit types. Larger types must use a locking scheme. */
f3552158
NS
5887
5888static tree
5889nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5890 tree ptr, tree var, tree_code op)
5891{
5892 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5893 tree_code code = NOP_EXPR;
33f47f42
NS
5894 tree arg_type = unsigned_type_node;
5895 tree var_type = TREE_TYPE (var);
f3552158 5896
33f47f42
NS
5897 if (TREE_CODE (var_type) == COMPLEX_TYPE
5898 || TREE_CODE (var_type) == REAL_TYPE)
f3552158 5899 code = VIEW_CONVERT_EXPR;
33f47f42
NS
5900
5901 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
f3552158 5902 {
33f47f42 5903 arg_type = long_long_unsigned_type_node;
f3552158 5904 fn = NVPTX_BUILTIN_CMP_SWAPLL;
f3552158
NS
5905 }
5906
33f47f42
NS
5907 tree swap_fn = nvptx_builtin_decl (fn, true);
5908
f3552158 5909 gimple_seq init_seq = NULL;
33f47f42
NS
5910 tree init_var = make_ssa_name (arg_type);
5911 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5912 init_expr = fold_build1 (code, arg_type, init_expr);
f3552158
NS
5913 gimplify_assign (init_var, init_expr, &init_seq);
5914 gimple *init_end = gimple_seq_last (init_seq);
5915
5916 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5917
f3552158
NS
5918 /* Split the block just after the init stmts. */
5919 basic_block pre_bb = gsi_bb (*gsi);
5920 edge pre_edge = split_block (pre_bb, init_end);
5921 basic_block loop_bb = pre_edge->dest;
5922 pre_bb = pre_edge->src;
5923 /* Reset the iterator. */
5924 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5925
33f47f42
NS
5926 tree expect_var = make_ssa_name (arg_type);
5927 tree actual_var = make_ssa_name (arg_type);
5928 tree write_var = make_ssa_name (arg_type);
5929
5930 /* Build and insert the reduction calculation. */
5931 gimple_seq red_seq = NULL;
5932 tree write_expr = fold_build1 (code, var_type, expect_var);
5933 write_expr = fold_build2 (op, var_type, write_expr, var);
5934 write_expr = fold_build1 (code, arg_type, write_expr);
5935 gimplify_assign (write_var, write_expr, &red_seq);
5936
5937 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5938
5939 /* Build & insert the cmp&swap sequence. */
5940 gimple_seq latch_seq = NULL;
5941 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5942 ptr, expect_var, write_var);
5943 gimplify_assign (actual_var, swap_expr, &latch_seq);
5944
5945 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5946 NULL_TREE, NULL_TREE);
5947 gimple_seq_add_stmt (&latch_seq, cond);
5948
5949 gimple *latch_end = gimple_seq_last (latch_seq);
5950 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
f3552158 5951
33f47f42
NS
5952 /* Split the block just after the latch stmts. */
5953 edge post_edge = split_block (loop_bb, latch_end);
f3552158
NS
5954 basic_block post_bb = post_edge->dest;
5955 loop_bb = post_edge->src;
5956 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5957
5958 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5 5959 post_edge->probability = profile_probability::even ();
f3552158 5960 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
72270bb5 5961 loop_edge->probability = profile_probability::even ();
f3552158
NS
5962 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5963 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5964
5965 gphi *phi = create_phi_node (expect_var, loop_bb);
5966 add_phi_arg (phi, init_var, pre_edge, loc);
5967 add_phi_arg (phi, actual_var, loop_edge, loc);
5968
5969 loop *loop = alloc_loop ();
5970 loop->header = loop_bb;
5971 loop->latch = loop_bb;
5972 add_loop (loop, loop_bb->loop_father);
5973
33f47f42
NS
5974 return fold_build1 (code, var_type, write_var);
5975}
5976
5977/* Insert code to lockfully update *PTR with *PTR OP VAR just before
5978 GSI. This is necessary for types larger than 64 bits, where there
5979 is no cmp&swap instruction to implement a lockless scheme. We use
5980 a lock variable in global memory.
5981
5982 while (cmp&swap (&lock_var, 0, 1))
5983 continue;
5984 T accum = *ptr;
5985 accum = accum OP var;
5986 *ptr = accum;
5987 cmp&swap (&lock_var, 1, 0);
5988 return accum;
5989
5990 A lock in global memory is necessary to force execution engine
5991 descheduling and avoid resource starvation that can occur if the
5992 lock is in .shared memory. */
5993
5994static tree
5995nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5996 tree ptr, tree var, tree_code op)
5997{
5998 tree var_type = TREE_TYPE (var);
5999 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6000 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6001 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6002
6003 /* Split the block just before the gsi. Insert a gimple nop to make
6004 this easier. */
6005 gimple *nop = gimple_build_nop ();
6006 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6007 basic_block entry_bb = gsi_bb (*gsi);
6008 edge entry_edge = split_block (entry_bb, nop);
6009 basic_block lock_bb = entry_edge->dest;
6010 /* Reset the iterator. */
6011 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6012
6013 /* Build and insert the locking sequence. */
6014 gimple_seq lock_seq = NULL;
6015 tree lock_var = make_ssa_name (unsigned_type_node);
6016 tree lock_expr = nvptx_global_lock_addr ();
6017 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6018 uns_unlocked, uns_locked);
6019 gimplify_assign (lock_var, lock_expr, &lock_seq);
6020 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6021 NULL_TREE, NULL_TREE);
6022 gimple_seq_add_stmt (&lock_seq, cond);
6023 gimple *lock_end = gimple_seq_last (lock_seq);
6024 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6025
6026 /* Split the block just after the lock sequence. */
6027 edge locked_edge = split_block (lock_bb, lock_end);
6028 basic_block update_bb = locked_edge->dest;
6029 lock_bb = locked_edge->src;
6030 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6031
6032 /* Create the lock loop ... */
6033 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5
TV
6034 locked_edge->probability = profile_probability::even ();
6035 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6036 loop_edge->probability = profile_probability::even ();
33f47f42
NS
6037 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6038 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6039
6040 /* ... and the loop structure. */
6041 loop *lock_loop = alloc_loop ();
6042 lock_loop->header = lock_bb;
6043 lock_loop->latch = lock_bb;
6044 lock_loop->nb_iterations_estimate = 1;
6045 lock_loop->any_estimate = true;
6046 add_loop (lock_loop, entry_bb->loop_father);
6047
6048 /* Build and insert the reduction calculation. */
6049 gimple_seq red_seq = NULL;
6050 tree acc_in = make_ssa_name (var_type);
6051 tree ref_in = build_simple_mem_ref (ptr);
6052 TREE_THIS_VOLATILE (ref_in) = 1;
6053 gimplify_assign (acc_in, ref_in, &red_seq);
6054
6055 tree acc_out = make_ssa_name (var_type);
6056 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6057 gimplify_assign (acc_out, update_expr, &red_seq);
6058
6059 tree ref_out = build_simple_mem_ref (ptr);
6060 TREE_THIS_VOLATILE (ref_out) = 1;
6061 gimplify_assign (ref_out, acc_out, &red_seq);
6062
6063 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6064
6065 /* Build & insert the unlock sequence. */
6066 gimple_seq unlock_seq = NULL;
6067 tree unlock_expr = nvptx_global_lock_addr ();
6068 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6069 uns_locked, uns_unlocked);
6070 gimplify_and_add (unlock_expr, &unlock_seq);
6071 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6072
6073 return acc_out;
6074}
6075
6076/* Emit a sequence to update a reduction accumlator at *PTR with the
6077 value held in VAR using operator OP. Return the updated value.
6078
6079 TODO: optimize for atomic ops and indepedent complex ops. */
6080
6081static tree
6082nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6083 tree ptr, tree var, tree_code op)
6084{
6085 tree type = TREE_TYPE (var);
6086 tree size = TYPE_SIZE (type);
6087
6088 if (size == TYPE_SIZE (unsigned_type_node)
6089 || size == TYPE_SIZE (long_long_unsigned_type_node))
6090 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6091 else
6092 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
6093}
6094
6095/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6096
6097static void
f881693c 6098nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
f3552158
NS
6099{
6100 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6101 tree lhs = gimple_call_lhs (call);
6102 tree var = gimple_call_arg (call, 2);
6103 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6104 gimple_seq seq = NULL;
6105
6106 push_gimplify_context (true);
6107
6108 if (level != GOMP_DIM_GANG)
6109 {
6110 /* Copy the receiver object. */
6111 tree ref_to_res = gimple_call_arg (call, 1);
6112
6113 if (!integer_zerop (ref_to_res))
6114 var = build_simple_mem_ref (ref_to_res);
6115 }
6116
f881693c
TV
6117 if (level == GOMP_DIM_WORKER
6118 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6119 {
6120 /* Store incoming value to worker reduction buffer. */
6121 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6122 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6123 level == GOMP_DIM_VECTOR);
f3552158
NS
6124 tree ptr = make_ssa_name (TREE_TYPE (call));
6125
6126 gimplify_assign (ptr, call, &seq);
6127 tree ref = build_simple_mem_ref (ptr);
6128 TREE_THIS_VOLATILE (ref) = 1;
6129 gimplify_assign (ref, var, &seq);
6130 }
6131
6132 if (lhs)
6133 gimplify_assign (lhs, var, &seq);
6134
6135 pop_gimplify_context (NULL);
6136 gsi_replace_with_seq (&gsi, seq, true);
6137}
6138
6139/* NVPTX implementation of GOACC_REDUCTION_INIT. */
6140
6141static void
f881693c 6142nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
f3552158
NS
6143{
6144 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6145 tree lhs = gimple_call_lhs (call);
6146 tree var = gimple_call_arg (call, 2);
6147 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6148 enum tree_code rcode
6149 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6150 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6151 TREE_TYPE (var));
6152 gimple_seq seq = NULL;
6153
6154 push_gimplify_context (true);
6155
f881693c 6156 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6157 {
6158 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
6159 tree tid = make_ssa_name (integer_type_node);
6160 tree dim_vector = gimple_call_arg (call, 3);
6161 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6162 dim_vector);
6163 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6164 NULL_TREE, NULL_TREE);
6165
6166 gimple_call_set_lhs (tid_call, tid);
6167 gimple_seq_add_stmt (&seq, tid_call);
6168 gimple_seq_add_stmt (&seq, cond_stmt);
6169
6170 /* Split the block just after the call. */
6171 edge init_edge = split_block (gsi_bb (gsi), call);
6172 basic_block init_bb = init_edge->dest;
6173 basic_block call_bb = init_edge->src;
6174
6175 /* Fixup flags from call_bb to init_bb. */
6176 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
8c38aed7 6177 init_edge->probability = profile_probability::even ();
f3552158
NS
6178
6179 /* Set the initialization stmts. */
6180 gimple_seq init_seq = NULL;
6181 tree init_var = make_ssa_name (TREE_TYPE (var));
6182 gimplify_assign (init_var, init, &init_seq);
6183 gsi = gsi_start_bb (init_bb);
6184 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6185
6186 /* Split block just after the init stmt. */
6187 gsi_prev (&gsi);
6188 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6189 basic_block dst_bb = inited_edge->dest;
6190
6191 /* Create false edge from call_bb to dst_bb. */
6192 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
8c38aed7 6193 nop_edge->probability = profile_probability::even ();
f3552158
NS
6194
6195 /* Create phi node in dst block. */
6196 gphi *phi = create_phi_node (lhs, dst_bb);
6197 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6198 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6199
6200 /* Reset dominator of dst bb. */
6201 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6202
6203 /* Reset the gsi. */
6204 gsi = gsi_for_stmt (call);
6205 }
6206 else
6207 {
6208 if (level == GOMP_DIM_GANG)
6209 {
6210 /* If there's no receiver object, propagate the incoming VAR. */
6211 tree ref_to_res = gimple_call_arg (call, 1);
6212 if (integer_zerop (ref_to_res))
6213 init = var;
6214 }
6215
6216 gimplify_assign (lhs, init, &seq);
6217 }
6218
6219 pop_gimplify_context (NULL);
6220 gsi_replace_with_seq (&gsi, seq, true);
6221}
6222
6223/* NVPTX implementation of GOACC_REDUCTION_FINI. */
6224
6225static void
f881693c 6226nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
f3552158
NS
6227{
6228 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6229 tree lhs = gimple_call_lhs (call);
6230 tree ref_to_res = gimple_call_arg (call, 1);
6231 tree var = gimple_call_arg (call, 2);
6232 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6233 enum tree_code op
6234 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6235 gimple_seq seq = NULL;
6236 tree r = NULL_TREE;;
6237
6238 push_gimplify_context (true);
6239
f881693c 6240 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6241 {
6242 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6243 but that requires a method of emitting a unified jump at the
6244 gimple level. */
764ecad4 6245 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
f3552158
NS
6246 {
6247 tree other_var = make_ssa_name (TREE_TYPE (var));
6248 nvptx_generate_vector_shuffle (gimple_location (call),
6249 other_var, var, shfl, &seq);
6250
6251 r = make_ssa_name (TREE_TYPE (var));
6252 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6253 var, other_var), &seq);
6254 var = r;
6255 }
6256 }
6257 else
6258 {
6259 tree accum = NULL_TREE;
6260
f881693c 6261 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
f3552158
NS
6262 {
6263 /* Get reduction buffer address. */
6264 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6265 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6266 level == GOMP_DIM_VECTOR);
f3552158
NS
6267 tree ptr = make_ssa_name (TREE_TYPE (call));
6268
6269 gimplify_assign (ptr, call, &seq);
6270 accum = ptr;
6271 }
6272 else if (integer_zerop (ref_to_res))
6273 r = var;
6274 else
6275 accum = ref_to_res;
6276
6277 if (accum)
6278 {
33f47f42 6279 /* UPDATE the accumulator. */
f3552158
NS
6280 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6281 seq = NULL;
33f47f42
NS
6282 r = nvptx_reduction_update (gimple_location (call), &gsi,
6283 accum, var, op);
f3552158
NS
6284 }
6285 }
6286
6287 if (lhs)
6288 gimplify_assign (lhs, r, &seq);
6289 pop_gimplify_context (NULL);
6290
6291 gsi_replace_with_seq (&gsi, seq, true);
6292}
6293
6294/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6295
6296static void
f881693c 6297nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
f3552158
NS
6298{
6299 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6300 tree lhs = gimple_call_lhs (call);
6301 tree var = gimple_call_arg (call, 2);
6302 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6303 gimple_seq seq = NULL;
6304
6305 push_gimplify_context (true);
f881693c
TV
6306 if (level == GOMP_DIM_WORKER
6307 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6308 {
6309 /* Read the worker reduction buffer. */
6310 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6311 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6312 level == GOMP_DIM_VECTOR);
f3552158
NS
6313 tree ptr = make_ssa_name (TREE_TYPE (call));
6314
6315 gimplify_assign (ptr, call, &seq);
6316 var = build_simple_mem_ref (ptr);
6317 TREE_THIS_VOLATILE (var) = 1;
6318 }
6319
6320 if (level != GOMP_DIM_GANG)
6321 {
6322 /* Write to the receiver object. */
6323 tree ref_to_res = gimple_call_arg (call, 1);
6324
6325 if (!integer_zerop (ref_to_res))
6326 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6327 }
6328
6329 if (lhs)
6330 gimplify_assign (lhs, var, &seq);
6331
6332 pop_gimplify_context (NULL);
6333
6334 gsi_replace_with_seq (&gsi, seq, true);
6335}
6336
6337/* NVPTX reduction expander. */
6338
5563d5c0 6339static void
f3552158
NS
6340nvptx_goacc_reduction (gcall *call)
6341{
6342 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
f881693c
TV
6343 offload_attrs oa;
6344
6345 populate_offload_attrs (&oa);
f3552158
NS
6346
6347 switch (code)
6348 {
6349 case IFN_GOACC_REDUCTION_SETUP:
f881693c 6350 nvptx_goacc_reduction_setup (call, &oa);
f3552158
NS
6351 break;
6352
6353 case IFN_GOACC_REDUCTION_INIT:
f881693c 6354 nvptx_goacc_reduction_init (call, &oa);
f3552158
NS
6355 break;
6356
6357 case IFN_GOACC_REDUCTION_FINI:
f881693c 6358 nvptx_goacc_reduction_fini (call, &oa);
f3552158
NS
6359 break;
6360
6361 case IFN_GOACC_REDUCTION_TEARDOWN:
f881693c 6362 nvptx_goacc_reduction_teardown (call, &oa);
f3552158
NS
6363 break;
6364
6365 default:
6366 gcc_unreachable ();
6367 }
6368}
6369
e053f710
TV
6370static bool
6371nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6372 rtx x ATTRIBUTE_UNUSED)
6373{
6374 return true;
6375}
6376
b98b34b7
TV
6377static bool
6378nvptx_vector_mode_supported (machine_mode mode)
6379{
3717fbe3
TV
6380 return (mode == V2SImode
6381 || mode == V2DImode);
6382}
6383
6384/* Return the preferred mode for vectorizing scalar MODE. */
6385
6386static machine_mode
005ba29c 6387nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
6388{
6389 switch (mode)
6390 {
4e10a5a7 6391 case E_DImode:
3717fbe3 6392 return V2DImode;
4e10a5a7 6393 case E_SImode:
3717fbe3
TV
6394 return V2SImode;
6395
6396 default:
6397 return default_preferred_simd_mode (mode);
6398 }
6399}
6400
6401unsigned int
6402nvptx_data_alignment (const_tree type, unsigned int basic_align)
6403{
6404 if (TREE_CODE (type) == INTEGER_TYPE)
6405 {
6406 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6407 if (size == GET_MODE_SIZE (TImode))
6408 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6409 }
6410
6411 return basic_align;
b98b34b7
TV
6412}
6413
99e1629f
RS
6414/* Implement TARGET_MODES_TIEABLE_P. */
6415
6416static bool
6417nvptx_modes_tieable_p (machine_mode, machine_mode)
6418{
6419 return false;
6420}
6421
c43f4279
RS
6422/* Implement TARGET_HARD_REGNO_NREGS. */
6423
6424static unsigned int
6425nvptx_hard_regno_nregs (unsigned int, machine_mode)
6426{
6427 return 1;
6428}
6429
0d803030
RS
6430/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6431
6432static bool
6433nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6434{
6435 return false;
6436}
6437
43be05f5
TV
6438static GTY(()) tree nvptx_previous_fndecl;
6439
6440static void
6441nvptx_set_current_function (tree fndecl)
6442{
6443 if (!fndecl || fndecl == nvptx_previous_fndecl)
6444 return;
6445
6446 nvptx_previous_fndecl = fndecl;
f881693c 6447 vector_red_partition = 0;
0024c320 6448 oacc_bcast_partition = 0;
43be05f5
TV
6449}
6450
738f2522
BS
6451#undef TARGET_OPTION_OVERRIDE
6452#define TARGET_OPTION_OVERRIDE nvptx_option_override
6453
6454#undef TARGET_ATTRIBUTE_TABLE
6455#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6456
d81db636
SB
6457#undef TARGET_LRA_P
6458#define TARGET_LRA_P hook_bool_void_false
6459
738f2522
BS
6460#undef TARGET_LEGITIMATE_ADDRESS_P
6461#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6462
6463#undef TARGET_PROMOTE_FUNCTION_MODE
6464#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6465
6466#undef TARGET_FUNCTION_ARG
6467#define TARGET_FUNCTION_ARG nvptx_function_arg
6468#undef TARGET_FUNCTION_INCOMING_ARG
6469#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6470#undef TARGET_FUNCTION_ARG_ADVANCE
6471#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
6472#undef TARGET_FUNCTION_ARG_BOUNDARY
6473#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
6474#undef TARGET_PASS_BY_REFERENCE
6475#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6476#undef TARGET_FUNCTION_VALUE_REGNO_P
6477#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6478#undef TARGET_FUNCTION_VALUE
6479#define TARGET_FUNCTION_VALUE nvptx_function_value
6480#undef TARGET_LIBCALL_VALUE
6481#define TARGET_LIBCALL_VALUE nvptx_libcall_value
6482#undef TARGET_FUNCTION_OK_FOR_SIBCALL
6483#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
6484#undef TARGET_GET_DRAP_RTX
6485#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
6486#undef TARGET_SPLIT_COMPLEX_ARG
6487#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6488#undef TARGET_RETURN_IN_MEMORY
6489#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6490#undef TARGET_OMIT_STRUCT_RETURN_REG
6491#define TARGET_OMIT_STRUCT_RETURN_REG true
6492#undef TARGET_STRICT_ARGUMENT_NAMING
6493#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
6494#undef TARGET_CALL_ARGS
6495#define TARGET_CALL_ARGS nvptx_call_args
6496#undef TARGET_END_CALL_ARGS
6497#define TARGET_END_CALL_ARGS nvptx_end_call_args
6498
6499#undef TARGET_ASM_FILE_START
6500#define TARGET_ASM_FILE_START nvptx_file_start
6501#undef TARGET_ASM_FILE_END
6502#define TARGET_ASM_FILE_END nvptx_file_end
6503#undef TARGET_ASM_GLOBALIZE_LABEL
6504#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6505#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6506#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6507#undef TARGET_PRINT_OPERAND
6508#define TARGET_PRINT_OPERAND nvptx_print_operand
6509#undef TARGET_PRINT_OPERAND_ADDRESS
6510#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6511#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6512#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6513#undef TARGET_ASM_INTEGER
6514#define TARGET_ASM_INTEGER nvptx_assemble_integer
6515#undef TARGET_ASM_DECL_END
6516#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6517#undef TARGET_ASM_DECLARE_CONSTANT_NAME
6518#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6519#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6520#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6521#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6522#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6523
6524#undef TARGET_MACHINE_DEPENDENT_REORG
6525#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6526#undef TARGET_NO_REGISTER_ALLOCATION
6527#define TARGET_NO_REGISTER_ALLOCATION true
6528
9a863523
NS
6529#undef TARGET_ENCODE_SECTION_INFO
6530#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
6531#undef TARGET_RECORD_OFFLOAD_SYMBOL
6532#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6533
738f2522
BS
6534#undef TARGET_VECTOR_ALIGNMENT
6535#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6536
d88cd9c4
NS
6537#undef TARGET_CANNOT_COPY_INSN_P
6538#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6539
a794bd20
NS
6540#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6541#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6542
f3552158
NS
6543#undef TARGET_INIT_BUILTINS
6544#define TARGET_INIT_BUILTINS nvptx_init_builtins
6545#undef TARGET_EXPAND_BUILTIN
6546#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6547#undef TARGET_BUILTIN_DECL
6548#define TARGET_BUILTIN_DECL nvptx_builtin_decl
6549
5012919d
AM
6550#undef TARGET_SIMT_VF
6551#define TARGET_SIMT_VF nvptx_simt_vf
6552
94829f87
NS
6553#undef TARGET_GOACC_VALIDATE_DIMS
6554#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6555
bd751975
NS
6556#undef TARGET_GOACC_DIM_LIMIT
6557#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6558
d88cd9c4
NS
6559#undef TARGET_GOACC_FORK_JOIN
6560#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6561
f3552158
NS
6562#undef TARGET_GOACC_REDUCTION
6563#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6564
e053f710
TV
6565#undef TARGET_CANNOT_FORCE_CONST_MEM
6566#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6567
b98b34b7
TV
6568#undef TARGET_VECTOR_MODE_SUPPORTED_P
6569#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6570
3717fbe3
TV
6571#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6572#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6573 nvptx_preferred_simd_mode
6574
99e1629f
RS
6575#undef TARGET_MODES_TIEABLE_P
6576#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6577
c43f4279
RS
6578#undef TARGET_HARD_REGNO_NREGS
6579#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6580
0d803030
RS
6581#undef TARGET_CAN_CHANGE_MODE_CLASS
6582#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6583
e335138d
TV
6584#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6585#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6586
43be05f5
TV
6587#undef TARGET_SET_CURRENT_FUNCTION
6588#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6589
738f2522
BS
6590struct gcc_target targetm = TARGET_INITIALIZER;
6591
6592#include "gt-nvptx.h"