]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
Update copyright years.
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
8d9254fc 2 Copyright (C) 2014-2020 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19 61#include "omp-low.h"
6e723923 62#include "omp-offload.h"
3e32ee19 63#include "gomp-constants.h"
d88cd9c4 64#include "dumpfile.h"
f3552158
NS
65#include "internal-fn.h"
66#include "gimple-iterator.h"
67#include "stringpool.h"
314e6352 68#include "attribs.h"
f2d4d839 69#include "tree-vrp.h"
f3552158
NS
70#include "tree-ssa-operands.h"
71#include "tree-ssanames.h"
72#include "gimplify.h"
73#include "tree-phinodes.h"
74#include "cfgloop.h"
75#include "fold-const.h"
50c620ca 76#include "intl.h"
738f2522 77
994c5d85 78/* This file should be included last. */
d58627a0
RS
79#include "target-def.h"
80
43c371e8 81#define WORKAROUND_PTXJIT_BUG 1
3dede32b 82#define WORKAROUND_PTXJIT_BUG_2 1
21251395 83#define WORKAROUND_PTXJIT_BUG_3 1
43c371e8 84
69b09a58
TV
85/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
87 2.x. */
88#define PTX_CTA_SIZE 1024
89
052aaace 90#define PTX_CTA_NUM_BARRIERS 16
5d17a476 91#define PTX_WARP_SIZE 32
69b09a58 92
052aaace
TV
93#define PTX_PER_CTA_BARRIER 0
94#define PTX_NUM_PER_CTA_BARRIERS 1
95#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97
764ecad4 98#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
2b9d9e39 99#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
693ad66b
TV
100#define PTX_WORKER_LENGTH 32
101#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5d17a476 102
9a863523
NS
/* The PTX memory areas in which an object may live.  The numeric
   values are stored in symbol flags and are used to index the table
   of area names in section_for_sym, so the order matters.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,    /* No specific area.  */
  DATA_AREA_GLOBAL,     /* ".global" */
  DATA_AREA_SHARED,     /* ".shared" */
  DATA_AREA_LOCAL,      /* ".local" */
  DATA_AREA_CONST,      /* ".const" */
  DATA_AREA_PARAM,      /* ".param" */
  DATA_AREA_MAX         /* Number of areas; not an area itself.  */
};
114
/* The data area of a symbol is recorded in the machine-dependent
   part of its SYMBOL_REF flags, in a three bit wide field.

   SYMBOL_DATA_AREA reads the field back as an nvptx_data_area.  The
   whole expansion is parenthesized so that the macro behaves as a
   single primary expression wherever it is used.

   SET_SYMBOL_DATA_AREA ORs AREA into the field; it does not clear
   bits that are already set, so it is meant to be applied to a symbol
   whose area field is still zero.  */
#define SYMBOL_DATA_AREA(SYM) \
  ((nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                      & 7))
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
121
738f2522
BS
122/* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124static std::stringstream func_decls;
f3dba894 125
6c907cff 126struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
127{
128 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
129 static bool equal (rtx a, rtx b) { return a == b; }
130};
131
132static GTY((cache))
133 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
134
6c907cff 135struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
136{
137 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
138 static bool equal (tree a, tree b) { return a == b; }
139};
140
141static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
142static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 143
1ed57fb8
TV
144/* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150static unsigned oacc_bcast_size;
0024c320 151static unsigned oacc_bcast_partition;
1ed57fb8
TV
152static unsigned oacc_bcast_align;
153static GTY(()) rtx oacc_bcast_sym;
d88cd9c4 154
f3552158
NS
155/* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157static unsigned worker_red_size;
158static unsigned worker_red_align;
f3552158
NS
159static GTY(()) rtx worker_red_sym;
160
f881693c
TV
161/* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164static unsigned vector_red_size;
165static unsigned vector_red_align;
166static unsigned vector_red_partition;
167static GTY(()) rtx vector_red_sym;
168
33f47f42
NS
169/* Global lock variable, needed for 128bit worker & gang reductions. */
170static GTY(()) tree global_lock_var;
171
5012919d
AM
172/* True if any function references __nvptx_stacks. */
173static bool need_softstack_decl;
174
175/* True if any function references __nvptx_uni. */
176static bool need_unisimt_decl;
177
0024c320
TV
178static int nvptx_mach_max_workers ();
179
738f2522
BS
180/* Allocate a new, cleared machine_function structure. */
181
182static struct machine_function *
183nvptx_init_machine_status (void)
184{
185 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 186 p->return_mode = VOIDmode;
738f2522
BS
187 return p;
188}
189
5012919d
AM
190/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
192
193static void
194diagnose_openacc_conflict (bool optval, const char *optname)
195{
196 if (flag_openacc && optval)
a3f9f006 197 error ("option %s is not supported together with %<-fopenacc%>", optname);
5012919d
AM
198}
199
738f2522
BS
200/* Implement TARGET_OPTION_OVERRIDE. */
201
202static void
203nvptx_option_override (void)
204{
205 init_machine_status = nvptx_init_machine_status;
ae578943 206
3d339d5e
AM
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set.x_flag_toplevel_reorder)
211 flag_toplevel_reorder = 1;
212
fa9afdc3
TV
213 debug_nonbind_markers_p = 0;
214
ae578943
NS
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
217 unless forced. */
218 if (!global_options_set.x_flag_no_common)
219 flag_no_common = 1;
220
cad136a4
TV
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size > 0)
223 sorry ("not generating patch area, nops not supported");
224
738f2522
BS
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking = 0;
1e5154e7 227
dba619f3
NS
228 if (nvptx_optimize < 0)
229 nvptx_optimize = optimize > 0;
230
f3dba894
TS
231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 233 declared_libfuncs_htab
f3dba894 234 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 235
1ed57fb8
TV
236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
0024c320 239 oacc_bcast_partition = 0;
f3552158 240
15ab6f00 241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d 244
f881693c
TV
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248 vector_red_partition = 0;
249
5012919d
AM
250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
253
254 if (TARGET_GOMP)
255 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
256}
257
738f2522
BS
258/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx ideosyncracies. */
260
261const char *
262nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
263{
264 switch (mode)
265 {
4e10a5a7 266 case E_BLKmode:
738f2522 267 return ".b8";
4e10a5a7 268 case E_BImode:
738f2522 269 return ".pred";
4e10a5a7 270 case E_QImode:
738f2522
BS
271 if (promote)
272 return ".u32";
273 else
274 return ".u8";
4e10a5a7 275 case E_HImode:
738f2522 276 return ".u16";
4e10a5a7 277 case E_SImode:
738f2522 278 return ".u32";
4e10a5a7 279 case E_DImode:
738f2522
BS
280 return ".u64";
281
4e10a5a7 282 case E_SFmode:
738f2522 283 return ".f32";
4e10a5a7 284 case E_DFmode:
738f2522
BS
285 return ".f64";
286
4e10a5a7 287 case E_V2SImode:
b98b34b7 288 return ".v2.u32";
4e10a5a7 289 case E_V2DImode:
3717fbe3 290 return ".v2.u64";
b98b34b7 291
738f2522
BS
292 default:
293 gcc_unreachable ();
294 }
295}
296
9a863523
NS
297/* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
7b8edc29 299
9a863523
NS
300static void
301nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 302{
9a863523
NS
303 default_encode_section_info (decl, rtl, first);
304 if (first && MEM_P (rtl))
305 {
306 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 307
9a863523
NS
308 if (TREE_CONSTANT (decl))
309 area = DATA_AREA_CONST;
310 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
311 {
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
313 {
314 area = DATA_AREA_SHARED;
315 if (DECL_INITIAL (decl))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl);
318 }
319 else
320 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
321 }
7b8edc29 322
9a863523
NS
323 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
324 }
325}
326
327/* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_seciton_info, or equivalent. */
330
331static const char *
332section_for_sym (rtx sym)
333{
334 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
338
339 return areas[area];
340}
341
342/* Similarly for a decl. */
343
344static const char *
345section_for_decl (const_tree decl)
346{
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
348}
349
b4346107
NS
/* Check NAME against a list of special function names and return the
   replacement for it, or NAME itself when it is not special.  This
   applies to malloc, free and realloc, for which we want to use
   libgcc wrappers, and to call, which triggers a bug in ptxas.  We
   can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's not active
   in an offload compiler -- the names are all set by the host-side
   compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct
  {
    const char *from;
    const char *to;
  } replacements[] =
    {
      { "call", "__nvptx_call" },
      { "malloc", "__nvptx_malloc" },
      { "free", "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (unsigned ix = 0; ix < sizeof replacements / sizeof replacements[0];
       ix++)
    if (strcmp (name, replacements[ix].from) == 0)
      return replacements[ix].to;

  return name;
}
370
d7479262
NS
371/* If MODE should be treated as two registers of an inner mode, return
372 that inner mode. Otherwise return VOIDmode. */
738f2522 373
d7479262
NS
374static machine_mode
375maybe_split_mode (machine_mode mode)
738f2522 376{
738f2522 377 if (COMPLEX_MODE_P (mode))
d7479262 378 return GET_MODE_INNER (mode);
738f2522 379
738f2522 380 if (mode == TImode)
d7479262
NS
381 return DImode;
382
383 return VOIDmode;
738f2522
BS
384}
385
38827811
TV
386/* Return true if mode should be treated as two registers. */
387
388static bool
389split_mode_p (machine_mode mode)
390{
391 return maybe_split_mode (mode) != VOIDmode;
392}
393
f313d112
NS
394/* Output a register, subreg, or register pair (with optional
395 enclosing braces). */
396
397static void
398output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
399 int subreg_offset = -1)
400{
401 if (inner_mode == VOIDmode)
402 {
403 if (HARD_REGISTER_NUM_P (regno))
404 fprintf (file, "%s", reg_names[regno]);
405 else
406 fprintf (file, "%%r%d", regno);
407 }
408 else if (subreg_offset >= 0)
409 {
410 output_reg (file, regno, VOIDmode);
411 fprintf (file, "$%d", subreg_offset);
412 }
413 else
414 {
415 if (subreg_offset == -1)
416 fprintf (file, "{");
417 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
418 fprintf (file, ",");
419 output_reg (file, regno, inner_mode, 0);
420 if (subreg_offset == -1)
421 fprintf (file, "}");
422 }
423}
424
d88cd9c4
NS
425/* Emit forking instructions for MASK. */
426
427static void
428nvptx_emit_forking (unsigned mask, bool is_call)
429{
430 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
431 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
432 if (mask)
433 {
434 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
435
436 /* Emit fork at all levels. This helps form SESE regions, as
437 it creates a block with a single successor before entering a
438 partitooned region. That is a good candidate for the end of
439 an SESE region. */
e91eba31 440 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
441 emit_insn (gen_nvptx_forked (op));
442 }
443}
444
445/* Emit joining instructions for MASK. */
446
447static void
448nvptx_emit_joining (unsigned mask, bool is_call)
449{
450 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
452 if (mask)
453 {
454 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
455
456 /* Emit joining for all non-call pars to ensure there's a single
457 predecessor for the block the join insn ends up in. This is
458 needed for skipping entire loops. */
e91eba31 459 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
460 emit_insn (gen_nvptx_join (op));
461 }
462}
463
738f2522 464\f
44eba92d
NS
465/* Determine whether MODE and TYPE (possibly NULL) should be passed or
466 returned in memory. Integer and floating types supported by the
467 machine are passed in registers, everything else is passed in
468 memory. Complex types are split. */
469
470static bool
471pass_in_memory (machine_mode mode, const_tree type, bool for_return)
472{
473 if (type)
474 {
475 if (AGGREGATE_TYPE_P (type))
476 return true;
477 if (TREE_CODE (type) == VECTOR_TYPE)
478 return true;
479 }
480
481 if (!for_return && COMPLEX_MODE_P (mode))
482 /* Complex types are passed as two underlying args. */
483 mode = GET_MODE_INNER (mode);
484
485 if (GET_MODE_CLASS (mode) != MODE_INT
486 && GET_MODE_CLASS (mode) != MODE_FLOAT)
487 return true;
488
489 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
490 return true;
491
492 return false;
493}
494
495/* A non-memory argument of mode MODE is being passed, determine the mode it
496 should be promoted to. This is also used for determining return
497 type promotion. */
498
499static machine_mode
500promote_arg (machine_mode mode, bool prototyped)
501{
502 if (!prototyped && mode == SFmode)
503 /* K&R float promotion for unprototyped functions. */
504 mode = DFmode;
505 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
506 mode = SImode;
507
508 return mode;
509}
510
511/* A non-memory return type of MODE is being returned. Determine the
512 mode it should be promoted to. */
513
514static machine_mode
515promote_return (machine_mode mode)
516{
517 return promote_arg (mode, true);
518}
519
e74f7152 520/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 521
e74f7152 522static rtx
6783fdb7 523nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
e74f7152 524{
6783fdb7 525 if (arg.end_marker_p () || !arg.named)
e74f7152 526 return NULL_RTX;
738f2522 527
6783fdb7 528 return gen_reg_rtx (arg.mode);
e74f7152
NS
529}
530
531/* Implement TARGET_FUNCTION_INCOMING_ARG. */
532
533static rtx
6783fdb7
RS
534nvptx_function_incoming_arg (cumulative_args_t cum_v,
535 const function_arg_info &arg)
738f2522 536{
e74f7152 537 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 538
6783fdb7 539 if (arg.end_marker_p () || !arg.named)
e74f7152 540 return NULL_RTX;
738f2522 541
e74f7152
NS
542 /* No need to deal with split modes here, the only case that can
543 happen is complex modes and those are dealt with by
544 TARGET_SPLIT_COMPLEX_ARG. */
6783fdb7 545 return gen_rtx_UNSPEC (arg.mode,
e74f7152
NS
546 gen_rtvec (1, GEN_INT (cum->count)),
547 UNSPEC_ARG_REG);
548}
549
550/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
551
552static void
6930c98c 553nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
e74f7152
NS
554{
555 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 556
e74f7152
NS
557 cum->count++;
558}
559
1988b2c2
NS
560/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
561
562 For nvptx This is only used for varadic args. The type has already
563 been promoted and/or converted to invisible reference. */
564
565static unsigned
566nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
567{
568 return GET_MODE_ALIGNMENT (mode);
569}
570
e74f7152
NS
571/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
572
573 For nvptx, we know how to handle functions declared as stdarg: by
574 passing an extra pointer to the unnamed arguments. However, the
575 Fortran frontend can produce a different situation, where a
576 function pointer is declared with no arguments, but the actual
577 function and calls to it take more arguments. In that case, we
578 want to ensure the call matches the definition of the function. */
579
580static bool
581nvptx_strict_argument_naming (cumulative_args_t cum_v)
582{
583 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 584
e74f7152
NS
585 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
586}
587
e74f7152
NS
588/* Implement TARGET_LIBCALL_VALUE. */
589
590static rtx
591nvptx_libcall_value (machine_mode mode, const_rtx)
592{
e91cacae 593 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
594 /* Pretend to return in a hard reg for early uses before pseudos can be
595 generated. */
596 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 597
e74f7152
NS
598 return gen_reg_rtx (mode);
599}
600
601/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
602 where function FUNC returns or receives a value of data type TYPE. */
603
604static rtx
44eba92d 605nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
606 bool outgoing)
607{
44eba92d
NS
608 machine_mode mode = promote_return (TYPE_MODE (type));
609
e74f7152 610 if (outgoing)
5c036f3f 611 {
e91cacae 612 gcc_assert (cfun);
44c068ae 613 cfun->machine->return_mode = mode;
5c036f3f
NS
614 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
615 }
b49e35a9
NS
616
617 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
618}
619
620/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
621
622static bool
623nvptx_function_value_regno_p (const unsigned int regno)
624{
625 return regno == NVPTX_RETURN_REGNUM;
626}
627
628/* Types with a mode other than those supported by the machine are passed by
629 reference in memory. */
630
631static bool
52090e4d 632nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
e74f7152 633{
52090e4d 634 return pass_in_memory (arg.mode, arg.type, false);
e74f7152
NS
635}
636
637/* Implement TARGET_RETURN_IN_MEMORY. */
638
639static bool
640nvptx_return_in_memory (const_tree type, const_tree)
641{
44eba92d 642 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
643}
644
645/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
646
647static machine_mode
648nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 649 int *ARG_UNUSED (punsignedp),
e74f7152
NS
650 const_tree funtype, int for_return)
651{
44eba92d 652 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
653}
654
e74f7152
NS
655/* Helper for write_arg. Emit a single PTX argument of MODE, either
656 in a prototype, or as copy in a function prologue. ARGNO is the
657 index of this argument in the PTX function. FOR_REG is negative,
658 if we're emitting the PTX prototype. It is zero if we're copying
659 to an argument register and it is greater than zero if we're
660 copying to a specific hard register. */
661
662static int
1f065954
NS
663write_arg_mode (std::stringstream &s, int for_reg, int argno,
664 machine_mode mode)
e74f7152
NS
665{
666 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
667
dc3d2aeb
NS
668 if (for_reg < 0)
669 {
670 /* Writing PTX prototype. */
671 s << (argno ? ", " : " (");
e74f7152 672 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
673 }
674 else
675 {
e74f7152 676 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
677 if (for_reg)
678 s << reg_names[for_reg];
679 else
680 s << "%ar" << argno;
681 s << ";\n";
5563d5c0
NS
682 if (argno >= 0)
683 {
684 s << "\tld.param" << ptx_type << " ";
685 if (for_reg)
686 s << reg_names[for_reg];
687 else
688 s << "%ar" << argno;
689 s << ", [%in_ar" << argno << "];\n";
690 }
dc3d2aeb
NS
691 }
692 return argno + 1;
738f2522
BS
693}
694
e74f7152 695/* Process function parameter TYPE to emit one or more PTX
1f065954 696 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
697 is true, if this is a prototyped function, rather than an old-style
698 C declaration. Returns the next argument number to use.
e74f7152 699
9c582551 700 The promotion behavior here must match the regular GCC function
e74f7152
NS
701 parameter marshalling machinery. */
702
703static int
1f065954
NS
704write_arg_type (std::stringstream &s, int for_reg, int argno,
705 tree type, bool prototyped)
e74f7152
NS
706{
707 machine_mode mode = TYPE_MODE (type);
708
709 if (mode == VOIDmode)
710 return argno;
711
44eba92d 712 if (pass_in_memory (mode, type, false))
e74f7152 713 mode = Pmode;
44eba92d
NS
714 else
715 {
716 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 717
44eba92d
NS
718 if (split)
719 {
720 /* Complex types are sent as two separate args. */
721 type = TREE_TYPE (type);
5563d5c0 722 mode = TYPE_MODE (type);
44eba92d
NS
723 prototyped = true;
724 }
e74f7152 725
44eba92d
NS
726 mode = promote_arg (mode, prototyped);
727 if (split)
1f065954 728 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 729 }
e74f7152 730
1f065954
NS
731 return write_arg_mode (s, for_reg, argno, mode);
732}
733
734/* Emit a PTX return as a prototype or function prologue declaration
735 for MODE. */
736
737static void
738write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
739{
740 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
741 const char *pfx = "\t.reg";
742 const char *sfx = ";\n";
743
744 if (for_proto)
745 pfx = "(.param", sfx = "_out) ";
746
747 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
748}
749
44eba92d 750/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 751 or function prologue declaration. Returns true if return is via an
9c582551 752 additional pointer parameter. The promotion behavior here must
1f065954 753 match the regular GCC function return mashalling. */
44eba92d 754
0beb7c71 755static bool
1f065954 756write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
757{
758 machine_mode mode = TYPE_MODE (type);
0beb7c71 759
44eba92d
NS
760 if (mode == VOIDmode)
761 return false;
762
763 bool return_in_mem = pass_in_memory (mode, type, true);
764
765 if (return_in_mem)
0beb7c71 766 {
44eba92d
NS
767 if (for_proto)
768 return return_in_mem;
769
770 /* Named return values can cause us to return a pointer as well
771 as expect an argument for the return location. This is
772 optimization-level specific, so no caller can make use of
773 this data, but more importantly for us, we must ensure it
774 doesn't change the PTX prototype. */
44c068ae 775 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 776
44eba92d
NS
777 if (mode == VOIDmode)
778 return return_in_mem;
779
44c068ae 780 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 781 retval parameter. */
44c068ae 782 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
783 }
784 else
44eba92d
NS
785 mode = promote_return (mode);
786
1f065954 787 write_return_mode (s, for_proto, mode);
0beb7c71
NS
788
789 return return_in_mem;
790}
791
738f2522
BS
792/* Look for attributes in ATTRS that would indicate we must write a function
793 as a .entry kernel rather than a .func. Return true if one is found. */
794
795static bool
796write_as_kernel (tree attrs)
797{
798 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
799 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
800 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
801 /* For OpenMP target regions, the corresponding kernel entry is emitted from
802 write_omp_entry as a separate function. */
738f2522
BS
803}
804
69823d76
NS
/* Emit to S a linker marker comment for the function NAME.  IS_DEFN
   selects between a definition and a declaration marker, GLOBALIZE
   adds the GLOBAL tag.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  const char *scope = globalize ? " GLOBAL" : "";
  const char *kind = is_defn ? "DEF: " : "DECL: ";

  s << "\n// BEGIN" << scope << " FUNCTION " << kind << name << "\n";
}
817
818/* Emit a linker marker for a variable decl or defn. */
819
820static void
821write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
822{
823 fprintf (file, "\n// BEGIN%s VAR %s: ",
824 globalize ? " GLOBAL" : "",
825 is_defn ? "DEF" : "DECL");
826 assemble_name_raw (file, name);
827 fputs ("\n", file);
828}
829
b699adcc
NS
830/* Write a .func or .kernel declaration or definition along with
831 a helper comment for use by ld. S is the stream to write to, DECL
832 the decl for the function with name NAME. For definitions, emit
833 a declaration too. */
738f2522 834
b699adcc
NS
835static const char *
836write_fn_proto (std::stringstream &s, bool is_defn,
837 const char *name, const_tree decl)
738f2522 838{
b699adcc
NS
839 if (is_defn)
840 /* Emit a declaration. The PTX assembler gets upset without it. */
841 name = write_fn_proto (s, false, name, decl);
b4346107
NS
842 else
843 {
844 /* Avoid repeating the name replacement. */
845 name = nvptx_name_replacement (name);
846 if (name[0] == '*')
847 name++;
848 }
738f2522 849
69823d76 850 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
851
852 /* PTX declaration. */
738f2522
BS
853 if (DECL_EXTERNAL (decl))
854 s << ".extern ";
855 else if (TREE_PUBLIC (decl))
0766660b 856 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 857 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 858
b699adcc
NS
859 tree fntype = TREE_TYPE (decl);
860 tree result_type = TREE_TYPE (fntype);
738f2522 861
ac3d02e2
NS
862 /* atomic_compare_exchange_$n builtins have an exceptional calling
863 convention. */
864 int not_atomic_weak_arg = -1;
865 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
866 switch (DECL_FUNCTION_CODE (decl))
867 {
868 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
869 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
871 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
872 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
873 /* These atomics skip the 'weak' parm in an actual library
874 call. We must skip it in the prototype too. */
875 not_atomic_weak_arg = 3;
876 break;
877
878 default:
879 break;
880 }
881
738f2522 882 /* Declare the result. */
1f065954 883 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 884
b699adcc
NS
885 s << name;
886
dc3d2aeb 887 int argno = 0;
b699adcc
NS
888
889 /* Emit argument list. */
890 if (return_in_mem)
1f065954 891 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 892
b699adcc
NS
893 /* We get:
894 NULL in TYPE_ARG_TYPES, for old-style functions
895 NULL in DECL_ARGUMENTS, for builtin functions without another
896 declaration.
897 So we have to pick the best one we have. */
898 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
899 bool prototyped = true;
900 if (!args)
901 {
902 args = DECL_ARGUMENTS (decl);
903 prototyped = false;
904 }
738f2522 905
ac3d02e2 906 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 907 {
dc3d2aeb 908 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
909
910 if (not_atomic_weak_arg)
911 argno = write_arg_type (s, -1, argno, type, prototyped);
912 else
913 gcc_assert (type == boolean_type_node);
738f2522 914 }
738f2522 915
b699adcc 916 if (stdarg_p (fntype))
1f065954 917 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 918
b699adcc 919 if (DECL_STATIC_CHAIN (decl))
1f065954 920 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 921
dc3d2aeb 922 if (!argno && strcmp (name, "main") == 0)
b699adcc 923 {
1f065954
NS
924 argno = write_arg_type (s, -1, argno, integer_type_node, true);
925 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
926 }
927
dc3d2aeb 928 if (argno)
b699adcc
NS
929 s << ")";
930
931 s << (is_defn ? "\n" : ";\n");
932
933 return name;
738f2522
BS
934}
935
00e52418
NS
936/* Construct a function declaration from a call insn. This can be
937 necessary for two reasons - either we have an indirect call which
938 requires a .callprototype declaration, or we have a libcall
939 generated by emit_library_call for which no decl exists. */
940
941static void
b699adcc
NS
942write_fn_proto_from_insn (std::stringstream &s, const char *name,
943 rtx result, rtx pat)
00e52418
NS
944{
945 if (!name)
946 {
947 s << "\t.callprototype ";
948 name = "_";
949 }
950 else
951 {
b4346107 952 name = nvptx_name_replacement (name);
69823d76 953 write_fn_marker (s, false, true, name);
00e52418
NS
954 s << "\t.extern .func ";
955 }
956
957 if (result != NULL_RTX)
1f065954 958 write_return_mode (s, true, GET_MODE (result));
00e52418
NS
959
960 s << name;
961
00e52418
NS
962 int arg_end = XVECLEN (pat, 0);
963 for (int i = 1; i < arg_end; i++)
964 {
1f065954
NS
965 /* We don't have to deal with mode splitting & promotion here,
966 as that was already done when generating the call
967 sequence. */
00e52418
NS
968 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
969
1f065954 970 write_arg_mode (s, -1, i - 1, mode);
00e52418
NS
971 }
972 if (arg_end != 1)
973 s << ")";
974 s << ";\n";
975}
976
00e52418
NS
977/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
978 table and and write a ptx prototype. These are emitted at end of
979 compilation. */
738f2522 980
00e52418
NS
981static void
982nvptx_record_fndecl (tree decl)
738f2522 983{
f3dba894 984 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
738f2522
BS
985 if (*slot == NULL)
986 {
987 *slot = decl;
988 const char *name = get_fnname_from_decl (decl);
b699adcc 989 write_fn_proto (func_decls, false, name, decl);
738f2522 990 }
738f2522
BS
991}
992
00e52418
NS
993/* Record a libcall or unprototyped external function. CALLEE is the
994 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
995 declaration for it. */
996
997static void
998nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
999{
1000 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1001 if (*slot == NULL)
1002 {
1003 *slot = callee;
1004
1005 const char *name = XSTR (callee, 0);
b699adcc 1006 write_fn_proto_from_insn (func_decls, name, retval, pat);
00e52418
NS
1007 }
1008}
1009
1010/* DECL is an external FUNCTION_DECL, that we're referencing. If it
1011 is prototyped, record it now. Otherwise record it as needed at end
1012 of compilation, when we might have more information about it. */
738f2522
BS
1013
1014void
1015nvptx_record_needed_fndecl (tree decl)
1016{
00e52418
NS
1017 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1018 {
1019 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1020 if (*slot == NULL)
1021 *slot = decl;
1022 }
1023 else
1024 nvptx_record_fndecl (decl);
1025}
738f2522 1026
00e52418
NS
1027/* SYM is a SYMBOL_REF. If it refers to an external function, record
1028 it as needed. */
1029
1030static void
1031nvptx_maybe_record_fnsym (rtx sym)
1032{
1033 tree decl = SYMBOL_REF_DECL (sym);
1034
1035 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1036 nvptx_record_needed_fndecl (decl);
738f2522
BS
1037}
1038
1f065954 1039/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
1040 and initialize REGNO to point to it. If the size is zero, it'll
1041 never be valid to dereference, so we can simply initialize to
1042 zero. */
1f065954
NS
1043
1044static void
1045init_frame (FILE *file, int regno, unsigned align, unsigned size)
1046{
5563d5c0
NS
1047 if (size)
1048 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1049 align, reg_names[regno], size);
1050 fprintf (file, "\t.reg.u%d %s;\n",
1051 POINTER_SIZE, reg_names[regno]);
1052 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1053 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
1054 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1055}
1056
5012919d
AM
1057/* Emit soft stack frame setup sequence. */
1058
1059static void
1060init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1061{
1062 /* Maintain 64-bit stack alignment. */
1063 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1064 size = ROUND_UP (size, keep_align);
1065 int bits = POINTER_SIZE;
1066 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1067 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1068 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1069 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1070 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1071 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1072 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1073 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1074 fprintf (file, "\t{\n");
1075 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1076 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1077 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1078 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1079 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1080 bits == 64 ? ".wide" : ".lo", bits / 8);
1081 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1082
1083 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1084 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1085
1086 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1087 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1088 bits, reg_sspprev, reg_sspslot);
1089
1090 /* Initialize %frame = %sspprev - size. */
1091 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1092 bits, reg_frame, reg_sspprev, size);
1093
1094 /* Apply alignment, if larger than 64. */
1095 if (alignment > keep_align)
1096 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1097 bits, reg_frame, reg_frame, -alignment);
1098
1099 size = crtl->outgoing_args_size;
1100 gcc_assert (size % keep_align == 0);
1101
1102 /* Initialize %stack. */
1103 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1104 bits, reg_stack, reg_frame, size);
1105
5012919d
AM
1106 if (!crtl->is_leaf)
1107 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1108 bits, reg_sspslot, reg_stack);
1109 fprintf (file, "\t}\n");
1110 cfun->machine->has_softstack = true;
1111 need_softstack_decl = true;
1112}
1113
d88cd9c4
NS
1114/* Emit code to initialize the REGNO predicate register to indicate
1115 whether we are not lane zero on the NAME axis. */
1116
1117static void
1118nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1119{
1120 fprintf (file, "\t{\n");
1121 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
f881693c
TV
1122 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1123 {
1124 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1125 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1126 }
d88cd9c4
NS
1127 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1128 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
f881693c
TV
1129 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1130 {
1131 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1132 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1133 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1134 "// vector reduction buffer\n",
1135 REGNO (cfun->machine->red_partition),
1136 vector_red_partition);
1137 }
1138 /* Verify vector_red_size. */
1139 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1140 <= vector_red_size);
d88cd9c4
NS
1141 fprintf (file, "\t}\n");
1142}
1143
0024c320
TV
1144/* Emit code to initialize OpenACC worker broadcast and synchronization
1145 registers. */
1146
1147static void
1148nvptx_init_oacc_workers (FILE *file)
1149{
1150 fprintf (file, "\t{\n");
1151 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1152 if (cfun->machine->bcast_partition)
1153 {
1154 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1155 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1156 }
1157 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1158 if (cfun->machine->bcast_partition)
1159 {
1160 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1161 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1162 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1163 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1164 "// vector broadcast offset\n",
1165 REGNO (cfun->machine->bcast_partition),
1166 oacc_bcast_partition);
1167 }
1168 /* Verify oacc_bcast_size. */
1169 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1170 <= oacc_bcast_size);
1171 if (cfun->machine->sync_bar)
1172 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1173 "// vector synchronization barrier\n",
1174 REGNO (cfun->machine->sync_bar));
1175 fprintf (file, "\t}\n");
1176}
1177
5012919d
AM
1178/* Emit code to initialize predicate and master lane index registers for
1179 -muniform-simt code generation variant. */
1180
1181static void
1182nvptx_init_unisimt_predicate (FILE *file)
1183{
0c6b03b5
AM
1184 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1185 int loc = REGNO (cfun->machine->unisimt_location);
5012919d 1186 int bits = POINTER_SIZE;
0c6b03b5 1187 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
5012919d
AM
1188 fprintf (file, "\t{\n");
1189 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1190 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
5012919d
AM
1191 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1192 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1193 bits == 64 ? ".wide" : ".lo");
0c6b03b5
AM
1194 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1195 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1196 if (cfun->machine->unisimt_predicate)
1197 {
1198 int master = REGNO (cfun->machine->unisimt_master);
1199 int pred = REGNO (cfun->machine->unisimt_predicate);
1200 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1201 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1202 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1203 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1204 /* Compute predicate as 'tid.x == master'. */
1205 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1206 }
5012919d
AM
1207 fprintf (file, "\t}\n");
1208 need_unisimt_decl = true;
1209}
1210
1211/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1212
1213 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1214 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1215 {
1216 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1217 __nvptx_uni[tid.y] = 0;
1218 gomp_nvptx_main (ORIG, arg);
1219 }
1220 ORIG itself should not be emitted as a PTX .entry function. */
1221
1222static void
1223write_omp_entry (FILE *file, const char *name, const char *orig)
1224{
1225 static bool gomp_nvptx_main_declared;
1226 if (!gomp_nvptx_main_declared)
1227 {
1228 gomp_nvptx_main_declared = true;
1229 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1230 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1231 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1232 }
3c5b4c6d
TS
1233 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1234#define NTID_Y "%ntid.y"
5012919d
AM
1235#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1236 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1237{\n\
1238 .reg.u32 %r<3>;\n\
1239 .reg.u" PS " %R<4>;\n\
1240 mov.u32 %r0, %tid.y;\n\
3c5b4c6d 1241 mov.u32 %r1, " NTID_Y ";\n\
5012919d
AM
1242 mov.u32 %r2, %ctaid.x;\n\
1243 cvt.u" PS ".u32 %R1, %r0;\n\
1244 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1245 mov.u" PS " %R0, __nvptx_stacks;\n\
1246 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1247 ld.param.u" PS " %R2, [%stack];\n\
1248 ld.param.u" PS " %R3, [%sz];\n\
1249 add.u" PS " %R2, %R2, %R3;\n\
1250 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1251 st.shared.u" PS " [%R0], %R2;\n\
1252 mov.u" PS " %R0, __nvptx_uni;\n\
1253 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1254 mov.u32 %r0, 0;\n\
1255 st.shared.u32 [%R0], %r0;\n\
1256 mov.u" PS " %R0, \0;\n\
1257 ld.param.u" PS " %R1, [%arg];\n\
1258 {\n\
1259 .param.u" PS " %P<2>;\n\
1260 st.param.u" PS " [%P0], %R0;\n\
1261 st.param.u" PS " [%P1], %R1;\n\
1262 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1263 }\n\
1264 ret.uni;\n\
1265}\n"
1266 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1267 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1268#undef ENTRY_TEMPLATE
3c5b4c6d 1269#undef NTID_Y
5012919d
AM
1270 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1271 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1272 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1273 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1274 need_softstack_decl = need_unisimt_decl = true;
1275}
1276
738f2522
BS
1277/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1278 function, including local var decls and copies from the arguments to
1279 local regs. */
1280
1281void
1282nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1283{
1284 tree fntype = TREE_TYPE (decl);
1285 tree result_type = TREE_TYPE (fntype);
dc3d2aeb 1286 int argno = 0;
738f2522 1287
5012919d
AM
1288 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1289 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1290 {
1291 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1292 sprintf (buf, "%s$impl", name);
1293 write_omp_entry (file, name, buf);
1294 name = buf;
1295 }
dc3d2aeb
NS
1296 /* We construct the initial part of the function into a string
1297 stream, in order to share the prototype writing code. */
738f2522 1298 std::stringstream s;
b699adcc 1299 write_fn_proto (s, true, name, decl);
dc3d2aeb 1300 s << "{\n";
738f2522 1301
1f065954 1302 bool return_in_mem = write_return_type (s, false, result_type);
738f2522 1303 if (return_in_mem)
1f065954 1304 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
dc3d2aeb 1305
5ab662d5 1306 /* Declare and initialize incoming arguments. */
dc3d2aeb
NS
1307 tree args = TYPE_ARG_TYPES (fntype);
1308 bool prototyped = true;
1309 if (!args)
5ab662d5 1310 {
dc3d2aeb
NS
1311 args = DECL_ARGUMENTS (decl);
1312 prototyped = false;
5ab662d5
NS
1313 }
1314
1315 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1316 {
1317 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
5ab662d5 1318
1f065954 1319 argno = write_arg_type (s, 0, argno, type, prototyped);
dc3d2aeb 1320 }
5ab662d5 1321
dc3d2aeb 1322 if (stdarg_p (fntype))
5563d5c0 1323 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1f065954 1324 true);
5ab662d5 1325
5563d5c0
NS
1326 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1327 write_arg_type (s, STATIC_CHAIN_REGNUM,
1328 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1329 true);
1330
dc3d2aeb 1331 fprintf (file, "%s", s.str().c_str());
25662751 1332
0c6b03b5
AM
1333 /* Usually 'crtl->is_leaf' is computed during register allocator
1334 initialization (which is not done on NVPTX) or for pressure-sensitive
1335 optimizations. Initialize it here, except if already set. */
1336 if (!crtl->is_leaf)
1337 crtl->is_leaf = leaf_function_p ();
1338
44c068ae 1339 HOST_WIDE_INT sz = get_frame_size ();
5012919d
AM
1340 bool need_frameptr = sz || cfun->machine->has_chain;
1341 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1342 if (!TARGET_SOFT_STACK)
1343 {
1344 /* Declare a local var for outgoing varargs. */
1345 if (cfun->machine->has_varadic)
1346 init_frame (file, STACK_POINTER_REGNUM,
1347 UNITS_PER_WORD, crtl->outgoing_args_size);
1348
1349 /* Declare a local variable for the frame. Force its size to be
1350 DImode-compatible. */
1351 if (need_frameptr)
1352 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1353 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1354 }
0c6b03b5
AM
1355 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1356 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1357 init_softstack_frame (file, alignment, sz);
44c068ae 1358
0c6b03b5
AM
1359 if (cfun->machine->has_simtreg)
1360 {
1361 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1362 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1363 align = MAX (align, GET_MODE_SIZE (DImode));
1364 if (!crtl->is_leaf || cfun->calls_alloca)
1365 simtsz = HOST_WIDE_INT_M1U;
1366 if (simtsz == HOST_WIDE_INT_M1U)
1367 simtsz = nvptx_softstack_size;
1368 if (cfun->machine->has_softstack)
1369 simtsz += POINTER_SIZE / 8;
1370 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1371 if (align > GET_MODE_SIZE (DImode))
1372 simtsz += align - GET_MODE_SIZE (DImode);
1373 if (simtsz)
1374 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1375 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1376 }
f881693c
TV
1377
1378 /* Restore the vector reduction partition register, if necessary.
1379 FIXME: Find out when and why this is necessary, and fix it. */
1380 if (cfun->machine->red_partition)
1381 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1382 = cfun->machine->red_partition;
1383
738f2522
BS
1384 /* Declare the pseudos we have as ptx registers. */
1385 int maxregs = max_reg_num ();
1386 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1387 {
1388 if (regno_reg_rtx[i] != const0_rtx)
1389 {
1390 machine_mode mode = PSEUDO_REGNO_MODE (i);
d7479262 1391 machine_mode split = maybe_split_mode (mode);
f313d112 1392
38827811 1393 if (split_mode_p (mode))
f313d112
NS
1394 mode = split;
1395 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1396 output_reg (file, i, split, -2);
1397 fprintf (file, ";\n");
738f2522
BS
1398 }
1399 }
1400
d88cd9c4
NS
1401 /* Emit axis predicates. */
1402 if (cfun->machine->axis_predicate[0])
1403 nvptx_init_axis_predicate (file,
1404 REGNO (cfun->machine->axis_predicate[0]), "y");
1405 if (cfun->machine->axis_predicate[1])
1406 nvptx_init_axis_predicate (file,
1407 REGNO (cfun->machine->axis_predicate[1]), "x");
0c6b03b5
AM
1408 if (cfun->machine->unisimt_predicate
1409 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1410 nvptx_init_unisimt_predicate (file);
0024c320
TV
1411 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1412 nvptx_init_oacc_workers (file);
738f2522
BS
1413}
1414
0c6b03b5
AM
1415/* Output code for switching uniform-simt state. ENTERING indicates whether
1416 we are entering or leaving non-uniform execution region. */
1417
1418static void
1419nvptx_output_unisimt_switch (FILE *file, bool entering)
1420{
1421 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1422 return;
1423 fprintf (file, "\t{\n");
1424 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1425 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1426 if (!crtl->is_leaf)
1427 {
1428 int loc = REGNO (cfun->machine->unisimt_location);
1429 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1430 }
1431 if (cfun->machine->unisimt_predicate)
1432 {
1433 int master = REGNO (cfun->machine->unisimt_master);
1434 int pred = REGNO (cfun->machine->unisimt_predicate);
1435 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1436 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1437 master, entering ? "%ustmp2" : "0");
1438 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1439 }
1440 fprintf (file, "\t}\n");
1441}
1442
1443/* Output code for allocating per-lane storage and switching soft-stack pointer.
1444 ENTERING indicates whether we are entering or leaving non-uniform execution.
1445 PTR is the register pointing to allocated storage, it is assigned to on
1446 entering and used to restore state on leaving. SIZE and ALIGN are used only
1447 on entering. */
1448
1449static void
1450nvptx_output_softstack_switch (FILE *file, bool entering,
1451 rtx ptr, rtx size, rtx align)
1452{
1453 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1454 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1455 return;
1456 int bits = POINTER_SIZE, regno = REGNO (ptr);
1457 fprintf (file, "\t{\n");
1458 if (entering)
1459 {
1460 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1461 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1462 cfun->machine->simt_stack_size);
1463 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1464 if (CONST_INT_P (size))
1465 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1466 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1467 else
1468 output_reg (file, REGNO (size), VOIDmode);
1469 fputs (";\n", file);
1470 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
77734296 1471 fprintf (file,
ce9c4ec3 1472 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
0c6b03b5
AM
1473 bits, regno, regno, UINTVAL (align));
1474 }
1475 if (cfun->machine->has_softstack)
1476 {
1477 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1478 if (entering)
1479 {
1480 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1481 bits, regno, bits / 8, reg_stack);
1482 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1483 bits, reg_stack, regno, bits / 8);
1484 }
1485 else
1486 {
1487 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1488 bits, reg_stack, regno, bits / 8);
1489 }
1490 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1491 }
1492 fprintf (file, "\t}\n");
1493}
1494
1495/* Output code to enter non-uniform execution region. DEST is a register
1496 to hold a per-lane allocation given by SIZE and ALIGN. */
1497
1498const char *
1499nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1500{
1501 nvptx_output_unisimt_switch (asm_out_file, true);
1502 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1503 return "";
1504}
1505
1506/* Output code to leave non-uniform execution region. SRC is the register
1507 holding per-lane storage previously allocated by omp_simt_enter insn. */
1508
1509const char *
1510nvptx_output_simt_exit (rtx src)
1511{
1512 nvptx_output_unisimt_switch (asm_out_file, false);
1513 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1514 return "";
1515}
1516
5012919d
AM
1517/* Output instruction that sets soft stack pointer in shared memory to the
1518 value in register given by SRC_REGNO. */
1519
1520const char *
1521nvptx_output_set_softstack (unsigned src_regno)
1522{
1523 if (cfun->machine->has_softstack && !crtl->is_leaf)
1524 {
1525 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1526 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1527 output_reg (asm_out_file, src_regno, VOIDmode);
1528 fprintf (asm_out_file, ";\n");
1529 }
1530 return "";
1531}
738f2522
BS
1532/* Output a return instruction. Also copy the return value to its outgoing
1533 location. */
1534
1535const char *
1536nvptx_output_return (void)
1537{
44c068ae 1538 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1539
1540 if (mode != VOIDmode)
1f065954
NS
1541 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1542 nvptx_ptx_type_from_mode (mode, false),
1543 reg_names[NVPTX_RETURN_REGNUM],
1544 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1545
1546 return "ret;";
1547}
1548
738f2522
BS
/* Finish a function body: write the closing brace and a newline to
   FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1556\f
1557/* Decide whether we can make a sibling call to a function. For ptx, we
1558 can't. */
1559
1560static bool
1561nvptx_function_ok_for_sibcall (tree, tree)
1562{
1563 return false;
1564}
1565
18c05628
NS
1566/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1567
1568static rtx
1569nvptx_get_drap_rtx (void)
1570{
5012919d
AM
1571 if (TARGET_SOFT_STACK && stack_realign_drap)
1572 return arg_pointer_rtx;
18c05628
NS
1573 return NULL_RTX;
1574}
1575
738f2522
BS
1576/* Implement the TARGET_CALL_ARGS hook. Record information about one
1577 argument to the next call. */
1578
1579static void
44c068ae 1580nvptx_call_args (rtx arg, tree fntype)
738f2522 1581{
44c068ae 1582 if (!cfun->machine->doing_call)
738f2522 1583 {
44c068ae
NS
1584 cfun->machine->doing_call = true;
1585 cfun->machine->is_varadic = false;
1586 cfun->machine->num_args = 0;
1587
1588 if (fntype && stdarg_p (fntype))
1589 {
1590 cfun->machine->is_varadic = true;
1591 cfun->machine->has_varadic = true;
1592 cfun->machine->num_args++;
1593 }
738f2522 1594 }
738f2522 1595
44c068ae
NS
1596 if (REG_P (arg) && arg != pc_rtx)
1597 {
1598 cfun->machine->num_args++;
1599 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1600 cfun->machine->call_args);
1601 }
738f2522
BS
1602}
1603
1604/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1605 information we recorded. */
1606
1607static void
1608nvptx_end_call_args (void)
1609{
44c068ae 1610 cfun->machine->doing_call = false;
738f2522
BS
1611 free_EXPR_LIST_list (&cfun->machine->call_args);
1612}
1613
ecf6e535
BS
1614/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1615 track of whether calls involving static chains or varargs were seen
1616 in the current function.
1617 For libcalls, maintain a hash table of decls we have seen, and
1618 record a function decl for later when encountering a new one. */
738f2522
BS
1619
1620void
1621nvptx_expand_call (rtx retval, rtx address)
1622{
738f2522 1623 rtx callee = XEXP (address, 0);
f324806d 1624 rtx varargs = NULL_RTX;
d88cd9c4 1625 unsigned parallel = 0;
738f2522 1626
738f2522
BS
1627 if (!call_insn_operand (callee, Pmode))
1628 {
1629 callee = force_reg (Pmode, callee);
1630 address = change_address (address, QImode, callee);
1631 }
1632
1633 if (GET_CODE (callee) == SYMBOL_REF)
1634 {
1635 tree decl = SYMBOL_REF_DECL (callee);
1636 if (decl != NULL_TREE)
1637 {
738f2522 1638 if (DECL_STATIC_CHAIN (decl))
44c068ae 1639 cfun->machine->has_chain = true;
00e52418 1640
629b3d75 1641 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1642 if (attr)
1643 {
1644 tree dims = TREE_VALUE (attr);
1645
1646 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1647 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1648 {
1649 if (TREE_PURPOSE (dims)
1650 && !integer_zerop (TREE_PURPOSE (dims)))
1651 break;
1652 /* Not on this axis. */
1653 parallel ^= GOMP_DIM_MASK (ix);
1654 dims = TREE_CHAIN (dims);
1655 }
1656 }
738f2522
BS
1657 }
1658 }
c38f0d8c 1659
44c068ae
NS
1660 unsigned nargs = cfun->machine->num_args;
1661 if (cfun->machine->is_varadic)
738f2522 1662 {
f324806d 1663 varargs = gen_reg_rtx (Pmode);
863af9a4 1664 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1665 }
1666
44c068ae
NS
1667 rtvec vec = rtvec_alloc (nargs + 1);
1668 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1669 int vec_pos = 0;
44c068ae
NS
1670
1671 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1672 rtx tmp_retval = retval;
44c068ae 1673 if (retval)
738f2522
BS
1674 {
1675 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1676 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1677 call = gen_rtx_SET (tmp_retval, call);
738f2522 1678 }
44c068ae 1679 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1680
1681 /* Construct the call insn, including a USE for each argument pseudo
1682 register. These will be used when printing the insn. */
1683 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1684 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1685
1686 if (varargs)
cf08c344 1687 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1688
1689 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1690
d88cd9c4 1691 nvptx_emit_forking (parallel, true);
738f2522 1692 emit_call_insn (pat);
d88cd9c4
NS
1693 nvptx_emit_joining (parallel, true);
1694
738f2522
BS
1695 if (tmp_retval != retval)
1696 emit_move_insn (retval, tmp_retval);
1697}
44eba92d 1698
738f2522
BS
1699/* Emit a comparison COMPARE, and return the new test to be used in the
1700 jump. */
1701
1702rtx
1703nvptx_expand_compare (rtx compare)
1704{
1705 rtx pred = gen_reg_rtx (BImode);
1706 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1707 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1708 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1709 return gen_rtx_NE (BImode, pred, const0_rtx);
1710}
1711
d88cd9c4
NS
1712/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1713
1714void
1715nvptx_expand_oacc_fork (unsigned mode)
1716{
1717 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1718}
1719
1720void
1721nvptx_expand_oacc_join (unsigned mode)
1722{
1723 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1724}
1725
1726/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1727 objects. */
1728
1729static rtx
1730nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1731{
1732 rtx res;
1733
1734 switch (GET_MODE (src))
1735 {
4e10a5a7 1736 case E_DImode:
d88cd9c4
NS
1737 res = gen_unpackdisi2 (dst0, dst1, src);
1738 break;
4e10a5a7 1739 case E_DFmode:
d88cd9c4
NS
1740 res = gen_unpackdfsi2 (dst0, dst1, src);
1741 break;
1742 default: gcc_unreachable ();
1743 }
1744 return res;
1745}
1746
1747/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1748 object. */
1749
1750static rtx
1751nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1752{
1753 rtx res;
1754
1755 switch (GET_MODE (dst))
1756 {
4e10a5a7 1757 case E_DImode:
d88cd9c4
NS
1758 res = gen_packsidi2 (dst, src0, src1);
1759 break;
4e10a5a7 1760 case E_DFmode:
d88cd9c4
NS
1761 res = gen_packsidf2 (dst, src0, src1);
1762 break;
1763 default: gcc_unreachable ();
1764 }
1765 return res;
1766}
1767
1768/* Generate an instruction or sequence to broadcast register REG
1769 across the vectors of a single warp. */
1770
5012919d 1771rtx
59263259 1772nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
d88cd9c4
NS
1773{
1774 rtx res;
1775
1776 switch (GET_MODE (dst))
1777 {
4e10a5a7 1778 case E_SImode:
d88cd9c4
NS
1779 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1780 break;
4e10a5a7 1781 case E_SFmode:
d88cd9c4
NS
1782 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1783 break;
4e10a5a7
RS
1784 case E_DImode:
1785 case E_DFmode:
d88cd9c4
NS
1786 {
1787 rtx tmp0 = gen_reg_rtx (SImode);
1788 rtx tmp1 = gen_reg_rtx (SImode);
1789
1790 start_sequence ();
1791 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1792 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1793 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1794 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1795 res = get_insns ();
1796 end_sequence ();
1797 }
1798 break;
4e10a5a7 1799 case E_BImode:
d88cd9c4
NS
1800 {
1801 rtx tmp = gen_reg_rtx (SImode);
1802
1803 start_sequence ();
1804 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1805 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1806 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1807 res = get_insns ();
1808 end_sequence ();
1809 }
1810 break;
4e10a5a7
RS
1811 case E_QImode:
1812 case E_HImode:
d5ace3b5
CP
1813 {
1814 rtx tmp = gen_reg_rtx (SImode);
1815
1816 start_sequence ();
1817 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1818 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1819 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1820 tmp)));
1821 res = get_insns ();
1822 end_sequence ();
1823 }
1824 break;
d88cd9c4
NS
1825
1826 default:
1827 gcc_unreachable ();
1828 }
1829 return res;
1830}
1831
1832/* Generate an instruction or sequence to broadcast register REG
1833 across the vectors of a single warp. */
1834
1835static rtx
a0b3b5c4 1836nvptx_gen_warp_bcast (rtx reg)
d88cd9c4
NS
1837{
1838 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1839}
1840
1841/* Structure used when generating a worker-level spill or fill. */
1842
1ed57fb8 1843struct broadcast_data_t
d88cd9c4
NS
1844{
1845 rtx base; /* Register holding base addr of buffer. */
1846 rtx ptr; /* Iteration var, if needed. */
1847 unsigned offset; /* Offset into worker buffer. */
1848};
1849
/* Bit flags giving the direction of a spill/fill and indicating loop
   setup or teardown.  */

enum propagate_mask
  {
    /* Direction bits.  */
    PM_read = 1 << 0,
    PM_write = 1 << 1,

    /* Looping setup/teardown bits.  */
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    /* Both directions.  */
    PM_read_write = PM_read | PM_write
  };
1861
1862/* Generate instruction(s) to spill or fill register REG to/from the
1863 worker broadcast array. PM indicates what is to be done, REP
1864 how many loop iterations will be executed (0 for not a loop). */
1865
1866static rtx
a0b3b5c4
TV
1867nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1868 broadcast_data_t *data, bool vector)
d88cd9c4
NS
1869{
1870 rtx res;
1871 machine_mode mode = GET_MODE (reg);
1872
1873 switch (mode)
1874 {
4e10a5a7 1875 case E_BImode:
d88cd9c4
NS
1876 {
1877 rtx tmp = gen_reg_rtx (SImode);
1878
1879 start_sequence ();
1880 if (pm & PM_read)
1881 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
a0b3b5c4 1882 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
d88cd9c4
NS
1883 if (pm & PM_write)
1884 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1885 res = get_insns ();
1886 end_sequence ();
1887 }
1888 break;
1889
1890 default:
1891 {
1892 rtx addr = data->ptr;
1893
1894 if (!addr)
1895 {
1896 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1897
3c55d60f
TV
1898 oacc_bcast_align = MAX (oacc_bcast_align, align);
1899 data->offset = ROUND_UP (data->offset, align);
d88cd9c4 1900 addr = data->base;
a0b3b5c4 1901 gcc_assert (data->base != NULL);
d88cd9c4
NS
1902 if (data->offset)
1903 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1904 }
1905
1906 addr = gen_rtx_MEM (mode, addr);
d88cd9c4
NS
1907 if (pm == PM_read)
1908 res = gen_rtx_SET (addr, reg);
1909 else if (pm == PM_write)
1910 res = gen_rtx_SET (reg, addr);
1911 else
1912 gcc_unreachable ();
1913
1914 if (data->ptr)
1915 {
1916 /* We're using a ptr, increment it. */
1917 start_sequence ();
1918
1919 emit_insn (res);
1920 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1921 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1922 res = get_insns ();
1923 end_sequence ();
1924 }
1925 else
1926 rep = 1;
1927 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1928 }
1929 break;
1930 }
1931 return res;
1932}
738f2522
BS
1933\f
1934/* Returns true if X is a valid address for use in a memory reference. */
1935
1936static bool
1937nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1938{
1939 enum rtx_code code = GET_CODE (x);
1940
1941 switch (code)
1942 {
1943 case REG:
1944 return true;
1945
1946 case PLUS:
1947 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1948 return true;
1949 return false;
1950
1951 case CONST:
1952 case SYMBOL_REF:
1953 case LABEL_REF:
1954 return true;
1955
1956 default:
1957 return false;
1958 }
1959}
738f2522 1960\f
4d5438cd
NS
1961/* Machinery to output constant initializers. When beginning an
1962 initializer, we decide on a fragment size (which is visible in ptx
1963 in the type used), and then all initializer data is buffered until
1964 a fragment is filled and ready to be written out. */
1965
1966static struct
1967{
1968 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1969 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1970 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1971 out. */
1972 unsigned size; /* Fragment size to accumulate. */
1973 unsigned offset; /* Offset within current fragment. */
1974 bool started; /* Whether we've output any initializer. */
1975} init_frag;
1976
1977/* The current fragment is full, write it out. SYM may provide a
1978 symbolic reference we should output, in which case the fragment
1979 value is the addend. */
738f2522
BS
1980
1981static void
4d5438cd 1982output_init_frag (rtx sym)
738f2522 1983{
4d5438cd
NS
1984 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1985 unsigned HOST_WIDE_INT val = init_frag.val;
738f2522 1986
4d5438cd
NS
1987 init_frag.started = true;
1988 init_frag.val = 0;
1989 init_frag.offset = 0;
1990 init_frag.remaining--;
1991
1992 if (sym)
1993 {
6c7c4708
CP
1994 bool function = (SYMBOL_REF_DECL (sym)
1995 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1996 if (!function)
1997 fprintf (asm_out_file, "generic(");
4d5438cd 1998 output_address (VOIDmode, sym);
6c7c4708
CP
1999 if (!function)
2000 fprintf (asm_out_file, ")");
2001 if (val)
2002 fprintf (asm_out_file, " + ");
4d5438cd 2003 }
738f2522 2004
4d5438cd
NS
2005 if (!sym || val)
2006 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
738f2522
BS
2007}
2008
4d5438cd
NS
2009/* Add value VAL of size SIZE to the data we're emitting, and keep
2010 writing out chunks as they fill up. */
738f2522
BS
2011
2012static void
4d5438cd 2013nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 2014{
4d5438cd
NS
2015 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2016
2017 for (unsigned part = 0; size; size -= part)
738f2522 2018 {
4d5438cd
NS
2019 val >>= part * BITS_PER_UNIT;
2020 part = init_frag.size - init_frag.offset;
3c55d60f 2021 part = MIN (part, size);
4d5438cd
NS
2022
2023 unsigned HOST_WIDE_INT partial
2024 = val << (init_frag.offset * BITS_PER_UNIT);
2025 init_frag.val |= partial & init_frag.mask;
2026 init_frag.offset += part;
2027
2028 if (init_frag.offset == init_frag.size)
2029 output_init_frag (NULL);
738f2522
BS
2030 }
2031}
2032
2033/* Target hook for assembling integer object X of size SIZE. */
2034
2035static bool
2036nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2037{
00e52418
NS
2038 HOST_WIDE_INT val = 0;
2039
2040 switch (GET_CODE (x))
738f2522 2041 {
00e52418 2042 default:
a9000e1e
NS
2043 /* Let the generic machinery figure it out, usually for a
2044 CONST_WIDE_INT. */
2045 return false;
00e52418
NS
2046
2047 case CONST_INT:
4d5438cd 2048 nvptx_assemble_value (INTVAL (x), size);
00e52418
NS
2049 break;
2050
2051 case CONST:
2052 x = XEXP (x, 0);
2053 gcc_assert (GET_CODE (x) == PLUS);
2054 val = INTVAL (XEXP (x, 1));
2055 x = XEXP (x, 0);
2056 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2057 /* FALLTHROUGH */
2058
2059 case SYMBOL_REF:
4d5438cd
NS
2060 gcc_assert (size == init_frag.size);
2061 if (init_frag.offset)
738f2522 2062 sorry ("cannot emit unaligned pointers in ptx assembly");
738f2522 2063
00e52418 2064 nvptx_maybe_record_fnsym (x);
4d5438cd
NS
2065 init_frag.val = val;
2066 output_init_frag (x);
738f2522 2067 break;
738f2522
BS
2068 }
2069
738f2522
BS
2070 return true;
2071}
2072
2073/* Output SIZE zero bytes. We ignore the FILE argument since the
2074 functions we're calling to perform the output just use
2075 asm_out_file. */
2076
2077void
2078nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2079{
4d5438cd
NS
2080 /* Finish the current fragment, if it's started. */
2081 if (init_frag.offset)
738f2522 2082 {
4d5438cd 2083 unsigned part = init_frag.size - init_frag.offset;
3c55d60f 2084 part = MIN (part, (unsigned)size);
4d5438cd
NS
2085 size -= part;
2086 nvptx_assemble_value (0, part);
738f2522
BS
2087 }
2088
4d5438cd
NS
2089 /* If this skip doesn't terminate the initializer, write as many
2090 remaining pieces as possible directly. */
2091 if (size < init_frag.remaining * init_frag.size)
738f2522 2092 {
4d5438cd
NS
2093 while (size >= init_frag.size)
2094 {
2095 size -= init_frag.size;
2096 output_init_frag (NULL_RTX);
2097 }
2098 if (size)
2099 nvptx_assemble_value (0, size);
738f2522 2100 }
738f2522
BS
2101}
2102
2103/* Output a string STR with length SIZE. As in nvptx_output_skip we
2104 ignore the FILE arg. */
2105
2106void
2107nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2108{
2109 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2110 nvptx_assemble_value (str[i], 1);
2111}
2112
bf398920
TV
2113/* Return true if TYPE is a record type where the last field is an array without
2114 given dimension. */
2115
2116static bool
2117flexible_array_member_type_p (const_tree type)
2118{
2119 if (TREE_CODE (type) != RECORD_TYPE)
2120 return false;
2121
2122 const_tree last_field = NULL_TREE;
2123 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2124 last_field = f;
2125
2126 if (!last_field)
2127 return false;
2128
2129 const_tree last_field_type = TREE_TYPE (last_field);
2130 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2131 return false;
2132
2133 return (! TYPE_DOMAIN (last_field_type)
2134 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2135}
2136
4ff3145a
NS
2137/* Emit a PTX variable decl and prepare for emission of its
2138 initializer. NAME is the symbol name and SETION the PTX data
2139 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2140 The caller has already emitted any indentation and linkage
2141 specifier. It is responsible for any initializer, terminating ;
2142 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2143 this is the opposite way round that PTX wants them! */
2144
2145static void
2146nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
bf398920
TV
2147 const_tree type, HOST_WIDE_INT size, unsigned align,
2148 bool undefined = false)
4ff3145a 2149{
59d2d238
CP
2150 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2151 && (TYPE_DOMAIN (type) == NULL_TREE);
2152
bf398920
TV
2153 if (undefined && flexible_array_member_type_p (type))
2154 {
2155 size = 0;
2156 atype = true;
2157 }
2158
4ff3145a
NS
2159 while (TREE_CODE (type) == ARRAY_TYPE)
2160 type = TREE_TYPE (type);
2161
fc0efeea
NS
2162 if (TREE_CODE (type) == VECTOR_TYPE
2163 || TREE_CODE (type) == COMPLEX_TYPE)
2164 /* Neither vector nor complex types can contain the other. */
2165 type = TREE_TYPE (type);
2166
4ff3145a 2167 unsigned elt_size = int_size_in_bytes (type);
fc0efeea
NS
2168
2169 /* Largest mode we're prepared to accept. For BLKmode types we
2170 don't know if it'll contain pointer constants, so have to choose
2171 pointer size, otherwise we can choose DImode. */
2172 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2173
2174 elt_size |= GET_MODE_SIZE (elt_mode);
2175 elt_size &= -elt_size; /* Extract LSB set. */
4ff3145a 2176
4d5438cd 2177 init_frag.size = elt_size;
9c582551 2178 /* Avoid undefined shift behavior by using '2'. */
4d5438cd
NS
2179 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2180 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2181 init_frag.val = 0;
2182 init_frag.offset = 0;
2183 init_frag.started = false;
2184 /* Size might not be a multiple of elt size, if there's an
2185 initialized trailing struct array with smaller type than
2186 elt_size. */
2187 init_frag.remaining = (size + elt_size - 1) / elt_size;
4ff3145a
NS
2188
2189 fprintf (file, "%s .align %d .u%d ",
2190 section, align / BITS_PER_UNIT,
2191 elt_size * BITS_PER_UNIT);
2192 assemble_name (file, name);
2193
2194 if (size)
2195 /* We make everything an array, to simplify any initialization
2196 emission. */
4d5438cd 2197 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
59d2d238
CP
2198 else if (atype)
2199 fprintf (file, "[]");
4ff3145a
NS
2200}
2201
738f2522
BS
2202/* Called when the initializer for a decl has been completely output through
2203 combinations of the three functions above. */
2204
2205static void
2206nvptx_assemble_decl_end (void)
2207{
4d5438cd
NS
2208 if (init_frag.offset)
2209 /* This can happen with a packed struct with trailing array member. */
2210 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2211 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2212}
2213
69823d76
NS
2214/* Output an uninitialized common or file-scope variable. */
2215
2216void
2217nvptx_output_aligned_decl (FILE *file, const char *name,
2218 const_tree decl, HOST_WIDE_INT size, unsigned align)
2219{
2220 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2221
2222 /* If this is public, it is common. The nearest thing we have to
2223 common is weak. */
4ff3145a
NS
2224 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2225
2226 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2227 TREE_TYPE (decl), size, align);
4d5438cd 2228 nvptx_assemble_decl_end ();
69823d76
NS
2229}
2230
738f2522
BS
2231/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2232 writing a constant variable EXP with NAME and SIZE and its
2233 initializer to FILE. */
2234
2235static void
2236nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 2237 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 2238{
4ff3145a
NS
2239 write_var_marker (file, true, false, name);
2240
2241 fprintf (file, "\t");
2242
738f2522 2243 tree type = TREE_TYPE (exp);
4ff3145a
NS
2244 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2245 TYPE_ALIGN (type));
738f2522
BS
2246}
2247
2248/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2249 a variable DECL with NAME to FILE. */
2250
2251void
2252nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2253{
4ff3145a 2254 write_var_marker (file, true, TREE_PUBLIC (decl), name);
9a863523 2255
4ff3145a
NS
2256 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2257 : DECL_WEAK (decl) ? ".weak " : ".visible "));
9a863523 2258
4ff3145a
NS
2259 tree type = TREE_TYPE (decl);
2260 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2261 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2262 type, obj_size, DECL_ALIGN (decl));
738f2522
BS
2263}
2264
/* Implement TARGET_ASM_GLOBALIZE_LABEL.  Both arguments are ignored.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
  /* Deliberately empty: nothing is emitted.  */
}
2271
2272/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2273 declaration only for variable DECL with NAME to FILE. */
f313d112 2274
738f2522
BS
2275static void
2276nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2277{
0a0f74aa
NS
2278 /* The middle end can place constant pool decls into the varpool as
2279 undefined. Until that is fixed, catch the problem here. */
2280 if (DECL_IN_CONSTANT_POOL (decl))
2281 return;
2282
721547cd
NS
2283 /* We support weak defintions, and hence have the right
2284 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2285 if (DECL_WEAK (decl))
2286 error_at (DECL_SOURCE_LOCATION (decl),
2287 "PTX does not support weak declarations"
2288 " (only weak definitions)");
69823d76
NS
2289 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2290
4ff3145a
NS
2291 fprintf (file, "\t.extern ");
2292 tree size = DECL_SIZE_UNIT (decl);
2293 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2294 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
bf398920 2295 DECL_ALIGN (decl), true);
1e5154e7 2296 nvptx_assemble_decl_end ();
738f2522
BS
2297}
2298
f313d112
NS
2299/* Output a pattern for a move instruction. */
2300
2301const char *
2302nvptx_output_mov_insn (rtx dst, rtx src)
2303{
2304 machine_mode dst_mode = GET_MODE (dst);
2305 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2306 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2307 machine_mode src_inner = (GET_CODE (src) == SUBREG
2308 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2309
15113b03
NS
2310 rtx sym = src;
2311 if (GET_CODE (sym) == CONST)
2312 sym = XEXP (XEXP (sym, 0), 0);
bd602b7f
NS
2313 if (SYMBOL_REF_P (sym))
2314 {
2315 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2316 return "%.\tcvta%D1%t0\t%0, %1;";
2317 nvptx_maybe_record_fnsym (sym);
2318 }
15113b03 2319
f313d112
NS
2320 if (src_inner == dst_inner)
2321 return "%.\tmov%t0\t%0, %1;";
2322
2323 if (CONSTANT_P (src))
2324 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2325 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2326 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2327
2328 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
3717fbe3
TV
2329 {
2330 if (GET_MODE_BITSIZE (dst_mode) == 128
2331 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2332 {
2333 /* mov.b128 is not supported. */
2334 if (dst_inner == V2DImode && src_inner == TImode)
2335 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2336 else if (dst_inner == TImode && src_inner == V2DImode)
2337 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2338
2339 gcc_unreachable ();
2340 }
2341 return "%.\tmov.b%T0\t%0, %1;";
2342 }
f313d112
NS
2343
2344 return "%.\tcvt%t0%t1\t%0, %1;";
2345}
2346
5012919d
AM
2347static void nvptx_print_operand (FILE *, rtx, int);
2348
738f2522 2349/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2350 involves writing .param declarations and in/out copies into them. For
2351 indirect calls, also write the .callprototype. */
738f2522
BS
2352
2353const char *
2354nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2355{
863af9a4 2356 char buf[16];
738f2522
BS
2357 static int labelno;
2358 bool needs_tgt = register_operand (callee, Pmode);
2359 rtx pat = PATTERN (insn);
5012919d
AM
2360 if (GET_CODE (pat) == COND_EXEC)
2361 pat = COND_EXEC_CODE (pat);
f324806d 2362 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2363 tree decl = NULL_TREE;
2364
2365 fprintf (asm_out_file, "\t{\n");
2366 if (result != NULL)
1f065954
NS
2367 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2368 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2369 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2370
ecf6e535 2371 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2372 if (GET_CODE (callee) == SYMBOL_REF)
2373 {
2374 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2375 if (!decl
2376 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2377 nvptx_record_libfunc (callee, result, pat);
2378 else if (DECL_EXTERNAL (decl))
738f2522
BS
2379 nvptx_record_fndecl (decl);
2380 }
2381
2382 if (needs_tgt)
2383 {
2384 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2385 labelno++;
2386 ASM_OUTPUT_LABEL (asm_out_file, buf);
2387 std::stringstream s;
b699adcc 2388 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2389 fputs (s.str().c_str(), asm_out_file);
2390 }
2391
863af9a4 2392 for (int argno = 1; argno < arg_end; argno++)
738f2522 2393 {
863af9a4 2394 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2395 machine_mode mode = GET_MODE (t);
a02d84b6 2396 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2397
863af9a4 2398 /* Mode splitting has already been done. */
a02d84b6
NS
2399 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2400 "\t\tst.param%s [%%out_arg%d], ",
2401 ptx_type, argno, ptx_type, argno);
2402 output_reg (asm_out_file, REGNO (t), VOIDmode);
2403 fprintf (asm_out_file, ";\n");
738f2522
BS
2404 }
2405
5012919d
AM
2406 /* The '.' stands for the call's predicate, if any. */
2407 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2408 fprintf (asm_out_file, "\t\tcall ");
2409 if (result != NULL_RTX)
1f065954
NS
2410 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2411
738f2522
BS
2412 if (decl)
2413 {
2414 const char *name = get_fnname_from_decl (decl);
b4346107 2415 name = nvptx_name_replacement (name);
738f2522
BS
2416 assemble_name (asm_out_file, name);
2417 }
2418 else
cc8ca59e 2419 output_address (VOIDmode, callee);
738f2522 2420
863af9a4
NS
2421 const char *open = "(";
2422 for (int argno = 1; argno < arg_end; argno++)
738f2522 2423 {
863af9a4
NS
2424 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2425 open = "";
738f2522 2426 }
863af9a4
NS
2427 if (decl && DECL_STATIC_CHAIN (decl))
2428 {
5563d5c0 2429 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2430 open = "";
2431 }
2432 if (!open[0])
2433 fprintf (asm_out_file, ")");
f324806d 2434
738f2522
BS
2435 if (needs_tgt)
2436 {
2437 fprintf (asm_out_file, ", ");
2438 assemble_name (asm_out_file, buf);
2439 }
2440 fprintf (asm_out_file, ";\n");
738f2522 2441
51baf85a 2442 if (find_reg_note (insn, REG_NORETURN, NULL))
fb202364
TV
2443 {
2444 /* No return functions confuse the PTX JIT, as it doesn't realize
2445 the flow control barrier they imply. It can seg fault if it
2446 encounters what looks like an unexitable loop. Emit a trailing
2447 trap and exit, which it does grok. */
2448 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2449 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2450 }
51baf85a 2451
1f065954
NS
2452 if (result)
2453 {
2454 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2455
2456 if (!rval[0])
2457 /* We must escape the '%' that starts RETURN_REGNUM. */
2458 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2459 reg_names[NVPTX_RETURN_REGNUM]);
2460 return rval;
2461 }
2462
2463 return "}";
738f2522
BS
2464}
2465
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' and '#' are
   accepted as punctuation codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;

    default:
      return false;
    }
}
2473
738f2522
BS
2474/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2475
2476static void
2477nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2478{
2479 rtx off;
2480 if (GET_CODE (x) == CONST)
2481 x = XEXP (x, 0);
2482 switch (GET_CODE (x))
2483 {
2484 case PLUS:
2485 off = XEXP (x, 1);
cc8ca59e 2486 output_address (VOIDmode, XEXP (x, 0));
738f2522 2487 fprintf (file, "+");
cc8ca59e 2488 output_address (VOIDmode, off);
738f2522
BS
2489 break;
2490
2491 case SYMBOL_REF:
2492 case LABEL_REF:
2493 output_addr_const (file, x);
2494 break;
2495
2496 default:
2497 gcc_assert (GET_CODE (x) != MEM);
2498 nvptx_print_operand (file, x, 0);
2499 break;
2500 }
2501}
2502
2503/* Write assembly language output for the address ADDR to FILE. */
2504
2505static void
cc8ca59e 2506nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 2507{
cc8ca59e 2508 nvptx_print_address_operand (file, addr, mode);
738f2522
BS
2509}
2510
2511/* Print an operand, X, to FILE, with an optional modifier in CODE.
2512
2513 Meaning of CODE:
2514 . -- print the predicate for the instruction or an emptry string for an
2515 unconditional one.
2516 # -- print a rounding mode for the instruction
2517
9a863523 2518 A -- print a data area for a MEM
738f2522 2519 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2520 D -- print a data area for a MEM operand
d88cd9c4 2521 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
2522 t -- print a type opcode suffix, promoting QImode to 32 bits
2523 T -- print a type size in bits
2524 u -- print a type opcode suffix without promotions. */
2525
2526static void
2527nvptx_print_operand (FILE *file, rtx x, int code)
2528{
738f2522
BS
2529 if (code == '.')
2530 {
2531 x = current_insn_predicate;
2532 if (x)
2533 {
5012919d 2534 fputs ("@", file);
738f2522
BS
2535 if (GET_CODE (x) == EQ)
2536 fputs ("!", file);
5012919d 2537 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2538 }
2539 return;
2540 }
2541 else if (code == '#')
2542 {
2543 fputs (".rn", file);
2544 return;
2545 }
2546
2547 enum rtx_code x_code = GET_CODE (x);
f313d112 2548 machine_mode mode = GET_MODE (x);
738f2522
BS
2549
2550 switch (code)
2551 {
2552 case 'A':
9a863523
NS
2553 x = XEXP (x, 0);
2554 /* FALLTHROUGH. */
7b8edc29 2555
9a863523
NS
2556 case 'D':
2557 if (GET_CODE (x) == CONST)
2558 x = XEXP (x, 0);
2559 if (GET_CODE (x) == PLUS)
2560 x = XEXP (x, 0);
7b8edc29 2561
9a863523
NS
2562 if (GET_CODE (x) == SYMBOL_REF)
2563 fputs (section_for_sym (x), file);
738f2522
BS
2564 break;
2565
738f2522 2566 case 't':
738f2522 2567 case 'u':
f313d112
NS
2568 if (x_code == SUBREG)
2569 {
1310ff03
TV
2570 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2571 if (VECTOR_MODE_P (inner_mode)
2572 && (GET_MODE_SIZE (mode)
2573 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2574 mode = GET_MODE_INNER (inner_mode);
2575 else if (split_mode_p (inner_mode))
2576 mode = maybe_split_mode (inner_mode);
2577 else
2578 mode = inner_mode;
f313d112
NS
2579 }
2580 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2581 break;
2582
3717fbe3
TV
2583 case 'H':
2584 case 'L':
2585 {
2586 rtx inner_x = SUBREG_REG (x);
2587 machine_mode inner_mode = GET_MODE (inner_x);
2588 machine_mode split = maybe_split_mode (inner_mode);
2589
2590 output_reg (file, REGNO (inner_x), split,
2591 (code == 'H'
2592 ? GET_MODE_SIZE (inner_mode) / 2
2593 : 0));
2594 }
2595 break;
2596
d88cd9c4
NS
2597 case 'S':
2598 {
59263259
NS
2599 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2600 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2601 static const char *const kinds[] =
59263259
NS
2602 {".up", ".down", ".bfly", ".idx"};
2603 fputs (kinds[kind], file);
d88cd9c4
NS
2604 }
2605 break;
2606
738f2522 2607 case 'T':
f313d112 2608 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2609 break;
2610
2611 case 'j':
2612 fprintf (file, "@");
2613 goto common;
2614
2615 case 'J':
2616 fprintf (file, "@!");
2617 goto common;
2618
2619 case 'c':
f313d112 2620 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2621 switch (x_code)
2622 {
2623 case EQ:
2624 fputs (".eq", file);
2625 break;
2626 case NE:
f313d112 2627 if (FLOAT_MODE_P (mode))
738f2522
BS
2628 fputs (".neu", file);
2629 else
2630 fputs (".ne", file);
2631 break;
2632 case LE:
578fb225 2633 case LEU:
738f2522
BS
2634 fputs (".le", file);
2635 break;
2636 case GE:
578fb225 2637 case GEU:
738f2522
BS
2638 fputs (".ge", file);
2639 break;
2640 case LT:
578fb225 2641 case LTU:
738f2522
BS
2642 fputs (".lt", file);
2643 break;
2644 case GT:
738f2522 2645 case GTU:
578fb225 2646 fputs (".gt", file);
738f2522
BS
2647 break;
2648 case LTGT:
2649 fputs (".ne", file);
2650 break;
2651 case UNEQ:
2652 fputs (".equ", file);
2653 break;
2654 case UNLE:
2655 fputs (".leu", file);
2656 break;
2657 case UNGE:
2658 fputs (".geu", file);
2659 break;
2660 case UNLT:
2661 fputs (".ltu", file);
2662 break;
2663 case UNGT:
2664 fputs (".gtu", file);
2665 break;
2666 case UNORDERED:
2667 fputs (".nan", file);
2668 break;
2669 case ORDERED:
2670 fputs (".num", file);
2671 break;
2672 default:
2673 gcc_unreachable ();
2674 }
f313d112 2675 if (FLOAT_MODE_P (mode)
738f2522
BS
2676 || x_code == EQ || x_code == NE
2677 || x_code == GEU || x_code == GTU
2678 || x_code == LEU || x_code == LTU)
f313d112 2679 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2680 else
f313d112 2681 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2682 break;
2683 default:
2684 common:
2685 switch (x_code)
2686 {
2687 case SUBREG:
f313d112
NS
2688 {
2689 rtx inner_x = SUBREG_REG (x);
2690 machine_mode inner_mode = GET_MODE (inner_x);
2691 machine_mode split = maybe_split_mode (inner_mode);
2692
1310ff03
TV
2693 if (VECTOR_MODE_P (inner_mode)
2694 && (GET_MODE_SIZE (mode)
2695 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2696 {
2697 output_reg (file, REGNO (inner_x), VOIDmode);
2698 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2699 }
2700 else if (split_mode_p (inner_mode)
f313d112
NS
2701 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2702 output_reg (file, REGNO (inner_x), split);
2703 else
2704 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2705 }
2706 break;
738f2522
BS
2707
2708 case REG:
f313d112 2709 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2710 break;
2711
2712 case MEM:
2713 fputc ('[', file);
f313d112 2714 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2715 fputc (']', file);
2716 break;
2717
2718 case CONST_INT:
2719 output_addr_const (file, x);
2720 break;
2721
2722 case CONST:
2723 case SYMBOL_REF:
2724 case LABEL_REF:
2725 /* We could use output_addr_const, but that can print things like
2726 "x-8", which breaks ptxas. Need to ensure it is output as
2727 "x+-8". */
2728 nvptx_print_address_operand (file, x, VOIDmode);
2729 break;
2730
2731 case CONST_DOUBLE:
2732 long vals[2];
f313d112 2733 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2734 vals[0] &= 0xffffffff;
2735 vals[1] &= 0xffffffff;
f313d112 2736 if (mode == SFmode)
738f2522
BS
2737 fprintf (file, "0f%08lx", vals[0]);
2738 else
2739 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2740 break;
2741
1310ff03
TV
2742 case CONST_VECTOR:
2743 {
2744 unsigned n = CONST_VECTOR_NUNITS (x);
2745 fprintf (file, "{ ");
2746 for (unsigned i = 0; i < n; ++i)
2747 {
2748 if (i != 0)
2749 fprintf (file, ", ");
2750
2751 rtx elem = CONST_VECTOR_ELT (x, i);
2752 output_addr_const (file, elem);
2753 }
2754 fprintf (file, " }");
2755 }
2756 break;
2757
738f2522
BS
2758 default:
2759 output_addr_const (file, x);
2760 }
2761 }
2762}
2763\f
2764/* Record replacement regs used to deal with subreg operands. */
2765struct reg_replace
2766{
2767 rtx replacement[MAX_RECOG_OPERANDS];
2768 machine_mode mode;
2769 int n_allocated;
2770 int n_in_use;
2771};
2772
2773/* Allocate or reuse a replacement in R and return the rtx. */
2774
2775static rtx
2776get_replacement (struct reg_replace *r)
2777{
2778 if (r->n_allocated == r->n_in_use)
2779 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2780 return r->replacement[r->n_in_use++];
2781}
2782
2783/* Clean up subreg operands. In ptx assembly, everything is typed, and
2784 the presence of subregs would break the rules for most instructions.
2785 Replace them with a suitable new register of the right size, plus
2786 conversion copyin/copyout instructions. */
2787
2788static void
517665b3 2789nvptx_reorg_subreg (void)
738f2522
BS
2790{
2791 struct reg_replace qiregs, hiregs, siregs, diregs;
2792 rtx_insn *insn, *next;
2793
738f2522
BS
2794 qiregs.n_allocated = 0;
2795 hiregs.n_allocated = 0;
2796 siregs.n_allocated = 0;
2797 diregs.n_allocated = 0;
2798 qiregs.mode = QImode;
2799 hiregs.mode = HImode;
2800 siregs.mode = SImode;
2801 diregs.mode = DImode;
2802
2803 for (insn = get_insns (); insn; insn = next)
2804 {
2805 next = NEXT_INSN (insn);
2806 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2807 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2808 || GET_CODE (PATTERN (insn)) == USE
2809 || GET_CODE (PATTERN (insn)) == CLOBBER)
2810 continue;
f324806d 2811
738f2522
BS
2812 qiregs.n_in_use = 0;
2813 hiregs.n_in_use = 0;
2814 siregs.n_in_use = 0;
2815 diregs.n_in_use = 0;
2816 extract_insn (insn);
2817 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2818
738f2522
BS
2819 for (int i = 0; i < recog_data.n_operands; i++)
2820 {
2821 rtx op = recog_data.operand[i];
2822 if (GET_CODE (op) != SUBREG)
2823 continue;
2824
2825 rtx inner = SUBREG_REG (op);
2826
2827 machine_mode outer_mode = GET_MODE (op);
2828 machine_mode inner_mode = GET_MODE (inner);
2829 gcc_assert (s_ok);
2830 if (s_ok
2831 && (GET_MODE_PRECISION (inner_mode)
2832 >= GET_MODE_PRECISION (outer_mode)))
2833 continue;
2834 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2835 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2836 : outer_mode == HImode ? &hiregs
2837 : outer_mode == SImode ? &siregs
2838 : &diregs);
2839 rtx new_reg = get_replacement (r);
2840
2841 if (recog_data.operand_type[i] != OP_OUT)
2842 {
2843 enum rtx_code code;
2844 if (GET_MODE_PRECISION (inner_mode)
2845 < GET_MODE_PRECISION (outer_mode))
2846 code = ZERO_EXTEND;
2847 else
2848 code = TRUNCATE;
2849
f7df4a84 2850 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2851 gen_rtx_fmt_e (code, outer_mode, inner));
2852 emit_insn_before (pat, insn);
2853 }
2854
2855 if (recog_data.operand_type[i] != OP_IN)
2856 {
2857 enum rtx_code code;
2858 if (GET_MODE_PRECISION (inner_mode)
2859 < GET_MODE_PRECISION (outer_mode))
2860 code = TRUNCATE;
2861 else
2862 code = ZERO_EXTEND;
2863
f7df4a84 2864 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2865 gen_rtx_fmt_e (code, inner_mode, new_reg));
2866 emit_insn_after (pat, insn);
2867 }
2868 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2869 }
2870 }
517665b3 2871}
738f2522 2872
5012919d
AM
2873/* Return a SImode "master lane index" register for uniform-simt, allocating on
2874 first use. */
2875
2876static rtx
2877nvptx_get_unisimt_master ()
2878{
2879 rtx &master = cfun->machine->unisimt_master;
2880 return master ? master : master = gen_reg_rtx (SImode);
2881}
2882
2883/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2884
2885static rtx
2886nvptx_get_unisimt_predicate ()
2887{
2888 rtx &pred = cfun->machine->unisimt_predicate;
2889 return pred ? pred : pred = gen_reg_rtx (BImode);
2890}
2891
2892/* Return true if given call insn references one of the functions provided by
2893 the CUDA runtime: malloc, free, vprintf. */
2894
2895static bool
2896nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2897{
2898 rtx pat = PATTERN (insn);
2899 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2900 pat = XVECEXP (pat, 0, 0);
2901 if (GET_CODE (pat) == SET)
2902 pat = SET_SRC (pat);
2903 gcc_checking_assert (GET_CODE (pat) == CALL
2904 && GET_CODE (XEXP (pat, 0)) == MEM);
2905 rtx addr = XEXP (XEXP (pat, 0), 0);
2906 if (GET_CODE (addr) != SYMBOL_REF)
2907 return false;
2908 const char *name = XSTR (addr, 0);
2909 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2910 references with forced assembler name refer to PTX syscalls. For vprintf,
2911 accept both normal and forced-assembler-name references. */
2912 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2913 || !strcmp (name, "*malloc")
2914 || !strcmp (name, "*free"));
2915}
2916
2917/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2918 propagate its value from lane MASTER to current lane. */
2919
2920static void
2921nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2922{
2923 rtx reg;
2924 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2925 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2926}
2927
2928/* Adjust code for uniform-simt code generation variant by making atomics and
2929 "syscalls" conditionally executed, and inserting shuffle-based propagation
2930 for registers being set. */
2931
2932static void
2933nvptx_reorg_uniform_simt ()
2934{
2935 rtx_insn *insn, *next;
2936
2937 for (insn = get_insns (); insn; insn = next)
2938 {
2939 next = NEXT_INSN (insn);
2940 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2941 && !(NONJUMP_INSN_P (insn)
2942 && GET_CODE (PATTERN (insn)) == PARALLEL
2943 && get_attr_atomic (insn)))
2944 continue;
2945 rtx pat = PATTERN (insn);
2946 rtx master = nvptx_get_unisimt_master ();
2947 for (int i = 0; i < XVECLEN (pat, 0); i++)
2948 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2949 rtx pred = nvptx_get_unisimt_predicate ();
2950 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2951 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2952 validate_change (insn, &PATTERN (insn), pat, false);
2953 }
2954}
2955
2498a6ab
TV
/* Attributes describing an offloaded function.  */

struct offload_attrs
{
  /* Partitioning mask (presumably of GOMP_DIM_MASK bits -- confirm
     against populate_offload_attrs).  */
  unsigned mask;

  /* Requested sizes of the gang, worker and vector axes.  A num_workers
     of zero is treated by init_axis_dim as "not specified".  */
  int num_gangs;
  int num_workers;
  int vector_length;
};
2965
0d06e8c7
TV
2966/* Define entries for cfun->machine->axis_dim. */
2967
2968#define MACH_VECTOR_LENGTH 0
2969#define MACH_MAX_WORKERS 1
2970
2971static void populate_offload_attrs (offload_attrs *oa);
2972
2973static void
2974init_axis_dim (void)
2975{
2976 offload_attrs oa;
2977 int max_workers;
2978
2979 populate_offload_attrs (&oa);
2980
2981 if (oa.num_workers == 0)
2982 max_workers = PTX_CTA_SIZE / oa.vector_length;
2983 else
2984 max_workers = oa.num_workers;
2985
2986 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2987 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2988 cfun->machine->axis_dim_init_p = true;
2989}
2990
2991static int ATTRIBUTE_UNUSED
2992nvptx_mach_max_workers ()
2993{
2994 if (!cfun->machine->axis_dim_init_p)
2995 init_axis_dim ();
2996 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2997}
2998
2999static int ATTRIBUTE_UNUSED
3000nvptx_mach_vector_length ()
3001{
3002 if (!cfun->machine->axis_dim_init_p)
3003 init_axis_dim ();
3004 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3005}
3006
d2d47a28
NS
3007/* Loop structure of the function. The entire function is described as
3008 a NULL loop. */
d88cd9c4
NS
3009
3010struct parallel
3011{
3012 /* Parent parallel. */
3013 parallel *parent;
3014
3015 /* Next sibling parallel. */
3016 parallel *next;
3017
3018 /* First child parallel. */
3019 parallel *inner;
3020
3021 /* Partitioning mask of the parallel. */
3022 unsigned mask;
3023
3024 /* Partitioning used within inner parallels. */
3025 unsigned inner_mask;
3026
3027 /* Location of parallel forked and join. The forked is the first
3028 block in the parallel and the join is the first block after of
3029 the partition. */
3030 basic_block forked_block;
3031 basic_block join_block;
3032
3033 rtx_insn *forked_insn;
3034 rtx_insn *join_insn;
3035
3036 rtx_insn *fork_insn;
3037 rtx_insn *joining_insn;
3038
3039 /* Basic blocks in this parallel, but not in child parallels. The
3040 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3041 blocks are not. */
3042 auto_vec<basic_block> blocks;
3043
3044public:
3045 parallel (parallel *parent, unsigned mode);
3046 ~parallel ();
3047};
3048
3049/* Constructor links the new parallel into it's parent's chain of
3050 children. */
3051
3052parallel::parallel (parallel *parent_, unsigned mask_)
3053 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3054{
3055 forked_block = join_block = 0;
3056 forked_insn = join_insn = 0;
3057 fork_insn = joining_insn = 0;
3058
3059 if (parent)
3060 {
3061 next = parent->inner;
3062 parent->inner = this;
3063 }
3064}
3065
3066parallel::~parallel ()
3067{
3068 delete inner;
3069 delete next;
3070}
3071
3072/* Map of basic blocks to insns */
3073typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3074
3075/* A tuple of an insn of interest and the BB in which it resides. */
3076typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3077typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3078
3079/* Split basic blocks such that each forked and join unspecs are at
3080 the start of their basic blocks. Thus afterwards each block will
3081 have a single partitioning mode. We also do the same for return
3082 insns, as they are executed by every thread. Return the
3083 partitioning mode of the function as a whole. Populate MAP with
3084 head and tail blocks. We also clear the BB visited flag, which is
3085 used when finding partitions. */
3086
3087static void
3088nvptx_split_blocks (bb_insn_map_t *map)
3089{
3090 insn_bb_vec_t worklist;
3091 basic_block block;
3092 rtx_insn *insn;
3093
3094 /* Locate all the reorg instructions of interest. */
3095 FOR_ALL_BB_FN (block, cfun)
3096 {
3097 bool seen_insn = false;
3098
3099 /* Clear visited flag, for use by parallel locator */
3100 block->flags &= ~BB_VISITED;
3101
3102 FOR_BB_INSNS (block, insn)
3103 {
3104 if (!INSN_P (insn))
3105 continue;
3106 switch (recog_memoized (insn))
3107 {
3108 default:
3109 seen_insn = true;
3110 continue;
3111 case CODE_FOR_nvptx_forked:
3112 case CODE_FOR_nvptx_join:
3113 break;
3114
3115 case CODE_FOR_return:
3116 /* We also need to split just before return insns, as
3117 that insn needs executing by all threads, but the
3118 block it is in probably does not. */
3119 break;
3120 }
3121
3122 if (seen_insn)
3123 /* We've found an instruction that must be at the start of
3124 a block, but isn't. Add it to the worklist. */
3125 worklist.safe_push (insn_bb_t (insn, block));
3126 else
3127 /* It was already the first instruction. Just add it to
3128 the map. */
3129 map->get_or_insert (block) = insn;
3130 seen_insn = true;
3131 }
3132 }
3133
3134 /* Split blocks on the worklist. */
3135 unsigned ix;
3136 insn_bb_t *elt;
3137 basic_block remap = 0;
3138 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3139 {
3140 if (remap != elt->second)
3141 {
3142 block = elt->second;
3143 remap = block;
3144 }
3145
3146 /* Split block before insn. The insn is in the new block */
3147 edge e = split_block (block, PREV_INSN (elt->first));
3148
3149 block = e->dest;
3150 map->get_or_insert (block) = elt->first;
3151 }
3152}
3153
0024c320
TV
3154/* Return true if MASK contains parallelism that requires shared
3155 memory to broadcast. */
3156
3157static bool
3158nvptx_needs_shared_bcast (unsigned mask)
3159{
3160 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3161 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3162 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3163
3164 return worker || large_vector;
3165}
3166
d88cd9c4
NS
3167/* BLOCK is a basic block containing a head or tail instruction.
3168 Locate the associated prehead or pretail instruction, which must be
3169 in the single predecessor block. */
3170
3171static rtx_insn *
3172nvptx_discover_pre (basic_block block, int expected)
3173{
3174 gcc_assert (block->preds->length () == 1);
3175 basic_block pre_block = (*block->preds)[0]->src;
3176 rtx_insn *pre_insn;
3177
3178 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3179 pre_insn = PREV_INSN (pre_insn))
3180 gcc_assert (pre_insn != BB_HEAD (pre_block));
3181
3182 gcc_assert (recog_memoized (pre_insn) == expected);
3183 return pre_insn;
3184}
3185
3186/* Dump this parallel and all its inner parallels. */
3187
3188static void
3189nvptx_dump_pars (parallel *par, unsigned depth)
3190{
3191 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3192 depth, par->mask,
3193 par->forked_block ? par->forked_block->index : -1,
3194 par->join_block ? par->join_block->index : -1);
3195
3196 fprintf (dump_file, " blocks:");
3197
3198 basic_block block;
3199 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3200 fprintf (dump_file, " %d", block->index);
3201 fprintf (dump_file, "\n");
3202 if (par->inner)
3203 nvptx_dump_pars (par->inner, depth + 1);
3204
3205 if (par->next)
3206 nvptx_dump_pars (par->next, depth);
3207}
3208
3209/* If BLOCK contains a fork/join marker, process it to create or
3210 terminate a loop structure. Add this block to the current loop,
3211 and then walk successor blocks. */
3212
3213static parallel *
3214nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3215{
3216 if (block->flags & BB_VISITED)
3217 return par;
3218 block->flags |= BB_VISITED;
3219
3220 if (rtx_insn **endp = map->get (block))
3221 {
3222 rtx_insn *end = *endp;
3223
3224 /* This is a block head or tail, or return instruction. */
3225 switch (recog_memoized (end))
3226 {
3227 case CODE_FOR_return:
3228 /* Return instructions are in their own block, and we
3229 don't need to do anything more. */
3230 return par;
3231
3232 case CODE_FOR_nvptx_forked:
3233 /* Loop head, create a new inner loop and add it into
3234 our parent's child list. */
3235 {
3236 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3237
3238 gcc_assert (mask);
3239 par = new parallel (par, mask);
3240 par->forked_block = block;
3241 par->forked_insn = end;
0024c320 3242 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
3243 par->fork_insn
3244 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3245 }
3246 break;
3247
3248 case CODE_FOR_nvptx_join:
3249 /* A loop tail. Finish the current loop and return to
3250 parent. */
3251 {
3252 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3253
3254 gcc_assert (par->mask == mask);
4b557bcc 3255 gcc_assert (par->join_block == NULL);
d88cd9c4
NS
3256 par->join_block = block;
3257 par->join_insn = end;
0024c320 3258 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
3259 par->joining_insn
3260 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3261 par = par->parent;
3262 }
3263 break;
3264
3265 default:
3266 gcc_unreachable ();
3267 }
3268 }
3269
3270 if (par)
3271 /* Add this block onto the current loop's list of blocks. */
3272 par->blocks.safe_push (block);
3273 else
3274 /* This must be the entry block. Create a NULL parallel. */
3275 par = new parallel (0, 0);
3276
3277 /* Walk successor blocks. */
3278 edge e;
3279 edge_iterator ei;
3280
3281 FOR_EACH_EDGE (e, ei, block->succs)
3282 nvptx_find_par (map, par, e->dest);
3283
3284 return par;
3285}
3286
3287/* DFS walk the CFG looking for fork & join markers. Construct
3288 loop structures as we go. MAP is a mapping of basic blocks
3289 to head & tail markers, discovered when splitting blocks. This
3290 speeds up the discovery. We rely on the BB visited flag having
3291 been cleared when splitting blocks. */
3292
3293static parallel *
3294nvptx_discover_pars (bb_insn_map_t *map)
3295{
3296 basic_block block;
3297
3298 /* Mark exit blocks as visited. */
3299 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3300 block->flags |= BB_VISITED;
3301
3302 /* And entry block as not. */
3303 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3304 block->flags &= ~BB_VISITED;
3305
3306 parallel *par = nvptx_find_par (map, 0, block);
3307
3308 if (dump_file)
3309 {
3310 fprintf (dump_file, "\nLoops\n");
3311 nvptx_dump_pars (par, 0);
3312 fprintf (dump_file, "\n");
3313 }
3314
3315 return par;
3316}
3317
912442c2
NS
3318/* Analyse a group of BBs within a partitioned region and create N
3319 Single-Entry-Single-Exit regions. Some of those regions will be
3320 trivial ones consisting of a single BB. The blocks of a
3321 partitioned region might form a set of disjoint graphs -- because
3322 the region encloses a differently partitioned sub region.
3323
3324 We use the linear time algorithm described in 'Finding Regions Fast:
3325 Single Entry Single Exit and control Regions in Linear Time'
3326 Johnson, Pearson & Pingali. That algorithm deals with complete
3327 CFGs, where a back edge is inserted from END to START, and thus the
3328 problem becomes one of finding equivalent loops.
3329
3330 In this case we have a partial CFG. We complete it by redirecting
3331 any incoming edge to the graph to be from an arbitrary external BB,
3332 and similarly redirecting any outgoing edge to be to that BB.
3333 Thus we end up with a closed graph.
3334
3335 The algorithm works by building a spanning tree of an undirected
3336 graph and keeping track of back edges from nodes further from the
3337 root in the tree to nodes nearer to the root in the tree. In the
3338 description below, the root is up and the tree grows downwards.
3339
3340 We avoid having to deal with degenerate back-edges to the same
3341 block, by splitting each BB into 3 -- one for input edges, one for
3342 the node itself and one for the output edges. Such back edges are
3343 referred to as 'Brackets'. Cycle equivalent nodes will have the
3344 same set of brackets.
3345
3346 Determining bracket equivalency is done by maintaining a list of
3347 brackets in such a manner that the list length and final bracket
3348 uniquely identify the set.
3349
3350 We use coloring to mark all BBs with cycle equivalency with the
3351 same color. This is the output of the 'Finding Regions Fast'
3352 algorithm. Notice it doesn't actually find the set of nodes within
3353 a particular region, just unordered sets of nodes that are the
3354 entries and exits of SESE regions.
3355
3356 After determining cycle equivalency, we need to find the minimal
3357 set of SESE regions. Do this with a DFS coloring walk of the
3358 complete graph. We're either 'looking' or 'coloring'. When
3359 looking, and we're in the subgraph, we start coloring the color of
3360 the current node, and remember that node as the start of the
3361 current color's SESE region. Every time we go to a new node, we
3362 decrement the count of nodes with that color. If it reaches zero,
3363 we remember that node as the end of the current color's SESE region
3364 and return to 'looking'. Otherwise we color the node the current
3365 color.
3366
3367 This way we end up with coloring the inside of non-trivial SESE
3368 regions with the color of that region. */
3369
3370/* A pair of BBs. We use this to represent SESE regions. */
3371typedef std::pair<basic_block, basic_block> bb_pair_t;
3372typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3373
3374/* A node in the undirected CFG. The discriminator SECOND indicates just
3375 above or just below the BB idicated by FIRST. */
3376typedef std::pair<basic_block, int> pseudo_node_t;
3377
3378/* A bracket indicates an edge towards the root of the spanning tree of the
3379 undirected graph. Each bracket has a color, determined
3380 from the currrent set of brackets. */
3381struct bracket
3382{
3383 pseudo_node_t back; /* Back target */
3384
3385 /* Current color and size of set. */
3386 unsigned color;
3387 unsigned size;
3388
3389 bracket (pseudo_node_t back_)
3390 : back (back_), color (~0u), size (~0u)
3391 {
3392 }
3393
3394 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3395 {
3396 if (length != size)
3397 {
3398 size = length;
3399 color = color_counts.length ();
3400 color_counts.quick_push (0);
3401 }
3402 color_counts[color]++;
3403 return color;
3404 }
3405};
3406
3407typedef auto_vec<bracket> bracket_vec_t;
3408
3409/* Basic block info for finding SESE regions. */
3410
3411struct bb_sese
3412{
3413 int node; /* Node number in spanning tree. */
3414 int parent; /* Parent node number. */
3415
3416 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3417 edges arrive at pseudo-node Ai and the outgoing edges leave at
3418 pseudo-node Ao. We have to remember which way we arrived at a
3419 particular node when generating the spanning tree. dir > 0 means
3420 we arrived at Ai, dir < 0 means we arrived at Ao. */
3421 int dir;
3422
3423 /* Lowest numbered pseudo-node reached via a backedge from thsis
3424 node, or any descendant. */
3425 pseudo_node_t high;
3426
3427 int color; /* Cycle-equivalence color */
3428
3429 /* Stack of brackets for this node. */
3430 bracket_vec_t brackets;
3431
3432 bb_sese (unsigned node_, unsigned p, int dir_)
3433 :node (node_), parent (p), dir (dir_)
3434 {
3435 }
3436 ~bb_sese ();
3437
3438 /* Push a bracket ending at BACK. */
3439 void push (const pseudo_node_t &back)
3440 {
3441 if (dump_file)
3442 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3443 back.first ? back.first->index : 0, back.second);
3444 brackets.safe_push (bracket (back));
3445 }
3446
3447 void append (bb_sese *child);
3448 void remove (const pseudo_node_t &);
3449
3450 /* Set node's color. */
3451 void set_color (auto_vec<unsigned> &color_counts)
3452 {
3453 color = brackets.last ().get_color (color_counts, brackets.length ());
3454 }
3455};
3456
3457bb_sese::~bb_sese ()
3458{
3459}
3460
3461/* Destructively append CHILD's brackets. */
3462
3463void
3464bb_sese::append (bb_sese *child)
3465{
3466 if (int len = child->brackets.length ())
3467 {
3468 int ix;
3469
3470 if (dump_file)
3471 {
3472 for (ix = 0; ix < len; ix++)
3473 {
3474 const pseudo_node_t &pseudo = child->brackets[ix].back;
3475 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3476 child->node, pseudo.first ? pseudo.first->index : 0,
3477 pseudo.second);
3478 }
3479 }
3480 if (!brackets.length ())
3481 std::swap (brackets, child->brackets);
3482 else
3483 {
3484 brackets.reserve (len);
3485 for (ix = 0; ix < len; ix++)
3486 brackets.quick_push (child->brackets[ix]);
3487 }
3488 }
3489}
3490
3491/* Remove brackets that terminate at PSEUDO. */
3492
3493void
3494bb_sese::remove (const pseudo_node_t &pseudo)
3495{
3496 unsigned removed = 0;
3497 int len = brackets.length ();
3498
3499 for (int ix = 0; ix < len; ix++)
3500 {
3501 if (brackets[ix].back == pseudo)
3502 {
3503 if (dump_file)
3504 fprintf (dump_file, "Removing backedge %d:%+d\n",
3505 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3506 removed++;
3507 }
3508 else if (removed)
3509 brackets[ix-removed] = brackets[ix];
3510 }
3511 while (removed--)
3512 brackets.pop ();
3513}
3514
/* The SESE info of basic block B is kept in B's aux pointer; these
   macros store and fetch it.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3518
3519/* DFS walk creating SESE data structures. Only cover nodes with
3520 BB_VISITED set. Append discovered blocks to LIST. We number in
3521 increments of 3 so that the above and below pseudo nodes can be
3522 implicitly numbered too. */
3523
3524static int
3525nvptx_sese_number (int n, int p, int dir, basic_block b,
3526 auto_vec<basic_block> *list)
3527{
3528 if (BB_GET_SESE (b))
3529 return n;
3530
3531 if (dump_file)
3532 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3533 b->index, n, p, dir);
3534
3535 BB_SET_SESE (b, new bb_sese (n, p, dir));
3536 p = n;
3537
3538 n += 3;
3539 list->quick_push (b);
3540
3541 /* First walk the nodes on the 'other side' of this node, then walk
3542 the nodes on the same side. */
3543 for (unsigned ix = 2; ix; ix--)
3544 {
3545 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3546 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3547 : offsetof (edge_def, src));
3548 edge e;
478cb3ce 3549 edge_iterator ei;
912442c2
NS
3550
3551 FOR_EACH_EDGE (e, ei, edges)
3552 {
3553 basic_block target = *(basic_block *)((char *)e + offset);
3554
3555 if (target->flags & BB_VISITED)
3556 n = nvptx_sese_number (n, p, dir, target, list);
3557 }
3558 dir = -dir;
3559 }
3560 return n;
3561}
3562
3563/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3564 EDGES are the outgoing edges and OFFSET is the offset to the src
3565 or dst block on the edges. */
3566
3567static void
3568nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3569 vec<edge, va_gc> *edges, size_t offset)
3570{
3571 edge e;
478cb3ce 3572 edge_iterator ei;
912442c2
NS
3573 int hi_back = depth;
3574 pseudo_node_t node_back (0, depth);
3575 int hi_child = depth;
3576 pseudo_node_t node_child (0, depth);
3577 basic_block child = NULL;
3578 unsigned num_children = 0;
3579 int usd = -dir * sese->dir;
3580
3581 if (dump_file)
3582 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3583 me->index, sese->node, dir);
3584
3585 if (dir < 0)
3586 {
3587 /* This is the above pseudo-child. It has the BB itself as an
3588 additional child node. */
3589 node_child = sese->high;
3590 hi_child = node_child.second;
3591 if (node_child.first)
3592 hi_child += BB_GET_SESE (node_child.first)->node;
3593 num_children++;
3594 }
3595
3596 /* Examine each edge.
3597 - if it is a child (a) append its bracket list and (b) record
3598 whether it is the child with the highest reaching bracket.
3599 - if it is an edge to ancestor, record whether it's the highest
3600 reaching backlink. */
3601 FOR_EACH_EDGE (e, ei, edges)
3602 {
3603 basic_block target = *(basic_block *)((char *)e + offset);
3604
3605 if (bb_sese *t_sese = BB_GET_SESE (target))
3606 {
3607 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3608 {
3609 /* Child node. Append its bracket list. */
3610 num_children++;
3611 sese->append (t_sese);
3612
3613 /* Compare it's hi value. */
3614 int t_hi = t_sese->high.second;
3615
3616 if (basic_block child_hi_block = t_sese->high.first)
3617 t_hi += BB_GET_SESE (child_hi_block)->node;
3618
3619 if (hi_child > t_hi)
3620 {
3621 hi_child = t_hi;
3622 node_child = t_sese->high;
3623 child = target;
3624 }
3625 }
3626 else if (t_sese->node < sese->node + dir
3627 && !(dir < 0 && sese->parent == t_sese->node))
3628 {
3629 /* Non-parental ancestor node -- a backlink. */
3630 int d = usd * t_sese->dir;
3631 int back = t_sese->node + d;
3632
3633 if (hi_back > back)
3634 {
3635 hi_back = back;
3636 node_back = pseudo_node_t (target, d);
3637 }
3638 }
3639 }
3640 else
3641 { /* Fallen off graph, backlink to entry node. */
3642 hi_back = 0;
3643 node_back = pseudo_node_t (0, 0);
3644 }
3645 }
3646
3647 /* Remove any brackets that terminate at this pseudo node. */
3648 sese->remove (pseudo_node_t (me, dir));
3649
3650 /* Now push any backlinks from this pseudo node. */
3651 FOR_EACH_EDGE (e, ei, edges)
3652 {
3653 basic_block target = *(basic_block *)((char *)e + offset);
3654 if (bb_sese *t_sese = BB_GET_SESE (target))
3655 {
3656 if (t_sese->node < sese->node + dir
3657 && !(dir < 0 && sese->parent == t_sese->node))
3658 /* Non-parental ancestor node - backedge from me. */
3659 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3660 }
3661 else
3662 {
3663 /* back edge to entry node */
3664 sese->push (pseudo_node_t (0, 0));
3665 }
3666 }
3667
3668 /* If this node leads directly or indirectly to a no-return region of
3669 the graph, then fake a backedge to entry node. */
3670 if (!sese->brackets.length () || !edges || !edges->length ())
3671 {
3672 hi_back = 0;
3673 node_back = pseudo_node_t (0, 0);
3674 sese->push (node_back);
3675 }
3676
3677 /* Record the highest reaching backedge from us or a descendant. */
3678 sese->high = hi_back < hi_child ? node_back : node_child;
3679
3680 if (num_children > 1)
3681 {
3682 /* There is more than one child -- this is a Y shaped piece of
3683 spanning tree. We have to insert a fake backedge from this
3684 node to the highest ancestor reached by not-the-highest
3685 reaching child. Note that there may be multiple children
3686 with backedges to the same highest node. That's ok and we
3687 insert the edge to that highest node. */
3688 hi_child = depth;
3689 if (dir < 0 && child)
3690 {
3691 node_child = sese->high;
3692 hi_child = node_child.second;
3693 if (node_child.first)
3694 hi_child += BB_GET_SESE (node_child.first)->node;
3695 }
3696
3697 FOR_EACH_EDGE (e, ei, edges)
3698 {
3699 basic_block target = *(basic_block *)((char *)e + offset);
3700
3701 if (target == child)
3702 /* Ignore the highest child. */
3703 continue;
3704
3705 bb_sese *t_sese = BB_GET_SESE (target);
3706 if (!t_sese)
3707 continue;
3708 if (t_sese->parent != sese->node)
3709 /* Not a child. */
3710 continue;
3711
3712 /* Compare its hi value. */
3713 int t_hi = t_sese->high.second;
3714
3715 if (basic_block child_hi_block = t_sese->high.first)
3716 t_hi += BB_GET_SESE (child_hi_block)->node;
3717
3718 if (hi_child > t_hi)
3719 {
3720 hi_child = t_hi;
3721 node_child = t_sese->high;
3722 }
3723 }
3724
3725 sese->push (node_child);
3726 }
3727}
3728
3729
3730/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3731 proceed to successors. Set SESE entry and exit nodes of
3732 REGIONS. */
3733
3734static void
3735nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3736 basic_block block, int coloring)
3737{
3738 bb_sese *sese = BB_GET_SESE (block);
3739
3740 if (block->flags & BB_VISITED)
3741 {
3742 /* If we've already encountered this block, either we must not
3743 be coloring, or it must have been colored the current color. */
3744 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3745 return;
3746 }
3747
3748 block->flags |= BB_VISITED;
3749
3750 if (sese)
3751 {
3752 if (coloring < 0)
3753 {
3754 /* Start coloring a region. */
3755 regions[sese->color].first = block;
3756 coloring = sese->color;
3757 }
3758
3759 if (!--color_counts[sese->color] && sese->color == coloring)
3760 {
3761 /* Found final block of SESE region. */
3762 regions[sese->color].second = block;
3763 coloring = -1;
3764 }
3765 else
3766 /* Color the node, so we can assert on revisiting the node
3767 that the graph is indeed SESE. */
3768 sese->color = coloring;
3769 }
3770 else
3771 /* Fallen off the subgraph, we cannot be coloring. */
3772 gcc_assert (coloring < 0);
3773
3774 /* Walk each successor block. */
3775 if (block->succs && block->succs->length ())
3776 {
3777 edge e;
3778 edge_iterator ei;
3779
3780 FOR_EACH_EDGE (e, ei, block->succs)
3781 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3782 }
3783 else
3784 gcc_assert (coloring < 0);
3785}
3786
3787/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3788 end up with NULL entries in it. */
3789
3790static void
3791nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3792{
3793 basic_block block;
3794 int ix;
3795
3796 /* First clear each BB of the whole function. */
2300c332 3797 FOR_ALL_BB_FN (block, cfun)
912442c2
NS
3798 {
3799 block->flags &= ~BB_VISITED;
3800 BB_SET_SESE (block, 0);
3801 }
912442c2
NS
3802
3803 /* Mark blocks in the function that are in this graph. */
3804 for (ix = 0; blocks.iterate (ix, &block); ix++)
3805 block->flags |= BB_VISITED;
3806
3807 /* Counts of nodes assigned to each color. There cannot be more
3808 colors than blocks (and hopefully there will be fewer). */
3809 auto_vec<unsigned> color_counts;
3810 color_counts.reserve (blocks.length ());
3811
3812 /* Worklist of nodes in the spanning tree. Again, there cannot be
3813 more nodes in the tree than blocks (there will be fewer if the
3814 CFG of blocks is disjoint). */
3815 auto_vec<basic_block> spanlist;
3816 spanlist.reserve (blocks.length ());
3817
3818 /* Make sure every block has its cycle class determined. */
3819 for (ix = 0; blocks.iterate (ix, &block); ix++)
3820 {
3821 if (BB_GET_SESE (block))
3822 /* We already met this block in an earlier graph solve. */
3823 continue;
3824
3825 if (dump_file)
3826 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3827
3828 /* Number the nodes reachable from block initial DFS order. */
3829 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3830
3831 /* Now walk in reverse DFS order to find cycle equivalents. */
3832 while (spanlist.length ())
3833 {
3834 block = spanlist.pop ();
3835 bb_sese *sese = BB_GET_SESE (block);
3836
3837 /* Do the pseudo node below. */
3838 nvptx_sese_pseudo (block, sese, depth, +1,
3839 sese->dir > 0 ? block->succs : block->preds,
3840 (sese->dir > 0 ? offsetof (edge_def, dest)
3841 : offsetof (edge_def, src)));
3842 sese->set_color (color_counts);
3843 /* Do the pseudo node above. */
3844 nvptx_sese_pseudo (block, sese, depth, -1,
3845 sese->dir < 0 ? block->succs : block->preds,
3846 (sese->dir < 0 ? offsetof (edge_def, dest)
3847 : offsetof (edge_def, src)));
3848 }
3849 if (dump_file)
3850 fprintf (dump_file, "\n");
3851 }
3852
3853 if (dump_file)
3854 {
3855 unsigned count;
3856 const char *comma = "";
3857
3858 fprintf (dump_file, "Found %d cycle equivalents\n",
3859 color_counts.length ());
3860 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3861 {
3862 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3863
3864 comma = "";
3865 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3866 if (BB_GET_SESE (block)->color == ix)
3867 {
3868 block->flags |= BB_VISITED;
3869 fprintf (dump_file, "%s%d", comma, block->index);
3870 comma=",";
3871 }
3872 fprintf (dump_file, "}");
3873 comma = ", ";
3874 }
3875 fprintf (dump_file, "\n");
3876 }
3877
3878 /* Now we've colored every block in the subgraph. We now need to
3879 determine the minimal set of SESE regions that cover that
3880 subgraph. Do this with a DFS walk of the complete function.
3881 During the walk we're either 'looking' or 'coloring'. When we
3882 reach the last node of a particular color, we stop coloring and
3883 return to looking. */
3884
3885 /* There cannot be more SESE regions than colors. */
3886 regions.reserve (color_counts.length ());
3887 for (ix = color_counts.length (); ix--;)
3888 regions.quick_push (bb_pair_t (0, 0));
3889
3890 for (ix = 0; blocks.iterate (ix, &block); ix++)
3891 block->flags &= ~BB_VISITED;
3892
3893 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3894
3895 if (dump_file)
3896 {
3897 const char *comma = "";
3898 int len = regions.length ();
3899
3900 fprintf (dump_file, "SESE regions:");
3901 for (ix = 0; ix != len; ix++)
3902 {
3903 basic_block from = regions[ix].first;
3904 basic_block to = regions[ix].second;
3905
3906 if (from)
3907 {
3908 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3909 if (to != from)
3910 fprintf (dump_file, "->%d", to->index);
3911
3912 int color = BB_GET_SESE (from)->color;
3913
3914 /* Print the blocks within the region (excluding ends). */
3915 FOR_EACH_BB_FN (block, cfun)
3916 {
3917 bb_sese *sese = BB_GET_SESE (block);
3918
3919 if (sese && sese->color == color
3920 && block != from && block != to)
3921 fprintf (dump_file, ".%d", block->index);
3922 }
3923 fprintf (dump_file, "}");
3924 }
3925 comma = ",";
3926 }
3927 fprintf (dump_file, "\n\n");
3928 }
3929
3930 for (ix = 0; blocks.iterate (ix, &block); ix++)
3931 delete BB_GET_SESE (block);
3932}
3933
3934#undef BB_SET_SESE
3935#undef BB_GET_SESE
3936
e91eba31
NS
3937/* Propagate live state at the start of a partitioned region. IS_CALL
3938 indicates whether the propagation is for a (partitioned) call
3939 instruction. BLOCK provides the live register information, and
3940 might not contain INSN. Propagation is inserted just after INSN. RW
3941 indicates whether we are reading and/or writing state. This
d88cd9c4
NS
3942 separation is needed for worker-level propagation where we
3943 essentially do a spill & fill. FN is the underlying worker
3944 function to generate the propagation instructions for single
3945 register. DATA is user data.
3946
e91eba31
NS
3947 Returns true if we didn't emit any instructions.
3948
3949 We propagate the live register set for non-calls and the entire
3950 frame for calls and non-calls. We could do better by (a)
3951 propagating just the live set that is used within the partitioned
3952 regions and (b) only propagating stack entries that are used. The
3953 latter might be quite hard to determine. */
d88cd9c4 3954
a0b3b5c4 3955typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
d88cd9c4 3956
e91eba31
NS
3957static bool
3958nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
a0b3b5c4 3959 propagate_mask rw, propagator_fn fn, void *data, bool vector)
d88cd9c4
NS
3960{
3961 bitmap live = DF_LIVE_IN (block);
3962 bitmap_iterator iterator;
3963 unsigned ix;
e91eba31 3964 bool empty = true;
d88cd9c4
NS
3965
3966 /* Copy the frame array. */
3967 HOST_WIDE_INT fs = get_frame_size ();
3968 if (fs)
3969 {
3970 rtx tmp = gen_reg_rtx (DImode);
3971 rtx idx = NULL_RTX;
3972 rtx ptr = gen_reg_rtx (Pmode);
3973 rtx pred = NULL_RTX;
3974 rtx_code_label *label = NULL;
3975
e91eba31 3976 empty = false;
c1311c86
NS
3977 /* The frame size might not be DImode compatible, but the frame
3978 array's declaration will be. So it's ok to round up here. */
3979 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
d88cd9c4
NS
3980 /* Detect single iteration loop. */
3981 if (fs == 1)
3982 fs = 0;
3983
3984 start_sequence ();
3985 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3986 if (fs)
3987 {
3988 idx = gen_reg_rtx (SImode);
3989 pred = gen_reg_rtx (BImode);
3990 label = gen_label_rtx ();
3991
3992 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3993 /* Allow worker function to initialize anything needed. */
a0b3b5c4 3994 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
d88cd9c4
NS
3995 if (init)
3996 emit_insn (init);
3997 emit_label (label);
3998 LABEL_NUSES (label)++;
3999 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4000 }
4001 if (rw & PM_read)
4002 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
a0b3b5c4 4003 emit_insn (fn (tmp, rw, fs, data, vector));
d88cd9c4
NS
4004 if (rw & PM_write)
4005 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4006 if (fs)
4007 {
4008 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4009 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4010 emit_insn (gen_br_true_uni (pred, label));
a0b3b5c4 4011 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
d88cd9c4
NS
4012 if (fini)
4013 emit_insn (fini);
4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4015 }
4016 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4017 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4018 rtx cpy = get_insns ();
4019 end_sequence ();
4020 insn = emit_insn_after (cpy, insn);
4021 }
4022
e91eba31
NS
4023 if (!is_call)
4024 /* Copy live registers. */
4025 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4026 {
4027 rtx reg = regno_reg_rtx[ix];
d88cd9c4 4028
e91eba31
NS
4029 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4030 {
a0b3b5c4 4031 rtx bcast = fn (reg, rw, 0, data, vector);
d88cd9c4 4032
e91eba31
NS
4033 insn = emit_insn_after (bcast, insn);
4034 empty = false;
4035 }
4036 }
4037 return empty;
d88cd9c4
NS
4038}
4039
a0b3b5c4 4040/* Worker for nvptx_warp_propagate. */
d88cd9c4
NS
4041
4042static rtx
a0b3b5c4
TV
4043warp_prop_gen (rtx reg, propagate_mask pm,
4044 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4045 bool ARG_UNUSED (vector))
d88cd9c4
NS
4046{
4047 if (!(pm & PM_read_write))
4048 return 0;
4049
a0b3b5c4 4050 return nvptx_gen_warp_bcast (reg);
d88cd9c4
NS
4051}
4052
4053/* Propagate state that is live at start of BLOCK across the vectors
e91eba31
NS
4054 of a single warp. Propagation is inserted just after INSN.
4055 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4056
e91eba31 4057static bool
a0b3b5c4 4058nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 4059{
a0b3b5c4
TV
4060 return nvptx_propagate (is_call, block, insn, PM_read_write,
4061 warp_prop_gen, 0, false);
d88cd9c4
NS
4062}
4063
a0b3b5c4 4064/* Worker for nvptx_shared_propagate. */
d88cd9c4
NS
4065
4066static rtx
a0b3b5c4
TV
4067shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4068 bool vector)
d88cd9c4 4069{
1ed57fb8 4070 broadcast_data_t *data = (broadcast_data_t *)data_;
d88cd9c4
NS
4071
4072 if (pm & PM_loop_begin)
4073 {
4074 /* Starting a loop, initialize pointer. */
4075 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4076
3c55d60f
TV
4077 oacc_bcast_align = MAX (oacc_bcast_align, align);
4078 data->offset = ROUND_UP (data->offset, align);
d88cd9c4
NS
4079
4080 data->ptr = gen_reg_rtx (Pmode);
4081
4082 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4083 }
4084 else if (pm & PM_loop_end)
4085 {
4086 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4087 data->ptr = NULL_RTX;
4088 return clobber;
4089 }
4090 else
a0b3b5c4 4091 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
d88cd9c4
NS
4092}
4093
4094/* Spill or fill live state that is live at start of BLOCK. PRE_P
4095 indicates if this is just before partitioned mode (do spill), or
4096 just after it starts (do fill). Sequence is inserted just after
e91eba31 4097 INSN. IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4098
e91eba31 4099static bool
a0b3b5c4
TV
4100nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4101 rtx_insn *insn, bool vector)
d88cd9c4 4102{
1ed57fb8 4103 broadcast_data_t data;
d88cd9c4
NS
4104
4105 data.base = gen_reg_rtx (Pmode);
4106 data.offset = 0;
4107 data.ptr = NULL_RTX;
4108
e91eba31 4109 bool empty = nvptx_propagate (is_call, block, insn,
a0b3b5c4
TV
4110 pre_p ? PM_read : PM_write, shared_prop_gen,
4111 &data, vector);
e91eba31 4112 gcc_assert (empty == !data.offset);
d88cd9c4
NS
4113 if (data.offset)
4114 {
0024c320
TV
4115 rtx bcast_sym = oacc_bcast_sym;
4116
d88cd9c4 4117 /* Stuff was emitted, initialize the base pointer now. */
0024c320
TV
4118 if (vector && nvptx_mach_max_workers () > 1)
4119 {
4120 if (!cfun->machine->bcast_partition)
4121 {
4122 /* It would be nice to place this register in
4123 DATA_AREA_SHARED. */
4124 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4125 }
4126 if (!cfun->machine->sync_bar)
4127 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4128
4129 bcast_sym = cfun->machine->bcast_partition;
4130 }
4131
4132 rtx init = gen_rtx_SET (data.base, bcast_sym);
d88cd9c4 4133 emit_insn_after (init, insn);
15ab6f00 4134
0024c320
TV
4135 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4136 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4137 ? nvptx_mach_max_workers () + 1
4138 : 1);
4139
4140 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4141 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
d88cd9c4 4142 }
e91eba31 4143 return empty;
d88cd9c4
NS
4144}
4145
0024c320
TV
4146/* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4147 which is an integer or a register. THREADS is the number of threads
4148 controlled by the barrier. */
d88cd9c4
NS
4149
4150static rtx
0024c320 4151nvptx_cta_sync (rtx lock, int threads)
d88cd9c4 4152{
0024c320 4153 return gen_nvptx_barsync (lock, GEN_INT (threads));
d88cd9c4
NS
4154}
4155
43c371e8
TV
4156#if WORKAROUND_PTXJIT_BUG
4157/* Return first real insn in BB, or return NULL_RTX if BB does not contain
4158 real insns. */
4159
4160static rtx_insn *
4161bb_first_real_insn (basic_block bb)
4162{
4163 rtx_insn *insn;
4164
4165 /* Find first insn of from block. */
4166 FOR_BB_INSNS (bb, insn)
4167 if (INSN_P (insn))
4168 return insn;
4169
4170 return 0;
4171}
4172#endif
4173
6beefdbd
TV
4174/* Return true if INSN needs neutering. */
4175
4176static bool
4177needs_neutering_p (rtx_insn *insn)
4178{
4179 if (!INSN_P (insn))
4180 return false;
4181
4182 switch (recog_memoized (insn))
4183 {
4184 case CODE_FOR_nvptx_fork:
4185 case CODE_FOR_nvptx_forked:
4186 case CODE_FOR_nvptx_joining:
4187 case CODE_FOR_nvptx_join:
4188 case CODE_FOR_nvptx_barsync:
4189 return false;
4190 default:
4191 return true;
4192 }
4193}
4194
a874808c
TV
4195/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4196
4197static bool
4198verify_neutering_jumps (basic_block from,
4199 rtx_insn *vector_jump, rtx_insn *worker_jump,
4200 rtx_insn *vector_label, rtx_insn *worker_label)
4201{
4202 basic_block bb = from;
4203 rtx_insn *insn = BB_HEAD (bb);
4204 bool seen_worker_jump = false;
4205 bool seen_vector_jump = false;
4206 bool seen_worker_label = false;
4207 bool seen_vector_label = false;
4208 bool worker_neutered = false;
4209 bool vector_neutered = false;
4210 while (true)
4211 {
4212 if (insn == worker_jump)
4213 {
4214 seen_worker_jump = true;
4215 worker_neutered = true;
4216 gcc_assert (!vector_neutered);
4217 }
4218 else if (insn == vector_jump)
4219 {
4220 seen_vector_jump = true;
4221 vector_neutered = true;
4222 }
4223 else if (insn == worker_label)
4224 {
4225 seen_worker_label = true;
4226 gcc_assert (worker_neutered);
4227 worker_neutered = false;
4228 }
4229 else if (insn == vector_label)
4230 {
4231 seen_vector_label = true;
4232 gcc_assert (vector_neutered);
4233 vector_neutered = false;
4234 }
4235 else if (INSN_P (insn))
4236 switch (recog_memoized (insn))
4237 {
4238 case CODE_FOR_nvptx_barsync:
4239 gcc_assert (!vector_neutered && !worker_neutered);
4240 break;
4241 default:
4242 break;
4243 }
4244
4245 if (insn != BB_END (bb))
4246 insn = NEXT_INSN (insn);
4247 else if (JUMP_P (insn) && single_succ_p (bb)
4248 && !seen_vector_jump && !seen_worker_jump)
4249 {
4250 bb = single_succ (bb);
4251 insn = BB_HEAD (bb);
4252 }
4253 else
4254 break;
4255 }
4256
4257 gcc_assert (!(vector_jump && !seen_vector_jump));
4258 gcc_assert (!(worker_jump && !seen_worker_jump));
4259
4260 if (seen_vector_label || seen_worker_label)
4261 {
4262 gcc_assert (!(vector_label && !seen_vector_label));
4263 gcc_assert (!(worker_label && !seen_worker_label));
4264
4265 return true;
4266 }
4267
4268 return false;
4269}
4270
4271/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4272
4273static void
4274verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4275 rtx_insn *worker_label)
4276{
4277 basic_block bb = to;
4278 rtx_insn *insn = BB_END (bb);
4279 bool seen_worker_label = false;
4280 bool seen_vector_label = false;
4281 while (true)
4282 {
4283 if (insn == worker_label)
4284 {
4285 seen_worker_label = true;
4286 gcc_assert (!seen_vector_label);
4287 }
4288 else if (insn == vector_label)
4289 seen_vector_label = true;
4290 else if (INSN_P (insn))
4291 switch (recog_memoized (insn))
4292 {
4293 case CODE_FOR_nvptx_barsync:
4294 gcc_assert (!seen_vector_label && !seen_worker_label);
4295 break;
4296 }
4297
4298 if (insn != BB_HEAD (bb))
4299 insn = PREV_INSN (insn);
4300 else
4301 break;
4302 }
4303
4304 gcc_assert (!(vector_label && !seen_vector_label));
4305 gcc_assert (!(worker_label && !seen_worker_label));
4306}
4307
d88cd9c4
NS
4308/* Single neutering according to MASK. FROM is the incoming block and
4309 TO is the outgoing block. These may be the same block. Insert at
4310 start of FROM:
4311
4312 if (tid.<axis>) goto end.
4313
4314 and insert before ending branch of TO (if there is such an insn):
4315
4316 end:
4317 <possibly-broadcast-cond>
4318 <branch>
4319
4320 We currently only use differnt FROM and TO when skipping an entire
4321 loop. We could do more if we detected superblocks. */
4322
4323static void
4324nvptx_single (unsigned mask, basic_block from, basic_block to)
4325{
4326 rtx_insn *head = BB_HEAD (from);
4327 rtx_insn *tail = BB_END (to);
4328 unsigned skip_mask = mask;
4329
ed9091b2
TV
4330 while (true)
4331 {
4332 /* Find first insn of from block. */
6beefdbd 4333 while (head != BB_END (from) && !needs_neutering_p (head))
ed9091b2
TV
4334 head = NEXT_INSN (head);
4335
4336 if (from == to)
4337 break;
4338
4339 if (!(JUMP_P (head) && single_succ_p (from)))
4340 break;
4341
4342 basic_block jump_target = single_succ (from);
4343 if (!single_pred_p (jump_target))
4344 break;
4345
4346 from = jump_target;
4347 head = BB_HEAD (from);
4348 }
d88cd9c4
NS
4349
4350 /* Find last insn of to block */
4351 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4352 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4353 tail = PREV_INSN (tail);
4354
4355 /* Detect if tail is a branch. */
4356 rtx tail_branch = NULL_RTX;
4357 rtx cond_branch = NULL_RTX;
4358 if (tail && INSN_P (tail))
4359 {
4360 tail_branch = PATTERN (tail);
4361 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4362 tail_branch = NULL_RTX;
4363 else
4364 {
4365 cond_branch = SET_SRC (tail_branch);
4366 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4367 cond_branch = NULL_RTX;
4368 }
4369 }
4370
4371 if (tail == head)
4372 {
4373 /* If this is empty, do nothing. */
6beefdbd 4374 if (!head || !needs_neutering_p (head))
d88cd9c4
NS
4375 return;
4376
d88cd9c4
NS
4377 if (cond_branch)
4378 {
4379 /* If we're only doing vector single, there's no need to
4380 emit skip code because we'll not insert anything. */
4381 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4382 skip_mask = 0;
4383 }
4384 else if (tail_branch)
4385 /* Block with only unconditional branch. Nothing to do. */
4386 return;
4387 }
4388
4389 /* Insert the vector test inside the worker test. */
4390 unsigned mode;
4391 rtx_insn *before = tail;
2ba16fd2 4392 rtx_insn *neuter_start = NULL;
a874808c
TV
4393 rtx_insn *worker_label = NULL, *vector_label = NULL;
4394 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
d88cd9c4
NS
4395 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4396 if (GOMP_DIM_MASK (mode) & skip_mask)
4397 {
4398 rtx_code_label *label = gen_label_rtx ();
4399 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
478cb3ce
JJ
4400 rtx_insn **mode_jump
4401 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4402 rtx_insn **mode_label
4403 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
d88cd9c4
NS
4404
4405 if (!pred)
4406 {
4407 pred = gen_reg_rtx (BImode);
4408 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4409 }
7820b298 4410
d88cd9c4
NS
4411 rtx br;
4412 if (mode == GOMP_DIM_VECTOR)
4413 br = gen_br_true (pred, label);
4414 else
4415 br = gen_br_true_uni (pred, label);
2ba16fd2
TV
4416 if (neuter_start)
4417 neuter_start = emit_insn_after (br, neuter_start);
4418 else
4419 neuter_start = emit_insn_before (br, head);
a874808c 4420 *mode_jump = neuter_start;
d88cd9c4
NS
4421
4422 LABEL_NUSES (label)++;
a874808c 4423 rtx_insn *label_insn;
d88cd9c4 4424 if (tail_branch)
a874808c
TV
4425 {
4426 label_insn = emit_label_before (label, before);
4427 before = label_insn;
4428 }
d88cd9c4 4429 else
be606483 4430 {
a874808c 4431 label_insn = emit_label_after (label, tail);
e77a1236
TV
4432 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4433 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
be606483
TV
4434 emit_insn_after (gen_exit (), label_insn);
4435 }
a874808c 4436
478cb3ce 4437 *mode_label = label_insn;
d88cd9c4
NS
4438 }
4439
4440 /* Now deal with propagating the branch condition. */
4441 if (cond_branch)
4442 {
4443 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4444
0024c320
TV
4445 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4446 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
d88cd9c4
NS
4447 {
4448 /* Vector mode only, do a shuffle. */
43c371e8
TV
4449#if WORKAROUND_PTXJIT_BUG
4450 /* The branch condition %rcond is propagated like this:
4451
4452 {
4453 .reg .u32 %x;
4454 mov.u32 %x,%tid.x;
4455 setp.ne.u32 %rnotvzero,%x,0;
4456 }
4457
4458 @%rnotvzero bra Lskip;
4459 setp.<op>.<type> %rcond,op1,op2;
4460 Lskip:
4461 selp.u32 %rcondu32,1,0,%rcond;
4462 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4463 setp.ne.u32 %rcond,%rcondu32,0;
4464
4465 There seems to be a bug in the ptx JIT compiler (observed at driver
4466 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4467 unless %rcond is initialized to something before 'bra Lskip'. The
4468 bug is not observed with ptxas from cuda 8.0.61.
4469
4470 It is true that the code is non-trivial: at Lskip, %rcond is
4471 uninitialized in threads 1-31, and after the selp the same holds
4472 for %rcondu32. But shfl propagates the defined value in thread 0
4473 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4474 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4475
4476 There is nothing in the PTX spec to suggest that this is wrong, or
4477 to explain why the extra initialization is needed. So, we classify
8c8e9a6b
TV
4478 it as a JIT bug, and the extra initialization as workaround:
4479
4480 {
4481 .reg .u32 %x;
4482 mov.u32 %x,%tid.x;
4483 setp.ne.u32 %rnotvzero,%x,0;
4484 }
4485
4486 +.reg .pred %rcond2;
4487 +setp.eq.u32 %rcond2, 1, 0;
4488
4489 @%rnotvzero bra Lskip;
4490 setp.<op>.<type> %rcond,op1,op2;
4491 +mov.pred %rcond2, %rcond;
4492 Lskip:
4493 +mov.pred %rcond, %rcond2;
4494 selp.u32 %rcondu32,1,0,%rcond;
4495 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4496 setp.ne.u32 %rcond,%rcondu32,0;
4497 */
4498 rtx_insn *label = PREV_INSN (tail);
4499 gcc_assert (label && LABEL_P (label));
4500 rtx tmp = gen_reg_rtx (BImode);
4501 emit_insn_before (gen_movbi (tmp, const0_rtx),
43c371e8 4502 bb_first_real_insn (from));
8c8e9a6b
TV
4503 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4504 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
43c371e8 4505#endif
a0b3b5c4 4506 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
d88cd9c4
NS
4507 }
4508 else
4509 {
4510 /* Includes worker mode, do spill & fill. By construction
4511 we should never have worker mode only. */
1ed57fb8 4512 broadcast_data_t data;
0024c320
TV
4513 unsigned size = GET_MODE_SIZE (SImode);
4514 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4515 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4516 rtx barrier = GEN_INT (0);
4517 int threads = 0;
d88cd9c4 4518
1ed57fb8 4519 data.base = oacc_bcast_sym;
d88cd9c4
NS
4520 data.ptr = 0;
4521
0024c320
TV
4522 bool use_partitioning_p = (vector && !worker
4523 && nvptx_mach_max_workers () > 1
4524 && cfun->machine->bcast_partition);
4525 if (use_partitioning_p)
4526 {
4527 data.base = cfun->machine->bcast_partition;
4528 barrier = cfun->machine->sync_bar;
4529 threads = nvptx_mach_vector_length ();
4530 }
4531 gcc_assert (data.base != NULL);
4532 gcc_assert (barrier);
4533
4534 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4535 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4536 ? nvptx_mach_max_workers () + 1
4537 : 1);
4538
4539 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4540 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
d88cd9c4
NS
4541
4542 data.offset = 0;
a0b3b5c4 4543 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
0024c320 4544 vector),
d88cd9c4 4545 before);
0024c320 4546
d88cd9c4 4547 /* Barrier so other workers can see the write. */
0024c320 4548 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
d88cd9c4 4549 data.offset = 0;
a0b3b5c4 4550 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
0024c320
TV
4551 vector),
4552 tail);
d88cd9c4
NS
4553 /* This barrier is needed to avoid worker zero clobbering
4554 the broadcast buffer before all the other workers have
4555 had a chance to read this instance of it. */
0024c320 4556 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
d88cd9c4
NS
4557 }
4558
4559 extract_insn (tail);
4560 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4561 UNSPEC_BR_UNIFIED);
4562 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4563 }
a874808c
TV
4564
4565 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4566 vector_label, worker_label);
4567 if (!seen_label)
4568 verify_neutering_labels (to, vector_label, worker_label);
d88cd9c4
NS
4569}
4570
4571/* PAR is a parallel that is being skipped in its entirety according to
4572 MASK. Treat this as skipping a superblock starting at forked
4573 and ending at joining. */
4574
4575static void
4576nvptx_skip_par (unsigned mask, parallel *par)
4577{
4578 basic_block tail = par->join_block;
4579 gcc_assert (tail->preds->length () == 1);
4580
4581 basic_block pre_tail = (*tail->preds)[0]->src;
4582 gcc_assert (pre_tail->succs->length () == 1);
4583
4584 nvptx_single (mask, par->forked_block, pre_tail);
4585}
4586
dba619f3
NS
4587/* If PAR has a single inner parallel and PAR itself only contains
4588 empty entry and exit blocks, swallow the inner PAR. */
4589
4590static void
4591nvptx_optimize_inner (parallel *par)
4592{
4593 parallel *inner = par->inner;
4594
4595 /* We mustn't be the outer dummy par. */
4596 if (!par->mask)
4597 return;
4598
4599 /* We must have a single inner par. */
4600 if (!inner || inner->next)
4601 return;
4602
4603 /* We must only contain 2 blocks ourselves -- the head and tail of
4604 the inner par. */
4605 if (par->blocks.length () != 2)
4606 return;
4607
4608 /* We must be disjoint partitioning. As we only have vector and
4609 worker partitioning, this is sufficient to guarantee the pars
4610 have adjacent partitioning. */
4611 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4612 /* This indicates malformed code generation. */
4613 return;
4614
4615 /* The outer forked insn should be immediately followed by the inner
4616 fork insn. */
4617 rtx_insn *forked = par->forked_insn;
4618 rtx_insn *fork = BB_END (par->forked_block);
4619
4620 if (NEXT_INSN (forked) != fork)
4621 return;
4622 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4623
4624 /* The outer joining insn must immediately follow the inner join
4625 insn. */
4626 rtx_insn *joining = par->joining_insn;
4627 rtx_insn *join = inner->join_insn;
4628 if (NEXT_INSN (join) != joining)
4629 return;
4630
4631 /* Preconditions met. Swallow the inner par. */
4632 if (dump_file)
4633 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4634 inner->mask, inner->forked_block->index,
4635 inner->join_block->index,
4636 par->mask, par->forked_block->index, par->join_block->index);
4637
4638 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4639
4640 par->blocks.reserve (inner->blocks.length ());
4641 while (inner->blocks.length ())
4642 par->blocks.quick_push (inner->blocks.pop ());
4643
4644 par->inner = inner->inner;
4645 inner->inner = NULL;
4646
4647 delete inner;
4648}
4649
d88cd9c4
NS
4650/* Process the parallel PAR and all its contained
4651 parallels. We do everything but the neutering. Return mask of
4652 partitioned modes used within this parallel. */
4653
4654static unsigned
4655nvptx_process_pars (parallel *par)
4656{
dba619f3
NS
4657 if (nvptx_optimize)
4658 nvptx_optimize_inner (par);
4659
d88cd9c4
NS
4660 unsigned inner_mask = par->mask;
4661
4662 /* Do the inner parallels first. */
4663 if (par->inner)
4664 {
4665 par->inner_mask = nvptx_process_pars (par->inner);
4666 inner_mask |= par->inner_mask;
4667 }
4668
e91eba31 4669 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
0024c320
TV
4670 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4671 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4672 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
e91eba31 4673
0024c320 4674 if (worker || large_vector)
d88cd9c4 4675 {
a0b3b5c4 4676 nvptx_shared_propagate (false, is_call, par->forked_block,
0024c320 4677 par->forked_insn, !worker);
d495b5cc
TV
4678 bool no_prop_p
4679 = nvptx_shared_propagate (true, is_call, par->forked_block,
4680 par->fork_insn, !worker);
4681 bool empty_loop_p
4682 = !is_call && (NEXT_INSN (par->forked_insn)
4683 && NEXT_INSN (par->forked_insn) == par->joining_insn);
0024c320
TV
4684 rtx barrier = GEN_INT (0);
4685 int threads = 0;
4686
4687 if (!worker && cfun->machine->sync_bar)
4688 {
4689 barrier = cfun->machine->sync_bar;
4690 threads = nvptx_mach_vector_length ();
4691 }
e91eba31 4692
d495b5cc
TV
4693 if (no_prop_p && empty_loop_p)
4694 ;
4695 else if (no_prop_p && is_call)
4696 ;
4697 else
e91eba31
NS
4698 {
4699 /* Insert begin and end synchronizations. */
0024c320
TV
4700 emit_insn_before (nvptx_cta_sync (barrier, threads),
4701 par->forked_insn);
4702 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
e91eba31 4703 }
d88cd9c4
NS
4704 }
4705 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
a0b3b5c4 4706 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
d88cd9c4
NS
4707
4708 /* Now do siblings. */
4709 if (par->next)
4710 inner_mask |= nvptx_process_pars (par->next);
4711 return inner_mask;
4712}
4713
4714/* Neuter the parallel described by PAR. We recurse in depth-first
4715 order. MODES are the partitioning of the execution and OUTER is
4716 the partitioning of the parallels we are contained in. */
4717
4718static void
4719nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4720{
4721 unsigned me = (par->mask
4722 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4723 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4724 unsigned skip_mask = 0, neuter_mask = 0;
4725
4726 if (par->inner)
4727 nvptx_neuter_pars (par->inner, modes, outer | me);
4728
4729 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4730 {
4731 if ((outer | me) & GOMP_DIM_MASK (mode))
4732 {} /* Mode is partitioned: no neutering. */
4733 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 4734 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
4735 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4736 || !par->forked_insn)
4737 /* Partitioned in inner parallels, or we're not a partitioned
4738 at all: neuter individual blocks. */
4739 neuter_mask |= GOMP_DIM_MASK (mode);
4740 else if (!par->parent || !par->parent->forked_insn
4741 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4742 /* Parent isn't a parallel or contains this paralleling: skip
4743 parallel at this level. */
4744 skip_mask |= GOMP_DIM_MASK (mode);
4745 else
4746 {} /* Parent will skip this parallel itself. */
4747 }
4748
4749 if (neuter_mask)
4750 {
912442c2 4751 int ix, len;
d88cd9c4 4752
912442c2
NS
4753 if (nvptx_optimize)
4754 {
4755 /* Neuter whole SESE regions. */
4756 bb_pair_vec_t regions;
4757
4758 nvptx_find_sese (par->blocks, regions);
4759 len = regions.length ();
4760 for (ix = 0; ix != len; ix++)
4761 {
4762 basic_block from = regions[ix].first;
4763 basic_block to = regions[ix].second;
4764
4765 if (from)
4766 nvptx_single (neuter_mask, from, to);
4767 else
4768 gcc_assert (!to);
4769 }
4770 }
4771 else
d88cd9c4 4772 {
912442c2
NS
4773 /* Neuter each BB individually. */
4774 len = par->blocks.length ();
4775 for (ix = 0; ix != len; ix++)
4776 {
4777 basic_block block = par->blocks[ix];
d88cd9c4 4778
912442c2
NS
4779 nvptx_single (neuter_mask, block, block);
4780 }
d88cd9c4
NS
4781 }
4782 }
4783
4784 if (skip_mask)
7820b298 4785 nvptx_skip_par (skip_mask, par);
d88cd9c4
NS
4786
4787 if (par->next)
4788 nvptx_neuter_pars (par->next, modes, outer);
4789}
4790
2498a6ab
TV
4791static void
4792populate_offload_attrs (offload_attrs *oa)
4793{
4794 tree attr = oacc_get_fn_attrib (current_function_decl);
4795 tree dims = TREE_VALUE (attr);
4796 unsigned ix;
4797
4798 oa->mask = 0;
4799
4800 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4801 {
4802 tree t = TREE_VALUE (dims);
4803 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4804 tree allowed = TREE_PURPOSE (dims);
4805
4806 if (size != 1 && !(allowed && integer_zerop (allowed)))
4807 oa->mask |= GOMP_DIM_MASK (ix);
4808
4809 switch (ix)
4810 {
4811 case GOMP_DIM_GANG:
4812 oa->num_gangs = size;
4813 break;
4814
4815 case GOMP_DIM_WORKER:
4816 oa->num_workers = size;
4817 break;
4818
4819 case GOMP_DIM_VECTOR:
4820 oa->vector_length = size;
4821 break;
4822 }
4823 }
4824}
4825
3dede32b
TV
4826#if WORKAROUND_PTXJIT_BUG_2
4827/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4828 is needed in the nvptx target because the branches generated for
4829 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4830
4831static rtx
4832nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4833{
4834 rtx pat;
4835 if ((strict && !JUMP_P (insn))
4836 || (!strict && !INSN_P (insn)))
4837 return NULL_RTX;
4838 pat = PATTERN (insn);
4839
4840 /* The set is allowed to appear either as the insn pattern or
4841 the first set in a PARALLEL. */
4842 if (GET_CODE (pat) == PARALLEL)
4843 pat = XVECEXP (pat, 0, 0);
4844 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4845 return pat;
4846
4847 return NULL_RTX;
4848}
4849
4850/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4851
4852static rtx
4853nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4854{
4855 rtx x = nvptx_pc_set (insn, strict);
4856
4857 if (!x)
4858 return NULL_RTX;
4859 x = SET_SRC (x);
4860 if (GET_CODE (x) == LABEL_REF)
4861 return x;
4862 if (GET_CODE (x) != IF_THEN_ELSE)
4863 return NULL_RTX;
4864 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4865 return XEXP (x, 1);
4866 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4867 return XEXP (x, 2);
4868 return NULL_RTX;
4869}
4870
4871/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4872 insn inbetween the branch and the label. This works around a JIT bug
4873 observed at driver version 384.111, at -O0 for sm_50. */
4874
4875static void
4876prevent_branch_around_nothing (void)
4877{
4878 rtx_insn *seen_label = NULL;
4879 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4880 {
b0b592fc 4881 if (INSN_P (insn) && condjump_p (insn))
3dede32b 4882 {
b0b592fc 4883 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
3dede32b
TV
4884 continue;
4885 }
4886
b0b592fc
TV
4887 if (seen_label == NULL)
4888 continue;
4889
3dede32b
TV
4890 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4891 continue;
4892
4893 if (INSN_P (insn))
4894 switch (recog_memoized (insn))
4895 {
4896 case CODE_FOR_nvptx_fork:
4897 case CODE_FOR_nvptx_forked:
4898 case CODE_FOR_nvptx_joining:
4899 case CODE_FOR_nvptx_join:
4900 continue;
4901 default:
4902 seen_label = NULL;
4903 continue;
4904 }
4905
4906 if (LABEL_P (insn) && insn == seen_label)
4907 emit_insn_before (gen_fake_nop (), insn);
4908
4909 seen_label = NULL;
4910 }
4911 }
4912#endif
4913
21251395
TV
4914#ifdef WORKAROUND_PTXJIT_BUG_3
4915/* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
4916 works around a hang observed at driver version 390.48 for sm_50. */
4917
4918static void
4919workaround_barsyncs (void)
4920{
4921 bool seen_barsync = false;
4922 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4923 {
4924 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4925 {
4926 if (seen_barsync)
4927 {
4928 emit_insn_before (gen_nvptx_membar_cta (), insn);
4929 emit_insn_before (gen_nvptx_membar_cta (), insn);
4930 }
4931
4932 seen_barsync = true;
4933 continue;
4934 }
4935
4936 if (!seen_barsync)
4937 continue;
4938
4939 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4940 continue;
4941 else if (INSN_P (insn))
4942 switch (recog_memoized (insn))
4943 {
4944 case CODE_FOR_nvptx_fork:
4945 case CODE_FOR_nvptx_forked:
4946 case CODE_FOR_nvptx_joining:
4947 case CODE_FOR_nvptx_join:
4948 continue;
4949 default:
4950 break;
4951 }
4952
4953 seen_barsync = false;
4954 }
4955}
4956#endif
4957
517665b3 4958/* PTX-specific reorganization
d88cd9c4 4959 - Split blocks at fork and join instructions
c38f0d8c
NS
4960 - Compute live registers
4961 - Mark now-unused registers, so function begin doesn't declare
517665b3 4962 unused registers.
d88cd9c4
NS
4963 - Insert state propagation when entering partitioned mode
4964 - Insert neutering instructions when in single mode
c38f0d8c 4965 - Replace subregs with suitable sequences.
517665b3
NS
4966*/
4967
4968static void
4969nvptx_reorg (void)
4970{
517665b3
NS
4971 /* We are freeing block_for_insn in the toplev to keep compatibility
4972 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4973 compute_bb_for_insn ();
4974
4975 thread_prologue_and_epilogue_insns ();
4976
d88cd9c4
NS
4977 /* Split blocks and record interesting unspecs. */
4978 bb_insn_map_t bb_insn_map;
4979
4980 nvptx_split_blocks (&bb_insn_map);
4981
c38f0d8c 4982 /* Compute live regs */
517665b3
NS
4983 df_clear_flags (DF_LR_RUN_DCE);
4984 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
d88cd9c4
NS
4985 df_live_add_problem ();
4986 df_live_set_all_dirty ();
517665b3 4987 df_analyze ();
738f2522
BS
4988 regstat_init_n_sets_and_refs ();
4989
d88cd9c4
NS
4990 if (dump_file)
4991 df_dump (dump_file);
4992
517665b3 4993 /* Mark unused regs as unused. */
d88cd9c4 4994 int max_regs = max_reg_num ();
44c068ae 4995 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
738f2522
BS
4996 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4997 regno_reg_rtx[i] = const0_rtx;
517665b3 4998
d88cd9c4
NS
4999 /* Determine launch dimensions of the function. If it is not an
5000 offloaded function (i.e. this is a regular compiler), the
5001 function has no neutering. */
629b3d75 5002 tree attr = oacc_get_fn_attrib (current_function_decl);
d88cd9c4
NS
5003 if (attr)
5004 {
5005 /* If we determined this mask before RTL expansion, we could
5006 elide emission of some levels of forks and joins. */
2498a6ab 5007 offload_attrs oa;
d88cd9c4 5008
2498a6ab 5009 populate_offload_attrs (&oa);
d88cd9c4 5010
d88cd9c4
NS
5011 /* If there is worker neutering, there must be vector
5012 neutering. Otherwise the hardware will fail. */
2498a6ab
TV
5013 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5014 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
d88cd9c4
NS
5015
5016 /* Discover & process partitioned regions. */
5017 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5018 nvptx_process_pars (pars);
2498a6ab 5019 nvptx_neuter_pars (pars, oa.mask, 0);
d88cd9c4
NS
5020 delete pars;
5021 }
5022
517665b3 5023 /* Replace subregs. */
c03b0416 5024 nvptx_reorg_subreg ();
517665b3 5025
5012919d
AM
5026 if (TARGET_UNIFORM_SIMT)
5027 nvptx_reorg_uniform_simt ();
5028
3dede32b
TV
5029#if WORKAROUND_PTXJIT_BUG_2
5030 prevent_branch_around_nothing ();
5031#endif
5032
21251395
TV
5033#ifdef WORKAROUND_PTXJIT_BUG_3
5034 workaround_barsyncs ();
5035#endif
5036
738f2522 5037 regstat_free_n_sets_and_refs ();
517665b3
NS
5038
5039 df_finish_pass (true);
738f2522
BS
5040}
5041\f
5042/* Handle a "kernel" attribute; arguments as in
5043 struct attribute_spec.handler. */
5044
5045static tree
5046nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5047 int ARG_UNUSED (flags), bool *no_add_attrs)
5048{
5049 tree decl = *node;
5050
5051 if (TREE_CODE (decl) != FUNCTION_DECL)
5052 {
5053 error ("%qE attribute only applies to functions", name);
5054 *no_add_attrs = true;
5055 }
b49e35a9 5056 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
5057 {
5058 error ("%qE attribute requires a void return type", name);
5059 *no_add_attrs = true;
5060 }
5061
5062 return NULL_TREE;
5063}
5064
5012919d
AM
5065/* Handle a "shared" attribute; arguments as in
5066 struct attribute_spec.handler. */
5067
5068static tree
5069nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5070 int ARG_UNUSED (flags), bool *no_add_attrs)
5071{
5072 tree decl = *node;
5073
5074 if (TREE_CODE (decl) != VAR_DECL)
5075 {
5076 error ("%qE attribute only applies to variables", name);
5077 *no_add_attrs = true;
5078 }
5079 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5080 {
5081 error ("%qE attribute not allowed with auto storage class", name);
5082 *no_add_attrs = true;
5083 }
5084
5085 return NULL_TREE;
5086}
5087
738f2522
BS
5088/* Table of valid machine attributes. */
5089static const struct attribute_spec nvptx_attribute_table[] =
5090{
4849deb1
JJ
5091 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5092 affects_type_identity, handler, exclude } */
5093 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5d9ae53d 5094 NULL },
4849deb1 5095 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5d9ae53d 5096 NULL },
4849deb1 5097 { NULL, 0, 0, false, false, false, false, NULL, NULL }
738f2522
BS
5098};
5099\f
5100/* Limit vector alignments to BIGGEST_ALIGNMENT. */
5101
5102static HOST_WIDE_INT
5103nvptx_vector_alignment (const_tree type)
5104{
5105 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5106
5107 return MIN (align, BIGGEST_ALIGNMENT);
5108}
d88cd9c4
NS
5109
5110/* Indicate that INSN cannot be duplicated. */
5111
5112static bool
5113nvptx_cannot_copy_insn_p (rtx_insn *insn)
5114{
5115 switch (recog_memoized (insn))
5116 {
5117 case CODE_FOR_nvptx_shufflesi:
5118 case CODE_FOR_nvptx_shufflesf:
5119 case CODE_FOR_nvptx_barsync:
5120 case CODE_FOR_nvptx_fork:
5121 case CODE_FOR_nvptx_forked:
5122 case CODE_FOR_nvptx_joining:
5123 case CODE_FOR_nvptx_join:
5124 return true;
5125 default:
5126 return false;
5127 }
5128}
a794bd20
NS
5129
5130/* Section anchors do not work. Initialization for flag_section_anchor
5131 probes the existence of the anchoring target hooks and prevents
5132 anchoring if they don't exist. However, we may be being used with
5133 a host-side compiler that does support anchoring, and hence see
5134 the anchor flag set (as it's not recalculated). So provide an
5135 implementation denying anchoring. */
5136
5137static bool
5138nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5139{
5140 return false;
5141}
738f2522 5142\f
1f83528e
TS
5143/* Record a symbol for mkoffload to enter into the mapping table. */
5144
5145static void
5146nvptx_record_offload_symbol (tree decl)
5147{
3e32ee19
NS
5148 switch (TREE_CODE (decl))
5149 {
5150 case VAR_DECL:
5151 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5152 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5153 break;
5154
5155 case FUNCTION_DECL:
5156 {
629b3d75 5157 tree attr = oacc_get_fn_attrib (decl);
5012919d
AM
5158 /* OpenMP offloading does not set this attribute. */
5159 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 5160
3e32ee19
NS
5161 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5162 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5163
5012919d 5164 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 5165 {
5d306e55 5166 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 5167
5d306e55 5168 gcc_assert (!TREE_PURPOSE (dims));
3e32ee19
NS
5169 fprintf (asm_out_file, ", %#x", size);
5170 }
d2d47a28 5171
3e32ee19
NS
5172 fprintf (asm_out_file, "\n");
5173 }
5174 break;
d2d47a28 5175
3e32ee19
NS
5176 default:
5177 gcc_unreachable ();
5178 }
1f83528e
TS
5179}
5180
738f2522
BS
5181/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5182 at the start of a file. */
5183
5184static void
5185nvptx_file_start (void)
5186{
5187 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5188 fputs ("\t.version\t3.1\n", asm_out_file);
53ceb8b5
CP
5189 if (TARGET_SM35)
5190 fputs ("\t.target\tsm_35\n", asm_out_file);
5191 else
5192 fputs ("\t.target\tsm_30\n", asm_out_file);
738f2522
BS
5193 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5194 fputs ("// END PREAMBLE\n", asm_out_file);
5195}
5196
a0b3b5c4
TV
5197/* Emit a declaration for a worker and vector-level buffer in .shared
5198 memory. */
15ab6f00
NS
5199
5200static void
a0b3b5c4 5201write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
15ab6f00
NS
5202{
5203 const char *name = XSTR (sym, 0);
5204
5205 write_var_marker (file, true, false, name);
5206 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5207 align, name, size);
5208}
5209
ecf6e535
BS
5210/* Write out the function declarations we've collected and declare storage
5211 for the broadcast buffer. */
738f2522
BS
5212
5213static void
5214nvptx_file_end (void)
5215{
f3dba894
TS
5216 hash_table<tree_hasher>::iterator iter;
5217 tree decl;
5218 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
00e52418 5219 nvptx_record_fndecl (decl);
738f2522 5220 fputs (func_decls.str().c_str(), asm_out_file);
d88cd9c4 5221
1ed57fb8 5222 if (oacc_bcast_size)
a0b3b5c4 5223 write_shared_buffer (asm_out_file, oacc_bcast_sym,
1ed57fb8 5224 oacc_bcast_align, oacc_bcast_size);
f3552158
NS
5225
5226 if (worker_red_size)
a0b3b5c4 5227 write_shared_buffer (asm_out_file, worker_red_sym,
15ab6f00 5228 worker_red_align, worker_red_size);
5012919d 5229
f881693c
TV
5230 if (vector_red_size)
5231 write_shared_buffer (asm_out_file, vector_red_sym,
5232 vector_red_align, vector_red_size);
5233
5012919d
AM
5234 if (need_softstack_decl)
5235 {
5236 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5237 /* 32 is the maximum number of warps in a block. Even though it's an
5238 external declaration, emit the array size explicitly; otherwise, it
5239 may fail at PTX JIT time if the definition is later in link order. */
5240 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5241 POINTER_SIZE);
5242 }
5243 if (need_unisimt_decl)
5244 {
5245 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5246 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5247 }
f3552158
NS
5248}
5249
5250/* Expander for the shuffle builtins. */
5251
5252static rtx
5253nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5254{
5255 if (ignore)
5256 return target;
5257
5258 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5259 NULL_RTX, mode, EXPAND_NORMAL);
5260 if (!REG_P (src))
5261 src = copy_to_mode_reg (mode, src);
5262
5263 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5264 NULL_RTX, SImode, EXPAND_NORMAL);
5265 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5266 NULL_RTX, SImode, EXPAND_NORMAL);
5267
5268 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5269 idx = copy_to_mode_reg (SImode, idx);
5270
59263259
NS
5271 rtx pat = nvptx_gen_shuffle (target, src, idx,
5272 (nvptx_shuffle_kind) INTVAL (op));
f3552158
NS
5273 if (pat)
5274 emit_insn (pat);
5275
5276 return target;
5277}
5278
f881693c
TV
5279const char *
5280nvptx_output_red_partition (rtx dst, rtx offset)
5281{
5282 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5283 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5284
5285 if (offset == const0_rtx)
5286 fprintf (asm_out_file, zero_offset, REGNO (dst),
5287 REGNO (cfun->machine->red_partition));
5288 else
5289 fprintf (asm_out_file, with_offset, REGNO (dst),
5290 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5291
5292 return "";
5293}
5294
5295/* Shared-memory reduction address expander. */
f3552158
NS
5296
5297static rtx
a0b3b5c4 5298nvptx_expand_shared_addr (tree exp, rtx target,
f881693c
TV
5299 machine_mode ARG_UNUSED (mode), int ignore,
5300 int vector)
f3552158
NS
5301{
5302 if (ignore)
5303 return target;
5304
5305 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
f3552158
NS
5306 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5307 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
9a863523 5308 rtx addr = worker_red_sym;
f881693c
TV
5309
5310 if (vector)
9a863523 5311 {
f881693c
TV
5312 offload_attrs oa;
5313
5314 populate_offload_attrs (&oa);
5315
5316 unsigned int psize = ROUND_UP (size + offset, align);
5317 unsigned int pnum = nvptx_mach_max_workers ();
5318 vector_red_partition = MAX (vector_red_partition, psize);
5319 vector_red_size = MAX (vector_red_size, psize * pnum);
5320 vector_red_align = MAX (vector_red_align, align);
5321
5322 if (cfun->machine->red_partition == NULL)
5323 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5324
5325 addr = gen_reg_rtx (Pmode);
5326 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
9a863523 5327 }
f881693c
TV
5328 else
5329 {
5330 worker_red_align = MAX (worker_red_align, align);
5331 worker_red_size = MAX (worker_red_size, size + offset);
f3552158 5332
f881693c
TV
5333 if (offset)
5334 {
5335 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5336 addr = gen_rtx_CONST (Pmode, addr);
5337 }
5338 }
f3552158 5339
f881693c 5340 emit_move_insn (target, addr);
f3552158
NS
5341 return target;
5342}
5343
5344/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5345 not require taking the address of any object, other than the memory
5346 cell being operated on. */
5347
5348static rtx
5349nvptx_expand_cmp_swap (tree exp, rtx target,
5350 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5351{
5352 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5353
5354 if (!target)
5355 target = gen_reg_rtx (mode);
5356
5357 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5358 NULL_RTX, Pmode, EXPAND_NORMAL);
5359 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5360 NULL_RTX, mode, EXPAND_NORMAL);
5361 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5362 NULL_RTX, mode, EXPAND_NORMAL);
5363 rtx pat;
5364
5365 mem = gen_rtx_MEM (mode, mem);
5366 if (!REG_P (cmp))
5367 cmp = copy_to_mode_reg (mode, cmp);
5368 if (!REG_P (src))
5369 src = copy_to_mode_reg (mode, src);
5370
5371 if (mode == SImode)
5372 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5373 else
5374 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5375
5376 emit_insn (pat);
5377
5378 return target;
5379}
5380
5381
/* Function codes of the NVPTX builtins.  NVPTX_BUILTIN_MAX is not a
   builtin; it is the number of codes.  */
enum nvptx_builtins
{
  /* Shuffles, 32-bit and 64-bit.  */
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  /* Shared-memory reduction buffer addresses.  */
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_VECTOR_ADDR,
  /* Compare-and-swap, 32-bit and 64-bit.  */
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};
5393
5394static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5395
5396/* Return the NVPTX builtin for CODE. */
5397
5398static tree
5399nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5400{
5401 if (code >= NVPTX_BUILTIN_MAX)
5402 return error_mark_node;
5403
5404 return nvptx_builtin_decls[code];
5405}
5406
5407/* Set up all builtin functions for this target. */
5408
5409static void
5410nvptx_init_builtins (void)
5411{
5412#define DEF(ID, NAME, T) \
5413 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5414 = add_builtin_function ("__builtin_nvptx_" NAME, \
5415 build_function_type_list T, \
5416 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5417#define ST sizetype
5418#define UINT unsigned_type_node
5419#define LLUINT long_long_unsigned_type_node
5420#define PTRVOID ptr_type_node
5421
5422 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5423 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5424 DEF (WORKER_ADDR, "worker_addr",
5425 (PTRVOID, ST, UINT, UINT, NULL_TREE));
f881693c
TV
5426 DEF (VECTOR_ADDR, "vector_addr",
5427 (PTRVOID, ST, UINT, UINT, NULL_TREE));
f3552158
NS
5428 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5429 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5430
5431#undef DEF
5432#undef ST
5433#undef UINT
5434#undef LLUINT
5435#undef PTRVOID
5436}
5437
5438/* Expand an expression EXP that calls a built-in function,
5439 with result going to TARGET if that's convenient
5440 (and in mode MODE if that's convenient).
5441 SUBTARGET may be used as the target for computing one of EXP's operands.
5442 IGNORE is nonzero if the value is to be ignored. */
5443
5444static rtx
5445nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5446 machine_mode mode, int ignore)
5447{
5448 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4d732405 5449 switch (DECL_MD_FUNCTION_CODE (fndecl))
f3552158
NS
5450 {
5451 case NVPTX_BUILTIN_SHUFFLE:
5452 case NVPTX_BUILTIN_SHUFFLELL:
5453 return nvptx_expand_shuffle (exp, target, mode, ignore);
5454
5455 case NVPTX_BUILTIN_WORKER_ADDR:
f881693c
TV
5456 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5457
5458 case NVPTX_BUILTIN_VECTOR_ADDR:
5459 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
f3552158
NS
5460
5461 case NVPTX_BUILTIN_CMP_SWAP:
5462 case NVPTX_BUILTIN_CMP_SWAPLL:
5463 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5464
5465 default: gcc_unreachable ();
5466 }
738f2522 5467}
f3552158 5468
5012919d
AM
5469/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5470
5471static int
5472nvptx_simt_vf ()
5473{
5d17a476 5474 return PTX_WARP_SIZE;
5012919d
AM
5475}
5476
9ba66bf5
JJ
5477/* Return 1 if TRAIT NAME is present in the OpenMP context's
5478 device trait set, return 0 if not present in any OpenMP context in the
5479 whole translation unit, or -1 if not present in the current OpenMP context
5480 but might be present in another OpenMP context in the same TU. */
5481
5482int
5483nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5484 const char *name)
5485{
5486 switch (trait)
5487 {
5488 case omp_device_kind:
5489 return strcmp (name, "gpu") == 0;
5490 case omp_device_arch:
5491 return strcmp (name, "nvptx") == 0;
5492 case omp_device_isa:
5493 if (strcmp (name, "sm_30") == 0)
5494 return !TARGET_SM35;
5495 if (strcmp (name, "sm_35") == 0)
5496 return TARGET_SM35;
5497 return 0;
5498 default:
5499 gcc_unreachable ();
5500 }
5501}
5502
ccc0d492
TV
5503static bool
5504nvptx_welformed_vector_length_p (int l)
5505{
5506 gcc_assert (l > 0);
5507 return l % PTX_WARP_SIZE == 0;
5508}
5509
5510static void
5511nvptx_apply_dim_limits (int dims[])
5512{
5513 /* Check that the vector_length is not too large. */
5514 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5515 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5516
5517 /* Check that the number of workers is not too large. */
5518 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5519 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5520
5521 /* Ensure that num_worker * vector_length <= cta size. */
5522 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5523 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5524 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
052aaace
TV
5525
5526 /* If we need a per-worker barrier ... . */
5527 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5528 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5529 /* Don't use more barriers than available. */
5530 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5531 PTX_NUM_PER_WORKER_BARRIERS);
ccc0d492
TV
5532}
5533
6e723923
TV
5534/* Return true if FNDECL contains calls to vector-partitionable routines. */
5535
5536static bool
5537has_vector_partitionable_routine_calls_p (tree fndecl)
5538{
5539 if (!fndecl)
5540 return false;
5541
5542 basic_block bb;
5543 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5544 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5545 gsi_next_nondebug (&i))
5546 {
5547 gimple *stmt = gsi_stmt (i);
5548 if (gimple_code (stmt) != GIMPLE_CALL)
5549 continue;
5550
5551 tree callee = gimple_call_fndecl (stmt);
5552 if (!callee)
5553 continue;
5554
5555 tree attrs = oacc_get_fn_attrib (callee);
5556 if (attrs == NULL_TREE)
5557 return false;
5558
5559 int partition_level = oacc_fn_attrib_level (attrs);
5560 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5561 if (!seq_routine_p)
5562 return true;
5563 }
5564
5565 return false;
5566}
5567
5f57c8f3
TV
5568/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5569 DIMS has changed. */
94829f87 5570
5f57c8f3 5571static void
46dedae6 5572nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
94829f87 5573{
ec6c865c
TV
5574 bool oacc_default_dims_p = false;
5575 bool oacc_min_dims_p = false;
5576 bool offload_region_p = false;
5577 bool routine_p = false;
5578 bool routine_seq_p = false;
d6f528e7 5579 int default_vector_length = -1;
ec6c865c
TV
5580
5581 if (decl == NULL_TREE)
5582 {
5583 if (fn_level == -1)
5584 oacc_default_dims_p = true;
5585 else if (fn_level == -2)
5586 oacc_min_dims_p = true;
5587 else
5588 gcc_unreachable ();
5589 }
5590 else if (fn_level == -1)
5591 offload_region_p = true;
5592 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5593 {
5594 routine_p = true;
5595 routine_seq_p = fn_level == GOMP_DIM_MAX;
5596 }
5597 else
5598 gcc_unreachable ();
94829f87 5599
4b171607
TV
5600 if (oacc_min_dims_p)
5601 {
5602 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5603 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5604 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5605
5606 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5607 return;
5608 }
5609
5610 if (routine_p)
5611 {
5612 if (!routine_seq_p)
5613 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5614
5615 return;
5616 }
5617
5618 if (oacc_default_dims_p)
5619 {
5620 /* -1 : not set
5621 0 : set at runtime, f.i. -fopenacc-dims=-
5622 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5623 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5624 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5625 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5626
5627 /* But -fopenacc-dims=- is not yet supported on trunk. */
5628 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5629 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5630 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5631 }
5632
5633 if (offload_region_p)
5634 {
5635 /* -1 : not set
5636 0 : set using variable, f.i. num_gangs (n)
5637 >= 1: set using constant, f.i. num_gangs (1). */
5638 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5639 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5640 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5641 }
5642
d6f528e7
TV
5643 if (offload_region_p)
5644 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5645 else
5646 /* oacc_default_dims_p. */
5647 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5648
d8ab4e54
TV
5649 int old_dims[GOMP_DIM_MAX];
5650 unsigned int i;
5651 for (i = 0; i < GOMP_DIM_MAX; ++i)
5652 old_dims[i] = dims[i];
5653
5654 const char *vector_reason = NULL;
6e723923
TV
5655 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5656 {
af79605e
TV
5657 default_vector_length = PTX_WARP_SIZE;
5658
6e723923
TV
5659 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5660 {
5661 vector_reason = G_("using vector_length (%d) due to call to"
5662 " vector-partitionable routine, ignoring %d");
5663 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5664 }
5665 }
5666
d8ab4e54 5667 if (dims[GOMP_DIM_VECTOR] == 0)
ccc8282b 5668 {
d8ab4e54 5669 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
d6f528e7 5670 dims[GOMP_DIM_VECTOR] = default_vector_length;
ccc8282b
NS
5671 }
5672
d8ab4e54 5673 if (dims[GOMP_DIM_VECTOR] > 0
ccc0d492 5674 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
d6f528e7 5675 dims[GOMP_DIM_VECTOR] = default_vector_length;
d8ab4e54 5676
ccc0d492 5677 nvptx_apply_dim_limits (dims);
d8ab4e54
TV
5678
5679 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5680 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5681 vector_reason != NULL
5682 ? vector_reason
5683 : G_("using vector_length (%d), ignoring %d"),
5684 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5685
5686 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5687 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5688 G_("using num_workers (%d), ignoring %d"),
5689 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
94829f87 5690
4b171607 5691 if (oacc_default_dims_p)
b6adbb9f 5692 {
2c2ff168
TV
5693 if (dims[GOMP_DIM_VECTOR] < 0)
5694 dims[GOMP_DIM_VECTOR] = default_vector_length;
b6adbb9f 5695 if (dims[GOMP_DIM_WORKER] < 0)
605219e7 5696 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f 5697 if (dims[GOMP_DIM_GANG] < 0)
605219e7 5698 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
ccc0d492 5699 nvptx_apply_dim_limits (dims);
b6adbb9f 5700 }
980af7cd
TV
5701
5702 if (offload_region_p)
5703 {
5704 for (i = 0; i < GOMP_DIM_MAX; i++)
5705 {
5706 if (!(dims[i] < 0))
5707 continue;
5708
5709 if ((used & GOMP_DIM_MASK (i)) == 0)
5710 /* Function oacc_validate_dims will apply the minimal dimension. */
5711 continue;
5712
d6f528e7
TV
5713 dims[i] = (i == GOMP_DIM_VECTOR
5714 ? default_vector_length
5715 : oacc_get_default_dim (i));
980af7cd
TV
5716 }
5717
5718 nvptx_apply_dim_limits (dims);
5719 }
5f57c8f3
TV
5720}
5721
5722/* Validate compute dimensions of an OpenACC offload or routine, fill
5723 in non-unity defaults. FN_LEVEL indicates the level at which a
5724 routine might spawn a loop. It is negative for non-routines. If
5725 DECL is null, we are validating the default dimensions. */
5726
5727static bool
46dedae6 5728nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5f57c8f3
TV
5729{
5730 int old_dims[GOMP_DIM_MAX];
5731 unsigned int i;
b6adbb9f 5732
5f57c8f3
TV
5733 for (i = 0; i < GOMP_DIM_MAX; ++i)
5734 old_dims[i] = dims[i];
5735
46dedae6 5736 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5f57c8f3 5737
21fbea5c
TV
5738 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5739 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5740 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5741
5f57c8f3
TV
5742 for (i = 0; i < GOMP_DIM_MAX; ++i)
5743 if (old_dims[i] != dims[i])
5744 return true;
5745
5746 return false;
94829f87 5747}
d88cd9c4 5748
bd751975
NS
5749/* Return maximum dimension size, or zero for unbounded. */
5750
5751static int
5752nvptx_dim_limit (int axis)
5753{
5754 switch (axis)
5755 {
bd751975 5756 case GOMP_DIM_VECTOR:
764ecad4 5757 return PTX_MAX_VECTOR_LENGTH;
bd751975
NS
5758
5759 default:
5760 break;
5761 }
5762 return 0;
5763}
5764
d88cd9c4
NS
5765/* Determine whether fork & joins are needed. */
5766
5767static bool
5768nvptx_goacc_fork_join (gcall *call, const int dims[],
5769 bool ARG_UNUSED (is_fork))
5770{
5771 tree arg = gimple_call_arg (call, 2);
5772 unsigned axis = TREE_INT_CST_LOW (arg);
5773
5774 /* We only care about worker and vector partitioning. */
5775 if (axis < GOMP_DIM_WORKER)
5776 return false;
5777
5778 /* If the size is 1, there's no partitioning. */
5779 if (dims[axis] == 1)
5780 return false;
5781
5782 return true;
5783}
5784
f3552158
NS
5785/* Generate a PTX builtin function call that returns the address in
5786 the worker reduction buffer at OFFSET. TYPE is the type of the
5787 data at that location. */
5788
5789static tree
f881693c 5790nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
f3552158 5791{
f881693c
TV
5792 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5793 if (vector)
5794 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
f3552158 5795 machine_mode mode = TYPE_MODE (type);
f881693c 5796 tree fndecl = nvptx_builtin_decl (addr_dim, true);
f3552158
NS
5797 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5798 tree align = build_int_cst (unsigned_type_node,
5799 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5800 tree call = build_call_expr (fndecl, 3, offset, size, align);
5801
5802 return fold_convert (build_pointer_type (type), call);
5803}
5804
5805/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5806 will cast the variable if necessary. */
5807
5808static void
5809nvptx_generate_vector_shuffle (location_t loc,
5810 tree dest_var, tree var, unsigned shift,
5811 gimple_seq *seq)
5812{
5813 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5814 tree_code code = NOP_EXPR;
dd3c1b14
NS
5815 tree arg_type = unsigned_type_node;
5816 tree var_type = TREE_TYPE (var);
5817 tree dest_type = var_type;
f3552158 5818
dd3c1b14
NS
5819 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5820 var_type = TREE_TYPE (var_type);
5821
5822 if (TREE_CODE (var_type) == REAL_TYPE)
f3552158 5823 code = VIEW_CONVERT_EXPR;
dd3c1b14
NS
5824
5825 if (TYPE_SIZE (var_type)
5826 == TYPE_SIZE (long_long_unsigned_type_node))
f3552158
NS
5827 {
5828 fn = NVPTX_BUILTIN_SHUFFLELL;
dd3c1b14 5829 arg_type = long_long_unsigned_type_node;
f3552158 5830 }
dd3c1b14 5831
f3552158 5832 tree call = nvptx_builtin_decl (fn, true);
dd3c1b14
NS
5833 tree bits = build_int_cst (unsigned_type_node, shift);
5834 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5835 tree expr;
5836
5837 if (var_type != dest_type)
5838 {
5839 /* Do real and imaginary parts separately. */
5840 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5841 real = fold_build1 (code, arg_type, real);
5842 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5843 real = fold_build1 (code, var_type, real);
f3552158 5844
dd3c1b14
NS
5845 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5846 imag = fold_build1 (code, arg_type, imag);
5847 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5848 imag = fold_build1 (code, var_type, imag);
5849
5850 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5851 }
5852 else
5853 {
5854 expr = fold_build1 (code, arg_type, var);
5855 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5856 expr = fold_build1 (code, dest_type, expr);
5857 }
f3552158 5858
dd3c1b14 5859 gimplify_assign (dest_var, expr, seq);
f3552158
NS
5860}
5861
33f47f42
NS
5862/* Lazily generate the global lock var decl and return its address. */
5863
5864static tree
5865nvptx_global_lock_addr ()
5866{
5867 tree v = global_lock_var;
5868
5869 if (!v)
5870 {
5871 tree name = get_identifier ("__reduction_lock");
5872 tree type = build_qualified_type (unsigned_type_node,
5873 TYPE_QUAL_VOLATILE);
5874 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5875 global_lock_var = v;
5876 DECL_ARTIFICIAL (v) = 1;
5877 DECL_EXTERNAL (v) = 1;
5878 TREE_STATIC (v) = 1;
5879 TREE_PUBLIC (v) = 1;
5880 TREE_USED (v) = 1;
5881 mark_addressable (v);
5882 mark_decl_referenced (v);
5883 }
5884
5885 return build_fold_addr_expr (v);
5886}
5887
5888/* Insert code to locklessly update *PTR with *PTR OP VAR just before
5889 GSI. We use a lockless scheme for nearly all cases, which looks
5890 like:
5891 actual = initval(OP);
5892 do {
5893 guess = actual;
5894 write = guess OP myval;
5895 actual = cmp&swap (ptr, guess, write)
5896 } while (actual bit-different-to guess);
5897 return write;
5898
5899 This relies on a cmp&swap instruction, which is available for 32-
5900 and 64-bit types. Larger types must use a locking scheme. */
f3552158
NS
5901
5902static tree
5903nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5904 tree ptr, tree var, tree_code op)
5905{
5906 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5907 tree_code code = NOP_EXPR;
33f47f42
NS
5908 tree arg_type = unsigned_type_node;
5909 tree var_type = TREE_TYPE (var);
f3552158 5910
33f47f42
NS
5911 if (TREE_CODE (var_type) == COMPLEX_TYPE
5912 || TREE_CODE (var_type) == REAL_TYPE)
f3552158 5913 code = VIEW_CONVERT_EXPR;
33f47f42
NS
5914
5915 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
f3552158 5916 {
33f47f42 5917 arg_type = long_long_unsigned_type_node;
f3552158 5918 fn = NVPTX_BUILTIN_CMP_SWAPLL;
f3552158
NS
5919 }
5920
33f47f42
NS
5921 tree swap_fn = nvptx_builtin_decl (fn, true);
5922
f3552158 5923 gimple_seq init_seq = NULL;
33f47f42
NS
5924 tree init_var = make_ssa_name (arg_type);
5925 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5926 init_expr = fold_build1 (code, arg_type, init_expr);
f3552158
NS
5927 gimplify_assign (init_var, init_expr, &init_seq);
5928 gimple *init_end = gimple_seq_last (init_seq);
5929
5930 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5931
f3552158
NS
5932 /* Split the block just after the init stmts. */
5933 basic_block pre_bb = gsi_bb (*gsi);
5934 edge pre_edge = split_block (pre_bb, init_end);
5935 basic_block loop_bb = pre_edge->dest;
5936 pre_bb = pre_edge->src;
5937 /* Reset the iterator. */
5938 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5939
33f47f42
NS
5940 tree expect_var = make_ssa_name (arg_type);
5941 tree actual_var = make_ssa_name (arg_type);
5942 tree write_var = make_ssa_name (arg_type);
5943
5944 /* Build and insert the reduction calculation. */
5945 gimple_seq red_seq = NULL;
5946 tree write_expr = fold_build1 (code, var_type, expect_var);
5947 write_expr = fold_build2 (op, var_type, write_expr, var);
5948 write_expr = fold_build1 (code, arg_type, write_expr);
5949 gimplify_assign (write_var, write_expr, &red_seq);
5950
5951 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5952
5953 /* Build & insert the cmp&swap sequence. */
5954 gimple_seq latch_seq = NULL;
5955 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5956 ptr, expect_var, write_var);
5957 gimplify_assign (actual_var, swap_expr, &latch_seq);
5958
5959 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5960 NULL_TREE, NULL_TREE);
5961 gimple_seq_add_stmt (&latch_seq, cond);
5962
5963 gimple *latch_end = gimple_seq_last (latch_seq);
5964 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
f3552158 5965
33f47f42
NS
5966 /* Split the block just after the latch stmts. */
5967 edge post_edge = split_block (loop_bb, latch_end);
f3552158
NS
5968 basic_block post_bb = post_edge->dest;
5969 loop_bb = post_edge->src;
5970 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5971
5972 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5 5973 post_edge->probability = profile_probability::even ();
f3552158 5974 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
72270bb5 5975 loop_edge->probability = profile_probability::even ();
f3552158
NS
5976 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5977 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5978
5979 gphi *phi = create_phi_node (expect_var, loop_bb);
5980 add_phi_arg (phi, init_var, pre_edge, loc);
5981 add_phi_arg (phi, actual_var, loop_edge, loc);
5982
5983 loop *loop = alloc_loop ();
5984 loop->header = loop_bb;
5985 loop->latch = loop_bb;
5986 add_loop (loop, loop_bb->loop_father);
5987
33f47f42
NS
5988 return fold_build1 (code, var_type, write_var);
5989}
5990
5991/* Insert code to lockfully update *PTR with *PTR OP VAR just before
5992 GSI. This is necessary for types larger than 64 bits, where there
5993 is no cmp&swap instruction to implement a lockless scheme. We use
5994 a lock variable in global memory.
5995
5996 while (cmp&swap (&lock_var, 0, 1))
5997 continue;
5998 T accum = *ptr;
5999 accum = accum OP var;
6000 *ptr = accum;
6001 cmp&swap (&lock_var, 1, 0);
6002 return accum;
6003
6004 A lock in global memory is necessary to force execution engine
6005 descheduling and avoid resource starvation that can occur if the
6006 lock is in .shared memory. */
6007
6008static tree
6009nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6010 tree ptr, tree var, tree_code op)
6011{
6012 tree var_type = TREE_TYPE (var);
6013 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6014 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6015 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6016
6017 /* Split the block just before the gsi. Insert a gimple nop to make
6018 this easier. */
6019 gimple *nop = gimple_build_nop ();
6020 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6021 basic_block entry_bb = gsi_bb (*gsi);
6022 edge entry_edge = split_block (entry_bb, nop);
6023 basic_block lock_bb = entry_edge->dest;
6024 /* Reset the iterator. */
6025 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6026
6027 /* Build and insert the locking sequence. */
6028 gimple_seq lock_seq = NULL;
6029 tree lock_var = make_ssa_name (unsigned_type_node);
6030 tree lock_expr = nvptx_global_lock_addr ();
6031 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6032 uns_unlocked, uns_locked);
6033 gimplify_assign (lock_var, lock_expr, &lock_seq);
6034 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6035 NULL_TREE, NULL_TREE);
6036 gimple_seq_add_stmt (&lock_seq, cond);
6037 gimple *lock_end = gimple_seq_last (lock_seq);
6038 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6039
6040 /* Split the block just after the lock sequence. */
6041 edge locked_edge = split_block (lock_bb, lock_end);
6042 basic_block update_bb = locked_edge->dest;
6043 lock_bb = locked_edge->src;
6044 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6045
6046 /* Create the lock loop ... */
6047 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5
TV
6048 locked_edge->probability = profile_probability::even ();
6049 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6050 loop_edge->probability = profile_probability::even ();
33f47f42
NS
6051 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6052 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6053
6054 /* ... and the loop structure. */
6055 loop *lock_loop = alloc_loop ();
6056 lock_loop->header = lock_bb;
6057 lock_loop->latch = lock_bb;
6058 lock_loop->nb_iterations_estimate = 1;
6059 lock_loop->any_estimate = true;
6060 add_loop (lock_loop, entry_bb->loop_father);
6061
6062 /* Build and insert the reduction calculation. */
6063 gimple_seq red_seq = NULL;
6064 tree acc_in = make_ssa_name (var_type);
6065 tree ref_in = build_simple_mem_ref (ptr);
6066 TREE_THIS_VOLATILE (ref_in) = 1;
6067 gimplify_assign (acc_in, ref_in, &red_seq);
6068
6069 tree acc_out = make_ssa_name (var_type);
6070 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6071 gimplify_assign (acc_out, update_expr, &red_seq);
6072
6073 tree ref_out = build_simple_mem_ref (ptr);
6074 TREE_THIS_VOLATILE (ref_out) = 1;
6075 gimplify_assign (ref_out, acc_out, &red_seq);
6076
6077 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6078
6079 /* Build & insert the unlock sequence. */
6080 gimple_seq unlock_seq = NULL;
6081 tree unlock_expr = nvptx_global_lock_addr ();
6082 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6083 uns_locked, uns_unlocked);
6084 gimplify_and_add (unlock_expr, &unlock_seq);
6085 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6086
6087 return acc_out;
6088}
6089
6090/* Emit a sequence to update a reduction accumlator at *PTR with the
6091 value held in VAR using operator OP. Return the updated value.
6092
6093 TODO: optimize for atomic ops and indepedent complex ops. */
6094
6095static tree
6096nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6097 tree ptr, tree var, tree_code op)
6098{
6099 tree type = TREE_TYPE (var);
6100 tree size = TYPE_SIZE (type);
6101
6102 if (size == TYPE_SIZE (unsigned_type_node)
6103 || size == TYPE_SIZE (long_long_unsigned_type_node))
6104 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6105 else
6106 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
6107}
6108
6109/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6110
6111static void
f881693c 6112nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
f3552158
NS
6113{
6114 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6115 tree lhs = gimple_call_lhs (call);
6116 tree var = gimple_call_arg (call, 2);
6117 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6118 gimple_seq seq = NULL;
6119
6120 push_gimplify_context (true);
6121
6122 if (level != GOMP_DIM_GANG)
6123 {
6124 /* Copy the receiver object. */
6125 tree ref_to_res = gimple_call_arg (call, 1);
6126
6127 if (!integer_zerop (ref_to_res))
6128 var = build_simple_mem_ref (ref_to_res);
6129 }
6130
f881693c
TV
6131 if (level == GOMP_DIM_WORKER
6132 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6133 {
6134 /* Store incoming value to worker reduction buffer. */
6135 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6136 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6137 level == GOMP_DIM_VECTOR);
f3552158
NS
6138 tree ptr = make_ssa_name (TREE_TYPE (call));
6139
6140 gimplify_assign (ptr, call, &seq);
6141 tree ref = build_simple_mem_ref (ptr);
6142 TREE_THIS_VOLATILE (ref) = 1;
6143 gimplify_assign (ref, var, &seq);
6144 }
6145
6146 if (lhs)
6147 gimplify_assign (lhs, var, &seq);
6148
6149 pop_gimplify_context (NULL);
6150 gsi_replace_with_seq (&gsi, seq, true);
6151}
6152
6153/* NVPTX implementation of GOACC_REDUCTION_INIT. */
6154
6155static void
f881693c 6156nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
f3552158
NS
6157{
6158 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6159 tree lhs = gimple_call_lhs (call);
6160 tree var = gimple_call_arg (call, 2);
6161 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6162 enum tree_code rcode
6163 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6164 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6165 TREE_TYPE (var));
6166 gimple_seq seq = NULL;
6167
6168 push_gimplify_context (true);
6169
f881693c 6170 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6171 {
6172 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
6173 tree tid = make_ssa_name (integer_type_node);
6174 tree dim_vector = gimple_call_arg (call, 3);
6175 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6176 dim_vector);
6177 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6178 NULL_TREE, NULL_TREE);
6179
6180 gimple_call_set_lhs (tid_call, tid);
6181 gimple_seq_add_stmt (&seq, tid_call);
6182 gimple_seq_add_stmt (&seq, cond_stmt);
6183
6184 /* Split the block just after the call. */
6185 edge init_edge = split_block (gsi_bb (gsi), call);
6186 basic_block init_bb = init_edge->dest;
6187 basic_block call_bb = init_edge->src;
6188
6189 /* Fixup flags from call_bb to init_bb. */
6190 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
8c38aed7 6191 init_edge->probability = profile_probability::even ();
f3552158
NS
6192
6193 /* Set the initialization stmts. */
6194 gimple_seq init_seq = NULL;
6195 tree init_var = make_ssa_name (TREE_TYPE (var));
6196 gimplify_assign (init_var, init, &init_seq);
6197 gsi = gsi_start_bb (init_bb);
6198 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6199
6200 /* Split block just after the init stmt. */
6201 gsi_prev (&gsi);
6202 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6203 basic_block dst_bb = inited_edge->dest;
6204
6205 /* Create false edge from call_bb to dst_bb. */
6206 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
8c38aed7 6207 nop_edge->probability = profile_probability::even ();
f3552158
NS
6208
6209 /* Create phi node in dst block. */
6210 gphi *phi = create_phi_node (lhs, dst_bb);
6211 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6212 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6213
6214 /* Reset dominator of dst bb. */
6215 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6216
6217 /* Reset the gsi. */
6218 gsi = gsi_for_stmt (call);
6219 }
6220 else
6221 {
6222 if (level == GOMP_DIM_GANG)
6223 {
6224 /* If there's no receiver object, propagate the incoming VAR. */
6225 tree ref_to_res = gimple_call_arg (call, 1);
6226 if (integer_zerop (ref_to_res))
6227 init = var;
6228 }
6229
d41d952c
TV
6230 if (lhs != NULL_TREE)
6231 gimplify_assign (lhs, init, &seq);
f3552158
NS
6232 }
6233
6234 pop_gimplify_context (NULL);
6235 gsi_replace_with_seq (&gsi, seq, true);
6236}
6237
6238/* NVPTX implementation of GOACC_REDUCTION_FINI. */
6239
6240static void
f881693c 6241nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
f3552158
NS
6242{
6243 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6244 tree lhs = gimple_call_lhs (call);
6245 tree ref_to_res = gimple_call_arg (call, 1);
6246 tree var = gimple_call_arg (call, 2);
6247 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6248 enum tree_code op
6249 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6250 gimple_seq seq = NULL;
6251 tree r = NULL_TREE;;
6252
6253 push_gimplify_context (true);
6254
f881693c 6255 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6256 {
6257 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6258 but that requires a method of emitting a unified jump at the
6259 gimple level. */
764ecad4 6260 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
f3552158
NS
6261 {
6262 tree other_var = make_ssa_name (TREE_TYPE (var));
6263 nvptx_generate_vector_shuffle (gimple_location (call),
6264 other_var, var, shfl, &seq);
6265
6266 r = make_ssa_name (TREE_TYPE (var));
6267 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6268 var, other_var), &seq);
6269 var = r;
6270 }
6271 }
6272 else
6273 {
6274 tree accum = NULL_TREE;
6275
f881693c 6276 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
f3552158
NS
6277 {
6278 /* Get reduction buffer address. */
6279 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6280 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6281 level == GOMP_DIM_VECTOR);
f3552158
NS
6282 tree ptr = make_ssa_name (TREE_TYPE (call));
6283
6284 gimplify_assign (ptr, call, &seq);
6285 accum = ptr;
6286 }
6287 else if (integer_zerop (ref_to_res))
6288 r = var;
6289 else
6290 accum = ref_to_res;
6291
6292 if (accum)
6293 {
33f47f42 6294 /* UPDATE the accumulator. */
f3552158
NS
6295 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6296 seq = NULL;
33f47f42
NS
6297 r = nvptx_reduction_update (gimple_location (call), &gsi,
6298 accum, var, op);
f3552158
NS
6299 }
6300 }
6301
6302 if (lhs)
6303 gimplify_assign (lhs, r, &seq);
6304 pop_gimplify_context (NULL);
6305
6306 gsi_replace_with_seq (&gsi, seq, true);
6307}
6308
6309/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6310
6311static void
f881693c 6312nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
f3552158
NS
6313{
6314 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6315 tree lhs = gimple_call_lhs (call);
6316 tree var = gimple_call_arg (call, 2);
6317 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6318 gimple_seq seq = NULL;
6319
6320 push_gimplify_context (true);
f881693c
TV
6321 if (level == GOMP_DIM_WORKER
6322 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6323 {
6324 /* Read the worker reduction buffer. */
6325 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6326 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6327 level == GOMP_DIM_VECTOR);
f3552158
NS
6328 tree ptr = make_ssa_name (TREE_TYPE (call));
6329
6330 gimplify_assign (ptr, call, &seq);
6331 var = build_simple_mem_ref (ptr);
6332 TREE_THIS_VOLATILE (var) = 1;
6333 }
6334
6335 if (level != GOMP_DIM_GANG)
6336 {
6337 /* Write to the receiver object. */
6338 tree ref_to_res = gimple_call_arg (call, 1);
6339
6340 if (!integer_zerop (ref_to_res))
6341 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6342 }
6343
6344 if (lhs)
6345 gimplify_assign (lhs, var, &seq);
6346
6347 pop_gimplify_context (NULL);
6348
6349 gsi_replace_with_seq (&gsi, seq, true);
6350}
6351
6352/* NVPTX reduction expander. */
6353
5563d5c0 6354static void
f3552158
NS
6355nvptx_goacc_reduction (gcall *call)
6356{
6357 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
f881693c
TV
6358 offload_attrs oa;
6359
6360 populate_offload_attrs (&oa);
f3552158
NS
6361
6362 switch (code)
6363 {
6364 case IFN_GOACC_REDUCTION_SETUP:
f881693c 6365 nvptx_goacc_reduction_setup (call, &oa);
f3552158
NS
6366 break;
6367
6368 case IFN_GOACC_REDUCTION_INIT:
f881693c 6369 nvptx_goacc_reduction_init (call, &oa);
f3552158
NS
6370 break;
6371
6372 case IFN_GOACC_REDUCTION_FINI:
f881693c 6373 nvptx_goacc_reduction_fini (call, &oa);
f3552158
NS
6374 break;
6375
6376 case IFN_GOACC_REDUCTION_TEARDOWN:
f881693c 6377 nvptx_goacc_reduction_teardown (call, &oa);
f3552158
NS
6378 break;
6379
6380 default:
6381 gcc_unreachable ();
6382 }
6383}
6384
e053f710
TV
6385static bool
6386nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6387 rtx x ATTRIBUTE_UNUSED)
6388{
6389 return true;
6390}
6391
b98b34b7
TV
6392static bool
6393nvptx_vector_mode_supported (machine_mode mode)
6394{
3717fbe3
TV
6395 return (mode == V2SImode
6396 || mode == V2DImode);
6397}
6398
6399/* Return the preferred mode for vectorizing scalar MODE. */
6400
6401static machine_mode
005ba29c 6402nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
6403{
6404 switch (mode)
6405 {
4e10a5a7 6406 case E_DImode:
3717fbe3 6407 return V2DImode;
4e10a5a7 6408 case E_SImode:
3717fbe3
TV
6409 return V2SImode;
6410
6411 default:
6412 return default_preferred_simd_mode (mode);
6413 }
6414}
6415
6416unsigned int
6417nvptx_data_alignment (const_tree type, unsigned int basic_align)
6418{
6419 if (TREE_CODE (type) == INTEGER_TYPE)
6420 {
6421 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6422 if (size == GET_MODE_SIZE (TImode))
6423 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6424 }
6425
6426 return basic_align;
b98b34b7
TV
6427}
6428
99e1629f
RS
6429/* Implement TARGET_MODES_TIEABLE_P. */
6430
6431static bool
6432nvptx_modes_tieable_p (machine_mode, machine_mode)
6433{
6434 return false;
6435}
6436
c43f4279
RS
6437/* Implement TARGET_HARD_REGNO_NREGS. */
6438
6439static unsigned int
6440nvptx_hard_regno_nregs (unsigned int, machine_mode)
6441{
6442 return 1;
6443}
6444
0d803030
RS
6445/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6446
6447static bool
6448nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6449{
6450 return false;
6451}
6452
43be05f5
TV
6453static GTY(()) tree nvptx_previous_fndecl;
6454
6455static void
6456nvptx_set_current_function (tree fndecl)
6457{
6458 if (!fndecl || fndecl == nvptx_previous_fndecl)
6459 return;
6460
6461 nvptx_previous_fndecl = fndecl;
f881693c 6462 vector_red_partition = 0;
0024c320 6463 oacc_bcast_partition = 0;
43be05f5
TV
6464}
6465
738f2522
BS
6466#undef TARGET_OPTION_OVERRIDE
6467#define TARGET_OPTION_OVERRIDE nvptx_option_override
6468
6469#undef TARGET_ATTRIBUTE_TABLE
6470#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6471
d81db636
SB
6472#undef TARGET_LRA_P
6473#define TARGET_LRA_P hook_bool_void_false
6474
738f2522
BS
6475#undef TARGET_LEGITIMATE_ADDRESS_P
6476#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6477
6478#undef TARGET_PROMOTE_FUNCTION_MODE
6479#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6480
6481#undef TARGET_FUNCTION_ARG
6482#define TARGET_FUNCTION_ARG nvptx_function_arg
6483#undef TARGET_FUNCTION_INCOMING_ARG
6484#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6485#undef TARGET_FUNCTION_ARG_ADVANCE
6486#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
6487#undef TARGET_FUNCTION_ARG_BOUNDARY
6488#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
6489#undef TARGET_PASS_BY_REFERENCE
6490#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6491#undef TARGET_FUNCTION_VALUE_REGNO_P
6492#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6493#undef TARGET_FUNCTION_VALUE
6494#define TARGET_FUNCTION_VALUE nvptx_function_value
6495#undef TARGET_LIBCALL_VALUE
6496#define TARGET_LIBCALL_VALUE nvptx_libcall_value
6497#undef TARGET_FUNCTION_OK_FOR_SIBCALL
6498#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
6499#undef TARGET_GET_DRAP_RTX
6500#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
6501#undef TARGET_SPLIT_COMPLEX_ARG
6502#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6503#undef TARGET_RETURN_IN_MEMORY
6504#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6505#undef TARGET_OMIT_STRUCT_RETURN_REG
6506#define TARGET_OMIT_STRUCT_RETURN_REG true
6507#undef TARGET_STRICT_ARGUMENT_NAMING
6508#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
6509#undef TARGET_CALL_ARGS
6510#define TARGET_CALL_ARGS nvptx_call_args
6511#undef TARGET_END_CALL_ARGS
6512#define TARGET_END_CALL_ARGS nvptx_end_call_args
6513
6514#undef TARGET_ASM_FILE_START
6515#define TARGET_ASM_FILE_START nvptx_file_start
6516#undef TARGET_ASM_FILE_END
6517#define TARGET_ASM_FILE_END nvptx_file_end
6518#undef TARGET_ASM_GLOBALIZE_LABEL
6519#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6520#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6521#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6522#undef TARGET_PRINT_OPERAND
6523#define TARGET_PRINT_OPERAND nvptx_print_operand
6524#undef TARGET_PRINT_OPERAND_ADDRESS
6525#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6526#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6527#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6528#undef TARGET_ASM_INTEGER
6529#define TARGET_ASM_INTEGER nvptx_assemble_integer
6530#undef TARGET_ASM_DECL_END
6531#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6532#undef TARGET_ASM_DECLARE_CONSTANT_NAME
6533#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6534#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6535#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6536#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6537#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6538
6539#undef TARGET_MACHINE_DEPENDENT_REORG
6540#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6541#undef TARGET_NO_REGISTER_ALLOCATION
6542#define TARGET_NO_REGISTER_ALLOCATION true
6543
9a863523
NS
6544#undef TARGET_ENCODE_SECTION_INFO
6545#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
6546#undef TARGET_RECORD_OFFLOAD_SYMBOL
6547#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6548
738f2522
BS
6549#undef TARGET_VECTOR_ALIGNMENT
6550#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6551
d88cd9c4
NS
6552#undef TARGET_CANNOT_COPY_INSN_P
6553#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6554
a794bd20
NS
6555#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6556#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6557
f3552158
NS
6558#undef TARGET_INIT_BUILTINS
6559#define TARGET_INIT_BUILTINS nvptx_init_builtins
6560#undef TARGET_EXPAND_BUILTIN
6561#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6562#undef TARGET_BUILTIN_DECL
6563#define TARGET_BUILTIN_DECL nvptx_builtin_decl
6564
5012919d
AM
6565#undef TARGET_SIMT_VF
6566#define TARGET_SIMT_VF nvptx_simt_vf
6567
9ba66bf5
JJ
6568#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6569#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6570
94829f87
NS
6571#undef TARGET_GOACC_VALIDATE_DIMS
6572#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6573
bd751975
NS
6574#undef TARGET_GOACC_DIM_LIMIT
6575#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6576
d88cd9c4
NS
6577#undef TARGET_GOACC_FORK_JOIN
6578#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6579
f3552158
NS
6580#undef TARGET_GOACC_REDUCTION
6581#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6582
e053f710
TV
6583#undef TARGET_CANNOT_FORCE_CONST_MEM
6584#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6585
b98b34b7
TV
6586#undef TARGET_VECTOR_MODE_SUPPORTED_P
6587#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6588
3717fbe3
TV
6589#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6590#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6591 nvptx_preferred_simd_mode
6592
99e1629f
RS
6593#undef TARGET_MODES_TIEABLE_P
6594#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6595
c43f4279
RS
6596#undef TARGET_HARD_REGNO_NREGS
6597#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6598
0d803030
RS
6599#undef TARGET_CAN_CHANGE_MODE_CLASS
6600#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6601
e335138d
TV
6602#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6603#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6604
43be05f5
TV
6605#undef TARGET_SET_CURRENT_FUNCTION
6606#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6607
738f2522
BS
6608struct gcc_target targetm = TARGET_INITIALIZER;
6609
6610#include "gt-nvptx.h"