]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
Update copyright years.
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
99dee823 2 Copyright (C) 2014-2021 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19 61#include "omp-low.h"
6e723923 62#include "omp-offload.h"
3e32ee19 63#include "gomp-constants.h"
d88cd9c4 64#include "dumpfile.h"
f3552158
NS
65#include "internal-fn.h"
66#include "gimple-iterator.h"
67#include "stringpool.h"
314e6352 68#include "attribs.h"
f2d4d839 69#include "tree-vrp.h"
f3552158
NS
70#include "tree-ssa-operands.h"
71#include "tree-ssanames.h"
72#include "gimplify.h"
73#include "tree-phinodes.h"
74#include "cfgloop.h"
75#include "fold-const.h"
50c620ca 76#include "intl.h"
738f2522 77
994c5d85 78/* This file should be included last. */
d58627a0
RS
79#include "target-def.h"
80
43c371e8 81#define WORKAROUND_PTXJIT_BUG 1
3dede32b 82#define WORKAROUND_PTXJIT_BUG_2 1
21251395 83#define WORKAROUND_PTXJIT_BUG_3 1
43c371e8 84
69b09a58
TV
85/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
87 2.x. */
88#define PTX_CTA_SIZE 1024
89
052aaace 90#define PTX_CTA_NUM_BARRIERS 16
5d17a476 91#define PTX_WARP_SIZE 32
69b09a58 92
052aaace
TV
93#define PTX_PER_CTA_BARRIER 0
94#define PTX_NUM_PER_CTA_BARRIERS 1
95#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97
764ecad4 98#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
2b9d9e39 99#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
693ad66b
TV
100#define PTX_WORKER_LENGTH 32
101#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5d17a476 102
9a863523
NS
103/* The various PTX memory areas an object might reside in. */
104enum nvptx_data_area
105{
106 DATA_AREA_GENERIC,
107 DATA_AREA_GLOBAL,
108 DATA_AREA_SHARED,
109 DATA_AREA_LOCAL,
110 DATA_AREA_CONST,
111 DATA_AREA_PARAM,
112 DATA_AREA_MAX
113};
114
115/* We record the data area in the target symbol flags. */
116#define SYMBOL_DATA_AREA(SYM) \
117 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
118 & 7)
119#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
121
738f2522
BS
122/* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124static std::stringstream func_decls;
f3dba894 125
6c907cff 126struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
127{
128 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
129 static bool equal (rtx a, rtx b) { return a == b; }
130};
131
132static GTY((cache))
133 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
134
6c907cff 135struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
136{
137 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
138 static bool equal (tree a, tree b) { return a == b; }
139};
140
141static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
142static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 143
1ed57fb8
TV
144/* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
 146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150static unsigned oacc_bcast_size;
0024c320 151static unsigned oacc_bcast_partition;
1ed57fb8
TV
152static unsigned oacc_bcast_align;
153static GTY(()) rtx oacc_bcast_sym;
d88cd9c4 154
f3552158
NS
155/* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157static unsigned worker_red_size;
158static unsigned worker_red_align;
f3552158
NS
159static GTY(()) rtx worker_red_sym;
160
f881693c
TV
161/* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164static unsigned vector_red_size;
165static unsigned vector_red_align;
166static unsigned vector_red_partition;
167static GTY(()) rtx vector_red_sym;
168
33f47f42
NS
169/* Global lock variable, needed for 128bit worker & gang reductions. */
170static GTY(()) tree global_lock_var;
171
5012919d
AM
172/* True if any function references __nvptx_stacks. */
173static bool need_softstack_decl;
174
175/* True if any function references __nvptx_uni. */
176static bool need_unisimt_decl;
177
0024c320
TV
178static int nvptx_mach_max_workers ();
179
738f2522
BS
180/* Allocate a new, cleared machine_function structure. */
181
182static struct machine_function *
183nvptx_init_machine_status (void)
184{
185 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 186 p->return_mode = VOIDmode;
738f2522
BS
187 return p;
188}
189
5012919d
AM
190/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
192
193static void
194diagnose_openacc_conflict (bool optval, const char *optname)
195{
196 if (flag_openacc && optval)
a3f9f006 197 error ("option %s is not supported together with %<-fopenacc%>", optname);
5012919d
AM
198}
199
738f2522
BS
200/* Implement TARGET_OPTION_OVERRIDE. */
201
202static void
203nvptx_option_override (void)
204{
205 init_machine_status = nvptx_init_machine_status;
ae578943 206
3d339d5e
AM
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set.x_flag_toplevel_reorder)
211 flag_toplevel_reorder = 1;
212
fa9afdc3
TV
213 debug_nonbind_markers_p = 0;
214
ae578943
NS
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
217 unless forced. */
218 if (!global_options_set.x_flag_no_common)
219 flag_no_common = 1;
220
cad136a4
TV
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size > 0)
223 sorry ("not generating patch area, nops not supported");
224
738f2522
BS
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking = 0;
1e5154e7 227
dba619f3
NS
228 if (nvptx_optimize < 0)
229 nvptx_optimize = optimize > 0;
230
f3dba894
TS
231 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
232 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 233 declared_libfuncs_htab
f3dba894 234 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 235
1ed57fb8
TV
236 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
238 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
0024c320 239 oacc_bcast_partition = 0;
f3552158 240
15ab6f00 241 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 242 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 243 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d 244
f881693c
TV
245 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248 vector_red_partition = 0;
249
5012919d
AM
250 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
253
254 if (TARGET_GOMP)
255 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
256}
257
738f2522
BS
258/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx ideosyncracies. */
260
261const char *
262nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
263{
264 switch (mode)
265 {
4e10a5a7 266 case E_BLKmode:
738f2522 267 return ".b8";
4e10a5a7 268 case E_BImode:
738f2522 269 return ".pred";
4e10a5a7 270 case E_QImode:
738f2522
BS
271 if (promote)
272 return ".u32";
273 else
274 return ".u8";
4e10a5a7 275 case E_HImode:
738f2522 276 return ".u16";
4e10a5a7 277 case E_SImode:
738f2522 278 return ".u32";
4e10a5a7 279 case E_DImode:
738f2522
BS
280 return ".u64";
281
4e10a5a7 282 case E_SFmode:
738f2522 283 return ".f32";
4e10a5a7 284 case E_DFmode:
738f2522
BS
285 return ".f64";
286
4e10a5a7 287 case E_V2SImode:
b98b34b7 288 return ".v2.u32";
4e10a5a7 289 case E_V2DImode:
3717fbe3 290 return ".v2.u64";
b98b34b7 291
738f2522
BS
292 default:
293 gcc_unreachable ();
294 }
295}
296
9a863523
NS
297/* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
7b8edc29 299
9a863523
NS
300static void
301nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 302{
9a863523
NS
303 default_encode_section_info (decl, rtl, first);
304 if (first && MEM_P (rtl))
305 {
306 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 307
9a863523
NS
308 if (TREE_CONSTANT (decl))
309 area = DATA_AREA_CONST;
310 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
311 {
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
313 {
314 area = DATA_AREA_SHARED;
315 if (DECL_INITIAL (decl))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl);
318 }
319 else
320 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
321 }
7b8edc29 322
9a863523
NS
323 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
324 }
325}
326
327/* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_seciton_info, or equivalent. */
330
331static const char *
332section_for_sym (rtx sym)
333{
334 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
338
339 return areas[area];
340}
341
342/* Similarly for a decl. */
343
344static const char *
345section_for_decl (const_tree decl)
346{
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
348}
349
b4346107
NS
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  Returns NAME itself when no replacement
   applies, so callers can detect replacement by pointer comparison.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const char *const map[][2] = {
    { "call", "__nvptx_call" },
    { "malloc", "__nvptx_malloc" },
    { "free", "__nvptx_free" },
    { "realloc", "__nvptx_realloc" }
  };

  for (unsigned ix = 0; ix < sizeof (map) / sizeof (map[0]); ix++)
    if (strcmp (name, map[ix][0]) == 0)
      return map[ix][1];

  return name;
}
370
e88b04bd
TV
/* Return NULL if NAME contains no dot.  Otherwise return a fresh copy
   of NAME (via xstrdup) with the dots replaced with dollar signs.
   The caller owns the returned string and must free it (XDELETE).  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  /* Walk the copy once, jumping from dot to dot, instead of the old
     loop that re-evaluated strlen in its condition on every
     iteration (quadratic in the name length).  */
  for (char *q = strchr (p, '.'); q != NULL; q = strchr (q + 1, '.'))
    *q = '$';
  return p;
}
386
d7479262
NS
387/* If MODE should be treated as two registers of an inner mode, return
388 that inner mode. Otherwise return VOIDmode. */
738f2522 389
d7479262
NS
390static machine_mode
391maybe_split_mode (machine_mode mode)
738f2522 392{
738f2522 393 if (COMPLEX_MODE_P (mode))
d7479262 394 return GET_MODE_INNER (mode);
738f2522 395
738f2522 396 if (mode == TImode)
d7479262
NS
397 return DImode;
398
399 return VOIDmode;
738f2522
BS
400}
401
38827811
TV
402/* Return true if mode should be treated as two registers. */
403
404static bool
405split_mode_p (machine_mode mode)
406{
407 return maybe_split_mode (mode) != VOIDmode;
408}
409
f313d112
NS
410/* Output a register, subreg, or register pair (with optional
411 enclosing braces). */
412
413static void
414output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
415 int subreg_offset = -1)
416{
417 if (inner_mode == VOIDmode)
418 {
419 if (HARD_REGISTER_NUM_P (regno))
420 fprintf (file, "%s", reg_names[regno]);
421 else
422 fprintf (file, "%%r%d", regno);
423 }
424 else if (subreg_offset >= 0)
425 {
426 output_reg (file, regno, VOIDmode);
427 fprintf (file, "$%d", subreg_offset);
428 }
429 else
430 {
431 if (subreg_offset == -1)
432 fprintf (file, "{");
433 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
434 fprintf (file, ",");
435 output_reg (file, regno, inner_mode, 0);
436 if (subreg_offset == -1)
437 fprintf (file, "}");
438 }
439}
440
d88cd9c4
NS
441/* Emit forking instructions for MASK. */
442
443static void
444nvptx_emit_forking (unsigned mask, bool is_call)
445{
446 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
447 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
448 if (mask)
449 {
450 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
451
452 /* Emit fork at all levels. This helps form SESE regions, as
453 it creates a block with a single successor before entering a
454 partitooned region. That is a good candidate for the end of
455 an SESE region. */
e91eba31 456 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
457 emit_insn (gen_nvptx_forked (op));
458 }
459}
460
461/* Emit joining instructions for MASK. */
462
463static void
464nvptx_emit_joining (unsigned mask, bool is_call)
465{
466 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
467 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
468 if (mask)
469 {
470 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
471
472 /* Emit joining for all non-call pars to ensure there's a single
473 predecessor for the block the join insn ends up in. This is
474 needed for skipping entire loops. */
e91eba31 475 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
476 emit_insn (gen_nvptx_join (op));
477 }
478}
479
738f2522 480\f
44eba92d
NS
481/* Determine whether MODE and TYPE (possibly NULL) should be passed or
482 returned in memory. Integer and floating types supported by the
483 machine are passed in registers, everything else is passed in
484 memory. Complex types are split. */
485
486static bool
487pass_in_memory (machine_mode mode, const_tree type, bool for_return)
488{
489 if (type)
490 {
491 if (AGGREGATE_TYPE_P (type))
492 return true;
493 if (TREE_CODE (type) == VECTOR_TYPE)
494 return true;
495 }
496
497 if (!for_return && COMPLEX_MODE_P (mode))
498 /* Complex types are passed as two underlying args. */
499 mode = GET_MODE_INNER (mode);
500
501 if (GET_MODE_CLASS (mode) != MODE_INT
502 && GET_MODE_CLASS (mode) != MODE_FLOAT)
503 return true;
504
505 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
506 return true;
507
508 return false;
509}
510
511/* A non-memory argument of mode MODE is being passed, determine the mode it
512 should be promoted to. This is also used for determining return
513 type promotion. */
514
515static machine_mode
516promote_arg (machine_mode mode, bool prototyped)
517{
518 if (!prototyped && mode == SFmode)
519 /* K&R float promotion for unprototyped functions. */
520 mode = DFmode;
521 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
522 mode = SImode;
523
524 return mode;
525}
526
527/* A non-memory return type of MODE is being returned. Determine the
528 mode it should be promoted to. */
529
530static machine_mode
531promote_return (machine_mode mode)
532{
533 return promote_arg (mode, true);
534}
535
e74f7152 536/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 537
e74f7152 538static rtx
6783fdb7 539nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
e74f7152 540{
6783fdb7 541 if (arg.end_marker_p () || !arg.named)
e74f7152 542 return NULL_RTX;
738f2522 543
6783fdb7 544 return gen_reg_rtx (arg.mode);
e74f7152
NS
545}
546
547/* Implement TARGET_FUNCTION_INCOMING_ARG. */
548
549static rtx
6783fdb7
RS
550nvptx_function_incoming_arg (cumulative_args_t cum_v,
551 const function_arg_info &arg)
738f2522 552{
e74f7152 553 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 554
6783fdb7 555 if (arg.end_marker_p () || !arg.named)
e74f7152 556 return NULL_RTX;
738f2522 557
e74f7152
NS
558 /* No need to deal with split modes here, the only case that can
559 happen is complex modes and those are dealt with by
560 TARGET_SPLIT_COMPLEX_ARG. */
6783fdb7 561 return gen_rtx_UNSPEC (arg.mode,
e74f7152
NS
562 gen_rtvec (1, GEN_INT (cum->count)),
563 UNSPEC_ARG_REG);
564}
565
566/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
567
568static void
6930c98c 569nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
e74f7152
NS
570{
571 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 572
e74f7152
NS
573 cum->count++;
574}
575
1988b2c2
NS
576/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
577
578 For nvptx This is only used for varadic args. The type has already
579 been promoted and/or converted to invisible reference. */
580
581static unsigned
582nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
583{
584 return GET_MODE_ALIGNMENT (mode);
585}
586
e74f7152
NS
587/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
588
589 For nvptx, we know how to handle functions declared as stdarg: by
590 passing an extra pointer to the unnamed arguments. However, the
591 Fortran frontend can produce a different situation, where a
592 function pointer is declared with no arguments, but the actual
593 function and calls to it take more arguments. In that case, we
594 want to ensure the call matches the definition of the function. */
595
596static bool
597nvptx_strict_argument_naming (cumulative_args_t cum_v)
598{
599 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 600
e74f7152
NS
601 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
602}
603
e74f7152
NS
604/* Implement TARGET_LIBCALL_VALUE. */
605
606static rtx
607nvptx_libcall_value (machine_mode mode, const_rtx)
608{
e91cacae 609 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
610 /* Pretend to return in a hard reg for early uses before pseudos can be
611 generated. */
612 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 613
e74f7152
NS
614 return gen_reg_rtx (mode);
615}
616
617/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
618 where function FUNC returns or receives a value of data type TYPE. */
619
620static rtx
44eba92d 621nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
622 bool outgoing)
623{
44eba92d
NS
624 machine_mode mode = promote_return (TYPE_MODE (type));
625
e74f7152 626 if (outgoing)
5c036f3f 627 {
e91cacae 628 gcc_assert (cfun);
44c068ae 629 cfun->machine->return_mode = mode;
5c036f3f
NS
630 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
631 }
b49e35a9
NS
632
633 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
634}
635
636/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
637
638static bool
639nvptx_function_value_regno_p (const unsigned int regno)
640{
641 return regno == NVPTX_RETURN_REGNUM;
642}
643
644/* Types with a mode other than those supported by the machine are passed by
645 reference in memory. */
646
647static bool
52090e4d 648nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
e74f7152 649{
52090e4d 650 return pass_in_memory (arg.mode, arg.type, false);
e74f7152
NS
651}
652
653/* Implement TARGET_RETURN_IN_MEMORY. */
654
655static bool
656nvptx_return_in_memory (const_tree type, const_tree)
657{
44eba92d 658 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
659}
660
661/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
662
663static machine_mode
664nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 665 int *ARG_UNUSED (punsignedp),
e74f7152
NS
666 const_tree funtype, int for_return)
667{
44eba92d 668 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
669}
670
e74f7152
NS
671/* Helper for write_arg. Emit a single PTX argument of MODE, either
672 in a prototype, or as copy in a function prologue. ARGNO is the
673 index of this argument in the PTX function. FOR_REG is negative,
674 if we're emitting the PTX prototype. It is zero if we're copying
675 to an argument register and it is greater than zero if we're
676 copying to a specific hard register. */
677
678static int
1f065954
NS
679write_arg_mode (std::stringstream &s, int for_reg, int argno,
680 machine_mode mode)
e74f7152
NS
681{
682 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
683
dc3d2aeb
NS
684 if (for_reg < 0)
685 {
686 /* Writing PTX prototype. */
687 s << (argno ? ", " : " (");
e74f7152 688 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
689 }
690 else
691 {
e74f7152 692 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
693 if (for_reg)
694 s << reg_names[for_reg];
695 else
696 s << "%ar" << argno;
697 s << ";\n";
5563d5c0
NS
698 if (argno >= 0)
699 {
700 s << "\tld.param" << ptx_type << " ";
701 if (for_reg)
702 s << reg_names[for_reg];
703 else
704 s << "%ar" << argno;
705 s << ", [%in_ar" << argno << "];\n";
706 }
dc3d2aeb
NS
707 }
708 return argno + 1;
738f2522
BS
709}
710
e74f7152 711/* Process function parameter TYPE to emit one or more PTX
1f065954 712 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
713 is true, if this is a prototyped function, rather than an old-style
714 C declaration. Returns the next argument number to use.
e74f7152 715
9c582551 716 The promotion behavior here must match the regular GCC function
e74f7152
NS
717 parameter marshalling machinery. */
718
719static int
1f065954
NS
720write_arg_type (std::stringstream &s, int for_reg, int argno,
721 tree type, bool prototyped)
e74f7152
NS
722{
723 machine_mode mode = TYPE_MODE (type);
724
725 if (mode == VOIDmode)
726 return argno;
727
44eba92d 728 if (pass_in_memory (mode, type, false))
e74f7152 729 mode = Pmode;
44eba92d
NS
730 else
731 {
732 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 733
44eba92d
NS
734 if (split)
735 {
736 /* Complex types are sent as two separate args. */
737 type = TREE_TYPE (type);
5563d5c0 738 mode = TYPE_MODE (type);
44eba92d
NS
739 prototyped = true;
740 }
e74f7152 741
44eba92d
NS
742 mode = promote_arg (mode, prototyped);
743 if (split)
1f065954 744 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 745 }
e74f7152 746
1f065954
NS
747 return write_arg_mode (s, for_reg, argno, mode);
748}
749
750/* Emit a PTX return as a prototype or function prologue declaration
751 for MODE. */
752
753static void
754write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
755{
756 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
757 const char *pfx = "\t.reg";
758 const char *sfx = ";\n";
759
760 if (for_proto)
761 pfx = "(.param", sfx = "_out) ";
762
763 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
764}
765
44eba92d 766/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 767 or function prologue declaration. Returns true if return is via an
9c582551 768 additional pointer parameter. The promotion behavior here must
1f065954 769 match the regular GCC function return mashalling. */
44eba92d 770
0beb7c71 771static bool
1f065954 772write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
773{
774 machine_mode mode = TYPE_MODE (type);
0beb7c71 775
44eba92d
NS
776 if (mode == VOIDmode)
777 return false;
778
779 bool return_in_mem = pass_in_memory (mode, type, true);
780
781 if (return_in_mem)
0beb7c71 782 {
44eba92d
NS
783 if (for_proto)
784 return return_in_mem;
785
786 /* Named return values can cause us to return a pointer as well
787 as expect an argument for the return location. This is
788 optimization-level specific, so no caller can make use of
789 this data, but more importantly for us, we must ensure it
790 doesn't change the PTX prototype. */
44c068ae 791 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 792
44eba92d
NS
793 if (mode == VOIDmode)
794 return return_in_mem;
795
44c068ae 796 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 797 retval parameter. */
44c068ae 798 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
799 }
800 else
44eba92d
NS
801 mode = promote_return (mode);
802
1f065954 803 write_return_mode (s, for_proto, mode);
0beb7c71
NS
804
805 return return_in_mem;
806}
807
738f2522
BS
808/* Look for attributes in ATTRS that would indicate we must write a function
809 as a .entry kernel rather than a .func. Return true if one is found. */
810
811static bool
812write_as_kernel (tree attrs)
813{
814 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
815 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
816 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
817 /* For OpenMP target regions, the corresponding kernel entry is emitted from
818 write_omp_entry as a separate function. */
738f2522
BS
819}
820
69823d76
NS
/* Emit a linker marker for a function decl or defn into S.  These
   "// BEGIN ..." comments are consumed by the nvptx linking tools.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ") << name << "\n";
}
833
834/* Emit a linker marker for a variable decl or defn. */
835
836static void
837write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
838{
839 fprintf (file, "\n// BEGIN%s VAR %s: ",
840 globalize ? " GLOBAL" : "",
841 is_defn ? "DEF" : "DECL");
842 assemble_name_raw (file, name);
843 fputs ("\n", file);
844}
845
db52bcfe 846/* Helper function for write_fn_proto. */
738f2522 847
db52bcfe
TV
848static void
849write_fn_proto_1 (std::stringstream &s, bool is_defn,
850 const char *name, const_tree decl)
738f2522 851{
69823d76 852 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
853
854 /* PTX declaration. */
738f2522
BS
855 if (DECL_EXTERNAL (decl))
856 s << ".extern ";
857 else if (TREE_PUBLIC (decl))
0766660b 858 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 859 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 860
b699adcc
NS
861 tree fntype = TREE_TYPE (decl);
862 tree result_type = TREE_TYPE (fntype);
738f2522 863
ac3d02e2
NS
864 /* atomic_compare_exchange_$n builtins have an exceptional calling
865 convention. */
866 int not_atomic_weak_arg = -1;
867 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
868 switch (DECL_FUNCTION_CODE (decl))
869 {
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
871 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
872 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
873 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
874 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
875 /* These atomics skip the 'weak' parm in an actual library
876 call. We must skip it in the prototype too. */
877 not_atomic_weak_arg = 3;
878 break;
879
880 default:
881 break;
882 }
883
738f2522 884 /* Declare the result. */
1f065954 885 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 886
b699adcc
NS
887 s << name;
888
dc3d2aeb 889 int argno = 0;
b699adcc
NS
890
891 /* Emit argument list. */
892 if (return_in_mem)
1f065954 893 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 894
b699adcc
NS
895 /* We get:
896 NULL in TYPE_ARG_TYPES, for old-style functions
897 NULL in DECL_ARGUMENTS, for builtin functions without another
898 declaration.
899 So we have to pick the best one we have. */
900 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
901 bool prototyped = true;
902 if (!args)
903 {
904 args = DECL_ARGUMENTS (decl);
905 prototyped = false;
906 }
738f2522 907
ac3d02e2 908 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 909 {
dc3d2aeb 910 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
911
912 if (not_atomic_weak_arg)
913 argno = write_arg_type (s, -1, argno, type, prototyped);
914 else
505590b7 915 gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
738f2522 916 }
738f2522 917
b699adcc 918 if (stdarg_p (fntype))
1f065954 919 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 920
b699adcc 921 if (DECL_STATIC_CHAIN (decl))
1f065954 922 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 923
dc3d2aeb 924 if (!argno && strcmp (name, "main") == 0)
b699adcc 925 {
1f065954
NS
926 argno = write_arg_type (s, -1, argno, integer_type_node, true);
927 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
928 }
929
dc3d2aeb 930 if (argno)
b699adcc
NS
931 s << ")";
932
933 s << (is_defn ? "\n" : ";\n");
db52bcfe 934}
b699adcc 935
db52bcfe
TV
936/* Write a .func or .kernel declaration or definition along with
937 a helper comment for use by ld. S is the stream to write to, DECL
938 the decl for the function with name NAME. For definitions, emit
939 a declaration too. */
940
941static void
942write_fn_proto (std::stringstream &s, bool is_defn,
943 const char *name, const_tree decl)
944{
e88b04bd
TV
945 const char *replacement = nvptx_name_replacement (name);
946 char *replaced_dots = NULL;
947 if (replacement != name)
948 name = replacement;
949 else
950 {
951 replaced_dots = nvptx_replace_dot (name);
952 if (replaced_dots)
953 name = replaced_dots;
954 }
db52bcfe
TV
955 if (name[0] == '*')
956 name++;
957
958 if (is_defn)
959 /* Emit a declaration. The PTX assembler gets upset without it. */
960 write_fn_proto_1 (s, false, name, decl);
961
962 write_fn_proto_1 (s, is_defn, name, decl);
e88b04bd
TV
963
964 if (replaced_dots)
965 XDELETE (replaced_dots);
738f2522
BS
966}
967
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.

   S is the output stream, NAME the callee's name (NULL for an
   indirect call), RESULT the value-return rtx or NULL_RTX, and PAT
   the call's PARALLEL pattern whose trailing USEs describe the
   argument registers.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      /* Indirect call: emit an anonymous .callprototype.  */
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      /* Same name laundering as write_fn_proto: fixed replacements
	 first, then dot rewriting (PTX identifiers cannot contain
	 dots).  */
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  /* Element 0 of PAT is the call itself; elements 1.. are USEs of the
     argument registers.  */
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  /* write_arg_mode opens the argument list; close it only if at least
     one argument was written.  */
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
1020
/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  /* find_slot with INSERT creates an empty slot on first sight; a NULL
     slot therefore means DECL has not been declared yet.  */
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}
1036
/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  RETVAL and PAT describe the call's return
   value and PARALLEL pattern, from which the prototype is derived.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  /* Only emit the declaration the first time we see this callee.  */
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}
1053
/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      /* Unprototyped: defer to end of compilation.  */
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}
738f2522 1070
00e52418
NS
1071/* SYM is a SYMBOL_REF. If it refers to an external function, record
1072 it as needed. */
1073
1074static void
1075nvptx_maybe_record_fnsym (rtx sym)
1076{
1077 tree decl = SYMBOL_REF_DECL (sym);
1078
1079 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1080 nvptx_record_needed_fndecl (decl);
738f2522
BS
1081}
1082
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  ALIGN is the required alignment of the array in bytes.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  /* When SIZE is zero the "mov ... 0" format consumes only the first
     two arguments; the surplus third argument is harmless.  */
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
1100
/* Emit soft stack frame setup sequence.  ALIGNMENT is the frame's
   required alignment in bytes, SIZE the frame size.  Registers
   %stack, %frame, and the soft-stack slot/prev registers are declared
   and initialized from the per-warp __nvptx_stacks[] array.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  /* Index __nvptx_stacks by tid.y; the multiply scales by pointer
     size (bits / 8 bytes per slot).  */
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Only publish the new stack pointer back to the shared slot for
     non-leaf functions; a leaf never lets callees see it.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
1157
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  NAME is "x" or "y";
   for the "x" axis, additionally initialize the vector reduction
   buffer pointer when the function uses one.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  /* Predicate is true whenever this thread is NOT lane 0 on the axis.  */
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      /* red_partition = __vector_red + tid.y * vector_red_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}
1187
/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  The broadcast pointer (bcast_partition) and the barrier
   id (sync_bar) are both derived from tid.y (the worker index).  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      /* bcast_partition = __oacc_bcast
			   + (tid.y + 1) * oacc_bcast_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    /* Barrier 0 is reserved; each vector uses barrier tid.y + 1.  */
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}
1221
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  Also materializes
   unisimt_location: a pointer to this warp's slot in __nvptx_uni[].  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* Each warp (indexed by tid.y) owns one 4-byte slot of __nvptx_uni.  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
1254
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.

   The kernel body is a precomputed PTX template string containing an
   embedded NUL at the point where ORIG must be interpolated; the final
   fprintf prints the template in two halves around ORIG.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* Declare gomp_nvptx_main only once per translation unit.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.
     NOTE(review): this uses entry64's prefix length for both variants;
     presumably the 32-bit template's trailing space in "mad.lo.u32 "
     keeps the prefixes the same length -- verify if editing the
     template.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
1320
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  FILE is the output stream, NAME the function's
   assembler name, DECL its FUNCTION_DECL.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* An OpenMP target entrypoint gets a wrapper .entry kernel; the real
     body is emitted under the mangled "$impl" name instead.  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    /* Aggregate returns arrive via a hidden pointer argument 0.  */
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      /* Size and align the per-lane SIMT backup storage; non-leaf or
	 alloca-using functions get the full configured size.  */
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates. */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
1458
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* Nothing to switch in a leaf without a predicate register.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      /* Publish the new state to __nvptx_uni[tid.y] for callees.  */
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* Entering: master = laneid; leaving: master = 0 (uniform).  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
1486
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* PTR = top of %simtstack_ar minus the (rounded-up) allocation.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Mask down for alignment when the size is dynamic or the
	 alignment exceeds the default DImode alignment.  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the old stack pointer just below PTR and point the
	     stack pointer below that save slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Reload the saved stack pointer from the slot below PTR.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
1538
/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  Returns the
   (empty) output template expected by the insn printer.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}
1549
/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.
   Returns the (empty) output template expected by the insn printer.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}
1560
/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  A no-op for leaf functions or
   when soft stacks are not in use.  Returns an empty output template.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location (the %retval_out parameter slot).  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  /* VOIDmode means the function returns nothing.  */
  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
1592
738f2522
BS
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  /* Close the function body opened by nvptx_declare_function_name.  */
  fputs ("}\n", file);
}
1600\f
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't: PTX provides no tail-call mechanism.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}
1609
/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any,
   except with soft stacks when dynamic realignment is needed, where
   the argument pointer serves as the DRAP.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}
1619
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument ARG to the next call; FNTYPE is the callee's type, if
   known.  The first invocation for a call initializes the per-call
   state; nvptx_end_call_args resets it.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  /* Varargs calls pass an extra trailing pointer argument.  */
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  /* Only count real register arguments; pc_rtx is the end marker.  */
  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}
1647
/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   per-call argument information recorded by nvptx_call_args.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
1657
ecf6e535
BS
1658/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1659 track of whether calls involving static chains or varargs were seen
1660 in the current function.
1661 For libcalls, maintain a hash table of decls we have seen, and
1662 record a function decl for later when encountering a new one. */
738f2522
BS
1663
1664void
1665nvptx_expand_call (rtx retval, rtx address)
1666{
738f2522 1667 rtx callee = XEXP (address, 0);
f324806d 1668 rtx varargs = NULL_RTX;
d88cd9c4 1669 unsigned parallel = 0;
738f2522 1670
738f2522
BS
1671 if (!call_insn_operand (callee, Pmode))
1672 {
1673 callee = force_reg (Pmode, callee);
1674 address = change_address (address, QImode, callee);
1675 }
1676
1677 if (GET_CODE (callee) == SYMBOL_REF)
1678 {
1679 tree decl = SYMBOL_REF_DECL (callee);
1680 if (decl != NULL_TREE)
1681 {
738f2522 1682 if (DECL_STATIC_CHAIN (decl))
44c068ae 1683 cfun->machine->has_chain = true;
00e52418 1684
629b3d75 1685 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1686 if (attr)
1687 {
1688 tree dims = TREE_VALUE (attr);
1689
1690 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1691 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1692 {
1693 if (TREE_PURPOSE (dims)
1694 && !integer_zerop (TREE_PURPOSE (dims)))
1695 break;
1696 /* Not on this axis. */
1697 parallel ^= GOMP_DIM_MASK (ix);
1698 dims = TREE_CHAIN (dims);
1699 }
1700 }
738f2522
BS
1701 }
1702 }
c38f0d8c 1703
44c068ae
NS
1704 unsigned nargs = cfun->machine->num_args;
1705 if (cfun->machine->is_varadic)
738f2522 1706 {
f324806d 1707 varargs = gen_reg_rtx (Pmode);
863af9a4 1708 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1709 }
1710
44c068ae
NS
1711 rtvec vec = rtvec_alloc (nargs + 1);
1712 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1713 int vec_pos = 0;
44c068ae
NS
1714
1715 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1716 rtx tmp_retval = retval;
44c068ae 1717 if (retval)
738f2522
BS
1718 {
1719 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1720 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1721 call = gen_rtx_SET (tmp_retval, call);
738f2522 1722 }
44c068ae 1723 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1724
1725 /* Construct the call insn, including a USE for each argument pseudo
1726 register. These will be used when printing the insn. */
1727 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1728 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1729
1730 if (varargs)
cf08c344 1731 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1732
1733 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1734
d88cd9c4 1735 nvptx_emit_forking (parallel, true);
738f2522 1736 emit_call_insn (pat);
d88cd9c4
NS
1737 nvptx_emit_joining (parallel, true);
1738
738f2522
BS
1739 if (tmp_retval != retval)
1740 emit_move_insn (retval, tmp_retval);
1741}
44eba92d 1742
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  The comparison result is materialized in a BImode predicate
   pseudo, and the returned test checks that predicate against zero.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}
1755
/* Expand the oacc fork primitive into ptx-required unspecs.  MODE is
   the GOMP_DIM_* axis being forked.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}
1763
/* Expand the oacc join primitive into ptx-required unspecs.  MODE is
   the GOMP_DIM_* axis being joined.  */

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
1769
1770/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1771 objects. */
1772
1773static rtx
1774nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1775{
1776 rtx res;
1777
1778 switch (GET_MODE (src))
1779 {
4e10a5a7 1780 case E_DImode:
d88cd9c4
NS
1781 res = gen_unpackdisi2 (dst0, dst1, src);
1782 break;
4e10a5a7 1783 case E_DFmode:
d88cd9c4
NS
1784 res = gen_unpackdfsi2 (dst0, dst1, src);
1785 break;
1786 default: gcc_unreachable ();
1787 }
1788 return res;
1789}
1790
1791/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1792 object. */
1793
1794static rtx
1795nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1796{
1797 rtx res;
1798
1799 switch (GET_MODE (dst))
1800 {
4e10a5a7 1801 case E_DImode:
d88cd9c4
NS
1802 res = gen_packsidi2 (dst, src0, src1);
1803 break;
4e10a5a7 1804 case E_DFmode:
d88cd9c4
NS
1805 res = gen_packsidf2 (dst, src0, src1);
1806 break;
1807 default: gcc_unreachable ();
1808 }
1809 return res;
1810}
1811
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  DST receives SRC shuffled by
   lane index IDX according to KIND.  Modes wider than SImode are
   handled by splitting into 32-bit halves, shuffling each, and
   recombining; narrow and BImode values go through an SImode
   temporary.  Returns a single insn pattern or an insn sequence.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	/* Split into two SImode halves, shuffle each, repack.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2SImode:
      {
	/* Shuffle each SImode element via subregs at offsets 0 and 4.  */
	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);
	start_sequence ();
	emit_insn (gen_movsi (tmp0, src0));
	emit_insn (gen_movsi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movsi (dst0, tmp0));
	emit_insn (gen_movsi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2DImode:
      {
	/* Shuffle each DImode element via subregs at offsets 0 and 8;
	   the recursive calls handle the 64-bit split themselves.  */
	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
	rtx tmp0 = gen_reg_rtx (DImode);
	rtx tmp1 = gen_reg_rtx (DImode);
	start_sequence ();
	emit_insn (gen_movdi (tmp0, src0));
	emit_insn (gen_movdi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movdi (dst0, tmp0));
	emit_insn (gen_movdi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	/* Materialize the predicate as 0/1 in SImode, shuffle, and
	   convert back with a != 0 test.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	/* Zero-extend to SImode, shuffle, truncate back.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
1913
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  Implemented as an .idx
   shuffle that reads lane 0 into every lane.  */

static rtx
nvptx_gen_warp_bcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
1922
/* Structure used when generating a worker-level spill or fill,
   tracking the position within the shared broadcast buffer.  */

struct broadcast_data_t
{
  rtx base; /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Read the register, store to buffer.  */
    PM_write = 1 << 1,		/* Load from buffer, write the register.  */
    PM_loop_begin = 1 << 2,	/* Setting up a copy loop.  */
    PM_loop_end = 1 << 3,	/* Tearing down a copy loop.  */

    PM_read_write = PM_read | PM_write
  };
1943
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   DATA tracks the buffer position; its offset is advanced by the
   bytes consumed.  VECTOR is threaded through the recursive call
   unchanged.  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* Booleans round-trip through an SImode 0/1 temporary, which
	   is what actually hits the buffer via the recursive call.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at
	       base + offset, aligning the offset for this mode.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill: store REG.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill: load REG.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
738f2522
BS
2015\f
2016/* Returns true if X is a valid address for use in a memory reference. */
2017
2018static bool
2019nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2020{
2021 enum rtx_code code = GET_CODE (x);
2022
2023 switch (code)
2024 {
2025 case REG:
2026 return true;
2027
2028 case PLUS:
2029 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2030 return true;
2031 return false;
2032
2033 case CONST:
2034 case SYMBOL_REF:
2035 case LABEL_REF:
2036 return true;
2037
2038 default:
2039 return false;
2040 }
2041}
738f2522 2042\f
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
2058
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  Resets the fragment accumulator and counts
   down init_frag.remaining.  */

static void
output_init_frag (rtx sym)
{
  /* First fragment opens the initializer list; later ones are
     comma-separated.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      /* Non-function symbols are wrapped in generic() — function
	 symbols are emitted bare.  */
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  /* Emit the numeric value (or addend) unless it is a zero addend on
     a symbolic reference.  */
  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
2090
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  SIZE may exceed the fragment
   size; the loop shifts VAL down as successive fragments fill,
   sign-propagating once the shift would exhaust the host word.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Remember the sign bit so we can extend correctly once VAL has
     been fully shifted out below.  */
  bool negative_p
    = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));

  /* Avoid undefined behaviour.  */
  if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
    val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;

  /* PART is the number of bytes consumed in the previous iteration
     (0 on entry, so the first shift is a no-op).  */
  for (unsigned part = 0; size; size -= part)
    {
      if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
	/* Avoid undefined behaviour.  */
	val = negative_p ? -1 : 0;
      else
	val >>= (part * BITS_PER_UNIT);
      /* Consume at most what is left of the current fragment.  */
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
2123
/* Target hook for assembling integer object X of size SIZE.  Returns
   true when handled here; false defers to the generic machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* (const (plus (symbol_ref) (const_int))): peel off the addend
	 and fall through to emit the symbol with VAL as addend.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      gcc_fallthrough (); /* FALLTHROUGH */

    case SYMBOL_REF:
      /* Pointers must land exactly on a fragment boundary.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
2163
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  A trailing skip that would only emit the implicit
   zero tail of the initializer is deliberately dropped.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      /* Whole zero fragments first ...  */
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      /* ... then any partial-fragment remainder.  */
      if (size)
	nvptx_assemble_value (0, size);
    }
}
2193
2194/* Output a string STR with length SIZE. As in nvptx_output_skip we
2195 ignore the FILE arg. */
2196
2197void
2198nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2199{
2200 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2201 nvptx_assemble_value (str[i], 1);
2202}
2203
bf398920
TV
2204/* Return true if TYPE is a record type where the last field is an array without
2205 given dimension. */
2206
2207static bool
2208flexible_array_member_type_p (const_tree type)
2209{
2210 if (TREE_CODE (type) != RECORD_TYPE)
2211 return false;
2212
2213 const_tree last_field = NULL_TREE;
2214 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2215 last_field = f;
2216
2217 if (!last_field)
2218 return false;
2219
2220 const_tree last_field_type = TREE_TYPE (last_field);
2221 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2222 return false;
2223
2224 return (! TYPE_DOMAIN (last_field_type)
2225 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2226}
2227
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  UNDEFINED is
   true for an extern declaration, where a flexible trailing array is
   emitted as an unsized array.  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  /* Unsized top-level array type => emit "[]".  */
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  /* Strip array dimensions down to the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
2292
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  Flushes any partial
   fragment and closes the initializer (or terminates a bare decl).  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}
2304
/* Output an uninitialized common or file-scope variable.  NAME is the
   symbol, DECL its tree node, SIZE in bytes and ALIGN in bits.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}
2321
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  The caller continues with the initializer
   and nvptx_assemble_decl_end.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  /* Constants always live in the .const data area.  */
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}
2338
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  Emits the linkage marker and
   the decl header; the initializer follows via the assemble hooks.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* Static => no marker, weak => .weak, other public => .visible.  */
  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
2355
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Linkage
   markers (.visible/.weak) are emitted with the declarations
   themselves, so there is no separate globalize directive.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2362
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  /* Pass undefined=true so a flexible trailing array is emitted as an
     unsized array.  */
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
2389
/* Output a pattern for a move instruction.  Returns the asm template
   for moving SRC into DST, choosing between mov, mov.b (bit-cast),
   cvt (conversion), cvta (address-space conversion), and the split
   two-insn form for 128-bit values.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* NOTE(review): for a non-SUBREG src the fallback here is dst_mode
     (not src_mode), which makes the src_inner == dst_inner fast path
     fire for plain same-mode moves — confirm intentional.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Symbolic sources in a non-generic data area need a cvta.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (src_mode) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      /* Same size, different mode: pure bit copy.  */
      return "%.\tmov.b%T0\t%0, %1;";
    }

  if (GET_MODE_BITSIZE (src_inner) == 128
      && GET_MODE_BITSIZE (src_mode) == 64)
    return "%.\tmov.b%T0\t%0, %1;";

  /* Different sizes: emit a conversion.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2442
static void nvptx_print_operand (FILE *, rtx, int);

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  Returns the template
   that finishes the call block (loading the result, if any).  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  /* An indirect call (callee in a register) needs a .callprototype.  */
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    /* Declare the incoming return-value param.  */
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      /* Emit a local .callprototype label for the indirect call.  */
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Declare and fill an outgoing .param for each argument.  */
  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      /* Apply any name replacement, else sanitize '.' in the name.  */
      char *replaced_dots = NULL;
      const char *name = get_fnname_from_decl (decl);
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      assemble_name (asm_out_file, name);
      if (replaced_dots)
	XDELETE (replaced_dots);
    }
  else
    output_address (VOIDmode, callee);

  /* Emit the argument list; OPEN holds the "(" until the first
     argument is printed.  */
  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* No return functions confuse the PTX JIT, as it doesn't realize
	 the flow control barrier they imply.  It can seg fault if it
	 encounters what looks like an unexitable loop.  Emit a trailing
	 trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      /* Cached template to load the result and close the block.  */
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
2572
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only the predicate
   marker '.' and the rounding-mode marker '#' are punctuation codes
   for this target.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2580
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.
   Emits "base+offset" for PLUS addresses so negative offsets come out
   as "x+-8" rather than "x-8".  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      /* Anything else prints as a plain operand; a bare MEM must not
	 reach here.  */
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}
2609
/* Write assembly language output for the address ADDR to FILE.
   Thin wrapper forwarding to nvptx_print_address_operand.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
2617
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
        unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   H -- print the high half of a register holding a split mode
   j -- print "@" and then the operand (predicated-true prefix)
   J -- print "@!" and then the operand (predicated-false prefix)
   L -- print the low half of a register holding a split mode
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* Emit the current instruction predicate, if any.  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      /* Round-to-nearest-even suffix.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'A':
      x = XEXP (x, 0);
      gcc_fallthrough (); /* FALLTHROUGH.  */

    case 'D':
      /* Strip CONST/PLUS wrappers to reach a possible SYMBOL_REF and
	 print its data area.  */
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  /* Pick the mode the SUBREG effectively accesses.  */
	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
	  if (VECTOR_MODE_P (inner_mode)
	      && (GET_MODE_SIZE (mode)
		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	    mode = GET_MODE_INNER (inner_mode);
	  else if (split_mode_p (inner_mode))
	    mode = maybe_split_mode (inner_mode);
	  else
	    mode = inner_mode;
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'H':
    case 'L':
      {
	/* High ('H') or low ('L') half of a split-mode register.  */
	rtx inner_x = SUBREG_REG (x);
	machine_mode inner_mode = GET_MODE (inner_x);
	machine_mode split = maybe_split_mode (inner_mode);

	output_reg (file, REGNO (inner_x), split,
		    (code == 'H'
		     ? GET_MODE_SIZE (inner_mode) / 2
		     : 0));
      }
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* Comparison suffix: operator first, then the type suffix of
	 the compared mode.  */
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (VECTOR_MODE_P (inner_mode)
		&& (GET_MODE_SIZE (mode)
		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
	      {
		/* Element of a vector register: print as reg.x/reg.y.  */
		output_reg (file, REGNO (inner_x), VOIDmode);
		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
	      }
	    else if (split_mode_p (inner_mode)
		     && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* Floating-point constants print as hex bit patterns:
	     0f... for SFmode, 0d... for double.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	case CONST_VECTOR:
	  {
	    unsigned n = CONST_VECTOR_NUNITS (x);
	    fprintf (file, "{ ");
	    for (unsigned i = 0; i < n; ++i)
	      {
		if (i != 0)
		  fprintf (file, ", ");

		rtx elem = CONST_VECTOR_ELT (x, i);
		output_addr_const (file, elem);
	      }
	    fprintf (file, " }");
	  }
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
2870\f
/* Record replacement regs used to deal with subreg operands.  One
   instance exists per mode; slots are reused insn-by-insn by
   resetting n_in_use while n_allocated persists.  */
struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS]; /* Pool of replacement pseudos.  */
  machine_mode mode;		       /* Mode of every pseudo in the pool.  */
  int n_allocated;		       /* Pseudos created so far.  */
  int n_in_use;			       /* Pseudos handed out for this insn.  */
};
2879
2880/* Allocate or reuse a replacement in R and return the rtx. */
2881
2882static rtx
2883get_replacement (struct reg_replace *r)
2884{
2885 if (r->n_allocated == r->n_in_use)
2886 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2887 return r->replacement[r->n_in_use++];
2888}
2889
 2890/* Clean up subreg operands. In ptx assembly, everything is typed, and
 2891 the presence of subregs would break the rules for most instructions.
 2892 Replace them with a suitable new register of the right size, plus
 2893 conversion copyin/copyout instructions. */
 2894
 2895static void
517665b3 2896nvptx_reorg_subreg (void)
738f2522
BS
 2897{
 2898 struct reg_replace qiregs, hiregs, siregs, diregs;
 2899 rtx_insn *insn, *next;
 2900
738f2522
BS
 /* One replacement pool per scalar integer mode we may need.  */
 2901 qiregs.n_allocated = 0;
 2902 hiregs.n_allocated = 0;
 2903 siregs.n_allocated = 0;
 2904 diregs.n_allocated = 0;
 2905 qiregs.mode = QImode;
 2906 hiregs.mode = HImode;
 2907 siregs.mode = SImode;
 2908 diregs.mode = DImode;
 2909
 2910 for (insn = get_insns (); insn; insn = next)
 2911 {
 2912 next = NEXT_INSN (insn);
 /* Skip insns that are not real instructions to recognize:
 debug insns, asms, USEs and CLOBBERs.  */
 2913 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2914 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
 2915 || GET_CODE (PATTERN (insn)) == USE
 2916 || GET_CODE (PATTERN (insn)) == CLOBBER)
 2917 continue;
f324806d 2918
738f2522
BS
 /* Reuse the replacement pools afresh for each insn.  */
 2919 qiregs.n_in_use = 0;
 2920 hiregs.n_in_use = 0;
 2921 siregs.n_in_use = 0;
 2922 diregs.n_in_use = 0;
 2923 extract_insn (insn);
 2924 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2925
738f2522
BS
 2926 for (int i = 0; i < recog_data.n_operands; i++)
 2927 {
 2928 rtx op = recog_data.operand[i];
 2929 if (GET_CODE (op) != SUBREG)
 2930 continue;
 2931
 2932 rtx inner = SUBREG_REG (op);
 2933
 2934 machine_mode outer_mode = GET_MODE (op);
 2935 machine_mode inner_mode = GET_MODE (inner);
 2936 gcc_assert (s_ok);
 /* Narrowing subregs are tolerated when the insn allows subregs
 at all; only the remaining cases must be rewritten.  */
 2937 if (s_ok
 2938 && (GET_MODE_PRECISION (inner_mode)
 2939 >= GET_MODE_PRECISION (outer_mode)))
 2940 continue;
 2941 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
 /* Pick the replacement pool matching the outer mode.  */
 2942 struct reg_replace *r = (outer_mode == QImode ? &qiregs
 2943 : outer_mode == HImode ? &hiregs
 2944 : outer_mode == SImode ? &siregs
 2945 : &diregs);
 2946 rtx new_reg = get_replacement (r);
 2947
 /* For input operands, copy the subreg's value into the
 replacement register just before the insn.  */
 2948 if (recog_data.operand_type[i] != OP_OUT)
 2949 {
 2950 enum rtx_code code;
 2951 if (GET_MODE_PRECISION (inner_mode)
 2952 < GET_MODE_PRECISION (outer_mode))
 2953 code = ZERO_EXTEND;
 2954 else
 2955 code = TRUNCATE;
 2956
f7df4a84 2957 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
 2958 gen_rtx_fmt_e (code, outer_mode, inner));
 2959 emit_insn_before (pat, insn);
 2960 }
 2961
 /* For output operands, copy the replacement back into the
 original register just after the insn.  */
 2962 if (recog_data.operand_type[i] != OP_IN)
 2963 {
 2964 enum rtx_code code;
 2965 if (GET_MODE_PRECISION (inner_mode)
 2966 < GET_MODE_PRECISION (outer_mode))
 2967 code = TRUNCATE;
 2968 else
 2969 code = ZERO_EXTEND;
 2970
f7df4a84 2971 rtx pat = gen_rtx_SET (inner,
738f2522
BS
 2972 gen_rtx_fmt_e (code, inner_mode, new_reg));
 2973 emit_insn_after (pat, insn);
 2974 }
 /* Finally substitute the replacement for the subreg operand.  */
 2975 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
 2976 }
 2977 }
517665b3 2978}
738f2522 2979
5012919d
AM
2980/* Return a SImode "master lane index" register for uniform-simt, allocating on
2981 first use. */
2982
2983static rtx
2984nvptx_get_unisimt_master ()
2985{
2986 rtx &master = cfun->machine->unisimt_master;
2987 return master ? master : master = gen_reg_rtx (SImode);
2988}
2989
2990/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2991
2992static rtx
2993nvptx_get_unisimt_predicate ()
2994{
2995 rtx &pred = cfun->machine->unisimt_predicate;
2996 return pred ? pred : pred = gen_reg_rtx (BImode);
2997}
2998
 2999/* Return true if given call insn references one of the functions provided by
 3000 the CUDA runtime: malloc, free, vprintf. */
 3001
 3002static bool
 3003nvptx_call_insn_is_syscall_p (rtx_insn *insn)
 3004{
 /* Calls are wrapped in a PARALLEL; the CALL rtx is the first element,
 possibly inside a SET when the call has a return value.  */
 3005 rtx pat = PATTERN (insn);
 3006 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
 3007 pat = XVECEXP (pat, 0, 0);
 3008 if (GET_CODE (pat) == SET)
 3009 pat = SET_SRC (pat);
 3010 gcc_checking_assert (GET_CODE (pat) == CALL
 3011 && GET_CODE (XEXP (pat, 0)) == MEM);
 /* Only direct calls to a SYMBOL_REF can be syscalls.  */
 3012 rtx addr = XEXP (XEXP (pat, 0), 0);
 3013 if (GET_CODE (addr) != SYMBOL_REF)
 3014 return false;
 3015 const char *name = XSTR (addr, 0);
 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
 3017 references with forced assembler name refer to PTX syscalls. For vprintf,
 3018 accept both normal and forced-assembler-name references. */
 3019 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
 3020 || !strcmp (name, "*malloc")
 3021 || !strcmp (name, "*free"));
 3022}
3023
3024/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3025 propagate its value from lane MASTER to current lane. */
3026
3027static void
3028nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3029{
3030 rtx reg;
3031 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
3032 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
3033}
3034
 3035/* Adjust code for uniform-simt code generation variant by making atomics and
 3036 "syscalls" conditionally executed, and inserting shuffle-based propagation
 3037 for registers being set. */
 3038
 3039static void
 3040nvptx_reorg_uniform_simt ()
 3041{
 3042 rtx_insn *insn, *next;
 3043
 3044 for (insn = get_insns (); insn; insn = next)
 3045 {
 3046 next = NEXT_INSN (insn);
 /* Only syscalls and atomic PARALLEL insns need adjusting.  */
 3047 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
 3048 && !(NONJUMP_INSN_P (insn)
 3049 && GET_CODE (PATTERN (insn)) == PARALLEL
 3050 && get_attr_atomic (insn)))
 3051 continue;
 3052 rtx pat = PATTERN (insn);
 3053 rtx master = nvptx_get_unisimt_master ();
 /* After the insn, broadcast every register it sets from the
 master lane to all lanes.  */
 3054 for (int i = 0; i < XVECLEN (pat, 0); i++)
 3055 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
 3056 rtx pred = nvptx_get_unisimt_predicate ();
 3057 pred = gen_rtx_NE (BImode, pred, const0_rtx);
 /* Predicate the insn itself so only one lane executes it.  */
 3058 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
 3059 validate_change (insn, &PATTERN (insn), pat, false);
 3060 }
 3061}
3062
2498a6ab
TV
 3063/* Offloading function attributes. */
 3064
 3065struct offload_attrs
 3066{
 /* Partitioning mask -- presumably a GOMP_DIM axis mask; confirm
 against populate_offload_attrs.  */
 3067 unsigned mask;
 3068 int num_gangs;
 /* Worker count; zero means not fixed (see init_axis_dim, which then
 derives the maximum from PTX_CTA_SIZE / vector_length).  */
 3069 int num_workers;
 3070 int vector_length;
 3071};
3072
0d06e8c7
TV
3073/* Define entries for cfun->machine->axis_dim. */
3074
3075#define MACH_VECTOR_LENGTH 0
3076#define MACH_MAX_WORKERS 1
3077
3078static void populate_offload_attrs (offload_attrs *oa);
3079
3080static void
3081init_axis_dim (void)
3082{
3083 offload_attrs oa;
3084 int max_workers;
3085
3086 populate_offload_attrs (&oa);
3087
3088 if (oa.num_workers == 0)
3089 max_workers = PTX_CTA_SIZE / oa.vector_length;
3090 else
3091 max_workers = oa.num_workers;
3092
3093 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3094 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3095 cfun->machine->axis_dim_init_p = true;
3096}
3097
/* Return the maximum worker-axis size for the current function,
 computing the cached dimensions on first use.  */
 3098static int ATTRIBUTE_UNUSED
 3099nvptx_mach_max_workers ()
 3100{
 3101 if (!cfun->machine->axis_dim_init_p)
 3102 init_axis_dim ();
 3103 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
 3104}
3105
/* Return the vector-axis length for the current function, computing
 the cached dimensions on first use.  */
 3106static int ATTRIBUTE_UNUSED
 3107nvptx_mach_vector_length ()
 3108{
 3109 if (!cfun->machine->axis_dim_init_p)
 3110 init_axis_dim ();
 3111 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
 3112}
3113
d2d47a28
NS
 3114/* Loop structure of the function. The entire function is described as
 3115 a NULL loop. */
d88cd9c4
NS
 3116
 3117struct parallel
 3118{
 3119 /* Parent parallel. */
 3120 parallel *parent;
 3121
 3122 /* Next sibling parallel. */
 3123 parallel *next;
 3124
 3125 /* First child parallel. */
 3126 parallel *inner;
 3127
 3128 /* Partitioning mask of the parallel. */
 3129 unsigned mask;
 3130
 3131 /* Partitioning used within inner parallels. */
 3132 unsigned inner_mask;
 3133
 3134 /* Location of parallel forked and join. The forked is the first
 3135 block in the parallel and the join is the first block after
 3136 the partition. */
 3137 basic_block forked_block;
 3138 basic_block join_block;
 3139
 /* The forked/join insns inside the partition, and the fork/joining
 insns just outside it (the latter only set when shared-memory
 broadcast is needed -- see nvptx_find_par).  */
 3140 rtx_insn *forked_insn;
 3141 rtx_insn *join_insn;
 3142
 3143 rtx_insn *fork_insn;
 3144 rtx_insn *joining_insn;
 3145
 3146 /* Basic blocks in this parallel, but not in child parallels. The
 3147 FORKED and JOINING blocks are in the partition. The FORK and JOIN
 3148 blocks are not. */
 3149 auto_vec<basic_block> blocks;
 3150
 3151public:
 3152 parallel (parallel *parent, unsigned mode);
 3153 ~parallel ();
 3154};
3155
 3156/* Constructor links the new parallel into its parent's chain of
 3157 children. */
 3158
 3159parallel::parallel (parallel *parent_, unsigned mask_)
 3160 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
 3161{
 3162 forked_block = join_block = 0;
 3163 forked_insn = join_insn = 0;
 3164 fork_insn = joining_insn = 0;
 3165
 /* Push onto the front of the parent's child list.  */
 3166 if (parent)
 3167 {
 3168 next = parent->inner;
 3169 parent->inner = this;
 3170 }
 3171}
3172
/* Recursively delete the child chain and the sibling chain, so
 deleting the root parallel tears down the whole tree.  */
 3173parallel::~parallel ()
 3174{
 3175 delete inner;
 3176 delete next;
 3177}
3178
3179/* Map of basic blocks to insns */
3180typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3181
3182/* A tuple of an insn of interest and the BB in which it resides. */
3183typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3184typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3185
 3186/* Split basic blocks such that each forked and join unspecs are at
 3187 the start of their basic blocks. Thus afterwards each block will
 3188 have a single partitioning mode. We also do the same for return
 3189 insns, as they are executed by every thread. Return the
 3190 partitioning mode of the function as a whole. Populate MAP with
 3191 head and tail blocks. We also clear the BB visited flag, which is
 3192 used when finding partitions. */
 3193
 3194static void
 3195nvptx_split_blocks (bb_insn_map_t *map)
 3196{
 3197 insn_bb_vec_t worklist;
 3198 basic_block block;
 3199 rtx_insn *insn;
 3200
 3201 /* Locate all the reorg instructions of interest. */
 3202 FOR_ALL_BB_FN (block, cfun)
 3203 {
 3204 bool seen_insn = false;
 3205
 3206 /* Clear visited flag, for use by parallel locator */
 3207 block->flags &= ~BB_VISITED;
 3208
 3209 FOR_BB_INSNS (block, insn)
 3210 {
 3211 if (!INSN_P (insn))
 3212 continue;
 3213 switch (recog_memoized (insn))
 3214 {
 3215 default:
 3216 seen_insn = true;
 3217 continue;
 3218 case CODE_FOR_nvptx_forked:
 3219 case CODE_FOR_nvptx_join:
 3220 break;
 3221
 3222 case CODE_FOR_return:
 3223 /* We also need to split just before return insns, as
 3224 that insn needs executing by all threads, but the
 3225 block it is in probably does not. */
 3226 break;
 3227 }
 3228
 3229 if (seen_insn)
 3230 /* We've found an instruction that must be at the start of
 3231 a block, but isn't. Add it to the worklist. */
 3232 worklist.safe_push (insn_bb_t (insn, block));
 3233 else
 3234 /* It was already the first instruction. Just add it to
 3235 the map. */
 3236 map->get_or_insert (block) = insn;
 3237 seen_insn = true;
 3238 }
 3239 }
 3240
 3241 /* Split blocks on the worklist. */
 3242 unsigned ix;
 3243 insn_bb_t *elt;
 3244 basic_block remap = 0;
 3245 for (ix = 0; worklist.iterate (ix, &elt); ix++)
 3246 {
 /* Starting a new original block; later entries for the same
 original block chain from the previous split's destination.  */
 3247 if (remap != elt->second)
 3248 {
 3249 block = elt->second;
 3250 remap = block;
 3251 }
 3252
 3253 /* Split block before insn. The insn is in the new block */
 3254 edge e = split_block (block, PREV_INSN (elt->first));
 3255
 3256 block = e->dest;
 3257 map->get_or_insert (block) = elt->first;
 3258 }
 3259}
3260
0024c320
TV
3261/* Return true if MASK contains parallelism that requires shared
3262 memory to broadcast. */
3263
3264static bool
3265nvptx_needs_shared_bcast (unsigned mask)
3266{
3267 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3268 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3269 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3270
3271 return worker || large_vector;
3272}
3273
d88cd9c4
NS
 3274/* BLOCK is a basic block containing a head or tail instruction.
 3275 Locate the associated prehead or pretail instruction, which must be
 3276 in the single predecessor block. */
 3277
 3278static rtx_insn *
 3279nvptx_discover_pre (basic_block block, int expected)
 3280{
 3281 gcc_assert (block->preds->length () == 1);
 3282 basic_block pre_block = (*block->preds)[0]->src;
 3283 rtx_insn *pre_insn;
 3284
 /* Scan backwards from the end of the predecessor to its last real
 insn; the block must contain one.  */
 3285 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
 3286 pre_insn = PREV_INSN (pre_insn))
 3287 gcc_assert (pre_insn != BB_HEAD (pre_block));
 3288
 /* EXPECTED is the insn code (CODE_FOR_...) we must find there.  */
 3289 gcc_assert (recog_memoized (pre_insn) == expected);
 3290 return pre_insn;
 3291}
3292
 3293/* Dump this parallel and all its inner parallels. */
 3294
 3295static void
 3296nvptx_dump_pars (parallel *par, unsigned depth)
 3297{
 3298 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
 3299 depth, par->mask,
 3300 par->forked_block ? par->forked_block->index : -1,
 3301 par->join_block ? par->join_block->index : -1);
 3302
 3303 fprintf (dump_file, " blocks:");
 3304
 3305 basic_block block;
 3306 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
 3307 fprintf (dump_file, " %d", block->index);
 3308 fprintf (dump_file, "\n");
 /* Recurse into children at increased depth, then into siblings at
 the same depth.  */
 3309 if (par->inner)
 3310 nvptx_dump_pars (par->inner, depth + 1);
 3311
 3312 if (par->next)
 3313 nvptx_dump_pars (par->next, depth);
 3314}
3315
 3316/* If BLOCK contains a fork/join marker, process it to create or
 3317 terminate a loop structure. Add this block to the current loop,
 3318 and then walk successor blocks. PAR is the enclosing parallel, or
 NULL at the outermost level (see nvptx_discover_pars). */
 3319
 3320static parallel *
 3321nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 3322{
 3323 if (block->flags & BB_VISITED)
 3324 return par;
 3325 block->flags |= BB_VISITED;
 3326
 3327 if (rtx_insn **endp = map->get (block))
 3328 {
 3329 rtx_insn *end = *endp;
 3330
 3331 /* This is a block head or tail, or return instruction. */
 3332 switch (recog_memoized (end))
 3333 {
 3334 case CODE_FOR_return:
 3335 /* Return instructions are in their own block, and we
 3336 don't need to do anything more. */
 3337 return par;
 3338
 3339 case CODE_FOR_nvptx_forked:
 3340 /* Loop head, create a new inner loop and add it into
 3341 our parent's child list. */
 3342 {
 3343 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
 3344
 3345 gcc_assert (mask);
 3346 par = new parallel (par, mask);
 3347 par->forked_block = block;
 3348 par->forked_insn = end;
0024c320 3349 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
 3350 par->fork_insn
 3351 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 3352 }
 3353 break;
 3354
 3355 case CODE_FOR_nvptx_join:
 3356 /* A loop tail. Finish the current loop and return to
 3357 parent. */
 3358 {
 3359 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
 3360
 3361 gcc_assert (par->mask == mask);
4b557bcc 3362 gcc_assert (par->join_block == NULL);
d88cd9c4
NS
 3363 par->join_block = block;
 3364 par->join_insn = end;
0024c320 3365 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
 3366 par->joining_insn
 3367 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 3368 par = par->parent;
 3369 }
 3370 break;
 3371
 3372 default:
 3373 gcc_unreachable ();
 3374 }
 3375 }
 3376
 3377 if (par)
 3378 /* Add this block onto the current loop's list of blocks. */
 3379 par->blocks.safe_push (block);
 3380 else
 3381 /* This must be the entry block. Create a NULL parallel. */
 3382 par = new parallel (0, 0);
 3383
 3384 /* Walk successor blocks. */
 3385 edge e;
 3386 edge_iterator ei;
 3387
 3388 FOR_EACH_EDGE (e, ei, block->succs)
 3389 nvptx_find_par (map, par, e->dest);
 3390
 3391 return par;
 3392}
3393
 3394/* DFS walk the CFG looking for fork & join markers. Construct
 3395 loop structures as we go. MAP is a mapping of basic blocks
 3396 to head & tail markers, discovered when splitting blocks. This
 3397 speeds up the discovery. We rely on the BB visited flag having
 3398 been cleared when splitting blocks. Returns the root (NULL)
 parallel describing the whole function. */
 3399
 3400static parallel *
 3401nvptx_discover_pars (bb_insn_map_t *map)
 3402{
 3403 basic_block block;
 3404
 3405 /* Mark exit blocks as visited. */
 3406 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
 3407 block->flags |= BB_VISITED;
 3408
 3409 /* And entry block as not. */
 3410 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
 3411 block->flags &= ~BB_VISITED;
 3412
 3413 parallel *par = nvptx_find_par (map, 0, block);
 3414
 3415 if (dump_file)
 3416 {
 3417 fprintf (dump_file, "\nLoops\n");
 3418 nvptx_dump_pars (par, 0);
 3419 fprintf (dump_file, "\n");
 3420 }
 3421
 3422 return par;
 3423}
3424
912442c2
NS
3425/* Analyse a group of BBs within a partitioned region and create N
3426 Single-Entry-Single-Exit regions. Some of those regions will be
3427 trivial ones consisting of a single BB. The blocks of a
3428 partitioned region might form a set of disjoint graphs -- because
 3429 the region encloses a differently partitioned sub region.
3430
3431 We use the linear time algorithm described in 'Finding Regions Fast:
3432 Single Entry Single Exit and control Regions in Linear Time'
3433 Johnson, Pearson & Pingali. That algorithm deals with complete
3434 CFGs, where a back edge is inserted from END to START, and thus the
3435 problem becomes one of finding equivalent loops.
3436
3437 In this case we have a partial CFG. We complete it by redirecting
3438 any incoming edge to the graph to be from an arbitrary external BB,
3439 and similarly redirecting any outgoing edge to be to that BB.
3440 Thus we end up with a closed graph.
3441
3442 The algorithm works by building a spanning tree of an undirected
3443 graph and keeping track of back edges from nodes further from the
3444 root in the tree to nodes nearer to the root in the tree. In the
3445 description below, the root is up and the tree grows downwards.
3446
3447 We avoid having to deal with degenerate back-edges to the same
3448 block, by splitting each BB into 3 -- one for input edges, one for
3449 the node itself and one for the output edges. Such back edges are
3450 referred to as 'Brackets'. Cycle equivalent nodes will have the
3451 same set of brackets.
3452
3453 Determining bracket equivalency is done by maintaining a list of
3454 brackets in such a manner that the list length and final bracket
3455 uniquely identify the set.
3456
3457 We use coloring to mark all BBs with cycle equivalency with the
3458 same color. This is the output of the 'Finding Regions Fast'
3459 algorithm. Notice it doesn't actually find the set of nodes within
 3460 a particular region, just unordered sets of nodes that are the
3461 entries and exits of SESE regions.
3462
3463 After determining cycle equivalency, we need to find the minimal
3464 set of SESE regions. Do this with a DFS coloring walk of the
3465 complete graph. We're either 'looking' or 'coloring'. When
3466 looking, and we're in the subgraph, we start coloring the color of
3467 the current node, and remember that node as the start of the
3468 current color's SESE region. Every time we go to a new node, we
 3469 decrement the count of nodes with that color. If it reaches zero,
3470 we remember that node as the end of the current color's SESE region
3471 and return to 'looking'. Otherwise we color the node the current
3472 color.
3473
3474 This way we end up with coloring the inside of non-trivial SESE
3475 regions with the color of that region. */
3476
3477/* A pair of BBs. We use this to represent SESE regions. */
3478typedef std::pair<basic_block, basic_block> bb_pair_t;
3479typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3480
3481/* A node in the undirected CFG. The discriminator SECOND indicates just
 3482 above or just below the BB indicated by FIRST. */
3483typedef std::pair<basic_block, int> pseudo_node_t;
3484
 3485/* A bracket indicates an edge towards the root of the spanning tree of the
 3486 undirected graph. Each bracket has a color, determined
 3487 from the current set of brackets. */
 3488struct bracket
 3489{
 /* Pseudo node the bracket ends at, nearer the root.  */
 3490 pseudo_node_t back; /* Back target */
 3491
 /* Current color and size of set. ~0u means not yet assigned.  */
 3492 /* Current color and size of set. */
 3493 unsigned color;
 3494 unsigned size;
 3495
 3496 bracket (pseudo_node_t back_)
 3497 : back (back_), color (~0u), size (~0u)
 3498 {
 3499 }
 3500
 /* Return this bracket's color for a bracket-set of LENGTH,
 allocating a fresh color (and its counter) when LENGTH differs
 from the cached set size.  */
 3501 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
 3502 {
 3503 if (length != size)
 3504 {
 3505 size = length;
 3506 color = color_counts.length ();
 3507 color_counts.quick_push (0);
 3508 }
 3509 color_counts[color]++;
 3510 return color;
 3511 }
 3512};
3513
3514typedef auto_vec<bracket> bracket_vec_t;
3515
 3516/* Basic block info for finding SESE regions. */
 3517
 3518struct bb_sese
 3519{
 3520 int node; /* Node number in spanning tree. */
 3521 int parent; /* Parent node number. */
 3522
 3523 /* The algorithm splits each node A into Ai, A', Ao. The incoming
 3524 edges arrive at pseudo-node Ai and the outgoing edges leave at
 3525 pseudo-node Ao. We have to remember which way we arrived at a
 3526 particular node when generating the spanning tree. dir > 0 means
 3527 we arrived at Ai, dir < 0 means we arrived at Ao. */
 3528 int dir;
 3529
 3530 /* Lowest numbered pseudo-node reached via a backedge from this
 3531 node, or any descendant. */
 3532 pseudo_node_t high;
 3533
 3534 int color; /* Cycle-equivalence color */
 3535
 3536 /* Stack of brackets for this node. */
 3537 bracket_vec_t brackets;
 3538
 3539 bb_sese (unsigned node_, unsigned p, int dir_)
 3540 :node (node_), parent (p), dir (dir_)
 3541 {
 3542 }
 3543 ~bb_sese ();
 3544
 3545 /* Push a bracket ending at BACK. */
 3546 void push (const pseudo_node_t &back)
 3547 {
 3548 if (dump_file)
 3549 fprintf (dump_file, "Pushing backedge %d:%+d\n",
 3550 back.first ? back.first->index : 0, back.second);
 3551 brackets.safe_push (bracket (back));
 3552 }
 3553
 3554 void append (bb_sese *child);
 3555 void remove (const pseudo_node_t &);
 3556
 3557 /* Set node's color. */
 3558 void set_color (auto_vec<unsigned> &color_counts)
 3559 {
 3560 color = brackets.last ().get_color (color_counts, brackets.length ());
 3561 }
 3562};
3563
/* Out-of-line empty destructor; the brackets auto_vec releases its
 own storage.  */
 3564bb_sese::~bb_sese ()
 3565{
 3566}
3567
 3568/* Destructively append CHILD's brackets. */
 3569
 3570void
 3571bb_sese::append (bb_sese *child)
 3572{
 3573 if (int len = child->brackets.length ())
 3574 {
 3575 int ix;
 3576
 3577 if (dump_file)
 3578 {
 3579 for (ix = 0; ix < len; ix++)
 3580 {
 3581 const pseudo_node_t &pseudo = child->brackets[ix].back;
 3582 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
 3583 child->node, pseudo.first ? pseudo.first->index : 0,
 3584 pseudo.second);
 3585 }
 3586 }
 /* If we have no brackets of our own, steal CHILD's vector
 wholesale; otherwise copy its elements onto ours.  */
 3587 if (!brackets.length ())
 3588 std::swap (brackets, child->brackets);
 3589 else
 3590 {
 3591 brackets.reserve (len);
 3592 for (ix = 0; ix < len; ix++)
 3593 brackets.quick_push (child->brackets[ix]);
 3594 }
 3595 }
 3596}
3597
 3598/* Remove brackets that terminate at PSEUDO. */
 3599
 3600void
 3601bb_sese::remove (const pseudo_node_t &pseudo)
 3602{
 3603 unsigned removed = 0;
 3604 int len = brackets.length ();
 3605
 /* Compact the vector in place over the removed entries, then pop
 the now-unused tail.  */
 3606 for (int ix = 0; ix < len; ix++)
 3607 {
 3608 if (brackets[ix].back == pseudo)
 3609 {
 3610 if (dump_file)
 3611 fprintf (dump_file, "Removing backedge %d:%+d\n",
 3612 pseudo.first ? pseudo.first->index : 0, pseudo.second);
 3613 removed++;
 3614 }
 3615 else if (removed)
 3616 brackets[ix-removed] = brackets[ix];
 3617 }
 3618 while (removed--)
 3619 brackets.pop ();
 3620}
3621
3622/* Accessors for BB's aux pointer. */
3623#define BB_SET_SESE(B, S) ((B)->aux = (S))
3624#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3625
 3626/* DFS walk creating SESE data structures. Only cover nodes with
 3627 BB_VISITED set. Append discovered blocks to LIST. We number in
 3628 increments of 3 so that the above and below pseudo nodes can be
 3629 implicitly numbered too. */
 3630
 3631static int
 3632nvptx_sese_number (int n, int p, int dir, basic_block b,
 3633 auto_vec<basic_block> *list)
 3634{
 3635 if (BB_GET_SESE (b))
 3636 return n;
 3637
 3638 if (dump_file)
 3639 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
 3640 b->index, n, p, dir);
 3641
 3642 BB_SET_SESE (b, new bb_sese (n, p, dir));
 3643 p = n;
 3644
 3645 n += 3;
 3646 list->quick_push (b);
 3647
 3648 /* First walk the nodes on the 'other side' of this node, then walk
 3649 the nodes on the same side. */
 3650 for (unsigned ix = 2; ix; ix--)
 3651 {
 3652 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
 3653 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
 3654 : offsetof (edge_def, src));
 3655 edge e;
478cb3ce 3656 edge_iterator ei;
912442c2
NS
 3657
 3658 FOR_EACH_EDGE (e, ei, edges)
 3659 {
 /* OFFSET selects e->dest or e->src depending on direction.  */
 3660 basic_block target = *(basic_block *)((char *)e + offset);
 3661
 3662 if (target->flags & BB_VISITED)
 3663 n = nvptx_sese_number (n, p, dir, target, list);
 3664 }
 3665 dir = -dir;
 3666 }
 3667 return n;
 3668}
3669
 3670/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
 3671 EDGES are the outgoing edges and OFFSET is the offset to the src
 3672 or dst block on the edges. */
 3673
 3674static void
 3675nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
 3676 vec<edge, va_gc> *edges, size_t offset)
 3677{
 3678 edge e;
478cb3ce 3679 edge_iterator ei;
912442c2
NS
 3680 int hi_back = depth;
 3681 pseudo_node_t node_back (0, depth);
 3682 int hi_child = depth;
 3683 pseudo_node_t node_child (0, depth);
 3684 basic_block child = NULL;
 3685 unsigned num_children = 0;
 3686 int usd = -dir * sese->dir;
 3687
 3688 if (dump_file)
 3689 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
 3690 me->index, sese->node, dir);
 3691
 3692 if (dir < 0)
 3693 {
 3694 /* This is the above pseudo-child. It has the BB itself as an
 3695 additional child node. */
 3696 node_child = sese->high;
 3697 hi_child = node_child.second;
 3698 if (node_child.first)
 3699 hi_child += BB_GET_SESE (node_child.first)->node;
 3700 num_children++;
 3701 }
 3702
 3703 /* Examine each edge.
 3704 - if it is a child (a) append its bracket list and (b) record
 3705 whether it is the child with the highest reaching bracket.
 3706 - if it is an edge to ancestor, record whether it's the highest
 3707 reaching backlink. */
 3708 FOR_EACH_EDGE (e, ei, edges)
 3709 {
 3710 basic_block target = *(basic_block *)((char *)e + offset);
 3711
 3712 if (bb_sese *t_sese = BB_GET_SESE (target))
 3713 {
 3714 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
 3715 {
 3716 /* Child node. Append its bracket list. */
 3717 num_children++;
 3718 sese->append (t_sese);
 3719
 3720 /* Compare its hi value. */
 3721 int t_hi = t_sese->high.second;
 3722
 3723 if (basic_block child_hi_block = t_sese->high.first)
 3724 t_hi += BB_GET_SESE (child_hi_block)->node;
 3725
 3726 if (hi_child > t_hi)
 3727 {
 3728 hi_child = t_hi;
 3729 node_child = t_sese->high;
 3730 child = target;
 3731 }
 3732 }
 3733 else if (t_sese->node < sese->node + dir
 3734 && !(dir < 0 && sese->parent == t_sese->node))
 3735 {
 3736 /* Non-parental ancestor node -- a backlink. */
 3737 int d = usd * t_sese->dir;
 3738 int back = t_sese->node + d;
 3739
 3740 if (hi_back > back)
 3741 {
 3742 hi_back = back;
 3743 node_back = pseudo_node_t (target, d);
 3744 }
 3745 }
 3746 }
 3747 else
 3748 { /* Fallen off graph, backlink to entry node. */
 3749 hi_back = 0;
 3750 node_back = pseudo_node_t (0, 0);
 3751 }
 3752 }
 3753
 3754 /* Remove any brackets that terminate at this pseudo node. */
 3755 sese->remove (pseudo_node_t (me, dir));
 3756
 3757 /* Now push any backlinks from this pseudo node. */
 3758 FOR_EACH_EDGE (e, ei, edges)
 3759 {
 3760 basic_block target = *(basic_block *)((char *)e + offset);
 3761 if (bb_sese *t_sese = BB_GET_SESE (target))
 3762 {
 3763 if (t_sese->node < sese->node + dir
 3764 && !(dir < 0 && sese->parent == t_sese->node))
 3765 /* Non-parental ancestor node - backedge from me. */
 3766 sese->push (pseudo_node_t (target, usd * t_sese->dir));
 3767 }
 3768 else
 3769 {
 3770 /* back edge to entry node */
 3771 sese->push (pseudo_node_t (0, 0));
 3772 }
 3773 }
 3774
 3775 /* If this node leads directly or indirectly to a no-return region of
 3776 the graph, then fake a backedge to entry node. */
 3777 if (!sese->brackets.length () || !edges || !edges->length ())
 3778 {
 3779 hi_back = 0;
 3780 node_back = pseudo_node_t (0, 0);
 3781 sese->push (node_back);
 3782 }
 3783
 3784 /* Record the highest reaching backedge from us or a descendant. */
 3785 sese->high = hi_back < hi_child ? node_back : node_child;
 3786
 3787 if (num_children > 1)
 3788 {
 3789 /* There is more than one child -- this is a Y shaped piece of
 3790 spanning tree. We have to insert a fake backedge from this
 3791 node to the highest ancestor reached by not-the-highest
 3792 reaching child. Note that there may be multiple children
 3793 with backedges to the same highest node. That's ok and we
 3794 insert the edge to that highest node. */
 3795 hi_child = depth;
 3796 if (dir < 0 && child)
 3797 {
 3798 node_child = sese->high;
 3799 hi_child = node_child.second;
 3800 if (node_child.first)
 3801 hi_child += BB_GET_SESE (node_child.first)->node;
 3802 }
 3803
 3804 FOR_EACH_EDGE (e, ei, edges)
 3805 {
 3806 basic_block target = *(basic_block *)((char *)e + offset);
 3807
 3808 if (target == child)
 3809 /* Ignore the highest child. */
 3810 continue;
 3811
 3812 bb_sese *t_sese = BB_GET_SESE (target);
 3813 if (!t_sese)
 3814 continue;
 3815 if (t_sese->parent != sese->node)
 3816 /* Not a child. */
 3817 continue;
 3818
 3819 /* Compare its hi value. */
 3820 int t_hi = t_sese->high.second;
 3821
 3822 if (basic_block child_hi_block = t_sese->high.first)
 3823 t_hi += BB_GET_SESE (child_hi_block)->node;
 3824
 3825 if (hi_child > t_hi)
 3826 {
 3827 hi_child = t_hi;
 3828 node_child = t_sese->high;
 3829 }
 3830 }
 3831
 3832 sese->push (node_child);
 3833 }
 3834}
3835
3836
 3837/* DFS walk of BB graph. Color node BLOCK according to COLORING then
 3838 proceed to successors. Set SESE entry and exit nodes of
 3839 REGIONS. COLORING < 0 means we are 'looking' rather than
 'coloring' (see the algorithm description above). */
 3840
 3841static void
 3842nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
 3843 basic_block block, int coloring)
 3844{
 3845 bb_sese *sese = BB_GET_SESE (block);
 3846
 3847 if (block->flags & BB_VISITED)
 3848 {
 3849 /* If we've already encountered this block, either we must not
 3850 be coloring, or it must have been colored the current color. */
 3851 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
 3852 return;
 3853 }
 3854
 3855 block->flags |= BB_VISITED;
 3856
 3857 if (sese)
 3858 {
 3859 if (coloring < 0)
 3860 {
 3861 /* Start coloring a region. */
 3862 regions[sese->color].first = block;
 3863 coloring = sese->color;
 3864 }
 3865
 3866 if (!--color_counts[sese->color] && sese->color == coloring)
 3867 {
 3868 /* Found final block of SESE region. */
 3869 regions[sese->color].second = block;
 3870 coloring = -1;
 3871 }
 3872 else
 3873 /* Color the node, so we can assert on revisiting the node
 3874 that the graph is indeed SESE. */
 3875 sese->color = coloring;
 3876 }
 3877 else
 3878 /* Fallen off the subgraph, we cannot be coloring. */
 3879 gcc_assert (coloring < 0);
 3880
 3881 /* Walk each successor block. */
 3882 if (block->succs && block->succs->length ())
 3883 {
 3884 edge e;
 3885 edge_iterator ei;
 3886
 3887 FOR_EACH_EDGE (e, ei, block->succs)
 3888 nvptx_sese_color (color_counts, regions, e->dest, coloring);
 3889 }
 3890 else
 3891 gcc_assert (coloring < 0);
 3892}
3893
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.

   BLOCKS is the set of basic blocks making up one partitioned region;
   the result is a vector of (entry, exit) block pairs.  The algorithm
   assigns each block a cycle-equivalence "color" (via
   nvptx_sese_number/nvptx_sese_pseudo) and then walks the whole
   function once to turn each color class into a region.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  /* Clear BB_VISITED before the coloring walk; the dump loop above
     may have set it again.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE data allocated during the solve.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
4040
4041#undef BB_SET_SESE
4042#undef BB_GET_SESE
4043
e91eba31
NS
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Multiple DImode words: emit a counted copy loop.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Loop back-edge: advance pointer, test counter.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
4146
a0b3b5c4 4147/* Worker for nvptx_warp_propagate. */
d88cd9c4
NS
4148
4149static rtx
a0b3b5c4
TV
4150warp_prop_gen (rtx reg, propagate_mask pm,
4151 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4152 bool ARG_UNUSED (vector))
d88cd9c4
NS
4153{
4154 if (!(pm & PM_read_write))
4155 return 0;
4156
a0b3b5c4 4157 return nvptx_gen_warp_bcast (reg);
d88cd9c4
NS
4158}
4159
/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
{
  /* Warp-level propagation both reads and writes in one shuffle, so
     pass PM_read_write; no user data and no large-vector handling.  */
  return nvptx_propagate (is_call, block, insn, PM_read_write,
			  warp_prop_gen, 0, false);
}
4170
a0b3b5c4 4171/* Worker for nvptx_shared_propagate. */
d88cd9c4
NS
4172
4173static rtx
a0b3b5c4
TV
4174shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4175 bool vector)
d88cd9c4 4176{
1ed57fb8 4177 broadcast_data_t *data = (broadcast_data_t *)data_;
d88cd9c4
NS
4178
4179 if (pm & PM_loop_begin)
4180 {
4181 /* Starting a loop, initialize pointer. */
4182 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4183
3c55d60f
TV
4184 oacc_bcast_align = MAX (oacc_bcast_align, align);
4185 data->offset = ROUND_UP (data->offset, align);
d88cd9c4
NS
4186
4187 data->ptr = gen_reg_rtx (Pmode);
4188
4189 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4190 }
4191 else if (pm & PM_loop_end)
4192 {
4193 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4194 data->ptr = NULL_RTX;
4195 return clobber;
4196 }
4197 else
a0b3b5c4 4198 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
d88cd9c4
NS
4199}
4200
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Nothing emitted iff nothing was staged into the buffer.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  /* Large-vector case: use a per-partition buffer rather than
	     the single shared broadcast symbol.  */
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Account the buffer space this spill/fill needs in the
	 file-scope broadcast-buffer bookkeeping.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
4252
/* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
   which is an integer or a register.  THREADS is the number of threads
   controlled by the barrier.  */

static rtx
nvptx_cta_sync (rtx lock, int threads)
{
  return gen_nvptx_barsync (lock, GEN_INT (threads));
}
4262
#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *cursor;

  /* Scan the block in order for the first actual instruction.  */
  FOR_BB_INSNS (bb, cursor)
    if (INSN_P (cursor))
      return cursor;

  return NULL;
}
#endif
4280
6beefdbd
TV
4281/* Return true if INSN needs neutering. */
4282
4283static bool
4284needs_neutering_p (rtx_insn *insn)
4285{
4286 if (!INSN_P (insn))
4287 return false;
4288
4289 switch (recog_memoized (insn))
4290 {
4291 case CODE_FOR_nvptx_fork:
4292 case CODE_FOR_nvptx_forked:
4293 case CODE_FOR_nvptx_joining:
4294 case CODE_FOR_nvptx_join:
4295 case CODE_FOR_nvptx_barsync:
4296 return false;
4297 default:
4298 return true;
4299 }
4300}
4301
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.
   Walk FROM (possibly continuing into a lone fallthrough successor)
   checking that each neutering jump precedes its matching label, that
   worker neutering encloses vector neutering, and that no barsync insn
   appears inside a neutered region.  Return true if any neutering
   label was seen during the walk.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  /* The vector jump must nest inside the worker jump.  */
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier inside a neutered region would deadlock.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Neither jump seen yet: continue into the single successor.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}
4377
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  Walk TO
   backwards from its end, checking that the vector label follows the
   worker label and that no barsync insn appears after either label.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  /* Scanning backwards, the vector label must come after the
	     worker label.  */
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}
4414
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
	head = NEXT_INSN (head);

      if (from == to)
	break;

      /* If FROM consists only of an unconditional jump to a block
	 with no other predecessors, fold that block into the region
	 being neutered and keep scanning from there.  */
      if (!(JUMP_P (head) && single_succ_p (from)))
	break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
	break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
	return;

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
	rtx_insn **mode_jump
	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
	rtx_insn **mode_label
	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

	if (!pred)
	  {
	    /* Lazily create the per-axis "am I thread zero" predicate.  */
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	/* Chain the skip branches so the vector test nests inside
	   the worker test.  */
	if (neuter_start)
	  neuter_start = emit_insn_after (br, neuter_start);
	else
	  neuter_start = emit_insn_before (br, head);
	*mode_jump = neuter_start;

	LABEL_NUSES (label)++;
	rtx_insn *label_insn;
	if (tail_branch)
	  {
	    label_insn = emit_label_before (label, before);
	    before = label_insn;
	  }
	else
	  {
	    label_insn = emit_label_after (label, tail);
	    /* A noreturn call leaves the neutered threads with
	       nowhere to go; make them exit explicitly.  */
	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
	      emit_insn_after (gen_exit (), label_insn);
	  }

	*mode_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
	{
	  /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
	  /* The branch condition %rcond is propagated like this:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		Lskip:
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;

	     There seems to be a bug in the ptx JIT compiler (observed at driver
	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
	     unless %rcond is initialized to something before 'bra Lskip'.  The
	     bug is not observed with ptxas from cuda 8.0.61.

	     It is true that the code is non-trivial: at Lskip, %rcond is
	     uninitialized in threads 1-31, and after the selp the same holds
	     for %rcondu32.  But shfl propagates the defined value in thread 0
	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
	     0-31, and after the setp.ne %rcond is defined in threads 0-31.

	     There is nothing in the PTX spec to suggest that this is wrong, or
	     to explain why the extra initialization is needed.  So, we classify
	     it as a JIT bug, and the extra initialization as workaround:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		+.reg .pred %rcond2;
		+setp.eq.u32 %rcond2, 1, 0;

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		+mov.pred %rcond2, %rcond;
		Lskip:
		+mov.pred %rcond, %rcond2;
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;
	  */
	  rtx_insn *label = PREV_INSN (tail);
	  gcc_assert (label && LABEL_P (label));
	  rtx tmp = gen_reg_rtx (BImode);
	  emit_insn_before (gen_movbi (tmp, const0_rtx),
			    bb_first_real_insn (from));
	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  broadcast_data_t data;
	  unsigned size = GET_MODE_SIZE (SImode);
	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
	  rtx barrier = GEN_INT (0);
	  int threads = 0;

	  data.base = oacc_bcast_sym;
	  data.ptr = 0;

	  bool use_partitioning_p = (vector && !worker
				     && nvptx_mach_max_workers () > 1
				     && cfun->machine->bcast_partition);
	  if (use_partitioning_p)
	    {
	      data.base = cfun->machine->bcast_partition;
	      barrier = cfun->machine->sync_bar;
	      threads = nvptx_mach_vector_length ();
	    }
	  gcc_assert (data.base != NULL);
	  gcc_assert (barrier);

	  /* Account the condition word in the broadcast buffer.  */
	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			       ? nvptx_mach_max_workers () + 1
			       : 1);

	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
						    vector),
			    before);

	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
						    vector),
			    tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	}

      /* Replace the condition of the tail branch with the unified
	 (broadcast) predicate.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
					    vector_label, worker_label);
  if (!seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}
4677
4678/* PAR is a parallel that is being skipped in its entirety according to
4679 MASK. Treat this as skipping a superblock starting at forked
4680 and ending at joining. */
4681
4682static void
4683nvptx_skip_par (unsigned mask, parallel *par)
4684{
4685 basic_block tail = par->join_block;
4686 gcc_assert (tail->preds->length () == 1);
4687
4688 basic_block pre_tail = (*tail->preds)[0]->src;
4689 gcc_assert (pre_tail->succs->length () == 1);
4690
4691 nvptx_single (mask, par->forked_block, pre_tail);
4692}
4693
/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  /* Merge the inner par's partitioning into ours and take ownership
     of its blocks.  */
  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
4756
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
		       && nvptx_mach_vector_length () > PTX_WARP_SIZE);

  if (worker || large_vector)
    {
      /* Worker or large-vector partitioning: spill & fill live state
	 through the shared broadcast buffer.  */
      nvptx_shared_propagate (false, is_call, par->forked_block,
			      par->forked_insn, !worker);
      bool no_prop_p
	= nvptx_shared_propagate (true, is_call, par->forked_block,
				  par->fork_insn, !worker);
      bool empty_loop_p
	= !is_call && (NEXT_INSN (par->forked_insn)
		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
      rtx barrier = GEN_INT (0);
      int threads = 0;

      if (!worker && cfun->machine->sync_bar)
	{
	  barrier = cfun->machine->sync_bar;
	  threads = nvptx_mach_vector_length ();
	}

      /* Skip the barriers when nothing was propagated and the region
	 is either empty or a call.  */
      if (no_prop_p && empty_loop_p)
	;
      else if (no_prop_p && is_call)
	;
      else
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads),
			    par->forked_insn);
	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    /* Warp-sized vector partitioning: a shuffle suffices.  */
    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
4820
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* For each axis, decide between neutering this par's blocks
     individually and skipping the whole par.  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
4897
/* Fill OA from the "oacc function" attribute of the current function:
   the launch-dimension sizes (num_gangs, num_workers, vector_length)
   and the mask of axes the function is partitioned over.  */

static void
populate_offload_attrs (offload_attrs *oa)
{
  tree attr = oacc_get_fn_attrib (current_function_decl);
  tree dims = TREE_VALUE (attr);
  unsigned ix;

  oa->mask = 0;

  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
    {
      tree t = TREE_VALUE (dims);
      /* A NULL dimension means "dynamic"; record it as -1.  */
      int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
      tree allowed = TREE_PURPOSE (dims);

      if (size != 1 && !(allowed && integer_zerop (allowed)))
	oa->mask |= GOMP_DIM_MASK (ix);

      switch (ix)
	{
	case GOMP_DIM_GANG:
	  oa->num_gangs = size;
	  break;

	case GOMP_DIM_WORKER:
	  oa->num_workers = size;
	  break;

	case GOMP_DIM_VECTOR:
	  oa->vector_length = size;
	  break;
	}
    }
}
4932
#if WORKAROUND_PTXJIT_BUG_2
/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
   is needed in the nvptx target because the branches generated for
   partitioning are NONJUMP_INSN_P, not JUMP_P.  Returns the (set (pc) ...)
   pattern of INSN, or NULL_RTX if there is none.  */

static rtx
nvptx_pc_set (const rtx_insn *insn, bool strict = true)
{
  rtx pat;
  if ((strict && !JUMP_P (insn))
      || (!strict && !INSN_P (insn)))
    return NULL_RTX;
  pat = PATTERN (insn);

  /* The set is allowed to appear either as the insn pattern or
     the first set in a PARALLEL.  */
  if (GET_CODE (pat) == PARALLEL)
    pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
    return pat;

  return NULL_RTX;
}
4956
4957/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4958
4959static rtx
4960nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4961{
4962 rtx x = nvptx_pc_set (insn, strict);
4963
4964 if (!x)
4965 return NULL_RTX;
4966 x = SET_SRC (x);
4967 if (GET_CODE (x) == LABEL_REF)
4968 return x;
4969 if (GET_CODE (x) != IF_THEN_ELSE)
4970 return NULL_RTX;
4971 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4972 return XEXP (x, 1);
4973 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4974 return XEXP (x, 2);
4975 return NULL_RTX;
4976}
4977
/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   insn inbetween the branch and the label.  This works around a JIT bug
   observed at driver version 384.111, at -O0 for sm_50.  */

static void
prevent_branch_around_nothing (void)
{
  /* Label most recently branched to, with no real ptx insn seen since.  */
  rtx_insn *seen_label = NULL;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && condjump_p (insn))
	{
	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
	  continue;
	}

      if (seen_label == NULL)
	continue;

      /* Notes and debug insns emit no ptx; keep scanning.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;

      if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    /* These partitioning markers emit no ptx either.  */
	    continue;
	  default:
	    /* A real insn between branch and label; no workaround needed.  */
	    seen_label = NULL;
	    continue;
	  }

      /* Branch directly to this label with nothing in between: pad it.  */
      if (LABEL_P (insn) && insn == seen_label)
	emit_insn_before (gen_fake_nop (), insn);

      seen_label = NULL;
    }
 }
#endif
5020
#ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48 for sm_50.  */

static void
workaround_barsyncs (void)
{
  /* True while the most recent ptx-emitting insn seen was a bar.sync.  */
  bool seen_barsync = false;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
	{
	  if (seen_barsync)
	    {
	      /* Two consecutive bar.syncs: separate them.  */
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	    }

	  seen_barsync = true;
	  continue;
	}

      if (!seen_barsync)
	continue;

      /* Notes, debug insns and partitioning markers emit no ptx, so they
	 don't break a bar.sync/bar.sync pair.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    break;
	  }

      seen_barsync = false;
    }
}
#endif
5064
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
   unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function  (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, oa.mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
5148\f
5149/* Handle a "kernel" attribute; arguments as in
5150 struct attribute_spec.handler. */
5151
5152static tree
5153nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5154 int ARG_UNUSED (flags), bool *no_add_attrs)
5155{
5156 tree decl = *node;
5157
5158 if (TREE_CODE (decl) != FUNCTION_DECL)
5159 {
5160 error ("%qE attribute only applies to functions", name);
5161 *no_add_attrs = true;
5162 }
b49e35a9 5163 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
5164 {
5165 error ("%qE attribute requires a void return type", name);
5166 *no_add_attrs = true;
5167 }
5168
5169 return NULL_TREE;
5170}
5171
5012919d
AM
5172/* Handle a "shared" attribute; arguments as in
5173 struct attribute_spec.handler. */
5174
5175static tree
5176nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5177 int ARG_UNUSED (flags), bool *no_add_attrs)
5178{
5179 tree decl = *node;
5180
5181 if (TREE_CODE (decl) != VAR_DECL)
5182 {
5183 error ("%qE attribute only applies to variables", name);
5184 *no_add_attrs = true;
5185 }
5186 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5187 {
5188 error ("%qE attribute not allowed with auto storage class", name);
5189 *no_add_attrs = true;
5190 }
5191
5192 return NULL_TREE;
5193}
5194
/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
    NULL },
  { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
    NULL },
  /* Sentinel entry terminating the table.  */
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
5206\f
5207/* Limit vector alignments to BIGGEST_ALIGNMENT. */
5208
5209static HOST_WIDE_INT
5210nvptx_vector_alignment (const_tree type)
5211{
26b538cd
RS
5212 unsigned HOST_WIDE_INT align;
5213 tree size = TYPE_SIZE (type);
738f2522 5214
26b538cd
RS
5215 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5216 if (tree_fits_uhwi_p (size))
5217 {
5218 align = tree_to_uhwi (size);
5219 align = MIN (align, BIGGEST_ALIGNMENT);
5220 }
5221 else
5222 align = BIGGEST_ALIGNMENT;
5223
5224 /* Ensure align is not smaller than mode alignment. */
5225 align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5226
5227 return align;
738f2522 5228}
/* Indicate that INSN cannot be duplicated.  Shuffles, barriers and the
   fork/join partitioning markers must each appear exactly once.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}
/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
738f2522 5262\f
/* Record a symbol for mkoffload to enter into the mapping table.
   Emits a //:VAR_MAP or //:FUNC_MAP comment line into the assembly
   output; for functions, the OpenACC launch dimensions (if any) are
   appended in hex.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
5300
/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file: PTX version, target architecture and address
   size.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  if (TARGET_SM35)
    fputs ("\t.target\tsm_35\n", asm_out_file);
  else
    fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
5316
/* Emit a declaration for a worker and vector-level buffer in .shared
   memory.  SYM names the buffer; ALIGN and SIZE are in bytes.  */

static void
write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}
5329
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  Also emits the reduction buffers and the
   soft-stack / uniform-SIMT external declarations when they were used.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  /* Each buffer is only declared if something requested it (size > 0).  */
  if (oacc_bcast_size)
    write_shared_buffer (asm_out_file, oacc_bcast_sym,
			 oacc_bcast_align, oacc_bcast_size);

  if (worker_red_size)
    write_shared_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (vector_red_size)
    write_shared_buffer (asm_out_file, vector_red_sym,
			 vector_red_align, vector_red_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
         external declaration, emit the array size explicitly; otherwise, it
         may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
5369
/* Expander for the shuffle builtins.  Arguments: the value to shuffle,
   the lane index/shift, and the shuffle kind.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  /* The shuffle index may be a register or an immediate.  */
  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}
5398
/* Print the PTX that sets DST to this function's vector-reduction
   partition base (cfun->machine->red_partition), displaced by the
   constant byte OFFSET.  A zero offset emits a plain mov, otherwise an
   add.  Returns "" as the insn output template.  */

const char *
nvptx_output_red_partition (rtx dst, rtx offset)
{
  const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
  const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";

  if (offset == const0_rtx)
    fprintf (asm_out_file, zero_offset, REGNO (dst),
	     REGNO (cfun->machine->red_partition));
  else
    fprintf (asm_out_file, with_offset, REGNO (dst),
	     REGNO (cfun->machine->red_partition), UINTVAL (offset));

  return "";
}
5414
/* Shared-memory reduction address expander.  EXP's arguments are the
   byte offset, size and alignment of the requested slot.  Grows the
   file-scope bookkeeping for the worker (or, if VECTOR, the per-worker
   vector) reduction buffer and returns the slot's address in TARGET.  */

static rtx
nvptx_expand_shared_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore,
			  int vector)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  rtx addr = worker_red_sym;

  if (vector)
    {
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* The vector buffer holds one partition per worker; round each
	 partition up to the requested alignment.  */
      unsigned int psize = ROUND_UP (size + offset, align);
      unsigned int pnum = nvptx_mach_max_workers ();
      vector_red_partition = MAX (vector_red_partition, psize);
      vector_red_size = MAX (vector_red_size, psize * pnum);
      vector_red_align = MAX (vector_red_align, align);

      if (cfun->machine->red_partition == NULL)
	cfun->machine->red_partition = gen_reg_rtx (Pmode);

      /* Compute this thread's partition base + OFFSET at runtime.  */
      addr = gen_reg_rtx (Pmode);
      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
    }
  else
    {
      worker_red_align = MAX (worker_red_align, align);
      worker_red_size = MAX (worker_red_size, size + offset);

      if (offset)
	{
	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
	  addr = gen_rtx_CONST (Pmode, addr);
	}
    }

  emit_move_insn (target, addr);
  return target;
}
5463
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  Arguments: pointer to the cell, expected
   value, new value.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Pick the SImode or DImode pattern from the builtin's result type.  */
  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
5500
5501
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_VECTOR_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

/* Builtin decls, indexed by nvptx_builtins code; GC-rooted via GTY.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5515
5516/* Return the NVPTX builtin for CODE. */
5517
5518static tree
5519nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5520{
5521 if (code >= NVPTX_BUILTIN_MAX)
5522 return error_mark_node;
5523
5524 return nvptx_builtin_decls[code];
5525}
5526
/* Set up all builtin functions for this target.  The DEF macro registers
   one builtin under the "__builtin_nvptx_" prefix; the short type macros
   below keep the signature lists readable.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (VECTOR_ADDR, "vector_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
5557
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  Dispatches on the
   nvptx_builtins code to the matching expander above.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_MD_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, false);

    case NVPTX_BUILTIN_VECTOR_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, true);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
f3552158 5588
/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_WARP_SIZE;
}
5596
9ba66bf5
JJ
5597/* Return 1 if TRAIT NAME is present in the OpenMP context's
5598 device trait set, return 0 if not present in any OpenMP context in the
5599 whole translation unit, or -1 if not present in the current OpenMP context
5600 but might be present in another OpenMP context in the same TU. */
5601
5602int
5603nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5604 const char *name)
5605{
5606 switch (trait)
5607 {
5608 case omp_device_kind:
5609 return strcmp (name, "gpu") == 0;
5610 case omp_device_arch:
5611 return strcmp (name, "nvptx") == 0;
5612 case omp_device_isa:
5613 if (strcmp (name, "sm_30") == 0)
5614 return !TARGET_SM35;
5615 if (strcmp (name, "sm_35") == 0)
5616 return TARGET_SM35;
5617 return 0;
5618 default:
5619 gcc_unreachable ();
5620 }
5621}
5622
/* Return true if L is a well-formed vector length for this target:
   a positive multiple of the warp size.  */

static bool
nvptx_welformed_vector_length_p (int l)
{
  gcc_assert (l > 0);
  return l % PTX_WARP_SIZE == 0;
}
5629
/* Clamp the launch dimensions in DIMS to the target's hard limits.
   Entries <= 0 (unset/dynamic) are left alone by each check.  */

static void
nvptx_apply_dim_limits (int dims[])
{
  /* Check that the vector_length is not too large.  */
  if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
    dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;

  /* Check that the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;

  /* Ensure that num_worker * vector_length <= cta size.  */
  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
    dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

  /* If we need a per-worker barrier ... .  */
  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
    /* Don't use more barriers than available.  */
    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
				 PTX_NUM_PER_WORKER_BARRIERS);
}
5653
/* Return true if FNDECL contains calls to vector-partitionable routines.  */

static bool
has_vector_partitionable_routine_calls_p (tree fndecl)
{
  if (!fndecl)
    return false;

  basic_block bb;
  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
	 gsi_next_nondebug (&i))
      {
	gimple *stmt = gsi_stmt (i);
	if (gimple_code (stmt) != GIMPLE_CALL)
	  continue;

	tree callee = gimple_call_fndecl (stmt);
	if (!callee)
	  continue;

	/* Only calls bearing the "oacc function" attribute are routines.  */
	tree attrs  = oacc_get_fn_attrib (callee);
	if (attrs == NULL_TREE)
	  return false;

	/* A routine partitioned at GOMP_DIM_MAX is 'seq'; anything lower
	   can use vector partitioning.  */
	int partition_level = oacc_fn_attrib_level (attrs);
	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
	if (!seq_routine_p)
	  return true;
      }

  return false;
}
5687
/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
   DIMS has changed.  */

static void
nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
{
  /* Classify the caller's situation from DECL/FN_LEVEL; exactly one of
     these flags ends up set.  */
  bool oacc_default_dims_p = false;
  bool oacc_min_dims_p = false;
  bool offload_region_p = false;
  bool routine_p = false;
  bool routine_seq_p = false;
  int default_vector_length = -1;

  if (decl == NULL_TREE)
    {
      if (fn_level == -1)
	oacc_default_dims_p = true;
      else if (fn_level == -2)
	oacc_min_dims_p = true;
      else
	gcc_unreachable ();
    }
  else if (fn_level == -1)
    offload_region_p = true;
  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
    {
      routine_p = true;
      routine_seq_p = fn_level == GOMP_DIM_MAX;
    }
  else
    gcc_unreachable ();

  if (oacc_min_dims_p)
    {
      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
      gcc_assert (dims[GOMP_DIM_GANG] == 1);

      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
      return;
    }

  if (routine_p)
    {
      if (!routine_seq_p)
	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

      return;
    }

  if (oacc_default_dims_p)
    {
      /* -1 : not set
	 0 : set at runtime, f.i. -fopenacc-dims=-
	 >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);

      /* But -fopenacc-dims=- is not yet supported on trunk.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
      gcc_assert (dims[GOMP_DIM_GANG] != 0);
    }

  if (offload_region_p)
    {
      /* -1 : not set
	 0 : set using variable, f.i. num_gangs (n)
	 >= 1: set using constant, f.i. num_gangs (1).  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
    }

  if (offload_region_p)
    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
  else
    /* oacc_default_dims_p.  */
    default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;

  /* Snapshot the incoming dims so changes can be diagnosed below.  */
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;
  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  const char *vector_reason = NULL;
  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
    {
      default_vector_length = PTX_WARP_SIZE;

      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
	{
	  vector_reason = G_("using vector_length (%d) due to call to"
			     " vector-partitionable routine, ignoring %d");
	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
	}
    }

  if (dims[GOMP_DIM_VECTOR] == 0)
    {
      vector_reason = G_("using vector_length (%d), ignoring runtime setting");
      dims[GOMP_DIM_VECTOR] = default_vector_length;
    }

  if (dims[GOMP_DIM_VECTOR] > 0
      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
    dims[GOMP_DIM_VECTOR] = default_vector_length;

  nvptx_apply_dim_limits (dims);

  /* Diagnose any dimension we had to change.  */
  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		vector_reason != NULL
		? vector_reason
		: G_("using vector_length (%d), ignoring %d"),
		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);

  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		G_("using num_workers (%d), ignoring %d"),
		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);

  if (oacc_default_dims_p)
    {
      if (dims[GOMP_DIM_VECTOR] < 0)
	dims[GOMP_DIM_VECTOR] = default_vector_length;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      nvptx_apply_dim_limits (dims);
    }

  if (offload_region_p)
    {
      for (i = 0; i < GOMP_DIM_MAX; i++)
	{
	  if (!(dims[i] < 0))
	    continue;

	  if ((used & GOMP_DIM_MASK (i)) == 0)
	    /* Function oacc_validate_dims will apply the minimal dimension.  */
	    continue;

	  dims[i] = (i == GOMP_DIM_VECTOR
		     ? default_vector_length
		     : oacc_get_default_dim (i));
	}

      nvptx_apply_dim_limits (dims);
    }
}
5841
/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  Returns
   true iff any entry of DIMS was changed.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
{
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;

  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);

  /* Sanity-check the worker helper's output: vector_length is decided,
     and the CTA capacity is respected.  */
  gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
    gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);

  for (i = 0; i < GOMP_DIM_MAX; ++i)
    if (old_dims[i] != dims[i])
      return true;

  return false;
}
d88cd9c4 5868
bd751975
NS
5869/* Return maximum dimension size, or zero for unbounded. */
5870
5871static int
5872nvptx_dim_limit (int axis)
5873{
5874 switch (axis)
5875 {
bd751975 5876 case GOMP_DIM_VECTOR:
764ecad4 5877 return PTX_MAX_VECTOR_LENGTH;
bd751975
NS
5878
5879 default:
5880 break;
5881 }
5882 return 0;
5883}
5884
d88cd9c4
NS
5885/* Determine whether fork & joins are needed. */
5886
5887static bool
5888nvptx_goacc_fork_join (gcall *call, const int dims[],
5889 bool ARG_UNUSED (is_fork))
5890{
5891 tree arg = gimple_call_arg (call, 2);
5892 unsigned axis = TREE_INT_CST_LOW (arg);
5893
5894 /* We only care about worker and vector partitioning. */
5895 if (axis < GOMP_DIM_WORKER)
5896 return false;
5897
5898 /* If the size is 1, there's no partitioning. */
5899 if (dims[axis] == 1)
5900 return false;
5901
5902 return true;
5903}
5904
/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  With VECTOR set, use the vector-level
   buffer builtin instead.  */

static tree
nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
{
  enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
  if (vector)
    addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (addr_dim, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  /* The builtin returns void*; view it as a pointer to TYPE.  */
  return fold_convert (build_pointer_type (type), call);
}
5924
5925/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5926 will cast the variable if necessary. */
5927
5928static void
5929nvptx_generate_vector_shuffle (location_t loc,
5930 tree dest_var, tree var, unsigned shift,
5931 gimple_seq *seq)
5932{
5933 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5934 tree_code code = NOP_EXPR;
dd3c1b14
NS
5935 tree arg_type = unsigned_type_node;
5936 tree var_type = TREE_TYPE (var);
5937 tree dest_type = var_type;
f3552158 5938
dd3c1b14
NS
5939 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5940 var_type = TREE_TYPE (var_type);
5941
5942 if (TREE_CODE (var_type) == REAL_TYPE)
f3552158 5943 code = VIEW_CONVERT_EXPR;
dd3c1b14
NS
5944
5945 if (TYPE_SIZE (var_type)
5946 == TYPE_SIZE (long_long_unsigned_type_node))
f3552158
NS
5947 {
5948 fn = NVPTX_BUILTIN_SHUFFLELL;
dd3c1b14 5949 arg_type = long_long_unsigned_type_node;
f3552158 5950 }
dd3c1b14 5951
f3552158 5952 tree call = nvptx_builtin_decl (fn, true);
dd3c1b14
NS
5953 tree bits = build_int_cst (unsigned_type_node, shift);
5954 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5955 tree expr;
5956
5957 if (var_type != dest_type)
5958 {
5959 /* Do real and imaginary parts separately. */
5960 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5961 real = fold_build1 (code, arg_type, real);
5962 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5963 real = fold_build1 (code, var_type, real);
f3552158 5964
dd3c1b14
NS
5965 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5966 imag = fold_build1 (code, arg_type, imag);
5967 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5968 imag = fold_build1 (code, var_type, imag);
5969
5970 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5971 }
5972 else
5973 {
5974 expr = fold_build1 (code, arg_type, var);
5975 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5976 expr = fold_build1 (code, dest_type, expr);
5977 }
f3552158 5978
dd3c1b14 5979 gimplify_assign (dest_var, expr, seq);
f3552158
NS
5980}
5981
33f47f42
NS
5982/* Lazily generate the global lock var decl and return its address. */
5983
5984static tree
5985nvptx_global_lock_addr ()
5986{
5987 tree v = global_lock_var;
5988
5989 if (!v)
5990 {
5991 tree name = get_identifier ("__reduction_lock");
5992 tree type = build_qualified_type (unsigned_type_node,
5993 TYPE_QUAL_VOLATILE);
5994 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5995 global_lock_var = v;
5996 DECL_ARTIFICIAL (v) = 1;
5997 DECL_EXTERNAL (v) = 1;
5998 TREE_STATIC (v) = 1;
5999 TREE_PUBLIC (v) = 1;
6000 TREE_USED (v) = 1;
6001 mark_addressable (v);
6002 mark_decl_referenced (v);
6003 }
6004
6005 return build_fold_addr_expr (v);
6006}
6007
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all case, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* Float and complex payloads must be bit-cast (VIEW_CONVERT), not
     value-converted, into the integer cmp&swap argument type.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  /* 64-bit payloads use the long-long flavor of cmp&swap.  */
  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Seed the loop with initval(OP), converted to the argument type.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* EXPECT_VAR is the loop-carried "guess"; it becomes a PHI below.  */
  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation:
     write = (arg_type) ((var_type) expect OP var).  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  /* Loop exits when the swap observed exactly the expected value.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* The fallthru edge out of the latch becomes the TRUE (exit) edge;
     a FALSE back-edge re-runs the loop body.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* guess = PHI <initval (preheader), actual (backedge)>.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the new single-block loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  /* Return the value that was successfully written, in VAR's type.  */
  return fold_build1 (code, var_type, write_var);
}
6110
6111/* Insert code to lockfully update *PTR with *PTR OP VAR just before
6112 GSI. This is necessary for types larger than 64 bits, where there
6113 is no cmp&swap instruction to implement a lockless scheme. We use
6114 a lock variable in global memory.
6115
6116 while (cmp&swap (&lock_var, 0, 1))
6117 continue;
6118 T accum = *ptr;
6119 accum = accum OP var;
6120 *ptr = accum;
6121 cmp&swap (&lock_var, 1, 0);
6122 return accum;
6123
6124 A lock in global memory is necessary to force execution engine
6125 descheduling and avoid resource starvation that can occur if the
6126 lock is in .shared memory. */
6127
6128static tree
6129nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6130 tree ptr, tree var, tree_code op)
6131{
6132 tree var_type = TREE_TYPE (var);
6133 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6134 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6135 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6136
6137 /* Split the block just before the gsi. Insert a gimple nop to make
6138 this easier. */
6139 gimple *nop = gimple_build_nop ();
6140 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6141 basic_block entry_bb = gsi_bb (*gsi);
6142 edge entry_edge = split_block (entry_bb, nop);
6143 basic_block lock_bb = entry_edge->dest;
6144 /* Reset the iterator. */
6145 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6146
6147 /* Build and insert the locking sequence. */
6148 gimple_seq lock_seq = NULL;
6149 tree lock_var = make_ssa_name (unsigned_type_node);
6150 tree lock_expr = nvptx_global_lock_addr ();
6151 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6152 uns_unlocked, uns_locked);
6153 gimplify_assign (lock_var, lock_expr, &lock_seq);
6154 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6155 NULL_TREE, NULL_TREE);
6156 gimple_seq_add_stmt (&lock_seq, cond);
6157 gimple *lock_end = gimple_seq_last (lock_seq);
6158 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6159
6160 /* Split the block just after the lock sequence. */
6161 edge locked_edge = split_block (lock_bb, lock_end);
6162 basic_block update_bb = locked_edge->dest;
6163 lock_bb = locked_edge->src;
6164 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6165
6166 /* Create the lock loop ... */
6167 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5
TV
6168 locked_edge->probability = profile_probability::even ();
6169 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6170 loop_edge->probability = profile_probability::even ();
33f47f42
NS
6171 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6172 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6173
6174 /* ... and the loop structure. */
6175 loop *lock_loop = alloc_loop ();
6176 lock_loop->header = lock_bb;
6177 lock_loop->latch = lock_bb;
6178 lock_loop->nb_iterations_estimate = 1;
6179 lock_loop->any_estimate = true;
6180 add_loop (lock_loop, entry_bb->loop_father);
6181
6182 /* Build and insert the reduction calculation. */
6183 gimple_seq red_seq = NULL;
6184 tree acc_in = make_ssa_name (var_type);
6185 tree ref_in = build_simple_mem_ref (ptr);
6186 TREE_THIS_VOLATILE (ref_in) = 1;
6187 gimplify_assign (acc_in, ref_in, &red_seq);
6188
6189 tree acc_out = make_ssa_name (var_type);
6190 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6191 gimplify_assign (acc_out, update_expr, &red_seq);
6192
6193 tree ref_out = build_simple_mem_ref (ptr);
6194 TREE_THIS_VOLATILE (ref_out) = 1;
6195 gimplify_assign (ref_out, acc_out, &red_seq);
6196
6197 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6198
6199 /* Build & insert the unlock sequence. */
6200 gimple_seq unlock_seq = NULL;
6201 tree unlock_expr = nvptx_global_lock_addr ();
6202 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6203 uns_locked, uns_unlocked);
6204 gimplify_and_add (unlock_expr, &unlock_seq);
6205 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6206
6207 return acc_out;
6208}
6209
6210/* Emit a sequence to update a reduction accumlator at *PTR with the
6211 value held in VAR using operator OP. Return the updated value.
6212
6213 TODO: optimize for atomic ops and indepedent complex ops. */
6214
6215static tree
6216nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6217 tree ptr, tree var, tree_code op)
6218{
6219 tree type = TREE_TYPE (var);
6220 tree size = TYPE_SIZE (type);
6221
6222 if (size == TYPE_SIZE (unsigned_type_node)
6223 || size == TYPE_SIZE (long_long_unsigned_type_node))
6224 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6225 else
6226 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
6227}
6228
/* NVPTX implementation of GOACC_REDUCTION_SETUP.  Replaces the
   internal-function call CALL with gimple that (a) redirects VAR to
   the receiver object for non-gang levels, and (b) for worker (or
   wide-vector) partitioning stores the incoming value into the
   shared reduction buffer.  OA describes the offload partitioning
   (only vector_length is consulted here).  */

static void
nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);          /* Local reduction var.  */
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      /* A zero ref_to_res means there is no receiver object.  */
      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  /* Vector-level reductions only need the buffer when the vector is
     wider than a warp; warp-sized vectors use shuffles instead.  */
  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      /* Volatile: the buffer is shared between threads.  */
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  /* Swap the IFN call out for the generated sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
6272
/* NVPTX implementation of GOACC_REDUCTION_INIT.  Replaces CALL with
   code that seeds LHS with the reduction's initial value.  For a
   warp-sized vector reduction this is done conditionally: lane 0
   keeps the incoming VAR while the other lanes get initval(OP),
   which requires splitting the block and building a diamond with a
   PHI.  OA supplies the offload partitioning attributes.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  /* The operator-specific neutral element (0 for +, 1 for *, ...).  */
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      /* Arg 3 is the (vector) dimension; reuse it for GOACC_DIM_POS.  */
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block: lanes != 0 contribute INIT_VAR,
	 lane 0 contributes the original VAR.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      if (lhs != NULL_TREE)
	gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  /* Replace the IFN call (SEQ is empty on the diamond path, which
     already inserted its statements directly).  */
  gsi_replace_with_seq (&gsi, seq, true);
}
6357
6358/* NVPTX implementation of GOACC_REDUCTION_FINI. */
6359
6360static void
f881693c 6361nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
f3552158
NS
6362{
6363 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6364 tree lhs = gimple_call_lhs (call);
6365 tree ref_to_res = gimple_call_arg (call, 1);
6366 tree var = gimple_call_arg (call, 2);
6367 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6368 enum tree_code op
6369 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6370 gimple_seq seq = NULL;
6371 tree r = NULL_TREE;;
6372
6373 push_gimplify_context (true);
6374
f881693c 6375 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6376 {
6377 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6378 but that requires a method of emitting a unified jump at the
6379 gimple level. */
764ecad4 6380 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
f3552158
NS
6381 {
6382 tree other_var = make_ssa_name (TREE_TYPE (var));
6383 nvptx_generate_vector_shuffle (gimple_location (call),
6384 other_var, var, shfl, &seq);
6385
6386 r = make_ssa_name (TREE_TYPE (var));
6387 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6388 var, other_var), &seq);
6389 var = r;
6390 }
6391 }
6392 else
6393 {
6394 tree accum = NULL_TREE;
6395
f881693c 6396 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
f3552158
NS
6397 {
6398 /* Get reduction buffer address. */
6399 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6400 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6401 level == GOMP_DIM_VECTOR);
f3552158
NS
6402 tree ptr = make_ssa_name (TREE_TYPE (call));
6403
6404 gimplify_assign (ptr, call, &seq);
6405 accum = ptr;
6406 }
6407 else if (integer_zerop (ref_to_res))
6408 r = var;
6409 else
6410 accum = ref_to_res;
6411
6412 if (accum)
6413 {
33f47f42 6414 /* UPDATE the accumulator. */
f3552158
NS
6415 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6416 seq = NULL;
33f47f42
NS
6417 r = nvptx_reduction_update (gimple_location (call), &gsi,
6418 accum, var, op);
f3552158
NS
6419 }
6420 }
6421
6422 if (lhs)
6423 gimplify_assign (lhs, r, &seq);
6424 pop_gimplify_context (NULL);
6425
6426 gsi_replace_with_seq (&gsi, seq, true);
6427}
6428
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  Replaces CALL
   with code that reads the final value back out of the shared
   reduction buffer (worker / wide-vector levels) and writes it to
   the receiver object for non-gang levels.  OA supplies the offload
   partitioning attributes.  */

static void
nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  /* Warp-sized vector reductions finished entirely in registers via
     shuffles, so only worker / wide-vector levels have a buffer.  */
  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      /* Volatile: the buffer is shared between threads.  */
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      /* A zero ref_to_res means there is no receiver object.  */
      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
6471
6472/* NVPTX reduction expander. */
6473
5563d5c0 6474static void
f3552158
NS
6475nvptx_goacc_reduction (gcall *call)
6476{
6477 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
f881693c
TV
6478 offload_attrs oa;
6479
6480 populate_offload_attrs (&oa);
f3552158
NS
6481
6482 switch (code)
6483 {
6484 case IFN_GOACC_REDUCTION_SETUP:
f881693c 6485 nvptx_goacc_reduction_setup (call, &oa);
f3552158
NS
6486 break;
6487
6488 case IFN_GOACC_REDUCTION_INIT:
f881693c 6489 nvptx_goacc_reduction_init (call, &oa);
f3552158
NS
6490 break;
6491
6492 case IFN_GOACC_REDUCTION_FINI:
f881693c 6493 nvptx_goacc_reduction_fini (call, &oa);
f3552158
NS
6494 break;
6495
6496 case IFN_GOACC_REDUCTION_TEARDOWN:
f881693c 6497 nvptx_goacc_reduction_teardown (call, &oa);
f3552158
NS
6498 break;
6499
6500 default:
6501 gcc_unreachable ();
6502 }
6503}
6504
e053f710
TV
6505static bool
6506nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6507 rtx x ATTRIBUTE_UNUSED)
6508{
6509 return true;
6510}
6511
b98b34b7
TV
6512static bool
6513nvptx_vector_mode_supported (machine_mode mode)
6514{
3717fbe3
TV
6515 return (mode == V2SImode
6516 || mode == V2DImode);
6517}
6518
6519/* Return the preferred mode for vectorizing scalar MODE. */
6520
6521static machine_mode
005ba29c 6522nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
6523{
6524 switch (mode)
6525 {
4e10a5a7 6526 case E_DImode:
3717fbe3 6527 return V2DImode;
4e10a5a7 6528 case E_SImode:
3717fbe3
TV
6529 return V2SImode;
6530
6531 default:
6532 return default_preferred_simd_mode (mode);
6533 }
6534}
6535
6536unsigned int
6537nvptx_data_alignment (const_tree type, unsigned int basic_align)
6538{
6539 if (TREE_CODE (type) == INTEGER_TYPE)
6540 {
6541 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6542 if (size == GET_MODE_SIZE (TImode))
6543 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6544 }
6545
6546 return basic_align;
b98b34b7
TV
6547}
6548
99e1629f
RS
6549/* Implement TARGET_MODES_TIEABLE_P. */
6550
6551static bool
6552nvptx_modes_tieable_p (machine_mode, machine_mode)
6553{
6554 return false;
6555}
6556
c43f4279
RS
6557/* Implement TARGET_HARD_REGNO_NREGS. */
6558
6559static unsigned int
6560nvptx_hard_regno_nregs (unsigned int, machine_mode)
6561{
6562 return 1;
6563}
6564
0d803030
RS
6565/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6566
6567static bool
6568nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6569{
6570 return false;
6571}
6572
3a4a9259
RS
6573/* Implement TARGET_TRULY_NOOP_TRUNCATION. */
6574
6575static bool
6576nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
6577{
6578 return false;
6579}
6580
43be05f5
TV
6581static GTY(()) tree nvptx_previous_fndecl;
6582
6583static void
6584nvptx_set_current_function (tree fndecl)
6585{
6586 if (!fndecl || fndecl == nvptx_previous_fndecl)
6587 return;
6588
6589 nvptx_previous_fndecl = fndecl;
f881693c 6590 vector_red_partition = 0;
0024c320 6591 oacc_bcast_partition = 0;
43be05f5
TV
6592}
6593
bae974e6
TV
6594/* Implement TARGET_LIBC_HAS_FUNCTION. */
6595
6596bool
6597nvptx_libc_has_function (enum function_class fn_class, tree type)
6598{
6599 if (fn_class == function_sincos)
6600 {
6601 if (type != NULL_TREE)
6602 /* Currently, newlib does not support sincosl. */
6603 return type == float_type_node || type == double_type_node;
6604 else
6605 return true;
6606 }
6607
6608 return default_libc_has_function (fn_class, type);
6609}
6610
/* Target hook vector.  Each TARGET_* macro is redefined to the nvptx
   implementation (or a generic hook_* / constant) before TARGET_INITIALIZER
   collects them all into TARGETM below.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Calling-convention hooks.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly-output hooks.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

/* Builtin hooks.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa

/* OpenACC offload hooks.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function

struct gcc_target targetm = TARGET_INITIALIZER;

/* Garbage-collector roots generated for this file.  */
#include "gt-nvptx.h"