]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
sse.md (vec_extract<mode><ssehalfvecmodelower>): Use V_256_512 iterator instead of...
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
a5544970 2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19
NS
61#include "omp-low.h"
62#include "gomp-constants.h"
d88cd9c4 63#include "dumpfile.h"
f3552158
NS
64#include "internal-fn.h"
65#include "gimple-iterator.h"
66#include "stringpool.h"
314e6352 67#include "attribs.h"
f2d4d839 68#include "tree-vrp.h"
f3552158
NS
69#include "tree-ssa-operands.h"
70#include "tree-ssanames.h"
71#include "gimplify.h"
72#include "tree-phinodes.h"
73#include "cfgloop.h"
74#include "fold-const.h"
50c620ca 75#include "intl.h"
738f2522 76
994c5d85 77/* This file should be included last. */
d58627a0
RS
78#include "target-def.h"
79
43c371e8 80#define WORKAROUND_PTXJIT_BUG 1
3dede32b 81#define WORKAROUND_PTXJIT_BUG_2 1
21251395 82#define WORKAROUND_PTXJIT_BUG_3 1
43c371e8 83
5d17a476 84#define PTX_WARP_SIZE 32
764ecad4
TV
85#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
86#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
693ad66b
TV
87#define PTX_WORKER_LENGTH 32
88#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5d17a476 89
c4686fa9
TV
90/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
91 block, which has had a maximum number of threads of 1024 since CUDA version
92 2.x. */
93#define PTX_CTA_SIZE 1024
94
9a863523
NS
/* PTX state spaces in which an object can live.  The enumerator order
   is significant: section_for_sym indexes its table of directive
   names with these values.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,  /* No state-space directive (empty string).  */
  DATA_AREA_GLOBAL,   /* ".global".  */
  DATA_AREA_SHARED,   /* ".shared".  */
  DATA_AREA_LOCAL,    /* ".local".  */
  DATA_AREA_CONST,    /* ".const".  */
  DATA_AREA_PARAM,    /* ".param".  */
  DATA_AREA_MAX       /* Sentinel following the last real area.  */
};
106
107/* We record the data area in the target symbol flags. */
108#define SYMBOL_DATA_AREA(SYM) \
109 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
110 & 7)
111#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
112 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
113
738f2522
BS
114/* Record the function decls we've written, and the libfuncs and function
115 decls corresponding to them. */
116static std::stringstream func_decls;
f3dba894 117
6c907cff 118struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
119{
120 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
121 static bool equal (rtx a, rtx b) { return a == b; }
122};
123
124static GTY((cache))
125 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
126
6c907cff 127struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
128{
129 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
130 static bool equal (tree a, tree b) { return a == b; }
131};
132
133static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
134static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 135
1ed57fb8
TV
136/* Buffer needed to broadcast across workers and vectors. This is
137 used for both worker-neutering and worker broadcasting, and
138 vector-neutering and broadcasting when vector_length > 32. It is
139 shared by all functions emitted. The buffer is placed in shared
140 memory. It'd be nice if PTX supported common blocks, because then
141 this could be shared across TUs (taking the largest size). */
142static unsigned oacc_bcast_size;
0024c320 143static unsigned oacc_bcast_partition;
1ed57fb8
TV
144static unsigned oacc_bcast_align;
145static GTY(()) rtx oacc_bcast_sym;
d88cd9c4 146
f3552158
NS
147/* Buffer needed for worker reductions. This has to be distinct from
148 the worker broadcast array, as both may be live concurrently. */
149static unsigned worker_red_size;
150static unsigned worker_red_align;
f3552158
NS
151static GTY(()) rtx worker_red_sym;
152
f881693c
TV
153/* Buffer needed for vector reductions, when vector_length >
154 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
155 array, as both may be live concurrently. */
156static unsigned vector_red_size;
157static unsigned vector_red_align;
158static unsigned vector_red_partition;
159static GTY(()) rtx vector_red_sym;
160
33f47f42
NS
161/* Global lock variable, needed for 128bit worker & gang reductions. */
162static GTY(()) tree global_lock_var;
163
5012919d
AM
164/* True if any function references __nvptx_stacks. */
165static bool need_softstack_decl;
166
167/* True if any function references __nvptx_uni. */
168static bool need_unisimt_decl;
169
0024c320
TV
170static int nvptx_mach_max_workers ();
171
738f2522
BS
172/* Allocate a new, cleared machine_function structure. */
173
174static struct machine_function *
175nvptx_init_machine_status (void)
176{
177 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 178 p->return_mode = VOIDmode;
738f2522
BS
179 return p;
180}
181
5012919d
AM
182/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
183 and -fopenacc is also enabled. */
184
185static void
186diagnose_openacc_conflict (bool optval, const char *optname)
187{
188 if (flag_openacc && optval)
189 error ("option %s is not supported together with -fopenacc", optname);
190}
191
738f2522
BS
192/* Implement TARGET_OPTION_OVERRIDE. */
193
194static void
195nvptx_option_override (void)
196{
197 init_machine_status = nvptx_init_machine_status;
ae578943 198
3d339d5e
AM
199 /* Set toplevel_reorder, unless explicitly disabled. We need
200 reordering so that we emit necessary assembler decls of
201 undeclared variables. */
202 if (!global_options_set.x_flag_toplevel_reorder)
203 flag_toplevel_reorder = 1;
204
fa9afdc3
TV
205 debug_nonbind_markers_p = 0;
206
ae578943
NS
207 /* Set flag_no_common, unless explicitly disabled. We fake common
208 using .weak, and that's not entirely accurate, so avoid it
209 unless forced. */
210 if (!global_options_set.x_flag_no_common)
211 flag_no_common = 1;
212
cad136a4
TV
213 /* The patch area requires nops, which we don't have. */
214 if (function_entry_patch_area_size > 0)
215 sorry ("not generating patch area, nops not supported");
216
738f2522
BS
217 /* Assumes that it will see only hard registers. */
218 flag_var_tracking = 0;
1e5154e7 219
dba619f3
NS
220 if (nvptx_optimize < 0)
221 nvptx_optimize = optimize > 0;
222
f3dba894
TS
223 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
224 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
738f2522 225 declared_libfuncs_htab
f3dba894 226 = hash_table<declared_libfunc_hasher>::create_ggc (17);
d88cd9c4 227
1ed57fb8
TV
228 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
229 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
230 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
0024c320 231 oacc_bcast_partition = 0;
f3552158 232
15ab6f00 233 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
9a863523 234 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
f3552158 235 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
5012919d 236
f881693c
TV
237 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
238 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
239 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
240 vector_red_partition = 0;
241
5012919d
AM
242 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
243 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
244 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
245
246 if (TARGET_GOMP)
247 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
738f2522
BS
248}
249
738f2522
BS
250/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
251 deal with ptx ideosyncracies. */
252
253const char *
254nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
255{
256 switch (mode)
257 {
4e10a5a7 258 case E_BLKmode:
738f2522 259 return ".b8";
4e10a5a7 260 case E_BImode:
738f2522 261 return ".pred";
4e10a5a7 262 case E_QImode:
738f2522
BS
263 if (promote)
264 return ".u32";
265 else
266 return ".u8";
4e10a5a7 267 case E_HImode:
738f2522 268 return ".u16";
4e10a5a7 269 case E_SImode:
738f2522 270 return ".u32";
4e10a5a7 271 case E_DImode:
738f2522
BS
272 return ".u64";
273
4e10a5a7 274 case E_SFmode:
738f2522 275 return ".f32";
4e10a5a7 276 case E_DFmode:
738f2522
BS
277 return ".f64";
278
4e10a5a7 279 case E_V2SImode:
b98b34b7 280 return ".v2.u32";
4e10a5a7 281 case E_V2DImode:
3717fbe3 282 return ".v2.u64";
b98b34b7 283
738f2522
BS
284 default:
285 gcc_unreachable ();
286 }
287}
288
9a863523
NS
289/* Encode the PTX data area that DECL (which might not actually be a
290 _DECL) should reside in. */
7b8edc29 291
9a863523
NS
292static void
293nvptx_encode_section_info (tree decl, rtx rtl, int first)
7b8edc29 294{
9a863523
NS
295 default_encode_section_info (decl, rtl, first);
296 if (first && MEM_P (rtl))
297 {
298 nvptx_data_area area = DATA_AREA_GENERIC;
7b8edc29 299
9a863523
NS
300 if (TREE_CONSTANT (decl))
301 area = DATA_AREA_CONST;
302 else if (TREE_CODE (decl) == VAR_DECL)
5012919d
AM
303 {
304 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
305 {
306 area = DATA_AREA_SHARED;
307 if (DECL_INITIAL (decl))
308 error ("static initialization of variable %q+D in %<.shared%>"
309 " memory is not supported", decl);
310 }
311 else
312 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
313 }
7b8edc29 314
9a863523
NS
315 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
316 }
317}
318
319/* Return the PTX name of the data area in which SYM should be
320 placed. The symbol must have already been processed by
321 nvptx_encode_seciton_info, or equivalent. */
322
323static const char *
324section_for_sym (rtx sym)
325{
326 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
327 /* Same order as nvptx_data_area enum. */
328 static char const *const areas[] =
329 {"", ".global", ".shared", ".local", ".const", ".param"};
330
331 return areas[area];
332}
333
334/* Similarly for a decl. */
335
336static const char *
337section_for_decl (const_tree decl)
338{
339 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
340}
341
b4346107
NS
/* Map special function names onto their replacements and return the
   result; any other NAME is returned unchanged.  malloc, free and
   realloc are redirected because we want to use libgcc wrappers, and
   call because it triggers a bug in ptxas.
   TARGET_MANGLE_DECL_ASSEMBLER_NAME cannot be used for this, as it is
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct
  {
    const char *from;
    const char *to;
  } redirects[] =
    {
      { "call", "__nvptx_call" },
      { "malloc", "__nvptx_malloc" },
      { "free", "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (size_t i = 0; i < sizeof redirects / sizeof redirects[0]; i++)
    if (strcmp (name, redirects[i].from) == 0)
      return redirects[i].to;

  return name;
}
362
d7479262
NS
363/* If MODE should be treated as two registers of an inner mode, return
364 that inner mode. Otherwise return VOIDmode. */
738f2522 365
d7479262
NS
366static machine_mode
367maybe_split_mode (machine_mode mode)
738f2522 368{
738f2522 369 if (COMPLEX_MODE_P (mode))
d7479262 370 return GET_MODE_INNER (mode);
738f2522 371
738f2522 372 if (mode == TImode)
d7479262
NS
373 return DImode;
374
375 return VOIDmode;
738f2522
BS
376}
377
38827811
TV
378/* Return true if mode should be treated as two registers. */
379
380static bool
381split_mode_p (machine_mode mode)
382{
383 return maybe_split_mode (mode) != VOIDmode;
384}
385
f313d112
NS
386/* Output a register, subreg, or register pair (with optional
387 enclosing braces). */
388
389static void
390output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
391 int subreg_offset = -1)
392{
393 if (inner_mode == VOIDmode)
394 {
395 if (HARD_REGISTER_NUM_P (regno))
396 fprintf (file, "%s", reg_names[regno]);
397 else
398 fprintf (file, "%%r%d", regno);
399 }
400 else if (subreg_offset >= 0)
401 {
402 output_reg (file, regno, VOIDmode);
403 fprintf (file, "$%d", subreg_offset);
404 }
405 else
406 {
407 if (subreg_offset == -1)
408 fprintf (file, "{");
409 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
410 fprintf (file, ",");
411 output_reg (file, regno, inner_mode, 0);
412 if (subreg_offset == -1)
413 fprintf (file, "}");
414 }
415}
416
d88cd9c4
NS
417/* Emit forking instructions for MASK. */
418
419static void
420nvptx_emit_forking (unsigned mask, bool is_call)
421{
422 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
423 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
424 if (mask)
425 {
426 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
427
428 /* Emit fork at all levels. This helps form SESE regions, as
429 it creates a block with a single successor before entering a
430 partitooned region. That is a good candidate for the end of
431 an SESE region. */
e91eba31 432 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
433 emit_insn (gen_nvptx_forked (op));
434 }
435}
436
437/* Emit joining instructions for MASK. */
438
439static void
440nvptx_emit_joining (unsigned mask, bool is_call)
441{
442 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
443 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
444 if (mask)
445 {
446 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
447
448 /* Emit joining for all non-call pars to ensure there's a single
449 predecessor for the block the join insn ends up in. This is
450 needed for skipping entire loops. */
e91eba31 451 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
452 emit_insn (gen_nvptx_join (op));
453 }
454}
455
738f2522 456\f
44eba92d
NS
457/* Determine whether MODE and TYPE (possibly NULL) should be passed or
458 returned in memory. Integer and floating types supported by the
459 machine are passed in registers, everything else is passed in
460 memory. Complex types are split. */
461
462static bool
463pass_in_memory (machine_mode mode, const_tree type, bool for_return)
464{
465 if (type)
466 {
467 if (AGGREGATE_TYPE_P (type))
468 return true;
469 if (TREE_CODE (type) == VECTOR_TYPE)
470 return true;
471 }
472
473 if (!for_return && COMPLEX_MODE_P (mode))
474 /* Complex types are passed as two underlying args. */
475 mode = GET_MODE_INNER (mode);
476
477 if (GET_MODE_CLASS (mode) != MODE_INT
478 && GET_MODE_CLASS (mode) != MODE_FLOAT)
479 return true;
480
481 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
482 return true;
483
484 return false;
485}
486
487/* A non-memory argument of mode MODE is being passed, determine the mode it
488 should be promoted to. This is also used for determining return
489 type promotion. */
490
491static machine_mode
492promote_arg (machine_mode mode, bool prototyped)
493{
494 if (!prototyped && mode == SFmode)
495 /* K&R float promotion for unprototyped functions. */
496 mode = DFmode;
497 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
498 mode = SImode;
499
500 return mode;
501}
502
503/* A non-memory return type of MODE is being returned. Determine the
504 mode it should be promoted to. */
505
506static machine_mode
507promote_return (machine_mode mode)
508{
509 return promote_arg (mode, true);
510}
511
e74f7152 512/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 513
e74f7152 514static rtx
b49e35a9 515nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
e74f7152
NS
516 const_tree, bool named)
517{
b49e35a9 518 if (mode == VOIDmode || !named)
e74f7152 519 return NULL_RTX;
738f2522 520
b49e35a9 521 return gen_reg_rtx (mode);
e74f7152
NS
522}
523
524/* Implement TARGET_FUNCTION_INCOMING_ARG. */
525
526static rtx
527nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
528 const_tree, bool named)
738f2522 529{
e74f7152 530 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 531
b49e35a9 532 if (mode == VOIDmode || !named)
e74f7152 533 return NULL_RTX;
738f2522 534
e74f7152
NS
535 /* No need to deal with split modes here, the only case that can
536 happen is complex modes and those are dealt with by
537 TARGET_SPLIT_COMPLEX_ARG. */
538 return gen_rtx_UNSPEC (mode,
539 gen_rtvec (1, GEN_INT (cum->count)),
540 UNSPEC_ARG_REG);
541}
542
543/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
544
545static void
546nvptx_function_arg_advance (cumulative_args_t cum_v,
547 machine_mode ARG_UNUSED (mode),
548 const_tree ARG_UNUSED (type),
549 bool ARG_UNUSED (named))
550{
551 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 552
e74f7152
NS
553 cum->count++;
554}
555
1988b2c2
NS
556/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
557
558 For nvptx This is only used for varadic args. The type has already
559 been promoted and/or converted to invisible reference. */
560
561static unsigned
562nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
563{
564 return GET_MODE_ALIGNMENT (mode);
565}
566
e74f7152
NS
567/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
568
569 For nvptx, we know how to handle functions declared as stdarg: by
570 passing an extra pointer to the unnamed arguments. However, the
571 Fortran frontend can produce a different situation, where a
572 function pointer is declared with no arguments, but the actual
573 function and calls to it take more arguments. In that case, we
574 want to ensure the call matches the definition of the function. */
575
576static bool
577nvptx_strict_argument_naming (cumulative_args_t cum_v)
578{
579 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 580
e74f7152
NS
581 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
582}
583
e74f7152
NS
584/* Implement TARGET_LIBCALL_VALUE. */
585
586static rtx
587nvptx_libcall_value (machine_mode mode, const_rtx)
588{
e91cacae 589 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
590 /* Pretend to return in a hard reg for early uses before pseudos can be
591 generated. */
592 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 593
e74f7152
NS
594 return gen_reg_rtx (mode);
595}
596
597/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
598 where function FUNC returns or receives a value of data type TYPE. */
599
600static rtx
44eba92d 601nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
602 bool outgoing)
603{
44eba92d
NS
604 machine_mode mode = promote_return (TYPE_MODE (type));
605
e74f7152 606 if (outgoing)
5c036f3f 607 {
e91cacae 608 gcc_assert (cfun);
44c068ae 609 cfun->machine->return_mode = mode;
5c036f3f
NS
610 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
611 }
b49e35a9
NS
612
613 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
614}
615
616/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
617
618static bool
619nvptx_function_value_regno_p (const unsigned int regno)
620{
621 return regno == NVPTX_RETURN_REGNUM;
622}
623
624/* Types with a mode other than those supported by the machine are passed by
625 reference in memory. */
626
627static bool
5563d5c0
NS
628nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
629 machine_mode mode, const_tree type,
630 bool ARG_UNUSED (named))
e74f7152 631{
44eba92d 632 return pass_in_memory (mode, type, false);
e74f7152
NS
633}
634
635/* Implement TARGET_RETURN_IN_MEMORY. */
636
637static bool
638nvptx_return_in_memory (const_tree type, const_tree)
639{
44eba92d 640 return pass_in_memory (TYPE_MODE (type), type, true);
e74f7152
NS
641}
642
643/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
644
645static machine_mode
646nvptx_promote_function_mode (const_tree type, machine_mode mode,
44eba92d 647 int *ARG_UNUSED (punsignedp),
e74f7152
NS
648 const_tree funtype, int for_return)
649{
44eba92d 650 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
e74f7152
NS
651}
652
e74f7152
NS
653/* Helper for write_arg. Emit a single PTX argument of MODE, either
654 in a prototype, or as copy in a function prologue. ARGNO is the
655 index of this argument in the PTX function. FOR_REG is negative,
656 if we're emitting the PTX prototype. It is zero if we're copying
657 to an argument register and it is greater than zero if we're
658 copying to a specific hard register. */
659
660static int
1f065954
NS
661write_arg_mode (std::stringstream &s, int for_reg, int argno,
662 machine_mode mode)
e74f7152
NS
663{
664 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
665
dc3d2aeb
NS
666 if (for_reg < 0)
667 {
668 /* Writing PTX prototype. */
669 s << (argno ? ", " : " (");
e74f7152 670 s << ".param" << ptx_type << " %in_ar" << argno;
dc3d2aeb
NS
671 }
672 else
673 {
e74f7152 674 s << "\t.reg" << ptx_type << " ";
dc3d2aeb
NS
675 if (for_reg)
676 s << reg_names[for_reg];
677 else
678 s << "%ar" << argno;
679 s << ";\n";
5563d5c0
NS
680 if (argno >= 0)
681 {
682 s << "\tld.param" << ptx_type << " ";
683 if (for_reg)
684 s << reg_names[for_reg];
685 else
686 s << "%ar" << argno;
687 s << ", [%in_ar" << argno << "];\n";
688 }
dc3d2aeb
NS
689 }
690 return argno + 1;
738f2522
BS
691}
692
e74f7152 693/* Process function parameter TYPE to emit one or more PTX
1f065954 694 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
44eba92d
NS
695 is true, if this is a prototyped function, rather than an old-style
696 C declaration. Returns the next argument number to use.
e74f7152 697
9c582551 698 The promotion behavior here must match the regular GCC function
e74f7152
NS
699 parameter marshalling machinery. */
700
701static int
1f065954
NS
702write_arg_type (std::stringstream &s, int for_reg, int argno,
703 tree type, bool prototyped)
e74f7152
NS
704{
705 machine_mode mode = TYPE_MODE (type);
706
707 if (mode == VOIDmode)
708 return argno;
709
44eba92d 710 if (pass_in_memory (mode, type, false))
e74f7152 711 mode = Pmode;
44eba92d
NS
712 else
713 {
714 bool split = TREE_CODE (type) == COMPLEX_TYPE;
e74f7152 715
44eba92d
NS
716 if (split)
717 {
718 /* Complex types are sent as two separate args. */
719 type = TREE_TYPE (type);
5563d5c0 720 mode = TYPE_MODE (type);
44eba92d
NS
721 prototyped = true;
722 }
e74f7152 723
44eba92d
NS
724 mode = promote_arg (mode, prototyped);
725 if (split)
1f065954 726 argno = write_arg_mode (s, for_reg, argno, mode);
e74f7152 727 }
e74f7152 728
1f065954
NS
729 return write_arg_mode (s, for_reg, argno, mode);
730}
731
732/* Emit a PTX return as a prototype or function prologue declaration
733 for MODE. */
734
735static void
736write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
737{
738 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
739 const char *pfx = "\t.reg";
740 const char *sfx = ";\n";
741
742 if (for_proto)
743 pfx = "(.param", sfx = "_out) ";
744
745 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
746}
747
44eba92d 748/* Process a function return TYPE to emit a PTX return as a prototype
1f065954 749 or function prologue declaration. Returns true if return is via an
9c582551 750 additional pointer parameter. The promotion behavior here must
1f065954 751 match the regular GCC function return mashalling. */
44eba92d 752
0beb7c71 753static bool
1f065954 754write_return_type (std::stringstream &s, bool for_proto, tree type)
0beb7c71
NS
755{
756 machine_mode mode = TYPE_MODE (type);
0beb7c71 757
44eba92d
NS
758 if (mode == VOIDmode)
759 return false;
760
761 bool return_in_mem = pass_in_memory (mode, type, true);
762
763 if (return_in_mem)
0beb7c71 764 {
44eba92d
NS
765 if (for_proto)
766 return return_in_mem;
767
768 /* Named return values can cause us to return a pointer as well
769 as expect an argument for the return location. This is
770 optimization-level specific, so no caller can make use of
771 this data, but more importantly for us, we must ensure it
772 doesn't change the PTX prototype. */
44c068ae 773 mode = (machine_mode) cfun->machine->return_mode;
5c036f3f 774
44eba92d
NS
775 if (mode == VOIDmode)
776 return return_in_mem;
777
44c068ae 778 /* Clear return_mode to inhibit copy of retval to non-existent
44eba92d 779 retval parameter. */
44c068ae 780 cfun->machine->return_mode = VOIDmode;
0beb7c71
NS
781 }
782 else
44eba92d
NS
783 mode = promote_return (mode);
784
1f065954 785 write_return_mode (s, for_proto, mode);
0beb7c71
NS
786
787 return return_in_mem;
788}
789
738f2522
BS
790/* Look for attributes in ATTRS that would indicate we must write a function
791 as a .entry kernel rather than a .func. Return true if one is found. */
792
793static bool
794write_as_kernel (tree attrs)
795{
796 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
797 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
798 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
799 /* For OpenMP target regions, the corresponding kernel entry is emitted from
800 write_omp_entry as a separate function. */
738f2522
BS
801}
802
69823d76
NS
/* Write to S the linker marker comment for function NAME:
   "// BEGIN [GLOBAL ]FUNCTION DEF: <name>" for a definition (IS_DEFN)
   or "... DECL: <name>" for a declaration, preceded and followed by
   a newline.  GLOBAL is present when GLOBALIZE.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  const char *scope = globalize ? " GLOBAL" : "";
  const char *kind = is_defn ? "DEF: " : "DECL: ";

  s << "\n// BEGIN" << scope << " FUNCTION " << kind << name << "\n";
}
815
816/* Emit a linker marker for a variable decl or defn. */
817
818static void
819write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
820{
821 fprintf (file, "\n// BEGIN%s VAR %s: ",
822 globalize ? " GLOBAL" : "",
823 is_defn ? "DEF" : "DECL");
824 assemble_name_raw (file, name);
825 fputs ("\n", file);
826}
827
b699adcc
NS
828/* Write a .func or .kernel declaration or definition along with
829 a helper comment for use by ld. S is the stream to write to, DECL
830 the decl for the function with name NAME. For definitions, emit
831 a declaration too. */
738f2522 832
b699adcc
NS
833static const char *
834write_fn_proto (std::stringstream &s, bool is_defn,
835 const char *name, const_tree decl)
738f2522 836{
b699adcc
NS
837 if (is_defn)
838 /* Emit a declaration. The PTX assembler gets upset without it. */
839 name = write_fn_proto (s, false, name, decl);
b4346107
NS
840 else
841 {
842 /* Avoid repeating the name replacement. */
843 name = nvptx_name_replacement (name);
844 if (name[0] == '*')
845 name++;
846 }
738f2522 847
69823d76 848 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
b699adcc
NS
849
850 /* PTX declaration. */
738f2522
BS
851 if (DECL_EXTERNAL (decl))
852 s << ".extern ";
853 else if (TREE_PUBLIC (decl))
0766660b 854 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
b699adcc 855 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
738f2522 856
b699adcc
NS
857 tree fntype = TREE_TYPE (decl);
858 tree result_type = TREE_TYPE (fntype);
738f2522 859
ac3d02e2
NS
860 /* atomic_compare_exchange_$n builtins have an exceptional calling
861 convention. */
862 int not_atomic_weak_arg = -1;
863 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
864 switch (DECL_FUNCTION_CODE (decl))
865 {
866 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
867 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
868 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
869 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
871 /* These atomics skip the 'weak' parm in an actual library
872 call. We must skip it in the prototype too. */
873 not_atomic_weak_arg = 3;
874 break;
875
876 default:
877 break;
878 }
879
738f2522 880 /* Declare the result. */
1f065954 881 bool return_in_mem = write_return_type (s, true, result_type);
738f2522 882
b699adcc
NS
883 s << name;
884
dc3d2aeb 885 int argno = 0;
b699adcc
NS
886
887 /* Emit argument list. */
888 if (return_in_mem)
1f065954 889 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
df1bdded 890
b699adcc
NS
891 /* We get:
892 NULL in TYPE_ARG_TYPES, for old-style functions
893 NULL in DECL_ARGUMENTS, for builtin functions without another
894 declaration.
895 So we have to pick the best one we have. */
896 tree args = TYPE_ARG_TYPES (fntype);
dc3d2aeb
NS
897 bool prototyped = true;
898 if (!args)
899 {
900 args = DECL_ARGUMENTS (decl);
901 prototyped = false;
902 }
738f2522 903
ac3d02e2 904 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
b699adcc 905 {
dc3d2aeb 906 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
ac3d02e2
NS
907
908 if (not_atomic_weak_arg)
909 argno = write_arg_type (s, -1, argno, type, prototyped);
910 else
911 gcc_assert (type == boolean_type_node);
738f2522 912 }
738f2522 913
b699adcc 914 if (stdarg_p (fntype))
1f065954 915 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
738f2522 916
b699adcc 917 if (DECL_STATIC_CHAIN (decl))
1f065954 918 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc 919
dc3d2aeb 920 if (!argno && strcmp (name, "main") == 0)
b699adcc 921 {
1f065954
NS
922 argno = write_arg_type (s, -1, argno, integer_type_node, true);
923 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
b699adcc
NS
924 }
925
dc3d2aeb 926 if (argno)
b699adcc
NS
927 s << ")";
928
929 s << (is_defn ? "\n" : ";\n");
930
931 return name;
738f2522
BS
932}
933
00e52418
NS
934/* Construct a function declaration from a call insn. This can be
935 necessary for two reasons - either we have an indirect call which
936 requires a .callprototype declaration, or we have a libcall
937 generated by emit_library_call for which no decl exists. */
938
939static void
b699adcc
NS
940write_fn_proto_from_insn (std::stringstream &s, const char *name,
941 rtx result, rtx pat)
00e52418
NS
942{
943 if (!name)
944 {
945 s << "\t.callprototype ";
946 name = "_";
947 }
948 else
949 {
b4346107 950 name = nvptx_name_replacement (name);
69823d76 951 write_fn_marker (s, false, true, name);
00e52418
NS
952 s << "\t.extern .func ";
953 }
954
955 if (result != NULL_RTX)
1f065954 956 write_return_mode (s, true, GET_MODE (result));
00e52418
NS
957
958 s << name;
959
00e52418
NS
960 int arg_end = XVECLEN (pat, 0);
961 for (int i = 1; i < arg_end; i++)
962 {
1f065954
NS
963 /* We don't have to deal with mode splitting & promotion here,
964 as that was already done when generating the call
965 sequence. */
00e52418
NS
966 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
967
1f065954 968 write_arg_mode (s, -1, i - 1, mode);
00e52418
NS
969 }
970 if (arg_end != 1)
971 s << ")";
972 s << ";\n";
973}
974
00e52418
NS
975/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
976 table and and write a ptx prototype. These are emitted at end of
977 compilation. */
738f2522 978
00e52418
NS
979static void
980nvptx_record_fndecl (tree decl)
738f2522 981{
f3dba894 982 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
738f2522
BS
983 if (*slot == NULL)
984 {
985 *slot = decl;
986 const char *name = get_fnname_from_decl (decl);
b699adcc 987 write_fn_proto (func_decls, false, name, decl);
738f2522 988 }
738f2522
BS
989}
990
00e52418
NS
991/* Record a libcall or unprototyped external function. CALLEE is the
992 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
993 declaration for it. */
994
995static void
996nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
997{
998 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
999 if (*slot == NULL)
1000 {
1001 *slot = callee;
1002
1003 const char *name = XSTR (callee, 0);
b699adcc 1004 write_fn_proto_from_insn (func_decls, name, retval, pat);
00e52418
NS
1005 }
1006}
1007
1008/* DECL is an external FUNCTION_DECL, that we're referencing. If it
1009 is prototyped, record it now. Otherwise record it as needed at end
1010 of compilation, when we might have more information about it. */
738f2522
BS
1011
1012void
1013nvptx_record_needed_fndecl (tree decl)
1014{
00e52418
NS
1015 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1016 {
1017 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1018 if (*slot == NULL)
1019 *slot = decl;
1020 }
1021 else
1022 nvptx_record_fndecl (decl);
1023}
738f2522 1024
00e52418
NS
1025/* SYM is a SYMBOL_REF. If it refers to an external function, record
1026 it as needed. */
1027
1028static void
1029nvptx_maybe_record_fnsym (rtx sym)
1030{
1031 tree decl = SYMBOL_REF_DECL (sym);
1032
1033 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1034 nvptx_record_needed_fndecl (decl);
738f2522
BS
1035}
1036
1f065954 1037/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
1038 and initialize REGNO to point to it. If the size is zero, it'll
1039 never be valid to dereference, so we can simply initialize to
1040 zero. */
1f065954
NS
1041
1042static void
1043init_frame (FILE *file, int regno, unsigned align, unsigned size)
1044{
5563d5c0
NS
1045 if (size)
1046 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1047 align, reg_names[regno], size);
1048 fprintf (file, "\t.reg.u%d %s;\n",
1049 POINTER_SIZE, reg_names[regno]);
1050 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1051 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
1052 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1053}
1054
5012919d
AM
1055/* Emit soft stack frame setup sequence. */
1056
1057static void
1058init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1059{
1060 /* Maintain 64-bit stack alignment. */
1061 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1062 size = ROUND_UP (size, keep_align);
1063 int bits = POINTER_SIZE;
1064 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1065 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1066 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1067 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1068 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1069 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1070 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1071 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1072 fprintf (file, "\t{\n");
1073 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1074 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1075 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1076 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1077 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1078 bits == 64 ? ".wide" : ".lo", bits / 8);
1079 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1080
1081 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1082 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1083
1084 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1085 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1086 bits, reg_sspprev, reg_sspslot);
1087
1088 /* Initialize %frame = %sspprev - size. */
1089 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1090 bits, reg_frame, reg_sspprev, size);
1091
1092 /* Apply alignment, if larger than 64. */
1093 if (alignment > keep_align)
1094 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1095 bits, reg_frame, reg_frame, -alignment);
1096
1097 size = crtl->outgoing_args_size;
1098 gcc_assert (size % keep_align == 0);
1099
1100 /* Initialize %stack. */
1101 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1102 bits, reg_stack, reg_frame, size);
1103
5012919d
AM
1104 if (!crtl->is_leaf)
1105 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1106 bits, reg_sspslot, reg_stack);
1107 fprintf (file, "\t}\n");
1108 cfun->machine->has_softstack = true;
1109 need_softstack_decl = true;
1110}
1111
d88cd9c4
NS
1112/* Emit code to initialize the REGNO predicate register to indicate
1113 whether we are not lane zero on the NAME axis. */
1114
1115static void
1116nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1117{
1118 fprintf (file, "\t{\n");
1119 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
f881693c
TV
1120 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1121 {
1122 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1123 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1124 }
d88cd9c4
NS
1125 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1126 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
f881693c
TV
1127 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1128 {
1129 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1130 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1131 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1132 "// vector reduction buffer\n",
1133 REGNO (cfun->machine->red_partition),
1134 vector_red_partition);
1135 }
1136 /* Verify vector_red_size. */
1137 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1138 <= vector_red_size);
d88cd9c4
NS
1139 fprintf (file, "\t}\n");
1140}
1141
0024c320
TV
1142/* Emit code to initialize OpenACC worker broadcast and synchronization
1143 registers. */
1144
1145static void
1146nvptx_init_oacc_workers (FILE *file)
1147{
1148 fprintf (file, "\t{\n");
1149 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1150 if (cfun->machine->bcast_partition)
1151 {
1152 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1153 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1154 }
1155 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1156 if (cfun->machine->bcast_partition)
1157 {
1158 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1159 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1160 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1161 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1162 "// vector broadcast offset\n",
1163 REGNO (cfun->machine->bcast_partition),
1164 oacc_bcast_partition);
1165 }
1166 /* Verify oacc_bcast_size. */
1167 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1168 <= oacc_bcast_size);
1169 if (cfun->machine->sync_bar)
1170 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1171 "// vector synchronization barrier\n",
1172 REGNO (cfun->machine->sync_bar));
1173 fprintf (file, "\t}\n");
1174}
1175
5012919d
AM
1176/* Emit code to initialize predicate and master lane index registers for
1177 -muniform-simt code generation variant. */
1178
1179static void
1180nvptx_init_unisimt_predicate (FILE *file)
1181{
0c6b03b5
AM
1182 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1183 int loc = REGNO (cfun->machine->unisimt_location);
5012919d 1184 int bits = POINTER_SIZE;
0c6b03b5 1185 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
5012919d
AM
1186 fprintf (file, "\t{\n");
1187 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1188 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
5012919d
AM
1189 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1190 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1191 bits == 64 ? ".wide" : ".lo");
0c6b03b5
AM
1192 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1193 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1194 if (cfun->machine->unisimt_predicate)
1195 {
1196 int master = REGNO (cfun->machine->unisimt_master);
1197 int pred = REGNO (cfun->machine->unisimt_predicate);
1198 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1199 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1200 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1201 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1202 /* Compute predicate as 'tid.x == master'. */
1203 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1204 }
5012919d
AM
1205 fprintf (file, "\t}\n");
1206 need_unisimt_decl = true;
1207}
1208
1209/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1210
1211 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1212 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1213 {
1214 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1215 __nvptx_uni[tid.y] = 0;
1216 gomp_nvptx_main (ORIG, arg);
1217 }
1218 ORIG itself should not be emitted as a PTX .entry function. */
1219
1220static void
1221write_omp_entry (FILE *file, const char *name, const char *orig)
1222{
1223 static bool gomp_nvptx_main_declared;
1224 if (!gomp_nvptx_main_declared)
1225 {
1226 gomp_nvptx_main_declared = true;
1227 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1228 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1229 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1230 }
3c5b4c6d
TS
1231 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1232#define NTID_Y "%ntid.y"
5012919d
AM
1233#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1234 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1235{\n\
1236 .reg.u32 %r<3>;\n\
1237 .reg.u" PS " %R<4>;\n\
1238 mov.u32 %r0, %tid.y;\n\
3c5b4c6d 1239 mov.u32 %r1, " NTID_Y ";\n\
5012919d
AM
1240 mov.u32 %r2, %ctaid.x;\n\
1241 cvt.u" PS ".u32 %R1, %r0;\n\
1242 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1243 mov.u" PS " %R0, __nvptx_stacks;\n\
1244 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1245 ld.param.u" PS " %R2, [%stack];\n\
1246 ld.param.u" PS " %R3, [%sz];\n\
1247 add.u" PS " %R2, %R2, %R3;\n\
1248 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1249 st.shared.u" PS " [%R0], %R2;\n\
1250 mov.u" PS " %R0, __nvptx_uni;\n\
1251 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1252 mov.u32 %r0, 0;\n\
1253 st.shared.u32 [%R0], %r0;\n\
1254 mov.u" PS " %R0, \0;\n\
1255 ld.param.u" PS " %R1, [%arg];\n\
1256 {\n\
1257 .param.u" PS " %P<2>;\n\
1258 st.param.u" PS " [%P0], %R0;\n\
1259 st.param.u" PS " [%P1], %R1;\n\
1260 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1261 }\n\
1262 ret.uni;\n\
1263}\n"
1264 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1265 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1266#undef ENTRY_TEMPLATE
3c5b4c6d 1267#undef NTID_Y
5012919d
AM
1268 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1269 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1270 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1271 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1272 need_softstack_decl = need_unisimt_decl = true;
1273}
1274
738f2522
BS
1275/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1276 function, including local var decls and copies from the arguments to
1277 local regs. */
1278
1279void
1280nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1281{
1282 tree fntype = TREE_TYPE (decl);
1283 tree result_type = TREE_TYPE (fntype);
dc3d2aeb 1284 int argno = 0;
738f2522 1285
5012919d
AM
1286 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1287 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1288 {
1289 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1290 sprintf (buf, "%s$impl", name);
1291 write_omp_entry (file, name, buf);
1292 name = buf;
1293 }
dc3d2aeb
NS
1294 /* We construct the initial part of the function into a string
1295 stream, in order to share the prototype writing code. */
738f2522 1296 std::stringstream s;
b699adcc 1297 write_fn_proto (s, true, name, decl);
dc3d2aeb 1298 s << "{\n";
738f2522 1299
1f065954 1300 bool return_in_mem = write_return_type (s, false, result_type);
738f2522 1301 if (return_in_mem)
1f065954 1302 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
dc3d2aeb 1303
5ab662d5 1304 /* Declare and initialize incoming arguments. */
dc3d2aeb
NS
1305 tree args = TYPE_ARG_TYPES (fntype);
1306 bool prototyped = true;
1307 if (!args)
5ab662d5 1308 {
dc3d2aeb
NS
1309 args = DECL_ARGUMENTS (decl);
1310 prototyped = false;
5ab662d5
NS
1311 }
1312
1313 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1314 {
1315 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
5ab662d5 1316
1f065954 1317 argno = write_arg_type (s, 0, argno, type, prototyped);
dc3d2aeb 1318 }
5ab662d5 1319
dc3d2aeb 1320 if (stdarg_p (fntype))
5563d5c0 1321 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1f065954 1322 true);
5ab662d5 1323
5563d5c0
NS
1324 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1325 write_arg_type (s, STATIC_CHAIN_REGNUM,
1326 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1327 true);
1328
dc3d2aeb 1329 fprintf (file, "%s", s.str().c_str());
25662751 1330
0c6b03b5
AM
1331 /* Usually 'crtl->is_leaf' is computed during register allocator
1332 initialization (which is not done on NVPTX) or for pressure-sensitive
1333 optimizations. Initialize it here, except if already set. */
1334 if (!crtl->is_leaf)
1335 crtl->is_leaf = leaf_function_p ();
1336
44c068ae 1337 HOST_WIDE_INT sz = get_frame_size ();
5012919d
AM
1338 bool need_frameptr = sz || cfun->machine->has_chain;
1339 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1340 if (!TARGET_SOFT_STACK)
1341 {
1342 /* Declare a local var for outgoing varargs. */
1343 if (cfun->machine->has_varadic)
1344 init_frame (file, STACK_POINTER_REGNUM,
1345 UNITS_PER_WORD, crtl->outgoing_args_size);
1346
1347 /* Declare a local variable for the frame. Force its size to be
1348 DImode-compatible. */
1349 if (need_frameptr)
1350 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1351 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1352 }
0c6b03b5
AM
1353 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1354 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1355 init_softstack_frame (file, alignment, sz);
44c068ae 1356
0c6b03b5
AM
1357 if (cfun->machine->has_simtreg)
1358 {
1359 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1360 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1361 align = MAX (align, GET_MODE_SIZE (DImode));
1362 if (!crtl->is_leaf || cfun->calls_alloca)
1363 simtsz = HOST_WIDE_INT_M1U;
1364 if (simtsz == HOST_WIDE_INT_M1U)
1365 simtsz = nvptx_softstack_size;
1366 if (cfun->machine->has_softstack)
1367 simtsz += POINTER_SIZE / 8;
1368 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1369 if (align > GET_MODE_SIZE (DImode))
1370 simtsz += align - GET_MODE_SIZE (DImode);
1371 if (simtsz)
1372 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1373 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1374 }
f881693c
TV
1375
1376 /* Restore the vector reduction partition register, if necessary.
1377 FIXME: Find out when and why this is necessary, and fix it. */
1378 if (cfun->machine->red_partition)
1379 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1380 = cfun->machine->red_partition;
1381
738f2522
BS
1382 /* Declare the pseudos we have as ptx registers. */
1383 int maxregs = max_reg_num ();
1384 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1385 {
1386 if (regno_reg_rtx[i] != const0_rtx)
1387 {
1388 machine_mode mode = PSEUDO_REGNO_MODE (i);
d7479262 1389 machine_mode split = maybe_split_mode (mode);
f313d112 1390
38827811 1391 if (split_mode_p (mode))
f313d112
NS
1392 mode = split;
1393 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1394 output_reg (file, i, split, -2);
1395 fprintf (file, ";\n");
738f2522
BS
1396 }
1397 }
1398
d88cd9c4
NS
1399 /* Emit axis predicates. */
1400 if (cfun->machine->axis_predicate[0])
1401 nvptx_init_axis_predicate (file,
1402 REGNO (cfun->machine->axis_predicate[0]), "y");
1403 if (cfun->machine->axis_predicate[1])
1404 nvptx_init_axis_predicate (file,
1405 REGNO (cfun->machine->axis_predicate[1]), "x");
0c6b03b5
AM
1406 if (cfun->machine->unisimt_predicate
1407 || (cfun->machine->has_simtreg && !crtl->is_leaf))
5012919d 1408 nvptx_init_unisimt_predicate (file);
0024c320
TV
1409 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1410 nvptx_init_oacc_workers (file);
738f2522
BS
1411}
1412
0c6b03b5
AM
1413/* Output code for switching uniform-simt state. ENTERING indicates whether
1414 we are entering or leaving non-uniform execution region. */
1415
1416static void
1417nvptx_output_unisimt_switch (FILE *file, bool entering)
1418{
1419 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1420 return;
1421 fprintf (file, "\t{\n");
1422 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1423 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1424 if (!crtl->is_leaf)
1425 {
1426 int loc = REGNO (cfun->machine->unisimt_location);
1427 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1428 }
1429 if (cfun->machine->unisimt_predicate)
1430 {
1431 int master = REGNO (cfun->machine->unisimt_master);
1432 int pred = REGNO (cfun->machine->unisimt_predicate);
1433 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1434 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1435 master, entering ? "%ustmp2" : "0");
1436 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1437 }
1438 fprintf (file, "\t}\n");
1439}
1440
1441/* Output code for allocating per-lane storage and switching soft-stack pointer.
1442 ENTERING indicates whether we are entering or leaving non-uniform execution.
1443 PTR is the register pointing to allocated storage, it is assigned to on
1444 entering and used to restore state on leaving. SIZE and ALIGN are used only
1445 on entering. */
1446
1447static void
1448nvptx_output_softstack_switch (FILE *file, bool entering,
1449 rtx ptr, rtx size, rtx align)
1450{
1451 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1452 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1453 return;
1454 int bits = POINTER_SIZE, regno = REGNO (ptr);
1455 fprintf (file, "\t{\n");
1456 if (entering)
1457 {
1458 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1459 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1460 cfun->machine->simt_stack_size);
1461 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1462 if (CONST_INT_P (size))
1463 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1464 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1465 else
1466 output_reg (file, REGNO (size), VOIDmode);
1467 fputs (";\n", file);
1468 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
77734296
AM
1469 fprintf (file,
1470 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
0c6b03b5
AM
1471 bits, regno, regno, UINTVAL (align));
1472 }
1473 if (cfun->machine->has_softstack)
1474 {
1475 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1476 if (entering)
1477 {
1478 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1479 bits, regno, bits / 8, reg_stack);
1480 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1481 bits, reg_stack, regno, bits / 8);
1482 }
1483 else
1484 {
1485 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1486 bits, reg_stack, regno, bits / 8);
1487 }
1488 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1489 }
1490 fprintf (file, "\t}\n");
1491}
1492
1493/* Output code to enter non-uniform execution region. DEST is a register
1494 to hold a per-lane allocation given by SIZE and ALIGN. */
1495
1496const char *
1497nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1498{
1499 nvptx_output_unisimt_switch (asm_out_file, true);
1500 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1501 return "";
1502}
1503
1504/* Output code to leave non-uniform execution region. SRC is the register
1505 holding per-lane storage previously allocated by omp_simt_enter insn. */
1506
1507const char *
1508nvptx_output_simt_exit (rtx src)
1509{
1510 nvptx_output_unisimt_switch (asm_out_file, false);
1511 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1512 return "";
1513}
1514
5012919d
AM
1515/* Output instruction that sets soft stack pointer in shared memory to the
1516 value in register given by SRC_REGNO. */
1517
1518const char *
1519nvptx_output_set_softstack (unsigned src_regno)
1520{
1521 if (cfun->machine->has_softstack && !crtl->is_leaf)
1522 {
1523 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1524 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1525 output_reg (asm_out_file, src_regno, VOIDmode);
1526 fprintf (asm_out_file, ";\n");
1527 }
1528 return "";
1529}
738f2522
BS
1530/* Output a return instruction. Also copy the return value to its outgoing
1531 location. */
1532
1533const char *
1534nvptx_output_return (void)
1535{
44c068ae 1536 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1537
1538 if (mode != VOIDmode)
1f065954
NS
1539 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1540 nvptx_ptx_type_from_mode (mode, false),
1541 reg_names[NVPTX_RETURN_REGNUM],
1542 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1543
1544 return "ret;";
1545}
1546
/* Terminate a function by writing its closing brace, followed by a
   newline, to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1554\f
1555/* Decide whether we can make a sibling call to a function. For ptx, we
1556 can't. */
1557
1558static bool
1559nvptx_function_ok_for_sibcall (tree, tree)
1560{
1561 return false;
1562}
1563
18c05628
NS
1564/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1565
1566static rtx
1567nvptx_get_drap_rtx (void)
1568{
5012919d
AM
1569 if (TARGET_SOFT_STACK && stack_realign_drap)
1570 return arg_pointer_rtx;
18c05628
NS
1571 return NULL_RTX;
1572}
1573
738f2522
BS
1574/* Implement the TARGET_CALL_ARGS hook. Record information about one
1575 argument to the next call. */
1576
1577static void
44c068ae 1578nvptx_call_args (rtx arg, tree fntype)
738f2522 1579{
44c068ae 1580 if (!cfun->machine->doing_call)
738f2522 1581 {
44c068ae
NS
1582 cfun->machine->doing_call = true;
1583 cfun->machine->is_varadic = false;
1584 cfun->machine->num_args = 0;
1585
1586 if (fntype && stdarg_p (fntype))
1587 {
1588 cfun->machine->is_varadic = true;
1589 cfun->machine->has_varadic = true;
1590 cfun->machine->num_args++;
1591 }
738f2522 1592 }
738f2522 1593
44c068ae
NS
1594 if (REG_P (arg) && arg != pc_rtx)
1595 {
1596 cfun->machine->num_args++;
1597 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1598 cfun->machine->call_args);
1599 }
738f2522
BS
1600}
1601
1602/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1603 information we recorded. */
1604
1605static void
1606nvptx_end_call_args (void)
1607{
44c068ae 1608 cfun->machine->doing_call = false;
738f2522
BS
1609 free_EXPR_LIST_list (&cfun->machine->call_args);
1610}
1611
ecf6e535
BS
1612/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1613 track of whether calls involving static chains or varargs were seen
1614 in the current function.
1615 For libcalls, maintain a hash table of decls we have seen, and
1616 record a function decl for later when encountering a new one. */
738f2522
BS
1617
1618void
1619nvptx_expand_call (rtx retval, rtx address)
1620{
738f2522 1621 rtx callee = XEXP (address, 0);
f324806d 1622 rtx varargs = NULL_RTX;
d88cd9c4 1623 unsigned parallel = 0;
738f2522 1624
738f2522
BS
1625 if (!call_insn_operand (callee, Pmode))
1626 {
1627 callee = force_reg (Pmode, callee);
1628 address = change_address (address, QImode, callee);
1629 }
1630
1631 if (GET_CODE (callee) == SYMBOL_REF)
1632 {
1633 tree decl = SYMBOL_REF_DECL (callee);
1634 if (decl != NULL_TREE)
1635 {
738f2522 1636 if (DECL_STATIC_CHAIN (decl))
44c068ae 1637 cfun->machine->has_chain = true;
00e52418 1638
629b3d75 1639 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1640 if (attr)
1641 {
1642 tree dims = TREE_VALUE (attr);
1643
1644 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1645 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1646 {
1647 if (TREE_PURPOSE (dims)
1648 && !integer_zerop (TREE_PURPOSE (dims)))
1649 break;
1650 /* Not on this axis. */
1651 parallel ^= GOMP_DIM_MASK (ix);
1652 dims = TREE_CHAIN (dims);
1653 }
1654 }
738f2522
BS
1655 }
1656 }
c38f0d8c 1657
44c068ae
NS
1658 unsigned nargs = cfun->machine->num_args;
1659 if (cfun->machine->is_varadic)
738f2522 1660 {
f324806d 1661 varargs = gen_reg_rtx (Pmode);
863af9a4 1662 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1663 }
1664
44c068ae
NS
1665 rtvec vec = rtvec_alloc (nargs + 1);
1666 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1667 int vec_pos = 0;
44c068ae
NS
1668
1669 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1670 rtx tmp_retval = retval;
44c068ae 1671 if (retval)
738f2522
BS
1672 {
1673 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1674 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1675 call = gen_rtx_SET (tmp_retval, call);
738f2522 1676 }
44c068ae 1677 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1678
1679 /* Construct the call insn, including a USE for each argument pseudo
1680 register. These will be used when printing the insn. */
1681 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1682 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1683
1684 if (varargs)
cf08c344 1685 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1686
1687 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1688
d88cd9c4 1689 nvptx_emit_forking (parallel, true);
738f2522 1690 emit_call_insn (pat);
d88cd9c4
NS
1691 nvptx_emit_joining (parallel, true);
1692
738f2522
BS
1693 if (tmp_retval != retval)
1694 emit_move_insn (retval, tmp_retval);
1695}
44eba92d 1696
738f2522
BS
1697/* Emit a comparison COMPARE, and return the new test to be used in the
1698 jump. */
1699
1700rtx
1701nvptx_expand_compare (rtx compare)
1702{
1703 rtx pred = gen_reg_rtx (BImode);
1704 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1705 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1706 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1707 return gen_rtx_NE (BImode, pred, const0_rtx);
1708}
1709
d88cd9c4
NS
1710/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1711
1712void
1713nvptx_expand_oacc_fork (unsigned mode)
1714{
1715 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1716}
1717
1718void
1719nvptx_expand_oacc_join (unsigned mode)
1720{
1721 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1722}
1723
1724/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1725 objects. */
1726
1727static rtx
1728nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1729{
1730 rtx res;
1731
1732 switch (GET_MODE (src))
1733 {
4e10a5a7 1734 case E_DImode:
d88cd9c4
NS
1735 res = gen_unpackdisi2 (dst0, dst1, src);
1736 break;
4e10a5a7 1737 case E_DFmode:
d88cd9c4
NS
1738 res = gen_unpackdfsi2 (dst0, dst1, src);
1739 break;
1740 default: gcc_unreachable ();
1741 }
1742 return res;
1743}
1744
1745/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1746 object. */
1747
1748static rtx
1749nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1750{
1751 rtx res;
1752
1753 switch (GET_MODE (dst))
1754 {
4e10a5a7 1755 case E_DImode:
d88cd9c4
NS
1756 res = gen_packsidi2 (dst, src0, src1);
1757 break;
4e10a5a7 1758 case E_DFmode:
d88cd9c4
NS
1759 res = gen_packsidf2 (dst, src0, src1);
1760 break;
1761 default: gcc_unreachable ();
1762 }
1763 return res;
1764}
1765
1766/* Generate an instruction or sequence to broadcast register REG
1767 across the vectors of a single warp. */
1768
5012919d 1769rtx
59263259 1770nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
d88cd9c4
NS
1771{
1772 rtx res;
1773
1774 switch (GET_MODE (dst))
1775 {
4e10a5a7 1776 case E_SImode:
d88cd9c4
NS
1777 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1778 break;
4e10a5a7 1779 case E_SFmode:
d88cd9c4
NS
1780 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1781 break;
4e10a5a7
RS
1782 case E_DImode:
1783 case E_DFmode:
d88cd9c4
NS
1784 {
1785 rtx tmp0 = gen_reg_rtx (SImode);
1786 rtx tmp1 = gen_reg_rtx (SImode);
1787
1788 start_sequence ();
1789 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1790 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1791 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1792 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1793 res = get_insns ();
1794 end_sequence ();
1795 }
1796 break;
4e10a5a7 1797 case E_BImode:
d88cd9c4
NS
1798 {
1799 rtx tmp = gen_reg_rtx (SImode);
1800
1801 start_sequence ();
1802 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1803 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1804 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1805 res = get_insns ();
1806 end_sequence ();
1807 }
1808 break;
4e10a5a7
RS
1809 case E_QImode:
1810 case E_HImode:
d5ace3b5
CP
1811 {
1812 rtx tmp = gen_reg_rtx (SImode);
1813
1814 start_sequence ();
1815 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1816 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1817 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1818 tmp)));
1819 res = get_insns ();
1820 end_sequence ();
1821 }
1822 break;
d88cd9c4
NS
1823
1824 default:
1825 gcc_unreachable ();
1826 }
1827 return res;
1828}
1829
1830/* Generate an instruction or sequence to broadcast register REG
1831 across the vectors of a single warp. */
1832
1833static rtx
a0b3b5c4 1834nvptx_gen_warp_bcast (rtx reg)
d88cd9c4
NS
1835{
1836 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1837}
1838
1839/* Structure used when generating a worker-level spill or fill. */
1840
1ed57fb8 1841struct broadcast_data_t
d88cd9c4
NS
1842{
1843 rtx base; /* Register holding base addr of buffer. */
1844 rtx ptr; /* Iteration var, if needed. */
1845 unsigned offset; /* Offset into worker buffer. */
1846};
1847
/* Bit flags giving the direction of a spill/fill, and indicating
   looping setup/teardown.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    /* Both directions at once.  */
    PM_read_write = PM_read | PM_write
  };
1859
1860/* Generate instruction(s) to spill or fill register REG to/from the
1861 worker broadcast array. PM indicates what is to be done, REP
1862 how many loop iterations will be executed (0 for not a loop). */
1863
1864static rtx
a0b3b5c4
TV
1865nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1866 broadcast_data_t *data, bool vector)
d88cd9c4
NS
1867{
1868 rtx res;
1869 machine_mode mode = GET_MODE (reg);
1870
1871 switch (mode)
1872 {
4e10a5a7 1873 case E_BImode:
d88cd9c4
NS
1874 {
1875 rtx tmp = gen_reg_rtx (SImode);
1876
1877 start_sequence ();
1878 if (pm & PM_read)
1879 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
a0b3b5c4 1880 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
d88cd9c4
NS
1881 if (pm & PM_write)
1882 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1883 res = get_insns ();
1884 end_sequence ();
1885 }
1886 break;
1887
1888 default:
1889 {
1890 rtx addr = data->ptr;
1891
1892 if (!addr)
1893 {
1894 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1895
3c55d60f
TV
1896 oacc_bcast_align = MAX (oacc_bcast_align, align);
1897 data->offset = ROUND_UP (data->offset, align);
d88cd9c4 1898 addr = data->base;
a0b3b5c4 1899 gcc_assert (data->base != NULL);
d88cd9c4
NS
1900 if (data->offset)
1901 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1902 }
1903
1904 addr = gen_rtx_MEM (mode, addr);
d88cd9c4
NS
1905 if (pm == PM_read)
1906 res = gen_rtx_SET (addr, reg);
1907 else if (pm == PM_write)
1908 res = gen_rtx_SET (reg, addr);
1909 else
1910 gcc_unreachable ();
1911
1912 if (data->ptr)
1913 {
1914 /* We're using a ptr, increment it. */
1915 start_sequence ();
1916
1917 emit_insn (res);
1918 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1919 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1920 res = get_insns ();
1921 end_sequence ();
1922 }
1923 else
1924 rep = 1;
1925 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1926 }
1927 break;
1928 }
1929 return res;
1930}
738f2522
BS
1931\f
1932/* Returns true if X is a valid address for use in a memory reference. */
1933
1934static bool
1935nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1936{
1937 enum rtx_code code = GET_CODE (x);
1938
1939 switch (code)
1940 {
1941 case REG:
1942 return true;
1943
1944 case PLUS:
1945 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1946 return true;
1947 return false;
1948
1949 case CONST:
1950 case SYMBOL_REF:
1951 case LABEL_REF:
1952 return true;
1953
1954 default:
1955 return false;
1956 }
1957}
738f2522 1958\f
4d5438cd
NS
1959/* Machinery to output constant initializers. When beginning an
1960 initializer, we decide on a fragment size (which is visible in ptx
1961 in the type used), and then all initializer data is buffered until
1962 a fragment is filled and ready to be written out. */
1963
1964static struct
1965{
1966 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1967 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1968 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1969 out. */
1970 unsigned size; /* Fragment size to accumulate. */
1971 unsigned offset; /* Offset within current fragment. */
1972 bool started; /* Whether we've output any initializer. */
1973} init_frag;
1974
1975/* The current fragment is full, write it out. SYM may provide a
1976 symbolic reference we should output, in which case the fragment
1977 value is the addend. */
738f2522
BS
1978
1979static void
4d5438cd 1980output_init_frag (rtx sym)
738f2522 1981{
4d5438cd
NS
1982 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1983 unsigned HOST_WIDE_INT val = init_frag.val;
738f2522 1984
4d5438cd
NS
1985 init_frag.started = true;
1986 init_frag.val = 0;
1987 init_frag.offset = 0;
1988 init_frag.remaining--;
1989
1990 if (sym)
1991 {
6c7c4708
CP
1992 bool function = (SYMBOL_REF_DECL (sym)
1993 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1994 if (!function)
1995 fprintf (asm_out_file, "generic(");
4d5438cd 1996 output_address (VOIDmode, sym);
6c7c4708
CP
1997 if (!function)
1998 fprintf (asm_out_file, ")");
1999 if (val)
2000 fprintf (asm_out_file, " + ");
4d5438cd 2001 }
738f2522 2002
4d5438cd
NS
2003 if (!sym || val)
2004 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
738f2522
BS
2005}
2006
4d5438cd
NS
2007/* Add value VAL of size SIZE to the data we're emitting, and keep
2008 writing out chunks as they fill up. */
738f2522
BS
2009
2010static void
4d5438cd 2011nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 2012{
4d5438cd
NS
2013 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2014
2015 for (unsigned part = 0; size; size -= part)
738f2522 2016 {
4d5438cd
NS
2017 val >>= part * BITS_PER_UNIT;
2018 part = init_frag.size - init_frag.offset;
3c55d60f 2019 part = MIN (part, size);
4d5438cd
NS
2020
2021 unsigned HOST_WIDE_INT partial
2022 = val << (init_frag.offset * BITS_PER_UNIT);
2023 init_frag.val |= partial & init_frag.mask;
2024 init_frag.offset += part;
2025
2026 if (init_frag.offset == init_frag.size)
2027 output_init_frag (NULL);
738f2522
BS
2028 }
2029}
2030
2031/* Target hook for assembling integer object X of size SIZE. */
2032
2033static bool
2034nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2035{
00e52418
NS
2036 HOST_WIDE_INT val = 0;
2037
2038 switch (GET_CODE (x))
738f2522 2039 {
00e52418 2040 default:
a9000e1e
NS
2041 /* Let the generic machinery figure it out, usually for a
2042 CONST_WIDE_INT. */
2043 return false;
00e52418
NS
2044
2045 case CONST_INT:
4d5438cd 2046 nvptx_assemble_value (INTVAL (x), size);
00e52418
NS
2047 break;
2048
2049 case CONST:
2050 x = XEXP (x, 0);
2051 gcc_assert (GET_CODE (x) == PLUS);
2052 val = INTVAL (XEXP (x, 1));
2053 x = XEXP (x, 0);
2054 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2055 /* FALLTHROUGH */
2056
2057 case SYMBOL_REF:
4d5438cd
NS
2058 gcc_assert (size == init_frag.size);
2059 if (init_frag.offset)
738f2522 2060 sorry ("cannot emit unaligned pointers in ptx assembly");
738f2522 2061
00e52418 2062 nvptx_maybe_record_fnsym (x);
4d5438cd
NS
2063 init_frag.val = val;
2064 output_init_frag (x);
738f2522 2065 break;
738f2522
BS
2066 }
2067
738f2522
BS
2068 return true;
2069}
2070
2071/* Output SIZE zero bytes. We ignore the FILE argument since the
2072 functions we're calling to perform the output just use
2073 asm_out_file. */
2074
2075void
2076nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2077{
4d5438cd
NS
2078 /* Finish the current fragment, if it's started. */
2079 if (init_frag.offset)
738f2522 2080 {
4d5438cd 2081 unsigned part = init_frag.size - init_frag.offset;
3c55d60f 2082 part = MIN (part, (unsigned)size);
4d5438cd
NS
2083 size -= part;
2084 nvptx_assemble_value (0, part);
738f2522
BS
2085 }
2086
4d5438cd
NS
2087 /* If this skip doesn't terminate the initializer, write as many
2088 remaining pieces as possible directly. */
2089 if (size < init_frag.remaining * init_frag.size)
738f2522 2090 {
4d5438cd
NS
2091 while (size >= init_frag.size)
2092 {
2093 size -= init_frag.size;
2094 output_init_frag (NULL_RTX);
2095 }
2096 if (size)
2097 nvptx_assemble_value (0, size);
738f2522 2098 }
738f2522
BS
2099}
2100
2101/* Output a string STR with length SIZE. As in nvptx_output_skip we
2102 ignore the FILE arg. */
2103
2104void
2105nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2106{
2107 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2108 nvptx_assemble_value (str[i], 1);
2109}
2110
bf398920
TV
2111/* Return true if TYPE is a record type where the last field is an array without
2112 given dimension. */
2113
2114static bool
2115flexible_array_member_type_p (const_tree type)
2116{
2117 if (TREE_CODE (type) != RECORD_TYPE)
2118 return false;
2119
2120 const_tree last_field = NULL_TREE;
2121 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2122 last_field = f;
2123
2124 if (!last_field)
2125 return false;
2126
2127 const_tree last_field_type = TREE_TYPE (last_field);
2128 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2129 return false;
2130
2131 return (! TYPE_DOMAIN (last_field_type)
2132 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2133}
2134
4ff3145a
NS
2135/* Emit a PTX variable decl and prepare for emission of its
2136 initializer. NAME is the symbol name and SETION the PTX data
2137 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2138 The caller has already emitted any indentation and linkage
2139 specifier. It is responsible for any initializer, terminating ;
2140 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2141 this is the opposite way round that PTX wants them! */
2142
2143static void
2144nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
bf398920
TV
2145 const_tree type, HOST_WIDE_INT size, unsigned align,
2146 bool undefined = false)
4ff3145a 2147{
59d2d238
CP
2148 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2149 && (TYPE_DOMAIN (type) == NULL_TREE);
2150
bf398920
TV
2151 if (undefined && flexible_array_member_type_p (type))
2152 {
2153 size = 0;
2154 atype = true;
2155 }
2156
4ff3145a
NS
2157 while (TREE_CODE (type) == ARRAY_TYPE)
2158 type = TREE_TYPE (type);
2159
fc0efeea
NS
2160 if (TREE_CODE (type) == VECTOR_TYPE
2161 || TREE_CODE (type) == COMPLEX_TYPE)
2162 /* Neither vector nor complex types can contain the other. */
2163 type = TREE_TYPE (type);
2164
4ff3145a 2165 unsigned elt_size = int_size_in_bytes (type);
fc0efeea
NS
2166
2167 /* Largest mode we're prepared to accept. For BLKmode types we
2168 don't know if it'll contain pointer constants, so have to choose
2169 pointer size, otherwise we can choose DImode. */
2170 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2171
2172 elt_size |= GET_MODE_SIZE (elt_mode);
2173 elt_size &= -elt_size; /* Extract LSB set. */
4ff3145a 2174
4d5438cd 2175 init_frag.size = elt_size;
9c582551 2176 /* Avoid undefined shift behavior by using '2'. */
4d5438cd
NS
2177 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2178 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2179 init_frag.val = 0;
2180 init_frag.offset = 0;
2181 init_frag.started = false;
2182 /* Size might not be a multiple of elt size, if there's an
2183 initialized trailing struct array with smaller type than
2184 elt_size. */
2185 init_frag.remaining = (size + elt_size - 1) / elt_size;
4ff3145a
NS
2186
2187 fprintf (file, "%s .align %d .u%d ",
2188 section, align / BITS_PER_UNIT,
2189 elt_size * BITS_PER_UNIT);
2190 assemble_name (file, name);
2191
2192 if (size)
2193 /* We make everything an array, to simplify any initialization
2194 emission. */
4d5438cd 2195 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
59d2d238
CP
2196 else if (atype)
2197 fprintf (file, "[]");
4ff3145a
NS
2198}
2199
738f2522
BS
2200/* Called when the initializer for a decl has been completely output through
2201 combinations of the three functions above. */
2202
2203static void
2204nvptx_assemble_decl_end (void)
2205{
4d5438cd
NS
2206 if (init_frag.offset)
2207 /* This can happen with a packed struct with trailing array member. */
2208 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2209 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2210}
2211
69823d76
NS
2212/* Output an uninitialized common or file-scope variable. */
2213
2214void
2215nvptx_output_aligned_decl (FILE *file, const char *name,
2216 const_tree decl, HOST_WIDE_INT size, unsigned align)
2217{
2218 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2219
2220 /* If this is public, it is common. The nearest thing we have to
2221 common is weak. */
4ff3145a
NS
2222 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2223
2224 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2225 TREE_TYPE (decl), size, align);
4d5438cd 2226 nvptx_assemble_decl_end ();
69823d76
NS
2227}
2228
738f2522
BS
2229/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2230 writing a constant variable EXP with NAME and SIZE and its
2231 initializer to FILE. */
2232
2233static void
2234nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 2235 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 2236{
4ff3145a
NS
2237 write_var_marker (file, true, false, name);
2238
2239 fprintf (file, "\t");
2240
738f2522 2241 tree type = TREE_TYPE (exp);
4ff3145a
NS
2242 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2243 TYPE_ALIGN (type));
738f2522
BS
2244}
2245
2246/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2247 a variable DECL with NAME to FILE. */
2248
2249void
2250nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2251{
4ff3145a 2252 write_var_marker (file, true, TREE_PUBLIC (decl), name);
9a863523 2253
4ff3145a
NS
2254 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2255 : DECL_WEAK (decl) ? ".weak " : ".visible "));
9a863523 2256
4ff3145a
NS
2257 tree type = TREE_TYPE (decl);
2258 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2259 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2260 type, obj_size, DECL_ALIGN (decl));
738f2522
BS
2261}
2262
/* Implement TARGET_ASM_GLOBALIZE_LABEL.  Both arguments are ignored
   and nothing is written.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
  /* Intentionally empty.  */
}
2269
2270/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2271 declaration only for variable DECL with NAME to FILE. */
f313d112 2272
738f2522
BS
2273static void
2274nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2275{
0a0f74aa
NS
2276 /* The middle end can place constant pool decls into the varpool as
2277 undefined. Until that is fixed, catch the problem here. */
2278 if (DECL_IN_CONSTANT_POOL (decl))
2279 return;
2280
721547cd
NS
2281 /* We support weak defintions, and hence have the right
2282 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2283 if (DECL_WEAK (decl))
2284 error_at (DECL_SOURCE_LOCATION (decl),
2285 "PTX does not support weak declarations"
2286 " (only weak definitions)");
69823d76
NS
2287 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2288
4ff3145a
NS
2289 fprintf (file, "\t.extern ");
2290 tree size = DECL_SIZE_UNIT (decl);
2291 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2292 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
bf398920 2293 DECL_ALIGN (decl), true);
1e5154e7 2294 nvptx_assemble_decl_end ();
738f2522
BS
2295}
2296
f313d112
NS
2297/* Output a pattern for a move instruction. */
2298
2299const char *
2300nvptx_output_mov_insn (rtx dst, rtx src)
2301{
2302 machine_mode dst_mode = GET_MODE (dst);
2303 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2304 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2305 machine_mode src_inner = (GET_CODE (src) == SUBREG
2306 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2307
15113b03
NS
2308 rtx sym = src;
2309 if (GET_CODE (sym) == CONST)
2310 sym = XEXP (XEXP (sym, 0), 0);
bd602b7f
NS
2311 if (SYMBOL_REF_P (sym))
2312 {
2313 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2314 return "%.\tcvta%D1%t0\t%0, %1;";
2315 nvptx_maybe_record_fnsym (sym);
2316 }
15113b03 2317
f313d112
NS
2318 if (src_inner == dst_inner)
2319 return "%.\tmov%t0\t%0, %1;";
2320
2321 if (CONSTANT_P (src))
2322 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2323 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2324 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2325
2326 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
3717fbe3
TV
2327 {
2328 if (GET_MODE_BITSIZE (dst_mode) == 128
2329 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2330 {
2331 /* mov.b128 is not supported. */
2332 if (dst_inner == V2DImode && src_inner == TImode)
2333 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2334 else if (dst_inner == TImode && src_inner == V2DImode)
2335 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2336
2337 gcc_unreachable ();
2338 }
2339 return "%.\tmov.b%T0\t%0, %1;";
2340 }
f313d112
NS
2341
2342 return "%.\tcvt%t0%t1\t%0, %1;";
2343}
2344
5012919d
AM
2345static void nvptx_print_operand (FILE *, rtx, int);
2346
738f2522 2347/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2348 involves writing .param declarations and in/out copies into them. For
2349 indirect calls, also write the .callprototype. */
738f2522
BS
2350
2351const char *
2352nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2353{
863af9a4 2354 char buf[16];
738f2522
BS
2355 static int labelno;
2356 bool needs_tgt = register_operand (callee, Pmode);
2357 rtx pat = PATTERN (insn);
5012919d
AM
2358 if (GET_CODE (pat) == COND_EXEC)
2359 pat = COND_EXEC_CODE (pat);
f324806d 2360 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2361 tree decl = NULL_TREE;
2362
2363 fprintf (asm_out_file, "\t{\n");
2364 if (result != NULL)
1f065954
NS
2365 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2366 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2367 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2368
ecf6e535 2369 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2370 if (GET_CODE (callee) == SYMBOL_REF)
2371 {
2372 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2373 if (!decl
2374 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2375 nvptx_record_libfunc (callee, result, pat);
2376 else if (DECL_EXTERNAL (decl))
738f2522
BS
2377 nvptx_record_fndecl (decl);
2378 }
2379
2380 if (needs_tgt)
2381 {
2382 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2383 labelno++;
2384 ASM_OUTPUT_LABEL (asm_out_file, buf);
2385 std::stringstream s;
b699adcc 2386 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2387 fputs (s.str().c_str(), asm_out_file);
2388 }
2389
863af9a4 2390 for (int argno = 1; argno < arg_end; argno++)
738f2522 2391 {
863af9a4 2392 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2393 machine_mode mode = GET_MODE (t);
a02d84b6 2394 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2395
863af9a4 2396 /* Mode splitting has already been done. */
a02d84b6
NS
2397 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2398 "\t\tst.param%s [%%out_arg%d], ",
2399 ptx_type, argno, ptx_type, argno);
2400 output_reg (asm_out_file, REGNO (t), VOIDmode);
2401 fprintf (asm_out_file, ";\n");
738f2522
BS
2402 }
2403
5012919d
AM
2404 /* The '.' stands for the call's predicate, if any. */
2405 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2406 fprintf (asm_out_file, "\t\tcall ");
2407 if (result != NULL_RTX)
1f065954
NS
2408 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2409
738f2522
BS
2410 if (decl)
2411 {
2412 const char *name = get_fnname_from_decl (decl);
b4346107 2413 name = nvptx_name_replacement (name);
738f2522
BS
2414 assemble_name (asm_out_file, name);
2415 }
2416 else
cc8ca59e 2417 output_address (VOIDmode, callee);
738f2522 2418
863af9a4
NS
2419 const char *open = "(";
2420 for (int argno = 1; argno < arg_end; argno++)
738f2522 2421 {
863af9a4
NS
2422 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2423 open = "";
738f2522 2424 }
863af9a4
NS
2425 if (decl && DECL_STATIC_CHAIN (decl))
2426 {
5563d5c0 2427 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2428 open = "";
2429 }
2430 if (!open[0])
2431 fprintf (asm_out_file, ")");
f324806d 2432
738f2522
BS
2433 if (needs_tgt)
2434 {
2435 fprintf (asm_out_file, ", ");
2436 assemble_name (asm_out_file, buf);
2437 }
2438 fprintf (asm_out_file, ";\n");
738f2522 2439
51baf85a 2440 if (find_reg_note (insn, REG_NORETURN, NULL))
fb202364
TV
2441 {
2442 /* No return functions confuse the PTX JIT, as it doesn't realize
2443 the flow control barrier they imply. It can seg fault if it
2444 encounters what looks like an unexitable loop. Emit a trailing
2445 trap and exit, which it does grok. */
2446 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2447 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2448 }
51baf85a 2449
1f065954
NS
2450 if (result)
2451 {
2452 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2453
2454 if (!rval[0])
2455 /* We must escape the '%' that starts RETURN_REGNUM. */
2456 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2457 reg_names[NVPTX_RETURN_REGNUM]);
2458 return rval;
2459 }
2460
2461 return "}";
738f2522
BS
2462}
2463
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  The only punctuation
   codes accepted are '.' and '#'.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;

    default:
      return false;
    }
}
2471
738f2522
BS
2472/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2473
2474static void
2475nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2476{
2477 rtx off;
2478 if (GET_CODE (x) == CONST)
2479 x = XEXP (x, 0);
2480 switch (GET_CODE (x))
2481 {
2482 case PLUS:
2483 off = XEXP (x, 1);
cc8ca59e 2484 output_address (VOIDmode, XEXP (x, 0));
738f2522 2485 fprintf (file, "+");
cc8ca59e 2486 output_address (VOIDmode, off);
738f2522
BS
2487 break;
2488
2489 case SYMBOL_REF:
2490 case LABEL_REF:
2491 output_addr_const (file, x);
2492 break;
2493
2494 default:
2495 gcc_assert (GET_CODE (x) != MEM);
2496 nvptx_print_operand (file, x, 0);
2497 break;
2498 }
2499}
2500
2501/* Write assembly language output for the address ADDR to FILE. */
2502
2503static void
cc8ca59e 2504nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 2505{
cc8ca59e 2506 nvptx_print_address_operand (file, addr, mode);
738f2522
BS
2507}
2508
2509/* Print an operand, X, to FILE, with an optional modifier in CODE.
2510
2511 Meaning of CODE:
2512 . -- print the predicate for the instruction or an emptry string for an
2513 unconditional one.
2514 # -- print a rounding mode for the instruction
2515
9a863523 2516 A -- print a data area for a MEM
738f2522 2517 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2518 D -- print a data area for a MEM operand
d88cd9c4 2519 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
2520 t -- print a type opcode suffix, promoting QImode to 32 bits
2521 T -- print a type size in bits
2522 u -- print a type opcode suffix without promotions. */
2523
2524static void
2525nvptx_print_operand (FILE *file, rtx x, int code)
2526{
738f2522
BS
2527 if (code == '.')
2528 {
2529 x = current_insn_predicate;
2530 if (x)
2531 {
5012919d 2532 fputs ("@", file);
738f2522
BS
2533 if (GET_CODE (x) == EQ)
2534 fputs ("!", file);
5012919d 2535 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2536 }
2537 return;
2538 }
2539 else if (code == '#')
2540 {
2541 fputs (".rn", file);
2542 return;
2543 }
2544
2545 enum rtx_code x_code = GET_CODE (x);
f313d112 2546 machine_mode mode = GET_MODE (x);
738f2522
BS
2547
2548 switch (code)
2549 {
2550 case 'A':
9a863523
NS
2551 x = XEXP (x, 0);
2552 /* FALLTHROUGH. */
7b8edc29 2553
9a863523
NS
2554 case 'D':
2555 if (GET_CODE (x) == CONST)
2556 x = XEXP (x, 0);
2557 if (GET_CODE (x) == PLUS)
2558 x = XEXP (x, 0);
7b8edc29 2559
9a863523
NS
2560 if (GET_CODE (x) == SYMBOL_REF)
2561 fputs (section_for_sym (x), file);
738f2522
BS
2562 break;
2563
738f2522 2564 case 't':
738f2522 2565 case 'u':
f313d112
NS
2566 if (x_code == SUBREG)
2567 {
1310ff03
TV
2568 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2569 if (VECTOR_MODE_P (inner_mode)
2570 && (GET_MODE_SIZE (mode)
2571 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2572 mode = GET_MODE_INNER (inner_mode);
2573 else if (split_mode_p (inner_mode))
2574 mode = maybe_split_mode (inner_mode);
2575 else
2576 mode = inner_mode;
f313d112
NS
2577 }
2578 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2579 break;
2580
3717fbe3
TV
2581 case 'H':
2582 case 'L':
2583 {
2584 rtx inner_x = SUBREG_REG (x);
2585 machine_mode inner_mode = GET_MODE (inner_x);
2586 machine_mode split = maybe_split_mode (inner_mode);
2587
2588 output_reg (file, REGNO (inner_x), split,
2589 (code == 'H'
2590 ? GET_MODE_SIZE (inner_mode) / 2
2591 : 0));
2592 }
2593 break;
2594
d88cd9c4
NS
2595 case 'S':
2596 {
59263259
NS
2597 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2598 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2599 static const char *const kinds[] =
59263259
NS
2600 {".up", ".down", ".bfly", ".idx"};
2601 fputs (kinds[kind], file);
d88cd9c4
NS
2602 }
2603 break;
2604
738f2522 2605 case 'T':
f313d112 2606 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2607 break;
2608
2609 case 'j':
2610 fprintf (file, "@");
2611 goto common;
2612
2613 case 'J':
2614 fprintf (file, "@!");
2615 goto common;
2616
2617 case 'c':
f313d112 2618 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2619 switch (x_code)
2620 {
2621 case EQ:
2622 fputs (".eq", file);
2623 break;
2624 case NE:
f313d112 2625 if (FLOAT_MODE_P (mode))
738f2522
BS
2626 fputs (".neu", file);
2627 else
2628 fputs (".ne", file);
2629 break;
2630 case LE:
578fb225 2631 case LEU:
738f2522
BS
2632 fputs (".le", file);
2633 break;
2634 case GE:
578fb225 2635 case GEU:
738f2522
BS
2636 fputs (".ge", file);
2637 break;
2638 case LT:
578fb225 2639 case LTU:
738f2522
BS
2640 fputs (".lt", file);
2641 break;
2642 case GT:
738f2522 2643 case GTU:
578fb225 2644 fputs (".gt", file);
738f2522
BS
2645 break;
2646 case LTGT:
2647 fputs (".ne", file);
2648 break;
2649 case UNEQ:
2650 fputs (".equ", file);
2651 break;
2652 case UNLE:
2653 fputs (".leu", file);
2654 break;
2655 case UNGE:
2656 fputs (".geu", file);
2657 break;
2658 case UNLT:
2659 fputs (".ltu", file);
2660 break;
2661 case UNGT:
2662 fputs (".gtu", file);
2663 break;
2664 case UNORDERED:
2665 fputs (".nan", file);
2666 break;
2667 case ORDERED:
2668 fputs (".num", file);
2669 break;
2670 default:
2671 gcc_unreachable ();
2672 }
f313d112 2673 if (FLOAT_MODE_P (mode)
738f2522
BS
2674 || x_code == EQ || x_code == NE
2675 || x_code == GEU || x_code == GTU
2676 || x_code == LEU || x_code == LTU)
f313d112 2677 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2678 else
f313d112 2679 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2680 break;
2681 default:
2682 common:
2683 switch (x_code)
2684 {
2685 case SUBREG:
f313d112
NS
2686 {
2687 rtx inner_x = SUBREG_REG (x);
2688 machine_mode inner_mode = GET_MODE (inner_x);
2689 machine_mode split = maybe_split_mode (inner_mode);
2690
1310ff03
TV
2691 if (VECTOR_MODE_P (inner_mode)
2692 && (GET_MODE_SIZE (mode)
2693 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2694 {
2695 output_reg (file, REGNO (inner_x), VOIDmode);
2696 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2697 }
2698 else if (split_mode_p (inner_mode)
f313d112
NS
2699 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2700 output_reg (file, REGNO (inner_x), split);
2701 else
2702 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2703 }
2704 break;
738f2522
BS
2705
2706 case REG:
f313d112 2707 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2708 break;
2709
2710 case MEM:
2711 fputc ('[', file);
f313d112 2712 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2713 fputc (']', file);
2714 break;
2715
2716 case CONST_INT:
2717 output_addr_const (file, x);
2718 break;
2719
2720 case CONST:
2721 case SYMBOL_REF:
2722 case LABEL_REF:
2723 /* We could use output_addr_const, but that can print things like
2724 "x-8", which breaks ptxas. Need to ensure it is output as
2725 "x+-8". */
2726 nvptx_print_address_operand (file, x, VOIDmode);
2727 break;
2728
2729 case CONST_DOUBLE:
2730 long vals[2];
f313d112 2731 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2732 vals[0] &= 0xffffffff;
2733 vals[1] &= 0xffffffff;
f313d112 2734 if (mode == SFmode)
738f2522
BS
2735 fprintf (file, "0f%08lx", vals[0]);
2736 else
2737 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2738 break;
2739
1310ff03
TV
2740 case CONST_VECTOR:
2741 {
2742 unsigned n = CONST_VECTOR_NUNITS (x);
2743 fprintf (file, "{ ");
2744 for (unsigned i = 0; i < n; ++i)
2745 {
2746 if (i != 0)
2747 fprintf (file, ", ");
2748
2749 rtx elem = CONST_VECTOR_ELT (x, i);
2750 output_addr_const (file, elem);
2751 }
2752 fprintf (file, " }");
2753 }
2754 break;
2755
738f2522
BS
2756 default:
2757 output_addr_const (file, x);
2758 }
2759 }
2760}
2761\f
2762/* Record replacement regs used to deal with subreg operands. */
2763struct reg_replace
2764{
2765 rtx replacement[MAX_RECOG_OPERANDS];
2766 machine_mode mode;
2767 int n_allocated;
2768 int n_in_use;
2769};
2770
2771/* Allocate or reuse a replacement in R and return the rtx. */
2772
2773static rtx
2774get_replacement (struct reg_replace *r)
2775{
2776 if (r->n_allocated == r->n_in_use)
2777 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2778 return r->replacement[r->n_in_use++];
2779}
2780
2781/* Clean up subreg operands. In ptx assembly, everything is typed, and
2782 the presence of subregs would break the rules for most instructions.
2783 Replace them with a suitable new register of the right size, plus
2784 conversion copyin/copyout instructions. */
2785
2786static void
517665b3 2787nvptx_reorg_subreg (void)
738f2522
BS
2788{
2789 struct reg_replace qiregs, hiregs, siregs, diregs;
2790 rtx_insn *insn, *next;
2791
738f2522
BS
2792 qiregs.n_allocated = 0;
2793 hiregs.n_allocated = 0;
2794 siregs.n_allocated = 0;
2795 diregs.n_allocated = 0;
2796 qiregs.mode = QImode;
2797 hiregs.mode = HImode;
2798 siregs.mode = SImode;
2799 diregs.mode = DImode;
2800
2801 for (insn = get_insns (); insn; insn = next)
2802 {
2803 next = NEXT_INSN (insn);
2804 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2805 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2806 || GET_CODE (PATTERN (insn)) == USE
2807 || GET_CODE (PATTERN (insn)) == CLOBBER)
2808 continue;
f324806d 2809
738f2522
BS
2810 qiregs.n_in_use = 0;
2811 hiregs.n_in_use = 0;
2812 siregs.n_in_use = 0;
2813 diregs.n_in_use = 0;
2814 extract_insn (insn);
2815 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2816
738f2522
BS
2817 for (int i = 0; i < recog_data.n_operands; i++)
2818 {
2819 rtx op = recog_data.operand[i];
2820 if (GET_CODE (op) != SUBREG)
2821 continue;
2822
2823 rtx inner = SUBREG_REG (op);
2824
2825 machine_mode outer_mode = GET_MODE (op);
2826 machine_mode inner_mode = GET_MODE (inner);
2827 gcc_assert (s_ok);
2828 if (s_ok
2829 && (GET_MODE_PRECISION (inner_mode)
2830 >= GET_MODE_PRECISION (outer_mode)))
2831 continue;
2832 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2833 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2834 : outer_mode == HImode ? &hiregs
2835 : outer_mode == SImode ? &siregs
2836 : &diregs);
2837 rtx new_reg = get_replacement (r);
2838
2839 if (recog_data.operand_type[i] != OP_OUT)
2840 {
2841 enum rtx_code code;
2842 if (GET_MODE_PRECISION (inner_mode)
2843 < GET_MODE_PRECISION (outer_mode))
2844 code = ZERO_EXTEND;
2845 else
2846 code = TRUNCATE;
2847
f7df4a84 2848 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2849 gen_rtx_fmt_e (code, outer_mode, inner));
2850 emit_insn_before (pat, insn);
2851 }
2852
2853 if (recog_data.operand_type[i] != OP_IN)
2854 {
2855 enum rtx_code code;
2856 if (GET_MODE_PRECISION (inner_mode)
2857 < GET_MODE_PRECISION (outer_mode))
2858 code = TRUNCATE;
2859 else
2860 code = ZERO_EXTEND;
2861
f7df4a84 2862 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2863 gen_rtx_fmt_e (code, inner_mode, new_reg));
2864 emit_insn_after (pat, insn);
2865 }
2866 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2867 }
2868 }
517665b3 2869}
738f2522 2870
5012919d
AM
2871/* Return a SImode "master lane index" register for uniform-simt, allocating on
2872 first use. */
2873
2874static rtx
2875nvptx_get_unisimt_master ()
2876{
2877 rtx &master = cfun->machine->unisimt_master;
2878 return master ? master : master = gen_reg_rtx (SImode);
2879}
2880
2881/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2882
2883static rtx
2884nvptx_get_unisimt_predicate ()
2885{
2886 rtx &pred = cfun->machine->unisimt_predicate;
2887 return pred ? pred : pred = gen_reg_rtx (BImode);
2888}
2889
2890/* Return true if given call insn references one of the functions provided by
2891 the CUDA runtime: malloc, free, vprintf. */
2892
2893static bool
2894nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2895{
2896 rtx pat = PATTERN (insn);
2897 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2898 pat = XVECEXP (pat, 0, 0);
2899 if (GET_CODE (pat) == SET)
2900 pat = SET_SRC (pat);
2901 gcc_checking_assert (GET_CODE (pat) == CALL
2902 && GET_CODE (XEXP (pat, 0)) == MEM);
2903 rtx addr = XEXP (XEXP (pat, 0), 0);
2904 if (GET_CODE (addr) != SYMBOL_REF)
2905 return false;
2906 const char *name = XSTR (addr, 0);
2907 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2908 references with forced assembler name refer to PTX syscalls. For vprintf,
2909 accept both normal and forced-assembler-name references. */
2910 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2911 || !strcmp (name, "*malloc")
2912 || !strcmp (name, "*free"));
2913}
2914
2915/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2916 propagate its value from lane MASTER to current lane. */
2917
2918static void
2919nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2920{
2921 rtx reg;
2922 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2923 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2924}
2925
2926/* Adjust code for uniform-simt code generation variant by making atomics and
2927 "syscalls" conditionally executed, and inserting shuffle-based propagation
2928 for registers being set. */
2929
2930static void
2931nvptx_reorg_uniform_simt ()
2932{
2933 rtx_insn *insn, *next;
2934
2935 for (insn = get_insns (); insn; insn = next)
2936 {
2937 next = NEXT_INSN (insn);
2938 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2939 && !(NONJUMP_INSN_P (insn)
2940 && GET_CODE (PATTERN (insn)) == PARALLEL
2941 && get_attr_atomic (insn)))
2942 continue;
2943 rtx pat = PATTERN (insn);
2944 rtx master = nvptx_get_unisimt_master ();
2945 for (int i = 0; i < XVECLEN (pat, 0); i++)
2946 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2947 rtx pred = nvptx_get_unisimt_predicate ();
2948 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2949 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2950 validate_change (insn, &PATTERN (insn), pat, false);
2951 }
2952}
2953
2498a6ab
TV
/* Offloading function attributes, as filled in by
   populate_offload_attrs.  */

struct offload_attrs
{
  /* NOTE(review): presumably the mask of partitioning axes in use;
     its consumers are outside this part of the file -- confirm.  */
  unsigned mask;

  /* NOTE(review): presumably the requested number of gangs -- confirm.  */
  int num_gangs;

  /* Number of workers.  init_axis_dim treats zero as "unspecified" and
     derives a maximum from PTX_CTA_SIZE and VECTOR_LENGTH instead.  */
  int num_workers;

  /* Vector length; recorded as the MACH_VECTOR_LENGTH axis.  */
  int vector_length;
};
2963
0d06e8c7
TV
2964/* Define entries for cfun->machine->axis_dim. */
2965
2966#define MACH_VECTOR_LENGTH 0
2967#define MACH_MAX_WORKERS 1
2968
2969static void populate_offload_attrs (offload_attrs *oa);
2970
2971static void
2972init_axis_dim (void)
2973{
2974 offload_attrs oa;
2975 int max_workers;
2976
2977 populate_offload_attrs (&oa);
2978
2979 if (oa.num_workers == 0)
2980 max_workers = PTX_CTA_SIZE / oa.vector_length;
2981 else
2982 max_workers = oa.num_workers;
2983
2984 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2985 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2986 cfun->machine->axis_dim_init_p = true;
2987}
2988
2989static int ATTRIBUTE_UNUSED
2990nvptx_mach_max_workers ()
2991{
2992 if (!cfun->machine->axis_dim_init_p)
2993 init_axis_dim ();
2994 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2995}
2996
2997static int ATTRIBUTE_UNUSED
2998nvptx_mach_vector_length ()
2999{
3000 if (!cfun->machine->axis_dim_init_p)
3001 init_axis_dim ();
3002 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3003}
3004
d2d47a28
NS
3005/* Loop structure of the function. The entire function is described as
3006 a NULL loop. */
d88cd9c4
NS
3007
3008struct parallel
3009{
3010 /* Parent parallel. */
3011 parallel *parent;
3012
3013 /* Next sibling parallel. */
3014 parallel *next;
3015
3016 /* First child parallel. */
3017 parallel *inner;
3018
3019 /* Partitioning mask of the parallel. */
3020 unsigned mask;
3021
3022 /* Partitioning used within inner parallels. */
3023 unsigned inner_mask;
3024
3025 /* Location of parallel forked and join. The forked is the first
3026 block in the parallel and the join is the first block after of
3027 the partition. */
3028 basic_block forked_block;
3029 basic_block join_block;
3030
3031 rtx_insn *forked_insn;
3032 rtx_insn *join_insn;
3033
3034 rtx_insn *fork_insn;
3035 rtx_insn *joining_insn;
3036
3037 /* Basic blocks in this parallel, but not in child parallels. The
3038 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3039 blocks are not. */
3040 auto_vec<basic_block> blocks;
3041
3042public:
3043 parallel (parallel *parent, unsigned mode);
3044 ~parallel ();
3045};
3046
3047/* Constructor links the new parallel into it's parent's chain of
3048 children. */
3049
3050parallel::parallel (parallel *parent_, unsigned mask_)
3051 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3052{
3053 forked_block = join_block = 0;
3054 forked_insn = join_insn = 0;
3055 fork_insn = joining_insn = 0;
3056
3057 if (parent)
3058 {
3059 next = parent->inner;
3060 parent->inner = this;
3061 }
3062}
3063
3064parallel::~parallel ()
3065{
3066 delete inner;
3067 delete next;
3068}
3069
3070/* Map of basic blocks to insns */
3071typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3072
3073/* A tuple of an insn of interest and the BB in which it resides. */
3074typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3075typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3076
3077/* Split basic blocks such that each forked and join unspecs are at
3078 the start of their basic blocks. Thus afterwards each block will
3079 have a single partitioning mode. We also do the same for return
3080 insns, as they are executed by every thread. Return the
3081 partitioning mode of the function as a whole. Populate MAP with
3082 head and tail blocks. We also clear the BB visited flag, which is
3083 used when finding partitions. */
3084
3085static void
3086nvptx_split_blocks (bb_insn_map_t *map)
3087{
3088 insn_bb_vec_t worklist;
3089 basic_block block;
3090 rtx_insn *insn;
3091
3092 /* Locate all the reorg instructions of interest. */
3093 FOR_ALL_BB_FN (block, cfun)
3094 {
3095 bool seen_insn = false;
3096
3097 /* Clear visited flag, for use by parallel locator */
3098 block->flags &= ~BB_VISITED;
3099
3100 FOR_BB_INSNS (block, insn)
3101 {
3102 if (!INSN_P (insn))
3103 continue;
3104 switch (recog_memoized (insn))
3105 {
3106 default:
3107 seen_insn = true;
3108 continue;
3109 case CODE_FOR_nvptx_forked:
3110 case CODE_FOR_nvptx_join:
3111 break;
3112
3113 case CODE_FOR_return:
3114 /* We also need to split just before return insns, as
3115 that insn needs executing by all threads, but the
3116 block it is in probably does not. */
3117 break;
3118 }
3119
3120 if (seen_insn)
3121 /* We've found an instruction that must be at the start of
3122 a block, but isn't. Add it to the worklist. */
3123 worklist.safe_push (insn_bb_t (insn, block));
3124 else
3125 /* It was already the first instruction. Just add it to
3126 the map. */
3127 map->get_or_insert (block) = insn;
3128 seen_insn = true;
3129 }
3130 }
3131
3132 /* Split blocks on the worklist. */
3133 unsigned ix;
3134 insn_bb_t *elt;
3135 basic_block remap = 0;
3136 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3137 {
3138 if (remap != elt->second)
3139 {
3140 block = elt->second;
3141 remap = block;
3142 }
3143
3144 /* Split block before insn. The insn is in the new block */
3145 edge e = split_block (block, PREV_INSN (elt->first));
3146
3147 block = e->dest;
3148 map->get_or_insert (block) = elt->first;
3149 }
3150}
3151
0024c320
TV
3152/* Return true if MASK contains parallelism that requires shared
3153 memory to broadcast. */
3154
3155static bool
3156nvptx_needs_shared_bcast (unsigned mask)
3157{
3158 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3159 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3160 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3161
3162 return worker || large_vector;
3163}
3164
d88cd9c4
NS
3165/* BLOCK is a basic block containing a head or tail instruction.
3166 Locate the associated prehead or pretail instruction, which must be
3167 in the single predecessor block. */
3168
3169static rtx_insn *
3170nvptx_discover_pre (basic_block block, int expected)
3171{
3172 gcc_assert (block->preds->length () == 1);
3173 basic_block pre_block = (*block->preds)[0]->src;
3174 rtx_insn *pre_insn;
3175
3176 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3177 pre_insn = PREV_INSN (pre_insn))
3178 gcc_assert (pre_insn != BB_HEAD (pre_block));
3179
3180 gcc_assert (recog_memoized (pre_insn) == expected);
3181 return pre_insn;
3182}
3183
3184/* Dump this parallel and all its inner parallels. */
3185
3186static void
3187nvptx_dump_pars (parallel *par, unsigned depth)
3188{
3189 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3190 depth, par->mask,
3191 par->forked_block ? par->forked_block->index : -1,
3192 par->join_block ? par->join_block->index : -1);
3193
3194 fprintf (dump_file, " blocks:");
3195
3196 basic_block block;
3197 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3198 fprintf (dump_file, " %d", block->index);
3199 fprintf (dump_file, "\n");
3200 if (par->inner)
3201 nvptx_dump_pars (par->inner, depth + 1);
3202
3203 if (par->next)
3204 nvptx_dump_pars (par->next, depth);
3205}
3206
3207/* If BLOCK contains a fork/join marker, process it to create or
3208 terminate a loop structure. Add this block to the current loop,
3209 and then walk successor blocks. */
3210
3211static parallel *
3212nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3213{
3214 if (block->flags & BB_VISITED)
3215 return par;
3216 block->flags |= BB_VISITED;
3217
3218 if (rtx_insn **endp = map->get (block))
3219 {
3220 rtx_insn *end = *endp;
3221
3222 /* This is a block head or tail, or return instruction. */
3223 switch (recog_memoized (end))
3224 {
3225 case CODE_FOR_return:
3226 /* Return instructions are in their own block, and we
3227 don't need to do anything more. */
3228 return par;
3229
3230 case CODE_FOR_nvptx_forked:
3231 /* Loop head, create a new inner loop and add it into
3232 our parent's child list. */
3233 {
3234 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3235
3236 gcc_assert (mask);
3237 par = new parallel (par, mask);
3238 par->forked_block = block;
3239 par->forked_insn = end;
0024c320 3240 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
3241 par->fork_insn
3242 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3243 }
3244 break;
3245
3246 case CODE_FOR_nvptx_join:
3247 /* A loop tail. Finish the current loop and return to
3248 parent. */
3249 {
3250 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3251
3252 gcc_assert (par->mask == mask);
3253 par->join_block = block;
3254 par->join_insn = end;
0024c320 3255 if (nvptx_needs_shared_bcast (mask))
d88cd9c4
NS
3256 par->joining_insn
3257 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3258 par = par->parent;
3259 }
3260 break;
3261
3262 default:
3263 gcc_unreachable ();
3264 }
3265 }
3266
3267 if (par)
3268 /* Add this block onto the current loop's list of blocks. */
3269 par->blocks.safe_push (block);
3270 else
3271 /* This must be the entry block. Create a NULL parallel. */
3272 par = new parallel (0, 0);
3273
3274 /* Walk successor blocks. */
3275 edge e;
3276 edge_iterator ei;
3277
3278 FOR_EACH_EDGE (e, ei, block->succs)
3279 nvptx_find_par (map, par, e->dest);
3280
3281 return par;
3282}
3283
3284/* DFS walk the CFG looking for fork & join markers. Construct
3285 loop structures as we go. MAP is a mapping of basic blocks
3286 to head & tail markers, discovered when splitting blocks. This
3287 speeds up the discovery. We rely on the BB visited flag having
3288 been cleared when splitting blocks. */
3289
3290static parallel *
3291nvptx_discover_pars (bb_insn_map_t *map)
3292{
3293 basic_block block;
3294
3295 /* Mark exit blocks as visited. */
3296 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3297 block->flags |= BB_VISITED;
3298
3299 /* And entry block as not. */
3300 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3301 block->flags &= ~BB_VISITED;
3302
3303 parallel *par = nvptx_find_par (map, 0, block);
3304
3305 if (dump_file)
3306 {
3307 fprintf (dump_file, "\nLoops\n");
3308 nvptx_dump_pars (par, 0);
3309 fprintf (dump_file, "\n");
3310 }
3311
3312 return par;
3313}
3314
912442c2
NS
3315/* Analyse a group of BBs within a partitioned region and create N
3316 Single-Entry-Single-Exit regions. Some of those regions will be
3317 trivial ones consisting of a single BB. The blocks of a
3318 partitioned region might form a set of disjoint graphs -- because
3319 the region encloses a differently partitioned sub region.
3320
3321 We use the linear time algorithm described in 'Finding Regions Fast:
3322 Single Entry Single Exit and control Regions in Linear Time'
3323 Johnson, Pearson & Pingali. That algorithm deals with complete
3324 CFGs, where a back edge is inserted from END to START, and thus the
3325 problem becomes one of finding equivalent loops.
3326
3327 In this case we have a partial CFG. We complete it by redirecting
3328 any incoming edge to the graph to be from an arbitrary external BB,
3329 and similarly redirecting any outgoing edge to be to that BB.
3330 Thus we end up with a closed graph.
3331
3332 The algorithm works by building a spanning tree of an undirected
3333 graph and keeping track of back edges from nodes further from the
3334 root in the tree to nodes nearer to the root in the tree. In the
3335 description below, the root is up and the tree grows downwards.
3336
3337 We avoid having to deal with degenerate back-edges to the same
3338 block, by splitting each BB into 3 -- one for input edges, one for
3339 the node itself and one for the output edges. Such back edges are
3340 referred to as 'Brackets'. Cycle equivalent nodes will have the
3341 same set of brackets.
3342
3343 Determining bracket equivalency is done by maintaining a list of
3344 brackets in such a manner that the list length and final bracket
3345 uniquely identify the set.
3346
3347 We use coloring to mark all BBs with cycle equivalency with the
3348 same color. This is the output of the 'Finding Regions Fast'
3349 algorithm. Notice it doesn't actually find the set of nodes within
3350 a particular region, just unordered sets of nodes that are the
3351 entries and exits of SESE regions.
3352
3353 After determining cycle equivalency, we need to find the minimal
3354 set of SESE regions. Do this with a DFS coloring walk of the
3355 complete graph. We're either 'looking' or 'coloring'. When
3356 looking, and we're in the subgraph, we start coloring the color of
3357 the current node, and remember that node as the start of the
3358 current color's SESE region. Every time we go to a new node, we
3359 decrement the count of nodes with that color. If it reaches zero,
3360 we remember that node as the end of the current color's SESE region
3361 and return to 'looking'. Otherwise we color the node the current
3362 color.
3363
3364 This way we end up with coloring the inside of non-trivial SESE
3365 regions with the color of that region. */
3366
3367/* A pair of BBs. We use this to represent SESE regions. */
3368typedef std::pair<basic_block, basic_block> bb_pair_t;
3369typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3370
3371/* A node in the undirected CFG. The discriminator SECOND indicates just
3372 above or just below the BB idicated by FIRST. */
3373typedef std::pair<basic_block, int> pseudo_node_t;
3374
3375/* A bracket indicates an edge towards the root of the spanning tree of the
3376 undirected graph. Each bracket has a color, determined
3377 from the currrent set of brackets. */
3378struct bracket
3379{
3380 pseudo_node_t back; /* Back target */
3381
3382 /* Current color and size of set. */
3383 unsigned color;
3384 unsigned size;
3385
3386 bracket (pseudo_node_t back_)
3387 : back (back_), color (~0u), size (~0u)
3388 {
3389 }
3390
3391 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3392 {
3393 if (length != size)
3394 {
3395 size = length;
3396 color = color_counts.length ();
3397 color_counts.quick_push (0);
3398 }
3399 color_counts[color]++;
3400 return color;
3401 }
3402};
3403
3404typedef auto_vec<bracket> bracket_vec_t;
3405
3406/* Basic block info for finding SESE regions. */
3407
3408struct bb_sese
3409{
3410 int node; /* Node number in spanning tree. */
3411 int parent; /* Parent node number. */
3412
3413 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3414 edges arrive at pseudo-node Ai and the outgoing edges leave at
3415 pseudo-node Ao. We have to remember which way we arrived at a
3416 particular node when generating the spanning tree. dir > 0 means
3417 we arrived at Ai, dir < 0 means we arrived at Ao. */
3418 int dir;
3419
3420 /* Lowest numbered pseudo-node reached via a backedge from thsis
3421 node, or any descendant. */
3422 pseudo_node_t high;
3423
3424 int color; /* Cycle-equivalence color */
3425
3426 /* Stack of brackets for this node. */
3427 bracket_vec_t brackets;
3428
3429 bb_sese (unsigned node_, unsigned p, int dir_)
3430 :node (node_), parent (p), dir (dir_)
3431 {
3432 }
3433 ~bb_sese ();
3434
3435 /* Push a bracket ending at BACK. */
3436 void push (const pseudo_node_t &back)
3437 {
3438 if (dump_file)
3439 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3440 back.first ? back.first->index : 0, back.second);
3441 brackets.safe_push (bracket (back));
3442 }
3443
3444 void append (bb_sese *child);
3445 void remove (const pseudo_node_t &);
3446
3447 /* Set node's color. */
3448 void set_color (auto_vec<unsigned> &color_counts)
3449 {
3450 color = brackets.last ().get_color (color_counts, brackets.length ());
3451 }
3452};
3453
3454bb_sese::~bb_sese ()
3455{
3456}
3457
3458/* Destructively append CHILD's brackets. */
3459
3460void
3461bb_sese::append (bb_sese *child)
3462{
3463 if (int len = child->brackets.length ())
3464 {
3465 int ix;
3466
3467 if (dump_file)
3468 {
3469 for (ix = 0; ix < len; ix++)
3470 {
3471 const pseudo_node_t &pseudo = child->brackets[ix].back;
3472 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3473 child->node, pseudo.first ? pseudo.first->index : 0,
3474 pseudo.second);
3475 }
3476 }
3477 if (!brackets.length ())
3478 std::swap (brackets, child->brackets);
3479 else
3480 {
3481 brackets.reserve (len);
3482 for (ix = 0; ix < len; ix++)
3483 brackets.quick_push (child->brackets[ix]);
3484 }
3485 }
3486}
3487
3488/* Remove brackets that terminate at PSEUDO. */
3489
3490void
3491bb_sese::remove (const pseudo_node_t &pseudo)
3492{
3493 unsigned removed = 0;
3494 int len = brackets.length ();
3495
3496 for (int ix = 0; ix < len; ix++)
3497 {
3498 if (brackets[ix].back == pseudo)
3499 {
3500 if (dump_file)
3501 fprintf (dump_file, "Removing backedge %d:%+d\n",
3502 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3503 removed++;
3504 }
3505 else if (removed)
3506 brackets[ix-removed] = brackets[ix];
3507 }
3508 while (removed--)
3509 brackets.pop ();
3510}
3511
/* Accessors for a BB's aux pointer, which holds the block's bb_sese
   (or NULL) while SESE regions are being computed.  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3515
3516/* DFS walk creating SESE data structures. Only cover nodes with
3517 BB_VISITED set. Append discovered blocks to LIST. We number in
3518 increments of 3 so that the above and below pseudo nodes can be
3519 implicitly numbered too. */
3520
3521static int
3522nvptx_sese_number (int n, int p, int dir, basic_block b,
3523 auto_vec<basic_block> *list)
3524{
3525 if (BB_GET_SESE (b))
3526 return n;
3527
3528 if (dump_file)
3529 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3530 b->index, n, p, dir);
3531
3532 BB_SET_SESE (b, new bb_sese (n, p, dir));
3533 p = n;
3534
3535 n += 3;
3536 list->quick_push (b);
3537
3538 /* First walk the nodes on the 'other side' of this node, then walk
3539 the nodes on the same side. */
3540 for (unsigned ix = 2; ix; ix--)
3541 {
3542 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3543 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3544 : offsetof (edge_def, src));
3545 edge e;
3546 edge_iterator (ei);
3547
3548 FOR_EACH_EDGE (e, ei, edges)
3549 {
3550 basic_block target = *(basic_block *)((char *)e + offset);
3551
3552 if (target->flags & BB_VISITED)
3553 n = nvptx_sese_number (n, p, dir, target, list);
3554 }
3555 dir = -dir;
3556 }
3557 return n;
3558}
3559
3560/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3561 EDGES are the outgoing edges and OFFSET is the offset to the src
3562 or dst block on the edges. */
3563
3564static void
3565nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3566 vec<edge, va_gc> *edges, size_t offset)
3567{
3568 edge e;
3569 edge_iterator (ei);
3570 int hi_back = depth;
3571 pseudo_node_t node_back (0, depth);
3572 int hi_child = depth;
3573 pseudo_node_t node_child (0, depth);
3574 basic_block child = NULL;
3575 unsigned num_children = 0;
3576 int usd = -dir * sese->dir;
3577
3578 if (dump_file)
3579 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3580 me->index, sese->node, dir);
3581
3582 if (dir < 0)
3583 {
3584 /* This is the above pseudo-child. It has the BB itself as an
3585 additional child node. */
3586 node_child = sese->high;
3587 hi_child = node_child.second;
3588 if (node_child.first)
3589 hi_child += BB_GET_SESE (node_child.first)->node;
3590 num_children++;
3591 }
3592
3593 /* Examine each edge.
3594 - if it is a child (a) append its bracket list and (b) record
3595 whether it is the child with the highest reaching bracket.
3596 - if it is an edge to ancestor, record whether it's the highest
3597 reaching backlink. */
3598 FOR_EACH_EDGE (e, ei, edges)
3599 {
3600 basic_block target = *(basic_block *)((char *)e + offset);
3601
3602 if (bb_sese *t_sese = BB_GET_SESE (target))
3603 {
3604 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3605 {
3606 /* Child node. Append its bracket list. */
3607 num_children++;
3608 sese->append (t_sese);
3609
3610 /* Compare it's hi value. */
3611 int t_hi = t_sese->high.second;
3612
3613 if (basic_block child_hi_block = t_sese->high.first)
3614 t_hi += BB_GET_SESE (child_hi_block)->node;
3615
3616 if (hi_child > t_hi)
3617 {
3618 hi_child = t_hi;
3619 node_child = t_sese->high;
3620 child = target;
3621 }
3622 }
3623 else if (t_sese->node < sese->node + dir
3624 && !(dir < 0 && sese->parent == t_sese->node))
3625 {
3626 /* Non-parental ancestor node -- a backlink. */
3627 int d = usd * t_sese->dir;
3628 int back = t_sese->node + d;
3629
3630 if (hi_back > back)
3631 {
3632 hi_back = back;
3633 node_back = pseudo_node_t (target, d);
3634 }
3635 }
3636 }
3637 else
3638 { /* Fallen off graph, backlink to entry node. */
3639 hi_back = 0;
3640 node_back = pseudo_node_t (0, 0);
3641 }
3642 }
3643
3644 /* Remove any brackets that terminate at this pseudo node. */
3645 sese->remove (pseudo_node_t (me, dir));
3646
3647 /* Now push any backlinks from this pseudo node. */
3648 FOR_EACH_EDGE (e, ei, edges)
3649 {
3650 basic_block target = *(basic_block *)((char *)e + offset);
3651 if (bb_sese *t_sese = BB_GET_SESE (target))
3652 {
3653 if (t_sese->node < sese->node + dir
3654 && !(dir < 0 && sese->parent == t_sese->node))
3655 /* Non-parental ancestor node - backedge from me. */
3656 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3657 }
3658 else
3659 {
3660 /* back edge to entry node */
3661 sese->push (pseudo_node_t (0, 0));
3662 }
3663 }
3664
3665 /* If this node leads directly or indirectly to a no-return region of
3666 the graph, then fake a backedge to entry node. */
3667 if (!sese->brackets.length () || !edges || !edges->length ())
3668 {
3669 hi_back = 0;
3670 node_back = pseudo_node_t (0, 0);
3671 sese->push (node_back);
3672 }
3673
3674 /* Record the highest reaching backedge from us or a descendant. */
3675 sese->high = hi_back < hi_child ? node_back : node_child;
3676
3677 if (num_children > 1)
3678 {
3679 /* There is more than one child -- this is a Y shaped piece of
3680 spanning tree. We have to insert a fake backedge from this
3681 node to the highest ancestor reached by not-the-highest
3682 reaching child. Note that there may be multiple children
3683 with backedges to the same highest node. That's ok and we
3684 insert the edge to that highest node. */
3685 hi_child = depth;
3686 if (dir < 0 && child)
3687 {
3688 node_child = sese->high;
3689 hi_child = node_child.second;
3690 if (node_child.first)
3691 hi_child += BB_GET_SESE (node_child.first)->node;
3692 }
3693
3694 FOR_EACH_EDGE (e, ei, edges)
3695 {
3696 basic_block target = *(basic_block *)((char *)e + offset);
3697
3698 if (target == child)
3699 /* Ignore the highest child. */
3700 continue;
3701
3702 bb_sese *t_sese = BB_GET_SESE (target);
3703 if (!t_sese)
3704 continue;
3705 if (t_sese->parent != sese->node)
3706 /* Not a child. */
3707 continue;
3708
3709 /* Compare its hi value. */
3710 int t_hi = t_sese->high.second;
3711
3712 if (basic_block child_hi_block = t_sese->high.first)
3713 t_hi += BB_GET_SESE (child_hi_block)->node;
3714
3715 if (hi_child > t_hi)
3716 {
3717 hi_child = t_hi;
3718 node_child = t_sese->high;
3719 }
3720 }
3721
3722 sese->push (node_child);
3723 }
3724}
3725
3726
3727/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3728 proceed to successors. Set SESE entry and exit nodes of
3729 REGIONS. */
3730
3731static void
3732nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3733 basic_block block, int coloring)
3734{
3735 bb_sese *sese = BB_GET_SESE (block);
3736
3737 if (block->flags & BB_VISITED)
3738 {
3739 /* If we've already encountered this block, either we must not
3740 be coloring, or it must have been colored the current color. */
3741 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3742 return;
3743 }
3744
3745 block->flags |= BB_VISITED;
3746
3747 if (sese)
3748 {
3749 if (coloring < 0)
3750 {
3751 /* Start coloring a region. */
3752 regions[sese->color].first = block;
3753 coloring = sese->color;
3754 }
3755
3756 if (!--color_counts[sese->color] && sese->color == coloring)
3757 {
3758 /* Found final block of SESE region. */
3759 regions[sese->color].second = block;
3760 coloring = -1;
3761 }
3762 else
3763 /* Color the node, so we can assert on revisiting the node
3764 that the graph is indeed SESE. */
3765 sese->color = coloring;
3766 }
3767 else
3768 /* Fallen off the subgraph, we cannot be coloring. */
3769 gcc_assert (coloring < 0);
3770
3771 /* Walk each successor block. */
3772 if (block->succs && block->succs->length ())
3773 {
3774 edge e;
3775 edge_iterator ei;
3776
3777 FOR_EACH_EDGE (e, ei, block->succs)
3778 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3779 }
3780 else
3781 gcc_assert (coloring < 0);
3782}
3783
3784/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3785 end up with NULL entries in it. */
3786
3787static void
3788nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3789{
3790 basic_block block;
3791 int ix;
3792
3793 /* First clear each BB of the whole function. */
2300c332 3794 FOR_ALL_BB_FN (block, cfun)
912442c2
NS
3795 {
3796 block->flags &= ~BB_VISITED;
3797 BB_SET_SESE (block, 0);
3798 }
912442c2
NS
3799
3800 /* Mark blocks in the function that are in this graph. */
3801 for (ix = 0; blocks.iterate (ix, &block); ix++)
3802 block->flags |= BB_VISITED;
3803
3804 /* Counts of nodes assigned to each color. There cannot be more
3805 colors than blocks (and hopefully there will be fewer). */
3806 auto_vec<unsigned> color_counts;
3807 color_counts.reserve (blocks.length ());
3808
3809 /* Worklist of nodes in the spanning tree. Again, there cannot be
3810 more nodes in the tree than blocks (there will be fewer if the
3811 CFG of blocks is disjoint). */
3812 auto_vec<basic_block> spanlist;
3813 spanlist.reserve (blocks.length ());
3814
3815 /* Make sure every block has its cycle class determined. */
3816 for (ix = 0; blocks.iterate (ix, &block); ix++)
3817 {
3818 if (BB_GET_SESE (block))
3819 /* We already met this block in an earlier graph solve. */
3820 continue;
3821
3822 if (dump_file)
3823 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3824
3825 /* Number the nodes reachable from block initial DFS order. */
3826 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3827
3828 /* Now walk in reverse DFS order to find cycle equivalents. */
3829 while (spanlist.length ())
3830 {
3831 block = spanlist.pop ();
3832 bb_sese *sese = BB_GET_SESE (block);
3833
3834 /* Do the pseudo node below. */
3835 nvptx_sese_pseudo (block, sese, depth, +1,
3836 sese->dir > 0 ? block->succs : block->preds,
3837 (sese->dir > 0 ? offsetof (edge_def, dest)
3838 : offsetof (edge_def, src)));
3839 sese->set_color (color_counts);
3840 /* Do the pseudo node above. */
3841 nvptx_sese_pseudo (block, sese, depth, -1,
3842 sese->dir < 0 ? block->succs : block->preds,
3843 (sese->dir < 0 ? offsetof (edge_def, dest)
3844 : offsetof (edge_def, src)));
3845 }
3846 if (dump_file)
3847 fprintf (dump_file, "\n");
3848 }
3849
3850 if (dump_file)
3851 {
3852 unsigned count;
3853 const char *comma = "";
3854
3855 fprintf (dump_file, "Found %d cycle equivalents\n",
3856 color_counts.length ());
3857 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3858 {
3859 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3860
3861 comma = "";
3862 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3863 if (BB_GET_SESE (block)->color == ix)
3864 {
3865 block->flags |= BB_VISITED;
3866 fprintf (dump_file, "%s%d", comma, block->index);
3867 comma=",";
3868 }
3869 fprintf (dump_file, "}");
3870 comma = ", ";
3871 }
3872 fprintf (dump_file, "\n");
3873 }
3874
3875 /* Now we've colored every block in the subgraph. We now need to
3876 determine the minimal set of SESE regions that cover that
3877 subgraph. Do this with a DFS walk of the complete function.
3878 During the walk we're either 'looking' or 'coloring'. When we
3879 reach the last node of a particular color, we stop coloring and
3880 return to looking. */
3881
3882 /* There cannot be more SESE regions than colors. */
3883 regions.reserve (color_counts.length ());
3884 for (ix = color_counts.length (); ix--;)
3885 regions.quick_push (bb_pair_t (0, 0));
3886
3887 for (ix = 0; blocks.iterate (ix, &block); ix++)
3888 block->flags &= ~BB_VISITED;
3889
3890 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3891
3892 if (dump_file)
3893 {
3894 const char *comma = "";
3895 int len = regions.length ();
3896
3897 fprintf (dump_file, "SESE regions:");
3898 for (ix = 0; ix != len; ix++)
3899 {
3900 basic_block from = regions[ix].first;
3901 basic_block to = regions[ix].second;
3902
3903 if (from)
3904 {
3905 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3906 if (to != from)
3907 fprintf (dump_file, "->%d", to->index);
3908
3909 int color = BB_GET_SESE (from)->color;
3910
3911 /* Print the blocks within the region (excluding ends). */
3912 FOR_EACH_BB_FN (block, cfun)
3913 {
3914 bb_sese *sese = BB_GET_SESE (block);
3915
3916 if (sese && sese->color == color
3917 && block != from && block != to)
3918 fprintf (dump_file, ".%d", block->index);
3919 }
3920 fprintf (dump_file, "}");
3921 }
3922 comma = ",";
3923 }
3924 fprintf (dump_file, "\n\n");
3925 }
3926
3927 for (ix = 0; blocks.iterate (ix, &block); ix++)
3928 delete BB_GET_SESE (block);
3929}
3930
/* The aux pointer accessors are private to the SESE code above.  */
#undef BB_SET_SESE
#undef BB_GET_SESE
3933
e91eba31
NS
3934/* Propagate live state at the start of a partitioned region. IS_CALL
3935 indicates whether the propagation is for a (partitioned) call
3936 instruction. BLOCK provides the live register information, and
3937 might not contain INSN. Propagation is inserted just after INSN. RW
3938 indicates whether we are reading and/or writing state. This
d88cd9c4
NS
3939 separation is needed for worker-level propagation where we
3940 essentially do a spill & fill. FN is the underlying worker
3941 function to generate the propagation instructions for single
3942 register. DATA is user data.
3943
e91eba31
NS
3944 Returns true if we didn't emit any instructions.
3945
3946 We propagate the live register set for non-calls and the entire
3947 frame for calls and non-calls. We could do better by (a)
3948 propagating just the live set that is used within the partitioned
3949 regions and (b) only propagating stack entries that are used. The
3950 latter might be quite hard to determine. */
d88cd9c4 3951
a0b3b5c4 3952typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
d88cd9c4 3953
e91eba31
NS
3954static bool
3955nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
a0b3b5c4 3956 propagate_mask rw, propagator_fn fn, void *data, bool vector)
d88cd9c4
NS
3957{
3958 bitmap live = DF_LIVE_IN (block);
3959 bitmap_iterator iterator;
3960 unsigned ix;
e91eba31 3961 bool empty = true;
d88cd9c4
NS
3962
3963 /* Copy the frame array. */
3964 HOST_WIDE_INT fs = get_frame_size ();
3965 if (fs)
3966 {
3967 rtx tmp = gen_reg_rtx (DImode);
3968 rtx idx = NULL_RTX;
3969 rtx ptr = gen_reg_rtx (Pmode);
3970 rtx pred = NULL_RTX;
3971 rtx_code_label *label = NULL;
3972
e91eba31 3973 empty = false;
c1311c86
NS
3974 /* The frame size might not be DImode compatible, but the frame
3975 array's declaration will be. So it's ok to round up here. */
3976 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
d88cd9c4
NS
3977 /* Detect single iteration loop. */
3978 if (fs == 1)
3979 fs = 0;
3980
3981 start_sequence ();
3982 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3983 if (fs)
3984 {
3985 idx = gen_reg_rtx (SImode);
3986 pred = gen_reg_rtx (BImode);
3987 label = gen_label_rtx ();
3988
3989 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3990 /* Allow worker function to initialize anything needed. */
a0b3b5c4 3991 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
d88cd9c4
NS
3992 if (init)
3993 emit_insn (init);
3994 emit_label (label);
3995 LABEL_NUSES (label)++;
3996 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3997 }
3998 if (rw & PM_read)
3999 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
a0b3b5c4 4000 emit_insn (fn (tmp, rw, fs, data, vector));
d88cd9c4
NS
4001 if (rw & PM_write)
4002 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4003 if (fs)
4004 {
4005 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4006 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4007 emit_insn (gen_br_true_uni (pred, label));
a0b3b5c4 4008 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
d88cd9c4
NS
4009 if (fini)
4010 emit_insn (fini);
4011 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4012 }
4013 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4015 rtx cpy = get_insns ();
4016 end_sequence ();
4017 insn = emit_insn_after (cpy, insn);
4018 }
4019
e91eba31
NS
4020 if (!is_call)
4021 /* Copy live registers. */
4022 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4023 {
4024 rtx reg = regno_reg_rtx[ix];
d88cd9c4 4025
e91eba31
NS
4026 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4027 {
a0b3b5c4 4028 rtx bcast = fn (reg, rw, 0, data, vector);
d88cd9c4 4029
e91eba31
NS
4030 insn = emit_insn_after (bcast, insn);
4031 empty = false;
4032 }
4033 }
4034 return empty;
d88cd9c4
NS
4035}
4036
a0b3b5c4 4037/* Worker for nvptx_warp_propagate. */
d88cd9c4
NS
4038
4039static rtx
a0b3b5c4
TV
4040warp_prop_gen (rtx reg, propagate_mask pm,
4041 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4042 bool ARG_UNUSED (vector))
d88cd9c4
NS
4043{
4044 if (!(pm & PM_read_write))
4045 return 0;
4046
a0b3b5c4 4047 return nvptx_gen_warp_bcast (reg);
d88cd9c4
NS
4048}
4049
4050/* Propagate state that is live at start of BLOCK across the vectors
e91eba31
NS
4051 of a single warp. Propagation is inserted just after INSN.
4052 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4053
e91eba31 4054static bool
a0b3b5c4 4055nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 4056{
a0b3b5c4
TV
4057 return nvptx_propagate (is_call, block, insn, PM_read_write,
4058 warp_prop_gen, 0, false);
d88cd9c4
NS
4059}
4060
a0b3b5c4 4061/* Worker for nvptx_shared_propagate. */
d88cd9c4
NS
4062
4063static rtx
a0b3b5c4
TV
4064shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4065 bool vector)
d88cd9c4 4066{
1ed57fb8 4067 broadcast_data_t *data = (broadcast_data_t *)data_;
d88cd9c4
NS
4068
4069 if (pm & PM_loop_begin)
4070 {
4071 /* Starting a loop, initialize pointer. */
4072 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4073
3c55d60f
TV
4074 oacc_bcast_align = MAX (oacc_bcast_align, align);
4075 data->offset = ROUND_UP (data->offset, align);
d88cd9c4
NS
4076
4077 data->ptr = gen_reg_rtx (Pmode);
4078
4079 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4080 }
4081 else if (pm & PM_loop_end)
4082 {
4083 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4084 data->ptr = NULL_RTX;
4085 return clobber;
4086 }
4087 else
a0b3b5c4 4088 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
d88cd9c4
NS
4089}
4090
4091/* Spill or fill live state that is live at start of BLOCK. PRE_P
4092 indicates if this is just before partitioned mode (do spill), or
4093 just after it starts (do fill). Sequence is inserted just after
e91eba31 4094 INSN. IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4095
e91eba31 4096static bool
a0b3b5c4
TV
4097nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4098 rtx_insn *insn, bool vector)
d88cd9c4 4099{
1ed57fb8 4100 broadcast_data_t data;
d88cd9c4
NS
4101
4102 data.base = gen_reg_rtx (Pmode);
4103 data.offset = 0;
4104 data.ptr = NULL_RTX;
4105
e91eba31 4106 bool empty = nvptx_propagate (is_call, block, insn,
a0b3b5c4
TV
4107 pre_p ? PM_read : PM_write, shared_prop_gen,
4108 &data, vector);
e91eba31 4109 gcc_assert (empty == !data.offset);
d88cd9c4
NS
4110 if (data.offset)
4111 {
0024c320
TV
4112 rtx bcast_sym = oacc_bcast_sym;
4113
d88cd9c4 4114 /* Stuff was emitted, initialize the base pointer now. */
0024c320
TV
4115 if (vector && nvptx_mach_max_workers () > 1)
4116 {
4117 if (!cfun->machine->bcast_partition)
4118 {
4119 /* It would be nice to place this register in
4120 DATA_AREA_SHARED. */
4121 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4122 }
4123 if (!cfun->machine->sync_bar)
4124 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4125
4126 bcast_sym = cfun->machine->bcast_partition;
4127 }
4128
4129 rtx init = gen_rtx_SET (data.base, bcast_sym);
d88cd9c4 4130 emit_insn_after (init, insn);
15ab6f00 4131
0024c320
TV
4132 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4133 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4134 ? nvptx_mach_max_workers () + 1
4135 : 1);
4136
4137 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4138 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
d88cd9c4 4139 }
e91eba31 4140 return empty;
d88cd9c4
NS
4141}
4142
0024c320
TV
4143/* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4144 which is an integer or a register. THREADS is the number of threads
4145 controlled by the barrier. */
d88cd9c4
NS
4146
4147static rtx
0024c320 4148nvptx_cta_sync (rtx lock, int threads)
d88cd9c4 4149{
0024c320 4150 return gen_nvptx_barsync (lock, GEN_INT (threads));
d88cd9c4
NS
4151}
4152
43c371e8
TV
4153#if WORKAROUND_PTXJIT_BUG
4154/* Return first real insn in BB, or return NULL_RTX if BB does not contain
4155 real insns. */
4156
4157static rtx_insn *
4158bb_first_real_insn (basic_block bb)
4159{
4160 rtx_insn *insn;
4161
4162 /* Find first insn of from block. */
4163 FOR_BB_INSNS (bb, insn)
4164 if (INSN_P (insn))
4165 return insn;
4166
4167 return 0;
4168}
4169#endif
4170
6beefdbd
TV
4171/* Return true if INSN needs neutering. */
4172
4173static bool
4174needs_neutering_p (rtx_insn *insn)
4175{
4176 if (!INSN_P (insn))
4177 return false;
4178
4179 switch (recog_memoized (insn))
4180 {
4181 case CODE_FOR_nvptx_fork:
4182 case CODE_FOR_nvptx_forked:
4183 case CODE_FOR_nvptx_joining:
4184 case CODE_FOR_nvptx_join:
4185 case CODE_FOR_nvptx_barsync:
4186 return false;
4187 default:
4188 return true;
4189 }
4190}
4191
a874808c
TV
4192/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4193
4194static bool
4195verify_neutering_jumps (basic_block from,
4196 rtx_insn *vector_jump, rtx_insn *worker_jump,
4197 rtx_insn *vector_label, rtx_insn *worker_label)
4198{
4199 basic_block bb = from;
4200 rtx_insn *insn = BB_HEAD (bb);
4201 bool seen_worker_jump = false;
4202 bool seen_vector_jump = false;
4203 bool seen_worker_label = false;
4204 bool seen_vector_label = false;
4205 bool worker_neutered = false;
4206 bool vector_neutered = false;
4207 while (true)
4208 {
4209 if (insn == worker_jump)
4210 {
4211 seen_worker_jump = true;
4212 worker_neutered = true;
4213 gcc_assert (!vector_neutered);
4214 }
4215 else if (insn == vector_jump)
4216 {
4217 seen_vector_jump = true;
4218 vector_neutered = true;
4219 }
4220 else if (insn == worker_label)
4221 {
4222 seen_worker_label = true;
4223 gcc_assert (worker_neutered);
4224 worker_neutered = false;
4225 }
4226 else if (insn == vector_label)
4227 {
4228 seen_vector_label = true;
4229 gcc_assert (vector_neutered);
4230 vector_neutered = false;
4231 }
4232 else if (INSN_P (insn))
4233 switch (recog_memoized (insn))
4234 {
4235 case CODE_FOR_nvptx_barsync:
4236 gcc_assert (!vector_neutered && !worker_neutered);
4237 break;
4238 default:
4239 break;
4240 }
4241
4242 if (insn != BB_END (bb))
4243 insn = NEXT_INSN (insn);
4244 else if (JUMP_P (insn) && single_succ_p (bb)
4245 && !seen_vector_jump && !seen_worker_jump)
4246 {
4247 bb = single_succ (bb);
4248 insn = BB_HEAD (bb);
4249 }
4250 else
4251 break;
4252 }
4253
4254 gcc_assert (!(vector_jump && !seen_vector_jump));
4255 gcc_assert (!(worker_jump && !seen_worker_jump));
4256
4257 if (seen_vector_label || seen_worker_label)
4258 {
4259 gcc_assert (!(vector_label && !seen_vector_label));
4260 gcc_assert (!(worker_label && !seen_worker_label));
4261
4262 return true;
4263 }
4264
4265 return false;
4266}
4267
4268/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4269
4270static void
4271verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4272 rtx_insn *worker_label)
4273{
4274 basic_block bb = to;
4275 rtx_insn *insn = BB_END (bb);
4276 bool seen_worker_label = false;
4277 bool seen_vector_label = false;
4278 while (true)
4279 {
4280 if (insn == worker_label)
4281 {
4282 seen_worker_label = true;
4283 gcc_assert (!seen_vector_label);
4284 }
4285 else if (insn == vector_label)
4286 seen_vector_label = true;
4287 else if (INSN_P (insn))
4288 switch (recog_memoized (insn))
4289 {
4290 case CODE_FOR_nvptx_barsync:
4291 gcc_assert (!seen_vector_label && !seen_worker_label);
4292 break;
4293 }
4294
4295 if (insn != BB_HEAD (bb))
4296 insn = PREV_INSN (insn);
4297 else
4298 break;
4299 }
4300
4301 gcc_assert (!(vector_label && !seen_vector_label));
4302 gcc_assert (!(worker_label && !seen_worker_label));
4303}
4304
d88cd9c4
NS
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.

   After the insertions the placement of the generated jumps and
   labels is checked with verify_neutering_jumps and, if that walk saw
   no label, verify_neutering_labels.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  /* Locate the insertion point for the skip branch.  When FROM holds
     nothing that needs neutering before its final jump, and that jump
     leads to a block with a single predecessor, move on to that block
     and look again.  */
  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
        head = NEXT_INSN (head);

      if (from == to)
        break;

      if (!(JUMP_P (head) && single_succ_p (from)))
        break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
        break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  TAIL_BRANCH is its pattern when it
     sets the pc; COND_BRANCH is additionally the IF_THEN_ELSE source
     when the branch is conditional.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
        tail_branch = NULL_RTX;
      else
        {
          cond_branch = SET_SRC (tail_branch);
          if (GET_CODE (cond_branch) != IF_THEN_ELSE)
            cond_branch = NULL_RTX;
        }
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
        return;

      if (cond_branch)
        {
          /* If we're only doing vector single, there's no need to
             emit skip code because we'll not insert anything.  */
          /* NOTE(review): the test below fires when MASK lacks the
             vector bit, which reads as the opposite of "only doing
             vector single" -- confirm the intended wording.  */
          if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
            skip_mask = 0;
        }
      else if (tail_branch)
        /* Block with only unconditional branch.  Nothing to do.  */
        return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
        rtx_code_label *label = gen_label_rtx ();
        rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
        rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
        /* NOTE(review): MODE_LABEL is never read below; the labels are
           stored directly at the end of this iteration.  */
        rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

        /* Create the axis predicate register on first use.  */
        if (!pred)
          {
            pred = gen_reg_rtx (BImode);
            cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
          }

        rtx br;
        if (mode == GOMP_DIM_VECTOR)
          br = gen_br_true (pred, label);
        else
          br = gen_br_true_uni (pred, label);
        /* The first skip branch goes before HEAD; a second one goes
           right after the first.  */
        if (neuter_start)
          neuter_start = emit_insn_after (br, neuter_start);
        else
          neuter_start = emit_insn_before (br, head);
        *mode_jump = neuter_start;

        LABEL_NUSES (label)++;
        rtx_insn *label_insn;
        if (tail_branch)
          {
            /* Each new label goes before the previously placed one.  */
            label_insn = emit_label_before (label, before);
            before = label_insn;
          }
        else
          {
            label_insn = emit_label_after (label, tail);
            /* TAIL is a noreturn call: follow the label with an exit
               insn.  */
            if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
                && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
              emit_insn_after (gen_exit (), label_insn);
          }

        if (mode == GOMP_DIM_VECTOR)
          vector_label = label_insn;
        else
          worker_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
          && nvptx_mach_vector_length () == PTX_WARP_SIZE)
        {
          /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
          /* The branch condition %rcond is propagated like this:

                {
                    .reg .u32 %x;
                    mov.u32 %x,%tid.x;
                    setp.ne.u32 %rnotvzero,%x,0;
                 }

                 @%rnotvzero bra Lskip;
                 setp.<op>.<type> %rcond,op1,op2;
                 Lskip:
                 selp.u32 %rcondu32,1,0,%rcond;
                 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
                 setp.ne.u32 %rcond,%rcondu32,0;

             There seems to be a bug in the ptx JIT compiler (observed at driver
             version 381.22, at -O1 and higher for sm_61), that drops the shfl
             unless %rcond is initialized to something before 'bra Lskip'.  The
             bug is not observed with ptxas from cuda 8.0.61.

             It is true that the code is non-trivial: at Lskip, %rcond is
             uninitialized in threads 1-31, and after the selp the same holds
             for %rcondu32.  But shfl propagates the defined value in thread 0
             to threads 1-31, so after the shfl %rcondu32 is defined in threads
             0-31, and after the setp.ne %rcond is defined in threads 0-31.

             There is nothing in the PTX spec to suggest that this is wrong, or
             to explain why the extra initialization is needed.  So, we classify
             it as a JIT bug, and the extra initialization as workaround:

                {
                    .reg .u32 %x;
                    mov.u32 %x,%tid.x;
                    setp.ne.u32 %rnotvzero,%x,0;
                }

                +.reg .pred %rcond2;
                +setp.eq.u32 %rcond2, 1, 0;

                 @%rnotvzero bra Lskip;
                 setp.<op>.<type> %rcond,op1,op2;
                +mov.pred %rcond2, %rcond;
                 Lskip:
                +mov.pred %rcond, %rcond2;
                 selp.u32 %rcondu32,1,0,%rcond;
                 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
                 setp.ne.u32 %rcond,%rcondu32,0;
          */
          rtx_insn *label = PREV_INSN (tail);
          gcc_assert (label && LABEL_P (label));
          rtx tmp = gen_reg_rtx (BImode);
          emit_insn_before (gen_movbi (tmp, const0_rtx),
                            bb_first_real_insn (from));
          emit_insn_before (gen_rtx_SET (tmp, pvar), label);
          emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
          emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
        }
      else
        {
          /* Includes worker mode, do spill & fill.  By construction
             we should never have worker mode only.  */
          broadcast_data_t data;
          unsigned size = GET_MODE_SIZE (SImode);
          bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
          bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
          rtx barrier = GEN_INT (0);
          int threads = 0;

          data.base = oacc_bcast_sym;
          data.ptr = 0;

          /* With MASK being exactly the vector bit, more than one
             worker and a partition register already allocated, go
             through that partition and its own barrier instead of the
             common buffer and barrier 0.  */
          bool use_partitioning_p = (vector && !worker
                                     && nvptx_mach_max_workers () > 1
                                     && cfun->machine->bcast_partition);
          if (use_partitioning_p)
            {
              data.base = cfun->machine->bcast_partition;
              barrier = cfun->machine->sync_bar;
              threads = nvptx_mach_vector_length ();
            }
          gcc_assert (data.base != NULL);
          gcc_assert (barrier);

          /* Record the buffer space this broadcast may need.  */
          unsigned int psize = ROUND_UP (size, oacc_bcast_align);
          unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
                               ? nvptx_mach_max_workers () + 1
                               : 1);

          oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
          oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);

          data.offset = 0;
          emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
                                                    vector),
                            before);

          /* Barrier so other workers can see the write.  */
          emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
          data.offset = 0;
          emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
                                                    vector),
                            tail);
          /* This barrier is needed to avoid worker zero clobbering
             the broadcast buffer before all the other workers have
             had a chance to read this instance of it.  */
          emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
        }

      /* Rewrite operand 0 of the branch as a UNSPEC_BR_UNIFIED
         wrapper around the (now broadcast) condition variable.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
                                 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
                                            vector_label, worker_label);
  if (!seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}
4568
4569/* PAR is a parallel that is being skipped in its entirety according to
4570 MASK. Treat this as skipping a superblock starting at forked
4571 and ending at joining. */
4572
4573static void
4574nvptx_skip_par (unsigned mask, parallel *par)
4575{
4576 basic_block tail = par->join_block;
4577 gcc_assert (tail->preds->length () == 1);
4578
4579 basic_block pre_tail = (*tail->preds)[0]->src;
4580 gcc_assert (pre_tail->succs->length () == 1);
4581
4582 nvptx_single (mask, par->forked_block, pre_tail);
4583}
4584
dba619f3
NS
4585/* If PAR has a single inner parallel and PAR itself only contains
4586 empty entry and exit blocks, swallow the inner PAR. */
4587
4588static void
4589nvptx_optimize_inner (parallel *par)
4590{
4591 parallel *inner = par->inner;
4592
4593 /* We mustn't be the outer dummy par. */
4594 if (!par->mask)
4595 return;
4596
4597 /* We must have a single inner par. */
4598 if (!inner || inner->next)
4599 return;
4600
4601 /* We must only contain 2 blocks ourselves -- the head and tail of
4602 the inner par. */
4603 if (par->blocks.length () != 2)
4604 return;
4605
4606 /* We must be disjoint partitioning. As we only have vector and
4607 worker partitioning, this is sufficient to guarantee the pars
4608 have adjacent partitioning. */
4609 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4610 /* This indicates malformed code generation. */
4611 return;
4612
4613 /* The outer forked insn should be immediately followed by the inner
4614 fork insn. */
4615 rtx_insn *forked = par->forked_insn;
4616 rtx_insn *fork = BB_END (par->forked_block);
4617
4618 if (NEXT_INSN (forked) != fork)
4619 return;
4620 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4621
4622 /* The outer joining insn must immediately follow the inner join
4623 insn. */
4624 rtx_insn *joining = par->joining_insn;
4625 rtx_insn *join = inner->join_insn;
4626 if (NEXT_INSN (join) != joining)
4627 return;
4628
4629 /* Preconditions met. Swallow the inner par. */
4630 if (dump_file)
4631 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4632 inner->mask, inner->forked_block->index,
4633 inner->join_block->index,
4634 par->mask, par->forked_block->index, par->join_block->index);
4635
4636 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4637
4638 par->blocks.reserve (inner->blocks.length ());
4639 while (inner->blocks.length ())
4640 par->blocks.quick_push (inner->blocks.pop ());
4641
4642 par->inner = inner->inner;
4643 inner->inner = NULL;
4644
4645 delete inner;
4646}
4647
d88cd9c4
NS
4648/* Process the parallel PAR and all its contained
4649 parallels. We do everything but the neutering. Return mask of
4650 partitioned modes used within this parallel. */
4651
4652static unsigned
4653nvptx_process_pars (parallel *par)
4654{
dba619f3
NS
4655 if (nvptx_optimize)
4656 nvptx_optimize_inner (par);
4657
d88cd9c4
NS
4658 unsigned inner_mask = par->mask;
4659
4660 /* Do the inner parallels first. */
4661 if (par->inner)
4662 {
4663 par->inner_mask = nvptx_process_pars (par->inner);
4664 inner_mask |= par->inner_mask;
4665 }
4666
e91eba31 4667 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
0024c320
TV
4668 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4669 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4670 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
e91eba31 4671
0024c320 4672 if (worker || large_vector)
d88cd9c4 4673 {
a0b3b5c4 4674 nvptx_shared_propagate (false, is_call, par->forked_block,
0024c320 4675 par->forked_insn, !worker);
d495b5cc
TV
4676 bool no_prop_p
4677 = nvptx_shared_propagate (true, is_call, par->forked_block,
4678 par->fork_insn, !worker);
4679 bool empty_loop_p
4680 = !is_call && (NEXT_INSN (par->forked_insn)
4681 && NEXT_INSN (par->forked_insn) == par->joining_insn);
0024c320
TV
4682 rtx barrier = GEN_INT (0);
4683 int threads = 0;
4684
4685 if (!worker && cfun->machine->sync_bar)
4686 {
4687 barrier = cfun->machine->sync_bar;
4688 threads = nvptx_mach_vector_length ();
4689 }
e91eba31 4690
d495b5cc
TV
4691 if (no_prop_p && empty_loop_p)
4692 ;
4693 else if (no_prop_p && is_call)
4694 ;
4695 else
e91eba31
NS
4696 {
4697 /* Insert begin and end synchronizations. */
0024c320
TV
4698 emit_insn_before (nvptx_cta_sync (barrier, threads),
4699 par->forked_insn);
4700 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
e91eba31 4701 }
d88cd9c4
NS
4702 }
4703 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
a0b3b5c4 4704 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
d88cd9c4
NS
4705
4706 /* Now do siblings. */
4707 if (par->next)
4708 inner_mask |= nvptx_process_pars (par->next);
4709 return inner_mask;
4710}
4711
4712/* Neuter the parallel described by PAR. We recurse in depth-first
4713 order. MODES are the partitioning of the execution and OUTER is
4714 the partitioning of the parallels we are contained in. */
4715
4716static void
4717nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4718{
4719 unsigned me = (par->mask
4720 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4721 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4722 unsigned skip_mask = 0, neuter_mask = 0;
4723
4724 if (par->inner)
4725 nvptx_neuter_pars (par->inner, modes, outer | me);
4726
4727 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4728 {
4729 if ((outer | me) & GOMP_DIM_MASK (mode))
4730 {} /* Mode is partitioned: no neutering. */
4731 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 4732 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
4733 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4734 || !par->forked_insn)
4735 /* Partitioned in inner parallels, or we're not a partitioned
4736 at all: neuter individual blocks. */
4737 neuter_mask |= GOMP_DIM_MASK (mode);
4738 else if (!par->parent || !par->parent->forked_insn
4739 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4740 /* Parent isn't a parallel or contains this paralleling: skip
4741 parallel at this level. */
4742 skip_mask |= GOMP_DIM_MASK (mode);
4743 else
4744 {} /* Parent will skip this parallel itself. */
4745 }
4746
4747 if (neuter_mask)
4748 {
912442c2 4749 int ix, len;
d88cd9c4 4750
912442c2
NS
4751 if (nvptx_optimize)
4752 {
4753 /* Neuter whole SESE regions. */
4754 bb_pair_vec_t regions;
4755
4756 nvptx_find_sese (par->blocks, regions);
4757 len = regions.length ();
4758 for (ix = 0; ix != len; ix++)
4759 {
4760 basic_block from = regions[ix].first;
4761 basic_block to = regions[ix].second;
4762
4763 if (from)
4764 nvptx_single (neuter_mask, from, to);
4765 else
4766 gcc_assert (!to);
4767 }
4768 }
4769 else
d88cd9c4 4770 {
912442c2
NS
4771 /* Neuter each BB individually. */
4772 len = par->blocks.length ();
4773 for (ix = 0; ix != len; ix++)
4774 {
4775 basic_block block = par->blocks[ix];
d88cd9c4 4776
912442c2
NS
4777 nvptx_single (neuter_mask, block, block);
4778 }
d88cd9c4
NS
4779 }
4780 }
4781
4782 if (skip_mask)
7820b298 4783 nvptx_skip_par (skip_mask, par);
d88cd9c4
NS
4784
4785 if (par->next)
4786 nvptx_neuter_pars (par->next, modes, outer);
4787}
4788
2498a6ab
TV
4789static void
4790populate_offload_attrs (offload_attrs *oa)
4791{
4792 tree attr = oacc_get_fn_attrib (current_function_decl);
4793 tree dims = TREE_VALUE (attr);
4794 unsigned ix;
4795
4796 oa->mask = 0;
4797
4798 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4799 {
4800 tree t = TREE_VALUE (dims);
4801 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4802 tree allowed = TREE_PURPOSE (dims);
4803
4804 if (size != 1 && !(allowed && integer_zerop (allowed)))
4805 oa->mask |= GOMP_DIM_MASK (ix);
4806
4807 switch (ix)
4808 {
4809 case GOMP_DIM_GANG:
4810 oa->num_gangs = size;
4811 break;
4812
4813 case GOMP_DIM_WORKER:
4814 oa->num_workers = size;
4815 break;
4816
4817 case GOMP_DIM_VECTOR:
4818 oa->vector_length = size;
4819 break;
4820 }
4821 }
4822}
4823
3dede32b
TV
4824#if WORKAROUND_PTXJIT_BUG_2
4825/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4826 is needed in the nvptx target because the branches generated for
4827 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4828
4829static rtx
4830nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4831{
4832 rtx pat;
4833 if ((strict && !JUMP_P (insn))
4834 || (!strict && !INSN_P (insn)))
4835 return NULL_RTX;
4836 pat = PATTERN (insn);
4837
4838 /* The set is allowed to appear either as the insn pattern or
4839 the first set in a PARALLEL. */
4840 if (GET_CODE (pat) == PARALLEL)
4841 pat = XVECEXP (pat, 0, 0);
4842 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4843 return pat;
4844
4845 return NULL_RTX;
4846}
4847
4848/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4849
4850static rtx
4851nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4852{
4853 rtx x = nvptx_pc_set (insn, strict);
4854
4855 if (!x)
4856 return NULL_RTX;
4857 x = SET_SRC (x);
4858 if (GET_CODE (x) == LABEL_REF)
4859 return x;
4860 if (GET_CODE (x) != IF_THEN_ELSE)
4861 return NULL_RTX;
4862 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4863 return XEXP (x, 1);
4864 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4865 return XEXP (x, 2);
4866 return NULL_RTX;
4867}
4868
4869/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4870 insn inbetween the branch and the label. This works around a JIT bug
4871 observed at driver version 384.111, at -O0 for sm_50. */
4872
4873static void
4874prevent_branch_around_nothing (void)
4875{
4876 rtx_insn *seen_label = NULL;
4877 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4878 {
b0b592fc 4879 if (INSN_P (insn) && condjump_p (insn))
3dede32b 4880 {
b0b592fc 4881 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
3dede32b
TV
4882 continue;
4883 }
4884
b0b592fc
TV
4885 if (seen_label == NULL)
4886 continue;
4887
3dede32b
TV
4888 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4889 continue;
4890
4891 if (INSN_P (insn))
4892 switch (recog_memoized (insn))
4893 {
4894 case CODE_FOR_nvptx_fork:
4895 case CODE_FOR_nvptx_forked:
4896 case CODE_FOR_nvptx_joining:
4897 case CODE_FOR_nvptx_join:
4898 continue;
4899 default:
4900 seen_label = NULL;
4901 continue;
4902 }
4903
4904 if (LABEL_P (insn) && insn == seen_label)
4905 emit_insn_before (gen_fake_nop (), insn);
4906
4907 seen_label = NULL;
4908 }
4909 }
4910#endif
4911
21251395
TV
4912#ifdef WORKAROUND_PTXJIT_BUG_3
4913/* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
4914 works around a hang observed at driver version 390.48 for sm_50. */
4915
4916static void
4917workaround_barsyncs (void)
4918{
4919 bool seen_barsync = false;
4920 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4921 {
4922 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4923 {
4924 if (seen_barsync)
4925 {
4926 emit_insn_before (gen_nvptx_membar_cta (), insn);
4927 emit_insn_before (gen_nvptx_membar_cta (), insn);
4928 }
4929
4930 seen_barsync = true;
4931 continue;
4932 }
4933
4934 if (!seen_barsync)
4935 continue;
4936
4937 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4938 continue;
4939 else if (INSN_P (insn))
4940 switch (recog_memoized (insn))
4941 {
4942 case CODE_FOR_nvptx_fork:
4943 case CODE_FOR_nvptx_forked:
4944 case CODE_FOR_nvptx_joining:
4945 case CODE_FOR_nvptx_join:
4946 continue;
4947 default:
4948 break;
4949 }
4950
4951 seen_barsync = false;
4952 }
4953}
4954#endif
4955
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
     unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
   - Run the uniform-SIMT reorganization when TARGET_UNIFORM_SIMT is set.
   - Apply the PTX JIT bug workarounds that are compiled in.

   The steps run in the order listed; the register statistics and dataflow
   information set up near the top are released again at the very end.  */

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  A pseudo with neither sets nor references
     gets its regno_reg_rtx slot replaced by const0_rtx.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
         elide emission of some levels of forks and joins.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* If there is worker neutering, there must be vector
         neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
                  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, oa.mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

  /* NOTE(review): this guard uses #if while the WORKAROUND_PTXJIT_BUG_3
     guard below uses #ifdef; confirm both match the guards around the
     corresponding function definitions.  */
#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  /* Release what regstat_init_n_sets_and_refs and the df setup above
     acquired.  */
  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
5039\f
5040/* Handle a "kernel" attribute; arguments as in
5041 struct attribute_spec.handler. */
5042
5043static tree
5044nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5045 int ARG_UNUSED (flags), bool *no_add_attrs)
5046{
5047 tree decl = *node;
5048
5049 if (TREE_CODE (decl) != FUNCTION_DECL)
5050 {
5051 error ("%qE attribute only applies to functions", name);
5052 *no_add_attrs = true;
5053 }
b49e35a9 5054 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
5055 {
5056 error ("%qE attribute requires a void return type", name);
5057 *no_add_attrs = true;
5058 }
5059
5060 return NULL_TREE;
5061}
5062
5012919d
AM
5063/* Handle a "shared" attribute; arguments as in
5064 struct attribute_spec.handler. */
5065
5066static tree
5067nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5068 int ARG_UNUSED (flags), bool *no_add_attrs)
5069{
5070 tree decl = *node;
5071
5072 if (TREE_CODE (decl) != VAR_DECL)
5073 {
5074 error ("%qE attribute only applies to variables", name);
5075 *no_add_attrs = true;
5076 }
5077 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5078 {
5079 error ("%qE attribute not allowed with auto storage class", name);
5080 *no_add_attrs = true;
5081 }
5082
5083 return NULL_TREE;
5084}
5085
738f2522
BS
/* Table of valid machine attributes.  Both "kernel" and "shared" take no
   arguments (min_len == max_len == 0) and require a declaration; each is
   validated by its own handler.  The all-NULL entry terminates the
   table.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
    NULL },
  { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
    NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
5097\f
5098/* Limit vector alignments to BIGGEST_ALIGNMENT. */
5099
5100static HOST_WIDE_INT
5101nvptx_vector_alignment (const_tree type)
5102{
5103 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5104
5105 return MIN (align, BIGGEST_ALIGNMENT);
5106}
d88cd9c4
NS
5107
5108/* Indicate that INSN cannot be duplicated. */
5109
5110static bool
5111nvptx_cannot_copy_insn_p (rtx_insn *insn)
5112{
5113 switch (recog_memoized (insn))
5114 {
5115 case CODE_FOR_nvptx_shufflesi:
5116 case CODE_FOR_nvptx_shufflesf:
5117 case CODE_FOR_nvptx_barsync:
5118 case CODE_FOR_nvptx_fork:
5119 case CODE_FOR_nvptx_forked:
5120 case CODE_FOR_nvptx_joining:
5121 case CODE_FOR_nvptx_join:
5122 return true;
5123 default:
5124 return false;
5125 }
5126}
a794bd20
NS
5127
5128/* Section anchors do not work. Initialization for flag_section_anchor
5129 probes the existence of the anchoring target hooks and prevents
5130 anchoring if they don't exist. However, we may be being used with
5131 a host-side compiler that does support anchoring, and hence see
5132 the anchor flag set (as it's not recalculated). So provide an
5133 implementation denying anchoring. */
5134
5135static bool
5136nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5137{
5138 return false;
5139}
738f2522 5140\f
1f83528e
TS
5141/* Record a symbol for mkoffload to enter into the mapping table. */
5142
5143static void
5144nvptx_record_offload_symbol (tree decl)
5145{
3e32ee19
NS
5146 switch (TREE_CODE (decl))
5147 {
5148 case VAR_DECL:
5149 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5150 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5151 break;
5152
5153 case FUNCTION_DECL:
5154 {
629b3d75 5155 tree attr = oacc_get_fn_attrib (decl);
5012919d
AM
5156 /* OpenMP offloading does not set this attribute. */
5157 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 5158
3e32ee19
NS
5159 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5160 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5161
5012919d 5162 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 5163 {
5d306e55 5164 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 5165
5d306e55 5166 gcc_assert (!TREE_PURPOSE (dims));
3e32ee19
NS
5167 fprintf (asm_out_file, ", %#x", size);
5168 }
d2d47a28 5169
3e32ee19
NS
5170 fprintf (asm_out_file, "\n");
5171 }
5172 break;
d2d47a28 5173
3e32ee19
NS
5174 default:
5175 gcc_unreachable ();
5176 }
1f83528e
TS
5177}
5178
738f2522
BS
5179/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5180 at the start of a file. */
5181
5182static void
5183nvptx_file_start (void)
5184{
5185 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5186 fputs ("\t.version\t3.1\n", asm_out_file);
53ceb8b5
CP
5187 if (TARGET_SM35)
5188 fputs ("\t.target\tsm_35\n", asm_out_file);
5189 else
5190 fputs ("\t.target\tsm_30\n", asm_out_file);
738f2522
BS
5191 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5192 fputs ("// END PREAMBLE\n", asm_out_file);
5193}
5194
a0b3b5c4
TV
5195/* Emit a declaration for a worker and vector-level buffer in .shared
5196 memory. */
15ab6f00
NS
5197
5198static void
a0b3b5c4 5199write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
15ab6f00
NS
5200{
5201 const char *name = XSTR (sym, 0);
5202
5203 write_var_marker (file, true, false, name);
5204 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5205 align, name, size);
5206}
5207
ecf6e535
BS
/* Write out the function declarations we've collected and declare storage
   for the shared-memory buffers this file needs: the broadcast buffer and
   the worker and vector reduction buffers, each only when its recorded
   size is nonzero.  Finally emit extern declarations for __nvptx_stacks
   and __nvptx_uni when they have been flagged as needed.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  /* Record every decl still in needed_fndecls_htab before the accumulated
     func_decls text is written out.  */
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (oacc_bcast_size)
    write_shared_buffer (asm_out_file, oacc_bcast_sym,
                         oacc_bcast_align, oacc_bcast_size);

  if (worker_red_size)
    write_shared_buffer (asm_out_file, worker_red_sym,
                         worker_red_align, worker_red_size);

  if (vector_red_size)
    write_shared_buffer (asm_out_file, vector_red_sym,
                         vector_red_align, vector_red_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
         external declaration, emit the array size explicitly; otherwise, it
         may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
               POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
5247
5248/* Expander for the shuffle builtins. */
5249
5250static rtx
5251nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5252{
5253 if (ignore)
5254 return target;
5255
5256 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5257 NULL_RTX, mode, EXPAND_NORMAL);
5258 if (!REG_P (src))
5259 src = copy_to_mode_reg (mode, src);
5260
5261 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5262 NULL_RTX, SImode, EXPAND_NORMAL);
5263 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5264 NULL_RTX, SImode, EXPAND_NORMAL);
5265
5266 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5267 idx = copy_to_mode_reg (SImode, idx);
5268
59263259
NS
5269 rtx pat = nvptx_gen_shuffle (target, src, idx,
5270 (nvptx_shuffle_kind) INTVAL (op));
f3552158
NS
5271 if (pat)
5272 emit_insn (pat);
5273
5274 return target;
5275}
5276
f881693c
TV
5277const char *
5278nvptx_output_red_partition (rtx dst, rtx offset)
5279{
5280 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5281 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5282
5283 if (offset == const0_rtx)
5284 fprintf (asm_out_file, zero_offset, REGNO (dst),
5285 REGNO (cfun->machine->red_partition));
5286 else
5287 fprintf (asm_out_file, with_offset, REGNO (dst),
5288 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5289
5290 return "";
5291}
5292
/* Shared-memory reduction address expander.  EXP is the builtin call, whose
   arguments are, in order, the offset into the buffer, the size of the
   object and its alignment; all three are read as integer constants.  The
   computed address is moved into TARGET, which is returned.  VECTOR is
   nonzero for the vector reduction buffer and zero for the worker one.
   Nothing is done when IGNORE is nonzero.

   As a side effect this grows the file-level records of the required
   buffer size and alignment (vector_red_* or worker_red_*).  */

static rtx
nvptx_expand_shared_addr (tree exp, rtx target,
                          machine_mode ARG_UNUSED (mode), int ignore,
                          int vector)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  rtx addr = worker_red_sym;

  if (vector)
    {
      /* NOTE(review): OA is populated but not read again in this function;
         confirm whether the call is needed for a side effect or check
         inside populate_offload_attrs before removing it.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* The vector buffer is sized as one partition of PSIZE bytes for
         each of the PNUM workers reported by nvptx_mach_max_workers.  */
      unsigned int psize = ROUND_UP (size + offset, align);
      unsigned int pnum = nvptx_mach_max_workers ();
      vector_red_partition = MAX (vector_red_partition, psize);
      vector_red_size = MAX (vector_red_size, psize * pnum);
      vector_red_align = MAX (vector_red_align, align);

      /* Create the per-function partition register on first use.  */
      if (cfun->machine->red_partition == NULL)
        cfun->machine->red_partition = gen_reg_rtx (Pmode);

      addr = gen_reg_rtx (Pmode);
      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
    }
  else
    {
      worker_red_align = MAX (worker_red_align, align);
      worker_red_size = MAX (worker_red_size, size + offset);

      /* The worker address is the buffer symbol, plus OFFSET wrapped in a
         CONST when the offset is nonzero.  */
      if (offset)
        {
          addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
          addr = gen_rtx_CONST (Pmode, addr);
        }
    }

  emit_move_insn (target, addr);
  return target;
}
5341
5342/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5343 not require taking the address of any object, other than the memory
5344 cell being operated on. */
5345
5346static rtx
5347nvptx_expand_cmp_swap (tree exp, rtx target,
5348 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5349{
5350 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5351
5352 if (!target)
5353 target = gen_reg_rtx (mode);
5354
5355 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5356 NULL_RTX, Pmode, EXPAND_NORMAL);
5357 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5358 NULL_RTX, mode, EXPAND_NORMAL);
5359 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5360 NULL_RTX, mode, EXPAND_NORMAL);
5361 rtx pat;
5362
5363 mem = gen_rtx_MEM (mode, mem);
5364 if (!REG_P (cmp))
5365 cmp = copy_to_mode_reg (mode, cmp);
5366 if (!REG_P (src))
5367 src = copy_to_mode_reg (mode, src);
5368
5369 if (mode == SImode)
5370 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5371 else
5372 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5373
5374 emit_insn (pat);
5375
5376 return target;
5377}
5378
5379
/* Codes for all the NVPTX builtins.  Each code indexes
   nvptx_builtin_decls; NVPTX_BUILTIN_MAX is the number of builtins and is
   not itself a valid code.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,      /* __builtin_nvptx_shuffle */
  NVPTX_BUILTIN_SHUFFLELL,    /* __builtin_nvptx_shufflell */
  NVPTX_BUILTIN_WORKER_ADDR,  /* __builtin_nvptx_worker_addr */
  NVPTX_BUILTIN_VECTOR_ADDR,  /* __builtin_nvptx_vector_addr */
  NVPTX_BUILTIN_CMP_SWAP,     /* __builtin_nvptx_cmp_swap */
  NVPTX_BUILTIN_CMP_SWAPLL,   /* __builtin_nvptx_cmp_swapll */
  NVPTX_BUILTIN_MAX
};

/* The function decl of each builtin, indexed by its code.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5393
5394/* Return the NVPTX builtin for CODE. */
5395
5396static tree
5397nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5398{
5399 if (code >= NVPTX_BUILTIN_MAX)
5400 return error_mark_node;
5401
5402 return nvptx_builtin_decls[code];
5403}
5404
/* Set up all builtin functions for this target.  Each DEF registers one
   "__builtin_nvptx_" function with add_builtin_function and stores the
   resulting decl in nvptx_builtin_decls under its NVPTX_BUILTIN_ code.  */

static void
nvptx_init_builtins (void)
{
  /* DEF (ID, NAME, T): ID is the code suffix, NAME the builtin's name
     suffix and T the parenthesized, NULL_TREE-terminated type list handed
     to build_function_type_list (return type first).  */
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
  /* Shorthands for the type nodes used in the lists below.  */
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (VECTOR_ADDR, "vector_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
5435
5436/* Expand an expression EXP that calls a built-in function,
5437 with result going to TARGET if that's convenient
5438 (and in mode MODE if that's convenient).
5439 SUBTARGET may be used as the target for computing one of EXP's operands.
5440 IGNORE is nonzero if the value is to be ignored. */
5441
5442static rtx
5443nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5444 machine_mode mode, int ignore)
5445{
5446 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5447 switch (DECL_FUNCTION_CODE (fndecl))
5448 {
5449 case NVPTX_BUILTIN_SHUFFLE:
5450 case NVPTX_BUILTIN_SHUFFLELL:
5451 return nvptx_expand_shuffle (exp, target, mode, ignore);
5452
5453 case NVPTX_BUILTIN_WORKER_ADDR:
f881693c
TV
5454 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5455
5456 case NVPTX_BUILTIN_VECTOR_ADDR:
5457 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
f3552158
NS
5458
5459 case NVPTX_BUILTIN_CMP_SWAP:
5460 case NVPTX_BUILTIN_CMP_SWAPLL:
5461 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5462
5463 default: gcc_unreachable ();
5464 }
738f2522 5465}
f3552158 5466
5012919d
AM
5467/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5468
5469static int
5470nvptx_simt_vf ()
5471{
5d17a476 5472 return PTX_WARP_SIZE;
5012919d
AM
5473}
5474
ccc0d492
TV
5475static bool
5476nvptx_welformed_vector_length_p (int l)
5477{
5478 gcc_assert (l > 0);
5479 return l % PTX_WARP_SIZE == 0;
5480}
5481
5482static void
5483nvptx_apply_dim_limits (int dims[])
5484{
5485 /* Check that the vector_length is not too large. */
5486 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5487 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5488
5489 /* Check that the number of workers is not too large. */
5490 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5491 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5492
5493 /* Ensure that num_worker * vector_length <= cta size. */
5494 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5495 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5496 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5497}
5498
5f57c8f3
TV
/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
   DIMS has changed.

   The combination of DECL and FN_LEVEL selects one of four cases:
   - DECL null, FN_LEVEL -1: validating the default dimensions;
   - DECL null, FN_LEVEL -2: validating the minimal dimensions;
   - DECL set, FN_LEVEL -1: an offload region;
   - DECL set, FN_LEVEL in [0, GOMP_DIM_MAX]: a routine, which is a seq
     routine when FN_LEVEL equals GOMP_DIM_MAX.
   Any other combination is a bug.  DIMS is updated in place; a warning is
   issued when a requested vector length or worker count is overridden.  */

static void
nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level)
{
  bool oacc_default_dims_p = false;
  bool oacc_min_dims_p = false;
  bool offload_region_p = false;
  bool routine_p = false;
  bool routine_seq_p = false;

  /* Classify the request.  */
  if (decl == NULL_TREE)
    {
      if (fn_level == -1)
	oacc_default_dims_p = true;
      else if (fn_level == -2)
	oacc_min_dims_p = true;
      else
	gcc_unreachable ();
    }
  else if (fn_level == -1)
    offload_region_p = true;
  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
    {
      routine_p = true;
      routine_seq_p = fn_level == GOMP_DIM_MAX;
    }
  else
    gcc_unreachable ();

  if (routine_p)
    {
      /* OpenACC routines in C arrive here with the following attributes
	 (omitting the 'omp declare target'):
	 seq   : __attribute__((oacc function (0 1, 0 1, 0 1)))
	 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
	 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
	 gang  : __attribute__((oacc function (1 0, 1 0, 1 0)))

	 If we take f.i. the oacc function attribute of the worker routine
	 (0 1, 1 0, 1 0), then:
	 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
	   meaning: worker routine, that is:
	   - can't contain gang loop (0),
	   - can contain worker loop (1),
	   - can contain vector loop (1).
	 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
	   dimensions: gang: 1, worker: 0, vector: 0.

	 OTOH, routines in Fortran arrive here with these attributes:
	 seq   : __attribute__((oacc function (0 0, 0 0, 0 0)))
	 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
	 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
	 gang  : __attribute__((oacc function (1 0, 1 0, 1 0)))
	 that is, the same as for C but with the dimensions set to 0.

	 This is due to a bug in the Fortran front-end: PR72741.  Work around
	 this bug by forcing the dimensions to be the same in Fortran as for C,
	 to be able to handle C and Fortran routines uniformly in this
	 function.  */
      dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
      dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
      dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
    }

  /* Minimal dimensions: everything arrives as 1; only the vector length
     is raised, to a full warp.  */
  if (oacc_min_dims_p)
    {
      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
      gcc_assert (dims[GOMP_DIM_GANG] == 1);

      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
      return;
    }

  /* Routines: every routine except a seq one gets a warp-sized vector
     length; nothing else is adjusted.  */
  if (routine_p)
    {
      if (!routine_seq_p)
	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

      return;
    }

  if (oacc_default_dims_p)
    {
      /* -1  : not set
	  0  : set at runtime, f.i. -fopenacc-dims=-
	 >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);

      /* But -fopenacc-dims=- is not yet supported on trunk.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
      gcc_assert (dims[GOMP_DIM_GANG] != 0);
    }

  if (offload_region_p)
    {
      /* -1   : not set
	  0   : set using variable, f.i. num_gangs (n)
	  >= 1: set using constant, f.i. num_gangs (1).  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
    }

  /* Remember the requested values so that overrides can be reported.  */
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;
  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  /* A vector length of 0 is replaced by the default length, with its own
     warning text.  */
  const char *vector_reason = NULL;
  if (dims[GOMP_DIM_VECTOR] == 0)
    {
      vector_reason = G_("using vector_length (%d), ignoring runtime setting");
      dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
    }

  /* A positive vector length that is not a multiple of the warp size is
     replaced by the default as well.  */
  if (dims[GOMP_DIM_VECTOR] > 0
      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
    dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;

  nvptx_apply_dim_limits (dims);

  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		vector_reason != NULL
		? vector_reason
		: G_("using vector_length (%d), ignoring %d"),
		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);

  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		G_("using num_workers (%d), ignoring %d"),
		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);

  /* Default dimensions: force the default vector length, fill in any
     still-unset worker and gang counts, then re-apply the limits.  */
  if (oacc_default_dims_p)
    {
      dims[GOMP_DIM_VECTOR] = PTX_DEFAULT_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      nvptx_apply_dim_limits (dims);
    }
}
5648
5649/* Validate compute dimensions of an OpenACC offload or routine, fill
5650 in non-unity defaults. FN_LEVEL indicates the level at which a
5651 routine might spawn a loop. It is negative for non-routines. If
5652 DECL is null, we are validating the default dimensions. */
5653
5654static bool
5655nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5656{
5657 int old_dims[GOMP_DIM_MAX];
5658 unsigned int i;
b6adbb9f 5659
5f57c8f3
TV
5660 for (i = 0; i < GOMP_DIM_MAX; ++i)
5661 old_dims[i] = dims[i];
5662
5663 nvptx_goacc_validate_dims_1 (decl, dims, fn_level);
5664
21fbea5c
TV
5665 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5666 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5667 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5668
5f57c8f3
TV
5669 for (i = 0; i < GOMP_DIM_MAX; ++i)
5670 if (old_dims[i] != dims[i])
5671 return true;
5672
5673 return false;
94829f87 5674}
d88cd9c4 5675
bd751975
NS
5676/* Return maximum dimension size, or zero for unbounded. */
5677
5678static int
5679nvptx_dim_limit (int axis)
5680{
5681 switch (axis)
5682 {
bd751975 5683 case GOMP_DIM_VECTOR:
764ecad4 5684 return PTX_MAX_VECTOR_LENGTH;
bd751975
NS
5685
5686 default:
5687 break;
5688 }
5689 return 0;
5690}
5691
d88cd9c4
NS
5692/* Determine whether fork & joins are needed. */
5693
5694static bool
5695nvptx_goacc_fork_join (gcall *call, const int dims[],
5696 bool ARG_UNUSED (is_fork))
5697{
5698 tree arg = gimple_call_arg (call, 2);
5699 unsigned axis = TREE_INT_CST_LOW (arg);
5700
5701 /* We only care about worker and vector partitioning. */
5702 if (axis < GOMP_DIM_WORKER)
5703 return false;
5704
5705 /* If the size is 1, there's no partitioning. */
5706 if (dims[axis] == 1)
5707 return false;
5708
5709 return true;
5710}
5711
f3552158
NS
5712/* Generate a PTX builtin function call that returns the address in
5713 the worker reduction buffer at OFFSET. TYPE is the type of the
5714 data at that location. */
5715
5716static tree
f881693c 5717nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
f3552158 5718{
f881693c
TV
5719 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5720 if (vector)
5721 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
f3552158 5722 machine_mode mode = TYPE_MODE (type);
f881693c 5723 tree fndecl = nvptx_builtin_decl (addr_dim, true);
f3552158
NS
5724 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5725 tree align = build_int_cst (unsigned_type_node,
5726 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5727 tree call = build_call_expr (fndecl, 3, offset, size, align);
5728
5729 return fold_convert (build_pointer_type (type), call);
5730}
5731
/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  The assignment is gimplified onto
   SEQ; LOC is the location given to the generated builtin calls.

   The 32-bit shuffle builtin is used unless the (component) type has the
   size of long long unsigned, in which case the 64-bit one is used.
   Complex values are shuffled as two separate parts.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  /* For a complex type, work on the component type from here on;
     DEST_TYPE keeps the full type.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  /* Real values are converted to and from the builtin's integer type with
     a VIEW_CONVERT_EXPR instead of a NOP_EXPR.  */
  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  /* The two types differ only when VAR is complex (see above).  */
  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
5788
33f47f42
NS
5789/* Lazily generate the global lock var decl and return its address. */
5790
5791static tree
5792nvptx_global_lock_addr ()
5793{
5794 tree v = global_lock_var;
5795
5796 if (!v)
5797 {
5798 tree name = get_identifier ("__reduction_lock");
5799 tree type = build_qualified_type (unsigned_type_node,
5800 TYPE_QUAL_VOLATILE);
5801 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5802 global_lock_var = v;
5803 DECL_ARTIFICIAL (v) = 1;
5804 DECL_EXTERNAL (v) = 1;
5805 TREE_STATIC (v) = 1;
5806 TREE_PUBLIC (v) = 1;
5807 TREE_USED (v) = 1;
5808 mark_addressable (v);
5809 mark_decl_referenced (v);
5810 }
5811
5812 return build_fold_addr_expr (v);
5813}
5814
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.

   LOC is the location used for the generated statements.  On return *GSI
   still refers to the statement it referred to on entry, which now lives
   in the block following the loop.  The returned tree is the written value
   converted back to the type of VAR.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* Complex and real values are converted to and from the integer type the
     cmp&swap builtin works on with a VIEW_CONVERT_EXPR.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  /* Use the 64-bit builtin when VAR has the size of long long unsigned.  */
  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Compute the initial guess: the reduction's initial value for OP,
     converted to ARG_TYPE.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* EXPECT_VAR is "guess", ACTUAL_VAR is "actual" and WRITE_VAR is
     "write" in the scheme above.  */
  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Toggle the true-value and fallthru flags on the exit edge (presumably
     split_block created it as a fallthru edge -- TODO confirm), then add
     the back edge, taken when the condition is false.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* The guess is the initial value on entry and the last observed value
     on every retry.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the single-block loop in the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
5917
5918/* Insert code to lockfully update *PTR with *PTR OP VAR just before
5919 GSI. This is necessary for types larger than 64 bits, where there
5920 is no cmp&swap instruction to implement a lockless scheme. We use
5921 a lock variable in global memory.
5922
5923 while (cmp&swap (&lock_var, 0, 1))
5924 continue;
5925 T accum = *ptr;
5926 accum = accum OP var;
5927 *ptr = accum;
5928 cmp&swap (&lock_var, 1, 0);
5929 return accum;
5930
5931 A lock in global memory is necessary to force execution engine
5932 descheduling and avoid resource starvation that can occur if the
5933 lock is in .shared memory. */
5934
5935static tree
5936nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5937 tree ptr, tree var, tree_code op)
5938{
5939 tree var_type = TREE_TYPE (var);
5940 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5941 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5942 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5943
5944 /* Split the block just before the gsi. Insert a gimple nop to make
5945 this easier. */
5946 gimple *nop = gimple_build_nop ();
5947 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5948 basic_block entry_bb = gsi_bb (*gsi);
5949 edge entry_edge = split_block (entry_bb, nop);
5950 basic_block lock_bb = entry_edge->dest;
5951 /* Reset the iterator. */
5952 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5953
5954 /* Build and insert the locking sequence. */
5955 gimple_seq lock_seq = NULL;
5956 tree lock_var = make_ssa_name (unsigned_type_node);
5957 tree lock_expr = nvptx_global_lock_addr ();
5958 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5959 uns_unlocked, uns_locked);
5960 gimplify_assign (lock_var, lock_expr, &lock_seq);
5961 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5962 NULL_TREE, NULL_TREE);
5963 gimple_seq_add_stmt (&lock_seq, cond);
5964 gimple *lock_end = gimple_seq_last (lock_seq);
5965 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5966
5967 /* Split the block just after the lock sequence. */
5968 edge locked_edge = split_block (lock_bb, lock_end);
5969 basic_block update_bb = locked_edge->dest;
5970 lock_bb = locked_edge->src;
5971 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5972
5973 /* Create the lock loop ... */
5974 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
72270bb5
TV
5975 locked_edge->probability = profile_probability::even ();
5976 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5977 loop_edge->probability = profile_probability::even ();
33f47f42
NS
5978 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5979 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5980
5981 /* ... and the loop structure. */
5982 loop *lock_loop = alloc_loop ();
5983 lock_loop->header = lock_bb;
5984 lock_loop->latch = lock_bb;
5985 lock_loop->nb_iterations_estimate = 1;
5986 lock_loop->any_estimate = true;
5987 add_loop (lock_loop, entry_bb->loop_father);
5988
5989 /* Build and insert the reduction calculation. */
5990 gimple_seq red_seq = NULL;
5991 tree acc_in = make_ssa_name (var_type);
5992 tree ref_in = build_simple_mem_ref (ptr);
5993 TREE_THIS_VOLATILE (ref_in) = 1;
5994 gimplify_assign (acc_in, ref_in, &red_seq);
5995
5996 tree acc_out = make_ssa_name (var_type);
5997 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5998 gimplify_assign (acc_out, update_expr, &red_seq);
5999
6000 tree ref_out = build_simple_mem_ref (ptr);
6001 TREE_THIS_VOLATILE (ref_out) = 1;
6002 gimplify_assign (ref_out, acc_out, &red_seq);
6003
6004 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6005
6006 /* Build & insert the unlock sequence. */
6007 gimple_seq unlock_seq = NULL;
6008 tree unlock_expr = nvptx_global_lock_addr ();
6009 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6010 uns_locked, uns_unlocked);
6011 gimplify_and_add (unlock_expr, &unlock_seq);
6012 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6013
6014 return acc_out;
6015}
6016
6017/* Emit a sequence to update a reduction accumlator at *PTR with the
6018 value held in VAR using operator OP. Return the updated value.
6019
6020 TODO: optimize for atomic ops and indepedent complex ops. */
6021
6022static tree
6023nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6024 tree ptr, tree var, tree_code op)
6025{
6026 tree type = TREE_TYPE (var);
6027 tree size = TYPE_SIZE (type);
6028
6029 if (size == TYPE_SIZE (unsigned_type_node)
6030 || size == TYPE_SIZE (long_long_unsigned_type_node))
6031 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6032 else
6033 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
6034}
6035
6036/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6037
6038static void
f881693c 6039nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
f3552158
NS
6040{
6041 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6042 tree lhs = gimple_call_lhs (call);
6043 tree var = gimple_call_arg (call, 2);
6044 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6045 gimple_seq seq = NULL;
6046
6047 push_gimplify_context (true);
6048
6049 if (level != GOMP_DIM_GANG)
6050 {
6051 /* Copy the receiver object. */
6052 tree ref_to_res = gimple_call_arg (call, 1);
6053
6054 if (!integer_zerop (ref_to_res))
6055 var = build_simple_mem_ref (ref_to_res);
6056 }
6057
f881693c
TV
6058 if (level == GOMP_DIM_WORKER
6059 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6060 {
6061 /* Store incoming value to worker reduction buffer. */
6062 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6063 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6064 level == GOMP_DIM_VECTOR);
f3552158
NS
6065 tree ptr = make_ssa_name (TREE_TYPE (call));
6066
6067 gimplify_assign (ptr, call, &seq);
6068 tree ref = build_simple_mem_ref (ptr);
6069 TREE_THIS_VOLATILE (ref) = 1;
6070 gimplify_assign (ref, var, &seq);
6071 }
6072
6073 if (lhs)
6074 gimplify_assign (lhs, var, &seq);
6075
6076 pop_gimplify_context (NULL);
6077 gsi_replace_with_seq (&gsi, seq, true);
6078}
6079
6080/* NVPTX implementation of GOACC_REDUCTION_INIT. */
6081
6082static void
f881693c 6083nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
f3552158
NS
6084{
6085 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6086 tree lhs = gimple_call_lhs (call);
6087 tree var = gimple_call_arg (call, 2);
6088 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6089 enum tree_code rcode
6090 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6091 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6092 TREE_TYPE (var));
6093 gimple_seq seq = NULL;
6094
6095 push_gimplify_context (true);
6096
f881693c 6097 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6098 {
6099 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
6100 tree tid = make_ssa_name (integer_type_node);
6101 tree dim_vector = gimple_call_arg (call, 3);
6102 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6103 dim_vector);
6104 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6105 NULL_TREE, NULL_TREE);
6106
6107 gimple_call_set_lhs (tid_call, tid);
6108 gimple_seq_add_stmt (&seq, tid_call);
6109 gimple_seq_add_stmt (&seq, cond_stmt);
6110
6111 /* Split the block just after the call. */
6112 edge init_edge = split_block (gsi_bb (gsi), call);
6113 basic_block init_bb = init_edge->dest;
6114 basic_block call_bb = init_edge->src;
6115
6116 /* Fixup flags from call_bb to init_bb. */
6117 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
8c38aed7 6118 init_edge->probability = profile_probability::even ();
f3552158
NS
6119
6120 /* Set the initialization stmts. */
6121 gimple_seq init_seq = NULL;
6122 tree init_var = make_ssa_name (TREE_TYPE (var));
6123 gimplify_assign (init_var, init, &init_seq);
6124 gsi = gsi_start_bb (init_bb);
6125 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6126
6127 /* Split block just after the init stmt. */
6128 gsi_prev (&gsi);
6129 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6130 basic_block dst_bb = inited_edge->dest;
6131
6132 /* Create false edge from call_bb to dst_bb. */
6133 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
8c38aed7 6134 nop_edge->probability = profile_probability::even ();
f3552158
NS
6135
6136 /* Create phi node in dst block. */
6137 gphi *phi = create_phi_node (lhs, dst_bb);
6138 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6139 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6140
6141 /* Reset dominator of dst bb. */
6142 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6143
6144 /* Reset the gsi. */
6145 gsi = gsi_for_stmt (call);
6146 }
6147 else
6148 {
6149 if (level == GOMP_DIM_GANG)
6150 {
6151 /* If there's no receiver object, propagate the incoming VAR. */
6152 tree ref_to_res = gimple_call_arg (call, 1);
6153 if (integer_zerop (ref_to_res))
6154 init = var;
6155 }
6156
6157 gimplify_assign (lhs, init, &seq);
6158 }
6159
6160 pop_gimplify_context (NULL);
6161 gsi_replace_with_seq (&gsi, seq, true);
6162}
6163
6164/* NVPTX implementation of GOACC_REDUCTION_FINI. */
6165
6166static void
f881693c 6167nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
f3552158
NS
6168{
6169 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6170 tree lhs = gimple_call_lhs (call);
6171 tree ref_to_res = gimple_call_arg (call, 1);
6172 tree var = gimple_call_arg (call, 2);
6173 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6174 enum tree_code op
6175 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6176 gimple_seq seq = NULL;
6177 tree r = NULL_TREE;;
6178
6179 push_gimplify_context (true);
6180
f881693c 6181 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6182 {
6183 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6184 but that requires a method of emitting a unified jump at the
6185 gimple level. */
764ecad4 6186 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
f3552158
NS
6187 {
6188 tree other_var = make_ssa_name (TREE_TYPE (var));
6189 nvptx_generate_vector_shuffle (gimple_location (call),
6190 other_var, var, shfl, &seq);
6191
6192 r = make_ssa_name (TREE_TYPE (var));
6193 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6194 var, other_var), &seq);
6195 var = r;
6196 }
6197 }
6198 else
6199 {
6200 tree accum = NULL_TREE;
6201
f881693c 6202 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
f3552158
NS
6203 {
6204 /* Get reduction buffer address. */
6205 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6206 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6207 level == GOMP_DIM_VECTOR);
f3552158
NS
6208 tree ptr = make_ssa_name (TREE_TYPE (call));
6209
6210 gimplify_assign (ptr, call, &seq);
6211 accum = ptr;
6212 }
6213 else if (integer_zerop (ref_to_res))
6214 r = var;
6215 else
6216 accum = ref_to_res;
6217
6218 if (accum)
6219 {
33f47f42 6220 /* UPDATE the accumulator. */
f3552158
NS
6221 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6222 seq = NULL;
33f47f42
NS
6223 r = nvptx_reduction_update (gimple_location (call), &gsi,
6224 accum, var, op);
f3552158
NS
6225 }
6226 }
6227
6228 if (lhs)
6229 gimplify_assign (lhs, r, &seq);
6230 pop_gimplify_context (NULL);
6231
6232 gsi_replace_with_seq (&gsi, seq, true);
6233}
6234
6235/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6236
6237static void
f881693c 6238nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
f3552158
NS
6239{
6240 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6241 tree lhs = gimple_call_lhs (call);
6242 tree var = gimple_call_arg (call, 2);
6243 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6244 gimple_seq seq = NULL;
6245
6246 push_gimplify_context (true);
f881693c
TV
6247 if (level == GOMP_DIM_WORKER
6248 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
f3552158
NS
6249 {
6250 /* Read the worker reduction buffer. */
6251 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6252 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6253 level == GOMP_DIM_VECTOR);
f3552158
NS
6254 tree ptr = make_ssa_name (TREE_TYPE (call));
6255
6256 gimplify_assign (ptr, call, &seq);
6257 var = build_simple_mem_ref (ptr);
6258 TREE_THIS_VOLATILE (var) = 1;
6259 }
6260
6261 if (level != GOMP_DIM_GANG)
6262 {
6263 /* Write to the receiver object. */
6264 tree ref_to_res = gimple_call_arg (call, 1);
6265
6266 if (!integer_zerop (ref_to_res))
6267 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6268 }
6269
6270 if (lhs)
6271 gimplify_assign (lhs, var, &seq);
6272
6273 pop_gimplify_context (NULL);
6274
6275 gsi_replace_with_seq (&gsi, seq, true);
6276}
6277
6278/* NVPTX reduction expander. */
6279
5563d5c0 6280static void
f3552158
NS
6281nvptx_goacc_reduction (gcall *call)
6282{
6283 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
f881693c
TV
6284 offload_attrs oa;
6285
6286 populate_offload_attrs (&oa);
f3552158
NS
6287
6288 switch (code)
6289 {
6290 case IFN_GOACC_REDUCTION_SETUP:
f881693c 6291 nvptx_goacc_reduction_setup (call, &oa);
f3552158
NS
6292 break;
6293
6294 case IFN_GOACC_REDUCTION_INIT:
f881693c 6295 nvptx_goacc_reduction_init (call, &oa);
f3552158
NS
6296 break;
6297
6298 case IFN_GOACC_REDUCTION_FINI:
f881693c 6299 nvptx_goacc_reduction_fini (call, &oa);
f3552158
NS
6300 break;
6301
6302 case IFN_GOACC_REDUCTION_TEARDOWN:
f881693c 6303 nvptx_goacc_reduction_teardown (call, &oa);
f3552158
NS
6304 break;
6305
6306 default:
6307 gcc_unreachable ();
6308 }
6309}
6310
e053f710
TV
6311static bool
6312nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6313 rtx x ATTRIBUTE_UNUSED)
6314{
6315 return true;
6316}
6317
b98b34b7
TV
6318static bool
6319nvptx_vector_mode_supported (machine_mode mode)
6320{
3717fbe3
TV
6321 return (mode == V2SImode
6322 || mode == V2DImode);
6323}
6324
6325/* Return the preferred mode for vectorizing scalar MODE. */
6326
6327static machine_mode
005ba29c 6328nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
6329{
6330 switch (mode)
6331 {
4e10a5a7 6332 case E_DImode:
3717fbe3 6333 return V2DImode;
4e10a5a7 6334 case E_SImode:
3717fbe3
TV
6335 return V2SImode;
6336
6337 default:
6338 return default_preferred_simd_mode (mode);
6339 }
6340}
6341
6342unsigned int
6343nvptx_data_alignment (const_tree type, unsigned int basic_align)
6344{
6345 if (TREE_CODE (type) == INTEGER_TYPE)
6346 {
6347 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6348 if (size == GET_MODE_SIZE (TImode))
6349 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6350 }
6351
6352 return basic_align;
b98b34b7
TV
6353}
6354
99e1629f
RS
6355/* Implement TARGET_MODES_TIEABLE_P. */
6356
6357static bool
6358nvptx_modes_tieable_p (machine_mode, machine_mode)
6359{
6360 return false;
6361}
6362
c43f4279
RS
6363/* Implement TARGET_HARD_REGNO_NREGS. */
6364
6365static unsigned int
6366nvptx_hard_regno_nregs (unsigned int, machine_mode)
6367{
6368 return 1;
6369}
6370
0d803030
RS
6371/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6372
6373static bool
6374nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6375{
6376 return false;
6377}
6378
43be05f5
TV
6379static GTY(()) tree nvptx_previous_fndecl;
6380
6381static void
6382nvptx_set_current_function (tree fndecl)
6383{
6384 if (!fndecl || fndecl == nvptx_previous_fndecl)
6385 return;
6386
6387 nvptx_previous_fndecl = fndecl;
f881693c 6388 vector_red_partition = 0;
0024c320 6389 oacc_bcast_partition = 0;
43be05f5
TV
6390}
6391
738f2522
BS
6392#undef TARGET_OPTION_OVERRIDE
6393#define TARGET_OPTION_OVERRIDE nvptx_option_override
6394
6395#undef TARGET_ATTRIBUTE_TABLE
6396#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6397
d81db636
SB
6398#undef TARGET_LRA_P
6399#define TARGET_LRA_P hook_bool_void_false
6400
738f2522
BS
6401#undef TARGET_LEGITIMATE_ADDRESS_P
6402#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6403
6404#undef TARGET_PROMOTE_FUNCTION_MODE
6405#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6406
6407#undef TARGET_FUNCTION_ARG
6408#define TARGET_FUNCTION_ARG nvptx_function_arg
6409#undef TARGET_FUNCTION_INCOMING_ARG
6410#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6411#undef TARGET_FUNCTION_ARG_ADVANCE
6412#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
6413#undef TARGET_FUNCTION_ARG_BOUNDARY
6414#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
6415#undef TARGET_PASS_BY_REFERENCE
6416#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6417#undef TARGET_FUNCTION_VALUE_REGNO_P
6418#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6419#undef TARGET_FUNCTION_VALUE
6420#define TARGET_FUNCTION_VALUE nvptx_function_value
6421#undef TARGET_LIBCALL_VALUE
6422#define TARGET_LIBCALL_VALUE nvptx_libcall_value
6423#undef TARGET_FUNCTION_OK_FOR_SIBCALL
6424#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
6425#undef TARGET_GET_DRAP_RTX
6426#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
6427#undef TARGET_SPLIT_COMPLEX_ARG
6428#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6429#undef TARGET_RETURN_IN_MEMORY
6430#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6431#undef TARGET_OMIT_STRUCT_RETURN_REG
6432#define TARGET_OMIT_STRUCT_RETURN_REG true
6433#undef TARGET_STRICT_ARGUMENT_NAMING
6434#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
6435#undef TARGET_CALL_ARGS
6436#define TARGET_CALL_ARGS nvptx_call_args
6437#undef TARGET_END_CALL_ARGS
6438#define TARGET_END_CALL_ARGS nvptx_end_call_args
6439
6440#undef TARGET_ASM_FILE_START
6441#define TARGET_ASM_FILE_START nvptx_file_start
6442#undef TARGET_ASM_FILE_END
6443#define TARGET_ASM_FILE_END nvptx_file_end
6444#undef TARGET_ASM_GLOBALIZE_LABEL
6445#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6446#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6447#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6448#undef TARGET_PRINT_OPERAND
6449#define TARGET_PRINT_OPERAND nvptx_print_operand
6450#undef TARGET_PRINT_OPERAND_ADDRESS
6451#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6452#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6453#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6454#undef TARGET_ASM_INTEGER
6455#define TARGET_ASM_INTEGER nvptx_assemble_integer
6456#undef TARGET_ASM_DECL_END
6457#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6458#undef TARGET_ASM_DECLARE_CONSTANT_NAME
6459#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6460#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6461#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6462#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6463#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6464
6465#undef TARGET_MACHINE_DEPENDENT_REORG
6466#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6467#undef TARGET_NO_REGISTER_ALLOCATION
6468#define TARGET_NO_REGISTER_ALLOCATION true
6469
9a863523
NS
6470#undef TARGET_ENCODE_SECTION_INFO
6471#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
6472#undef TARGET_RECORD_OFFLOAD_SYMBOL
6473#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6474
738f2522
BS
6475#undef TARGET_VECTOR_ALIGNMENT
6476#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6477
d88cd9c4
NS
6478#undef TARGET_CANNOT_COPY_INSN_P
6479#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6480
a794bd20
NS
6481#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6482#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6483
f3552158
NS
6484#undef TARGET_INIT_BUILTINS
6485#define TARGET_INIT_BUILTINS nvptx_init_builtins
6486#undef TARGET_EXPAND_BUILTIN
6487#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6488#undef TARGET_BUILTIN_DECL
6489#define TARGET_BUILTIN_DECL nvptx_builtin_decl
6490
5012919d
AM
6491#undef TARGET_SIMT_VF
6492#define TARGET_SIMT_VF nvptx_simt_vf
6493
94829f87
NS
6494#undef TARGET_GOACC_VALIDATE_DIMS
6495#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6496
bd751975
NS
6497#undef TARGET_GOACC_DIM_LIMIT
6498#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6499
d88cd9c4
NS
6500#undef TARGET_GOACC_FORK_JOIN
6501#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6502
f3552158
NS
6503#undef TARGET_GOACC_REDUCTION
6504#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6505
e053f710
TV
6506#undef TARGET_CANNOT_FORCE_CONST_MEM
6507#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6508
b98b34b7
TV
6509#undef TARGET_VECTOR_MODE_SUPPORTED_P
6510#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6511
3717fbe3
TV
6512#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6513#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6514 nvptx_preferred_simd_mode
6515
99e1629f
RS
6516#undef TARGET_MODES_TIEABLE_P
6517#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6518
c43f4279
RS
6519#undef TARGET_HARD_REGNO_NREGS
6520#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6521
0d803030
RS
6522#undef TARGET_CAN_CHANGE_MODE_CLASS
6523#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6524
e335138d
TV
6525#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6526#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6527
43be05f5
TV
6528#undef TARGET_SET_CURRENT_FUNCTION
6529#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6530
738f2522
BS
6531struct gcc_target targetm = TARGET_INITIALIZER;
6532
6533#include "gt-nvptx.h"