/* Target code for NVPTX.
   Copyright (C) 2014-2019 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

#define PTX_WARP_SIZE 32

#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA
   concept of a thread block, which has had a maximum of 1024 threads
   since CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

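/* Illustration (not in the original source): the two macros above
   round-trip an area through the machine-dependent symbol flag bits.
   The '& 7' mask is sufficient because DATA_AREA_MAX is at most 8;
   note that SET_SYMBOL_DATA_AREA only ORs bits in, so it assumes the
   field is still zero:

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "__some_sym");  // hypothetical
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);  */
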
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128-bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

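/* For illustration (not in the original source): maybe_split_mode
   (TImode) yields DImode, and a complex mode yields its inner mode,
   e.g. DCmode -> DFmode and SCmode -> SFmode; an ordinary SImode
   value yields VOIDmode, i.e. no split.  */
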
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

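/* Illustrative outputs of output_reg (not in the original source),
   assuming pseudo register number 23: the plain form prints "%r23",
   the subreg form with byte offset 8 prints "%r23$8", and the
   register-pair form for a split TImode value prints
   "{%r23$8,%r23$0}" (the half at byte offset 8 first).  */
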
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

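/* Some consequences of the rules above, for illustration (not in the
   original source): aggregates and vector types always go in memory;
   a complex argument is judged by its component mode, so SCmode is
   passed as two SFmode registers; and any scalar wider than
   UNITS_PER_WORD (TImode, for instance) is passed in memory.  */
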
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

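/* Concretely (illustration, not in the original source): QImode and
   HImode promote to SImode, SFmode promotes to DFmode only for
   unprototyped (K&R) callees, and all other modes are returned
   unchanged.  */
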
/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for varadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}

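/* Illustration (not in the original source) of the two output shapes
   above for SImode and ARGNO 0: the prototype branch appends
   " (.param.u32 %in_ar0", while the prologue branch with FOR_REG == 0
   appends

	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];
*/
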
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

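/* Illustration (not in the original source), assuming the return
   register is named "%value": for an SImode return, write_return_mode
   emits "(.param.u32 %value_out) " in a prototype and
   ".reg.u32 %value;" in a function prologue.  */
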
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

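/* Putting the pieces together, an illustrative (not authoritative)
   prototype for a public 'int f (int, float)' would come out roughly
   as:

	// BEGIN GLOBAL FUNCTION DECL: f
	.visible .func (.param.u32 %value_out) f (.param.u32 %in_ar0,
	.param.f32 %in_ar1);

   all on one line, and assuming "%value" is the return register
   name.  */
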
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}

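/* Illustration (not in the original source): for a 16-byte frame with
   8-byte alignment on a 64-bit target, and assuming the frame pointer
   register is named "%frame", init_frame emits

	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;

   whereas a zero-sized frame gets only the register declaration and
   "mov.u64 %frame, 0;".  */
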
/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

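/* For the "y" axis and a hypothetical predicate register 30, the
   function above emits (illustratively):

	{
		.reg.u32 %y;
		mov.u32 %y, %tid.y;
		setp.ne.u32 %r30, %y, 0;
	}

   so the predicate is set for every thread that is not lane zero on
   that axis.  */
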
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }
  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}

/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  tree attr = oacc_get_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

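/* For illustration (not in the original source), a call to 'f' with
   two argument pseudos and a return value yields a PARALLEL of this
   shape (register numbers hypothetical):

     (parallel [(set (reg:SI 100)
		     (call (mem:QI (symbol_ref "f")) (const_int 0)))
		(use (reg:SI 101))
		(use (reg:SI 102))])

   The USEs keep the argument registers live until the call insn is
   printed.  */
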
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

1688/* Generate an instruction or sequence to shuffle register SRC into DST
1689 across the lanes of a warp, using shuffle kind KIND and lane operand IDX. */
1690
5012919d 1691rtx
59263259 1692nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
d88cd9c4
NS
1693{
1694 rtx res;
1695
1696 switch (GET_MODE (dst))
1697 {
4e10a5a7 1698 case E_SImode:
d88cd9c4
NS
1699 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1700 break;
4e10a5a7 1701 case E_SFmode:
d88cd9c4
NS
1702 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1703 break;
4e10a5a7
RS
1704 case E_DImode:
1705 case E_DFmode:
d88cd9c4
NS
1706 {
1707 rtx tmp0 = gen_reg_rtx (SImode);
1708 rtx tmp1 = gen_reg_rtx (SImode);
1709
1710 start_sequence ();
1711 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1712 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1713 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1714 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1715 res = get_insns ();
1716 end_sequence ();
1717 }
1718 break;
4e10a5a7 1719 case E_BImode:
d88cd9c4
NS
1720 {
1721 rtx tmp = gen_reg_rtx (SImode);
1722
1723 start_sequence ();
1724 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1725 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1726 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1727 res = get_insns ();
1728 end_sequence ();
1729 }
1730 break;
4e10a5a7
RS
1731 case E_QImode:
1732 case E_HImode:
d5ace3b5
CP
1733 {
1734 rtx tmp = gen_reg_rtx (SImode);
1735
1736 start_sequence ();
1737 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1738 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1739 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1740 tmp)));
1741 res = get_insns ();
1742 end_sequence ();
1743 }
1744 break;
d88cd9c4
NS
1745
1746 default:
1747 gcc_unreachable ();
1748 }
1749 return res;
1750}
1751
1752/* Generate an instruction or sequence to broadcast register REG
1753 across the vectors of a single warp. */
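/* In PTX terms this is, roughly, a warp-wide 'shfl.idx.b32' reading lane 0's
   value (an illustrative sketch; the exact template lives in nvptx.md).  */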
1754
1755static rtx
a0b3b5c4 1756nvptx_gen_warp_bcast (rtx reg)
d88cd9c4
NS
1757{
1758 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1759}
1760
1761/* Structure used when generating a worker-level spill or fill. */
1762
1ed57fb8 1763struct broadcast_data_t
d88cd9c4
NS
1764{
1765 rtx base; /* Register holding base addr of buffer. */
1766 rtx ptr; /* Iteration var, if needed. */
1767 unsigned offset; /* Offset into worker buffer. */
1768};
1769
1770/* Direction of the spill/fill and looping setup/teardown indicator. */
1771
1772enum propagate_mask
1773 {
1774 PM_read = 1 << 0,
1775 PM_write = 1 << 1,
1776 PM_loop_begin = 1 << 2,
1777 PM_loop_end = 1 << 3,
1778
1779 PM_read_write = PM_read | PM_write
1780 };
1781
1782/* Generate instruction(s) to spill or fill register REG to/from the
1783 worker broadcast array. PM indicates what is to be done, REP
1784 how many loop iterations will be executed (0 for not a loop). */
1785
1786static rtx
a0b3b5c4
TV
1787nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1788 broadcast_data_t *data, bool vector)
d88cd9c4
NS
1789{
1790 rtx res;
1791 machine_mode mode = GET_MODE (reg);
1792
1793 switch (mode)
1794 {
4e10a5a7 1795 case E_BImode:
d88cd9c4
NS
1796 {
1797 rtx tmp = gen_reg_rtx (SImode);
1798
1799 start_sequence ();
1800 if (pm & PM_read)
1801 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
a0b3b5c4 1802 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
d88cd9c4
NS
1803 if (pm & PM_write)
1804 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1805 res = get_insns ();
1806 end_sequence ();
1807 }
1808 break;
1809
1810 default:
1811 {
1812 rtx addr = data->ptr;
1813
1814 if (!addr)
1815 {
1816 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1817
3c55d60f
TV
1818 oacc_bcast_align = MAX (oacc_bcast_align, align);
1819 data->offset = ROUND_UP (data->offset, align);
d88cd9c4 1820 addr = data->base;
a0b3b5c4 1821 gcc_assert (data->base != NULL);
d88cd9c4
NS
1822 if (data->offset)
1823 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1824 }
1825
1826 addr = gen_rtx_MEM (mode, addr);
d88cd9c4
NS
1827 if (pm == PM_read)
1828 res = gen_rtx_SET (addr, reg);
1829 else if (pm == PM_write)
1830 res = gen_rtx_SET (reg, addr);
1831 else
1832 gcc_unreachable ();
1833
1834 if (data->ptr)
1835 {
1836 /* We're using a ptr, increment it. */
1837 start_sequence ();
1838
1839 emit_insn (res);
1840 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1841 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1842 res = get_insns ();
1843 end_sequence ();
1844 }
1845 else
1846 rep = 1;
1847 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1848 }
1849 break;
1850 }
1851 return res;
1852}
738f2522
BS
1853\f
1854/* Returns true if X is a valid address for use in a memory reference. */
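/* Illustratively, these are the PTX-style addresses '[%r1]', '[%r1+12]' and
   '[sym]': a register, a register plus a constant offset, or a symbolic
   address.  */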
1855
1856static bool
1857nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1858{
1859 enum rtx_code code = GET_CODE (x);
1860
1861 switch (code)
1862 {
1863 case REG:
1864 return true;
1865
1866 case PLUS:
1867 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1868 return true;
1869 return false;
1870
1871 case CONST:
1872 case SYMBOL_REF:
1873 case LABEL_REF:
1874 return true;
1875
1876 default:
1877 return false;
1878 }
1879}
738f2522 1880\f
4d5438cd
NS
1881/* Machinery to output constant initializers. When beginning an
1882 initializer, we decide on a fragment size (which is visible in ptx
1883 in the type used), and then all initializer data is buffered until
1884 a fragment is filled and ready to be written out. */
1885
1886static struct
1887{
1888 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1889 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1890 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1891 out. */
1892 unsigned size; /* Fragment size to accumulate. */
1893 unsigned offset; /* Offset within current fragment. */
1894 bool started; /* Whether we've output any initializer. */
1895} init_frag;
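/* Illustrative sketch (names hypothetical): for a public global
   'int x[2] = { 1, 2 };' this machinery buffers two 32-bit fragments and the
   emitted PTX ends up roughly as

	.visible .global .align 4 .u32 x[2] = { 1, 2 };

   with the '= { ... };' part produced one fragment at a time.  */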
1896
1897/* The current fragment is full, write it out. SYM may provide a
1898 symbolic reference we should output, in which case the fragment
1899 value is the addend. */
738f2522
BS
1900
1901static void
4d5438cd 1902output_init_frag (rtx sym)
738f2522 1903{
4d5438cd
NS
1904 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1905 unsigned HOST_WIDE_INT val = init_frag.val;
738f2522 1906
4d5438cd
NS
1907 init_frag.started = true;
1908 init_frag.val = 0;
1909 init_frag.offset = 0;
1910 init_frag.remaining--;
1911
1912 if (sym)
1913 {
6c7c4708
CP
1914 bool function = (SYMBOL_REF_DECL (sym)
1915 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1916 if (!function)
1917 fprintf (asm_out_file, "generic(");
4d5438cd 1918 output_address (VOIDmode, sym);
6c7c4708
CP
1919 if (!function)
1920 fprintf (asm_out_file, ")");
1921 if (val)
1922 fprintf (asm_out_file, " + ");
4d5438cd 1923 }
738f2522 1924
4d5438cd
NS
1925 if (!sym || val)
1926 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
738f2522
BS
1927}
1928
4d5438cd
NS
1929/* Add value VAL of size SIZE to the data we're emitting, and keep
1930 writing out chunks as they fill up. */
738f2522
BS
1931
1932static void
4d5438cd 1933nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 1934{
4d5438cd
NS
1935 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
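  /* Worked example of the mask above (illustrative): for SIZE == 4,
     2 << 31 == 0x100000000 and subtracting 1 gives 0xffffffff; for SIZE == 8,
     2 << 63 wraps to 0 and 0 - 1 gives an all-ones mask, whereas the more
     obvious 1 << 64 would be undefined behavior.  */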
1936
1937 for (unsigned part = 0; size; size -= part)
738f2522 1938 {
4d5438cd
NS
1939 val >>= part * BITS_PER_UNIT;
1940 part = init_frag.size - init_frag.offset;
3c55d60f 1941 part = MIN (part, size);
4d5438cd
NS
1942
1943 unsigned HOST_WIDE_INT partial
1944 = val << (init_frag.offset * BITS_PER_UNIT);
1945 init_frag.val |= partial & init_frag.mask;
1946 init_frag.offset += part;
1947
1948 if (init_frag.offset == init_frag.size)
1949 output_init_frag (NULL);
738f2522
BS
1950 }
1951}
1952
1953/* Target hook for assembling integer object X of size SIZE. */
1954
1955static bool
1956nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1957{
00e52418
NS
1958 HOST_WIDE_INT val = 0;
1959
1960 switch (GET_CODE (x))
738f2522 1961 {
00e52418 1962 default:
a9000e1e
NS
1963 /* Let the generic machinery figure it out, usually for a
1964 CONST_WIDE_INT. */
1965 return false;
00e52418
NS
1966
1967 case CONST_INT:
4d5438cd 1968 nvptx_assemble_value (INTVAL (x), size);
00e52418
NS
1969 break;
1970
1971 case CONST:
1972 x = XEXP (x, 0);
1973 gcc_assert (GET_CODE (x) == PLUS);
1974 val = INTVAL (XEXP (x, 1));
1975 x = XEXP (x, 0);
1976 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1977 /* FALLTHROUGH */
1978
1979 case SYMBOL_REF:
4d5438cd
NS
1980 gcc_assert (size == init_frag.size);
1981 if (init_frag.offset)
738f2522 1982 sorry ("cannot emit unaligned pointers in ptx assembly");
738f2522 1983
00e52418 1984 nvptx_maybe_record_fnsym (x);
4d5438cd
NS
1985 init_frag.val = val;
1986 output_init_frag (x);
738f2522 1987 break;
738f2522
BS
1988 }
1989
738f2522
BS
1990 return true;
1991}
1992
1993/* Output SIZE zero bytes. We ignore the FILE argument since the
1994 functions we're calling to perform the output just use
1995 asm_out_file. */
1996
1997void
1998nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1999{
4d5438cd
NS
2000 /* Finish the current fragment, if it's started. */
2001 if (init_frag.offset)
738f2522 2002 {
4d5438cd 2003 unsigned part = init_frag.size - init_frag.offset;
3c55d60f 2004 part = MIN (part, (unsigned)size);
4d5438cd
NS
2005 size -= part;
2006 nvptx_assemble_value (0, part);
738f2522
BS
2007 }
2008
4d5438cd
NS
2009 /* If this skip doesn't terminate the initializer, write as many
2010 remaining pieces as possible directly. */
2011 if (size < init_frag.remaining * init_frag.size)
738f2522 2012 {
4d5438cd
NS
2013 while (size >= init_frag.size)
2014 {
2015 size -= init_frag.size;
2016 output_init_frag (NULL_RTX);
2017 }
2018 if (size)
2019 nvptx_assemble_value (0, size);
738f2522 2020 }
738f2522
BS
2021}
2022
2023/* Output a string STR with length SIZE. As in nvptx_output_skip we
2024 ignore the FILE arg. */
2025
2026void
2027nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2028{
2029 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2030 nvptx_assemble_value (str[i], 1);
2031}
2032
bf398920
TV
2033/* Return true if TYPE is a record type where the last field is an array without
2034 a given dimension. */
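/* For example (illustrative only):

     struct packet { int len; char payload[]; };

   has such a trailing flexible array member.  */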
2035
2036static bool
2037flexible_array_member_type_p (const_tree type)
2038{
2039 if (TREE_CODE (type) != RECORD_TYPE)
2040 return false;
2041
2042 const_tree last_field = NULL_TREE;
2043 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2044 last_field = f;
2045
2046 if (!last_field)
2047 return false;
2048
2049 const_tree last_field_type = TREE_TYPE (last_field);
2050 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2051 return false;
2052
2053 return (! TYPE_DOMAIN (last_field_type)
2054 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2055}
2056
4ff3145a
NS
2057/* Emit a PTX variable decl and prepare for emission of its
2058 initializer. NAME is the symbol name and SECTION the PTX data
2059 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2060 The caller has already emitted any indentation and linkage
2061 specifier. It is responsible for any initializer, terminating ;
2062 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2063 this is the opposite way round from what PTX wants! */
2064
2065static void
2066nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
bf398920
TV
2067 const_tree type, HOST_WIDE_INT size, unsigned align,
2068 bool undefined = false)
4ff3145a 2069{
59d2d238
CP
2070 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2071 && (TYPE_DOMAIN (type) == NULL_TREE);
2072
bf398920
TV
2073 if (undefined && flexible_array_member_type_p (type))
2074 {
2075 size = 0;
2076 atype = true;
2077 }
2078
4ff3145a
NS
2079 while (TREE_CODE (type) == ARRAY_TYPE)
2080 type = TREE_TYPE (type);
2081
fc0efeea
NS
2082 if (TREE_CODE (type) == VECTOR_TYPE
2083 || TREE_CODE (type) == COMPLEX_TYPE)
2084 /* Neither vector nor complex types can contain the other. */
2085 type = TREE_TYPE (type);
2086
4ff3145a 2087 unsigned elt_size = int_size_in_bytes (type);
fc0efeea
NS
2088
2089 /* Largest mode we're prepared to accept. For BLKmode types we
2090 don't know if it'll contain pointer constants, so have to choose
2091 pointer size, otherwise we can choose DImode. */
2092 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2093
2094 elt_size |= GET_MODE_SIZE (elt_mode);
2095 elt_size &= -elt_size; /* Extract LSB set. */
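  /* Worked example (illustrative): a 12-byte BLKmode element with an 8-byte
     Pmode gives elt_size = 12 | 8 == 12 (binary 1100), and 12 & -12 == 4,
     so the initializer would be emitted as .u32 fragments.  */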
4ff3145a 2096
4d5438cd 2097 init_frag.size = elt_size;
9c582551 2098 /* Avoid undefined shift behavior by using '2'. */
4d5438cd
NS
2099 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2100 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2101 init_frag.val = 0;
2102 init_frag.offset = 0;
2103 init_frag.started = false;
2104 /* Size might not be a multiple of elt size, if there's an
2105 initialized trailing struct array with smaller type than
2106 elt_size. */
2107 init_frag.remaining = (size + elt_size - 1) / elt_size;
4ff3145a
NS
2108
2109 fprintf (file, "%s .align %d .u%d ",
2110 section, align / BITS_PER_UNIT,
2111 elt_size * BITS_PER_UNIT);
2112 assemble_name (file, name);
2113
2114 if (size)
2115 /* We make everything an array, to simplify any initialization
2116 emission. */
4d5438cd 2117 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
59d2d238
CP
2118 else if (atype)
2119 fprintf (file, "[]");
4ff3145a
NS
2120}
2121
738f2522
BS
2122/* Called when the initializer for a decl has been completely output through
2123 combinations of the three functions above. */
2124
2125static void
2126nvptx_assemble_decl_end (void)
2127{
4d5438cd
NS
2128 if (init_frag.offset)
2129 /* This can happen with a packed struct with trailing array member. */
2130 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2131 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2132}
2133
69823d76
NS
2134/* Output an uninitialized common or file-scope variable. */
2135
2136void
2137nvptx_output_aligned_decl (FILE *file, const char *name,
2138 const_tree decl, HOST_WIDE_INT size, unsigned align)
2139{
2140 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2141
2142 /* If this is public, it is common. The nearest thing we have to
2143 common is weak. */
4ff3145a
NS
2144 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2145
2146 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2147 TREE_TYPE (decl), size, align);
4d5438cd 2148 nvptx_assemble_decl_end ();
69823d76
NS
2149}
2150
738f2522
BS
2151/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2152 writing a constant variable EXP with NAME and SIZE and its
2153 initializer to FILE. */
2154
2155static void
2156nvptx_asm_declare_constant_name (FILE *file, const char *name,
4ff3145a 2157 const_tree exp, HOST_WIDE_INT obj_size)
738f2522 2158{
4ff3145a
NS
2159 write_var_marker (file, true, false, name);
2160
2161 fprintf (file, "\t");
2162
738f2522 2163 tree type = TREE_TYPE (exp);
4ff3145a
NS
2164 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2165 TYPE_ALIGN (type));
738f2522
BS
2166}
2167
2168/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2169 a variable DECL with NAME to FILE. */
2170
2171void
2172nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2173{
4ff3145a 2174 write_var_marker (file, true, TREE_PUBLIC (decl), name);
9a863523 2175
4ff3145a
NS
2176 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2177 : DECL_WEAK (decl) ? ".weak " : ".visible "));
9a863523 2178
4ff3145a
NS
2179 tree type = TREE_TYPE (decl);
2180 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2181 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2182 type, obj_size, DECL_ALIGN (decl));
738f2522
BS
2183}
2184
2185/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2186
2187static void
2188nvptx_globalize_label (FILE *, const char *)
2189{
2190}
2191
2192/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2193 declaration only for variable DECL with NAME to FILE. */
f313d112 2194
738f2522
BS
2195static void
2196nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2197{
0a0f74aa
NS
2198 /* The middle end can place constant pool decls into the varpool as
2199 undefined. Until that is fixed, catch the problem here. */
2200 if (DECL_IN_CONSTANT_POOL (decl))
2201 return;
2202
721547cd
NS
2203 /* We support weak definitions, and hence have the right
2204 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2205 if (DECL_WEAK (decl))
2206 error_at (DECL_SOURCE_LOCATION (decl),
2207 "PTX does not support weak declarations"
2208 " (only weak definitions)");
69823d76
NS
2209 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2210
4ff3145a
NS
2211 fprintf (file, "\t.extern ");
2212 tree size = DECL_SIZE_UNIT (decl);
2213 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2214 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
bf398920 2215 DECL_ALIGN (decl), true);
1e5154e7 2216 nvptx_assemble_decl_end ();
738f2522
BS
2217}
2218
f313d112
NS
2219/* Output a pattern for a move instruction. */
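/* Depending on the operand modes this yields, roughly (a sketch, not the
   exact emitted text): 'mov.u32 %r1, %r2;' for a same-mode register move,
   'cvt.u32.u16 %r1, %r2;' for a size-changing move, and
   'cvta.shared.u64 %r1, sym;' when the source is a symbol in a non-generic
   data area.  */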
2220
2221const char *
2222nvptx_output_mov_insn (rtx dst, rtx src)
2223{
2224 machine_mode dst_mode = GET_MODE (dst);
2225 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2226 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2227 machine_mode src_inner = (GET_CODE (src) == SUBREG
2228 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2229
15113b03
NS
2230 rtx sym = src;
2231 if (GET_CODE (sym) == CONST)
2232 sym = XEXP (XEXP (sym, 0), 0);
bd602b7f
NS
2233 if (SYMBOL_REF_P (sym))
2234 {
2235 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2236 return "%.\tcvta%D1%t0\t%0, %1;";
2237 nvptx_maybe_record_fnsym (sym);
2238 }
15113b03 2239
f313d112
NS
2240 if (src_inner == dst_inner)
2241 return "%.\tmov%t0\t%0, %1;";
2242
2243 if (CONSTANT_P (src))
2244 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2245 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2246 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2247
2248 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
3717fbe3
TV
2249 {
2250 if (GET_MODE_BITSIZE (dst_mode) == 128
2251 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2252 {
2253 /* mov.b128 is not supported. */
2254 if (dst_inner == V2DImode && src_inner == TImode)
2255 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2256 else if (dst_inner == TImode && src_inner == V2DImode)
2257 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2258
2259 gcc_unreachable ();
2260 }
2261 return "%.\tmov.b%T0\t%0, %1;";
2262 }
f313d112
NS
2263
2264 return "%.\tcvt%t0%t1\t%0, %1;";
2265}
2266
5012919d
AM
2267static void nvptx_print_operand (FILE *, rtx, int);
2268
738f2522 2269/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2270 involves writing .param declarations and in/out copies into them. For
2271 indirect calls, also write the .callprototype. */
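/* Rough sketch of the output for a direct call to 'int f (int)' (register
   and parameter names here are illustrative, not the exact ones emitted):

	{
		.param.u32 %retval_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r22;
		call (%retval_in), f, (%out_arg1);
		ld.param.u32 %r23, [%retval_in];
	}  */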
738f2522
BS
2272
2273const char *
2274nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2275{
863af9a4 2276 char buf[16];
738f2522
BS
2277 static int labelno;
2278 bool needs_tgt = register_operand (callee, Pmode);
2279 rtx pat = PATTERN (insn);
5012919d
AM
2280 if (GET_CODE (pat) == COND_EXEC)
2281 pat = COND_EXEC_CODE (pat);
f324806d 2282 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2283 tree decl = NULL_TREE;
2284
2285 fprintf (asm_out_file, "\t{\n");
2286 if (result != NULL)
1f065954
NS
2287 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2288 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2289 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2290
ecf6e535 2291 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2292 if (GET_CODE (callee) == SYMBOL_REF)
2293 {
2294 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2295 if (!decl
2296 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2297 nvptx_record_libfunc (callee, result, pat);
2298 else if (DECL_EXTERNAL (decl))
738f2522
BS
2299 nvptx_record_fndecl (decl);
2300 }
2301
2302 if (needs_tgt)
2303 {
2304 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2305 labelno++;
2306 ASM_OUTPUT_LABEL (asm_out_file, buf);
2307 std::stringstream s;
b699adcc 2308 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2309 fputs (s.str().c_str(), asm_out_file);
2310 }
2311
863af9a4 2312 for (int argno = 1; argno < arg_end; argno++)
738f2522 2313 {
863af9a4 2314 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2315 machine_mode mode = GET_MODE (t);
a02d84b6 2316 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2317
863af9a4 2318 /* Mode splitting has already been done. */
a02d84b6
NS
2319 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2320 "\t\tst.param%s [%%out_arg%d], ",
2321 ptx_type, argno, ptx_type, argno);
2322 output_reg (asm_out_file, REGNO (t), VOIDmode);
2323 fprintf (asm_out_file, ";\n");
738f2522
BS
2324 }
2325
5012919d
AM
2326 /* The '.' stands for the call's predicate, if any. */
2327 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2328 fprintf (asm_out_file, "\t\tcall ");
2329 if (result != NULL_RTX)
1f065954
NS
2330 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2331
738f2522
BS
2332 if (decl)
2333 {
2334 const char *name = get_fnname_from_decl (decl);
b4346107 2335 name = nvptx_name_replacement (name);
738f2522
BS
2336 assemble_name (asm_out_file, name);
2337 }
2338 else
cc8ca59e 2339 output_address (VOIDmode, callee);
738f2522 2340
863af9a4
NS
2341 const char *open = "(";
2342 for (int argno = 1; argno < arg_end; argno++)
738f2522 2343 {
863af9a4
NS
2344 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2345 open = "";
738f2522 2346 }
863af9a4
NS
2347 if (decl && DECL_STATIC_CHAIN (decl))
2348 {
5563d5c0 2349 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2350 open = "";
2351 }
2352 if (!open[0])
2353 fprintf (asm_out_file, ")");
f324806d 2354
738f2522
BS
2355 if (needs_tgt)
2356 {
2357 fprintf (asm_out_file, ", ");
2358 assemble_name (asm_out_file, buf);
2359 }
2360 fprintf (asm_out_file, ";\n");
738f2522 2361
51baf85a 2362 if (find_reg_note (insn, REG_NORETURN, NULL))
fb202364
TV
2363 {
2364 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2365 the flow control barrier they imply. It can seg fault if it
2366 encounters what looks like an unexitable loop. Emit a trailing
2367 trap and exit, which it does grok. */
2368 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2369 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2370 }
51baf85a 2371
1f065954
NS
2372 if (result)
2373 {
2374 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2375
2376 if (!rval[0])
2377 /* We must escape the '%' that starts RETURN_REGNUM. */
2378 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2379 reg_names[NVPTX_RETURN_REGNUM]);
2380 return rval;
2381 }
2382
2383 return "}";
738f2522
BS
2384}
2385
2386/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2387
2388static bool
2389nvptx_print_operand_punct_valid_p (unsigned char c)
2390{
2391 return c == '.' || c == '#';
2392}
2393
738f2522
BS
2394/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2395
2396static void
2397nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2398{
2399 rtx off;
2400 if (GET_CODE (x) == CONST)
2401 x = XEXP (x, 0);
2402 switch (GET_CODE (x))
2403 {
2404 case PLUS:
2405 off = XEXP (x, 1);
cc8ca59e 2406 output_address (VOIDmode, XEXP (x, 0));
738f2522 2407 fprintf (file, "+");
cc8ca59e 2408 output_address (VOIDmode, off);
738f2522
BS
2409 break;
2410
2411 case SYMBOL_REF:
2412 case LABEL_REF:
2413 output_addr_const (file, x);
2414 break;
2415
2416 default:
2417 gcc_assert (GET_CODE (x) != MEM);
2418 nvptx_print_operand (file, x, 0);
2419 break;
2420 }
2421}
2422
2423/* Write assembly language output for the address ADDR to FILE. */
2424
2425static void
cc8ca59e 2426nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 2427{
cc8ca59e 2428 nvptx_print_address_operand (file, addr, mode);
738f2522
BS
2429}
2430
2431/* Print an operand, X, to FILE, with an optional modifier in CODE.
2432
2433 Meaning of CODE:
2434 . -- print the predicate for the instruction or an empty string for an
2435 unconditional one.
2436 # -- print a rounding mode for the instruction
2437
9a863523 2438 A -- print a data area for a MEM
738f2522 2439 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2440 D -- print a data area for a MEM operand
d88cd9c4 2441 S -- print a shuffle kind specified by CONST_INT
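   H -- print the register holding the high half of a split multi-word value
   L -- print the register holding the low half of a split multi-word value
   j -- print "@" followed by the operand (predicate prefix)
   J -- print "@!" followed by the operand (negated predicate prefix)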
738f2522
BS
2442 t -- print a type opcode suffix, promoting QImode to 32 bits
2443 T -- print a type size in bits
2444 u -- print a type opcode suffix without promotions. */
2445
2446static void
2447nvptx_print_operand (FILE *file, rtx x, int code)
2448{
738f2522
BS
2449 if (code == '.')
2450 {
2451 x = current_insn_predicate;
2452 if (x)
2453 {
5012919d 2454 fputs ("@", file);
738f2522
BS
2455 if (GET_CODE (x) == EQ)
2456 fputs ("!", file);
5012919d 2457 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2458 }
2459 return;
2460 }
2461 else if (code == '#')
2462 {
2463 fputs (".rn", file);
2464 return;
2465 }
2466
2467 enum rtx_code x_code = GET_CODE (x);
f313d112 2468 machine_mode mode = GET_MODE (x);
738f2522
BS
2469
2470 switch (code)
2471 {
2472 case 'A':
9a863523
NS
2473 x = XEXP (x, 0);
2474 /* FALLTHROUGH. */
7b8edc29 2475
9a863523
NS
2476 case 'D':
2477 if (GET_CODE (x) == CONST)
2478 x = XEXP (x, 0);
2479 if (GET_CODE (x) == PLUS)
2480 x = XEXP (x, 0);
7b8edc29 2481
9a863523
NS
2482 if (GET_CODE (x) == SYMBOL_REF)
2483 fputs (section_for_sym (x), file);
738f2522
BS
2484 break;
2485
738f2522 2486 case 't':
738f2522 2487 case 'u':
f313d112
NS
2488 if (x_code == SUBREG)
2489 {
1310ff03
TV
2490 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2491 if (VECTOR_MODE_P (inner_mode)
2492 && (GET_MODE_SIZE (mode)
2493 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2494 mode = GET_MODE_INNER (inner_mode);
2495 else if (split_mode_p (inner_mode))
2496 mode = maybe_split_mode (inner_mode);
2497 else
2498 mode = inner_mode;
f313d112
NS
2499 }
2500 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2501 break;
2502
3717fbe3
TV
2503 case 'H':
2504 case 'L':
2505 {
2506 rtx inner_x = SUBREG_REG (x);
2507 machine_mode inner_mode = GET_MODE (inner_x);
2508 machine_mode split = maybe_split_mode (inner_mode);
2509
2510 output_reg (file, REGNO (inner_x), split,
2511 (code == 'H'
2512 ? GET_MODE_SIZE (inner_mode) / 2
2513 : 0));
2514 }
2515 break;
2516
d88cd9c4
NS
2517 case 'S':
2518 {
59263259
NS
2519 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2520 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2521 static const char *const kinds[] =
59263259
NS
2522 {".up", ".down", ".bfly", ".idx"};
2523 fputs (kinds[kind], file);
d88cd9c4
NS
2524 }
2525 break;
2526
738f2522 2527 case 'T':
f313d112 2528 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2529 break;
2530
2531 case 'j':
2532 fprintf (file, "@");
2533 goto common;
2534
2535 case 'J':
2536 fprintf (file, "@!");
2537 goto common;
2538
2539 case 'c':
f313d112 2540 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2541 switch (x_code)
2542 {
2543 case EQ:
2544 fputs (".eq", file);
2545 break;
2546 case NE:
f313d112 2547 if (FLOAT_MODE_P (mode))
738f2522
BS
2548 fputs (".neu", file);
2549 else
2550 fputs (".ne", file);
2551 break;
2552 case LE:
578fb225 2553 case LEU:
738f2522
BS
2554 fputs (".le", file);
2555 break;
2556 case GE:
578fb225 2557 case GEU:
738f2522
BS
2558 fputs (".ge", file);
2559 break;
2560 case LT:
578fb225 2561 case LTU:
738f2522
BS
2562 fputs (".lt", file);
2563 break;
2564 case GT:
738f2522 2565 case GTU:
578fb225 2566 fputs (".gt", file);
738f2522
BS
2567 break;
2568 case LTGT:
2569 fputs (".ne", file);
2570 break;
2571 case UNEQ:
2572 fputs (".equ", file);
2573 break;
2574 case UNLE:
2575 fputs (".leu", file);
2576 break;
2577 case UNGE:
2578 fputs (".geu", file);
2579 break;
2580 case UNLT:
2581 fputs (".ltu", file);
2582 break;
2583 case UNGT:
2584 fputs (".gtu", file);
2585 break;
2586 case UNORDERED:
2587 fputs (".nan", file);
2588 break;
2589 case ORDERED:
2590 fputs (".num", file);
2591 break;
2592 default:
2593 gcc_unreachable ();
2594 }
f313d112 2595 if (FLOAT_MODE_P (mode)
738f2522
BS
2596 || x_code == EQ || x_code == NE
2597 || x_code == GEU || x_code == GTU
2598 || x_code == LEU || x_code == LTU)
f313d112 2599 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2600 else
f313d112 2601 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2602 break;
2603 default:
2604 common:
2605 switch (x_code)
2606 {
2607 case SUBREG:
f313d112
NS
2608 {
2609 rtx inner_x = SUBREG_REG (x);
2610 machine_mode inner_mode = GET_MODE (inner_x);
2611 machine_mode split = maybe_split_mode (inner_mode);
2612
1310ff03
TV
2613 if (VECTOR_MODE_P (inner_mode)
2614 && (GET_MODE_SIZE (mode)
2615 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2616 {
2617 output_reg (file, REGNO (inner_x), VOIDmode);
2618 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2619 }
2620 else if (split_mode_p (inner_mode)
f313d112
NS
2621 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2622 output_reg (file, REGNO (inner_x), split);
2623 else
2624 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2625 }
2626 break;
738f2522
BS
2627
2628 case REG:
f313d112 2629 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2630 break;
2631
2632 case MEM:
2633 fputc ('[', file);
f313d112 2634 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2635 fputc (']', file);
2636 break;
2637
2638 case CONST_INT:
2639 output_addr_const (file, x);
2640 break;
2641
2642 case CONST:
2643 case SYMBOL_REF:
2644 case LABEL_REF:
2645 /* We could use output_addr_const, but that can print things like
2646 "x-8", which breaks ptxas. Need to ensure it is output as
2647 "x+-8". */
2648 nvptx_print_address_operand (file, x, VOIDmode);
2649 break;
2650
2651 case CONST_DOUBLE:
2652 long vals[2];
f313d112 2653 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2654 vals[0] &= 0xffffffff;
2655 vals[1] &= 0xffffffff;
f313d112 2656 if (mode == SFmode)
738f2522
BS
2657 fprintf (file, "0f%08lx", vals[0]);
2658 else
2659 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2660 break;
2661
1310ff03
TV
2662 case CONST_VECTOR:
2663 {
2664 unsigned n = CONST_VECTOR_NUNITS (x);
2665 fprintf (file, "{ ");
2666 for (unsigned i = 0; i < n; ++i)
2667 {
2668 if (i != 0)
2669 fprintf (file, ", ");
2670
2671 rtx elem = CONST_VECTOR_ELT (x, i);
2672 output_addr_const (file, elem);
2673 }
2674 fprintf (file, " }");
2675 }
2676 break;
2677
738f2522
BS
2678 default:
2679 output_addr_const (file, x);
2680 }
2681 }
2682}
2683\f
2684/* Record replacement regs used to deal with subreg operands. */
2685struct reg_replace
2686{
2687 rtx replacement[MAX_RECOG_OPERANDS];
2688 machine_mode mode;
2689 int n_allocated;
2690 int n_in_use;
2691};
2692
2693/* Allocate or reuse a replacement in R and return the rtx. */
2694
2695static rtx
2696get_replacement (struct reg_replace *r)
2697{
2698 if (r->n_allocated == r->n_in_use)
2699 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2700 return r->replacement[r->n_in_use++];
2701}
2702
2703/* Clean up subreg operands. In ptx assembly, everything is typed, and
2704 the presence of subregs would break the rules for most instructions.
2705 Replace them with a suitable new register of the right size, plus
2706 conversion copyin/copyout instructions. */
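/* Illustrative sketch (register numbers hypothetical): an input operand
   (subreg:QI (reg:SI 23) 0) is replaced by a fresh QImode register 24 and
   preceded by (set (reg:QI 24) (truncate:QI (reg:SI 23))), so the printed
   instruction only ever sees properly typed registers.  */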
2707
2708static void
517665b3 2709nvptx_reorg_subreg (void)
738f2522
BS
2710{
2711 struct reg_replace qiregs, hiregs, siregs, diregs;
2712 rtx_insn *insn, *next;
2713
738f2522
BS
2714 qiregs.n_allocated = 0;
2715 hiregs.n_allocated = 0;
2716 siregs.n_allocated = 0;
2717 diregs.n_allocated = 0;
2718 qiregs.mode = QImode;
2719 hiregs.mode = HImode;
2720 siregs.mode = SImode;
2721 diregs.mode = DImode;
2722
2723 for (insn = get_insns (); insn; insn = next)
2724 {
2725 next = NEXT_INSN (insn);
2726 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2727 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2728 || GET_CODE (PATTERN (insn)) == USE
2729 || GET_CODE (PATTERN (insn)) == CLOBBER)
2730 continue;
f324806d 2731
738f2522
BS
2732 qiregs.n_in_use = 0;
2733 hiregs.n_in_use = 0;
2734 siregs.n_in_use = 0;
2735 diregs.n_in_use = 0;
2736 extract_insn (insn);
2737 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2738
738f2522
BS
2739 for (int i = 0; i < recog_data.n_operands; i++)
2740 {
2741 rtx op = recog_data.operand[i];
2742 if (GET_CODE (op) != SUBREG)
2743 continue;
2744
2745 rtx inner = SUBREG_REG (op);
2746
2747 machine_mode outer_mode = GET_MODE (op);
2748 machine_mode inner_mode = GET_MODE (inner);
2749 gcc_assert (s_ok);
2750 if (s_ok
2751 && (GET_MODE_PRECISION (inner_mode)
2752 >= GET_MODE_PRECISION (outer_mode)))
2753 continue;
2754 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2755 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2756 : outer_mode == HImode ? &hiregs
2757 : outer_mode == SImode ? &siregs
2758 : &diregs);
2759 rtx new_reg = get_replacement (r);
2760
2761 if (recog_data.operand_type[i] != OP_OUT)
2762 {
2763 enum rtx_code code;
2764 if (GET_MODE_PRECISION (inner_mode)
2765 < GET_MODE_PRECISION (outer_mode))
2766 code = ZERO_EXTEND;
2767 else
2768 code = TRUNCATE;
2769
f7df4a84 2770 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2771 gen_rtx_fmt_e (code, outer_mode, inner));
2772 emit_insn_before (pat, insn);
2773 }
2774
2775 if (recog_data.operand_type[i] != OP_IN)
2776 {
2777 enum rtx_code code;
2778 if (GET_MODE_PRECISION (inner_mode)
2779 < GET_MODE_PRECISION (outer_mode))
2780 code = TRUNCATE;
2781 else
2782 code = ZERO_EXTEND;
2783
f7df4a84 2784 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2785 gen_rtx_fmt_e (code, inner_mode, new_reg));
2786 emit_insn_after (pat, insn);
2787 }
2788 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2789 }
2790 }
517665b3 2791}
738f2522 2792
5012919d
AM
2793/* Return a SImode "master lane index" register for uniform-simt, allocating on
2794 first use. */
2795
2796static rtx
2797nvptx_get_unisimt_master ()
2798{
2799 rtx &master = cfun->machine->unisimt_master;
2800 return master ? master : master = gen_reg_rtx (SImode);
2801}
2802
2803/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2804
2805static rtx
2806nvptx_get_unisimt_predicate ()
2807{
2808 rtx &pred = cfun->machine->unisimt_predicate;
2809 return pred ? pred : pred = gen_reg_rtx (BImode);
2810}
2811
2812/* Return true if given call insn references one of the functions provided by
2813 the CUDA runtime: malloc, free, vprintf. */
2814
2815static bool
2816nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2817{
2818 rtx pat = PATTERN (insn);
2819 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2820 pat = XVECEXP (pat, 0, 0);
2821 if (GET_CODE (pat) == SET)
2822 pat = SET_SRC (pat);
2823 gcc_checking_assert (GET_CODE (pat) == CALL
2824 && GET_CODE (XEXP (pat, 0)) == MEM);
2825 rtx addr = XEXP (XEXP (pat, 0), 0);
2826 if (GET_CODE (addr) != SYMBOL_REF)
2827 return false;
2828 const char *name = XSTR (addr, 0);
2829 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2830 references with forced assembler name refer to PTX syscalls. For vprintf,
2831 accept both normal and forced-assembler-name references. */
2832 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2833 || !strcmp (name, "*malloc")
2834 || !strcmp (name, "*free"));
2835}
2836
2837/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2838 propagate its value from lane MASTER to current lane. */
2839
2840static void
2841nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2842{
2843 rtx reg;
2844 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2845 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2846}
2847
2848/* Adjust code for the uniform-simt code generation variant by making atomics and
2849 "syscalls" conditionally executed, and inserting shuffle-based propagation
2850 for registers being set. */
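/* Illustrative sketch: an atomic insn with pattern P is rewritten as
   (cond_exec (ne predicate 0) P) so that only one lane executes it, and
   every register P sets is then broadcast from the master lane with a
   SHUFFLE_IDX shuffle so all lanes observe the same value.  */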
2851
2852static void
2853nvptx_reorg_uniform_simt ()
2854{
2855 rtx_insn *insn, *next;
2856
2857 for (insn = get_insns (); insn; insn = next)
2858 {
2859 next = NEXT_INSN (insn);
2860 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2861 && !(NONJUMP_INSN_P (insn)
2862 && GET_CODE (PATTERN (insn)) == PARALLEL
2863 && get_attr_atomic (insn)))
2864 continue;
2865 rtx pat = PATTERN (insn);
2866 rtx master = nvptx_get_unisimt_master ();
2867 for (int i = 0; i < XVECLEN (pat, 0); i++)
2868 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2869 rtx pred = nvptx_get_unisimt_predicate ();
2870 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2871 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2872 validate_change (insn, &PATTERN (insn), pat, false);
2873 }
2874}
2875
d2d47a28
NS
2876/* Loop structure of the function. The entire function is described as
2877 a NULL loop. */
d88cd9c4
NS
2878
2879struct parallel
2880{
2881 /* Parent parallel. */
2882 parallel *parent;
2883
2884 /* Next sibling parallel. */
2885 parallel *next;
2886
2887 /* First child parallel. */
2888 parallel *inner;
2889
2890 /* Partitioning mask of the parallel. */
2891 unsigned mask;
2892
2893 /* Partitioning used within inner parallels. */
2894 unsigned inner_mask;
2895
2896 /* Location of parallel forked and join. The forked is the first
2897 block in the parallel and the join is the first block after
2898 the partition. */
2899 basic_block forked_block;
2900 basic_block join_block;
2901
2902 rtx_insn *forked_insn;
2903 rtx_insn *join_insn;
2904
2905 rtx_insn *fork_insn;
2906 rtx_insn *joining_insn;
2907
2908 /* Basic blocks in this parallel, but not in child parallels. The
2909 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2910 blocks are not. */
2911 auto_vec<basic_block> blocks;
2912
2913public:
2914 parallel (parallel *parent, unsigned mode);
2915 ~parallel ();
2916};
2917
2918/* Constructor links the new parallel into its parent's chain of
2919 children. */
2920
2921parallel::parallel (parallel *parent_, unsigned mask_)
2922 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2923{
2924 forked_block = join_block = 0;
2925 forked_insn = join_insn = 0;
2926 fork_insn = joining_insn = 0;
2927
2928 if (parent)
2929 {
2930 next = parent->inner;
2931 parent->inner = this;
2932 }
2933}
2934
2935parallel::~parallel ()
2936{
2937 delete inner;
2938 delete next;
2939}
2940
2941/* Map of basic blocks to insns */
2942typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2943
2944/* A tuple of an insn of interest and the BB in which it resides. */
2945typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2946typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2947
2948/* Split basic blocks such that each forked and join unspec is at
2949 the start of its basic block. Thus afterwards each block will
2950 have a single partitioning mode. We do the same for return
2951 insns, as they are executed by every thread. Populate MAP with
2952 head and tail blocks. We also clear the BB visited flag, which is
2953 used when finding partitions. */
2955
2956static void
2957nvptx_split_blocks (bb_insn_map_t *map)
2958{
2959 insn_bb_vec_t worklist;
2960 basic_block block;
2961 rtx_insn *insn;
2962
2963 /* Locate all the reorg instructions of interest. */
2964 FOR_ALL_BB_FN (block, cfun)
2965 {
2966 bool seen_insn = false;
2967
2968 /* Clear visited flag, for use by parallel locator */
2969 block->flags &= ~BB_VISITED;
2970
2971 FOR_BB_INSNS (block, insn)
2972 {
2973 if (!INSN_P (insn))
2974 continue;
2975 switch (recog_memoized (insn))
2976 {
2977 default:
2978 seen_insn = true;
2979 continue;
2980 case CODE_FOR_nvptx_forked:
2981 case CODE_FOR_nvptx_join:
2982 break;
2983
2984 case CODE_FOR_return:
2985 /* We also need to split just before return insns, as
2986 that insn needs executing by all threads, but the
2987 block it is in probably does not. */
2988 break;
2989 }
2990
2991 if (seen_insn)
2992 /* We've found an instruction that must be at the start of
2993 a block, but isn't. Add it to the worklist. */
2994 worklist.safe_push (insn_bb_t (insn, block));
2995 else
2996 /* It was already the first instruction. Just add it to
2997 the map. */
2998 map->get_or_insert (block) = insn;
2999 seen_insn = true;
3000 }
3001 }
3002
3003 /* Split blocks on the worklist. */
3004 unsigned ix;
3005 insn_bb_t *elt;
3006 basic_block remap = 0;
3007 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3008 {
3009 if (remap != elt->second)
3010 {
3011 block = elt->second;
3012 remap = block;
3013 }
3014
3015 /* Split block before insn. The insn is in the new block */
3016 edge e = split_block (block, PREV_INSN (elt->first));
3017
3018 block = e->dest;
3019 map->get_or_insert (block) = elt->first;
3020 }
3021}
3022
3023/* BLOCK is a basic block containing a head or tail instruction.
3024 Locate the associated prehead or pretail instruction, which must be
3025 in the single predecessor block. */
3026
3027static rtx_insn *
3028nvptx_discover_pre (basic_block block, int expected)
3029{
3030 gcc_assert (block->preds->length () == 1);
3031 basic_block pre_block = (*block->preds)[0]->src;
3032 rtx_insn *pre_insn;
3033
3034 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3035 pre_insn = PREV_INSN (pre_insn))
3036 gcc_assert (pre_insn != BB_HEAD (pre_block));
3037
3038 gcc_assert (recog_memoized (pre_insn) == expected);
3039 return pre_insn;
3040}
3041
3042/* Dump this parallel and all its inner parallels. */
3043
3044static void
3045nvptx_dump_pars (parallel *par, unsigned depth)
3046{
3047 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3048 depth, par->mask,
3049 par->forked_block ? par->forked_block->index : -1,
3050 par->join_block ? par->join_block->index : -1);
3051
3052 fprintf (dump_file, " blocks:");
3053
3054 basic_block block;
3055 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3056 fprintf (dump_file, " %d", block->index);
3057 fprintf (dump_file, "\n");
3058 if (par->inner)
3059 nvptx_dump_pars (par->inner, depth + 1);
3060
3061 if (par->next)
3062 nvptx_dump_pars (par->next, depth);
3063}
3064
3065/* If BLOCK contains a fork/join marker, process it to create or
3066 terminate a loop structure. Add this block to the current loop,
3067 and then walk successor blocks. */
3068
3069static parallel *
3070nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3071{
3072 if (block->flags & BB_VISITED)
3073 return par;
3074 block->flags |= BB_VISITED;
3075
3076 if (rtx_insn **endp = map->get (block))
3077 {
3078 rtx_insn *end = *endp;
3079
3080 /* This is a block head or tail, or return instruction. */
3081 switch (recog_memoized (end))
3082 {
3083 case CODE_FOR_return:
3084 /* Return instructions are in their own block, and we
3085 don't need to do anything more. */
3086 return par;
3087
3088 case CODE_FOR_nvptx_forked:
3089 /* Loop head, create a new inner loop and add it into
3090 our parent's child list. */
3091 {
3092 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3093
3094 gcc_assert (mask);
3095 par = new parallel (par, mask);
3096 par->forked_block = block;
3097 par->forked_insn = end;
e91eba31 3098 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3099 par->fork_insn
3100 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3101 }
3102 break;
3103
3104 case CODE_FOR_nvptx_join:
3105 /* A loop tail. Finish the current loop and return to
3106 parent. */
3107 {
3108 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3109
3110 gcc_assert (par->mask == mask);
3111 par->join_block = block;
3112 par->join_insn = end;
e91eba31 3113 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3114 par->joining_insn
3115 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3116 par = par->parent;
3117 }
3118 break;
3119
3120 default:
3121 gcc_unreachable ();
3122 }
3123 }
3124
3125 if (par)
3126 /* Add this block onto the current loop's list of blocks. */
3127 par->blocks.safe_push (block);
3128 else
3129 /* This must be the entry block. Create a NULL parallel. */
3130 par = new parallel (0, 0);
3131
3132 /* Walk successor blocks. */
3133 edge e;
3134 edge_iterator ei;
3135
3136 FOR_EACH_EDGE (e, ei, block->succs)
3137 nvptx_find_par (map, par, e->dest);
3138
3139 return par;
3140}
3141
3142/* DFS walk the CFG looking for fork & join markers. Construct
3143 loop structures as we go. MAP is a mapping of basic blocks
3144 to head & tail markers, discovered when splitting blocks. This
3145 speeds up the discovery. We rely on the BB visited flag having
3146 been cleared when splitting blocks. */
3147
3148static parallel *
3149nvptx_discover_pars (bb_insn_map_t *map)
3150{
3151 basic_block block;
3152
3153 /* Mark exit blocks as visited. */
3154 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3155 block->flags |= BB_VISITED;
3156
3157 /* And entry block as not. */
3158 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3159 block->flags &= ~BB_VISITED;
3160
3161 parallel *par = nvptx_find_par (map, 0, block);
3162
3163 if (dump_file)
3164 {
3165 fprintf (dump_file, "\nLoops\n");
3166 nvptx_dump_pars (par, 0);
3167 fprintf (dump_file, "\n");
3168 }
3169
3170 return par;
3171}
3172
912442c2
NS
3173/* Analyse a group of BBs within a partitioned region and create N
3174 Single-Entry-Single-Exit regions. Some of those regions will be
3175 trivial ones consisting of a single BB. The blocks of a
3176 partitioned region might form a set of disjoint graphs -- because
3177 the region encloses a differently partitioned subregion.
3178
3179 We use the linear time algorithm described in 'Finding Regions Fast:
3180 Single Entry Single Exit and control Regions in Linear Time'
3181 Johnson, Pearson & Pingali. That algorithm deals with complete
3182 CFGs, where a back edge is inserted from END to START, and thus the
3183 problem becomes one of finding equivalent loops.
3184
3185 In this case we have a partial CFG. We complete it by redirecting
3186 any incoming edge to the graph to be from an arbitrary external BB,
3187 and similarly redirecting any outgoing edge to be to that BB.
3188 Thus we end up with a closed graph.
3189
3190 The algorithm works by building a spanning tree of an undirected
3191 graph and keeping track of back edges from nodes further from the
3192 root in the tree to nodes nearer to the root in the tree. In the
3193 description below, the root is up and the tree grows downwards.
3194
3195 We avoid having to deal with degenerate back-edges to the same
3196 block, by splitting each BB into 3 -- one for input edges, one for
3197 the node itself and one for the output edges. Such back edges are
3198 referred to as 'Brackets'. Cycle equivalent nodes will have the
3199 same set of brackets.
3200
3201 Determining bracket equivalency is done by maintaining a list of
3202 brackets in such a manner that the list length and final bracket
3203 uniquely identify the set.
3204
3205 We use coloring to mark all BBs with cycle equivalency with the
3206 same color. This is the output of the 'Finding Regions Fast'
3207 algorithm. Notice it doesn't actually find the set of nodes within
3208 a particular region, just unordered sets of nodes that are the
3209 entries and exits of SESE regions.
3210
3211 After determining cycle equivalency, we need to find the minimal
3212 set of SESE regions. Do this with a DFS coloring walk of the
3213 complete graph. We're either 'looking' or 'coloring'. When
3214 looking, and we're in the subgraph, we start coloring the color of
3215 the current node, and remember that node as the start of the
3216 current color's SESE region. Every time we go to a new node, we
3217 decrement the count of nodes with that color. If it reaches zero,
3218 we remember that node as the end of the current color's SESE region
3219 and return to 'looking'. Otherwise we color the node the current
3220 color.
3221
3222 This way we end up with coloring the inside of non-trivial SESE
3223 regions with the color of that region. */
3224
3225/* A pair of BBs. We use this to represent SESE regions. */
3226typedef std::pair<basic_block, basic_block> bb_pair_t;
3227typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3228
3229/* A node in the undirected CFG. The discriminator SECOND indicates just
3230 above or just below the BB indicated by FIRST. */
3231typedef std::pair<basic_block, int> pseudo_node_t;
3232
3233/* A bracket indicates an edge towards the root of the spanning tree of the
3234 undirected graph. Each bracket has a color, determined
3235 from the current set of brackets. */
3236struct bracket
3237{
3238 pseudo_node_t back; /* Back target */
3239
3240 /* Current color and size of set. */
3241 unsigned color;
3242 unsigned size;
3243
3244 bracket (pseudo_node_t back_)
3245 : back (back_), color (~0u), size (~0u)
3246 {
3247 }
3248
3249 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3250 {
3251 if (length != size)
3252 {
3253 size = length;
3254 color = color_counts.length ();
3255 color_counts.quick_push (0);
3256 }
3257 color_counts[color]++;
3258 return color;
3259 }
3260};
3261
3262typedef auto_vec<bracket> bracket_vec_t;
3263
3264/* Basic block info for finding SESE regions. */
3265
3266struct bb_sese
3267{
3268 int node; /* Node number in spanning tree. */
3269 int parent; /* Parent node number. */
3270
3271 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3272 edges arrive at pseudo-node Ai and the outgoing edges leave at
3273 pseudo-node Ao. We have to remember which way we arrived at a
3274 particular node when generating the spanning tree. dir > 0 means
3275 we arrived at Ai, dir < 0 means we arrived at Ao. */
3276 int dir;
3277
3278 /* Lowest numbered pseudo-node reached via a backedge from this
3279 node, or any descendant. */
3280 pseudo_node_t high;
3281
3282 int color; /* Cycle-equivalence color */
3283
3284 /* Stack of brackets for this node. */
3285 bracket_vec_t brackets;
3286
3287 bb_sese (unsigned node_, unsigned p, int dir_)
3288 :node (node_), parent (p), dir (dir_)
3289 {
3290 }
3291 ~bb_sese ();
3292
3293 /* Push a bracket ending at BACK. */
3294 void push (const pseudo_node_t &back)
3295 {
3296 if (dump_file)
3297 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3298 back.first ? back.first->index : 0, back.second);
3299 brackets.safe_push (bracket (back));
3300 }
3301
3302 void append (bb_sese *child);
3303 void remove (const pseudo_node_t &);
3304
3305 /* Set node's color. */
3306 void set_color (auto_vec<unsigned> &color_counts)
3307 {
3308 color = brackets.last ().get_color (color_counts, brackets.length ());
3309 }
3310};
3311
3312bb_sese::~bb_sese ()
3313{
3314}
3315
3316/* Destructively append CHILD's brackets. */
3317
3318void
3319bb_sese::append (bb_sese *child)
3320{
3321 if (int len = child->brackets.length ())
3322 {
3323 int ix;
3324
3325 if (dump_file)
3326 {
3327 for (ix = 0; ix < len; ix++)
3328 {
3329 const pseudo_node_t &pseudo = child->brackets[ix].back;
3330 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3331 child->node, pseudo.first ? pseudo.first->index : 0,
3332 pseudo.second);
3333 }
3334 }
3335 if (!brackets.length ())
3336 std::swap (brackets, child->brackets);
3337 else
3338 {
3339 brackets.reserve (len);
3340 for (ix = 0; ix < len; ix++)
3341 brackets.quick_push (child->brackets[ix]);
3342 }
3343 }
3344}
3345
3346/* Remove brackets that terminate at PSEUDO. */
3347
3348void
3349bb_sese::remove (const pseudo_node_t &pseudo)
3350{
3351 unsigned removed = 0;
3352 int len = brackets.length ();
3353
3354 for (int ix = 0; ix < len; ix++)
3355 {
3356 if (brackets[ix].back == pseudo)
3357 {
3358 if (dump_file)
3359 fprintf (dump_file, "Removing backedge %d:%+d\n",
3360 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3361 removed++;
3362 }
3363 else if (removed)
3364 brackets[ix-removed] = brackets[ix];
3365 }
3366 while (removed--)
3367 brackets.pop ();
3368}
3369
3370/* Accessors for BB's aux pointer. */
3371#define BB_SET_SESE(B, S) ((B)->aux = (S))
3372#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3373
3374/* DFS walk creating SESE data structures. Only cover nodes with
3375 BB_VISITED set. Append discovered blocks to LIST. We number in
3376 increments of 3 so that the above and below pseudo nodes can be
3377 implicitly numbered too. */
3378
3379static int
3380nvptx_sese_number (int n, int p, int dir, basic_block b,
3381 auto_vec<basic_block> *list)
3382{
3383 if (BB_GET_SESE (b))
3384 return n;
3385
3386 if (dump_file)
3387 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3388 b->index, n, p, dir);
3389
3390 BB_SET_SESE (b, new bb_sese (n, p, dir));
3391 p = n;
3392
3393 n += 3;
3394 list->quick_push (b);
3395
3396 /* First walk the nodes on the 'other side' of this node, then walk
3397 the nodes on the same side. */
3398 for (unsigned ix = 2; ix; ix--)
3399 {
3400 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3401 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3402 : offsetof (edge_def, src));
3403 edge e;
3404 edge_iterator (ei);
3405
3406 FOR_EACH_EDGE (e, ei, edges)
3407 {
3408 basic_block target = *(basic_block *)((char *)e + offset);
3409
3410 if (target->flags & BB_VISITED)
3411 n = nvptx_sese_number (n, p, dir, target, list);
3412 }
3413 dir = -dir;
3414 }
3415 return n;
3416}
3417
3418/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3419 EDGES are the outgoing edges and OFFSET is the offset to the src
3420 or dst block on the edges. */
3421
3422static void
3423nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3424 vec<edge, va_gc> *edges, size_t offset)
3425{
3426 edge e;
3427 edge_iterator (ei);
3428 int hi_back = depth;
3429 pseudo_node_t node_back (0, depth);
3430 int hi_child = depth;
3431 pseudo_node_t node_child (0, depth);
3432 basic_block child = NULL;
3433 unsigned num_children = 0;
3434 int usd = -dir * sese->dir;
3435
3436 if (dump_file)
3437 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3438 me->index, sese->node, dir);
3439
3440 if (dir < 0)
3441 {
3442 /* This is the above pseudo-child. It has the BB itself as an
3443 additional child node. */
3444 node_child = sese->high;
3445 hi_child = node_child.second;
3446 if (node_child.first)
3447 hi_child += BB_GET_SESE (node_child.first)->node;
3448 num_children++;
3449 }
3450
3451 /* Examine each edge.
3452 - if it is a child (a) append its bracket list and (b) record
3453 whether it is the child with the highest reaching bracket.
3454 - if it is an edge to ancestor, record whether it's the highest
3455 reaching backlink. */
3456 FOR_EACH_EDGE (e, ei, edges)
3457 {
3458 basic_block target = *(basic_block *)((char *)e + offset);
3459
3460 if (bb_sese *t_sese = BB_GET_SESE (target))
3461 {
3462 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3463 {
3464 /* Child node. Append its bracket list. */
3465 num_children++;
3466 sese->append (t_sese);
3467
3468 /* Compare its hi value. */
3469 int t_hi = t_sese->high.second;
3470
3471 if (basic_block child_hi_block = t_sese->high.first)
3472 t_hi += BB_GET_SESE (child_hi_block)->node;
3473
3474 if (hi_child > t_hi)
3475 {
3476 hi_child = t_hi;
3477 node_child = t_sese->high;
3478 child = target;
3479 }
3480 }
3481 else if (t_sese->node < sese->node + dir
3482 && !(dir < 0 && sese->parent == t_sese->node))
3483 {
3484 /* Non-parental ancestor node -- a backlink. */
3485 int d = usd * t_sese->dir;
3486 int back = t_sese->node + d;
3487
3488 if (hi_back > back)
3489 {
3490 hi_back = back;
3491 node_back = pseudo_node_t (target, d);
3492 }
3493 }
3494 }
3495 else
3496 { /* Fallen off graph, backlink to entry node. */
3497 hi_back = 0;
3498 node_back = pseudo_node_t (0, 0);
3499 }
3500 }
3501
3502 /* Remove any brackets that terminate at this pseudo node. */
3503 sese->remove (pseudo_node_t (me, dir));
3504
3505 /* Now push any backlinks from this pseudo node. */
3506 FOR_EACH_EDGE (e, ei, edges)
3507 {
3508 basic_block target = *(basic_block *)((char *)e + offset);
3509 if (bb_sese *t_sese = BB_GET_SESE (target))
3510 {
3511 if (t_sese->node < sese->node + dir
3512 && !(dir < 0 && sese->parent == t_sese->node))
3513 /* Non-parental ancestor node - backedge from me. */
3514 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3515 }
3516 else
3517 {
3518 /* Back edge to entry node. */
3519 sese->push (pseudo_node_t (0, 0));
3520 }
3521 }
3522
3523 /* If this node leads directly or indirectly to a no-return region of
3524 the graph, then fake a backedge to entry node. */
3525 if (!sese->brackets.length () || !edges || !edges->length ())
3526 {
3527 hi_back = 0;
3528 node_back = pseudo_node_t (0, 0);
3529 sese->push (node_back);
3530 }
3531
3532 /* Record the highest reaching backedge from us or a descendant. */
3533 sese->high = hi_back < hi_child ? node_back : node_child;
3534
3535 if (num_children > 1)
3536 {
3537 /* There is more than one child -- this is a Y-shaped piece of
3538 spanning tree. We have to insert a fake backedge from this
3539 node to the highest ancestor reached by not-the-highest
3540 reaching child. Note that there may be multiple children
3541 with backedges to the same highest node. That's ok and we
3542 insert the edge to that highest node. */
3543 hi_child = depth;
3544 if (dir < 0 && child)
3545 {
3546 node_child = sese->high;
3547 hi_child = node_child.second;
3548 if (node_child.first)
3549 hi_child += BB_GET_SESE (node_child.first)->node;
3550 }
3551
3552 FOR_EACH_EDGE (e, ei, edges)
3553 {
3554 basic_block target = *(basic_block *)((char *)e + offset);
3555
3556 if (target == child)
3557 /* Ignore the highest child. */
3558 continue;
3559
3560 bb_sese *t_sese = BB_GET_SESE (target);
3561 if (!t_sese)
3562 continue;
3563 if (t_sese->parent != sese->node)
3564 /* Not a child. */
3565 continue;
3566
3567 /* Compare its hi value. */
3568 int t_hi = t_sese->high.second;
3569
3570 if (basic_block child_hi_block = t_sese->high.first)
3571 t_hi += BB_GET_SESE (child_hi_block)->node;
3572
3573 if (hi_child > t_hi)
3574 {
3575 hi_child = t_hi;
3576 node_child = t_sese->high;
3577 }
3578 }
3579
3580 sese->push (node_child);
3581 }
3582}
3583
3584
3585/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3586 proceed to successors. Set SESE entry and exit nodes of
3587 REGIONS. */
3588
3589static void
3590nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3591 basic_block block, int coloring)
3592{
3593 bb_sese *sese = BB_GET_SESE (block);
3594
3595 if (block->flags & BB_VISITED)
3596 {
3597 /* If we've already encountered this block, either we must not
3598 be coloring, or it must have been colored the current color. */
3599 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3600 return;
3601 }
3602
3603 block->flags |= BB_VISITED;
3604
3605 if (sese)
3606 {
3607 if (coloring < 0)
3608 {
3609 /* Start coloring a region. */
3610 regions[sese->color].first = block;
3611 coloring = sese->color;
3612 }
3613
3614 if (!--color_counts[sese->color] && sese->color == coloring)
3615 {
3616 /* Found final block of SESE region. */
3617 regions[sese->color].second = block;
3618 coloring = -1;
3619 }
3620 else
3621 /* Color the node, so we can assert on revisiting the node
3622 that the graph is indeed SESE. */
3623 sese->color = coloring;
3624 }
3625 else
3626 /* Fallen off the subgraph, we cannot be coloring. */
3627 gcc_assert (coloring < 0);
3628
3629 /* Walk each successor block. */
3630 if (block->succs && block->succs->length ())
3631 {
3632 edge e;
3633 edge_iterator ei;
3634
3635 FOR_EACH_EDGE (e, ei, block->succs)
3636 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3637 }
3638 else
3639 gcc_assert (coloring < 0);
3640}
3641
3642/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3643 end up with NULL entries in it. */
3644
3645static void
3646nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3647{
3648 basic_block block;
3649 int ix;
3650
3651 /* First clear each BB of the whole function. */
2300c332 3652 FOR_ALL_BB_FN (block, cfun)
3653 {
3654 block->flags &= ~BB_VISITED;
3655 BB_SET_SESE (block, 0);
3656 }
3657
3658 /* Mark blocks in the function that are in this graph. */
3659 for (ix = 0; blocks.iterate (ix, &block); ix++)
3660 block->flags |= BB_VISITED;
3661
3662 /* Counts of nodes assigned to each color. There cannot be more
3663 colors than blocks (and hopefully there will be fewer). */
3664 auto_vec<unsigned> color_counts;
3665 color_counts.reserve (blocks.length ());
3666
3667 /* Worklist of nodes in the spanning tree. Again, there cannot be
3668 more nodes in the tree than blocks (there will be fewer if the
3669 CFG of blocks is disjoint). */
3670 auto_vec<basic_block> spanlist;
3671 spanlist.reserve (blocks.length ());
3672
3673 /* Make sure every block has its cycle class determined. */
3674 for (ix = 0; blocks.iterate (ix, &block); ix++)
3675 {
3676 if (BB_GET_SESE (block))
3677 /* We already met this block in an earlier graph solve. */
3678 continue;
3679
3680 if (dump_file)
3681 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3682
3683 /* Number the nodes reachable from BLOCK in initial DFS order. */
3684 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3685
3686 /* Now walk in reverse DFS order to find cycle equivalents. */
3687 while (spanlist.length ())
3688 {
3689 block = spanlist.pop ();
3690 bb_sese *sese = BB_GET_SESE (block);
3691
3692 /* Do the pseudo node below. */
3693 nvptx_sese_pseudo (block, sese, depth, +1,
3694 sese->dir > 0 ? block->succs : block->preds,
3695 (sese->dir > 0 ? offsetof (edge_def, dest)
3696 : offsetof (edge_def, src)));
3697 sese->set_color (color_counts);
3698 /* Do the pseudo node above. */
3699 nvptx_sese_pseudo (block, sese, depth, -1,
3700 sese->dir < 0 ? block->succs : block->preds,
3701 (sese->dir < 0 ? offsetof (edge_def, dest)
3702 : offsetof (edge_def, src)));
3703 }
3704 if (dump_file)
3705 fprintf (dump_file, "\n");
3706 }
3707
3708 if (dump_file)
3709 {
3710 unsigned count;
3711 const char *comma = "";
3712
3713 fprintf (dump_file, "Found %d cycle equivalents\n",
3714 color_counts.length ());
3715 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3716 {
3717 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3718
3719 comma = "";
3720 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3721 if (BB_GET_SESE (block)->color == ix)
3722 {
3723 block->flags |= BB_VISITED;
3724 fprintf (dump_file, "%s%d", comma, block->index);
3725 comma=",";
3726 }
3727 fprintf (dump_file, "}");
3728 comma = ", ";
3729 }
3730 fprintf (dump_file, "\n");
3731 }
3732
3733 /* Now we've colored every block in the subgraph. We now need to
3734 determine the minimal set of SESE regions that cover that
3735 subgraph. Do this with a DFS walk of the complete function.
3736 During the walk we're either 'looking' or 'coloring'. When we
3737 reach the last node of a particular color, we stop coloring and
3738 return to looking. */
3739
3740 /* There cannot be more SESE regions than colors. */
3741 regions.reserve (color_counts.length ());
3742 for (ix = color_counts.length (); ix--;)
3743 regions.quick_push (bb_pair_t (0, 0));
3744
3745 for (ix = 0; blocks.iterate (ix, &block); ix++)
3746 block->flags &= ~BB_VISITED;
3747
3748 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3749
3750 if (dump_file)
3751 {
3752 const char *comma = "";
3753 int len = regions.length ();
3754
3755 fprintf (dump_file, "SESE regions:");
3756 for (ix = 0; ix != len; ix++)
3757 {
3758 basic_block from = regions[ix].first;
3759 basic_block to = regions[ix].second;
3760
3761 if (from)
3762 {
3763 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3764 if (to != from)
3765 fprintf (dump_file, "->%d", to->index);
3766
3767 int color = BB_GET_SESE (from)->color;
3768
3769 /* Print the blocks within the region (excluding ends). */
3770 FOR_EACH_BB_FN (block, cfun)
3771 {
3772 bb_sese *sese = BB_GET_SESE (block);
3773
3774 if (sese && sese->color == color
3775 && block != from && block != to)
3776 fprintf (dump_file, ".%d", block->index);
3777 }
3778 fprintf (dump_file, "}");
3779 }
3780 comma = ",";
3781 }
3782 fprintf (dump_file, "\n\n");
3783 }
3784
3785 for (ix = 0; blocks.iterate (ix, &block); ix++)
3786 delete BB_GET_SESE (block);
3787}
3788
3789#undef BB_SET_SESE
3790#undef BB_GET_SESE
3791
3792/* Propagate live state at the start of a partitioned region. IS_CALL
3793 indicates whether the propagation is for a (partitioned) call
3794 instruction. BLOCK provides the live register information, and
3795 might not contain INSN. Propagation is inserted just after INSN. RW
3796 indicates whether we are reading and/or writing state. This
3797 separation is needed for worker-level propagation where we
3798 essentially do a spill & fill. FN is the underlying worker
3799 function to generate the propagation instructions for a single
3800 register. DATA is user data.
3801
3802 Returns true if we didn't emit any instructions.
3803
3804 We propagate the live register set for non-calls and the entire
3805 frame for calls and non-calls. We could do better by (a)
3806 propagating just the live set that is used within the partitioned
3807 regions and (b) only propagating stack entries that are used. The
3808 latter might be quite hard to determine. */
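/* Editorial sketch, not part of the GCC sources: for a frame of FS
   DImode words, the copy loop built below corresponds roughly to this
   PTX (register and label names are hypothetical):

	mov.u64	%ptr, %frame;
	mov.u32	%idx, FS;
    Lloop:
	add.u32	%idx, %idx, -1;
	ld.u64	%tmp, [%ptr];		// only if RW includes PM_read
	... FN emits the propagation of %tmp here ...
	st.u64	[%ptr], %tmp;		// only if RW includes PM_write
	setp.ne.u32	%pred, %idx, 0;
	add.u64	%ptr, %ptr, 8;
	@%pred	bra.uni	Lloop;  */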
d88cd9c4 3809
a0b3b5c4 3810typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
d88cd9c4 3811
3812static bool
3813nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
a0b3b5c4 3814 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3815{
3816 bitmap live = DF_LIVE_IN (block);
3817 bitmap_iterator iterator;
3818 unsigned ix;
e91eba31 3819 bool empty = true;
3820
3821 /* Copy the frame array. */
3822 HOST_WIDE_INT fs = get_frame_size ();
3823 if (fs)
3824 {
3825 rtx tmp = gen_reg_rtx (DImode);
3826 rtx idx = NULL_RTX;
3827 rtx ptr = gen_reg_rtx (Pmode);
3828 rtx pred = NULL_RTX;
3829 rtx_code_label *label = NULL;
3830
e91eba31 3831 empty = false;
3832 /* The frame size might not be DImode compatible, but the frame
3833 array's declaration will be. So it's ok to round up here. */
3834 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3835 /* Detect single iteration loop. */
3836 if (fs == 1)
3837 fs = 0;
3838
3839 start_sequence ();
3840 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3841 if (fs)
3842 {
3843 idx = gen_reg_rtx (SImode);
3844 pred = gen_reg_rtx (BImode);
3845 label = gen_label_rtx ();
3846
3847 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3848 /* Allow worker function to initialize anything needed. */
a0b3b5c4 3849 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3850 if (init)
3851 emit_insn (init);
3852 emit_label (label);
3853 LABEL_NUSES (label)++;
3854 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3855 }
3856 if (rw & PM_read)
3857 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
a0b3b5c4 3858 emit_insn (fn (tmp, rw, fs, data, vector));
3859 if (rw & PM_write)
3860 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3861 if (fs)
3862 {
3863 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3864 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3865 emit_insn (gen_br_true_uni (pred, label));
a0b3b5c4 3866 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
3867 if (fini)
3868 emit_insn (fini);
3869 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3870 }
3871 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3872 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3873 rtx cpy = get_insns ();
3874 end_sequence ();
3875 insn = emit_insn_after (cpy, insn);
3876 }
3877
3878 if (!is_call)
3879 /* Copy live registers. */
3880 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3881 {
3882 rtx reg = regno_reg_rtx[ix];
d88cd9c4 3883
3884 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3885 {
a0b3b5c4 3886 rtx bcast = fn (reg, rw, 0, data, vector);
d88cd9c4 3887
3888 insn = emit_insn_after (bcast, insn);
3889 empty = false;
3890 }
3891 }
3892 return empty;
3893}
3894
a0b3b5c4 3895/* Worker for nvptx_warp_propagate. */
3896
3897static rtx
3898warp_prop_gen (rtx reg, propagate_mask pm,
3899 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
3900 bool ARG_UNUSED (vector))
3901{
3902 if (!(pm & PM_read_write))
3903 return 0;
3904
a0b3b5c4 3905 return nvptx_gen_warp_bcast (reg);
3906}
3907
3908/* Propagate state that is live at start of BLOCK across the vectors
3909 of a single warp. Propagation is inserted just after INSN.
3910 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 3911
e91eba31 3912static bool
a0b3b5c4 3913nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 3914{
3915 return nvptx_propagate (is_call, block, insn, PM_read_write,
3916 warp_prop_gen, 0, false);
3917}
3918
a0b3b5c4 3919/* Worker for nvptx_shared_propagate. */
3920
3921static rtx
3922shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
3923 bool vector)
d88cd9c4 3924{
1ed57fb8 3925 broadcast_data_t *data = (broadcast_data_t *)data_;
3926
3927 if (pm & PM_loop_begin)
3928 {
3929 /* Starting a loop, initialize pointer. */
3930 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3931
3932 oacc_bcast_align = MAX (oacc_bcast_align, align);
3933 data->offset = ROUND_UP (data->offset, align);
3934
3935 data->ptr = gen_reg_rtx (Pmode);
3936
3937 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3938 }
3939 else if (pm & PM_loop_end)
3940 {
3941 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3942 data->ptr = NULL_RTX;
3943 return clobber;
3944 }
3945 else
a0b3b5c4 3946 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
3947}
3948
3949/* Spill or fill live state that is live at start of BLOCK. PRE_P
3950 indicates if this is just before partitioned mode (do spill), or
3951 just after it starts (do fill). Sequence is inserted just after
e91eba31 3952 INSN. IS_CALL and return as for nvptx_propagate. */
d88cd9c4 3953
e91eba31 3954static bool
3955nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
3956 rtx_insn *insn, bool vector)
d88cd9c4 3957{
1ed57fb8 3958 broadcast_data_t data;
3959
3960 data.base = gen_reg_rtx (Pmode);
3961 data.offset = 0;
3962 data.ptr = NULL_RTX;
3963
e91eba31 3964 bool empty = nvptx_propagate (is_call, block, insn,
3965 pre_p ? PM_read : PM_write, shared_prop_gen,
3966 &data, vector);
e91eba31 3967 gcc_assert (empty == !data.offset);
3968 if (data.offset)
3969 {
3970 /* Stuff was emitted, initialize the base pointer now. */
1ed57fb8 3971 rtx init = gen_rtx_SET (data.base, oacc_bcast_sym);
d88cd9c4 3972 emit_insn_after (init, insn);
15ab6f00 3973
3c55d60f 3974 oacc_bcast_size = MAX (oacc_bcast_size, data.offset);
d88cd9c4 3975 }
e91eba31 3976 return empty;
3977}
3978
3979/* Emit a worker-level synchronization barrier. We use different
3980 markers for before and after synchronizations. */
3981
3982static rtx
a0b3b5c4 3983nvptx_cta_sync (bool after)
d88cd9c4 3984{
1dcf2688 3985 return gen_nvptx_barsync (GEN_INT (after), GEN_INT (0));
3986}
3987
3988#if WORKAROUND_PTXJIT_BUG
3989/* Return first real insn in BB, or return NULL_RTX if BB does not contain
3990 real insns. */
3991
3992static rtx_insn *
3993bb_first_real_insn (basic_block bb)
3994{
3995 rtx_insn *insn;
3996
3997 /* Find the first real insn in BB. */
3998 FOR_BB_INSNS (bb, insn)
3999 if (INSN_P (insn))
4000 return insn;
4001
4002 return 0;
4003}
4004#endif
4005
4006/* Return true if INSN needs neutering. */
4007
4008static bool
4009needs_neutering_p (rtx_insn *insn)
4010{
4011 if (!INSN_P (insn))
4012 return false;
4013
4014 switch (recog_memoized (insn))
4015 {
4016 case CODE_FOR_nvptx_fork:
4017 case CODE_FOR_nvptx_forked:
4018 case CODE_FOR_nvptx_joining:
4019 case CODE_FOR_nvptx_join:
4020 case CODE_FOR_nvptx_barsync:
4021 return false;
4022 default:
4023 return true;
4024 }
4025}
4026
4027/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4028
4029static bool
4030verify_neutering_jumps (basic_block from,
4031 rtx_insn *vector_jump, rtx_insn *worker_jump,
4032 rtx_insn *vector_label, rtx_insn *worker_label)
4033{
4034 basic_block bb = from;
4035 rtx_insn *insn = BB_HEAD (bb);
4036 bool seen_worker_jump = false;
4037 bool seen_vector_jump = false;
4038 bool seen_worker_label = false;
4039 bool seen_vector_label = false;
4040 bool worker_neutered = false;
4041 bool vector_neutered = false;
4042 while (true)
4043 {
4044 if (insn == worker_jump)
4045 {
4046 seen_worker_jump = true;
4047 worker_neutered = true;
4048 gcc_assert (!vector_neutered);
4049 }
4050 else if (insn == vector_jump)
4051 {
4052 seen_vector_jump = true;
4053 vector_neutered = true;
4054 }
4055 else if (insn == worker_label)
4056 {
4057 seen_worker_label = true;
4058 gcc_assert (worker_neutered);
4059 worker_neutered = false;
4060 }
4061 else if (insn == vector_label)
4062 {
4063 seen_vector_label = true;
4064 gcc_assert (vector_neutered);
4065 vector_neutered = false;
4066 }
4067 else if (INSN_P (insn))
4068 switch (recog_memoized (insn))
4069 {
4070 case CODE_FOR_nvptx_barsync:
4071 gcc_assert (!vector_neutered && !worker_neutered);
4072 break;
4073 default:
4074 break;
4075 }
4076
4077 if (insn != BB_END (bb))
4078 insn = NEXT_INSN (insn);
4079 else if (JUMP_P (insn) && single_succ_p (bb)
4080 && !seen_vector_jump && !seen_worker_jump)
4081 {
4082 bb = single_succ (bb);
4083 insn = BB_HEAD (bb);
4084 }
4085 else
4086 break;
4087 }
4088
4089 gcc_assert (!(vector_jump && !seen_vector_jump));
4090 gcc_assert (!(worker_jump && !seen_worker_jump));
4091
4092 if (seen_vector_label || seen_worker_label)
4093 {
4094 gcc_assert (!(vector_label && !seen_vector_label));
4095 gcc_assert (!(worker_label && !seen_worker_label));
4096
4097 return true;
4098 }
4099
4100 return false;
4101}
4102
4103/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4104
4105static void
4106verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4107 rtx_insn *worker_label)
4108{
4109 basic_block bb = to;
4110 rtx_insn *insn = BB_END (bb);
4111 bool seen_worker_label = false;
4112 bool seen_vector_label = false;
4113 while (true)
4114 {
4115 if (insn == worker_label)
4116 {
4117 seen_worker_label = true;
4118 gcc_assert (!seen_vector_label);
4119 }
4120 else if (insn == vector_label)
4121 seen_vector_label = true;
4122 else if (INSN_P (insn))
4123 switch (recog_memoized (insn))
4124 {
4125 case CODE_FOR_nvptx_barsync:
4126 gcc_assert (!seen_vector_label && !seen_worker_label);
4127 break;
4128 }
4129
4130 if (insn != BB_HEAD (bb))
4131 insn = PREV_INSN (insn);
4132 else
4133 break;
4134 }
4135
4136 gcc_assert (!(vector_label && !seen_vector_label));
4137 gcc_assert (!(worker_label && !seen_worker_label));
4138}
4139
4140/* Single neutering according to MASK. FROM is the incoming block and
4141 TO is the outgoing block. These may be the same block. Insert at
4142 start of FROM:
4143
4144 if (tid.<axis>) goto end.
4145
4146 and insert before ending branch of TO (if there is such an insn):
4147
4148 end:
4149 <possibly-broadcast-cond>
4150 <branch>
4151
4152 We currently only use different FROM and TO when skipping an entire
4153 loop. We could do more if we detected superblocks. */
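/* Editorial sketch, not part of the GCC sources: when both worker and
   vector neutering are requested for one block, the inserted tests
   take roughly this PTX shape (predicate and label names are
   hypothetical):

	@%w_nonzero	bra.uni	Lw_end;	// skip if tid.y != 0
	@%v_nonzero	bra	Lv_end;	// skip if tid.x != 0
	... code now executed by a single thread ...
    Lv_end:
    Lw_end:
	... propagation of the branch condition, if any ...  */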
4154
4155static void
4156nvptx_single (unsigned mask, basic_block from, basic_block to)
4157{
4158 rtx_insn *head = BB_HEAD (from);
4159 rtx_insn *tail = BB_END (to);
4160 unsigned skip_mask = mask;
4161
4162 while (true)
4163 {
4164 /* Find the first insn of the FROM block. */
6beefdbd 4165 while (head != BB_END (from) && !needs_neutering_p (head))
4166 head = NEXT_INSN (head);
4167
4168 if (from == to)
4169 break;
4170
4171 if (!(JUMP_P (head) && single_succ_p (from)))
4172 break;
4173
4174 basic_block jump_target = single_succ (from);
4175 if (!single_pred_p (jump_target))
4176 break;
4177
4178 from = jump_target;
4179 head = BB_HEAD (from);
4180 }
4181
4182 /* Find the last insn of the TO block. */
4183 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4184 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4185 tail = PREV_INSN (tail);
4186
4187 /* Detect if tail is a branch. */
4188 rtx tail_branch = NULL_RTX;
4189 rtx cond_branch = NULL_RTX;
4190 if (tail && INSN_P (tail))
4191 {
4192 tail_branch = PATTERN (tail);
4193 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4194 tail_branch = NULL_RTX;
4195 else
4196 {
4197 cond_branch = SET_SRC (tail_branch);
4198 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4199 cond_branch = NULL_RTX;
4200 }
4201 }
4202
4203 if (tail == head)
4204 {
4205 /* If this is empty, do nothing. */
6beefdbd 4206 if (!head || !needs_neutering_p (head))
4207 return;
4208
4209 if (cond_branch)
4210 {
4211 /* If we're only doing vector single, there's no need to
4212 emit skip code because we'll not insert anything. */
4213 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4214 skip_mask = 0;
4215 }
4216 else if (tail_branch)
4217 /* Block with only unconditional branch. Nothing to do. */
4218 return;
4219 }
4220
4221 /* Insert the vector test inside the worker test. */
4222 unsigned mode;
4223 rtx_insn *before = tail;
2ba16fd2 4224 rtx_insn *neuter_start = NULL;
4225 rtx_insn *worker_label = NULL, *vector_label = NULL;
4226 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4227 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4228 if (GOMP_DIM_MASK (mode) & skip_mask)
4229 {
4230 rtx_code_label *label = gen_label_rtx ();
4231 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4232 rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4233 rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4234
4235 if (!pred)
4236 {
4237 pred = gen_reg_rtx (BImode);
4238 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4239 }
7820b298 4240
4241 rtx br;
4242 if (mode == GOMP_DIM_VECTOR)
4243 br = gen_br_true (pred, label);
4244 else
4245 br = gen_br_true_uni (pred, label);
4246 if (neuter_start)
4247 neuter_start = emit_insn_after (br, neuter_start);
4248 else
4249 neuter_start = emit_insn_before (br, head);
a874808c 4250 *mode_jump = neuter_start;
4251
4252 LABEL_NUSES (label)++;
a874808c 4253 rtx_insn *label_insn;
d88cd9c4 4254 if (tail_branch)
4255 {
4256 label_insn = emit_label_before (label, before);
4257 before = label_insn;
4258 }
d88cd9c4 4259 else
be606483 4260 {
a874808c 4261 label_insn = emit_label_after (label, tail);
4262 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4263 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4264 emit_insn_after (gen_exit (), label_insn);
4265 }
4266
4267 if (mode == GOMP_DIM_VECTOR)
4268 vector_label = label_insn;
4269 else
4270 worker_label = label_insn;
4271 }
4272
4273 /* Now deal with propagating the branch condition. */
4274 if (cond_branch)
4275 {
4276 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4277
4278 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4279 {
4280 /* Vector mode only, do a shuffle. */
4281#if WORKAROUND_PTXJIT_BUG
4282 /* The branch condition %rcond is propagated like this:
4283
4284 {
4285 .reg .u32 %x;
4286 mov.u32 %x,%tid.x;
4287 setp.ne.u32 %rnotvzero,%x,0;
4288 }
4289
4290 @%rnotvzero bra Lskip;
4291 setp.<op>.<type> %rcond,op1,op2;
4292 Lskip:
4293 selp.u32 %rcondu32,1,0,%rcond;
4294 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4295 setp.ne.u32 %rcond,%rcondu32,0;
4296
4297 There seems to be a bug in the ptx JIT compiler (observed at driver
4298 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4299 unless %rcond is initialized to something before 'bra Lskip'. The
4300 bug is not observed with ptxas from cuda 8.0.61.
4301
4302 It is true that the code is non-trivial: at Lskip, %rcond is
4303 uninitialized in threads 1-31, and after the selp the same holds
4304 for %rcondu32. But shfl propagates the defined value in thread 0
4305 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4306 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4307
4308 There is nothing in the PTX spec to suggest that this is wrong, or
4309 to explain why the extra initialization is needed. So, we classify
4310 it as a JIT bug, and the extra initialization as workaround:
4311
4312 {
4313 .reg .u32 %x;
4314 mov.u32 %x,%tid.x;
4315 setp.ne.u32 %rnotvzero,%x,0;
4316 }
4317
4318 +.reg .pred %rcond2;
4319 +setp.eq.u32 %rcond2, 1, 0;
4320
4321 @%rnotvzero bra Lskip;
4322 setp.<op>.<type> %rcond,op1,op2;
4323 +mov.pred %rcond2, %rcond;
4324 Lskip:
4325 +mov.pred %rcond, %rcond2;
4326 selp.u32 %rcondu32,1,0,%rcond;
4327 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4328 setp.ne.u32 %rcond,%rcondu32,0;
4329 */
4330 rtx_insn *label = PREV_INSN (tail);
4331 gcc_assert (label && LABEL_P (label));
4332 rtx tmp = gen_reg_rtx (BImode);
4333 emit_insn_before (gen_movbi (tmp, const0_rtx),
43c371e8 4334 bb_first_real_insn (from));
4335 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4336 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
43c371e8 4337#endif
a0b3b5c4 4338 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4339 }
4340 else
4341 {
4342 /* Includes worker mode, do spill & fill. By construction
4343 we should never have worker mode only. */
1ed57fb8 4344 broadcast_data_t data;
d88cd9c4 4345
1ed57fb8 4346 data.base = oacc_bcast_sym;
d88cd9c4
NS
4347 data.ptr = 0;
4348
3c55d60f 4349 oacc_bcast_size = MAX (oacc_bcast_size, GET_MODE_SIZE (SImode));
4350
4351 data.offset = 0;
4352 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4353 false),
4354 before);
4355 /* Barrier so other workers can see the write. */
a0b3b5c4 4356 emit_insn_before (nvptx_cta_sync (false), tail);
d88cd9c4 4357 data.offset = 0;
4358 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4359 false), tail);
4360 /* This barrier is needed to avoid worker zero clobbering
4361 the broadcast buffer before all the other workers have
4362 had a chance to read this instance of it. */
a0b3b5c4 4363 emit_insn_before (nvptx_cta_sync (false), tail);
4364 }
4365
4366 extract_insn (tail);
4367 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4368 UNSPEC_BR_UNIFIED);
4369 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4370 }
4371
4372 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4373 vector_label, worker_label);
4374 if (!seen_label)
4375 verify_neutering_labels (to, vector_label, worker_label);
4376}
4377
4378/* PAR is a parallel that is being skipped in its entirety according to
4379 MASK. Treat this as skipping a superblock starting at forked
4380 and ending at joining. */
4381
4382static void
4383nvptx_skip_par (unsigned mask, parallel *par)
4384{
4385 basic_block tail = par->join_block;
4386 gcc_assert (tail->preds->length () == 1);
4387
4388 basic_block pre_tail = (*tail->preds)[0]->src;
4389 gcc_assert (pre_tail->succs->length () == 1);
4390
4391 nvptx_single (mask, par->forked_block, pre_tail);
4392}
4393
4394/* If PAR has a single inner parallel and PAR itself only contains
4395 empty entry and exit blocks, swallow the inner PAR. */
4396
4397static void
4398nvptx_optimize_inner (parallel *par)
4399{
4400 parallel *inner = par->inner;
4401
4402 /* We mustn't be the outer dummy par. */
4403 if (!par->mask)
4404 return;
4405
4406 /* We must have a single inner par. */
4407 if (!inner || inner->next)
4408 return;
4409
4410 /* We must only contain 2 blocks ourselves -- the head and tail of
4411 the inner par. */
4412 if (par->blocks.length () != 2)
4413 return;
4414
4415 /* We must be disjoint partitioning. As we only have vector and
4416 worker partitioning, this is sufficient to guarantee the pars
4417 have adjacent partitioning. */
4418 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4419 /* This indicates malformed code generation. */
4420 return;
4421
4422 /* The outer forked insn should be immediately followed by the inner
4423 fork insn. */
4424 rtx_insn *forked = par->forked_insn;
4425 rtx_insn *fork = BB_END (par->forked_block);
4426
4427 if (NEXT_INSN (forked) != fork)
4428 return;
4429 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4430
4431 /* The outer joining insn must immediately follow the inner join
4432 insn. */
4433 rtx_insn *joining = par->joining_insn;
4434 rtx_insn *join = inner->join_insn;
4435 if (NEXT_INSN (join) != joining)
4436 return;
4437
4438 /* Preconditions met. Swallow the inner par. */
4439 if (dump_file)
4440 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4441 inner->mask, inner->forked_block->index,
4442 inner->join_block->index,
4443 par->mask, par->forked_block->index, par->join_block->index);
4444
4445 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4446
4447 par->blocks.reserve (inner->blocks.length ());
4448 while (inner->blocks.length ())
4449 par->blocks.quick_push (inner->blocks.pop ());
4450
4451 par->inner = inner->inner;
4452 inner->inner = NULL;
4453
4454 delete inner;
4455}
4456
4457/* Process the parallel PAR and all its contained
4458 parallels. We do everything but the neutering. Return mask of
4459 partitioned modes used within this parallel. */
4460
4461static unsigned
4462nvptx_process_pars (parallel *par)
4463{
4464 if (nvptx_optimize)
4465 nvptx_optimize_inner (par);
4466
4467 unsigned inner_mask = par->mask;
4468
4469 /* Do the inner parallels first. */
4470 if (par->inner)
4471 {
4472 par->inner_mask = nvptx_process_pars (par->inner);
4473 inner_mask |= par->inner_mask;
4474 }
4475
4476 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4477
4478 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4 4479 {
4480 nvptx_shared_propagate (false, is_call, par->forked_block,
4481 par->forked_insn, false);
4482 bool empty = nvptx_shared_propagate (true, is_call,
4483 par->forked_block, par->fork_insn,
4484 false);
4485
4486 if (!empty || !is_call)
4487 {
4488 /* Insert begin and end synchronizations. */
4489 emit_insn_before (nvptx_cta_sync (false), par->forked_insn);
4490 emit_insn_before (nvptx_cta_sync (false), par->join_insn);
e91eba31 4491 }
4492 }
4493 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
a0b3b5c4 4494 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4495
4496 /* Now do siblings. */
4497 if (par->next)
4498 inner_mask |= nvptx_process_pars (par->next);
4499 return inner_mask;
4500}
4501
4502/* Neuter the parallel described by PAR. We recurse in depth-first
4503 order. MODES are the partitioning of the execution and OUTER is
4504 the partitioning of the parallels we are contained in. */
4505
4506static void
4507nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4508{
4509 unsigned me = (par->mask
4510 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4511 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4512 unsigned skip_mask = 0, neuter_mask = 0;
4513
4514 if (par->inner)
4515 nvptx_neuter_pars (par->inner, modes, outer | me);
4516
4517 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4518 {
4519 if ((outer | me) & GOMP_DIM_MASK (mode))
4520 {} /* Mode is partitioned: no neutering. */
4521 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 4522 {} /* Mode is not used: nothing to do. */
4523 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4524 || !par->forked_insn)
4525 /* Partitioned in inner parallels, or we're not partitioned
4526 at all: neuter individual blocks. */
4527 neuter_mask |= GOMP_DIM_MASK (mode);
4528 else if (!par->parent || !par->parent->forked_insn
4529 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4530 /* Parent isn't a parallel, or it contains this partitioning: skip
4531 parallel at this level. */
4532 skip_mask |= GOMP_DIM_MASK (mode);
4533 else
4534 {} /* Parent will skip this parallel itself. */
4535 }
4536
4537 if (neuter_mask)
4538 {
912442c2 4539 int ix, len;
d88cd9c4 4540
4541 if (nvptx_optimize)
4542 {
4543 /* Neuter whole SESE regions. */
4544 bb_pair_vec_t regions;
4545
4546 nvptx_find_sese (par->blocks, regions);
4547 len = regions.length ();
4548 for (ix = 0; ix != len; ix++)
4549 {
4550 basic_block from = regions[ix].first;
4551 basic_block to = regions[ix].second;
4552
4553 if (from)
4554 nvptx_single (neuter_mask, from, to);
4555 else
4556 gcc_assert (!to);
4557 }
4558 }
4559 else
d88cd9c4 4560 {
4561 /* Neuter each BB individually. */
4562 len = par->blocks.length ();
4563 for (ix = 0; ix != len; ix++)
4564 {
4565 basic_block block = par->blocks[ix];
d88cd9c4 4566
4567 nvptx_single (neuter_mask, block, block);
4568 }
4569 }
4570 }
4571
4572 if (skip_mask)
7820b298 4573 nvptx_skip_par (skip_mask, par);
4574
4575 if (par->next)
4576 nvptx_neuter_pars (par->next, modes, outer);
4577}
4578
4579#if WORKAROUND_PTXJIT_BUG_2
4580/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4581 is needed in the nvptx target because the branches generated for
4582 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4583
4584static rtx
4585nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4586{
4587 rtx pat;
4588 if ((strict && !JUMP_P (insn))
4589 || (!strict && !INSN_P (insn)))
4590 return NULL_RTX;
4591 pat = PATTERN (insn);
4592
4593 /* The set is allowed to appear either as the insn pattern or
4594 the first set in a PARALLEL. */
4595 if (GET_CODE (pat) == PARALLEL)
4596 pat = XVECEXP (pat, 0, 0);
4597 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4598 return pat;
4599
4600 return NULL_RTX;
4601}
4602
4603/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4604
4605static rtx
4606nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4607{
4608 rtx x = nvptx_pc_set (insn, strict);
4609
4610 if (!x)
4611 return NULL_RTX;
4612 x = SET_SRC (x);
4613 if (GET_CODE (x) == LABEL_REF)
4614 return x;
4615 if (GET_CODE (x) != IF_THEN_ELSE)
4616 return NULL_RTX;
4617 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4618 return XEXP (x, 1);
4619 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4620 return XEXP (x, 2);
4621 return NULL_RTX;
4622}
4623
4624/* Insert a dummy PTX insn when encountering a branch to a label with no PTX
4625 insn in between the branch and the label. This works around a JIT bug
4626 observed at driver version 384.111, at -O0 for sm_50. */
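/* Editorial sketch, not part of the GCC sources (hypothetical names):
   the workaround turns

	@%p	bra	Lnext;			@%p	bra	Lnext;
    Lnext:				into	<fake nop>
	...				    Lnext:
						...

   so that at least one PTX insn separates the branch from its target
   label.  */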
4627
4628static void
4629prevent_branch_around_nothing (void)
4630{
4631 rtx_insn *seen_label = NULL;
4632 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4633 {
b0b592fc 4634 if (INSN_P (insn) && condjump_p (insn))
3dede32b 4635 {
b0b592fc 4636 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4637 continue;
4638 }
4639
4640 if (seen_label == NULL)
4641 continue;
4642
4643 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4644 continue;
4645
4646 if (INSN_P (insn))
4647 switch (recog_memoized (insn))
4648 {
4649 case CODE_FOR_nvptx_fork:
4650 case CODE_FOR_nvptx_forked:
4651 case CODE_FOR_nvptx_joining:
4652 case CODE_FOR_nvptx_join:
4653 continue;
4654 default:
4655 seen_label = NULL;
4656 continue;
4657 }
4658
4659 if (LABEL_P (insn) && insn == seen_label)
4660 emit_insn_before (gen_fake_nop (), insn);
4661
4662 seen_label = NULL;
4663 }
4664 }
4665#endif
4666
4667#ifdef WORKAROUND_PTXJIT_BUG_3
4668/* Insert two membar.cta insns in between two subsequent bar.sync insns. This
4669 works around a hang observed at driver version 390.48 for sm_50. */
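/* Editorial sketch, not part of the GCC sources: two back-to-back
   barriers become, roughly,

	bar.sync	0;
	membar.cta;
	membar.cta;
	bar.sync	0;  */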
4670
4671static void
4672workaround_barsyncs (void)
4673{
4674 bool seen_barsync = false;
4675 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4676 {
4677 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4678 {
4679 if (seen_barsync)
4680 {
4681 emit_insn_before (gen_nvptx_membar_cta (), insn);
4682 emit_insn_before (gen_nvptx_membar_cta (), insn);
4683 }
4684
4685 seen_barsync = true;
4686 continue;
4687 }
4688
4689 if (!seen_barsync)
4690 continue;
4691
4692 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4693 continue;
4694 else if (INSN_P (insn))
4695 switch (recog_memoized (insn))
4696 {
4697 case CODE_FOR_nvptx_fork:
4698 case CODE_FOR_nvptx_forked:
4699 case CODE_FOR_nvptx_joining:
4700 case CODE_FOR_nvptx_join:
4701 continue;
4702 default:
4703 break;
4704 }
4705
4706 seen_barsync = false;
4707 }
4708}
4709#endif
4710
517665b3 4711/* PTX-specific reorganization
d88cd9c4 4712 - Split blocks at fork and join instructions
4713 - Compute live registers
4714 - Mark now-unused registers, so function begin doesn't declare
517665b3 4715 unused registers.
4716 - Insert state propagation when entering partitioned mode
4717 - Insert neutering instructions when in single mode
c38f0d8c 4718 - Replace subregs with suitable sequences.
4719*/
4720
4721static void
4722nvptx_reorg (void)
4723{
4724 /* We are freeing block_for_insn in the toplev to keep compatibility
4725 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4726 compute_bb_for_insn ();
4727
4728 thread_prologue_and_epilogue_insns ();
4729
4730 /* Split blocks and record interesting unspecs. */
4731 bb_insn_map_t bb_insn_map;
4732
4733 nvptx_split_blocks (&bb_insn_map);
4734
c38f0d8c 4735 /* Compute live regs. */
4736 df_clear_flags (DF_LR_RUN_DCE);
4737 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4738 df_live_add_problem ();
4739 df_live_set_all_dirty ();
517665b3 4740 df_analyze ();
4741 regstat_init_n_sets_and_refs ();
4742
4743 if (dump_file)
4744 df_dump (dump_file);
4745
517665b3 4746 /* Mark unused regs as unused. */
d88cd9c4 4747 int max_regs = max_reg_num ();
44c068ae 4748 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4749 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4750 regno_reg_rtx[i] = const0_rtx;
517665b3 4751
4752 /* Determine launch dimensions of the function. If it is not an
4753 offloaded function (i.e. this is a regular compiler), the
4754 function has no neutering. */
629b3d75 4755 tree attr = oacc_get_fn_attrib (current_function_decl);
4756 if (attr)
4757 {
4758 /* If we determined this mask before RTL expansion, we could
4759 elide emission of some levels of forks and joins. */
4760 unsigned mask = 0;
4761 tree dims = TREE_VALUE (attr);
4762 unsigned ix;
4763
4764 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4765 {
4766 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4767 tree allowed = TREE_PURPOSE (dims);
4768
4769 if (size != 1 && !(allowed && integer_zerop (allowed)))
4770 mask |= GOMP_DIM_MASK (ix);
4771 }
4772 /* If there is worker neutering, there must be vector
4773 neutering. Otherwise the hardware will fail. */
4774 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4775 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4776
4777 /* Discover & process partitioned regions. */
4778 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4779 nvptx_process_pars (pars);
4780 nvptx_neuter_pars (pars, mask, 0);
4781 delete pars;
4782 }
4783
517665b3 4784 /* Replace subregs. */
c03b0416 4785 nvptx_reorg_subreg ();
517665b3 4786
4787 if (TARGET_UNIFORM_SIMT)
4788 nvptx_reorg_uniform_simt ();
4789
4790#if WORKAROUND_PTXJIT_BUG_2
4791 prevent_branch_around_nothing ();
4792#endif
4793
4794#ifdef WORKAROUND_PTXJIT_BUG_3
4795 workaround_barsyncs ();
4796#endif
4797
738f2522 4798 regstat_free_n_sets_and_refs ();
4799
4800 df_finish_pass (true);
4801}
4802\f
4803/* Handle a "kernel" attribute; arguments as in
4804 struct attribute_spec.handler. */
4805
4806static tree
4807nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4808 int ARG_UNUSED (flags), bool *no_add_attrs)
4809{
4810 tree decl = *node;
4811
4812 if (TREE_CODE (decl) != FUNCTION_DECL)
4813 {
4814 error ("%qE attribute only applies to functions", name);
4815 *no_add_attrs = true;
4816 }
b49e35a9 4817 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4818 {
4819 error ("%qE attribute requires a void return type", name);
4820 *no_add_attrs = true;
4821 }
4822
4823 return NULL_TREE;
4824}
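/* Illustrative usage, editorial only (hypothetical function name):
   the handler above accepts

	void __attribute__((kernel)) entry (int *p);

   and rejects the attribute on non-functions and on functions whose
   return type is not void.  */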
4825
4826/* Handle a "shared" attribute; arguments as in
4827 struct attribute_spec.handler. */
4828
4829static tree
4830nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4831 int ARG_UNUSED (flags), bool *no_add_attrs)
4832{
4833 tree decl = *node;
4834
4835 if (TREE_CODE (decl) != VAR_DECL)
4836 {
4837 error ("%qE attribute only applies to variables", name);
4838 *no_add_attrs = true;
4839 }
4840 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4841 {
4842 error ("%qE attribute not allowed with auto storage class", name);
4843 *no_add_attrs = true;
4844 }
4845
4846 return NULL_TREE;
4847}
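/* Illustrative usage, editorial only (hypothetical variable name):
   the handler above accepts

	static int scratch[32] __attribute__((shared));

   i.e. a variable with static (or public) storage, and rejects the
   attribute on variables with auto storage class.  */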
4848
4849/* Table of valid machine attributes. */
4850static const struct attribute_spec nvptx_attribute_table[] =
4851{
4852 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4853 affects_type_identity, handler, exclude } */
4854 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5d9ae53d 4855 NULL },
4849deb1 4856 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5d9ae53d 4857 NULL },
4849deb1 4858 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4859};
4860\f
4861/* Limit vector alignments to BIGGEST_ALIGNMENT. */
4862
4863static HOST_WIDE_INT
4864nvptx_vector_alignment (const_tree type)
4865{
4866 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4867
4868 return MIN (align, BIGGEST_ALIGNMENT);
4869}
4870
4871/* Indicate that INSN cannot be duplicated. */
4872
4873static bool
4874nvptx_cannot_copy_insn_p (rtx_insn *insn)
4875{
4876 switch (recog_memoized (insn))
4877 {
4878 case CODE_FOR_nvptx_shufflesi:
4879 case CODE_FOR_nvptx_shufflesf:
4880 case CODE_FOR_nvptx_barsync:
4881 case CODE_FOR_nvptx_fork:
4882 case CODE_FOR_nvptx_forked:
4883 case CODE_FOR_nvptx_joining:
4884 case CODE_FOR_nvptx_join:
4885 return true;
4886 default:
4887 return false;
4888 }
4889}
4890
4891/* Section anchors do not work. Initialization for flag_section_anchor
4892 probes the existence of the anchoring target hooks and prevents
4893 anchoring if they don't exist. However, we may be being used with
4894 a host-side compiler that does support anchoring, and hence see
4895 the anchor flag set (as it's not recalculated). So provide an
4896 implementation denying anchoring. */
4897
4898static bool
4899nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4900{
4901 return false;
4902}
738f2522 4903\f
4904/* Record a symbol for mkoffload to enter into the mapping table. */
4905
4906static void
4907nvptx_record_offload_symbol (tree decl)
4908{
4909 switch (TREE_CODE (decl))
4910 {
4911 case VAR_DECL:
4912 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4913 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4914 break;
4915
4916 case FUNCTION_DECL:
4917 {
629b3d75 4918 tree attr = oacc_get_fn_attrib (decl);
4919 /* OpenMP offloading does not set this attribute. */
4920 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
3e32ee19 4921
4922 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4923 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4924
5012919d 4925 for (; dims; dims = TREE_CHAIN (dims))
3e32ee19 4926 {
5d306e55 4927 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3e32ee19 4928
5d306e55 4929 gcc_assert (!TREE_PURPOSE (dims));
4930 fprintf (asm_out_file, ", %#x", size);
4931 }
d2d47a28 4932
4933 fprintf (asm_out_file, "\n");
4934 }
4935 break;
d2d47a28 4936
4937 default:
4938 gcc_unreachable ();
4939 }
4940}
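/* Editorial illustration (hypothetical symbol names): given the
   fprintf formats above, mkoffload sees entries such as

	//:VAR_MAP "var_sym"
	//:FUNC_MAP "fn_sym", 0x20, 0x1, 0x20

   where the hex values are the function's launch dimensions, one per
   GOMP_DIM axis.  */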
4941
4942/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4943 at the start of a file. */
4944
4945static void
4946nvptx_file_start (void)
4947{
4948 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4949 fputs ("\t.version\t3.1\n", asm_out_file);
4950 if (TARGET_SM35)
4951 fputs ("\t.target\tsm_35\n", asm_out_file);
4952 else
4953 fputs ("\t.target\tsm_30\n", asm_out_file);
4954 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4955 fputs ("// END PREAMBLE\n", asm_out_file);
4956}
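/* Editorial illustration: for a 64-bit, non-sm_35 configuration the
   preamble written above reads

	// BEGIN PREAMBLE
		.version	3.1
		.target	sm_30
		.address_size 64
	// END PREAMBLE  */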
4957
4958/* Emit a declaration for a worker and vector-level buffer in .shared
4959 memory. */
4960
4961static void
a0b3b5c4 4962write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4963{
4964 const char *name = XSTR (sym, 0);
4965
4966 write_var_marker (file, true, false, name);
4967 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4968 align, name, size);
4969}
4970
4971/* Write out the function declarations we've collected and declare storage
4972 for the broadcast buffer. */
4973
4974static void
4975nvptx_file_end (void)
4976{
4977 hash_table<tree_hasher>::iterator iter;
4978 tree decl;
4979 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
00e52418 4980 nvptx_record_fndecl (decl);
738f2522 4981 fputs (func_decls.str().c_str(), asm_out_file);
d88cd9c4 4982
1ed57fb8 4983 if (oacc_bcast_size)
a0b3b5c4 4984 write_shared_buffer (asm_out_file, oacc_bcast_sym,
1ed57fb8 4985 oacc_bcast_align, oacc_bcast_size);
4986
4987 if (worker_red_size)
a0b3b5c4 4988 write_shared_buffer (asm_out_file, worker_red_sym,
15ab6f00 4989 worker_red_align, worker_red_size);
4990
4991 if (need_softstack_decl)
4992 {
4993 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4994 /* 32 is the maximum number of warps in a block. Even though it's an
4995 external declaration, emit the array size explicitly; otherwise, it
4996 may fail at PTX JIT time if the definition is later in link order. */
4997 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4998 POINTER_SIZE);
4999 }
5000 if (need_unisimt_decl)
5001 {
5002 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5003 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5004 }
5005}
5006
5007/* Expander for the shuffle builtins. */
5008
5009static rtx
5010nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5011{
5012 if (ignore)
5013 return target;
5014
5015 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5016 NULL_RTX, mode, EXPAND_NORMAL);
5017 if (!REG_P (src))
5018 src = copy_to_mode_reg (mode, src);
5019
5020 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5021 NULL_RTX, SImode, EXPAND_NORMAL);
5022 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5023 NULL_RTX, SImode, EXPAND_NORMAL);
5024
5025 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5026 idx = copy_to_mode_reg (SImode, idx);
5027
5028 rtx pat = nvptx_gen_shuffle (target, src, idx,
5029 (nvptx_shuffle_kind) INTVAL (op));
5030 if (pat)
5031 emit_insn (pat);
5032
5033 return target;
5034}
5035
5036/* Worker reduction address expander. */
5037
5038static rtx
a0b3b5c4 5039nvptx_expand_shared_addr (tree exp, rtx target,
5040 machine_mode ARG_UNUSED (mode), int ignore)
5041{
5042 if (ignore)
5043 return target;
5044
5045 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
3c55d60f 5046 worker_red_align = MAX (worker_red_align, align);
f3552158
NS
5047
5048 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5049 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
3c55d60f 5050 worker_red_size = MAX (worker_red_size, size + offset);
f3552158 5051
9a863523 5052 rtx addr = worker_red_sym;
f3552158 5053 if (offset)
5054 {
5055 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5056 addr = gen_rtx_CONST (Pmode, addr);
5057 }
f3552158 5058
9a863523 5059 emit_move_insn (target, addr);
5060
5061 return target;
5062}
5063
5064/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5065 not require taking the address of any object, other than the memory
5066 cell being operated on. */
5067
5068static rtx
5069nvptx_expand_cmp_swap (tree exp, rtx target,
5070 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5071{
5072 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5073
5074 if (!target)
5075 target = gen_reg_rtx (mode);
5076
5077 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5078 NULL_RTX, Pmode, EXPAND_NORMAL);
5079 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5080 NULL_RTX, mode, EXPAND_NORMAL);
5081 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5082 NULL_RTX, mode, EXPAND_NORMAL);
5083 rtx pat;
5084
5085 mem = gen_rtx_MEM (mode, mem);
5086 if (!REG_P (cmp))
5087 cmp = copy_to_mode_reg (mode, cmp);
5088 if (!REG_P (src))
5089 src = copy_to_mode_reg (mode, src);
5090
5091 if (mode == SImode)
5092 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5093 else
5094 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5095
5096 emit_insn (pat);
5097
5098 return target;
5099}
5100
5101
5102/* Codes for all the NVPTX builtins. */
5103enum nvptx_builtins
5104{
5105 NVPTX_BUILTIN_SHUFFLE,
5106 NVPTX_BUILTIN_SHUFFLELL,
5107 NVPTX_BUILTIN_WORKER_ADDR,
5108 NVPTX_BUILTIN_CMP_SWAP,
5109 NVPTX_BUILTIN_CMP_SWAPLL,
5110 NVPTX_BUILTIN_MAX
5111};
5112
5113static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5114
5115/* Return the NVPTX builtin for CODE. */
5116
5117static tree
5118nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5119{
5120 if (code >= NVPTX_BUILTIN_MAX)
5121 return error_mark_node;
5122
5123 return nvptx_builtin_decls[code];
5124}
5125
5126/* Set up all builtin functions for this target. */
5127
5128static void
5129nvptx_init_builtins (void)
5130{
5131#define DEF(ID, NAME, T) \
5132 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5133 = add_builtin_function ("__builtin_nvptx_" NAME, \
5134 build_function_type_list T, \
5135 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5136#define ST sizetype
5137#define UINT unsigned_type_node
5138#define LLUINT long_long_unsigned_type_node
5139#define PTRVOID ptr_type_node
5140
5141 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5142 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5143 DEF (WORKER_ADDR, "worker_addr",
5144 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5145 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5146 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5147
5148#undef DEF
5149#undef ST
5150#undef UINT
5151#undef LLUINT
5152#undef PTRVOID
5153}
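/* Editorial illustration (hypothetical variables): with the
   declarations above, code can invoke the builtins as, e.g.,

	unsigned v = __builtin_nvptx_shuffle (x, shift, SHUFFLE_DOWN);
	unsigned o = __builtin_nvptx_cmp_swap (p, expected, desired);

   where SHUFFLE_DOWN is an nvptx_shuffle_kind value and p points to
   the memory cell being operated on.  */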
5154
5155/* Expand an expression EXP that calls a built-in function,
5156 with result going to TARGET if that's convenient
5157 (and in mode MODE if that's convenient).
5158 SUBTARGET may be used as the target for computing one of EXP's operands.
5159 IGNORE is nonzero if the value is to be ignored. */
5160
5161static rtx
5162nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5163 machine_mode mode, int ignore)
5164{
5165 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5166 switch (DECL_FUNCTION_CODE (fndecl))
5167 {
5168 case NVPTX_BUILTIN_SHUFFLE:
5169 case NVPTX_BUILTIN_SHUFFLELL:
5170 return nvptx_expand_shuffle (exp, target, mode, ignore);
5171
5172 case NVPTX_BUILTIN_WORKER_ADDR:
a0b3b5c4 5173 return nvptx_expand_shared_addr (exp, target, mode, ignore);
5174
5175 case NVPTX_BUILTIN_CMP_SWAP:
5176 case NVPTX_BUILTIN_CMP_SWAPLL:
5177 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5178
5179 default: gcc_unreachable ();
5180 }
738f2522 5181}
f3552158 5182
5183/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5184
5185static int
5186nvptx_simt_vf ()
5187{
5d17a476 5188 return PTX_WARP_SIZE;
5189}
5190
5191/* Validate compute dimensions of an OpenACC offload or routine, fill
5192 in non-unity defaults. FN_LEVEL indicates the level at which a
5193 routine might spawn a loop. It is negative for non-routines. If
5194 DECL is null, we are validating the default dimensions. */
5195
5196static bool
5d306e55 5197nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
5198{
5199 bool changed = false;
5200 bool oacc_default_dims_p = false;
5201 bool oacc_min_dims_p = false;
5202 bool offload_region_p = false;
5203 bool routine_p = false;
5204 bool routine_seq_p = false;
5205
5206 if (decl == NULL_TREE)
5207 {
5208 if (fn_level == -1)
5209 oacc_default_dims_p = true;
5210 else if (fn_level == -2)
5211 oacc_min_dims_p = true;
5212 else
5213 gcc_unreachable ();
5214 }
5215 else if (fn_level == -1)
5216 offload_region_p = true;
5217 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5218 {
5219 routine_p = true;
5220 routine_seq_p = fn_level == GOMP_DIM_MAX;
5221 }
5222 else
5223 gcc_unreachable ();
94829f87 5224
5225 if (routine_p)
5226 {
5227 /* OpenACC routines in C arrive here with the following attributes
5228 (omitting the 'omp declare target'):
5229 seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
5230 vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
5231 worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
5232 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5233
5235 If we take, for instance, the oacc function attribute of the worker routine
5235 (0 1, 1 0, 1 0), then:
5236 - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
5237 meaning: worker routine, that is:
5238 - can't contain gang loop (0),
5239 - can contain worker loop (1),
5240 - can contain vector loop (1).
5241 - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
5242 dimensions: gang: 1, worker: 0, vector: 0.
5243
5244 OTOH, routines in Fortran arrive here with these attributes:
5245 seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
5246 vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
5247 worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
5248 gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
5249 that is, the same as for C but with the dimensions set to 0.
5250
5251 This is due to a bug in the Fortran front-end: PR72741. Work around
5252 this bug by forcing the dimensions to be the same in Fortran as for C,
5253 to be able to handle C and Fortran routines uniformly in this
5254 function. */
5255 dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
5256 dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
5257 dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
5258 changed = true;
5259 }
5260
ccc8282b 5261 /* The vector size must be 32, unless this is a SEQ routine. */
5262 if ((offload_region_p || oacc_default_dims_p
5263 || (routine_p && !routine_seq_p))
b6adbb9f 5264 && dims[GOMP_DIM_VECTOR] >= 0
5265 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5266 {
5267 if ((offload_region_p || oacc_default_dims_p)
5268 && dims[GOMP_DIM_VECTOR] >= 0)
b6adbb9f 5269 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
ccc8282b 5270 dims[GOMP_DIM_VECTOR]
5271 ? G_("using vector_length (%d), ignoring %d")
5272 : G_("using vector_length (%d), ignoring runtime setting"),
5273 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5274 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5275 changed = true;
5276 }
5277
5278 /* Check the num workers is not too large. */
5279 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5280 {
b6adbb9f 5281 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5282 "using num_workers (%d), ignoring %d",
5283 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5284 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5285 changed = true;
5286 }
94829f87 5287
ec6c865c 5288 if (oacc_default_dims_p || oacc_min_dims_p)
5289 {
5290 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5291 if (dims[GOMP_DIM_WORKER] < 0)
605219e7 5292 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f 5293 if (dims[GOMP_DIM_GANG] < 0)
605219e7 5294 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
b6adbb9f
NS
5295 changed = true;
5296 }
5297
94829f87
NS
5298 return changed;
5299}

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
                       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}
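
/* Note: gang partitioning needs no fork/join instrumentation here: gangs
   map onto independent CTAs, so only worker and vector partitioning are
   implemented by forking and neutering threads within a CTA.  */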

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_shared_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
                              GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
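
/* Note: for a 'double' entry at byte OFFSET, the call built above is
   (roughly) the tree equivalent of
     (double *) <NVPTX_BUILTIN_WORKER_ADDR> (OFFSET, 8, 8),
   i.e. the offset, size and alignment, all in bytes.  */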

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
                               tree dest_var, tree var, unsigned shift,
                               gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
                                        TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}
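
/* Note: this creates a single program-wide 'volatile unsigned int
   __reduction_lock' object; every lockfull update below serializes on
   that one variable.  */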

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval (OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
     return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
                                        ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

     while (cmp&swap (&lock_var, 0, 1))
       continue;
     T accum = *ptr;
     accum = accum OP var;
     *ptr = accum;
     cmp&swap (&lock_var, 1, 0);
     return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
                                   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
                                     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
                        tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
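
/* Note: with the size test above, a 32-bit 'float' or 64-bit 'double'
   reduction takes the lockless compare-and-swap path, whereas e.g. a
   128-bit '_Complex double' falls back to the global-lock path.  */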

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
                                     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector lanes other than lane zero to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
                                                     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
                                             NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
        {
          /* If there's no receiver object, propagate the incoming VAR.  */
          tree ref_to_res = gimple_call_arg (call, 1);
          if (integer_zerop (ref_to_res))
            init = var;
        }

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
         but that requires a method of emitting a unified jump at the
         gimple level.  */
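      /* For the 32-lane vector, the loop below emits shuffles with shifts
         of 16, 8, 4, 2 and 1, combining a pair of lanes at each step
         until lane 0 holds the complete reduction.  */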
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
        {
          tree other_var = make_ssa_name (TREE_TYPE (var));
          nvptx_generate_vector_shuffle (gimple_location (call),
                                         other_var, var, shfl, &seq);

          r = make_ssa_name (TREE_TYPE (var));
          gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
                                           var, other_var), &seq);
          var = r;
        }
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
        {
          /* Get reduction buffer address.  */
          tree offset = gimple_call_arg (call, 5);
          tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
          tree ptr = make_ssa_name (TREE_TYPE (call));

          gimplify_assign (ptr, call, &seq);
          accum = ptr;
        }
      else if (integer_zerop (ref_to_res))
        r = var;
      else
        accum = ref_to_res;

      if (accum)
        {
          /* UPDATE the accumulator.  */
          gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
          seq = NULL;
          r = nvptx_reduction_update (gimple_location (call), &gsi,
                                      accum, var, op);
        }
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

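/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  PTX does not have a constant
   pool, so constants are never forced into memory.  */
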
static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
                              rtx x ATTRIBUTE_UNUSED)
{
  return true;
}

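/* Implement TARGET_VECTOR_MODE_SUPPORTED_P: V2SImode and V2DImode are the
   only vector modes supported.  */
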
static bool
nvptx_vector_mode_supported (machine_mode mode)
{
  return (mode == V2SImode
          || mode == V2DImode);
}

/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}

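/* Adjust the alignment of TYPE from BASIC_ALIGN.  TImode-sized integers
   are accessed as two DImode halves, so it suffices to align them to
   what the split mode requires rather than to the full 128 bits.  */
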
unsigned int
nvptx_data_alignment (const_tree type, unsigned int basic_align)
{
  if (TREE_CODE (type) == INTEGER_TYPE)
    {
      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
      if (size == GET_MODE_SIZE (TImode))
        return GET_MODE_BITSIZE (maybe_split_mode (TImode));
    }

  return basic_align;
}

/* Implement TARGET_MODES_TIEABLE_P.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}

static GTY(()) tree nvptx_previous_fndecl;

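/* Implement TARGET_SET_CURRENT_FUNCTION.  Cache FNDECL so that switching
   to the same function again is cheap.  */
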
static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  nvptx_previous_fndecl = fndecl;
}

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"