]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.cc
Change references of .c files to .cc files
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.cc
CommitLineData
738f2522 1/* Target code for NVPTX.
7adcbafe 2 Copyright (C) 2014-2022 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
738f2522 23#include "config.h"
3a4d1cb1 24#include <sstream>
738f2522
BS
25#include "system.h"
26#include "coretypes.h"
c7131fb2 27#include "backend.h"
e11c4407 28#include "target.h"
738f2522 29#include "rtl.h"
e11c4407
AM
30#include "tree.h"
31#include "cfghooks.h"
c7131fb2 32#include "df.h"
4d0cdd0c 33#include "memmodel.h"
e11c4407
AM
34#include "tm_p.h"
35#include "expmed.h"
36#include "optabs.h"
37#include "regs.h"
38#include "emit-rtl.h"
39#include "recog.h"
40#include "diagnostic.h"
40e23961 41#include "alias.h"
738f2522
BS
42#include "insn-flags.h"
43#include "output.h"
44#include "insn-attr.h"
36566b39 45#include "flags.h"
36566b39
PK
46#include "dojump.h"
47#include "explow.h"
48#include "calls.h"
36566b39
PK
49#include "varasm.h"
50#include "stmt.h"
738f2522 51#include "expr.h"
738f2522
BS
52#include "tm-preds.h"
53#include "tm-constrs.h"
738f2522
BS
54#include "langhooks.h"
55#include "dbxout.h"
738f2522 56#include "cfgrtl.h"
d88cd9c4 57#include "gimple.h"
738f2522 58#include "stor-layout.h"
738f2522 59#include "builtins.h"
629b3d75 60#include "omp-general.h"
3e32ee19 61#include "omp-low.h"
6e723923 62#include "omp-offload.h"
3e32ee19 63#include "gomp-constants.h"
d88cd9c4 64#include "dumpfile.h"
f3552158
NS
65#include "internal-fn.h"
66#include "gimple-iterator.h"
67#include "stringpool.h"
314e6352 68#include "attribs.h"
f2d4d839 69#include "tree-vrp.h"
f3552158
NS
70#include "tree-ssa-operands.h"
71#include "tree-ssanames.h"
72#include "gimplify.h"
73#include "tree-phinodes.h"
74#include "cfgloop.h"
75#include "fold-const.h"
50c620ca 76#include "intl.h"
5874d156 77#include "opts.h"
29a2f518 78#include "tree-pretty-print.h"
738f2522 79
994c5d85 80/* This file should be included last. */
d58627a0
RS
81#include "target-def.h"
82
43c371e8 83#define WORKAROUND_PTXJIT_BUG 1
3dede32b 84#define WORKAROUND_PTXJIT_BUG_2 1
21251395 85#define WORKAROUND_PTXJIT_BUG_3 1
43c371e8 86
69b09a58
TV
87/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
88 block, which has had a maximum number of threads of 1024 since CUDA version
89 2.x. */
90#define PTX_CTA_SIZE 1024
91
052aaace 92#define PTX_CTA_NUM_BARRIERS 16
5d17a476 93#define PTX_WARP_SIZE 32
69b09a58 94
052aaace
TV
95#define PTX_PER_CTA_BARRIER 0
96#define PTX_NUM_PER_CTA_BARRIERS 1
97#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
98#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
99
764ecad4 100#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
2b9d9e39 101#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
693ad66b
TV
102#define PTX_WORKER_LENGTH 32
103#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
5d17a476 104
9a863523
NS
105/* The various PTX memory areas an object might reside in. */
106enum nvptx_data_area
107{
108 DATA_AREA_GENERIC,
109 DATA_AREA_GLOBAL,
110 DATA_AREA_SHARED,
111 DATA_AREA_LOCAL,
112 DATA_AREA_CONST,
113 DATA_AREA_PARAM,
114 DATA_AREA_MAX
115};
116
117/* We record the data area in the target symbol flags. */
118#define SYMBOL_DATA_AREA(SYM) \
119 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
120 & 7)
121#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
122 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
123
738f2522
BS
124/* Record the function decls we've written, and the libfuncs and function
125 decls corresponding to them. */
126static std::stringstream func_decls;
f3dba894 127
6c907cff 128struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
129{
130 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
131 static bool equal (rtx a, rtx b) { return a == b; }
132};
133
134static GTY((cache))
135 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
136
6c907cff 137struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
138{
139 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
140 static bool equal (tree a, tree b) { return a == b; }
141};
142
143static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
144static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 145
1ed57fb8
TV
146/* Buffer needed to broadcast across workers and vectors. This is
147 used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
149 shared by all functions emitted. The buffer is placed in shared
150 memory. It'd be nice if PTX supported common blocks, because then
151 this could be shared across TUs (taking the largest size). */
152static unsigned oacc_bcast_size;
0024c320 153static unsigned oacc_bcast_partition;
1ed57fb8
TV
154static unsigned oacc_bcast_align;
155static GTY(()) rtx oacc_bcast_sym;
d88cd9c4 156
f3552158
NS
157/* Buffer needed for worker reductions. This has to be distinct from
158 the worker broadcast array, as both may be live concurrently. */
159static unsigned worker_red_size;
160static unsigned worker_red_align;
f3552158
NS
161static GTY(()) rtx worker_red_sym;
162
f881693c
TV
163/* Buffer needed for vector reductions, when vector_length >
164 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
165 array, as both may be live concurrently. */
166static unsigned vector_red_size;
167static unsigned vector_red_align;
168static unsigned vector_red_partition;
169static GTY(()) rtx vector_red_sym;
170
29a2f518
JB
171/* Shared memory block for gang-private variables. */
172static unsigned gang_private_shared_size;
173static unsigned gang_private_shared_align;
174static GTY(()) rtx gang_private_shared_sym;
175static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;
176
33f47f42
NS
177/* Global lock variable, needed for 128bit worker & gang reductions. */
178static GTY(()) tree global_lock_var;
179
5012919d
AM
180/* True if any function references __nvptx_stacks. */
181static bool need_softstack_decl;
182
183/* True if any function references __nvptx_uni. */
184static bool need_unisimt_decl;
185
0024c320
TV
186static int nvptx_mach_max_workers ();
187
738f2522
BS
188/* Allocate a new, cleared machine_function structure. */
189
190static struct machine_function *
191nvptx_init_machine_status (void)
192{
193 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
44c068ae 194 p->return_mode = VOIDmode;
738f2522
BS
195 return p;
196}
197
5012919d
AM
198/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
199 and -fopenacc is also enabled. */
200
201static void
202diagnose_openacc_conflict (bool optval, const char *optname)
203{
204 if (flag_openacc && optval)
a3f9f006 205 error ("option %s is not supported together with %<-fopenacc%>", optname);
5012919d
AM
206}
207
738f2522
BS
/* Implement TARGET_OPTION_OVERRIDE.  Called after option processing
   to adjust target-dependent flags and initialize per-compilation
   backend state (hash tables and shared-memory buffer symbols).  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables. */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  /* Statement frontiers are unconditionally disabled for this target.  */
  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced. */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  /* If not set explicitly (negative), follow the host -O level.  */
  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  /* Symbols for the shared-memory broadcast/reduction buffers; all are
     placed in the PTX .shared data area.  */
  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  /* These options are incompatible with OpenACC.  */
  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  /* -mgomp implies soft stacks and uniform SIMT.  */
  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
272
738f2522
BS
273/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
274 deal with ptx ideosyncracies. */
275
276const char *
277nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
278{
279 switch (mode)
280 {
4e10a5a7 281 case E_BLKmode:
738f2522 282 return ".b8";
4e10a5a7 283 case E_BImode:
738f2522 284 return ".pred";
4e10a5a7 285 case E_QImode:
738f2522
BS
286 if (promote)
287 return ".u32";
288 else
289 return ".u8";
4e10a5a7 290 case E_HImode:
738f2522 291 return ".u16";
4e10a5a7 292 case E_SImode:
738f2522 293 return ".u32";
4e10a5a7 294 case E_DImode:
738f2522
BS
295 return ".u64";
296
aeedb00a
RS
297 case E_HFmode:
298 return ".f16";
4e10a5a7 299 case E_SFmode:
738f2522 300 return ".f32";
4e10a5a7 301 case E_DFmode:
738f2522
BS
302 return ".f64";
303
4e10a5a7 304 case E_V2SImode:
b98b34b7 305 return ".v2.u32";
4e10a5a7 306 case E_V2DImode:
3717fbe3 307 return ".v2.u64";
b98b34b7 308
738f2522
BS
309 default:
310 gcc_unreachable ();
311 }
312}
313
9a863523
NS
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  The chosen area is recorded in the
   SYMBOL_REF's flags via SET_SYMBOL_DATA_AREA, for later retrieval by
   SYMBOL_DATA_AREA.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      /* PTX .shared memory cannot be statically initialized.  */
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
343
344/* Return the PTX name of the data area in which SYM should be
345 placed. The symbol must have already been processed by
346 nvptx_encode_seciton_info, or equivalent. */
347
348static const char *
349section_for_sym (rtx sym)
350{
351 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
352 /* Same order as nvptx_data_area enum. */
353 static char const *const areas[] =
354 {"", ".global", ".shared", ".local", ".const", ".param"};
355
356 return areas[area];
357}
358
359/* Similarly for a decl. */
360
361static const char *
362section_for_decl (const_tree decl)
363{
364 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
7b8edc29
NS
365}
366
b4346107
NS
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct { const char *from, *to; } replacements[] = {
    { "call",    "__nvptx_call" },
    { "malloc",  "__nvptx_malloc" },
    { "free",    "__nvptx_free" },
    { "realloc", "__nvptx_realloc" }
  };

  for (size_t ix = 0; ix < sizeof replacements / sizeof replacements[0]; ix++)
    if (strcmp (name, replacements[ix].from) == 0)
      return replacements[ix].to;

  return name;
}
387
e88b04bd
TV
/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  The caller owns (and must
   free) the returned string.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  /* Jump from dot to dot with strchr rather than re-running strlen on
     every iteration of an index loop, which was O(n^2).  */
  for (char *q = strchr (p, '.'); q != NULL; q = strchr (q + 1, '.'))
    *q = '$';
  return p;
}
403
d7479262
NS
404/* If MODE should be treated as two registers of an inner mode, return
405 that inner mode. Otherwise return VOIDmode. */
738f2522 406
d7479262
NS
407static machine_mode
408maybe_split_mode (machine_mode mode)
738f2522 409{
738f2522 410 if (COMPLEX_MODE_P (mode))
d7479262 411 return GET_MODE_INNER (mode);
738f2522 412
738f2522 413 if (mode == TImode)
d7479262
NS
414 return DImode;
415
416 return VOIDmode;
738f2522
BS
417}
418
38827811
TV
419/* Return true if mode should be treated as two registers. */
420
421static bool
422split_mode_p (machine_mode mode)
423{
424 return maybe_split_mode (mode) != VOIDmode;
425}
426
f313d112
NS
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  INNER_MODE is VOIDmode for a plain register;
   otherwise the register is a pair of INNER_MODE halves.  A
   SUBREG_OFFSET >= 0 selects one half by printing "REG$OFFSET"; the
   default (-1) prints the whole pair wrapped in braces.  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      /* One half of a pair: base name followed by "$<byte offset>".  */
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      /* Whole pair: the half at byte offset GET_MODE_SIZE (inner_mode)
	 first, then the half at offset 0.  Braces only at the
	 outermost (default-argument) level.  */
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
457
d88cd9c4
NS
458/* Emit forking instructions for MASK. */
459
460static void
461nvptx_emit_forking (unsigned mask, bool is_call)
462{
463 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
464 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
465 if (mask)
466 {
467 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
468
469 /* Emit fork at all levels. This helps form SESE regions, as
470 it creates a block with a single successor before entering a
471 partitooned region. That is a good candidate for the end of
472 an SESE region. */
e91eba31 473 emit_insn (gen_nvptx_fork (op));
d88cd9c4
NS
474 emit_insn (gen_nvptx_forked (op));
475 }
476}
477
478/* Emit joining instructions for MASK. */
479
480static void
481nvptx_emit_joining (unsigned mask, bool is_call)
482{
483 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
484 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
485 if (mask)
486 {
487 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
488
489 /* Emit joining for all non-call pars to ensure there's a single
490 predecessor for the block the join insn ends up in. This is
491 needed for skipping entire loops. */
e91eba31 492 emit_insn (gen_nvptx_joining (op));
d88cd9c4
NS
493 emit_insn (gen_nvptx_join (op));
494 }
495}
496
738f2522 497\f
44eba92d
NS
498/* Determine whether MODE and TYPE (possibly NULL) should be passed or
499 returned in memory. Integer and floating types supported by the
500 machine are passed in registers, everything else is passed in
501 memory. Complex types are split. */
502
503static bool
504pass_in_memory (machine_mode mode, const_tree type, bool for_return)
505{
506 if (type)
507 {
508 if (AGGREGATE_TYPE_P (type))
509 return true;
510 if (TREE_CODE (type) == VECTOR_TYPE)
511 return true;
512 }
513
514 if (!for_return && COMPLEX_MODE_P (mode))
515 /* Complex types are passed as two underlying args. */
516 mode = GET_MODE_INNER (mode);
517
518 if (GET_MODE_CLASS (mode) != MODE_INT
519 && GET_MODE_CLASS (mode) != MODE_FLOAT)
520 return true;
521
522 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
523 return true;
524
525 return false;
526}
527
528/* A non-memory argument of mode MODE is being passed, determine the mode it
529 should be promoted to. This is also used for determining return
530 type promotion. */
531
532static machine_mode
533promote_arg (machine_mode mode, bool prototyped)
534{
535 if (!prototyped && mode == SFmode)
536 /* K&R float promotion for unprototyped functions. */
537 mode = DFmode;
538 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
539 mode = SImode;
540
541 return mode;
542}
543
/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  Returns are always treated as
   prototyped, so the K&R float promotion in promote_arg never
   applies here.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}
552
e74f7152 553/* Implement TARGET_FUNCTION_ARG. */
dc3d2aeb 554
e74f7152 555static rtx
6783fdb7 556nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
e74f7152 557{
6783fdb7 558 if (arg.end_marker_p () || !arg.named)
e74f7152 559 return NULL_RTX;
738f2522 560
6783fdb7 561 return gen_reg_rtx (arg.mode);
e74f7152
NS
562}
563
564/* Implement TARGET_FUNCTION_INCOMING_ARG. */
565
566static rtx
6783fdb7
RS
567nvptx_function_incoming_arg (cumulative_args_t cum_v,
568 const function_arg_info &arg)
738f2522 569{
e74f7152 570 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
dc3d2aeb 571
6783fdb7 572 if (arg.end_marker_p () || !arg.named)
e74f7152 573 return NULL_RTX;
738f2522 574
e74f7152
NS
575 /* No need to deal with split modes here, the only case that can
576 happen is complex modes and those are dealt with by
577 TARGET_SPLIT_COMPLEX_ARG. */
6783fdb7 578 return gen_rtx_UNSPEC (arg.mode,
e74f7152
NS
579 gen_rtvec (1, GEN_INT (cum->count)),
580 UNSPEC_ARG_REG);
581}
582
583/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
584
585static void
6930c98c 586nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
e74f7152
NS
587{
588 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 589
e74f7152
NS
590 cum->count++;
591}
592
1988b2c2
NS
/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference, so the mode's
   natural alignment is the right answer.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
603
e74f7152
NS
604/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
605
606 For nvptx, we know how to handle functions declared as stdarg: by
607 passing an extra pointer to the unnamed arguments. However, the
608 Fortran frontend can produce a different situation, where a
609 function pointer is declared with no arguments, but the actual
610 function and calls to it take more arguments. In that case, we
611 want to ensure the call matches the definition of the function. */
612
613static bool
614nvptx_strict_argument_naming (cumulative_args_t cum_v)
615{
616 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
b49e35a9 617
e74f7152
NS
618 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
619}
620
e74f7152
NS
621/* Implement TARGET_LIBCALL_VALUE. */
622
623static rtx
624nvptx_libcall_value (machine_mode mode, const_rtx)
625{
e91cacae 626 if (!cfun || !cfun->machine->doing_call)
e74f7152
NS
627 /* Pretend to return in a hard reg for early uses before pseudos can be
628 generated. */
629 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
b49e35a9 630
e74f7152
NS
631 return gen_reg_rtx (mode);
632}
633
634/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
635 where function FUNC returns or receives a value of data type TYPE. */
636
637static rtx
44eba92d 638nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
e74f7152
NS
639 bool outgoing)
640{
44eba92d
NS
641 machine_mode mode = promote_return (TYPE_MODE (type));
642
e74f7152 643 if (outgoing)
5c036f3f 644 {
e91cacae 645 gcc_assert (cfun);
44c068ae 646 cfun->machine->return_mode = mode;
5c036f3f
NS
647 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
648 }
b49e35a9
NS
649
650 return nvptx_libcall_value (mode, NULL_RTX);
e74f7152
NS
651}
652
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  Only the dedicated
   return-value register can hold a function result.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}
660
/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  Implement TARGET_PASS_BY_REFERENCE by deferring
   to pass_in_memory with for_return clear.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}
669
/* Implement TARGET_RETURN_IN_MEMORY.  Defer to pass_in_memory with
   for_return set.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}
677
/* Implement TARGET_PROMOTE_FUNCTION_MODE.  Promote per promote_arg;
   the argument is treated as prototyped for return values, for
   typeless arguments, and when FUNTYPE carries a real argument list
   (TYPE_ARG_TYPES non-null) — only unprototyped arguments get the
   K&R float promotion.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
687
e74f7152
NS
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  Returns the next argument
   index to use.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      /* Prologue: declare the register the argument lives in ...  */
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      /* ... and, when there is a corresponding incoming .param
	 (ARGNO >= 0), load it into that register.  */
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
727
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    /* Passed by invisible reference: the argument is a pointer.  */
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	/* Emit the first component here; the shared tail below emits
	   the second.  */
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
766
767/* Emit a PTX return as a prototype or function prologue declaration
768 for MODE. */
769
770static void
771write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
772{
773 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
774 const char *pfx = "\t.reg";
775 const char *sfx = ";\n";
776
777 if (for_proto)
778 pfx = "(.param", sfx = "_out) ";
779
780 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
e74f7152
NS
781}
782
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      /* Prototypes carry no return declaration at all in this case.  */
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
824
738f2522
BS
825/* Look for attributes in ATTRS that would indicate we must write a function
826 as a .entry kernel rather than a .func. Return true if one is found. */
827
828static bool
829write_as_kernel (tree attrs)
830{
831 return (lookup_attribute ("kernel", attrs) != NULL_TREE
5012919d
AM
832 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
833 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
834 /* For OpenMP target regions, the corresponding kernel entry is emitted from
835 write_omp_entry as a separate function. */
738f2522
BS
836}
837
69823d76
NS
838/* Emit a linker marker for a function decl or defn. */
839
840static void
841write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
842 const char *name)
843{
844 s << "\n// BEGIN";
845 if (globalize)
846 s << " GLOBAL";
847 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
848 s << name << "\n";
849}
850
851/* Emit a linker marker for a variable decl or defn. */
852
853static void
854write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
855{
856 fprintf (file, "\n// BEGIN%s VAR %s: ",
857 globalize ? " GLOBAL" : "",
858 is_defn ? "DEF" : "DECL");
859 assemble_name_raw (file, name);
860 fputs ("\n", file);
861}
862
/* Helper function for write_fn_proto.  Emit the PTX prototype
   (IS_DEFN false) or definition header (IS_DEFN true) for DECL,
   which is output under the name NAME, into S.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl)
{
  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  A memory return becomes a leading pointer
     argument for the return slot.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  /* NOT_ATOMIC_WEAK_ARG counts down to the 'weak' parameter that must
     be skipped (it stays negative, hence nonzero, for ordinary
     functions).  */
  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  /* Variadic functions get a trailing pointer to the unnamed args.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* Nested functions receive the static chain as a pointer arg.  */
  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* A zero-argument "main" still gets the (argc, argv) pair.  */
  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  /* write_arg_mode opened the list with " ("; close it if anything
     was emitted.  */
  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}
b699adcc 952
db52bcfe
TV
953/* Write a .func or .kernel declaration or definition along with
954 a helper comment for use by ld. S is the stream to write to, DECL
955 the decl for the function with name NAME. For definitions, emit
956 a declaration too. */
957
958static void
959write_fn_proto (std::stringstream &s, bool is_defn,
960 const char *name, const_tree decl)
961{
e88b04bd
TV
962 const char *replacement = nvptx_name_replacement (name);
963 char *replaced_dots = NULL;
964 if (replacement != name)
965 name = replacement;
966 else
967 {
968 replaced_dots = nvptx_replace_dot (name);
969 if (replaced_dots)
970 name = replaced_dots;
971 }
db52bcfe
TV
972 if (name[0] == '*')
973 name++;
974
975 if (is_defn)
976 /* Emit a declaration. The PTX assembler gets upset without it. */
977 write_fn_proto_1 (s, false, name, decl);
978
979 write_fn_proto_1 (s, is_defn, name, decl);
e88b04bd
TV
980
981 if (replaced_dots)
982 XDELETE (replaced_dots);
738f2522
BS
983}
984
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.

   S is the stream to write to, NAME the callee's name (or NULL for an
   indirect call), RESULT the rtx holding the return value (or NULL_RTX
   for a void call), and PAT the call PARALLEL whose trailing USEs
   describe the argument registers.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      /* Indirect call: emit an anonymous .callprototype.  */
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      /* Named callee: sanitize the name the same way write_fn_proto
	 does (target replacement first, then '.' rewriting).  */
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  /* Element 0 of the PARALLEL is the call itself; elements 1..N are
     USEs of the argument registers.  */
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  /* write_arg_mode opened the argument list; close it only if at
     least one argument was written.  */
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
1037
00e52418 1038/* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
700d4cb0 1039 table and write a ptx prototype. These are emitted at end of
00e52418 1040 compilation. */
738f2522 1041
00e52418
NS
1042static void
1043nvptx_record_fndecl (tree decl)
738f2522 1044{
f3dba894 1045 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
738f2522
BS
1046 if (*slot == NULL)
1047 {
1048 *slot = decl;
1049 const char *name = get_fnname_from_decl (decl);
b699adcc 1050 write_fn_proto (func_decls, false, name, decl);
738f2522 1051 }
738f2522
BS
1052}
1053
00e52418
NS
1054/* Record a libcall or unprototyped external function. CALLEE is the
1055 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1056 declaration for it. */
1057
1058static void
1059nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
1060{
1061 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1062 if (*slot == NULL)
1063 {
1064 *slot = callee;
1065
1066 const char *name = XSTR (callee, 0);
b699adcc 1067 write_fn_proto_from_insn (func_decls, name, retval, pat);
00e52418
NS
1068 }
1069}
1070
1071/* DECL is an external FUNCTION_DECL, that we're referencing. If it
1072 is prototyped, record it now. Otherwise record it as needed at end
1073 of compilation, when we might have more information about it. */
738f2522
BS
1074
1075void
1076nvptx_record_needed_fndecl (tree decl)
1077{
00e52418
NS
1078 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1079 {
1080 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1081 if (*slot == NULL)
1082 *slot = decl;
1083 }
1084 else
1085 nvptx_record_fndecl (decl);
1086}
738f2522 1087
00e52418
NS
1088/* SYM is a SYMBOL_REF. If it refers to an external function, record
1089 it as needed. */
1090
1091static void
1092nvptx_maybe_record_fnsym (rtx sym)
1093{
1094 tree decl = SYMBOL_REF_DECL (sym);
1095
1096 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1097 nvptx_record_needed_fndecl (decl);
738f2522
BS
1098}
1099
1f065954 1100/* Emit a local array to hold some part of a conventional stack frame
5563d5c0
NS
1101 and initialize REGNO to point to it. If the size is zero, it'll
1102 never be valid to dereference, so we can simply initialize to
1103 zero. */
1f065954
NS
1104
1105static void
1106init_frame (FILE *file, int regno, unsigned align, unsigned size)
1107{
5563d5c0
NS
1108 if (size)
1109 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1110 align, reg_names[regno], size);
1111 fprintf (file, "\t.reg.u%d %s;\n",
1112 POINTER_SIZE, reg_names[regno]);
1113 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1114 : "\tmov.u%d %s, 0;\n"),
1f065954
NS
1115 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1116}
1117
/* Emit soft stack frame setup sequence.  FILE is the output stream,
   ALIGNMENT the required frame alignment in bytes, and SIZE the frame
   size in bytes.  Reads the per-warp stack pointer from the shared
   __nvptx_stacks array, carves out the frame and outgoing-args area
   below it, and (for non-leaf functions) publishes the new stack
   pointer back to the shared slot.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  /* %fstmp1 = tid.y * sizeof (void *): byte offset of this warp's slot.  */
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Leaf functions never make calls, so nothing below can observe the
     shared slot; skip publishing the new stack pointer.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
1174
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  For the "x" axis,
   additionally initialize the vector-reduction buffer pointer when
   this function uses one.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  /* Predicate = (tid.NAME != 0).  */
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      /* red_partition = __vector_red + tid.y * vector_red_partition.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}
1204
/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  The broadcast-buffer pointer (bcast_partition) and the
   per-vector barrier number (sync_bar) are both derived from tid.y,
   which identifies the vector within the worker dimension.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      /* bcast_partition = __oacc_bcast
	 + (tid.y + 1) * oacc_bcast_partition; slot 0 is reserved.  */
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}
1238
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  Computes the address of this
   warp's __nvptx_uni slot into unisimt_location, and (when needed) the
   master-lane number and the "am I the master lane" predicate.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* %ustmp1 = tid.y * 4: byte offset of this warp's 32-bit slot.  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  /* unisimt_location = &__nvptx_uni[tid.y].  */
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
1271
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* The gomp_nvptx_main declaration is shared by all entry points;
     emit it only once per translation unit.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
  /* The template contains an embedded NUL (the "\0" below) splitting it
     into two pieces; ORIG's name is printed between them.  */
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  /* NOTE: the trailing space in "mad.lo.u32 " keeps the 32-bit prefix
     the same length as the 64-bit one, so the strlen computed from
     ENTRY64 below is valid for both templates.  */
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
1337
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* For an OpenMP target entry point (but not an OpenACC one), emit a
     wrapper .entry kernel under NAME and compile the body itself under
     the mangled NAME$impl, which the wrapper calls via gomp_nvptx_main.  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  /* An in-memory return value is passed as a hidden first pointer arg.  */
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  /* A varargs function receives a trailing pointer to the stacked
     anonymous arguments.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  /* Reserve the per-lane SIMT backup stack, sized either exactly (for
     leaf, non-alloca functions) or by the -msoft-stack-size fallback.  */
  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates. */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
1475
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  Writes the
   new mask (-1 on entry, 0 on exit) to this warp's __nvptx_uni slot and
   recomputes the master-lane register and predicate.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* Leaf functions without a predicate have nothing to switch.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* On entry every lane is its own master; on exit lane 0 is.  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
1503
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* PTR = top of %simtstack_ar minus the (rounded, aligned) size.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Round down to ALIGN unless a constant size already guaranteed
	 DImode alignment.  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the old stack pointer just below PTR and point the
	     stack below that save slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Reload the stack pointer saved on entry.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
1555
1556/* Output code to enter non-uniform execution region. DEST is a register
1557 to hold a per-lane allocation given by SIZE and ALIGN. */
1558
1559const char *
1560nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1561{
1562 nvptx_output_unisimt_switch (asm_out_file, true);
1563 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1564 return "";
1565}
1566
1567/* Output code to leave non-uniform execution region. SRC is the register
1568 holding per-lane storage previously allocated by omp_simt_enter insn. */
1569
1570const char *
1571nvptx_output_simt_exit (rtx src)
1572{
1573 nvptx_output_unisimt_switch (asm_out_file, false);
1574 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1575 return "";
1576}
1577
5012919d
AM
1578/* Output instruction that sets soft stack pointer in shared memory to the
1579 value in register given by SRC_REGNO. */
1580
1581const char *
1582nvptx_output_set_softstack (unsigned src_regno)
1583{
1584 if (cfun->machine->has_softstack && !crtl->is_leaf)
1585 {
1586 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1587 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1588 output_reg (asm_out_file, src_regno, VOIDmode);
1589 fprintf (asm_out_file, ";\n");
1590 }
1591 return "";
1592}
738f2522
BS
1593/* Output a return instruction. Also copy the return value to its outgoing
1594 location. */
1595
1596const char *
1597nvptx_output_return (void)
1598{
44c068ae 1599 machine_mode mode = (machine_mode)cfun->machine->return_mode;
25662751
NS
1600
1601 if (mode != VOIDmode)
1f065954
NS
1602 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1603 nvptx_ptx_type_from_mode (mode, false),
1604 reg_names[NVPTX_RETURN_REGNUM],
1605 reg_names[NVPTX_RETURN_REGNUM]);
738f2522
BS
1606
1607 return "ret;";
1608}
1609
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1617\f
1618/* Decide whether we can make a sibling call to a function. For ptx, we
1619 can't. */
1620
1621static bool
1622nvptx_function_ok_for_sibcall (tree, tree)
1623{
1624 return false;
1625}
1626
18c05628
NS
1627/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1628
1629static rtx
1630nvptx_get_drap_rtx (void)
1631{
5012919d
AM
1632 if (TARGET_SOFT_STACK && stack_realign_drap)
1633 return arg_pointer_rtx;
18c05628
NS
1634 return NULL_RTX;
1635}
1636
738f2522
BS
1637/* Implement the TARGET_CALL_ARGS hook. Record information about one
1638 argument to the next call. */
1639
1640static void
44c068ae 1641nvptx_call_args (rtx arg, tree fntype)
738f2522 1642{
44c068ae 1643 if (!cfun->machine->doing_call)
738f2522 1644 {
44c068ae
NS
1645 cfun->machine->doing_call = true;
1646 cfun->machine->is_varadic = false;
1647 cfun->machine->num_args = 0;
1648
1649 if (fntype && stdarg_p (fntype))
1650 {
1651 cfun->machine->is_varadic = true;
1652 cfun->machine->has_varadic = true;
1653 cfun->machine->num_args++;
1654 }
738f2522 1655 }
738f2522 1656
44c068ae
NS
1657 if (REG_P (arg) && arg != pc_rtx)
1658 {
1659 cfun->machine->num_args++;
1660 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1661 cfun->machine->call_args);
1662 }
738f2522
BS
1663}
1664
1665/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1666 information we recorded. */
1667
1668static void
1669nvptx_end_call_args (void)
1670{
44c068ae 1671 cfun->machine->doing_call = false;
738f2522
BS
1672 free_EXPR_LIST_list (&cfun->machine->call_args);
1673}
1674
ecf6e535
BS
1675/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1676 track of whether calls involving static chains or varargs were seen
1677 in the current function.
1678 For libcalls, maintain a hash table of decls we have seen, and
1679 record a function decl for later when encountering a new one. */
738f2522
BS
1680
1681void
1682nvptx_expand_call (rtx retval, rtx address)
1683{
738f2522 1684 rtx callee = XEXP (address, 0);
f324806d 1685 rtx varargs = NULL_RTX;
d88cd9c4 1686 unsigned parallel = 0;
738f2522 1687
738f2522
BS
1688 if (!call_insn_operand (callee, Pmode))
1689 {
1690 callee = force_reg (Pmode, callee);
1691 address = change_address (address, QImode, callee);
1692 }
1693
1694 if (GET_CODE (callee) == SYMBOL_REF)
1695 {
1696 tree decl = SYMBOL_REF_DECL (callee);
1697 if (decl != NULL_TREE)
1698 {
738f2522 1699 if (DECL_STATIC_CHAIN (decl))
44c068ae 1700 cfun->machine->has_chain = true;
00e52418 1701
629b3d75 1702 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1703 if (attr)
1704 {
1705 tree dims = TREE_VALUE (attr);
1706
1707 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1708 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1709 {
1710 if (TREE_PURPOSE (dims)
1711 && !integer_zerop (TREE_PURPOSE (dims)))
1712 break;
1713 /* Not on this axis. */
1714 parallel ^= GOMP_DIM_MASK (ix);
1715 dims = TREE_CHAIN (dims);
1716 }
1717 }
738f2522
BS
1718 }
1719 }
c38f0d8c 1720
44c068ae
NS
1721 unsigned nargs = cfun->machine->num_args;
1722 if (cfun->machine->is_varadic)
738f2522 1723 {
f324806d 1724 varargs = gen_reg_rtx (Pmode);
863af9a4 1725 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1726 }
1727
44c068ae
NS
1728 rtvec vec = rtvec_alloc (nargs + 1);
1729 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1730 int vec_pos = 0;
44c068ae
NS
1731
1732 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1733 rtx tmp_retval = retval;
44c068ae 1734 if (retval)
738f2522
BS
1735 {
1736 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1737 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1738 call = gen_rtx_SET (tmp_retval, call);
738f2522 1739 }
44c068ae 1740 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1741
1742 /* Construct the call insn, including a USE for each argument pseudo
1743 register. These will be used when printing the insn. */
1744 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1745 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1746
1747 if (varargs)
cf08c344 1748 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1749
1750 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1751
d88cd9c4 1752 nvptx_emit_forking (parallel, true);
738f2522 1753 emit_call_insn (pat);
d88cd9c4
NS
1754 nvptx_emit_joining (parallel, true);
1755
738f2522
BS
1756 if (tmp_retval != retval)
1757 emit_move_insn (retval, tmp_retval);
1758}
44eba92d 1759
738f2522
BS
1760/* Emit a comparison COMPARE, and return the new test to be used in the
1761 jump. */
1762
1763rtx
1764nvptx_expand_compare (rtx compare)
1765{
1766 rtx pred = gen_reg_rtx (BImode);
1767 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1768 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1769 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1770 return gen_rtx_NE (BImode, pred, const0_rtx);
1771}
1772
d88cd9c4
NS
1773/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1774
1775void
1776nvptx_expand_oacc_fork (unsigned mode)
1777{
1778 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1779}
1780
1781void
1782nvptx_expand_oacc_join (unsigned mode)
1783{
1784 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1785}
1786
1787/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1788 objects. */
1789
1790static rtx
1791nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1792{
1793 rtx res;
1794
1795 switch (GET_MODE (src))
1796 {
4e10a5a7 1797 case E_DImode:
d88cd9c4
NS
1798 res = gen_unpackdisi2 (dst0, dst1, src);
1799 break;
4e10a5a7 1800 case E_DFmode:
d88cd9c4
NS
1801 res = gen_unpackdfsi2 (dst0, dst1, src);
1802 break;
1803 default: gcc_unreachable ();
1804 }
1805 return res;
1806}
1807
1808/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1809 object. */
1810
1811static rtx
1812nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1813{
1814 rtx res;
1815
1816 switch (GET_MODE (dst))
1817 {
4e10a5a7 1818 case E_DImode:
d88cd9c4
NS
1819 res = gen_packsidi2 (dst, src0, src1);
1820 break;
4e10a5a7 1821 case E_DFmode:
d88cd9c4
NS
1822 res = gen_packsidf2 (dst, src0, src1);
1823 break;
1824 default: gcc_unreachable ();
1825 }
1826 return res;
1827}
1828
/* Generate an instruction or sequence to shuffle register SRC into
   DST across the lanes of a single warp, using shuffle kind KIND and
   lane selector IDX.  Modes wider than SImode/SFmode are handled by
   splitting into 32-bit (or 64-bit, for V2DImode) pieces, shuffling
   each piece, and reassembling — the hardware shuffle only moves
   32-bit quantities.  Returns a single insn pattern or a sequence.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      /* Native 32-bit shuffle.  */
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	/* Split the 64-bit value into two SImode temporaries, shuffle
	   each half (recursing into the SImode case above), then pack
	   the halves back together.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2SImode:
      {
	/* Shuffle the two SImode elements independently, going through
	   fresh pseudos since the shuffle patterns want REG operands,
	   not SUBREGs.  */
	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);
	start_sequence ();
	emit_insn (gen_movsi (tmp0, src0));
	emit_insn (gen_movsi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movsi (dst0, tmp0));
	emit_insn (gen_movsi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_V2DImode:
      {
	/* As for V2SImode, but each element is DImode and recurses
	   into the 64-bit splitting case above.  */
	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
	rtx tmp0 = gen_reg_rtx (DImode);
	rtx tmp1 = gen_reg_rtx (DImode);
	start_sequence ();
	emit_insn (gen_movdi (tmp0, src0));
	emit_insn (gen_movdi (tmp1, src1));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (gen_movdi (dst0, tmp0));
	emit_insn (gen_movdi (dst1, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	/* Widen the predicate to 0/1 in an SImode register, shuffle
	   that, and compare against zero to recover a predicate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	/* Zero-extend sub-word values to SImode for the shuffle, then
	   truncate back to the destination mode.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
1930
1931/* Generate an instruction or sequence to broadcast register REG
1932 across the vectors of a single warp. */
1933
1934static rtx
a0b3b5c4 1935nvptx_gen_warp_bcast (rtx reg)
d88cd9c4
NS
1936{
1937 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1938}
1939
/* Structure used when generating a worker-level spill or fill:
   tracks where in the shared broadcast buffer the next value goes.  */

struct broadcast_data_t
{
  rtx base; /* Register holding base addr of buffer.  */
  rtx ptr; /* Iteration var, if needed (NULL when addressing is
	      computed from OFFSET instead).  */
  unsigned offset; /* Offset into worker buffer.  */
};
1948
/* Direction of the spill/fill and looping setup/teardown indicator.
   PM_read spills a register INTO the buffer, PM_write fills it back
   FROM the buffer; the loop flags bracket a run of iterated
   spill/fills.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
1960
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   DATA tracks the buffer address/offset; VECTOR selects the
   vector-level (vs worker-level) buffer in the recursive call.  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* Predicates can't live in memory directly: widen to 0/1 in
	   SImode, spill/fill that, and compare back on the fill.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at
	       BASE + OFFSET, keeping OFFSET (and the buffer's overall
	       alignment) suitably aligned for MODE.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill: buffer <- reg.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill: reg <- buffer.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	/* Account for the space this register consumes per iteration.  */
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
738f2522
BS
2032\f
2033/* Returns true if X is a valid address for use in a memory reference. */
2034
2035static bool
2036nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2037{
2038 enum rtx_code code = GET_CODE (x);
2039
2040 switch (code)
2041 {
2042 case REG:
2043 return true;
2044
2045 case PLUS:
2046 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2047 return true;
2048 return false;
2049
2050 case CONST:
2051 case SYMBOL_REF:
2052 case LABEL_REF:
2053 return true;
2054
2055 default:
2056 return false;
2057 }
2058}
738f2522 2059\f
4d5438cd
NS
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
2075
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  Resets the fragment accumulator and counts
   down init_frag.remaining.  */

static void
output_init_frag (rtx sym)
{
  /* Emit either the opening "= {" (first fragment) or a separator.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      /* Data addresses must be converted to the generic address
	 space; function addresses are used as-is.  */
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  /* Emit the numeric value (or addend) unless it's a zero addend on a
     symbolic reference.  */
  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
2107
4d5438cd
NS
2108/* Add value VAL of size SIZE to the data we're emitting, and keep
2109 writing out chunks as they fill up. */
738f2522
BS
2110
2111static void
4d5438cd 2112nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
738f2522 2113{
60e537a0
TV
2114 bool negative_p
2115 = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2116
5e044c67
TV
2117 /* Avoid undefined behaviour. */
2118 if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2119 val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
4d5438cd
NS
2120
2121 for (unsigned part = 0; size; size -= part)
738f2522 2122 {
af47a203
TV
2123 if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2124 /* Avoid undefined behaviour. */
60e537a0 2125 val = negative_p ? -1 : 0;
af47a203
TV
2126 else
2127 val >>= (part * BITS_PER_UNIT);
4d5438cd 2128 part = init_frag.size - init_frag.offset;
3c55d60f 2129 part = MIN (part, size);
4d5438cd
NS
2130
2131 unsigned HOST_WIDE_INT partial
2132 = val << (init_frag.offset * BITS_PER_UNIT);
2133 init_frag.val |= partial & init_frag.mask;
2134 init_frag.offset += part;
2135
2136 if (init_frag.offset == init_frag.size)
2137 output_init_frag (NULL);
738f2522
BS
2138 }
2139}
2140
/* Target hook for assembling integer object X of size SIZE.  Handles
   plain integers and (symbol + addend) references; defers anything
   else to the generic machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* Peel (const (plus (symbol_ref) (const_int))) down to the
	 symbol, remembering the addend, then fall through.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      gcc_fallthrough (); /* FALLTHROUGH */

    case SYMBOL_REF:
      /* Pointers must exactly fill a fragment and be fragment-aligned;
	 a misaligned pointer cannot be expressed in PTX syntax.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
2180
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  If the skip runs to the end of the object, nothing
   needs emitting — PTX zero-initializes the remainder.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      /* Whole zero fragments first...  */
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      /* ...then any sub-fragment tail.  */
      if (size)
	nvptx_assemble_value (0, size);
    }
}
2210
2211/* Output a string STR with length SIZE. As in nvptx_output_skip we
2212 ignore the FILE arg. */
2213
2214void
2215nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2216{
2217 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2218 nvptx_assemble_value (str[i], 1);
2219}
2220
bf398920
TV
2221/* Return true if TYPE is a record type where the last field is an array without
2222 given dimension. */
2223
2224static bool
2225flexible_array_member_type_p (const_tree type)
2226{
2227 if (TREE_CODE (type) != RECORD_TYPE)
2228 return false;
2229
2230 const_tree last_field = NULL_TREE;
2231 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2232 last_field = f;
2233
2234 if (!last_field)
2235 return false;
2236
2237 const_tree last_field_type = TREE_TYPE (last_field);
2238 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2239 return false;
2240
2241 return (! TYPE_DOMAIN (last_field_type)
2242 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2243}
2244
4ff3145a
NS
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  UNDEFINED is
   true for an extern declaration, where a flexible array member makes
   the object's size unknown.  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  /* Emit "[]" (unknown extent) for arrays without a domain.  */
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  if (undefined && flexible_array_member_type_p (type))
    {
      size = 0;
      atype = true;
    }

  /* Strip array dimensions down to the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  /* The fragment size is the lowest set bit of elt_size, clamped by
     elt_mode: the largest power-of-two unit that divides the object.  */
  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
2309
738f2522
BS
2310/* Called when the initializer for a decl has been completely output through
2311 combinations of the three functions above. */
2312
2313static void
2314nvptx_assemble_decl_end (void)
2315{
4d5438cd
NS
2316 if (init_frag.offset)
2317 /* This can happen with a packed struct with trailing array member. */
2318 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2319 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
738f2522
BS
2320}
2321
69823d76
NS
/* Output an uninitialized common or file-scope variable DECL named
   NAME, of SIZE bytes aligned to ALIGN bits.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}
2338
738f2522
BS
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and OBJ_SIZE and its
   initializer to FILE.  The initializer and terminator follow via the
   assemble_integer/skip/ascii hooks.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  /* Constants always go in the .const data area.  */
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}
2355
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* Linkage: static decls get no specifier; public ones are .weak or
     .visible depending on DECL_WEAK.  */
  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
2372
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Visibility
   is emitted as part of the declaration itself on this target.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2379
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  /* Pass undefined=true: a trailing flexible array member makes the
     object's extent unknown at this point.  */
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
2406
f313d112
NS
/* Output a pattern for a move instruction: select the PTX assembler
   template (mov/mov.b/cvt/cvta) based on the source and destination
   modes, looking through SUBREGs.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode src_mode = GET_MODE (src);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* NOTE(review): the non-SUBREG fallback here is dst_mode, not
     src_mode — presumably equivalent for a mov where both operands
     share a mode; confirm before relying on it elsewhere.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Addresses of non-generic data areas must be converted with cvta.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  /* Same underlying mode: plain typed move.  */
  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  /* Constants: typed move for integer destinations, bit move
     otherwise.  */
  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  /* Same size, different mode: bit-for-bit move.  */
  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (src_mode) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      return "%.\tmov.b%T0\t%0, %1;";
    }

  /* Extracting a 64-bit piece of a 128-bit source: bit move.  */
  if (GET_MODE_BITSIZE (src_inner) == 128
      && GET_MODE_BITSIZE (src_mode) == 64)
    return "%.\tmov.b%T0\t%0, %1;";

  /* Everything else needs an explicit conversion.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2459
58f7c7e0
TV
/* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL.
   PRE_P selects which side of the atomic operation this call guards;
   a membar is emitted only when MEMMODEL requires ordering on that
   side (acquire => after, release => before, acq_rel/seq_cst =>
   both).  Falling out of the switch emits the barrier; returning
   emits nothing.  */

static void
nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
{
  bool post_p = !pre_p;

  switch (memmodel)
    {
    case MEMMODEL_RELAXED:
      return;
    case MEMMODEL_CONSUME:
    case MEMMODEL_ACQUIRE:
    case MEMMODEL_SYNC_ACQUIRE:
      if (post_p)
	break;
      return;
    case MEMMODEL_RELEASE:
    case MEMMODEL_SYNC_RELEASE:
      if (pre_p)
	break;
      return;
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
    case MEMMODEL_SYNC_SEQ_CST:
      if (pre_p || post_p)
	break;
      return;
    default:
      gcc_unreachable ();
    }

  /* %B0 expands to the membar scope for the MEM's data area.  */
  output_asm_insn ("%.\tmembar%B0;", mem_operand);
}
2494
2495const char *
2496nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
2497 int memmodel_pos)
2498{
2499 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2500 true);
2501 output_asm_insn (asm_template, operands);
2502 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2503 false);
2504 return "";
2505}
2506
5012919d
AM
2507static void nvptx_print_operand (FILE *, rtx, int);
2508
738f2522 2509/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
2510 involves writing .param declarations and in/out copies into them. For
2511 indirect calls, also write the .callprototype. */
738f2522
BS
2512
2513const char *
2514nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2515{
863af9a4 2516 char buf[16];
738f2522
BS
2517 static int labelno;
2518 bool needs_tgt = register_operand (callee, Pmode);
2519 rtx pat = PATTERN (insn);
5012919d
AM
2520 if (GET_CODE (pat) == COND_EXEC)
2521 pat = COND_EXEC_CODE (pat);
f324806d 2522 int arg_end = XVECLEN (pat, 0);
738f2522
BS
2523 tree decl = NULL_TREE;
2524
2525 fprintf (asm_out_file, "\t{\n");
2526 if (result != NULL)
1f065954
NS
2527 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2528 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2529 reg_names[NVPTX_RETURN_REGNUM]);
738f2522 2530
ecf6e535 2531 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
2532 if (GET_CODE (callee) == SYMBOL_REF)
2533 {
2534 decl = SYMBOL_REF_DECL (callee);
00e52418
NS
2535 if (!decl
2536 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2537 nvptx_record_libfunc (callee, result, pat);
2538 else if (DECL_EXTERNAL (decl))
738f2522
BS
2539 nvptx_record_fndecl (decl);
2540 }
2541
2542 if (needs_tgt)
2543 {
2544 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2545 labelno++;
2546 ASM_OUTPUT_LABEL (asm_out_file, buf);
2547 std::stringstream s;
b699adcc 2548 write_fn_proto_from_insn (s, NULL, result, pat);
738f2522
BS
2549 fputs (s.str().c_str(), asm_out_file);
2550 }
2551
863af9a4 2552 for (int argno = 1; argno < arg_end; argno++)
738f2522 2553 {
863af9a4 2554 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
738f2522 2555 machine_mode mode = GET_MODE (t);
a02d84b6 2556 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
738f2522 2557
863af9a4 2558 /* Mode splitting has already been done. */
a02d84b6
NS
2559 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2560 "\t\tst.param%s [%%out_arg%d], ",
2561 ptx_type, argno, ptx_type, argno);
2562 output_reg (asm_out_file, REGNO (t), VOIDmode);
2563 fprintf (asm_out_file, ";\n");
738f2522
BS
2564 }
2565
5012919d
AM
2566 /* The '.' stands for the call's predicate, if any. */
2567 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
738f2522
BS
2568 fprintf (asm_out_file, "\t\tcall ");
2569 if (result != NULL_RTX)
1f065954
NS
2570 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2571
738f2522
BS
2572 if (decl)
2573 {
e88b04bd 2574 char *replaced_dots = NULL;
738f2522 2575 const char *name = get_fnname_from_decl (decl);
e88b04bd
TV
2576 const char *replacement = nvptx_name_replacement (name);
2577 if (replacement != name)
2578 name = replacement;
2579 else
2580 {
2581 replaced_dots = nvptx_replace_dot (name);
2582 if (replaced_dots)
2583 name = replaced_dots;
2584 }
738f2522 2585 assemble_name (asm_out_file, name);
e88b04bd
TV
2586 if (replaced_dots)
2587 XDELETE (replaced_dots);
738f2522
BS
2588 }
2589 else
cc8ca59e 2590 output_address (VOIDmode, callee);
738f2522 2591
863af9a4
NS
2592 const char *open = "(";
2593 for (int argno = 1; argno < arg_end; argno++)
738f2522 2594 {
863af9a4
NS
2595 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2596 open = "";
738f2522 2597 }
863af9a4
NS
2598 if (decl && DECL_STATIC_CHAIN (decl))
2599 {
5563d5c0 2600 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
863af9a4
NS
2601 open = "";
2602 }
2603 if (!open[0])
2604 fprintf (asm_out_file, ")");
f324806d 2605
738f2522
BS
2606 if (needs_tgt)
2607 {
2608 fprintf (asm_out_file, ", ");
2609 assemble_name (asm_out_file, buf);
2610 }
2611 fprintf (asm_out_file, ";\n");
738f2522 2612
51baf85a 2613 if (find_reg_note (insn, REG_NORETURN, NULL))
fb202364
TV
2614 {
2615 /* No return functions confuse the PTX JIT, as it doesn't realize
2616 the flow control barrier they imply. It can seg fault if it
2617 encounters what looks like an unexitable loop. Emit a trailing
2618 trap and exit, which it does grok. */
2619 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2620 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2621 }
51baf85a 2622
1f065954
NS
2623 if (result)
2624 {
2625 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2626
2627 if (!rval[0])
2628 /* We must escape the '%' that starts RETURN_REGNUM. */
2629 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2630 reg_names[NVPTX_RETURN_REGNUM]);
2631 return rval;
2632 }
2633
2634 return "}";
738f2522
BS
2635}
2636
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' (predicate)
   and '#' (rounding mode) are punctuation operand codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2644
738f2522
BS
/* Subroutine of nvptx_print_operand; used to print a memory reference X to
   FILE.  Emits "base+offset" for PLUS addresses so that negative
   addends come out as "x+-8" rather than "x-8" (which ptxas rejects —
   see the CONST/SYMBOL_REF case in nvptx_print_operand).  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      /* MEM is not a valid address component on this target.  */
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}
2673
/* Write assembly language output for the address ADDR to FILE.
   Implements TARGET_PRINT_OPERAND_ADDRESS as a thin wrapper around
   nvptx_print_address_operand.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
2681
2682/* Print an operand, X, to FILE, with an optional modifier in CODE.
2683
2684 Meaning of CODE:
2685 . -- print the predicate for the instruction or an emptry string for an
2686 unconditional one.
2687 # -- print a rounding mode for the instruction
2688
9a863523 2689 A -- print a data area for a MEM
738f2522 2690 c -- print an opcode suffix for a comparison operator, including a type code
9a863523 2691 D -- print a data area for a MEM operand
d88cd9c4 2692 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
2693 t -- print a type opcode suffix, promoting QImode to 32 bits
2694 T -- print a type size in bits
2695 u -- print a type opcode suffix without promotions. */
2696
2697static void
2698nvptx_print_operand (FILE *file, rtx x, int code)
2699{
738f2522
BS
2700 if (code == '.')
2701 {
2702 x = current_insn_predicate;
2703 if (x)
2704 {
5012919d 2705 fputs ("@", file);
738f2522
BS
2706 if (GET_CODE (x) == EQ)
2707 fputs ("!", file);
5012919d 2708 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
738f2522
BS
2709 }
2710 return;
2711 }
2712 else if (code == '#')
2713 {
2714 fputs (".rn", file);
2715 return;
2716 }
2717
2718 enum rtx_code x_code = GET_CODE (x);
f313d112 2719 machine_mode mode = GET_MODE (x);
738f2522
BS
2720
2721 switch (code)
2722 {
58f7c7e0
TV
2723 case 'B':
2724 if (SYMBOL_REF_P (XEXP (x, 0)))
2725 switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
2726 {
2727 case DATA_AREA_GENERIC:
2728 /* Assume worst-case: global. */
2729 gcc_fallthrough (); /* FALLTHROUGH. */
2730 case DATA_AREA_GLOBAL:
2731 break;
2732 case DATA_AREA_SHARED:
2733 fputs (".cta", file);
2734 return;
2735 case DATA_AREA_LOCAL:
2736 case DATA_AREA_CONST:
2737 case DATA_AREA_PARAM:
2738 default:
2739 gcc_unreachable ();
2740 }
2741
2742 /* There are 2 cases where membar.sys differs from membar.gl:
2743 - host accesses global memory (f.i. systemwide atomics)
2744 - 2 or more devices are setup in peer-to-peer mode, and one
2745 peer can access global memory of other peer.
2746 Neither are currently supported by openMP/OpenACC on nvptx, but
2747 that could change, so we default to membar.sys. We could support
2748 this more optimally by adding DATA_AREA_SYS and then emitting
2749 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */
2750 fputs (".sys", file);
2751 return;
2752
738f2522 2753 case 'A':
9a863523 2754 x = XEXP (x, 0);
69cf7dec 2755 gcc_fallthrough (); /* FALLTHROUGH. */
7b8edc29 2756
9a863523
NS
2757 case 'D':
2758 if (GET_CODE (x) == CONST)
2759 x = XEXP (x, 0);
2760 if (GET_CODE (x) == PLUS)
2761 x = XEXP (x, 0);
7b8edc29 2762
9a863523
NS
2763 if (GET_CODE (x) == SYMBOL_REF)
2764 fputs (section_for_sym (x), file);
738f2522
BS
2765 break;
2766
738f2522 2767 case 't':
738f2522 2768 case 'u':
f313d112
NS
2769 if (x_code == SUBREG)
2770 {
1310ff03
TV
2771 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2772 if (VECTOR_MODE_P (inner_mode)
2773 && (GET_MODE_SIZE (mode)
2774 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2775 mode = GET_MODE_INNER (inner_mode);
2776 else if (split_mode_p (inner_mode))
2777 mode = maybe_split_mode (inner_mode);
2778 else
2779 mode = inner_mode;
f313d112
NS
2780 }
2781 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
738f2522
BS
2782 break;
2783
3717fbe3
TV
2784 case 'H':
2785 case 'L':
2786 {
2787 rtx inner_x = SUBREG_REG (x);
2788 machine_mode inner_mode = GET_MODE (inner_x);
2789 machine_mode split = maybe_split_mode (inner_mode);
2790
2791 output_reg (file, REGNO (inner_x), split,
2792 (code == 'H'
2793 ? GET_MODE_SIZE (inner_mode) / 2
2794 : 0));
2795 }
2796 break;
2797
d88cd9c4
NS
2798 case 'S':
2799 {
59263259
NS
2800 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2801 /* Same order as nvptx_shuffle_kind. */
d88cd9c4 2802 static const char *const kinds[] =
59263259
NS
2803 {".up", ".down", ".bfly", ".idx"};
2804 fputs (kinds[kind], file);
d88cd9c4
NS
2805 }
2806 break;
2807
738f2522 2808 case 'T':
f313d112 2809 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2810 break;
2811
2812 case 'j':
2813 fprintf (file, "@");
2814 goto common;
2815
2816 case 'J':
2817 fprintf (file, "@!");
2818 goto common;
2819
2820 case 'c':
f313d112 2821 mode = GET_MODE (XEXP (x, 0));
738f2522
BS
2822 switch (x_code)
2823 {
2824 case EQ:
2825 fputs (".eq", file);
2826 break;
2827 case NE:
f313d112 2828 if (FLOAT_MODE_P (mode))
738f2522
BS
2829 fputs (".neu", file);
2830 else
2831 fputs (".ne", file);
2832 break;
2833 case LE:
578fb225 2834 case LEU:
738f2522
BS
2835 fputs (".le", file);
2836 break;
2837 case GE:
578fb225 2838 case GEU:
738f2522
BS
2839 fputs (".ge", file);
2840 break;
2841 case LT:
578fb225 2842 case LTU:
738f2522
BS
2843 fputs (".lt", file);
2844 break;
2845 case GT:
738f2522 2846 case GTU:
578fb225 2847 fputs (".gt", file);
738f2522
BS
2848 break;
2849 case LTGT:
2850 fputs (".ne", file);
2851 break;
2852 case UNEQ:
2853 fputs (".equ", file);
2854 break;
2855 case UNLE:
2856 fputs (".leu", file);
2857 break;
2858 case UNGE:
2859 fputs (".geu", file);
2860 break;
2861 case UNLT:
2862 fputs (".ltu", file);
2863 break;
2864 case UNGT:
2865 fputs (".gtu", file);
2866 break;
2867 case UNORDERED:
2868 fputs (".nan", file);
2869 break;
2870 case ORDERED:
2871 fputs (".num", file);
2872 break;
2873 default:
2874 gcc_unreachable ();
2875 }
f313d112 2876 if (FLOAT_MODE_P (mode)
738f2522
BS
2877 || x_code == EQ || x_code == NE
2878 || x_code == GEU || x_code == GTU
2879 || x_code == LEU || x_code == LTU)
f313d112 2880 fputs (nvptx_ptx_type_from_mode (mode, true), file);
738f2522 2881 else
f313d112 2882 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
738f2522
BS
2883 break;
2884 default:
2885 common:
2886 switch (x_code)
2887 {
2888 case SUBREG:
f313d112
NS
2889 {
2890 rtx inner_x = SUBREG_REG (x);
2891 machine_mode inner_mode = GET_MODE (inner_x);
2892 machine_mode split = maybe_split_mode (inner_mode);
2893
1310ff03
TV
2894 if (VECTOR_MODE_P (inner_mode)
2895 && (GET_MODE_SIZE (mode)
2896 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2897 {
2898 output_reg (file, REGNO (inner_x), VOIDmode);
2899 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2900 }
2901 else if (split_mode_p (inner_mode)
f313d112
NS
2902 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2903 output_reg (file, REGNO (inner_x), split);
2904 else
2905 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2906 }
2907 break;
738f2522
BS
2908
2909 case REG:
f313d112 2910 output_reg (file, REGNO (x), maybe_split_mode (mode));
738f2522
BS
2911 break;
2912
2913 case MEM:
2914 fputc ('[', file);
f313d112 2915 nvptx_print_address_operand (file, XEXP (x, 0), mode);
738f2522
BS
2916 fputc (']', file);
2917 break;
2918
2919 case CONST_INT:
2920 output_addr_const (file, x);
2921 break;
2922
2923 case CONST:
2924 case SYMBOL_REF:
2925 case LABEL_REF:
2926 /* We could use output_addr_const, but that can print things like
2927 "x-8", which breaks ptxas. Need to ensure it is output as
2928 "x+-8". */
2929 nvptx_print_address_operand (file, x, VOIDmode);
2930 break;
2931
2932 case CONST_DOUBLE:
2933 long vals[2];
f313d112 2934 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
738f2522
BS
2935 vals[0] &= 0xffffffff;
2936 vals[1] &= 0xffffffff;
f313d112 2937 if (mode == SFmode)
738f2522
BS
2938 fprintf (file, "0f%08lx", vals[0]);
2939 else
2940 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2941 break;
2942
1310ff03
TV
2943 case CONST_VECTOR:
2944 {
2945 unsigned n = CONST_VECTOR_NUNITS (x);
2946 fprintf (file, "{ ");
2947 for (unsigned i = 0; i < n; ++i)
2948 {
2949 if (i != 0)
2950 fprintf (file, ", ");
2951
2952 rtx elem = CONST_VECTOR_ELT (x, i);
2953 output_addr_const (file, elem);
2954 }
2955 fprintf (file, " }");
2956 }
2957 break;
2958
738f2522
BS
2959 default:
2960 output_addr_const (file, x);
2961 }
2962 }
2963}
2964\f
/* Record replacement regs used to deal with subreg operands.  */
struct reg_replace
{
  /* Pool of pseudo registers of MODE, created lazily.  */
  rtx replacement[MAX_RECOG_OPERANDS];
  /* The mode every register in REPLACEMENT has.  */
  machine_mode mode;
  /* Number of registers created so far.  */
  int n_allocated;
  /* Number handed out for the insn currently being processed;
     reset to zero per insn so registers are reused across insns.  */
  int n_in_use;
};
2973
2974/* Allocate or reuse a replacement in R and return the rtx. */
2975
2976static rtx
2977get_replacement (struct reg_replace *r)
2978{
2979 if (r->n_allocated == r->n_in_use)
2980 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2981 return r->replacement[r->n_in_use++];
2982}
2983
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  /* One replacement pool per scalar integer mode we may need.  */
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip debug insns, asms, and USE/CLOBBER markers -- none of
	 these are real PTX instructions needing typed operands.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Reuse the pools afresh for each insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* Non-narrowing subregs on subregs_ok insns are fine as-is.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* Operand is read: copy the inner value into the replacement
	     before the insn, extending or truncating as needed.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* Operand is written: copy the replacement back into the
	     inner register after the insn, with the opposite conversion.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
738f2522 3073
5012919d
AM
3074/* Return a SImode "master lane index" register for uniform-simt, allocating on
3075 first use. */
3076
3077static rtx
3078nvptx_get_unisimt_master ()
3079{
3080 rtx &master = cfun->machine->unisimt_master;
3081 return master ? master : master = gen_reg_rtx (SImode);
3082}
3083
3084/* Return a BImode "predicate" register for uniform-simt, similar to above. */
3085
3086static rtx
3087nvptx_get_unisimt_predicate ()
3088{
3089 rtx &pred = cfun->machine->unisimt_predicate;
3090 return pred ? pred : pred = gen_reg_rtx (BImode);
3091}
3092
/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  /* The CALL (possibly wrapped in a SET of the return value) is the
     first element of the PARALLEL.  */
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  /* Only direct calls to named symbols can be syscalls.  */
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
     references with forced assembler name (leading '*') refer to PTX
     syscalls.  For vprintf, accept both normal and forced-assembler-name
     references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}
3117
3118/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3119 propagate its value from lane MASTER to current lane. */
3120
3121static void
3122nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3123{
3124 rtx reg;
3125 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
3126 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
3127}
3128
/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Only syscall-like calls and atomic PARALLEL insns need the
	 predicated/propagated treatment; skip everything else.  */
      if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	  && !(NONJUMP_INSN_P (insn)
	       && GET_CODE (PATTERN (insn)) == PARALLEL
	       && get_attr_atomic (insn)))
	continue;
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      /* For every register the insn sets, shuffle the master lane's
	 value across to the other lanes after the insn.  */
      for (int i = 0; i < XVECLEN (pat, 0); i++)
	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
      /* Wrap the insn in a COND_EXEC so only the predicated lane
	 actually executes it.  */
      rtx pred = nvptx_get_unisimt_predicate ();
      pred = gen_rtx_NE (BImode, pred, const0_rtx);
      pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
      validate_change (insn, &PATTERN (insn), pat, false);
    }
}
3156
2498a6ab
TV
/* Offloading function attributes.  */

struct offload_attrs
{
  /* Mask of partitioning dimensions in use (GOMP_DIM_* bits,
     presumably -- confirm against populate_offload_attrs).  */
  unsigned mask;
  int num_gangs;
  /* Worker count; 0 means not specified (see init_axis_dim).  */
  int num_workers;
  int vector_length;
};
3166
0d06e8c7
TV
/* Define entries for cfun->machine->axis_dim.  */

#define MACH_VECTOR_LENGTH 0
#define MACH_MAX_WORKERS 1

/* Defined later in this file; needed by init_axis_dim below.  */
static void populate_offload_attrs (offload_attrs *oa);
3173
3174static void
3175init_axis_dim (void)
3176{
3177 offload_attrs oa;
3178 int max_workers;
3179
3180 populate_offload_attrs (&oa);
3181
3182 if (oa.num_workers == 0)
3183 max_workers = PTX_CTA_SIZE / oa.vector_length;
3184 else
3185 max_workers = oa.num_workers;
3186
3187 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3188 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3189 cfun->machine->axis_dim_init_p = true;
3190}
3191
3192static int ATTRIBUTE_UNUSED
3193nvptx_mach_max_workers ()
3194{
3195 if (!cfun->machine->axis_dim_init_p)
3196 init_axis_dim ();
3197 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3198}
3199
3200static int ATTRIBUTE_UNUSED
3201nvptx_mach_vector_length ()
3202{
3203 if (!cfun->machine->axis_dim_init_p)
3204 init_axis_dim ();
3205 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3206}
3207
d2d47a28
NS
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* The forked/join marker insns inside the partition, and the
     fork/joining marker insns just outside it (the latter only set
     when shared-memory broadcast is needed; see nvptx_find_par).  */
  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
3250
/* Constructor links the new parallel into its parent's chain of
   children.  */

parallel::parallel (parallel *parent_, unsigned mask_)
  :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
  forked_block = join_block = 0;
  forked_insn = join_insn = 0;
  fork_insn = joining_insn = 0;

  /* Push ourselves onto the front of the parent's child list.  */
  if (parent)
    {
      next = parent->inner;
      parent->inner = this;
    }
}
3267
/* Destructor recursively frees all children (INNER) and all later
   siblings (NEXT).  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}
3273
/* Map of basic blocks to insns.  */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3280
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator.  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* Not a marker; remember the block has real insns
		 before any marker we find later.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      /* Re-seed BLOCK when we move to a different original block;
	 successive splits of the same block chain through E->DEST.  */
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block.  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
3356
0024c320
TV
3357/* Return true if MASK contains parallelism that requires shared
3358 memory to broadcast. */
3359
3360static bool
3361nvptx_needs_shared_bcast (unsigned mask)
3362{
3363 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3364 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3365 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3366
3367 return worker || large_vector;
3368}
3369
d88cd9c4
NS
/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  EXPECTED is the insn code the
   located instruction must have.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  /* Scan backwards from the end of the predecessor for the last real
     insn; it must exist before we reach the block head.  */
  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}
3388
/* Dump this parallel and all its inner parallels.  DEPTH is the
   nesting depth, used only for labelling the output.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	   depth, par->mask,
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, " blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");
  /* Children first at increased depth, then siblings at this depth.  */
  if (par->inner)
    nvptx_dump_pars (par->inner, depth + 1);

  if (par->next)
    nvptx_dump_pars (par->next, depth);
}
3412
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'.  */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    /* Only record the fork marker when shared-memory
	       broadcast will be needed.  */
	    if (nvptx_needs_shared_bcast (mask))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (par->mask == mask);
	    gcc_assert (par->join_block == NULL);
	    par->join_block = block;
	    par->join_insn = end;
	    if (nvptx_needs_shared_bcast (mask))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
3491
/* DFS walk the CFG looking for fork & join markers.  Construct
   loop structures as we go.  MAP is a mapping of basic blocks
   to head & tail markers, discovered when splitting blocks.  This
   speeds up the discovery.  We rely on the BB visited flag having
   been cleared when splitting blocks.  Returns the root parallel.  */
/* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'.  */

static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
  basic_block block;

  /* Mark exit blocks as visited.  */
  block = EXIT_BLOCK_PTR_FOR_FN (cfun);
  block->flags |= BB_VISITED;

  /* And entry block as not.  */
  block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  block->flags &= ~BB_VISITED;

  parallel *par = nvptx_find_par (map, 0, block);

  if (dump_file)
    {
      fprintf (dump_file, "\nLoops\n");
      nvptx_dump_pars (par, 0);
      fprintf (dump_file, "\n");
    }

  return par;
}
3523
912442c2
NS
3524/* Analyse a group of BBs within a partitioned region and create N
3525 Single-Entry-Single-Exit regions. Some of those regions will be
3526 trivial ones consisting of a single BB. The blocks of a
3527 partitioned region might form a set of disjoint graphs -- because
3528 the region encloses a differently partitioned sub region.
3529
3530 We use the linear time algorithm described in 'Finding Regions Fast:
3531 Single Entry Single Exit and control Regions in Linear Time'
3532 Johnson, Pearson & Pingali. That algorithm deals with complete
3533 CFGs, where a back edge is inserted from END to START, and thus the
3534 problem becomes one of finding equivalent loops.
3535
3536 In this case we have a partial CFG. We complete it by redirecting
3537 any incoming edge to the graph to be from an arbitrary external BB,
3538 and similarly redirecting any outgoing edge to be to that BB.
3539 Thus we end up with a closed graph.
3540
3541 The algorithm works by building a spanning tree of an undirected
3542 graph and keeping track of back edges from nodes further from the
3543 root in the tree to nodes nearer to the root in the tree. In the
3544 description below, the root is up and the tree grows downwards.
3545
3546 We avoid having to deal with degenerate back-edges to the same
3547 block, by splitting each BB into 3 -- one for input edges, one for
3548 the node itself and one for the output edges. Such back edges are
3549 referred to as 'Brackets'. Cycle equivalent nodes will have the
3550 same set of brackets.
3551
3552 Determining bracket equivalency is done by maintaining a list of
3553 brackets in such a manner that the list length and final bracket
3554 uniquely identify the set.
3555
3556 We use coloring to mark all BBs with cycle equivalency with the
3557 same color. This is the output of the 'Finding Regions Fast'
3558 algorithm. Notice it doesn't actually find the set of nodes within
3559 a particular region, just unordered sets of nodes that are the
3560 entries and exits of SESE regions.
3561
3562 After determining cycle equivalency, we need to find the minimal
3563 set of SESE regions. Do this with a DFS coloring walk of the
3564 complete graph. We're either 'looking' or 'coloring'. When
3565 looking, and we're in the subgraph, we start coloring the color of
3566 the current node, and remember that node as the start of the
3567 current color's SESE region. Every time we go to a new node, we
3568 decrement the count of nodes with that color. If it reaches zero,
3569 we remember that node as the end of the current color's SESE region
3570 and return to 'looking'. Otherwise we color the node the current
3571 color.
3572
3573 This way we end up with coloring the inside of non-trivial SESE
3574 regions with the color of that region. */
3575
/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;
3583
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back;  /* Back target.  */

  /* Current color and size of set.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color for a bracket set of LENGTH,
     allocating a fresh color (and counter slot) when the set size
     changed since the color was last computed.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};

typedef auto_vec<bracket> bracket_vec_t;
3614
/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent;  /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color.  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color from the current bracket set.  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
3662
/* Out-of-line destructor (the auto_vec member does the real work).  */

bb_sese::~bb_sese ()
{
}
3666
/* Destructively append CHILD's brackets.  */

void
bb_sese::append (bb_sese *child)
{
  if (int len = child->brackets.length ())
    {
      int ix;

      if (dump_file)
	{
	  for (ix = 0; ix < len; ix++)
	    {
	      const pseudo_node_t &pseudo = child->brackets[ix].back;
	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
		       child->node, pseudo.first ? pseudo.first->index : 0,
		       pseudo.second);
	    }
	}
      /* If we have no brackets of our own, steal the child's vector
	 wholesale; otherwise copy its elements onto ours.  */
      if (!brackets.length ())
	std::swap (brackets, child->brackets);
      else
	{
	  brackets.reserve (len);
	  for (ix = 0; ix < len; ix++)
	    brackets.quick_push (child->brackets[ix]);
	}
    }
}
3696
/* Remove brackets that terminate at PSEUDO.  */

void
bb_sese::remove (const pseudo_node_t &pseudo)
{
  unsigned removed = 0;
  int len = brackets.length ();

  /* Compact the vector in place, shifting survivors down over the
     removed entries.  */
  for (int ix = 0; ix < len; ix++)
    {
      if (brackets[ix].back == pseudo)
	{
	  if (dump_file)
	    fprintf (dump_file, "Removing backedge %d:%+d\n",
		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
	  removed++;
	}
      else if (removed)
	brackets[ix-removed] = brackets[ix];
    }
  /* Drop the now-unused tail entries.  */
  while (removed--)
    brackets.pop ();
}
3720
/* Accessors for BB's aux pointer, which holds this pass's bb_sese
   data (or NULL for blocks outside the subgraph).  */
#define BB_SET_SESE(B, S) ((B)->aux = (S))
#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3724
/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  Returns the next free node number.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  /* Already numbered; nothing to do.  */
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      /* DIR selects whether we follow successors or predecessors;
	 OFFSET picks the matching endpoint field out of the edge.  */
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}
3768
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  Implements one step of the bracket
   bookkeeping of the 'Finding Regions Fast' algorithm.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator ei;
  /* Highest (lowest-numbered) backedge target seen so far, and the
     pseudo node it corresponds to.  */
  int hi_back = depth;
  pseudo_node_t node_back (nullptr, depth);
  /* Likewise for the highest-reaching child.  */
  int hi_child = depth;
  pseudo_node_t node_child (nullptr, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (nullptr, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* Back edge to entry node.  */
	  sese->push (pseudo_node_t (nullptr, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (nullptr, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child.  */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child.  */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
3934
3935
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.

   COLOR_COUNTS holds, per color, the number of not-yet-visited blocks
   of that color; reaching zero identifies the final block of a region.
   COLORING < 0 means we are 'looking' (not inside a region);
   otherwise it is the color of the region being walked.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      /* Decrement the outstanding count for this block's color; when it
	 hits zero and matches the active color, the region ends here.  */
      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    /* A block with no successors must not be mid-region.  */
    gcc_assert (coloring < 0);
}
3992
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.

   Works in three phases: (1) number the blocks in DFS order and
   compute cycle-equivalence classes ('colors') via nvptx_sese_pseudo,
   (2) optionally dump the coloring, (3) walk the whole function from
   the entry block to pick out the entry/exit block pair of each
   color's region.  Per-block bb_sese scratch data is freed at the
   end.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block scratch structures allocated during the
     numbering walk.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
4139
4140#undef BB_SET_SESE
4141#undef BB_GET_SESE
4142
e91eba31
NS
/* Propagate live state at the start of a partitioned region.  IS_CALL
   indicates whether the propagation is for a (partitioned) call
   instruction.  BLOCK provides the live register information, and
   might not contain INSN.  Propagation is inserted just after INSN.  RW
   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   Returns true if we didn't emit any instructions.

   We propagate the live register set for non-calls and the entire
   frame for calls and non-calls.  We could do better by (a)
   propagating just the live set that is used within the partitioned
   regions and (b) only propagating stack entries that are used.  The
   latter might be quite hard to determine.  */

/* Callback type used by nvptx_propagate to emit the per-register (or
   per-frame-word) propagation code.  */
typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);

static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      empty = false;
      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      /* Build the copy loop in a detached sequence, then splice it in
	 after INSN.  */
      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data, vector));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      /* Clobber the scratch registers so later passes know their
	 values are dead here.  */
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  if (!is_call)
    /* Copy live registers.  */
    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
      {
	rtx reg = regno_reg_rtx[ix];

	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	  {
	    rtx bcast = fn (reg, rw, 0, data, vector);

	    insn = emit_insn_after (bcast, insn);
	    empty = false;
	  }
      }
  return empty;
}
4245
a0b3b5c4 4246/* Worker for nvptx_warp_propagate. */
d88cd9c4
NS
4247
4248static rtx
a0b3b5c4
TV
4249warp_prop_gen (rtx reg, propagate_mask pm,
4250 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4251 bool ARG_UNUSED (vector))
d88cd9c4
NS
4252{
4253 if (!(pm & PM_read_write))
4254 return 0;
4255
a0b3b5c4 4256 return nvptx_gen_warp_bcast (reg);
d88cd9c4
NS
4257}
4258
4259/* Propagate state that is live at start of BLOCK across the vectors
e91eba31
NS
4260 of a single warp. Propagation is inserted just after INSN.
4261 IS_CALL and return as for nvptx_propagate. */
d88cd9c4 4262
e91eba31 4263static bool
a0b3b5c4 4264nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
d88cd9c4 4265{
a0b3b5c4
TV
4266 return nvptx_propagate (is_call, block, insn, PM_read_write,
4267 warp_prop_gen, 0, false);
d88cd9c4
NS
4268}
4269
/* Worker for nvptx_shared_propagate.  Emit code to move REG through
   the shared-memory broadcast buffer described by DATA_ (a
   broadcast_data_t).  PM selects the phase: loop begin (set up the
   buffer pointer), loop end (retire it), or an actual read/write.  */

static rtx
shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
		 bool vector)
{
  broadcast_data_t *data = (broadcast_data_t *)data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.  */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;

      /* Grow the global buffer alignment requirement and align our
	 offset within the buffer accordingly.  */
      oacc_bcast_align = MAX (oacc_bcast_align, align);
      data->offset = ROUND_UP (data->offset, align);

      data->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
    }
  else if (pm & PM_loop_end)
    {
      /* Loop done: mark the pointer register dead and forget it.  */
      rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
      data->ptr = NULL_RTX;
      return clobber;
    }
  else
    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
4299
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  IS_CALL and return as for nvptx_propagate.

   Also accounts the bytes used in the shared broadcast buffer and,
   for vector-partitioned regions with multiple workers, switches the
   buffer base to a per-worker partition register.  */

static bool
nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
			rtx_insn *insn, bool vector)
{
  broadcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  bool empty = nvptx_propagate (is_call, block, insn,
				pre_p ? PM_read : PM_write, shared_prop_gen,
				&data, vector);
  /* Nothing emitted iff no buffer bytes were consumed.  */
  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      rtx bcast_sym = oacc_bcast_sym;

      /* Stuff was emitted, initialize the base pointer now.  */
      if (vector && nvptx_mach_max_workers () > 1)
	{
	  if (!cfun->machine->bcast_partition)
	    {
	      /* It would be nice to place this register in
		 DATA_AREA_SHARED.  */
	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
	    }
	  if (!cfun->machine->sync_bar)
	    cfun->machine->sync_bar = gen_reg_rtx (SImode);

	  bcast_sym = cfun->machine->bcast_partition;
	}

      rtx init = gen_rtx_SET (data.base, bcast_sym);
      emit_insn_after (init, insn);

      /* Size one partition, and the whole buffer as one partition per
	 worker (plus one) when vectors are wider than a warp.  */
      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			   ? nvptx_mach_max_workers () + 1
			   : 1);

      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
    }
  return empty;
}
4351
0024c320
TV
4352/* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4353 which is an integer or a register. THREADS is the number of threads
4354 controlled by the barrier. */
d88cd9c4
NS
4355
4356static rtx
0024c320 4357nvptx_cta_sync (rtx lock, int threads)
d88cd9c4 4358{
0024c320 4359 return gen_nvptx_barsync (lock, GEN_INT (threads));
d88cd9c4
NS
4360}
4361
43c371e8
TV
#if WORKAROUND_PTXJIT_BUG
/* Return first real insn in BB, or return NULL_RTX if BB does not contain
   real insns.  */

static rtx_insn *
bb_first_real_insn (basic_block bb)
{
  rtx_insn *cursor;

  /* Scan forward and stop at the first genuine instruction.  */
  FOR_BB_INSNS (bb, cursor)
    {
      if (INSN_P (cursor))
	return cursor;
    }

  return NULL;
}
#endif
4379
6beefdbd
TV
4380/* Return true if INSN needs neutering. */
4381
4382static bool
4383needs_neutering_p (rtx_insn *insn)
4384{
4385 if (!INSN_P (insn))
4386 return false;
4387
4388 switch (recog_memoized (insn))
4389 {
4390 case CODE_FOR_nvptx_fork:
4391 case CODE_FOR_nvptx_forked:
4392 case CODE_FOR_nvptx_joining:
4393 case CODE_FOR_nvptx_join:
4394 case CODE_FOR_nvptx_barsync:
4395 return false;
4396 default:
4397 return true;
4398 }
4399}
4400
a874808c
TV
/* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.

   Walks forward from the head of FROM (possibly following a single
   unconditional successor chain before any neutering jump is seen),
   checking that jumps open and labels close their neutered spans in
   the right order, and that no barrier sits inside a neutered span.
   Returns true if any of the labels were seen during the walk, in
   which case the caller need not check the TO block.  */

static bool
verify_neutering_jumps (basic_block from,
			rtx_insn *vector_jump, rtx_insn *worker_jump,
			rtx_insn *vector_label, rtx_insn *worker_label)
{
  basic_block bb = from;
  rtx_insn *insn = BB_HEAD (bb);
  bool seen_worker_jump = false;
  bool seen_vector_jump = false;
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  bool worker_neutered = false;
  bool vector_neutered = false;
  while (true)
    {
      if (insn == worker_jump)
	{
	  seen_worker_jump = true;
	  worker_neutered = true;
	  /* The vector span must nest inside the worker span, so it
	     cannot already be open here.  */
	  gcc_assert (!vector_neutered);
	}
      else if (insn == vector_jump)
	{
	  seen_vector_jump = true;
	  vector_neutered = true;
	}
      else if (insn == worker_label)
	{
	  seen_worker_label = true;
	  gcc_assert (worker_neutered);
	  worker_neutered = false;
	}
      else if (insn == vector_label)
	{
	  seen_vector_label = true;
	  gcc_assert (vector_neutered);
	  vector_neutered = false;
	}
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    /* A barrier inside a neutered span would deadlock.  */
	    gcc_assert (!vector_neutered && !worker_neutered);
	    break;
	  default:
	    break;
	  }

      if (insn != BB_END (bb))
	insn = NEXT_INSN (insn);
      else if (JUMP_P (insn) && single_succ_p (bb)
	       && !seen_vector_jump && !seen_worker_jump)
	{
	  /* Follow a trivial fallthrough chain until we meet the
	     first neutering jump.  */
	  bb = single_succ (bb);
	  insn = BB_HEAD (bb);
	}
      else
	break;
    }

  gcc_assert (!(vector_jump && !seen_vector_jump));
  gcc_assert (!(worker_jump && !seen_worker_jump));

  if (seen_vector_label || seen_worker_label)
    {
      gcc_assert (!(vector_label && !seen_vector_label));
      gcc_assert (!(worker_label && !seen_worker_label));

      return true;
    }

  return false;
}
4476
/* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.

   Walks backward from the end of TO, checking that the vector label
   appears before (i.e. is found after, scanning backwards) the worker
   label, and that any barrier precedes both labels.  */

static void
verify_neutering_labels (basic_block to, rtx_insn *vector_label,
			 rtx_insn *worker_label)
{
  basic_block bb = to;
  rtx_insn *insn = BB_END (bb);
  bool seen_worker_label = false;
  bool seen_vector_label = false;
  while (true)
    {
      if (insn == worker_label)
	{
	  seen_worker_label = true;
	  /* Scanning backwards, the worker label must come before the
	     vector label.  */
	  gcc_assert (!seen_vector_label);
	}
      else if (insn == vector_label)
	seen_vector_label = true;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_barsync:
	    gcc_assert (!seen_vector_label && !seen_worker_label);
	    break;
	  }

      if (insn != BB_HEAD (bb))
	insn = PREV_INSN (insn);
      else
	break;
    }

  gcc_assert (!(vector_label && !seen_vector_label));
  gcc_assert (!(worker_label && !seen_worker_label));
}
4513
d88cd9c4
NS
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  while (true)
    {
      /* Find first insn of from block.  */
      while (head != BB_END (from) && !needs_neutering_p (head))
	head = NEXT_INSN (head);

      if (from == to)
	break;

      /* Skip over a block that only falls through to a single
	 single-predecessor successor.  */
      if (!(JUMP_P (head) && single_succ_p (from)))
	break;

      basic_block jump_target = single_succ (from);
      if (!single_pred_p (jump_target))
	break;

      from = jump_target;
      head = BB_HEAD (from);
    }

  /* Find last insn of to block.  */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !needs_neutering_p (head))
	return;

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  rtx_insn *neuter_start = NULL;
  rtx_insn *worker_label = NULL, *vector_label = NULL;
  rtx_insn *worker_jump = NULL, *vector_jump = NULL;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
	rtx_insn **mode_jump
	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
	rtx_insn **mode_label
	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;

	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	/* Chain the skip branches so the vector test nests inside
	   the worker test.  */
	if (neuter_start)
	  neuter_start = emit_insn_after (br, neuter_start);
	else
	  neuter_start = emit_insn_before (br, head);
	*mode_jump = neuter_start;

	LABEL_NUSES (label)++;
	rtx_insn *label_insn;
	if (tail_branch)
	  {
	    label_insn = emit_label_before (label, before);
	    before = label_insn;
	  }
	else
	  {
	    label_insn = emit_label_after (label, tail);
	    /* A skipped no-return call would otherwise fall off the
	       end; terminate those threads explicitly.  */
	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
	      emit_insn_after (gen_exit (), label_insn);
	  }

	*mode_label = label_insn;
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
	{
	  /* Vector mode only, do a shuffle.  */
#if WORKAROUND_PTXJIT_BUG
	  /* The branch condition %rcond is propagated like this:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		Lskip:
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;

	     There seems to be a bug in the ptx JIT compiler (observed at driver
	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
	     unless %rcond is initialized to something before 'bra Lskip'.  The
	     bug is not observed with ptxas from cuda 8.0.61.

	     It is true that the code is non-trivial: at Lskip, %rcond is
	     uninitialized in threads 1-31, and after the selp the same holds
	     for %rcondu32.  But shfl propagates the defined value in thread 0
	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
	     0-31, and after the setp.ne %rcond is defined in threads 0-31.

	     There is nothing in the PTX spec to suggest that this is wrong, or
	     to explain why the extra initialization is needed.  So, we classify
	     it as a JIT bug, and the extra initialization as workaround:

		{
		    .reg .u32 %x;
		    mov.u32 %x,%tid.x;
		    setp.ne.u32 %rnotvzero,%x,0;
		}

		+.reg .pred %rcond2;
		+setp.eq.u32 %rcond2, 1, 0;

		@%rnotvzero bra Lskip;
		setp.<op>.<type> %rcond,op1,op2;
		+mov.pred %rcond2, %rcond;
		Lskip:
		+mov.pred %rcond, %rcond2;
		selp.u32 %rcondu32,1,0,%rcond;
		shfl.idx.b32 %rcondu32,%rcondu32,0,31;
		setp.ne.u32 %rcond,%rcondu32,0;
	  */
	  rtx_insn *label = PREV_INSN (tail);
	  gcc_assert (label && LABEL_P (label));
	  rtx tmp = gen_reg_rtx (BImode);
	  emit_insn_before (gen_movbi (tmp, const0_rtx),
			    bb_first_real_insn (from));
	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  broadcast_data_t data;
	  unsigned size = GET_MODE_SIZE (SImode);
	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
	  rtx barrier = GEN_INT (0);
	  int threads = 0;

	  data.base = oacc_bcast_sym;
	  data.ptr = 0;

	  bool use_partitioning_p = (vector && !worker
				     && nvptx_mach_max_workers () > 1
				     && cfun->machine->bcast_partition);
	  if (use_partitioning_p)
	    {
	      data.base = cfun->machine->bcast_partition;
	      barrier = cfun->machine->sync_bar;
	      threads = nvptx_mach_vector_length ();
	    }
	  gcc_assert (data.base != NULL);
	  gcc_assert (barrier);

	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
			       ? nvptx_mach_max_workers () + 1
			       : 1);

	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
						    vector),
			    before);

	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
						    vector),
			    tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
	}

      /* Mark the branch unified so it is emitted as a uniform branch.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }

  bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
					    vector_label, worker_label);
  if (!seen_label)
    verify_neutering_labels (to, vector_label, worker_label);
}
4776
4777/* PAR is a parallel that is being skipped in its entirety according to
4778 MASK. Treat this as skipping a superblock starting at forked
4779 and ending at joining. */
4780
4781static void
4782nvptx_skip_par (unsigned mask, parallel *par)
4783{
4784 basic_block tail = par->join_block;
4785 gcc_assert (tail->preds->length () == 1);
4786
4787 basic_block pre_tail = (*tail->preds)[0]->src;
4788 gcc_assert (pre_tail->succs->length () == 1);
4789
4790 nvptx_single (mask, par->forked_block, pre_tail);
4791}
4792
dba619f3
NS
/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.

   All preconditions are checked with early returns; only when every
   one holds is the inner parallel's mask and block list merged into
   PAR and the inner structure deleted.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  /* Transfer the inner par's blocks into ours.  */
  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  /* Adopt the inner par's children before deleting it.  */
  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
4855
d88cd9c4
NS
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.

   Recurses into inner parallels first, then emits state propagation
   (spill/fill or warp broadcast) and CTA synchronization barriers
   around this parallel, then processes sibling parallels.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
		       && nvptx_mach_vector_length () > PTX_WARP_SIZE);

  if (worker || large_vector)
    {
      /* Fill after the fork, spill before it.  */
      nvptx_shared_propagate (false, is_call, par->forked_block,
			      par->forked_insn, !worker);
      bool no_prop_p
	= nvptx_shared_propagate (true, is_call, par->forked_block,
				  par->fork_insn, !worker);
      bool empty_loop_p
	= !is_call && (NEXT_INSN (par->forked_insn)
		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
      rtx barrier = GEN_INT (0);
      int threads = 0;

      if (!worker && cfun->machine->sync_bar)
	{
	  barrier = cfun->machine->sync_bar;
	  threads = nvptx_mach_vector_length ();
	}

      /* Skip the barriers when nothing was propagated and the region
	 is empty or a call.  */
      if (no_prop_p && empty_loop_p)
	;
      else if (no_prop_p && is_call)
	;
      else
	{
	  /* Insert begin and end synchronizations.  */
	  emit_insn_before (nvptx_cta_sync (barrier, threads),
			    par->forked_insn);
	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
4919
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  /* The worker/vector axes this parallel itself partitions.  */
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* Decide, per axis, whether to neuter individual blocks, skip the
     whole parallel, or leave it to the parent.  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
4996
/* Fill in *OA from the "oacc function" attribute of the current
   function: the per-axis launch sizes (gang, worker, vector) and a
   mask of the axes that may be partitioned.  A size of -1 means the
   dimension is unset.  An axis is excluded from the mask only when its
   size is 1 or it is explicitly disallowed (TREE_PURPOSE is zero).  */

static void
populate_offload_attrs (offload_attrs *oa)
{
  tree attr = oacc_get_fn_attrib (current_function_decl);
  tree dims = TREE_VALUE (attr);
  unsigned ix;

  oa->mask = 0;

  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
    {
      tree t = TREE_VALUE (dims);
      int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
      tree allowed = TREE_PURPOSE (dims);

      if (size != 1 && !(allowed && integer_zerop (allowed)))
	oa->mask |= GOMP_DIM_MASK (ix);

      switch (ix)
	{
	case GOMP_DIM_GANG:
	  oa->num_gangs = size;
	  break;

	case GOMP_DIM_WORKER:
	  oa->num_workers = size;
	  break;

	case GOMP_DIM_VECTOR:
	  oa->vector_length = size;
	  break;
	}
    }
}
5031
3dede32b
TV
5032#if WORKAROUND_PTXJIT_BUG_2
5033/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5034 is needed in the nvptx target because the branches generated for
5035 parititioning are NONJUMP_INSN_P, not JUMP_P. */
5036
5037static rtx
5038nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5039{
5040 rtx pat;
5041 if ((strict && !JUMP_P (insn))
5042 || (!strict && !INSN_P (insn)))
5043 return NULL_RTX;
5044 pat = PATTERN (insn);
5045
5046 /* The set is allowed to appear either as the insn pattern or
5047 the first set in a PARALLEL. */
5048 if (GET_CODE (pat) == PARALLEL)
5049 pat = XVECEXP (pat, 0, 0);
5050 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5051 return pat;
5052
5053 return NULL_RTX;
5054}
5055
5056/* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5057
5058static rtx
5059nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5060{
5061 rtx x = nvptx_pc_set (insn, strict);
5062
5063 if (!x)
5064 return NULL_RTX;
5065 x = SET_SRC (x);
5066 if (GET_CODE (x) == LABEL_REF)
5067 return x;
5068 if (GET_CODE (x) != IF_THEN_ELSE)
5069 return NULL_RTX;
5070 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5071 return XEXP (x, 1);
5072 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5073 return XEXP (x, 2);
5074 return NULL_RTX;
5075}
5076
/* Insert a dummy ptx insn when encountering a branch to a label with no ptx
   insn inbetween the branch and the label.  This works around a JIT bug
   observed at driver version 384.111, at -O0 for sm_50.  */

static void
prevent_branch_around_nothing (void)
{
  /* Label of the last conditional branch seen with nothing "real"
     emitted since; NULL once the branch has been proven non-empty.  */
  rtx_insn *seen_label = NULL;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && condjump_p (insn))
	{
	  /* Start (or restart) tracking at this branch's target.  */
	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
	  continue;
	}

      if (seen_label == NULL)
	continue;

      /* Notes and debug insns produce no ptx output; keep scanning.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;

      if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  /* Fork/join markers emit no ptx either.  */
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    /* A real insn: the branch skips over something.  */
	    seen_label = NULL;
	    continue;
	  }

      /* We reached the branch target without emitting anything: pad
	 with a fake nop so the JIT sees a non-empty branch body.  */
      if (LABEL_P (insn) && insn == seen_label)
	emit_insn_before (gen_fake_nop (), insn);

      seen_label = NULL;
    }
 }
5118#endif
5119
21251395
TV
5120#ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns inbetween two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48 for sm_50.  */

static void
workaround_barsyncs (void)
{
  /* True while the previous ptx-emitting insn was a bar.sync.  */
  bool seen_barsync = false;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
	{
	  if (seen_barsync)
	    {
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	    }

	  seen_barsync = true;
	  continue;
	}

      if (!seen_barsync)
	continue;

      /* Notes, debug insns and fork/join markers emit no ptx, so two
	 bar.syncs separated only by these still count as adjacent.  */
      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    break;
	  }

      seen_barsync = false;
    }
}
5162#endif
5163
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
   unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function  (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, oa.mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
5247\f
5248/* Handle a "kernel" attribute; arguments as in
5249 struct attribute_spec.handler. */
5250
5251static tree
5252nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5253 int ARG_UNUSED (flags), bool *no_add_attrs)
5254{
5255 tree decl = *node;
5256
5257 if (TREE_CODE (decl) != FUNCTION_DECL)
5258 {
5259 error ("%qE attribute only applies to functions", name);
5260 *no_add_attrs = true;
5261 }
b49e35a9 5262 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
5263 {
5264 error ("%qE attribute requires a void return type", name);
5265 *no_add_attrs = true;
5266 }
5267
5268 return NULL_TREE;
5269}
5270
5012919d
AM
5271/* Handle a "shared" attribute; arguments as in
5272 struct attribute_spec.handler. */
5273
5274static tree
5275nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5276 int ARG_UNUSED (flags), bool *no_add_attrs)
5277{
5278 tree decl = *node;
5279
5280 if (TREE_CODE (decl) != VAR_DECL)
5281 {
5282 error ("%qE attribute only applies to variables", name);
5283 *no_add_attrs = true;
5284 }
5285 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5286 {
5287 error ("%qE attribute not allowed with auto storage class", name);
5288 *no_add_attrs = true;
5289 }
5290
5291 return NULL_TREE;
5292}
5293
/* Table of valid machine attributes.  Both attributes apply to a
   declaration (decl_req), take no arguments, and do not affect type
   identity.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
    NULL },
  { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
    NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};
5305\f
5306/* Limit vector alignments to BIGGEST_ALIGNMENT. */
5307
5308static HOST_WIDE_INT
5309nvptx_vector_alignment (const_tree type)
5310{
26b538cd
RS
5311 unsigned HOST_WIDE_INT align;
5312 tree size = TYPE_SIZE (type);
738f2522 5313
26b538cd
RS
5314 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5315 if (tree_fits_uhwi_p (size))
5316 {
5317 align = tree_to_uhwi (size);
5318 align = MIN (align, BIGGEST_ALIGNMENT);
5319 }
5320 else
5321 align = BIGGEST_ALIGNMENT;
5322
5323 /* Ensure align is not smaller than mode alignment. */
5324 align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5325
5326 return align;
738f2522 5327}
d88cd9c4
NS
5328
5329/* Indicate that INSN cannot be duplicated. */
5330
5331static bool
5332nvptx_cannot_copy_insn_p (rtx_insn *insn)
5333{
5334 switch (recog_memoized (insn))
5335 {
5336 case CODE_FOR_nvptx_shufflesi:
5337 case CODE_FOR_nvptx_shufflesf:
5338 case CODE_FOR_nvptx_barsync:
5339 case CODE_FOR_nvptx_fork:
5340 case CODE_FOR_nvptx_forked:
5341 case CODE_FOR_nvptx_joining:
5342 case CODE_FOR_nvptx_join:
5343 return true;
5344 default:
5345 return false;
5346 }
5347}
/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  /* Unconditionally refuse anchoring for every symbol.  */
  return false;
}
738f2522 5361\f
/* Record a symbol for mkoffload to enter into the mapping table.
   Emits "//:VAR_MAP" / "//:FUNC_MAP" comment lines into the assembly
   output; for functions the OpenACC launch dimensions (if any) are
   appended as hex values.  NOTE(review): the exact line format is a
   contract with mkoffload — do not change it here.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
5399
738f2522
BS
5400/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5401 at the start of a file. */
5402
5403static void
5404nvptx_file_start (void)
5405{
5406 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
eede2498
TV
5407 if (TARGET_PTX_7_0)
5408 fputs ("\t.version\t7.0\n", asm_out_file);
5409 else if (TARGET_PTX_6_3)
2a158640
TV
5410 fputs ("\t.version\t6.3\n", asm_out_file);
5411 else
5412 fputs ("\t.version\t3.1\n", asm_out_file);
308d688b
RS
5413 if (TARGET_SM80)
5414 fputs ("\t.target\tsm_80\n", asm_out_file);
5415 else if (TARGET_SM75)
5416 fputs ("\t.target\tsm_75\n", asm_out_file);
5417 else if (TARGET_SM53)
aeedb00a
RS
5418 fputs ("\t.target\tsm_53\n", asm_out_file);
5419 else if (TARGET_SM35)
53ceb8b5
CP
5420 fputs ("\t.target\tsm_35\n", asm_out_file);
5421 else
5422 fputs ("\t.target\tsm_30\n", asm_out_file);
738f2522
BS
5423 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5424 fputs ("// END PREAMBLE\n", asm_out_file);
5425}
5426
a0b3b5c4
TV
5427/* Emit a declaration for a worker and vector-level buffer in .shared
5428 memory. */
15ab6f00
NS
5429
5430static void
a0b3b5c4 5431write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
15ab6f00
NS
5432{
5433 const char *name = XSTR (sym, 0);
5434
5435 write_var_marker (file, true, false, name);
5436 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5437 align, name, size);
5438}
5439
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  Also emits the shared-memory reduction
   buffers and the external softstack/uniform-SIMT arrays when they
   were used during compilation.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  /* Each buffer is emitted only if some function required it (size > 0).  */
  if (oacc_bcast_size)
    write_shared_buffer (asm_out_file, oacc_bcast_sym,
			 oacc_bcast_align, oacc_bcast_size);

  if (worker_red_size)
    write_shared_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (vector_red_size)
    write_shared_buffer (asm_out_file, vector_red_sym,
			 vector_red_align, vector_red_size);

  if (gang_private_shared_size)
    write_shared_buffer (asm_out_file, gang_private_shared_sym,
			 gang_private_shared_align, gang_private_shared_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
         external declaration, emit the array size explicitly; otherwise, it
         may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
5483
/* Expander for the shuffle builtins.  EXP carries three arguments:
   the source value, the lane index, and the shuffle kind (see
   nvptx_shuffle_kind).  Emits a warp shuffle insn writing TARGET.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  /* The index may stay a constant; anything else is forced to a reg.  */
  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}
5512
f881693c
TV
5513const char *
5514nvptx_output_red_partition (rtx dst, rtx offset)
5515{
5516 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5517 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5518
5519 if (offset == const0_rtx)
5520 fprintf (asm_out_file, zero_offset, REGNO (dst),
5521 REGNO (cfun->machine->red_partition));
5522 else
5523 fprintf (asm_out_file, with_offset, REGNO (dst),
5524 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5525
5526 return "";
5527}
5528
/* Shared-memory reduction address expander.  EXP's arguments are
   (offset, size, align).  For the worker buffer (VECTOR == 0) the
   result is a constant address into worker_red_sym; for the vector
   buffer (VECTOR != 0) a per-worker partition address is computed at
   runtime via the nvptx_red_partition insn.  As a side effect the
   file-scope buffer size/alignment maxima are updated.  */

static rtx
nvptx_expand_shared_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore,
			  int vector)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  rtx addr = worker_red_sym;

  if (vector)
    {
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* One partition per worker, each padded out to ALIGN.  */
      unsigned int psize = ROUND_UP (size + offset, align);
      unsigned int pnum = nvptx_mach_max_workers ();
      vector_red_partition = MAX (vector_red_partition, psize);
      vector_red_size = MAX (vector_red_size, psize * pnum);
      vector_red_align = MAX (vector_red_align, align);

      if (cfun->machine->red_partition == NULL)
	cfun->machine->red_partition = gen_reg_rtx (Pmode);

      addr = gen_reg_rtx (Pmode);
      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
    }
  else
    {
      worker_red_align = MAX (worker_red_align, align);
      worker_red_size = MAX (worker_red_size, size + offset);

      if (offset)
	{
	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
	  addr = gen_rtx_CONST (Pmode, addr);
	}
    }

  emit_move_insn (target, addr);
  return target;
}
5577
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  EXP's arguments are (ptr, expected, new);
   the result is the value previously stored at *ptr.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Pick the SImode or DImode pattern according to the builtin's type.  */
  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
5614
5615
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* 32-bit warp shuffle.  */
  NVPTX_BUILTIN_SHUFFLELL,	/* 64-bit warp shuffle.  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* Worker reduction buffer address.  */
  NVPTX_BUILTIN_VECTOR_ADDR,	/* Vector reduction buffer address.  */
  NVPTX_BUILTIN_CMP_SWAP,	/* 32-bit compare-and-swap.  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* 64-bit compare-and-swap.  */
  NVPTX_BUILTIN_MAX
};

/* Builtin FUNCTION_DECLs, indexed by the codes above; GC-rooted.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5629
5630/* Return the NVPTX builtin for CODE. */
5631
5632static tree
5633nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5634{
5635 if (code >= NVPTX_BUILTIN_MAX)
5636 return error_mark_node;
5637
5638 return nvptx_builtin_decls[code];
5639}
5640
/* Set up all builtin functions for this target.  The DEF macro
   registers one "__builtin_nvptx_" function; the single-letter macros
   abbreviate the argument/return types in the lists below.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (VECTOR_ADDR, "vector_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
5671
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.
   Dispatches on the builtin code to the dedicated expanders above.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_MD_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, false);

    case NVPTX_BUILTIN_VECTOR_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, true);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
f3552158 5702
/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_WARP_SIZE;
}
5710
9ba66bf5
JJ
5711/* Return 1 if TRAIT NAME is present in the OpenMP context's
5712 device trait set, return 0 if not present in any OpenMP context in the
5713 whole translation unit, or -1 if not present in the current OpenMP context
5714 but might be present in another OpenMP context in the same TU. */
5715
5716int
5717nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5718 const char *name)
5719{
5720 switch (trait)
5721 {
5722 case omp_device_kind:
5723 return strcmp (name, "gpu") == 0;
5724 case omp_device_arch:
5725 return strcmp (name, "nvptx") == 0;
5726 case omp_device_isa:
5727 if (strcmp (name, "sm_30") == 0)
5728 return !TARGET_SM35;
5729 if (strcmp (name, "sm_35") == 0)
aeedb00a
RS
5730 return TARGET_SM35 && !TARGET_SM53;
5731 if (strcmp (name, "sm_53") == 0)
5732 return TARGET_SM53;
9ba66bf5
JJ
5733 return 0;
5734 default:
5735 gcc_unreachable ();
5736 }
5737}
5738
/* Return true if L is a well-formed vector length for this target,
   i.e. a (positive) multiple of the warp size.  */

static bool
nvptx_welformed_vector_length_p (int l)
{
  gcc_assert (l > 0);
  return l % PTX_WARP_SIZE == 0;
}
5745
/* Clamp the launch dimensions in DIMS to the target's hardware limits:
   maximum vector length, maximum worker count, CTA size, and the
   number of available per-worker barriers.  Unset (<= 0) dimensions
   are left alone.  */

static void
nvptx_apply_dim_limits (int dims[])
{
  /* Check that the vector_length is not too large.  */
  if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
    dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;

  /* Check that the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;

  /* Ensure that num_worker * vector_length <= cta size.  */
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
    dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

  /* If we need a per-worker barrier ... .  */
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
    /* Don't use more barriers than available.  */
    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
				 PTX_NUM_PER_WORKER_BARRIERS);
}
5769
/* Return true if FNDECL contains calls to vector-partitionable routines.
   A routine is vector-partitionable when its OpenACC partitioning
   level is below GOMP_DIM_MAX (i.e. it is not a "seq" routine).
   NOTE(review): a callee without an oacc attribute terminates the scan
   with false rather than being skipped — confirm this is intended.  */

static bool
has_vector_partitionable_routine_calls_p (tree fndecl)
{
  if (!fndecl)
    return false;

  basic_block bb;
  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
	 gsi_next_nondebug (&i))
      {
	gimple *stmt = gsi_stmt (i);
	if (gimple_code (stmt) != GIMPLE_CALL)
	  continue;

	tree callee = gimple_call_fndecl (stmt);
	if (!callee)
	  continue;

	tree attrs = oacc_get_fn_attrib (callee);
	if (attrs == NULL_TREE)
	  return false;

	int partition_level = oacc_fn_attrib_level (attrs);
	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
	if (!seq_routine_p)
	  return true;
      }

  return false;
}
5803
/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
   DIMS has changed.  */

static void
nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
{
  /* Classify the call site: default dims, minimum dims, an offloaded
     region, or an OpenACC routine (possibly "seq").  */
  bool oacc_default_dims_p = false;
  bool oacc_min_dims_p = false;
  bool offload_region_p = false;
  bool routine_p = false;
  bool routine_seq_p = false;
  int default_vector_length = -1;

  if (decl == NULL_TREE)
    {
      if (fn_level == -1)
	oacc_default_dims_p = true;
      else if (fn_level == -2)
	oacc_min_dims_p = true;
      else
	gcc_unreachable ();
    }
  else if (fn_level == -1)
    offload_region_p = true;
  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
    {
      routine_p = true;
      routine_seq_p = fn_level == GOMP_DIM_MAX;
    }
  else
    gcc_unreachable ();

  if (oacc_min_dims_p)
    {
      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
      gcc_assert (dims[GOMP_DIM_GANG] == 1);

      /* The minimum vector length on this target is a full warp.  */
      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
      return;
    }

  if (routine_p)
    {
      if (!routine_seq_p)
	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

      return;
    }

  if (oacc_default_dims_p)
    {
      /* -1  : not set
	  0  : set at runtime, f.i. -fopenacc-dims=-
	 >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);

      /* But -fopenacc-dims=- is not yet supported on trunk.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
      gcc_assert (dims[GOMP_DIM_GANG] != 0);
    }

  if (offload_region_p)
    {
      /* -1  : not set
	  0  : set using variable, f.i. num_gangs (n)
	 >= 1: set using constant, f.i. num_gangs (1).  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
    }

  if (offload_region_p)
    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
  else
    /* oacc_default_dims_p.  */
    default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;

  /* Remember the incoming dims so we can warn about what we changed.  */
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;
  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  const char *vector_reason = NULL;
  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
    {
      /* Vector-partitionable routines assume a warp-sized vector.  */
      default_vector_length = PTX_WARP_SIZE;

      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
	{
	  vector_reason = G_("using vector_length (%d) due to call to"
			     " vector-partitionable routine, ignoring %d");
	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
	}
    }

  if (dims[GOMP_DIM_VECTOR] == 0)
    {
      vector_reason = G_("using vector_length (%d), ignoring runtime setting");
      dims[GOMP_DIM_VECTOR] = default_vector_length;
    }

  if (dims[GOMP_DIM_VECTOR] > 0
      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
    dims[GOMP_DIM_VECTOR] = default_vector_length;

  nvptx_apply_dim_limits (dims);

  /* Warn once per changed axis, with the most specific reason.  */
  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		vector_reason != NULL
		? vector_reason
		: G_("using vector_length (%d), ignoring %d"),
		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);

  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		G_("using num_workers (%d), ignoring %d"),
		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);

  if (oacc_default_dims_p)
    {
      if (dims[GOMP_DIM_VECTOR] < 0)
	dims[GOMP_DIM_VECTOR] = default_vector_length;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      nvptx_apply_dim_limits (dims);
    }

  if (offload_region_p)
    {
      for (i = 0; i < GOMP_DIM_MAX; i++)
	{
	  if (!(dims[i] < 0))
	    continue;

	  if ((used & GOMP_DIM_MASK (i)) == 0)
	    /* Function oacc_validate_dims will apply the minimal dimension.  */
	    continue;

	  dims[i] = (i == GOMP_DIM_VECTOR
		     ? default_vector_length
		     : oacc_get_default_dim (i));
	}

      nvptx_apply_dim_limits (dims);
    }
}
5957
5958/* Validate compute dimensions of an OpenACC offload or routine, fill
5959 in non-unity defaults. FN_LEVEL indicates the level at which a
5960 routine might spawn a loop. It is negative for non-routines. If
5961 DECL is null, we are validating the default dimensions. */
5962
5963static bool
46dedae6 5964nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5f57c8f3
TV
5965{
5966 int old_dims[GOMP_DIM_MAX];
5967 unsigned int i;
b6adbb9f 5968
5f57c8f3
TV
5969 for (i = 0; i < GOMP_DIM_MAX; ++i)
5970 old_dims[i] = dims[i];
5971
46dedae6 5972 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5f57c8f3 5973
21fbea5c
TV
5974 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5975 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5976 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5977
5f57c8f3
TV
5978 for (i = 0; i < GOMP_DIM_MAX; ++i)
5979 if (old_dims[i] != dims[i])
5980 return true;
5981
5982 return false;
94829f87 5983}
d88cd9c4 5984
bd751975
NS
5985/* Return maximum dimension size, or zero for unbounded. */
5986
5987static int
5988nvptx_dim_limit (int axis)
5989{
5990 switch (axis)
5991 {
bd751975 5992 case GOMP_DIM_VECTOR:
764ecad4 5993 return PTX_MAX_VECTOR_LENGTH;
bd751975
NS
5994
5995 default:
5996 break;
5997 }
5998 return 0;
5999}
6000
d88cd9c4
NS
6001/* Determine whether fork & joins are needed. */
6002
6003static bool
6004nvptx_goacc_fork_join (gcall *call, const int dims[],
6005 bool ARG_UNUSED (is_fork))
6006{
6007 tree arg = gimple_call_arg (call, 2);
6008 unsigned axis = TREE_INT_CST_LOW (arg);
6009
6010 /* We only care about worker and vector partitioning. */
6011 if (axis < GOMP_DIM_WORKER)
6012 return false;
6013
6014 /* If the size is 1, there's no partitioning. */
6015 if (dims[axis] == 1)
6016 return false;
6017
6018 return true;
6019}
6020
f3552158
NS
6021/* Generate a PTX builtin function call that returns the address in
6022 the worker reduction buffer at OFFSET. TYPE is the type of the
6023 data at that location. */
6024
6025static tree
f881693c 6026nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
f3552158 6027{
f881693c
TV
6028 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
6029 if (vector)
6030 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
f3552158 6031 machine_mode mode = TYPE_MODE (type);
f881693c 6032 tree fndecl = nvptx_builtin_decl (addr_dim, true);
f3552158
NS
6033 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
6034 tree align = build_int_cst (unsigned_type_node,
6035 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
6036 tree call = build_call_expr (fndecl, 3, offset, size, align);
6037
6038 return fold_convert (build_pointer_type (type), call);
6039}
6040
/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  Values are transferred through
   the shuffle builtin as 32- or 64-bit unsigned integers; reals are
   view-converted (bit-preserving) and complex values are shuffled as
   separate real and imaginary parts.  The generated statements are
   appended to SEQ.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  /* For complex values, operate on the element type below; dest_type
     keeps the original complex type so we can rebuild the pair.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  /* Floating values must be bit-copied, not value-converted.  */
  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  /* 64-bit payloads use the long-long variant of the builtin.  */
  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      /* Scalar case: convert to the transfer type, shuffle, convert
	 back.  */
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
6097
33f47f42
NS
6098/* Lazily generate the global lock var decl and return its address. */
6099
6100static tree
6101nvptx_global_lock_addr ()
6102{
6103 tree v = global_lock_var;
6104
6105 if (!v)
6106 {
6107 tree name = get_identifier ("__reduction_lock");
6108 tree type = build_qualified_type (unsigned_type_node,
6109 TYPE_QUAL_VOLATILE);
6110 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
6111 global_lock_var = v;
6112 DECL_ARTIFICIAL (v) = 1;
6113 DECL_EXTERNAL (v) = 1;
6114 TREE_STATIC (v) = 1;
6115 TREE_PUBLIC (v) = 1;
6116 TREE_USED (v) = 1;
6117 mark_addressable (v);
6118 mark_decl_referenced (v);
6119 }
6120
6121 return build_fold_addr_expr (v);
6122}
6123
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all case, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* The cmp&swap builtin works on unsigned integers; reals and
     complexes are view-converted (bit-copied) into that carrier.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Emit the initial-value computation before the loop.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  /* Loop exits when the swap observed the expected value.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* EXPECT_VAR is the previous ACTUAL_VAR on the back edge, the
     initial value on loop entry.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the single-block loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
6226
/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence: spin until the 0 -> 1
     cmp&swap succeeds.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  The loads and stores
     through PTR are volatile so they are not moved or elided.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence: 1 -> 0 cmp&swap, result
     discarded.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}
6325
6326/* Emit a sequence to update a reduction accumlator at *PTR with the
6327 value held in VAR using operator OP. Return the updated value.
6328
6329 TODO: optimize for atomic ops and indepedent complex ops. */
6330
6331static tree
6332nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6333 tree ptr, tree var, tree_code op)
6334{
6335 tree type = TREE_TYPE (var);
6336 tree size = TYPE_SIZE (type);
6337
6338 if (size == TYPE_SIZE (unsigned_type_node)
6339 || size == TYPE_SIZE (long_long_unsigned_type_node))
6340 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6341 else
6342 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
6343}
6344
/* NVPTX implementation of GOACC_REDUCTION_SETUP.  Replace the
   internal-function CALL with gimple that propagates the incoming
   reduction value and, for worker (or wide-vector) level, stores it
   into the shared-memory reduction buffer.  OA carries the offload
   dimensions of the current function.  */

static void
nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      /* Volatile store so it is not elided or reordered.  */
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
6388
/* NVPTX implementation of GOACC_REDUCTION_INIT.  Replace CALL with
   code that initializes each participant's private copy of the
   reduction variable.  For a warp-sized vector reduction, only lane 0
   keeps the incoming value; all other lanes get the operator's
   neutral initial value, which requires splitting the block and
   inserting a phi.  OA carries the offload dimensions.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      if (lhs != NULL_TREE)
	gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
6473
6474/* NVPTX implementation of GOACC_REDUCTION_FINI. */
6475
6476static void
f881693c 6477nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
f3552158
NS
6478{
6479 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6480 tree lhs = gimple_call_lhs (call);
6481 tree ref_to_res = gimple_call_arg (call, 1);
6482 tree var = gimple_call_arg (call, 2);
6483 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6484 enum tree_code op
6485 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6486 gimple_seq seq = NULL;
6487 tree r = NULL_TREE;;
6488
6489 push_gimplify_context (true);
6490
f881693c 6491 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
f3552158
NS
6492 {
6493 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6494 but that requires a method of emitting a unified jump at the
6495 gimple level. */
764ecad4 6496 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
f3552158
NS
6497 {
6498 tree other_var = make_ssa_name (TREE_TYPE (var));
6499 nvptx_generate_vector_shuffle (gimple_location (call),
6500 other_var, var, shfl, &seq);
6501
6502 r = make_ssa_name (TREE_TYPE (var));
6503 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6504 var, other_var), &seq);
6505 var = r;
6506 }
6507 }
6508 else
6509 {
6510 tree accum = NULL_TREE;
6511
f881693c 6512 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
f3552158
NS
6513 {
6514 /* Get reduction buffer address. */
6515 tree offset = gimple_call_arg (call, 5);
f881693c
TV
6516 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6517 level == GOMP_DIM_VECTOR);
f3552158
NS
6518 tree ptr = make_ssa_name (TREE_TYPE (call));
6519
6520 gimplify_assign (ptr, call, &seq);
6521 accum = ptr;
6522 }
6523 else if (integer_zerop (ref_to_res))
6524 r = var;
6525 else
6526 accum = ref_to_res;
6527
6528 if (accum)
6529 {
33f47f42 6530 /* UPDATE the accumulator. */
f3552158
NS
6531 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6532 seq = NULL;
33f47f42
NS
6533 r = nvptx_reduction_update (gimple_location (call), &gsi,
6534 accum, var, op);
f3552158
NS
6535 }
6536 }
6537
6538 if (lhs)
6539 gimplify_assign (lhs, r, &seq);
6540 pop_gimplify_context (NULL);
6541
6542 gsi_replace_with_seq (&gsi, seq, true);
6543}
6544
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  Replace CALL
   with code that publishes the final reduction value: for worker (or
   wide-vector) level it is re-read from the shared reduction buffer,
   and for non-gang levels it is written back to the receiver object.
   OA carries the offload dimensions.  */

static void
nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      /* Volatile load so it is not elided or reordered.  */
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
6587
6588/* NVPTX reduction expander. */
6589
5563d5c0 6590static void
f3552158
NS
6591nvptx_goacc_reduction (gcall *call)
6592{
6593 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
f881693c
TV
6594 offload_attrs oa;
6595
6596 populate_offload_attrs (&oa);
f3552158
NS
6597
6598 switch (code)
6599 {
6600 case IFN_GOACC_REDUCTION_SETUP:
f881693c 6601 nvptx_goacc_reduction_setup (call, &oa);
f3552158
NS
6602 break;
6603
6604 case IFN_GOACC_REDUCTION_INIT:
f881693c 6605 nvptx_goacc_reduction_init (call, &oa);
f3552158
NS
6606 break;
6607
6608 case IFN_GOACC_REDUCTION_FINI:
f881693c 6609 nvptx_goacc_reduction_fini (call, &oa);
f3552158
NS
6610 break;
6611
6612 case IFN_GOACC_REDUCTION_TEARDOWN:
f881693c 6613 nvptx_goacc_reduction_teardown (call, &oa);
f3552158
NS
6614 break;
6615
6616 default:
6617 gcc_unreachable ();
6618 }
6619}
6620
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Unconditionally reject
   forcing any constant into memory.  */

static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
			      rtx x ATTRIBUTE_UNUSED)
{
  return true;
}
6627
aeedb00a
RS
6628static bool
6629nvptx_scalar_mode_supported_p (scalar_mode mode)
6630{
6631 if (mode == HFmode && TARGET_SM53)
6632 return true;
6633
6634 return default_scalar_mode_supported_p (mode);
6635}
6636
6637static bool
6638nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
6639{
6640 if (mode == HFmode && TARGET_SM53)
6641 return true;
6642
6643 return default_libgcc_floating_mode_supported_p (mode);
6644}
6645
b98b34b7
TV
6646static bool
6647nvptx_vector_mode_supported (machine_mode mode)
6648{
3717fbe3
TV
6649 return (mode == V2SImode
6650 || mode == V2DImode);
6651}
6652
6653/* Return the preferred mode for vectorizing scalar MODE. */
6654
6655static machine_mode
005ba29c 6656nvptx_preferred_simd_mode (scalar_mode mode)
3717fbe3
TV
6657{
6658 switch (mode)
6659 {
4e10a5a7 6660 case E_DImode:
3717fbe3 6661 return V2DImode;
4e10a5a7 6662 case E_SImode:
3717fbe3
TV
6663 return V2SImode;
6664
6665 default:
6666 return default_preferred_simd_mode (mode);
6667 }
6668}
6669
6670unsigned int
6671nvptx_data_alignment (const_tree type, unsigned int basic_align)
6672{
6673 if (TREE_CODE (type) == INTEGER_TYPE)
6674 {
6675 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6676 if (size == GET_MODE_SIZE (TImode))
6677 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6678 }
6679
6680 return basic_align;
b98b34b7
TV
6681}
6682
/* Implement TARGET_MODES_TIEABLE_P.  No two modes are ever tieable
   on nvptx.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}
6690
/* Implement TARGET_HARD_REGNO_NREGS.  Every value occupies exactly
   one register, regardless of mode.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}
6698
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  Never allow a register to
   change mode within a class.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}
6706
/* Implement TARGET_TRULY_NOOP_TRUNCATION.  Truncation is never a
   no-op; explicit conversion code is always required.  */

static bool
nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
{
  return false;
}
6714
29a2f518
JB
6715/* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */
6716
6717static tree
11b8286a 6718nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
29a2f518 6719{
f6f45309
TS
6720 gcc_checking_assert (!lookup_attribute ("oacc gang-private",
6721 DECL_ATTRIBUTES (decl)));
29a2f518
JB
6722
6723 /* Set "oacc gang-private" attribute for gang-private variable
6724 declarations. */
f6f45309 6725 if (level == GOMP_DIM_GANG)
29a2f518 6726 {
29a2f518 6727 tree id = get_identifier ("oacc gang-private");
11b8286a
TS
6728 /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
6729 TREE). */
6730 tree loc_tree = build_empty_stmt (loc);
6731 DECL_ATTRIBUTES (decl)
6732 = tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
29a2f518
JB
6733 }
6734
6735 return decl;
6736}
6737
/* Implement TARGET_GOACC_EXPAND_VAR_DECL.  Expand VAR to an rtx
   pointing into the gang-private shared-memory block if it carries
   the "oacc gang-private" attribute; return NULL_RTX to fall back to
   the default expansion otherwise.  Offsets are assigned lazily, one
   slot per variable, and cached in gang_private_shared_hmap.  */

static rtx
nvptx_goacc_expand_var_decl (tree var)
{
  /* Place "oacc gang-private" variables in shared memory.  */
  if (tree attr = lookup_attribute ("oacc gang-private",
				    DECL_ATTRIBUTES (var)))
    {
      gcc_checking_assert (VAR_P (var));

      unsigned int offset, *poffset;
      poffset = gang_private_shared_hmap.get (var);
      if (poffset)
	offset = *poffset;
      else
	{
	  /* First use: round the running size up to VAR's alignment,
	     record the slot, and grow the block by VAR's size.  */
	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
	  gang_private_shared_size
	    = (gang_private_shared_size + align - 1) & ~(align - 1);
	  if (gang_private_shared_align < align)
	    gang_private_shared_align = align;

	  offset = gang_private_shared_size;
	  bool existed = gang_private_shared_hmap.put (var, offset);
	  gcc_checking_assert (!existed);
	  gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));

	  location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
#if 0 /* For some reason, this doesn't work.  */
	  if (dump_enabled_p ())
	    {
	      dump_flags_t l_dump_flags
		= get_openacc_privatization_dump_flags ();

	      const dump_user_location_t d_u_loc
		= dump_user_location_t::from_location_t (loc);
/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
	      dump_printf_loc (l_dump_flags, d_u_loc,
			       "variable %<%T%> adjusted for OpenACC"
			       " privatization level: %qs\n",
			       var, "gang");
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
	    }
#else /* ..., thus emulate that, good enough for testsuite usage.  */
	  if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
	    inform (loc,
		    "variable %qD adjusted for OpenACC privatization level:"
		    " %qs",
		    var, "gang");
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      /* 'dumpfile.cc:dump_loc' */
	      fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
		       LOCATION_LINE (loc), LOCATION_COLUMN (loc));
	      fprintf (dump_file, "%s: ", "note");

	      fprintf (dump_file,
		       "variable '");
	      print_generic_expr (dump_file, var, TDF_SLIM);
	      fprintf (dump_file,
		       "' adjusted for OpenACC privatization level: '%s'\n",
		       "gang");
	    }
#endif
	}
      rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
    }

  return NULL_RTX;
}
6816
43be05f5
TV
6817static GTY(()) tree nvptx_previous_fndecl;
6818
6819static void
6820nvptx_set_current_function (tree fndecl)
6821{
6822 if (!fndecl || fndecl == nvptx_previous_fndecl)
6823 return;
6824
29a2f518 6825 gang_private_shared_hmap.empty ();
43be05f5 6826 nvptx_previous_fndecl = fndecl;
f881693c 6827 vector_red_partition = 0;
0024c320 6828 oacc_bcast_partition = 0;
43be05f5
TV
6829}
6830
bae974e6
TV
6831/* Implement TARGET_LIBC_HAS_FUNCTION. */
6832
6833bool
6834nvptx_libc_has_function (enum function_class fn_class, tree type)
6835{
6836 if (fn_class == function_sincos)
6837 {
6838 if (type != NULL_TREE)
6839 /* Currently, newlib does not support sincosl. */
6840 return type == float_type_node || type == double_type_node;
6841 else
6842 return true;
6843 }
6844
6845 return default_libc_has_function (fn_class, type);
6846}
6847
738f2522
BS
6848#undef TARGET_OPTION_OVERRIDE
6849#define TARGET_OPTION_OVERRIDE nvptx_option_override
6850
6851#undef TARGET_ATTRIBUTE_TABLE
6852#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6853
d81db636
SB
6854#undef TARGET_LRA_P
6855#define TARGET_LRA_P hook_bool_void_false
6856
738f2522
BS
6857#undef TARGET_LEGITIMATE_ADDRESS_P
6858#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6859
6860#undef TARGET_PROMOTE_FUNCTION_MODE
6861#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6862
6863#undef TARGET_FUNCTION_ARG
6864#define TARGET_FUNCTION_ARG nvptx_function_arg
6865#undef TARGET_FUNCTION_INCOMING_ARG
6866#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6867#undef TARGET_FUNCTION_ARG_ADVANCE
6868#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
1988b2c2
NS
6869#undef TARGET_FUNCTION_ARG_BOUNDARY
6870#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
738f2522
BS
6871#undef TARGET_PASS_BY_REFERENCE
6872#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6873#undef TARGET_FUNCTION_VALUE_REGNO_P
6874#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6875#undef TARGET_FUNCTION_VALUE
6876#define TARGET_FUNCTION_VALUE nvptx_function_value
6877#undef TARGET_LIBCALL_VALUE
6878#define TARGET_LIBCALL_VALUE nvptx_libcall_value
6879#undef TARGET_FUNCTION_OK_FOR_SIBCALL
6880#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
18c05628
NS
6881#undef TARGET_GET_DRAP_RTX
6882#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
738f2522
BS
6883#undef TARGET_SPLIT_COMPLEX_ARG
6884#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6885#undef TARGET_RETURN_IN_MEMORY
6886#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6887#undef TARGET_OMIT_STRUCT_RETURN_REG
6888#define TARGET_OMIT_STRUCT_RETURN_REG true
6889#undef TARGET_STRICT_ARGUMENT_NAMING
6890#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
738f2522
BS
6891#undef TARGET_CALL_ARGS
6892#define TARGET_CALL_ARGS nvptx_call_args
6893#undef TARGET_END_CALL_ARGS
6894#define TARGET_END_CALL_ARGS nvptx_end_call_args
6895
6896#undef TARGET_ASM_FILE_START
6897#define TARGET_ASM_FILE_START nvptx_file_start
6898#undef TARGET_ASM_FILE_END
6899#define TARGET_ASM_FILE_END nvptx_file_end
6900#undef TARGET_ASM_GLOBALIZE_LABEL
6901#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6902#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6903#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6904#undef TARGET_PRINT_OPERAND
6905#define TARGET_PRINT_OPERAND nvptx_print_operand
6906#undef TARGET_PRINT_OPERAND_ADDRESS
6907#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6908#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6909#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6910#undef TARGET_ASM_INTEGER
6911#define TARGET_ASM_INTEGER nvptx_assemble_integer
6912#undef TARGET_ASM_DECL_END
6913#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6914#undef TARGET_ASM_DECLARE_CONSTANT_NAME
6915#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6916#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6917#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6918#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6919#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6920
6921#undef TARGET_MACHINE_DEPENDENT_REORG
6922#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6923#undef TARGET_NO_REGISTER_ALLOCATION
6924#define TARGET_NO_REGISTER_ALLOCATION true
6925
9a863523
NS
6926#undef TARGET_ENCODE_SECTION_INFO
6927#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
1f83528e
TS
6928#undef TARGET_RECORD_OFFLOAD_SYMBOL
6929#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6930
738f2522
BS
6931#undef TARGET_VECTOR_ALIGNMENT
6932#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6933
d88cd9c4
NS
6934#undef TARGET_CANNOT_COPY_INSN_P
6935#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6936
a794bd20
NS
6937#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6938#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6939
f3552158
NS
6940#undef TARGET_INIT_BUILTINS
6941#define TARGET_INIT_BUILTINS nvptx_init_builtins
6942#undef TARGET_EXPAND_BUILTIN
6943#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6944#undef TARGET_BUILTIN_DECL
6945#define TARGET_BUILTIN_DECL nvptx_builtin_decl
6946
5012919d
AM
6947#undef TARGET_SIMT_VF
6948#define TARGET_SIMT_VF nvptx_simt_vf
6949
9ba66bf5
JJ
6950#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6951#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6952
94829f87
NS
6953#undef TARGET_GOACC_VALIDATE_DIMS
6954#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6955
bd751975
NS
6956#undef TARGET_GOACC_DIM_LIMIT
6957#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6958
d88cd9c4
NS
6959#undef TARGET_GOACC_FORK_JOIN
6960#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6961
f3552158
NS
6962#undef TARGET_GOACC_REDUCTION
6963#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6964
e053f710
TV
6965#undef TARGET_CANNOT_FORCE_CONST_MEM
6966#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6967
aeedb00a
RS
6968#undef TARGET_SCALAR_MODE_SUPPORTED_P
6969#define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
6970
6971#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
6972#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
6973 nvptx_libgcc_floating_mode_supported_p
6974
b98b34b7
TV
6975#undef TARGET_VECTOR_MODE_SUPPORTED_P
6976#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6977
3717fbe3
TV
6978#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6979#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6980 nvptx_preferred_simd_mode
6981
99e1629f
RS
6982#undef TARGET_MODES_TIEABLE_P
6983#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6984
c43f4279
RS
6985#undef TARGET_HARD_REGNO_NREGS
6986#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6987
0d803030
RS
6988#undef TARGET_CAN_CHANGE_MODE_CLASS
6989#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6990
3a4a9259
RS
6991#undef TARGET_TRULY_NOOP_TRUNCATION
6992#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
6993
e335138d
TV
6994#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6995#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6996
29a2f518
JB
6997#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
6998#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
6999
7000#undef TARGET_GOACC_EXPAND_VAR_DECL
7001#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7002
43be05f5
TV
7003#undef TARGET_SET_CURRENT_FUNCTION
7004#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7005
bae974e6
TV
7006#undef TARGET_LIBC_HAS_FUNCTION
7007#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7008
738f2522
BS
7009struct gcc_target targetm = TARGET_INITIALIZER;
7010
7011#include "gt-nvptx.h"