]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
profopt.exp: Print relative names in UNSUPPORTED AutoFDO tests.
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
cbe34bb5 2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21#include "config.h"
3a4d1cb1 22#include <sstream>
738f2522
BS
23#include "system.h"
24#include "coretypes.h"
c7131fb2 25#include "backend.h"
e11c4407 26#include "target.h"
738f2522 27#include "rtl.h"
e11c4407
AM
28#include "tree.h"
29#include "cfghooks.h"
c7131fb2 30#include "df.h"
4d0cdd0c 31#include "memmodel.h"
e11c4407
AM
32#include "tm_p.h"
33#include "expmed.h"
34#include "optabs.h"
35#include "regs.h"
36#include "emit-rtl.h"
37#include "recog.h"
38#include "diagnostic.h"
40e23961 39#include "alias.h"
738f2522
BS
40#include "insn-flags.h"
41#include "output.h"
42#include "insn-attr.h"
36566b39 43#include "flags.h"
36566b39
PK
44#include "dojump.h"
45#include "explow.h"
46#include "calls.h"
36566b39
PK
47#include "varasm.h"
48#include "stmt.h"
738f2522 49#include "expr.h"
738f2522
BS
50#include "tm-preds.h"
51#include "tm-constrs.h"
738f2522
BS
52#include "langhooks.h"
53#include "dbxout.h"
738f2522 54#include "cfgrtl.h"
d88cd9c4 55#include "gimple.h"
738f2522 56#include "stor-layout.h"
738f2522 57#include "builtins.h"
629b3d75 58#include "omp-general.h"
3e32ee19
NS
59#include "omp-low.h"
60#include "gomp-constants.h"
d88cd9c4 61#include "dumpfile.h"
f3552158
NS
62#include "internal-fn.h"
63#include "gimple-iterator.h"
64#include "stringpool.h"
f2d4d839 65#include "tree-vrp.h"
f3552158
NS
66#include "tree-ssa-operands.h"
67#include "tree-ssanames.h"
68#include "gimplify.h"
69#include "tree-phinodes.h"
70#include "cfgloop.h"
71#include "fold-const.h"
50c620ca 72#include "intl.h"
738f2522 73
994c5d85 74/* This file should be included last. */
d58627a0
RS
75#include "target-def.h"
76
/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  The area fits
   in 3 bits (DATA_AREA_MAX is 6), stored above the generic
   machine-dependent flag bits.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
/* NOTE: this ORs the new area in; it assumes the area bits of SYM are
   currently zero (set exactly once per symbol).  */
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
95
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  func_decls accumulates the PTX forward
   declarations that are flushed out at end of compilation.  */
static std::stringstream func_decls;

/* Hash libfunc SYMBOL_REFs by pointer identity; entries are GC cache
   entries (may be flushed by the collector).  */
struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

/* Hash tree nodes by pointer identity, likewise GC-cached.  */
struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

/* Function decls for which a .func declaration has been written.  */
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
/* Unprototyped externals whose declaration is deferred to the end of
   compilation, when more information may be available.  */
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 117
/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;	/* Max size needed so far, bytes.  */
static unsigned worker_bcast_align;	/* Max alignment needed so far.  */
static GTY(()) rtx worker_bcast_sym;	/* SYMBOL_REF of the buffer.  */

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;
141
/* Allocate a new, cleared machine_function structure.  Installed as
   init_machine_status in nvptx_option_override.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  /* VOIDmode means "no named return value seen yet"; see
     write_return_type/nvptx_function_value.  */
  p->return_mode = VOIDmode;
  return p;
}
151
/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}
161
/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  /* -moptimize defaults to whether -O was given.  */
  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  /* Pre-create the shared-memory broadcast and reduction buffer
     symbols; their sizes/alignments grow as functions are emitted.  */
  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  /* OpenMP offloading implies the soft-stack and uniform-SIMT
     execution models.  */
  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}
207
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}
241
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  Implements TARGET_ENCODE_SECTION_INFO;
   the area is stored in the symbol flags of the decl's SYMBOL_REF.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      /* PTX provides no way to initialize .shared memory
		 statically.  */
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
271
/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}
286
/* Similarly for a decl: look up the data area via the decl's RTL
   symbol.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}
294
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  /* Table of names needing redirection and their replacements.  */
  static const struct
  {
    const char *from;
    const char *to;
  } replacements[] =
    {
      { "call", "__nvptx_call" },
      { "malloc", "__nvptx_malloc" },
      { "free", "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (unsigned ix = 0; ix < sizeof (replacements) / sizeof (replacements[0]);
       ix++)
    if (strcmp (name, replacements[ix].from) == 0)
      return replacements[ix].to;

  return name;
}
315
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  /* Complex values split into their two component parts.  */
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  /* 128-bit integers split into two 64-bit halves.  */
  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
330
/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}
338
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  INNER_MODE is VOIDmode for a plain register;
   otherwise REGNO names a split pair.  SUBREG_OFFSET >= 0 selects one
   half of a pair ($<offset> suffix); the default -1 prints the whole
   brace-enclosed pair.  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      /* High part first, then low part, per PTX vector syntax.  */
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
369
/* Emit forking instructions for MASK.  MASK is a bitmask of GOMP_DIM
   axes; only the worker and vector levels need explicit forking.
   IS_CALL indicates the partitioned region is a call, which needs no
   fork marker of its own.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}
390
/* Emit joining instructions for MASK.  Mirror image of
   nvptx_emit_forking.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
410
738f2522 411\f
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  FOR_RETURN distinguishes return
   values (complex returns are not split here).  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
441
/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  PROTOTYPED is false for old-style (K&R) calls.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    /* Sub-word integers are widened to at least 32 bits.  */
    mode = SImode;

  return mode;
}
457
/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  Returns are always treated as
   prototyped.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}
466
/* Implement TARGET_FUNCTION_ARG.  Outgoing arguments simply live in
   fresh pseudos.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}
478
/* Implement TARGET_FUNCTION_INCOMING_ARG.  Incoming arguments are
   represented as an UNSPEC_ARG_REG holding the argument index, which
   the prologue code turns into a load from the .param space.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}
497
/* Implement TARGET_FUNCTION_ARG_ADVANCE.  Just count the argument;
   mode/type are irrelevant as every argument occupies one slot.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}
510
/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
521
/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
538
/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}
551
/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.
   For outgoing (callee-side) values, record the promoted return mode
   so write_return_type can emit a matching prologue declaration.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}
570
/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}
578
/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  Implements TARGET_PASS_BY_REFERENCE.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}
589
/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}
597
/* Implement TARGET_PROMOTE_FUNCTION_MODE.  Returns, and arguments of
   prototyped functions, get full promotion; unprototyped arguments
   only get K&R promotion (see promote_arg).  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}
607
/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  Returns the next argument
   number.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      /* Declare the register that receives the argument...  */
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  /* ... and load it from the incoming .param slot.  */
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
647
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    /* Passed by invisible reference: the PTX argument is a pointer.  */
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	/* Emit the first of the two component arguments here; the
	   second is emitted by the common path below.  */
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
686
/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  In a prototype this is "(.param<type> %value_out) ";
   in a prologue it is a plain ".reg" declaration.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
702
/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}
744
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}
757
/* Emit a linker marker for a function decl or defn.  These "// BEGIN"
   comments are parsed by the nvptx linker to partition the PTX.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}
770
/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
782
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  Returns NAME, possibly replaced/stripped.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    /* A memory return adds an invisible leading pointer argument.  */
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	/* Skipping the 'weak' parameter; sanity-check its type.  */
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    /* Trailing pointer to the unnamed arguments.  */
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      /* A zero-arg main still gets the conventional (argc, argv)
	 prototype.  */
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    /* write_arg_mode opened the list with " ("; close it.  */
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
888
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  NAME is
   NULL for the indirect-call case; RESULT is the value rtx or
   NULL_RTX; PAT is the call insn's PARALLEL pattern, whose elements
   from index 1 onwards carry the argument registers.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
929
/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}
945
/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  RETVAL/PAT describe the call insn, see
   write_fn_proto_from_insn.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}
962
/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      /* Unprototyped: defer, a later call site may supply the
	 argument modes.  */
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}
738f2522 979
/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}
991
/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  /* cvta.local converts the .local address to a generic one usable in
     ordinary loads/stores.  */
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
1009
5012919d
AM
/* Emit soft stack frame setup sequence.  ALIGNMENT is the requested
   frame alignment in bytes, SIZE the frame size.  Sets up the stack,
   frame, soft-stack-slot and soft-stack-prev registers.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  /* %fstmp1 = tid.y * sizeof (pointer): byte index into __nvptx_stacks.  */
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  /* Publish the new stack top only when callees may need it.  */
  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}
1066
d88cd9c4
NS
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  /* One fprintf with adjacent string literals; emitted text is
     unchanged.  */
  fprintf (file,
	   "\t{\n"
	   "\t\t.reg.u32\t%%%s;\n"
	   "\t\tmov.u32\t%%%s, %%tid.%s;\n"
	   "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n"
	   "\t}\n",
	   name, name, name, regno, name);
}
1079
5012919d
AM
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  /* %ustmp1 = tid.y * 4: byte offset into the __nvptx_uni array.  */
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  /* %r<loc> = &__nvptx_uni[tid.y].  */
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}
1112
/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  /* Emit the gomp_nvptx_main prototype only once per translation
     unit.  */
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
	<< " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
  /* The template contains an embedded NUL ("\0" in the mov of ORIG's
     address) so it can be emitted in two halves around the ORIG
     symbol name; see ENTRY_2 below.  */
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}
1178
738f2522
BS
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* An OpenMP target entrypoint gets a wrapper kernel under NAME; the
     real body is emitted under NAME$impl.  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  /* Aggregate returns arrive via an implicit pointer argument.  */
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      /* Unprototyped: fall back to the actual parameter decls.  */
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  /* Varargs arrive as an extra pointer argument.  */
  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  /* Reserve the per-lane SIMT stack array if this function needs
     one.  */
  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      /* Non-leaf or alloca-using functions get the default size.  */
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }
  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}
1307
0c6b03b5
AM
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  /* -1 (all-ones mask) on entering, 0 on leaving.  */
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      /* Publish the new state at __nvptx_uni[tid.y] for callees.  */
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      /* Recompute the master lane and the local predicate.  */
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
1335
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* PTR <- top of %simtstack_ar minus SIZE (rounded to DImode),
	 then aligned down if requested.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the current stack pointer just below the new area,
	     then point the stack below that save slot.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Restore the stack pointer saved on entry.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
1387
/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  /* Switch uniform-simt state first, then set up per-lane storage.  */
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  /* All output already done via asm_out_file.  */
  return "";
}
1398
/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  /* SIZE/ALIGN are unused on the leaving path.  */
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}
1409
5012919d
AM
/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  Emits nothing for leaf functions
   or when the soft stack is not in use.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}
738f2522
BS
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  /* If the function returns a value, store it into the declared
     return-value .param before returning.  */
  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
1441
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
1449\f
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}
1458
18c05628
NS
1459/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1460
1461static rtx
1462nvptx_get_drap_rtx (void)
1463{
5012919d
AM
1464 if (TARGET_SOFT_STACK && stack_realign_drap)
1465 return arg_pointer_rtx;
18c05628
NS
1466 return NULL_RTX;
1467}
1468
738f2522
BS
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  /* First argument of a new call: reset the per-call state.  */
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  /* Count an extra slot for the varargs pointer.  */
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  /* Record only register arguments; pc_rtx is skipped.  */
  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}
1496
1497/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1498 information we recorded. */
1499
1500static void
1501nvptx_end_call_args (void)
1502{
44c068ae 1503 cfun->machine->doing_call = false;
738f2522
BS
1504 free_EXPR_LIST_list (&cfun->machine->call_args);
1505}
1506
ecf6e535
BS
1507/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1508 track of whether calls involving static chains or varargs were seen
1509 in the current function.
1510 For libcalls, maintain a hash table of decls we have seen, and
1511 record a function decl for later when encountering a new one. */
738f2522
BS
1512
1513void
1514nvptx_expand_call (rtx retval, rtx address)
1515{
738f2522 1516 rtx callee = XEXP (address, 0);
f324806d 1517 rtx varargs = NULL_RTX;
d88cd9c4 1518 unsigned parallel = 0;
738f2522 1519
738f2522
BS
1520 if (!call_insn_operand (callee, Pmode))
1521 {
1522 callee = force_reg (Pmode, callee);
1523 address = change_address (address, QImode, callee);
1524 }
1525
1526 if (GET_CODE (callee) == SYMBOL_REF)
1527 {
1528 tree decl = SYMBOL_REF_DECL (callee);
1529 if (decl != NULL_TREE)
1530 {
738f2522 1531 if (DECL_STATIC_CHAIN (decl))
44c068ae 1532 cfun->machine->has_chain = true;
00e52418 1533
629b3d75 1534 tree attr = oacc_get_fn_attrib (decl);
d88cd9c4
NS
1535 if (attr)
1536 {
1537 tree dims = TREE_VALUE (attr);
1538
1539 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1540 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1541 {
1542 if (TREE_PURPOSE (dims)
1543 && !integer_zerop (TREE_PURPOSE (dims)))
1544 break;
1545 /* Not on this axis. */
1546 parallel ^= GOMP_DIM_MASK (ix);
1547 dims = TREE_CHAIN (dims);
1548 }
1549 }
738f2522
BS
1550 }
1551 }
c38f0d8c 1552
44c068ae
NS
1553 unsigned nargs = cfun->machine->num_args;
1554 if (cfun->machine->is_varadic)
738f2522 1555 {
f324806d 1556 varargs = gen_reg_rtx (Pmode);
863af9a4 1557 emit_move_insn (varargs, stack_pointer_rtx);
738f2522
BS
1558 }
1559
44c068ae
NS
1560 rtvec vec = rtvec_alloc (nargs + 1);
1561 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
f324806d 1562 int vec_pos = 0;
44c068ae
NS
1563
1564 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
738f2522 1565 rtx tmp_retval = retval;
44c068ae 1566 if (retval)
738f2522
BS
1567 {
1568 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1569 tmp_retval = gen_reg_rtx (GET_MODE (retval));
44c068ae 1570 call = gen_rtx_SET (tmp_retval, call);
738f2522 1571 }
44c068ae 1572 XVECEXP (pat, 0, vec_pos++) = call;
f324806d
NS
1573
1574 /* Construct the call insn, including a USE for each argument pseudo
1575 register. These will be used when printing the insn. */
1576 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
44c068ae 1577 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
f324806d
NS
1578
1579 if (varargs)
cf08c344 1580 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
f324806d
NS
1581
1582 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535 1583
d88cd9c4 1584 nvptx_emit_forking (parallel, true);
738f2522 1585 emit_call_insn (pat);
d88cd9c4
NS
1586 nvptx_emit_joining (parallel, true);
1587
738f2522
BS
1588 if (tmp_retval != retval)
1589 emit_move_insn (retval, tmp_retval);
1590}
44eba92d 1591
738f2522
BS
1592/* Emit a comparison COMPARE, and return the new test to be used in the
1593 jump. */
1594
1595rtx
1596nvptx_expand_compare (rtx compare)
1597{
1598 rtx pred = gen_reg_rtx (BImode);
1599 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1600 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1601 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1602 return gen_rtx_NE (BImode, pred, const0_rtx);
1603}
1604
d88cd9c4
NS
/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  /* MODE is a GOMP_DIM_* axis; IS_CALL is false here.  */
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
1618
1619/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1620 objects. */
1621
1622static rtx
1623nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1624{
1625 rtx res;
1626
1627 switch (GET_MODE (src))
1628 {
1629 case DImode:
1630 res = gen_unpackdisi2 (dst0, dst1, src);
1631 break;
1632 case DFmode:
1633 res = gen_unpackdfsi2 (dst0, dst1, src);
1634 break;
1635 default: gcc_unreachable ();
1636 }
1637 return res;
1638}
1639
1640/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1641 object. */
1642
1643static rtx
1644nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1645{
1646 rtx res;
1647
1648 switch (GET_MODE (dst))
1649 {
1650 case DImode:
1651 res = gen_packsidi2 (dst, src0, src1);
1652 break;
1653 case DFmode:
1654 res = gen_packsidf2 (dst, src0, src1);
1655 break;
1656 default: gcc_unreachable ();
1657 }
1658 return res;
1659}
1660
/* Generate an instruction or sequence to shuffle register SRC into
   DST across the lanes of a warp, according to IDX and KIND.  Modes
   without a native shuffle are bridged through SImode.  */

rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	/* Split into two SImode halves, shuffle each, repack.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	/* Widen the predicate to 0/1 in SImode, shuffle, then compare
	   against zero to recover a predicate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case QImode:
    case HImode:
      {
	/* Zero-extend to SImode, shuffle, truncate back.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
1724
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  /* An IDX-kind shuffle with index 0 — presumably replicating lane
     0's value to all lanes; see nvptx_gen_shuffle.  */
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
1733
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base; /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Spill: read REG, write buffer.  */
    PM_write = 1 << 1,		/* Fill: read buffer, write REG.  */
    PM_loop_begin = 1 << 2,	/* Set up the copy loop.  */
    PM_loop_end = 1 << 3,	/* Tear down the copy loop.  */

    PM_read_write = PM_read | PM_write
  };
1754
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	/* Predicates go through memory as SImode 0/1 values.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at
	       the current offset, aligned for MODE.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
738f2522
BS
1825\f
1826/* Returns true if X is a valid address for use in a memory reference. */
1827
1828static bool
1829nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1830{
1831 enum rtx_code code = GET_CODE (x);
1832
1833 switch (code)
1834 {
1835 case REG:
1836 return true;
1837
1838 case PLUS:
1839 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1840 return true;
1841 return false;
1842
1843 case CONST:
1844 case SYMBOL_REF:
1845 case LABEL_REF:
1846 return true;
1847
1848 default:
1849 return false;
1850 }
1851}
738f2522 1852\f
4d5438cd
NS
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size; /* Fragment size to accumulate.  */
  unsigned offset; /* Offset within current fragment.  */
  bool started; /* Whether we've output any initializer.  */
} init_frag;
1868
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  /* First fragment opens the initializer list; later ones are
     comma-separated.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      /* Emit generic(SYM) with VAL as an optional addend.  */
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      fprintf (asm_out_file, val ? ") + " : ")");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
1894
4d5438cd
NS
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Mask VAL to its low SIZE bytes; the '2 << (bits - 1)' form avoids
     an out-of-range shift when SIZE covers the whole word.  */
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      /* Drop the bytes consumed by the previous iteration.  */
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
	part = size;

      /* Merge PART bytes into the current fragment ...  */
      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      /* ... flushing it once full.  */
      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
1919
/* Target hook for assembling integer object X of size SIZE.  Returns
   true when handled, false to defer to the generic machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* Symbol plus offset: peel off the offset, fall through with
	 the bare symbol.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      /* Addresses must fill a fragment exactly and start aligned.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
1959
1960/* Output SIZE zero bytes. We ignore the FILE argument since the
1961 functions we're calling to perform the output just use
1962 asm_out_file. */
1963
1964void
1965nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1966{
4d5438cd
NS
1967 /* Finish the current fragment, if it's started. */
1968 if (init_frag.offset)
738f2522 1969 {
4d5438cd
NS
1970 unsigned part = init_frag.size - init_frag.offset;
1971 if (part > size)
1972 part = (unsigned) size;
1973 size -= part;
1974 nvptx_assemble_value (0, part);
738f2522
BS
1975 }
1976
4d5438cd
NS
1977 /* If this skip doesn't terminate the initializer, write as many
1978 remaining pieces as possible directly. */
1979 if (size < init_frag.remaining * init_frag.size)
738f2522 1980 {
4d5438cd
NS
1981 while (size >= init_frag.size)
1982 {
1983 size -= init_frag.size;
1984 output_init_frag (NULL_RTX);
1985 }
1986 if (size)
1987 nvptx_assemble_value (0, size);
738f2522 1988 }
738f2522
BS
1989}
1990
1991/* Output a string STR with length SIZE. As in nvptx_output_skip we
1992 ignore the FILE arg. */
1993
1994void
1995nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1996{
1997 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1998 nvptx_assemble_value (str[i], 1);
1999}
2000
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align)
{
  /* Strip array dimensions to get at the element type.  */
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  /* Clamp the element size to the mode size, rounding down to a power
     of two by extracting the lowest set bit.  */
  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
}
2053
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  /* Close the initializer list if one was opened, and terminate the
     declaration.  */
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}
2065
/* Output an uninitialized common or file-scope variable.  NAME is the
   assembler name, DECL the tree node, SIZE in bytes and ALIGN in
   bits (see nvptx_assemble_decl_begin).  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  /* No initializer follows, so finish the declaration immediately.  */
  nvptx_assemble_decl_end ();
}
2082
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  The caller emits the initializer and the
   terminating nvptx_assemble_decl_end.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  /* Constants are never public.  */
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  /* Constant-pool objects always live in the .const data area.  */
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}
2099
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  The initializer (if any) and
   final nvptx_assemble_decl_end are emitted by the caller.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* Linkage: private decls get nothing, weak get .weak, other public
     ones .visible.  */
  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
2116
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  PTX linkage
   (.visible/.weak) is emitted as part of the declaration itself (see
   nvptx_declare_object_name), so there is no separate directive.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
2123
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  /* An undefined decl may have no size; emit 0 in that case.  */
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl));
  nvptx_assemble_decl_end ();
}
2150
/* Output a pattern for a move instruction.  Chooses between mov, cvta
   (for non-generic data areas), bit-cast mov.b and cvt depending on
   the source and destination modes.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* For a non-SUBREG source use the destination's mode: constants may
     have VOIDmode.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  /* Look through a CONST wrapper for a symbol.  */
  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      /* Symbols in non-generic data areas need converting to a
	 generic address.  */
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  /* Same size but different mode: reinterpret the bits.  */
  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    return "%.\tmov.b%T0\t%0, %1;";

  /* Different sizes: a genuine conversion.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2185
5012919d
AM
2186static void nvptx_print_operand (FILE *, rtx, int);
2187
/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  Returns the trailing
   template the caller prints (result copy-out and closing brace).  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  /* Uniform-simt may have wrapped the call in a COND_EXEC.  */
  if (GET_CODE (pat) == COND_EXEC)
    pat = COND_EXEC_CODE (pat);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  /* Open a brace so the .param temporaries are scoped to this call.  */
  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
	     reg_names[NVPTX_RETURN_REGNUM]);

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  /* Indirect call: emit a local .callprototype label for ptxas.  */
  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Declare and fill an out-going .param for each argument.  */
  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);
      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
	       "\t\tst.param%s [%%out_arg%d], ",
	       ptx_type, argno, ptx_type, argno);
      output_reg (asm_out_file, REGNO (t), VOIDmode);
      fprintf (asm_out_file, ";\n");
    }

  /* The '.' stands for the call's predicate, if any.  */
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  /* Emit the argument list; OPEN carries the "(" until the first
     argument (if any) has been printed.  */
  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    /* No return functions confuse the PTX JIT, as it doesn't realize
       the flow control barrier they imply.  It can seg fault if it
       encounters what looks like an unexitable loop.  Emit a trailing
       trap, which it does grok.  */
    fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
	/* We must escape the '%' that starts RETURN_REGNUM.  */
	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
		 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}
2301
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  Only '.' (predicate)
   and '#' (rounding mode) are recognized punctuation codes.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2309
/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      /* Print as base+offset; each part recurses through
	 output_address.  */
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      /* Anything else (e.g. a REG) goes through the generic operand
	 printer; nested MEMs are not valid addresses here.  */
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}
2338
/* Write assembly language output for the address ADDR to FILE.
   Implements TARGET_PRINT_OPERAND_ADDRESS by forwarding to the
   address-operand printer above.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}
2346
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
        unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  if (code == '.')
    {
      /* X is ignored; the predicate comes from the insn being
	 output.  */
      x = current_insn_predicate;
      if (x)
	{
	  fputs ("@", file);
	  /* An EQ predicate means execute when the register is
	     false.  */
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
	}
      return;
    }
  else if (code == '#')
    {
      /* Round-to-nearest-even.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);
  machine_mode mode = GET_MODE (x);

  switch (code)
    {
    case 'A':
      /* 'A' takes the MEM; strip to its address and share the 'D'
	 handling.  */
      x = XEXP (x, 0);
      /* FALLTHROUGH.  */

    case 'D':
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	x = XEXP (x, 0);

      if (GET_CODE (x) == SYMBOL_REF)
	fputs (section_for_sym (x), file);
      break;

    case 't':
    case 'u':
      if (x_code == SUBREG)
	{
	  mode = GET_MODE (SUBREG_REG (x));
	  if (split_mode_p (mode))
	    mode = maybe_split_mode (mode);
	}
      fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
      break;

    case 'S':
      {
	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
	/* Same order as nvptx_shuffle_kind.  */
	static const char *const kinds[] =
	  {".up", ".down", ".bfly", ".idx"};
	fputs (kinds[kind], file);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* Comparison suffix; the type code is taken from the compared
	 operands, not the (BImode) result.  */
      mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	case LEU:
	  fputs (".le", file);
	  break;
	case GE:
	case GEU:
	  fputs (".ge", file);
	  break;
	case LT:
	case LTU:
	  fputs (".lt", file);
	  break;
	case GT:
	case GTU:
	  fputs (".gt", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Unsigned and float comparisons carry the natural type suffix;
	 signed integer ones are printed as .sN.  */
      if (FLOAT_MODE_P (mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  {
	    rtx inner_x = SUBREG_REG (x);
	    machine_mode inner_mode = GET_MODE (inner_x);
	    machine_mode split = maybe_split_mode (inner_mode);

	    if (split_mode_p (inner_mode)
		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
	      output_reg (file, REGNO (inner_x), split);
	    else
	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
	  }
	  break;

	case REG:
	  output_reg (file, REGNO (x), maybe_split_mode (mode));
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* PTX floating-point literals: 0f<hex> for single, 0d<hex>
	     for double.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (mode == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
2556\f
/* Record replacement regs used to deal with subreg operands.  One
   instance exists per machine mode; replacements persist across insns
   (N_ALLOCATED) but are recycled per insn (N_IN_USE).  */
struct reg_replace
{
  /* Pool of replacement pseudos, grown on demand.  */
  rtx replacement[MAX_RECOG_OPERANDS];
  /* Mode of every pseudo in the pool.  */
  machine_mode mode;
  /* Number of pseudos allocated so far.  */
  int n_allocated;
  /* Number handed out for the insn currently being processed.  */
  int n_in_use;
};
2565
2566/* Allocate or reuse a replacement in R and return the rtx. */
2567
2568static rtx
2569get_replacement (struct reg_replace *r)
2570{
2571 if (r->n_allocated == r->n_in_use)
2572 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2573 return r->replacement[r->n_in_use++];
2574}
2575
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip non-insns and patterns that recog can't examine.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Recycle the replacement pools for this insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  /* All remaining subregs should be on subregs_ok insns;
	     the retest below is belt-and-braces for release builds.  */
	  gcc_assert (s_ok);
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* Copy the value in before the insn, extending as
	     needed.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* Copy the result back out after the insn.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
738f2522 2665
5012919d
AM
2666/* Return a SImode "master lane index" register for uniform-simt, allocating on
2667 first use. */
2668
2669static rtx
2670nvptx_get_unisimt_master ()
2671{
2672 rtx &master = cfun->machine->unisimt_master;
2673 return master ? master : master = gen_reg_rtx (SImode);
2674}
2675
2676/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2677
2678static rtx
2679nvptx_get_unisimt_predicate ()
2680{
2681 rtx &pred = cfun->machine->unisimt_predicate;
2682 return pred ? pred : pred = gen_reg_rtx (BImode);
2683}
2684
/* Return true if given call insn references one of the functions provided by
   the CUDA runtime: malloc, free, vprintf.  */

static bool
nvptx_call_insn_is_syscall_p (rtx_insn *insn)
{
  rtx pat = PATTERN (insn);
  gcc_checking_assert (GET_CODE (pat) == PARALLEL);
  /* The CALL (or SET of a CALL for value-returning calls) is the
     first element of the PARALLEL.  */
  pat = XVECEXP (pat, 0, 0);
  if (GET_CODE (pat) == SET)
    pat = SET_SRC (pat);
  gcc_checking_assert (GET_CODE (pat) == CALL
		       && GET_CODE (XEXP (pat, 0)) == MEM);
  rtx addr = XEXP (XEXP (pat, 0), 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return false;
  const char *name = XSTR (addr, 0);
  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
     references with forced assembler name refer to PTX syscalls.  For vprintf,
     accept both normal and forced-assembler-name references.  */
  return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
	  || !strcmp (name, "*malloc")
	  || !strcmp (name, "*free"));
}
2709
2710/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2711 propagate its value from lane MASTER to current lane. */
2712
2713static void
2714nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2715{
2716 rtx reg;
2717 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2718 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2719}
2720
/* Adjust code for uniform-simt code generation variant by making atomics and
   "syscalls" conditionally executed, and inserting shuffle-based propagation
   for registers being set.  */

static void
nvptx_reorg_uniform_simt ()
{
  rtx_insn *insn, *next;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Only runtime "syscalls" and atomic PARALLELs need the
	 treatment.  */
      if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
	  && !(NONJUMP_INSN_P (insn)
	       && GET_CODE (PATTERN (insn)) == PARALLEL
	       && get_attr_atomic (insn)))
	continue;
      rtx pat = PATTERN (insn);
      rtx master = nvptx_get_unisimt_master ();
      /* Broadcast every register the insn sets from the master lane.  */
      for (int i = 0; i < XVECLEN (pat, 0); i++)
	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
      /* Predicate the insn so only one lane executes it.  */
      rtx pred = nvptx_get_unisimt_predicate ();
      pred = gen_rtx_NE (BImode, pred, const0_rtx);
      pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
      validate_change (insn, &PATTERN (insn), pat, false);
    }
}
2748
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after of
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
2790
2791/* Constructor links the new parallel into it's parent's chain of
2792 children. */
2793
2794parallel::parallel (parallel *parent_, unsigned mask_)
2795 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2796{
2797 forked_block = join_block = 0;
2798 forked_insn = join_insn = 0;
2799 fork_insn = joining_insn = 0;
2800
2801 if (parent)
2802 {
2803 next = parent->inner;
2804 parent->inner = this;
2805 }
2806}
2807
/* Destructor recursively deletes all child parallels and then the
   remaining siblings in the chain.  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}
2813
/* Map of basic blocks to insns */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
/* Worklist of such (insn, block) pairs.  */
typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2820
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      /* An ordinary insn: markers after this need their own
		 block.  */
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that  must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn. The insn is in the new block  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
2895
/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must be
   in the single predecessor block.  EXPECTED is the insn code we
   require it to recognize as.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);
  basic_block pre_block = (*block->preds)[0]->src;
  rtx_insn *pre_insn;

  /* Walk back from the end of the predecessor to its last real insn;
     it must exist before we reach the block head.  */
  for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
       pre_insn = PREV_INSN (pre_insn))
    gcc_assert (pre_insn != BB_HEAD (pre_block));

  gcc_assert (recog_memoized (pre_insn) == expected);
  return pre_insn;
}
2914
/* Dump this parallel and all its inner parallels.  DEPTH is the
   nesting depth, used only for the dump prefix.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
	   depth, par->mask,
	   par->forked_block ? par->forked_block->index : -1,
	   par->join_block ? par->join_block->index : -1);

  fprintf (dump_file, "  blocks:");

  basic_block block;
  for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
    fprintf (dump_file, " %d", block->index);
  fprintf (dump_file, "\n");
  /* Children at the next depth, siblings at the same depth.  */
  if (par->inner)
    nvptx_dump_pars (par->inner, depth + 1);

  if (par->next)
    nvptx_dump_pars (par->next, depth);
}
2937
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.   */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    /* Worker-level partitions also have a fork insn in the
	       predecessor block.  */
	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (par->mask == mask);
	    par->join_block = block;
	    par->join_insn = end;
	    /* Worker-level partitions also have a joining insn in
	       the predecessor block.  */
	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
3016
3017/* DFS walk the CFG looking for fork & join markers. Construct
3018 loop structures as we go. MAP is a mapping of basic blocks
3019 to head & tail markers, discovered when splitting blocks. This
3020 speeds up the discovery. We rely on the BB visited flag having
3021 been cleared when splitting blocks. */
3022
3023static parallel *
3024nvptx_discover_pars (bb_insn_map_t *map)
3025{
3026 basic_block block;
3027
3028 /* Mark exit blocks as visited. */
3029 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3030 block->flags |= BB_VISITED;
3031
3032 /* And entry block as not. */
3033 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3034 block->flags &= ~BB_VISITED;
3035
3036 parallel *par = nvptx_find_par (map, 0, block);
3037
3038 if (dump_file)
3039 {
3040 fprintf (dump_file, "\nLoops\n");
3041 nvptx_dump_pars (par, 0);
3042 fprintf (dump_file, "\n");
3043 }
3044
3045 return par;
3046}
3047
912442c2
NS
3048/* Analyse a group of BBs within a partitioned region and create N
3049 Single-Entry-Single-Exit regions. Some of those regions will be
3050 trivial ones consisting of a single BB. The blocks of a
3051 partitioned region might form a set of disjoint graphs -- because
   the region encloses a differently partitioned sub region.
3053
3054 We use the linear time algorithm described in 'Finding Regions Fast:
3055 Single Entry Single Exit and control Regions in Linear Time'
3056 Johnson, Pearson & Pingali. That algorithm deals with complete
3057 CFGs, where a back edge is inserted from END to START, and thus the
3058 problem becomes one of finding equivalent loops.
3059
3060 In this case we have a partial CFG. We complete it by redirecting
3061 any incoming edge to the graph to be from an arbitrary external BB,
3062 and similarly redirecting any outgoing edge to be to that BB.
3063 Thus we end up with a closed graph.
3064
3065 The algorithm works by building a spanning tree of an undirected
3066 graph and keeping track of back edges from nodes further from the
3067 root in the tree to nodes nearer to the root in the tree. In the
3068 description below, the root is up and the tree grows downwards.
3069
3070 We avoid having to deal with degenerate back-edges to the same
3071 block, by splitting each BB into 3 -- one for input edges, one for
3072 the node itself and one for the output edges. Such back edges are
3073 referred to as 'Brackets'. Cycle equivalent nodes will have the
3074 same set of brackets.
3075
3076 Determining bracket equivalency is done by maintaining a list of
3077 brackets in such a manner that the list length and final bracket
3078 uniquely identify the set.
3079
3080 We use coloring to mark all BBs with cycle equivalency with the
3081 same color. This is the output of the 'Finding Regions Fast'
3082 algorithm. Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
3084 entries and exits of SESE regions.
3085
3086 After determining cycle equivalency, we need to find the minimal
3087 set of SESE regions. Do this with a DFS coloring walk of the
3088 complete graph. We're either 'looking' or 'coloring'. When
3089 looking, and we're in the subgraph, we start coloring the color of
3090 the current node, and remember that node as the start of the
3091 current color's SESE region. Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
3093 we remember that node as the end of the current color's SESE region
3094 and return to 'looking'. Otherwise we color the node the current
3095 color.
3096
3097 This way we end up with coloring the inside of non-trivial SESE
3098 regions with the color of that region. */
3099
3100/* A pair of BBs. We use this to represent SESE regions. */
3101typedef std::pair<basic_block, basic_block> bb_pair_t;
3102typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3103
3104/* A node in the undirected CFG. The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
3106typedef std::pair<basic_block, int> pseudo_node_t;
3107
/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back;  /* The pseudo-node this back edge targets.  */

  /* Current color and size of set.  Both start 'unset' (~0u); they
     are (re)computed lazily by get_color.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  /* Return this bracket's color for a bracket list of LENGTH.  If the
     cached color was computed for a different list length, allocate a
     fresh color (appending a zeroed counter slot to COLOR_COUNTS).
     Always bumps the node count for the color returned.  */
  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
	size = length;
	color = color_counts.length ();
	color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};
3136
3137typedef auto_vec<bracket> bracket_vec_t;
3138
/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;   /* Node number in spanning tree.  */
  int parent; /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color.  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  :node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
	       back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color from the current bracket set; the list length
     and topmost bracket identify the set (see get_color).  */
  void set_color (auto_vec<unsigned> &color_counts)
  {
    color = brackets.last ().get_color (color_counts, brackets.length ());
  }
};
3186
/* Trivial destructor, defined out of line; the brackets vector
   manages its own storage.  */

bb_sese::~bb_sese ()
{
}
3190
3191/* Destructively append CHILD's brackets. */
3192
3193void
3194bb_sese::append (bb_sese *child)
3195{
3196 if (int len = child->brackets.length ())
3197 {
3198 int ix;
3199
3200 if (dump_file)
3201 {
3202 for (ix = 0; ix < len; ix++)
3203 {
3204 const pseudo_node_t &pseudo = child->brackets[ix].back;
3205 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3206 child->node, pseudo.first ? pseudo.first->index : 0,
3207 pseudo.second);
3208 }
3209 }
3210 if (!brackets.length ())
3211 std::swap (brackets, child->brackets);
3212 else
3213 {
3214 brackets.reserve (len);
3215 for (ix = 0; ix < len; ix++)
3216 brackets.quick_push (child->brackets[ix]);
3217 }
3218 }
3219}
3220
3221/* Remove brackets that terminate at PSEUDO. */
3222
3223void
3224bb_sese::remove (const pseudo_node_t &pseudo)
3225{
3226 unsigned removed = 0;
3227 int len = brackets.length ();
3228
3229 for (int ix = 0; ix < len; ix++)
3230 {
3231 if (brackets[ix].back == pseudo)
3232 {
3233 if (dump_file)
3234 fprintf (dump_file, "Removing backedge %d:%+d\n",
3235 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3236 removed++;
3237 }
3238 else if (removed)
3239 brackets[ix-removed] = brackets[ix];
3240 }
3241 while (removed--)
3242 brackets.pop ();
3243}
3244
3245/* Accessors for BB's aux pointer. */
3246#define BB_SET_SESE(B, S) ((B)->aux = (S))
3247#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3248
/* DFS walk creating SESE data structures.  Only cover nodes with
   BB_VISITED set.  Append discovered blocks to LIST.  We number in
   increments of 3 so that the above and below pseudo nodes can be
   implicitly numbered too.  N is the number to give block B, P is
   the parent's number and DIR the arrival direction (see
   bb_sese::dir).  Returns the next unused node number.  */

static int
nvptx_sese_number (int n, int p, int dir, basic_block b,
		   auto_vec<basic_block> *list)
{
  /* Already numbered in an earlier visit.  */
  if (BB_GET_SESE (b))
    return n;

  if (dump_file)
    fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
	     b->index, n, p, dir);

  BB_SET_SESE (b, new bb_sese (n, p, dir));
  p = n;

  n += 3;
  list->quick_push (b);

  /* First walk the nodes on the 'other side' of this node, then walk
     the nodes on the same side.  */
  for (unsigned ix = 2; ix; ix--)
    {
      vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
      /* Byte offset of the src or dest member within edge_def, so the
	 same loop body can follow edges in either direction.  */
      size_t offset = (dir > 0 ? offsetof (edge_def, dest)
		       : offsetof (edge_def, src));
      edge e;
      edge_iterator (ei);

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  /* Only descend into blocks belonging to this subgraph.  */
	  if (target->flags & BB_VISITED)
	    n = nvptx_sese_number (n, p, dir, target, list);
	}
      dir = -dir;
    }
  return n;
}
3292
/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
   EDGES are the outgoing edges and OFFSET is the offset to the src
   or dst block on the edges.  SESE is ME's SESE data and DEPTH the
   total node count, used as a sentinel 'further than any node'.  */

static void
nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
		   vec<edge, va_gc> *edges, size_t offset)
{
  edge e;
  edge_iterator (ei);
  /* Highest (lowest-numbered) backlink seen, and its pseudo node.  */
  int hi_back = depth;
  pseudo_node_t node_back (0, depth);
  /* Highest reach of any child, and its pseudo node.  */
  int hi_child = depth;
  pseudo_node_t node_child (0, depth);
  basic_block child = NULL;
  unsigned num_children = 0;
  /* Sign used to orient target nodes relative to our direction.  */
  int usd = -dir * sese->dir;

  if (dump_file)
    fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
	     me->index, sese->node, dir);

  if (dir < 0)
    {
      /* This is the above pseudo-child.  It has the BB itself as an
	 additional child node.  */
      node_child = sese->high;
      hi_child = node_child.second;
      if (node_child.first)
	hi_child += BB_GET_SESE (node_child.first)->node;
      num_children++;
    }

  /* Examine each edge.
     - if it is a child (a) append its bracket list and (b) record
       whether it is the child with the highest reaching bracket.
     - if it is an edge to ancestor, record whether it's the highest
       reaching backlink.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);

      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
	    {
	      /* Child node.  Append its bracket list.  */
	      num_children++;
	      sese->append (t_sese);

	      /* Compare its hi value.  */
	      int t_hi = t_sese->high.second;

	      if (basic_block child_hi_block = t_sese->high.first)
		t_hi += BB_GET_SESE (child_hi_block)->node;

	      if (hi_child > t_hi)
		{
		  hi_child = t_hi;
		  node_child = t_sese->high;
		  child = target;
		}
	    }
	  else if (t_sese->node < sese->node + dir
		   && !(dir < 0 && sese->parent == t_sese->node))
	    {
	      /* Non-parental ancestor node -- a backlink.  */
	      int d = usd * t_sese->dir;
	      int back = t_sese->node + d;

	      if (hi_back > back)
		{
		  hi_back = back;
		  node_back = pseudo_node_t (target, d);
		}
	    }
	}
      else
	{ /* Fallen off graph, backlink to entry node.  */
	  hi_back = 0;
	  node_back = pseudo_node_t (0, 0);
	}
    }

  /* Remove any brackets that terminate at this pseudo node.  */
  sese->remove (pseudo_node_t (me, dir));

  /* Now push any backlinks from this pseudo node.  */
  FOR_EACH_EDGE (e, ei, edges)
    {
      basic_block target = *(basic_block *)((char *)e + offset);
      if (bb_sese *t_sese = BB_GET_SESE (target))
	{
	  if (t_sese->node < sese->node + dir
	      && !(dir < 0 && sese->parent == t_sese->node))
	    /* Non-parental ancestor node - backedge from me.  */
	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
	}
      else
	{
	  /* back edge to entry node */
	  sese->push (pseudo_node_t (0, 0));
	}
    }

  /* If this node leads directly or indirectly to a no-return region of
     the graph, then fake a backedge to entry node.  */
  if (!sese->brackets.length () || !edges || !edges->length ())
    {
      hi_back = 0;
      node_back = pseudo_node_t (0, 0);
      sese->push (node_back);
    }

  /* Record the highest reaching backedge from us or a descendant.  */
  sese->high = hi_back < hi_child ? node_back : node_child;

  if (num_children > 1)
    {
      /* There is more than one child -- this is a Y shaped piece of
	 spanning tree.  We have to insert a fake backedge from this
	 node to the highest ancestor reached by not-the-highest
	 reaching child.  Note that there may be multiple children
	 with backedges to the same highest node.  That's ok and we
	 insert the edge to that highest node.  */
      hi_child = depth;
      if (dir < 0 && child)
	{
	  node_child = sese->high;
	  hi_child = node_child.second;
	  if (node_child.first)
	    hi_child += BB_GET_SESE (node_child.first)->node;
	}

      FOR_EACH_EDGE (e, ei, edges)
	{
	  basic_block target = *(basic_block *)((char *)e + offset);

	  if (target == child)
	    /* Ignore the highest child. */
	    continue;

	  bb_sese *t_sese = BB_GET_SESE (target);
	  if (!t_sese)
	    continue;
	  if (t_sese->parent != sese->node)
	    /* Not a child. */
	    continue;

	  /* Compare its hi value.  */
	  int t_hi = t_sese->high.second;

	  if (basic_block child_hi_block = t_sese->high.first)
	    t_hi += BB_GET_SESE (child_hi_block)->node;

	  if (hi_child > t_hi)
	    {
	      hi_child = t_hi;
	      node_child = t_sese->high;
	    }
	}

      sese->push (node_child);
    }
}
3458
3459
/* DFS walk of BB graph.  Color node BLOCK according to COLORING then
   proceed to successors.  Set SESE entry and exit nodes of
   REGIONS.  COLORING < 0 means we are 'looking' for the start of a
   region; >= 0 means we are 'coloring' that region.  COLOR_COUNTS
   holds the remaining node count per color and is decremented as we
   go.  */

static void
nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
		  basic_block block, int coloring)
{
  bb_sese *sese = BB_GET_SESE (block);

  if (block->flags & BB_VISITED)
    {
      /* If we've already encountered this block, either we must not
	 be coloring, or it must have been colored the current color.  */
      gcc_assert (coloring < 0 || (sese && coloring == sese->color));
      return;
    }

  block->flags |= BB_VISITED;

  if (sese)
    {
      if (coloring < 0)
	{
	  /* Start coloring a region.  */
	  regions[sese->color].first = block;
	  coloring = sese->color;
	}

      if (!--color_counts[sese->color] && sese->color == coloring)
	{
	  /* Found final block of SESE region.  */
	  regions[sese->color].second = block;
	  coloring = -1;
	}
      else
	/* Color the node, so we can assert on revisiting the node
	   that the graph is indeed SESE.  */
	sese->color = coloring;
    }
  else
    /* Fallen off the subgraph, we cannot be coloring.  */
    gcc_assert (coloring < 0);

  /* Walk each successor block.  */
  if (block->succs && block->succs->length ())
    {
      edge e;
      edge_iterator ei;

      FOR_EACH_EDGE (e, ei, block->succs)
	nvptx_sese_color (color_counts, regions, e->dest, coloring);
    }
  else
    /* A dead end must not occur mid-region.  */
    gcc_assert (coloring < 0);
}
3516
/* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
   end up with NULL entries in it.  */

static void
nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
{
  basic_block block;
  int ix;

  /* First clear each BB of the whole function.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      block->flags &= ~BB_VISITED;
      BB_SET_SESE (block, 0);
    }

  /* Mark blocks in the function that are in this graph.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags |= BB_VISITED;

  /* Counts of nodes assigned to each color.  There cannot be more
     colors than blocks (and hopefully there will be fewer).  */
  auto_vec<unsigned> color_counts;
  color_counts.reserve (blocks.length ());

  /* Worklist of nodes in the spanning tree.  Again, there cannot be
     more nodes in the tree than blocks (there will be fewer if the
     CFG of blocks is disjoint).  */
  auto_vec<basic_block> spanlist;
  spanlist.reserve (blocks.length ());

  /* Make sure every block has its cycle class determined.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    {
      if (BB_GET_SESE (block))
	/* We already met this block in an earlier graph solve.  */
	continue;

      if (dump_file)
	fprintf (dump_file, "Searching graph starting at %d\n", block->index);

      /* Number the nodes reachable from block initial DFS order.  */
      int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);

      /* Now walk in reverse DFS order to find cycle equivalents.  */
      while (spanlist.length ())
	{
	  block = spanlist.pop ();
	  bb_sese *sese = BB_GET_SESE (block);

	  /* Do the pseudo node below.  */
	  nvptx_sese_pseudo (block, sese, depth, +1,
			     sese->dir > 0 ? block->succs : block->preds,
			     (sese->dir > 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	  sese->set_color (color_counts);
	  /* Do the pseudo node above.  */
	  nvptx_sese_pseudo (block, sese, depth, -1,
			     sese->dir < 0 ? block->succs : block->preds,
			     (sese->dir < 0 ? offsetof (edge_def, dest)
			      : offsetof (edge_def, src)));
	}
      if (dump_file)
	fprintf (dump_file, "\n");
    }

  if (dump_file)
    {
      unsigned count;
      const char *comma = "";

      fprintf (dump_file, "Found %d cycle equivalents\n",
	       color_counts.length ());
      for (ix = 0; color_counts.iterate (ix, &count); ix++)
	{
	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);

	  comma = "";
	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
	    if (BB_GET_SESE (block)->color == ix)
	      {
		block->flags |= BB_VISITED;
		fprintf (dump_file, "%s%d", comma, block->index);
		comma=",";
	      }
	  fprintf (dump_file, "}");
	  comma = ", ";
	}
      fprintf (dump_file, "\n");
    }

  /* Now we've colored every block in the subgraph.  We now need to
     determine the minimal set of SESE regions that cover that
     subgraph.  Do this with a DFS walk of the complete function.
     During the walk we're either 'looking' or 'coloring'.  When we
     reach the last node of a particular color, we stop coloring and
     return to looking.  */

  /* There cannot be more SESE regions than colors.  */
  regions.reserve (color_counts.length ());
  for (ix = color_counts.length (); ix--;)
    regions.quick_push (bb_pair_t (0, 0));

  for (ix = 0; blocks.iterate (ix, &block); ix++)
    block->flags &= ~BB_VISITED;

  nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);

  if (dump_file)
    {
      const char *comma = "";
      int len = regions.length ();

      fprintf (dump_file, "SESE regions:");
      for (ix = 0; ix != len; ix++)
	{
	  basic_block from = regions[ix].first;
	  basic_block to = regions[ix].second;

	  if (from)
	    {
	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
	      if (to != from)
		fprintf (dump_file, "->%d", to->index);

	      int color = BB_GET_SESE (from)->color;

	      /* Print the blocks within the region (excluding ends).  */
	      FOR_EACH_BB_FN (block, cfun)
		{
		  bb_sese *sese = BB_GET_SESE (block);

		  if (sese && sese->color == color
		      && block != from && block != to)
		    fprintf (dump_file, ".%d", block->index);
		}
	      fprintf (dump_file, "}");
	    }
	  comma = ",";
	}
      fprintf (dump_file, "\n\n");
    }

  /* Release the per-block SESE data.  */
  for (ix = 0; blocks.iterate (ix, &block); ix++)
    delete BB_GET_SESE (block);
}
3663
3664#undef BB_SET_SESE
3665#undef BB_GET_SESE
3666
d88cd9c4
NS
/* Propagate live state at the start of a partitioned region.  BLOCK
   provides the live register information, and might not contain
   INSN.  Propagation is inserted just after INSN.  RW indicates whether
   we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   We propagate the live register set and the entire frame.  We could
   do better by (a) propagating just the live set that is used within
   the partitioned regions and (b) only propagating stack entries that
   are used.  The latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);

static void
nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
		 propagator_fn fn, void *data)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      /* The frame size might not be DImode compatible, but the frame
	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
      /* Detect single iteration loop. */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Multi-word frame: emit a counted loop.  IDX counts down
	     from FS; PTR steps through the frame one DImode word per
	     iteration.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      /* Loop (or straight-line) body: load, propagate, store.  */
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Loop back-edge: advance PTR and branch while IDX != 0.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  /* Copy live registers.  */
  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
    {
      rtx reg = regno_reg_rtx[ix];

      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	{
	  rtx bcast = fn (reg, rw, 0, data);

	  insn = emit_insn_after (bcast, insn);
	}
    }
}
3760
3761/* Worker for nvptx_vpropagate. */
3762
3763static rtx
3764vprop_gen (rtx reg, propagate_mask pm,
3765 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3766{
3767 if (!(pm & PM_read_write))
3768 return 0;
3769
3770 return nvptx_gen_vcast (reg);
3771}
3772
3773/* Propagate state that is live at start of BLOCK across the vectors
3774 of a single warp. Propagation is inserted just after INSN. */
3775
3776static void
3777nvptx_vpropagate (basic_block block, rtx_insn *insn)
3778{
3779 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3780}
3781
3782/* Worker for nvptx_wpropagate. */
3783
3784static rtx
3785wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3786{
3787 wcast_data_t *data = (wcast_data_t *)data_;
3788
3789 if (pm & PM_loop_begin)
3790 {
3791 /* Starting a loop, initialize pointer. */
3792 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3793
3794 if (align > worker_bcast_align)
3795 worker_bcast_align = align;
3796 data->offset = (data->offset + align - 1) & ~(align - 1);
3797
3798 data->ptr = gen_reg_rtx (Pmode);
3799
3800 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3801 }
3802 else if (pm & PM_loop_end)
3803 {
3804 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3805 data->ptr = NULL_RTX;
3806 return clobber;
3807 }
3808 else
3809 return nvptx_gen_wcast (reg, pm, rep, data);
3810}
3811
/* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill).  Sequence is inserted just after
   INSN.  */

static void
nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
{
  wcast_data_t data;

  data.base = gen_reg_rtx (Pmode);
  data.offset = 0;
  data.ptr = NULL_RTX;

  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
  if (data.offset)
    {
      /* Stuff was emitted, initialize the base pointer now.  */
      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
      emit_insn_after (init, insn);

      /* Grow the broadcast buffer if this block needs more room.  */
      if (worker_bcast_size < data.offset)
	worker_bcast_size = data.offset;
    }
}
3837
3838/* Emit a worker-level synchronization barrier. We use different
3839 markers for before and after synchronizations. */
3840
3841static rtx
3842nvptx_wsync (bool after)
3843{
3844 return gen_nvptx_barsync (GEN_INT (after));
3845}
3846
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use different FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  /* Find first insn of from block */
  while (head != BB_END (from) && !INSN_P (head))
    head = NEXT_INSN (head);

  /* Find last insn of to block */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !INSN_P (head))
	return;

      /* If this is a dummy insn, do nothing.  */
      switch (recog_memoized (head))
	{
	default:
	  break;
	case CODE_FOR_nvptx_fork:
	case CODE_FOR_nvptx_forked:
	case CODE_FOR_nvptx_joining:
	case CODE_FOR_nvptx_join:
	  return;
	}

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];

	/* Lazily create the per-axis predicate register.  */
	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	emit_insn_before (br, head);

	LABEL_NUSES (label)++;
	if (tail_branch)
	  before = emit_label_before (label, before);
	else
	  emit_label_after (label, tail);
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
	{
	  /* Vector mode only, do a shuffle.  */
	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  wcast_data_t data;

	  data.base = worker_bcast_sym;
	  data.ptr = 0;

	  /* The buffer needs at least an SImode slot for the
	     condition.  */
	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
	    worker_bcast_size = GET_MODE_SIZE (SImode);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
			    before);
	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_wsync (false), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_wsync (true), tail);
	}

      /* Rewrite the branch condition to use the unified predicate.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }
}
3994
3995/* PAR is a parallel that is being skipped in its entirety according to
3996 MASK. Treat this as skipping a superblock starting at forked
3997 and ending at joining. */
3998
3999static void
4000nvptx_skip_par (unsigned mask, parallel *par)
4001{
4002 basic_block tail = par->join_block;
4003 gcc_assert (tail->preds->length () == 1);
4004
4005 basic_block pre_tail = (*tail->preds)[0]->src;
4006 gcc_assert (pre_tail->succs->length () == 1);
4007
4008 nvptx_single (mask, par->forked_block, pre_tail);
4009}
4010
dba619f3
NS
/* If PAR has a single inner parallel and PAR itself only contains
   empty entry and exit blocks, swallow the inner PAR.  Merging the
   two avoids emitting a redundant fork/join pair for the outer
   level.  */

static void
nvptx_optimize_inner (parallel *par)
{
  parallel *inner = par->inner;

  /* We mustn't be the outer dummy par.  */
  if (!par->mask)
    return;

  /* We must have a single inner par.  */
  if (!inner || inner->next)
    return;

  /* We must only contain 2 blocks ourselves -- the head and tail of
     the inner par.  */
  if (par->blocks.length () != 2)
    return;

  /* We must be disjoint partitioning.  As we only have vector and
     worker partitioning, this is sufficient to guarantee the pars
     have adjacent partitioning.  */
  if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
    /* This indicates malformed code generation.  */
    return;

  /* The outer forked insn should be immediately followed by the inner
     fork insn.  */
  rtx_insn *forked = par->forked_insn;
  rtx_insn *fork = BB_END (par->forked_block);

  if (NEXT_INSN (forked) != fork)
    return;
  gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);

  /* The outer joining insn must immediately follow the inner join
     insn.  */
  rtx_insn *joining = par->joining_insn;
  rtx_insn *join = inner->join_insn;
  if (NEXT_INSN (join) != joining)
    return;

  /* Preconditions met.  Swallow the inner par.  */
  if (dump_file)
    fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
	     inner->mask, inner->forked_block->index,
	     inner->join_block->index,
	     par->mask, par->forked_block->index, par->join_block->index);

  /* Fold the inner partitioning into the outer mask; the GOMP_DIM_MAX
     bit (call marker) is deliberately excluded.  */
  par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);

  /* Transfer the inner par's blocks to the outer par.  Note this
     pops from the tail, so the relative order is reversed; callers
     evidently do not rely on block order here.  */
  par->blocks.reserve (inner->blocks.length ());
  while (inner->blocks.length ())
    par->blocks.quick_push (inner->blocks.pop ());

  /* Splice the inner par out of the tree before freeing it.  */
  par->inner = inner->inner;
  inner->inner = NULL;

  delete inner;
}
4073
d88cd9c4
NS
/* Process the parallel PAR and all its contained
   parallels.  We do everything but the neutering.  Return mask of
   partitioned modes used within this parallel.  */

static unsigned
nvptx_process_pars (parallel *par)
{
  /* Optionally merge a lone inner par into this one first.  */
  if (nvptx_optimize)
    nvptx_optimize_inner (par);

  unsigned inner_mask = par->mask;

  /* Do the inner parallels first.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      inner_mask |= par->inner_mask;
    }

  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
    /* No propagation needed for a call.  */;
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    {
      /* Propagate state into (then out of) the worker-partitioned
	 region via the .shared broadcast buffer.  */
      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
      /* Insert begin and end synchronizations.  */
      emit_insn_after (nvptx_wsync (false), par->forked_insn);
      emit_insn_before (nvptx_wsync (true), par->joining_insn);
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    /* Vector-only regions propagate state with warp shuffles; no
       barrier is needed within a warp.  */
    nvptx_vpropagate (par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)
    inner_mask |= nvptx_process_pars (par->next);
  return inner_mask;
}
4111
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  /* The worker/vector modes this par itself partitions.  */
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* Decide, per mode, whether to neuter individual blocks or skip
     this par wholesale (or do nothing).  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix, len;

      if (nvptx_optimize)
	{
	  /* Neuter whole SESE regions.  */
	  bb_pair_vec_t regions;

	  nvptx_find_sese (par->blocks, regions);
	  len = regions.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block from = regions[ix].first;
	      basic_block to = regions[ix].second;

	      if (from)
		nvptx_single (neuter_mask, from, to);
	      else
		/* A null FROM marks an empty region; TO must be null
		   too.  */
		gcc_assert (!to);
	    }
	}
      else
	{
	  /* Neuter each BB individually.  */
	  len = par->blocks.length ();
	  for (ix = 0; ix != len; ix++)
	    {
	      basic_block block = par->blocks[ix];

	      nvptx_single (neuter_mask, block, block);
	    }
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
4188
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
   unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  Pointing a pseudo's slot at
     const0_rtx is the marker the output machinery checks.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function  (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      unsigned mask = 0;
      tree dims = TREE_VALUE (attr);
      unsigned ix;

      /* A dimension participates unless its size is 1 or it was
	 explicitly forced to zero.  */
      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	{
	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
	  tree allowed = TREE_PURPOSE (dims);

	  if (size != 1 && !(allowed && integer_zerop (allowed)))
	    mask |= GOMP_DIM_MASK (ix);
	}
      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
4272\f
4273/* Handle a "kernel" attribute; arguments as in
4274 struct attribute_spec.handler. */
4275
4276static tree
4277nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4278 int ARG_UNUSED (flags), bool *no_add_attrs)
4279{
4280 tree decl = *node;
4281
4282 if (TREE_CODE (decl) != FUNCTION_DECL)
4283 {
4284 error ("%qE attribute only applies to functions", name);
4285 *no_add_attrs = true;
4286 }
b49e35a9 4287 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
738f2522
BS
4288 {
4289 error ("%qE attribute requires a void return type", name);
4290 *no_add_attrs = true;
4291 }
4292
4293 return NULL_TREE;
4294}
4295
5012919d
AM
4296/* Handle a "shared" attribute; arguments as in
4297 struct attribute_spec.handler. */
4298
4299static tree
4300nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4301 int ARG_UNUSED (flags), bool *no_add_attrs)
4302{
4303 tree decl = *node;
4304
4305 if (TREE_CODE (decl) != VAR_DECL)
4306 {
4307 error ("%qE attribute only applies to variables", name);
4308 *no_add_attrs = true;
4309 }
4310 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4311 {
4312 error ("%qE attribute not allowed with auto storage class", name);
4313 *no_add_attrs = true;
4314 }
4315
4316 return NULL_TREE;
4317}
4318
738f2522
BS
/* Table of valid machine attributes.  Both attributes are declaration
   attributes (decl_req) taking no arguments; see the handlers above
   for the constraints they enforce.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
4328\f
4329/* Limit vector alignments to BIGGEST_ALIGNMENT. */
4330
4331static HOST_WIDE_INT
4332nvptx_vector_alignment (const_tree type)
4333{
4334 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4335
4336 return MIN (align, BIGGEST_ALIGNMENT);
4337}
d88cd9c4
NS
4338
/* Indicate that INSN cannot be duplicated.  The listed patterns are
   the partitioned-execution markers and warp-level operations, which
   must appear exactly once in the insn stream.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}
a794bd20
NS
4358
/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  /* Unconditionally refuse: PTX has no section anchors.  */
  return false;
}
738f2522 4371\f
1f83528e
TS
/* Record a symbol for mkoffload to enter into the mapping table.
   Emits a //:VAR_MAP or //:FUNC_MAP comment line into the assembly
   output; for functions, the launch dimensions (if any) follow the
   name.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	/* Append one hex size per dimension of the OpenACC attr.  */
	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    /* By this point the dims list must carry plain sizes.  */
	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
4409
738f2522
BS
4410/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4411 at the start of a file. */
4412
4413static void
4414nvptx_file_start (void)
4415{
4416 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4417 fputs ("\t.version\t3.1\n", asm_out_file);
4418 fputs ("\t.target\tsm_30\n", asm_out_file);
4419 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4420 fputs ("// END PREAMBLE\n", asm_out_file);
4421}
4422
15ab6f00
NS
/* Emit a declaration for a worker-level buffer in .shared memory.
   SYM is the symbol naming the buffer, ALIGN its byte alignment and
   SIZE its extent in bytes.  */

static void
write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}
4434
ecf6e535
BS
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  Also emits the worker reduction buffer
   and, when used, the soft-stack and uniform-SIMT external arrays.  */

static void
nvptx_file_end (void)
{
  /* Flush any forward declarations accumulated during compilation.  */
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  /* Buffers are only emitted if some function actually used them.  */
  if (worker_bcast_size)
    write_worker_buffer (asm_out_file, worker_bcast_sym,
			 worker_bcast_align, worker_bcast_size);

  if (worker_red_size)
    write_worker_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
         external declaration, emit the array size explicitly; otherwise, it
         may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}
4470
/* Expander for the shuffle builtins.  Arg 0 is the value to shuffle,
   arg 1 the lane index, arg 2 the shuffle kind.  Returns TARGET, or
   TARGET unchanged when the result is unused (IGNORE).  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  /* The index may stay a constant; anything else must be in a reg.  */
  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}
4499
/* Worker reduction address expander.  Arg 0 is the byte offset into
   the worker reduction buffer, arg 1 the access size and arg 2 the
   required alignment.  As a side effect, grows the file-level buffer
   size/alignment bookkeeping to cover this access.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  /* Form SYM, or CONST (PLUS (SYM, OFFSET)) for a non-zero offset.  */
  rtx addr = worker_red_sym;
  if (offset)
    {
      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
      addr = gen_rtx_CONST (Pmode, addr);
    }

  emit_move_insn (target, addr);

  return target;
}
4529
/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  Arg 0 is the memory address, arg 1 the
   expected value, arg 2 the replacement value.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  /* Only 32- and 64-bit variants exist; larger types must use the
     lockfull path.  */
  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
4566
4567
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* 32-bit warp shuffle.  */
  NVPTX_BUILTIN_SHUFFLELL,	/* 64-bit warp shuffle.  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* Address into the worker red. buffer.  */
  NVPTX_BUILTIN_CMP_SWAP,	/* 32-bit compare-and-swap.  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* 64-bit compare-and-swap.  */
  NVPTX_BUILTIN_MAX
};

/* Builtin declarations, indexed by the codes above; GTY-rooted so the
   garbage collector keeps them alive.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4580
4581/* Return the NVPTX builtin for CODE. */
4582
4583static tree
4584nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4585{
4586 if (code >= NVPTX_BUILTIN_MAX)
4587 return error_mark_node;
4588
4589 return nvptx_builtin_decls[code];
4590}
4591
/* Set up all builtin functions for this target.  The DEF macro both
   registers the builtin with the front end and stashes its decl in
   nvptx_builtin_decls for later lookup.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
/* Short-hand type names for the signatures below.  */
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
4620
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  /* Dispatch to the per-builtin expanders defined above.  */
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
4648\f
f3552158
NS
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32	/* Warp size.  */
#define PTX_WORKER_LENGTH 32	/* Upper bound enforced on num_workers.  */
#define PTX_GANG_DEFAULT 0 /* Defer to runtime.  */

/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_VECTOR_LENGTH;
}
4661
94829f87
NS
/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  Returns
   true if any dimension was changed.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      /* Only warn at the offload region level; a 0 value means the
	 setting was deferred to the runtime.  */
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		    dims[GOMP_DIM_VECTOR]
		    ? G_("using vector_length (%d), ignoring %d")
		    : G_("using vector_length (%d), ignoring runtime setting"),
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check the num workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  /* Null DECL means we are choosing the global defaults.  */
  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
      changed = true;
    }

  return changed;
}
d88cd9c4 4709
bd751975
NS
4710/* Return maximum dimension size, or zero for unbounded. */
4711
4712static int
4713nvptx_dim_limit (int axis)
4714{
4715 switch (axis)
4716 {
4717 case GOMP_DIM_WORKER:
4718 return PTX_WORKER_LENGTH;
4719
4720 case GOMP_DIM_VECTOR:
4721 return PTX_VECTOR_LENGTH;
4722
4723 default:
4724 break;
4725 }
4726 return 0;
4727}
4728
d88cd9c4
NS
4729/* Determine whether fork & joins are needed. */
4730
4731static bool
4732nvptx_goacc_fork_join (gcall *call, const int dims[],
4733 bool ARG_UNUSED (is_fork))
4734{
4735 tree arg = gimple_call_arg (call, 2);
4736 unsigned axis = TREE_INT_CST_LOW (arg);
4737
4738 /* We only care about worker and vector partitioning. */
4739 if (axis < GOMP_DIM_WORKER)
4740 return false;
4741
4742 /* If the size is 1, there's no partitioning. */
4743 if (dims[axis] == 1)
4744 return false;
4745
4746 return true;
4747}
4748
f3552158
NS
/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  The result is cast to pointer-to-TYPE.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
4765
/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  Floating-point values are
   reinterpreted as integers for the shuffle; complex values are
   shuffled component-wise.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  /* For complex types operate on the component type; dest_type keeps
     the original complex type.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  /* Reals are bit-cast (not value-converted) to the integer type.  */
  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  /* var_type != dest_type exactly when VAR is complex.  */
  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
4822
33f47f42
NS
/* Lazily generate the global lock var decl and return its address.
   The variable is an external, volatile unsigned int named
   "__reduction_lock"; it is created once and cached in
   global_lock_var.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      /* Volatile, so accesses are not optimized away.  */
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}
4848
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all case, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  /* Reals and complex values are bit-cast into the integer argument
     type of the cmp&swap builtin.  */
  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  /* Seed the loop with the reduction's neutral value for OP.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  /* Loop exits when the swap observed the expected value.  */
  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Wire up the retry loop: fallthru becomes the exit (true) edge,
     and a new false edge loops back to the latch block.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* EXPECT is the init value on entry and the last observed value on
     each retry.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the new loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
4948}
4949
4950/* Insert code to lockfully update *PTR with *PTR OP VAR just before
4951 GSI. This is necessary for types larger than 64 bits, where there
4952 is no cmp&swap instruction to implement a lockless scheme. We use
4953 a lock variable in global memory.
4954
4955 while (cmp&swap (&lock_var, 0, 1))
4956 continue;
4957 T accum = *ptr;
4958 accum = accum OP var;
4959 *ptr = accum;
4960 cmp&swap (&lock_var, 1, 0);
4961 return accum;
4962
4963 A lock in global memory is necessary to force execution engine
4964 descheduling and avoid resource starvation that can occur if the
4965 lock is in .shared memory. */
4966
static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  /* The global lock word is compare&swapped between 0 (unlocked) and
     1 (locked) via the CMP_SWAP builtin.  */
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence: spin until
     cmp&swap (&lock, unlocked, locked) reads back "unlocked".  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop: the fallthru edge becomes the TRUE (acquired)
     exit, and a FALSE self-edge forms the spin latch.  */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation: accum = *ptr OP var.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  /* NOTE(review): REF_IN (a volatile mem-ref) is reused here rather
     than ACC_IN, so gimplification may emit a second load of *PTR --
     confirm this is intended rather than "op (acc_in, var)".  */
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence: swap the lock word back from
     "locked" to "unlocked"; the swap result is ignored.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  /* Return the updated value that was stored back to *PTR.  */
  return acc_out;
}
5046
/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */
5051
5052static tree
5053nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5054 tree ptr, tree var, tree_code op)
5055{
5056 tree type = TREE_TYPE (var);
5057 tree size = TYPE_SIZE (type);
5058
5059 if (size == TYPE_SIZE (unsigned_type_node)
5060 || size == TYPE_SIZE (long_long_unsigned_type_node))
5061 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5062 else
5063 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
f3552158
NS
5064}
5065
5066/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5067
5068static void
5069nvptx_goacc_reduction_setup (gcall *call)
5070{
5071 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5072 tree lhs = gimple_call_lhs (call);
5073 tree var = gimple_call_arg (call, 2);
5074 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5075 gimple_seq seq = NULL;
5076
5077 push_gimplify_context (true);
5078
5079 if (level != GOMP_DIM_GANG)
5080 {
5081 /* Copy the receiver object. */
5082 tree ref_to_res = gimple_call_arg (call, 1);
5083
5084 if (!integer_zerop (ref_to_res))
5085 var = build_simple_mem_ref (ref_to_res);
5086 }
5087
5088 if (level == GOMP_DIM_WORKER)
5089 {
5090 /* Store incoming value to worker reduction buffer. */
5091 tree offset = gimple_call_arg (call, 5);
5092 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5093 tree ptr = make_ssa_name (TREE_TYPE (call));
5094
5095 gimplify_assign (ptr, call, &seq);
5096 tree ref = build_simple_mem_ref (ptr);
5097 TREE_THIS_VOLATILE (ref) = 1;
5098 gimplify_assign (ref, var, &seq);
5099 }
5100
5101 if (lhs)
5102 gimplify_assign (lhs, var, &seq);
5103
5104 pop_gimplify_context (NULL);
5105 gsi_replace_with_seq (&gsi, seq, true);
5106}
5107
5108/* NVPTX implementation of GOACC_REDUCTION_INIT. */
5109
static void
nvptx_goacc_reduction_init (gcall *call)
{
  /* CALL is an IFN_GOACC_REDUCTION call of kind INIT:
     arg 1 = ref_to_res (or integer zero), arg 2 = local reduction
     variable, arg 3 = GOMP_DIM_* level, arg 4 = reduction tree_code.  */
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  /* Neutral element of RCODE (e.g. 0 for PLUS), used as the init value.  */
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  Lane 0 keeps
	 the incoming VAR; all other lanes (tid != 0) receive INIT,
	 selected by the phi built below.  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb: the fallthru edge becomes
	 the TRUE (tid != 0) edge.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb (the tid == 0 path,
	 which skips the initialization).  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block merging INIT_VAR and VAR.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  /* Replace the GOACC_REDUCTION call with the generated sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
5189
5190/* NVPTX implementation of GOACC_REDUCTION_FINI. */
5191
5192static void
5193nvptx_goacc_reduction_fini (gcall *call)
5194{
5195 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5196 tree lhs = gimple_call_lhs (call);
5197 tree ref_to_res = gimple_call_arg (call, 1);
5198 tree var = gimple_call_arg (call, 2);
5199 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5200 enum tree_code op
5201 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5202 gimple_seq seq = NULL;
5203 tree r = NULL_TREE;;
5204
5205 push_gimplify_context (true);
5206
5207 if (level == GOMP_DIM_VECTOR)
5208 {
5209 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
5210 but that requires a method of emitting a unified jump at the
5211 gimple level. */
5212 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5213 {
5214 tree other_var = make_ssa_name (TREE_TYPE (var));
5215 nvptx_generate_vector_shuffle (gimple_location (call),
5216 other_var, var, shfl, &seq);
5217
5218 r = make_ssa_name (TREE_TYPE (var));
5219 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5220 var, other_var), &seq);
5221 var = r;
5222 }
5223 }
5224 else
5225 {
5226 tree accum = NULL_TREE;
5227
5228 if (level == GOMP_DIM_WORKER)
5229 {
5230 /* Get reduction buffer address. */
5231 tree offset = gimple_call_arg (call, 5);
5232 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5233 tree ptr = make_ssa_name (TREE_TYPE (call));
5234
5235 gimplify_assign (ptr, call, &seq);
5236 accum = ptr;
5237 }
5238 else if (integer_zerop (ref_to_res))
5239 r = var;
5240 else
5241 accum = ref_to_res;
5242
5243 if (accum)
5244 {
33f47f42 5245 /* UPDATE the accumulator. */
f3552158
NS
5246 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5247 seq = NULL;
33f47f42
NS
5248 r = nvptx_reduction_update (gimple_location (call), &gsi,
5249 accum, var, op);
f3552158
NS
5250 }
5251 }
5252
5253 if (lhs)
5254 gimplify_assign (lhs, r, &seq);
5255 pop_gimplify_context (NULL);
5256
5257 gsi_replace_with_seq (&gsi, seq, true);
5258}
5259
5260/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5261
5262static void
5263nvptx_goacc_reduction_teardown (gcall *call)
5264{
5265 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5266 tree lhs = gimple_call_lhs (call);
5267 tree var = gimple_call_arg (call, 2);
5268 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5269 gimple_seq seq = NULL;
5270
5271 push_gimplify_context (true);
5272 if (level == GOMP_DIM_WORKER)
5273 {
5274 /* Read the worker reduction buffer. */
5275 tree offset = gimple_call_arg (call, 5);
5276 tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
5277 tree ptr = make_ssa_name (TREE_TYPE (call));
5278
5279 gimplify_assign (ptr, call, &seq);
5280 var = build_simple_mem_ref (ptr);
5281 TREE_THIS_VOLATILE (var) = 1;
5282 }
5283
5284 if (level != GOMP_DIM_GANG)
5285 {
5286 /* Write to the receiver object. */
5287 tree ref_to_res = gimple_call_arg (call, 1);
5288
5289 if (!integer_zerop (ref_to_res))
5290 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5291 }
5292
5293 if (lhs)
5294 gimplify_assign (lhs, var, &seq);
5295
5296 pop_gimplify_context (NULL);
5297
5298 gsi_replace_with_seq (&gsi, seq, true);
5299}
5300
5301/* NVPTX reduction expander. */
5302
5563d5c0 5303static void
f3552158
NS
5304nvptx_goacc_reduction (gcall *call)
5305{
5306 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5307
5308 switch (code)
5309 {
5310 case IFN_GOACC_REDUCTION_SETUP:
5311 nvptx_goacc_reduction_setup (call);
5312 break;
5313
5314 case IFN_GOACC_REDUCTION_INIT:
5315 nvptx_goacc_reduction_init (call);
5316 break;
5317
5318 case IFN_GOACC_REDUCTION_FINI:
5319 nvptx_goacc_reduction_fini (call);
5320 break;
5321
5322 case IFN_GOACC_REDUCTION_TEARDOWN:
5323 nvptx_goacc_reduction_teardown (call);
5324 break;
5325
5326 default:
5327 gcc_unreachable ();
5328 }
5329}
5330
e053f710
TV
5331static bool
5332nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5333 rtx x ATTRIBUTE_UNUSED)
5334{
5335 return true;
5336}
5337
738f2522
BS
/* Target hook vector initialization: each TARGET_* macro below is
   redefined to its nvptx implementation before TARGET_INITIALIZER
   assembles them into TARGETM at the bottom of the file.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Argument passing and function-value hooks.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly output hooks.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

/* Builtin hooks.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

/* OpenACC offloading hooks.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

struct gcc_target targetm = TARGET_INITIALIZER;

/* Garbage-collector roots generated by gengtype for this file.  */
#include "gt-nvptx.h"