/* Target code for NVPTX.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3

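/* These values select the variant of the PTX shfl instruction to
   emit; they correspond, in order, to the .up, .down, .bfly and .idx
   suffixes printed by the 'S' operand modifier in nvptx_print_operand
   below (a reading of this file, not a separately documented
   interface).  */
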
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->ret_reg_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

/* Determine the address space to use for SYMBOL_REF SYM.  */

static addr_space_t
nvptx_addr_space_from_sym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);
  if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
    return ADDR_SPACE_GENERIC;

  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ADDR_SPACE_CONST;

  return ADDR_SPACE_GLOBAL;
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

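/* For example, maybe_split_mode (TImode) is DImode, and for a complex
   mode such as DCmode it is the component mode (DFmode); everything
   else fits a single register and yields VOIDmode.  */
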
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

#define PASS_IN_REG_P(MODE, TYPE)				\
  ((GET_MODE_CLASS (MODE) == MODE_INT				\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT		\
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)	\
	&& !AGGREGATE_TYPE_P (TYPE)))				\
   && (MODE) != TImode)

#define RETURN_IN_REG_P(MODE)			\
  ((GET_MODE_CLASS (MODE) == MODE_INT		\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT)	\
   && GET_MODE_SIZE (MODE) <= 8)

/* Perform a mode promotion for a function argument with MODE.  Return
   the promoted mode.  */

static machine_mode
arg_promotion (machine_mode mode)
{
  if (mode == QImode || mode == HImode)
    return SImode;
  return mode;
}

/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.  */

static int
write_one_arg (std::stringstream &s, const char *sep, int i,
	       tree type, machine_mode mode, bool no_arg_types)
{
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  machine_mode split = maybe_split_mode (mode);
  if (split != VOIDmode)
    {
      i = write_one_arg (s, sep, i, TREE_TYPE (type), split, false);
      sep = ", ";
      mode = split;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  s << sep;
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << i << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i + 1;
}

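/* For instance, an SImode argument at index 0 comes out as
   ".param.u32 %in_ar0", while QImode and HImode arguments are given a
   one-element array (the "[1]" suffix above).  Illustrative examples
   only; the code above is the authoritative behavior.  */
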
/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

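/* These markers appear in the generated assembly as lines like
   "// BEGIN GLOBAL FUNCTION DECL: foo".  They are comments as far as
   ptxas is concerned; the nvptx linking tools use them to slice up the
   output (the "helper comment for use by ld" mentioned below).  */
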
/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
	return_in_mem = true;
      else
	{
	  mode = arg_promotion (mode);
	  s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
	    << " %out_retval) ";
	}
    }

  s << name;

  const char *sep = " (";
  int i = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    {
      s << sep << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar0";
      sep = ", ";
      i++;
    }

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool null_type_args = !args;
  if (null_type_args)
    args = DECL_ARGUMENTS (decl);

  for (; args; args = TREE_CHAIN (args))
    {
      tree type = null_type_args ? TREE_TYPE (args) : TREE_VALUE (args);
      machine_mode mode = TYPE_MODE (type);

      if (mode == VOIDmode)
	break;
      i = write_one_arg (s, sep, i, type, mode, null_type_args);
      sep = ", ";
    }

  if (stdarg_p (fntype))
    {
      s << sep << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
      i++;
      sep = ", ";
    }

  if (DECL_STATIC_CHAIN (decl))
    {
      s << sep << ".reg.u" << GET_MODE_BITSIZE (Pmode)
	<< reg_names [STATIC_CHAIN_REGNUM];
      i++;
      sep = ", ";
    }

  if (!i && strcmp (name, "main") == 0)
    {
      s << sep
	<< ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
	<< " %argv";
      i++;
      sep = ", ";
    }

  if (i)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

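/* As an illustration (not a specification): for a definition such as
   "int add (int x, int y)", write_fn_proto would emit approximately

     // BEGIN GLOBAL FUNCTION DEF: add
     .visible .func (.param.u32 %out_retval) add (.param.u32 %in_ar0,
       .param.u32 %in_ar1)

   preceded by a matching ";"-terminated declaration, since the PTX
   assembler wants a declaration before the definition.  */
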
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    s << "(.param"
      << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)), false)
      << " %rval) ";

  s << name;

  const char *sep = " (";
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting here, as that was
	 already done when generating the call sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      s << sep
	<< ".param"
	<< nvptx_ptx_type_from_mode (mode, false)
	<< " %arg"
	<< i;
      if (mode == QImode || mode == HImode)
	s << "[1]";
      sep = ", ";
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

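/* For REGNO 23 and NAME "y", the above prints, more or less:

     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r23, %y, 0;
     }

   leaving %r23 true for every thread that is not lane zero on the
   y axis.  */
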
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  fprintf (file, "%s", s.str().c_str());
  fprintf (file, "{\n");

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));
  if (return_in_mem)
    {
      fprintf (file, "\t.reg.u%d %%ar%d;\n", GET_MODE_BITSIZE (Pmode), argno);
      fprintf (file, "\tld.param.u%d %%ar%d, [%%in_ar%d];\n",
	       GET_MODE_BITSIZE (Pmode), argno, argno);
      argno++;
    }

  /* Declare and initialize incoming arguments.  */
  tree args = DECL_ARGUMENTS (decl);
  bool prototyped = false;
  if (TYPE_ARG_TYPES (fntype))
    {
      args = TYPE_ARG_TYPES (fntype);
      prototyped = true;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
      machine_mode mode = TYPE_MODE (type);
      int count = 1;

      if (mode == VOIDmode)
	break;

      if (!PASS_IN_REG_P (mode, type))
	mode = Pmode;

      machine_mode split = maybe_split_mode (mode);
      if (split != VOIDmode)
	{
	  count = 2;
	  mode = split;
	}
      else if (!prototyped && !AGGREGATE_TYPE_P (type) && mode == SFmode)
	mode = DFmode;

      mode = arg_promotion (mode);
      while (count--)
	{
	  fprintf (file, "\t.reg%s %%ar%d;\n",
		   nvptx_ptx_type_from_mode (mode, false), argno);
	  fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
		   nvptx_ptx_type_from_mode (mode, false), argno, argno);
	  argno++;
	}
    }

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
	((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    {
      fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));
      fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
	       GET_MODE_BITSIZE (Pmode));
    }

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
	   reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);
	  if (split != VOIDmode)
	    {
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 0);
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 1);
	    }
	  else
	    fprintf (file, "\t.reg%s %%r%d;\n",
		     nvptx_ptx_type_from_mode (mode, true), i);
	}
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    {
      fprintf (file, "\t.reg.u%d %%outargs;\n"
	       "\t.local.align 8 .b8 %%outargs_ar["
	       HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, sz);
      fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
	       BITS_PER_WORD);
    }

  if (cfun->machine->punning_buffer_size > 0)
    {
      fprintf (file, "\t.reg.u%d %%punbuffer;\n"
	       "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
	       BITS_PER_WORD, cfun->machine->punning_buffer_size);
      fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
	       BITS_PER_WORD);
    }

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
	       BITS_PER_WORD);
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree funtype)
{
  if (cfun->machine->start_call == NULL_RTX)
    {
      cfun->machine->call_args = NULL;
      cfun->machine->funtype = funtype;
      cfun->machine->start_call = const0_rtx;
    }
  if (arg == pc_rtx)
    return;

  rtx_expr_list *args_so_far = cfun->machine->call_args;
  if (REG_P (arg))
    cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  int nargs = 0;
  rtx callee = XEXP (address, 0);
  rtx pat, t;
  rtvec vec;
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
    nargs++;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_call_with_sc = true;

	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  if (cfun->machine->funtype
      /* It's possible to construct testcases where we call a variable.
	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
	 in such a case.  */
      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
      && stdarg_p (cfun->machine->funtype))
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
      cfun->machine->has_call_with_varargs = true;
    }
  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
  pat = gen_rtx_PARALLEL (VOIDmode, vec);

  int vec_pos = 0;

  rtx tmp_retval = retval;
  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  if (retval != NULL_RTX)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      t = gen_rtx_SET (tmp_retval, t);
    }
  XVECEXP (pat, 0, vec_pos++) = t;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    {
      rtx this_arg = XEXP (arg, 0);
      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
    }

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode)
    return NULL_RTX;

  if (named)
    return gen_reg_rtx (mode);
  return NULL_RTX;
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);

  if (boundary > BITS_PER_WORD)
    return 2 * BITS_PER_WORD;

  if (mode == BLKmode)
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if (size > 4)
	return 2 * BITS_PER_WORD;
      if (boundary < BITS_PER_WORD)
	{
	  if (size >= 3)
	    return BITS_PER_WORD;
	  if (size >= 2)
	    return 2 * BITS_PER_UNIT;
	}
    }
  return boundary;
}

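/* Worked example of the above: a 3-byte BLKmode argument with byte
   alignment is promoted to word alignment, a 2-byte one to 16-bit
   alignment, and anything requesting more than a word is capped at
   two words.  */
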
/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
		      bool outgoing)
{
  int unsignedp = TYPE_UNSIGNED (type);
  machine_mode orig_mode = TYPE_MODE (type);
  machine_mode mode = promote_function_mode (type, orig_mode,
					     &unsignedp, NULL_TREE, 1);
  if (outgoing)
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
			 const_tree type, bool)
{
  return !PASS_IN_REG_P (mode, type);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  machine_mode mode = TYPE_MODE (type);
  if (!RETURN_IN_REG_P (mode))
    return true;
  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *punsignedp,
			     const_tree funtype, int for_return)
{
  if (type == NULL_TREE)
    return mode;
  if (for_return)
    return promote_mode (type, mode, punsignedp);
  /* For K&R-style functions, try to match the language promotion rules to
     minimize type mismatches at assembly time.  */
  if (TYPE_ARG_TYPES (funtype) == NULL_TREE
      && type != NULL_TREE
      && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  return mode;
}

/* Implement TARGET_STATIC_CHAIN.  */

static rtx
nvptx_static_chain (const_tree fndecl, bool incoming_p)
{
  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (incoming_p)
    return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
  else
    return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle SRC into DST across
   the lanes of a warp, using shuffle kind KIND and lane operand IDX.
   Multi-word and predicate modes are handled by splitting or copying
   through SImode temporaries.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

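/* In other words, a vector-level broadcast is just a SHUFFLE_IDX read
   of lane 0's value by every lane of the warp.  */
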
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset;  /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

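/* A sketch of the intended usage (reconstructed from how the
   worker-neutering machinery is described above, not a separate
   interface): the active worker emits
   nvptx_gen_wcast (reg, PM_read, 0, &data) to spill REG into the
   shared broadcast buffer, the callers insert a synchronization
   barrier, and the remaining workers emit
   nvptx_gen_wcast (reg, PM_write, 0, &data) to fill their copies.  */
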
/* When loading an operand OP, verify whether an address space
   conversion to generic is required, and if so, perform it.  Check
   for SYMBOL_REFs and record them if needed.  Return either the
   original operand, or the converted one.  */

rtx
nvptx_maybe_convert_symbolic_operand (rtx op)
{
  if (GET_MODE (op) != Pmode)
    return op;

  rtx sym = op;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (sym, 0);
  if (GET_CODE (sym) == PLUS)
    sym = XEXP (sym, 0);

  if (GET_CODE (sym) != SYMBOL_REF)
    return op;

  nvptx_maybe_record_fnsym (sym);

  addr_space_t as = nvptx_addr_space_from_sym (sym);
  if (as == ADDR_SPACE_GENERIC)
    return op;

  enum unspec code;
  code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
	  : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
	  : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
	  : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
	  : UNSPEC_FROM_PARAM);

  rtx dest = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (dest,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op), code)));
  return dest;
}

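/* The UNSPEC wrapped around OP above is presumably matched by a move
   pattern in nvptx.md that emits PTX's cvta instruction to convert,
   e.g., a .global address into a generic one; the choice of unspec
   merely records which state space the conversion starts from.  */
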
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Implement HARD_REGNO_MODE_OK.  We barely use hard regs, but we want
   to ensure that the return register's mode isn't changed.  */

bool
nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
{
  if (regno != NVPTX_RETURN_REGNUM
      || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
    return true;
  return mode == cfun->machine->ret_reg_mode;
}

/* Convert an address space AS to the corresponding ptx string.  */

const char *
nvptx_section_from_addr_space (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_CONST:
      return ".const";

    case ADDR_SPACE_GLOBAL:
      return ".global";

    case ADDR_SPACE_SHARED:
      return ".shared";

    case ADDR_SPACE_GENERIC:
      return "";

    default:
      gcc_unreachable ();
    }
}

/* Determine whether DECL goes into .const or .global.  */

const char *
nvptx_section_for_decl (const_tree decl)
{
  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ".const";

  return ".global";
}

/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;

/* Write the necessary separator string to begin a new initializer value.  */

static void
begin_decl_field (void)
{
  /* We never see decl_offset at zero by the time we get here.  */
  if (decl_offset == decl_chunk_size)
    fprintf (asm_out_file, " = { ");
  else
    fprintf (asm_out_file, ", ");
}

/* Output the currently stored chunk as an initializer value.  */

static void
output_decl_chunk (void)
{
  begin_decl_field ();
  output_address (VOIDmode, gen_int_mode (init_part, decl_chunk_mode));
  init_part = 0;
}

/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
	this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      if (decl_offset % decl_chunk_size == 0)
	output_decl_chunk ();

      chunk_offset = 0;
    }
}

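/* Illustration of the chunking: with decl_chunk_size == 4, assembling
   the 16-bit value 0x1234 followed by 0x5678 accumulates
   init_part == 0x56781234 (bytes are packed little-endian within the
   chunk), which is flushed once decl_offset reaches a multiple of the
   chunk size.  */
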
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      gcc_unreachable ();

    case CONST_INT:
      val = INTVAL (x);
      nvptx_assemble_value (val, size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == decl_chunk_size);
      if (decl_offset % decl_chunk_size != 0)
	sorry ("cannot emit unaligned pointers in ptx assembly");
      decl_offset += size;
      begin_decl_field ();

      nvptx_maybe_record_fnsym (x);
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, x);
      fprintf (asm_out_file, ")");

      if (val)
	fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, val);
      break;
    }

  return true;
}

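/* A pointer-sized initializer thus comes out symbolically, e.g. as
   "generic(foo) + 4" for the address constant foo+4; the generic()
   wrapper requests the generic-address-space value of the symbol
   (an illustrative rendering of the output above).  */
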
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  if (decl_offset + size >= object_size)
    {
      if (decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);
      object_finished = true;
      return;
    }

  while (size > decl_chunk_size)
    {
      nvptx_assemble_value (0, decl_chunk_size);
      size -= decl_chunk_size;
    }
  while (size-- > 0)
    nvptx_assemble_value (0, 1);
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (decl_offset != 0)
    {
      if (!object_finished && decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);

      fprintf (asm_out_file, " }");
    }
  fprintf (asm_out_file, ";\n");
}

/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
			 bool is_public)
{
  write_var_marker (file, true, is_public, name);

  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}

/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  if (TREE_PUBLIC (decl))
    fprintf (file, ".weak ");

  const char *sec = nvptx_section_for_decl (decl);
  fprintf (file, "%s.align %d .b8 ", sec, align / BITS_PER_UNIT);
  assemble_name (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n");
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT size)
{
  tree type = TREE_TYPE (exp);
  init_output_initializer (file, name, type, false);
  fprintf (file, "\t.const .align %d .u%d ",
	   TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
	   decl_chunk_size * BITS_PER_UNIT);
  assemble_name (file, name);
  fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
	   (size + decl_chunk_size - 1) / decl_chunk_size);
  object_size = size;
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
	       !TREE_PUBLIC (decl) ? ""
	       : DECL_WEAK (decl) ? ".weak" : ".visible",
	       section, DECL_ALIGN (decl) / BITS_PER_UNIT,
	       decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
	fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
		 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
	object_finished = true;
      object_size = size;
    }
}

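/* For example (illustrative only), a file-scope
   "int arr[2] = { 3, 4 };" would be announced as roughly

     // BEGIN GLOBAL VAR DEF: arr
     .visible .global .align 4 .u32 arr[2] = { 3, 4 };

   where the "= { 3, 4 }" part is supplied by the chunked initializer
   machinery above.  */
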
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;

  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  const char *section = nvptx_section_for_decl (decl);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
	     nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				       false));

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_fn_proto_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
	       nvptx_ptx_type_from_mode (mode, false), argno,
	       mode == QImode || mode == HImode ? "[1]" : "");
      fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
	       nvptx_ptx_type_from_mode (mode, false), argno,
	       REGNO (t));
    }

  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%%retval_in), ");

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open,
	       reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  return result != NULL_RTX ? "\tld.param%t0\t%0, [%%retval_in];\n\t}" : "}";
}

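/* Sketch of the emitted sequence for a direct call "r = foo (a)" with
   32-bit values (illustrative; register names depend on numbering):

     {
       .param.u32 %retval_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r22;
       call (%retval_in), foo, (%out_arg1);
       ld.param.u32 %r23, [%retval_in];
     }
*/
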
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

static void nvptx_print_operand (FILE *, rtx, int);

/* Subroutine of nvptx_print_operand; used to print a memory reference
   X to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

1915/* Print an operand, X, to FILE, with an optional modifier in CODE.
1916
1917 Meaning of CODE:
1918   . -- print the predicate for the instruction or an empty string for an
1919 unconditional one.
1920 # -- print a rounding mode for the instruction
1921
1922 A -- print an address space identifier for a MEM
1923 c -- print an opcode suffix for a comparison operator, including a type code
738f2522 1924 f -- print a full reg even for something that must always be split
d88cd9c4 1925 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
1926 t -- print a type opcode suffix, promoting QImode to 32 bits
1927 T -- print a type size in bits
1928 u -- print a type opcode suffix without promotions. */
1929
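/* Some illustrative uses of these modifiers (hypothetical operands):
   "%c" on (eq:BI (reg:SI 23) (reg:SI 24)) prints ".eq" plus the
   operand's type suffix, e.g. ".eq.u32"; "%t" on a DImode register
   prints ".u64" while "%T" prints "64"; "%S" on (const_int 2) prints
   ".bfly".  */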
1930static void
1931nvptx_print_operand (FILE *file, rtx x, int code)
1932{
1933 rtx orig_x = x;
1934 machine_mode op_mode;
1935
1936 if (code == '.')
1937 {
1938 x = current_insn_predicate;
1939 if (x)
1940 {
1941 unsigned int regno = REGNO (XEXP (x, 0));
1942 fputs ("[", file);
1943 if (GET_CODE (x) == EQ)
1944 fputs ("!", file);
1945 fputs (reg_names [regno], file);
1946 fputs ("]", file);
1947 }
1948 return;
1949 }
1950 else if (code == '#')
1951 {
1952 fputs (".rn", file);
1953 return;
1954 }
1955
1956 enum rtx_code x_code = GET_CODE (x);
1957
1958 switch (code)
1959 {
1960 case 'A':
1961 {
7b8edc29
NS
1962 addr_space_t as = ADDR_SPACE_GENERIC;
1963 rtx sym = XEXP (x, 0);
1964
1965 if (GET_CODE (sym) == CONST)
1966 sym = XEXP (sym, 0);
1967 if (GET_CODE (sym) == PLUS)
1968 sym = XEXP (sym, 0);
1969
1970 if (GET_CODE (sym) == SYMBOL_REF)
1971 as = nvptx_addr_space_from_sym (sym);
1972
738f2522
BS
1973 fputs (nvptx_section_from_addr_space (as), file);
1974 }
1975 break;
1976
738f2522
BS
1977 case 't':
1978 op_mode = nvptx_underlying_object_mode (x);
1979 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
1980 break;
1981
1982 case 'u':
1983 op_mode = nvptx_underlying_object_mode (x);
1984 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
1985 break;
1986
d88cd9c4
NS
1987 case 'S':
1988 {
1989 unsigned kind = UINTVAL (x);
1990 static const char *const kinds[] =
1991 {"up", "down", "bfly", "idx"};
1992 fprintf (file, ".%s", kinds[kind]);
1993 }
1994 break;
1995
738f2522
BS
1996 case 'T':
1997 fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
1998 break;
1999
2000 case 'j':
2001 fprintf (file, "@");
2002 goto common;
2003
2004 case 'J':
2005 fprintf (file, "@!");
2006 goto common;
2007
2008 case 'c':
2009 op_mode = GET_MODE (XEXP (x, 0));
2010 switch (x_code)
2011 {
2012 case EQ:
2013 fputs (".eq", file);
2014 break;
2015 case NE:
2016 if (FLOAT_MODE_P (op_mode))
2017 fputs (".neu", file);
2018 else
2019 fputs (".ne", file);
2020 break;
2021 case LE:
2022 fputs (".le", file);
2023 break;
2024 case GE:
2025 fputs (".ge", file);
2026 break;
2027 case LT:
2028 fputs (".lt", file);
2029 break;
2030 case GT:
2031 fputs (".gt", file);
2032 break;
2033 case LEU:
2034 fputs (".ls", file);
2035 break;
2036 case GEU:
2037 fputs (".hs", file);
2038 break;
2039 case LTU:
2040 fputs (".lo", file);
2041 break;
2042 case GTU:
2043 fputs (".hi", file);
2044 break;
2045 case LTGT:
2046 fputs (".ne", file);
2047 break;
2048 case UNEQ:
2049 fputs (".equ", file);
2050 break;
2051 case UNLE:
2052 fputs (".leu", file);
2053 break;
2054 case UNGE:
2055 fputs (".geu", file);
2056 break;
2057 case UNLT:
2058 fputs (".ltu", file);
2059 break;
2060 case UNGT:
2061 fputs (".gtu", file);
2062 break;
2063 case UNORDERED:
2064 fputs (".nan", file);
2065 break;
2066 case ORDERED:
2067 fputs (".num", file);
2068 break;
2069 default:
2070 gcc_unreachable ();
2071 }
2072 if (FLOAT_MODE_P (op_mode)
2073 || x_code == EQ || x_code == NE
2074 || x_code == GEU || x_code == GTU
2075 || x_code == LEU || x_code == LTU)
2076 fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
2077 else
2078 fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
2079 break;
2080 default:
2081 common:
2082 switch (x_code)
2083 {
2084 case SUBREG:
2085 x = SUBREG_REG (x);
2086 /* fall through */
2087
2088 case REG:
2089 if (HARD_REGISTER_P (x))
2090 fprintf (file, "%s", reg_names[REGNO (x)]);
2091 else
2092 fprintf (file, "%%r%d", REGNO (x));
d7479262 2093 if (code != 'f' && maybe_split_mode (GET_MODE (x)) != VOIDmode)
738f2522
BS
2094 {
2095 gcc_assert (GET_CODE (orig_x) == SUBREG
d7479262 2096 && maybe_split_mode (GET_MODE (orig_x)) == VOIDmode);
738f2522
BS
2097 fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
2098 }
2099 break;
2100
2101 case MEM:
2102 fputc ('[', file);
2103 nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
2104 fputc (']', file);
2105 break;
2106
2107 case CONST_INT:
2108 output_addr_const (file, x);
2109 break;
2110
2111 case CONST:
2112 case SYMBOL_REF:
2113 case LABEL_REF:
2114 /* We could use output_addr_const, but that can print things like
2115 "x-8", which breaks ptxas. Need to ensure it is output as
2116 "x+-8". */
2117 nvptx_print_address_operand (file, x, VOIDmode);
2118 break;
2119
2120 case CONST_DOUBLE:
2121 long vals[2];
34a72c33 2122 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
738f2522
BS
2123 vals[0] &= 0xffffffff;
2124 vals[1] &= 0xffffffff;
2125 if (GET_MODE (x) == SFmode)
2126 fprintf (file, "0f%08lx", vals[0]);
2127 else
2128 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2129 break;
2130
2131 default:
2132 output_addr_const (file, x);
2133 }
2134 }
2135}
2136\f
2137/* Record replacement regs used to deal with subreg operands. */
2138struct reg_replace
2139{
2140 rtx replacement[MAX_RECOG_OPERANDS];
2141 machine_mode mode;
2142 int n_allocated;
2143 int n_in_use;
2144};
2145
2146/* Allocate or reuse a replacement in R and return the rtx. */
2147
2148static rtx
2149get_replacement (struct reg_replace *r)
2150{
2151 if (r->n_allocated == r->n_in_use)
2152 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2153 return r->replacement[r->n_in_use++];
2154}
2155
2156/* Clean up subreg operands. In ptx assembly, everything is typed, and
2157 the presence of subregs would break the rules for most instructions.
2158 Replace them with a suitable new register of the right size, plus
2159 conversion copyin/copyout instructions. */
2160
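/* For illustration (hypothetical register numbers): an input operand
   (subreg:SI (reg:QI 23) 0) is replaced by a fresh SImode register
   %r24, with a copy-in

	(set (reg:SI 24) (zero_extend:SI (reg:QI 23)))

   emitted before the insn; if the operand is also written, a copy-out

	(set (reg:QI 23) (truncate:QI (reg:SI 24)))

   is emitted after it.  */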
2161static void
517665b3 2162nvptx_reorg_subreg (void)
738f2522
BS
2163{
2164 struct reg_replace qiregs, hiregs, siregs, diregs;
2165 rtx_insn *insn, *next;
2166
738f2522
BS
2167 qiregs.n_allocated = 0;
2168 hiregs.n_allocated = 0;
2169 siregs.n_allocated = 0;
2170 diregs.n_allocated = 0;
2171 qiregs.mode = QImode;
2172 hiregs.mode = HImode;
2173 siregs.mode = SImode;
2174 diregs.mode = DImode;
2175
2176 for (insn = get_insns (); insn; insn = next)
2177 {
2178 next = NEXT_INSN (insn);
2179 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2180 || asm_noperands (PATTERN (insn)) >= 0
738f2522
BS
2181 || GET_CODE (PATTERN (insn)) == USE
2182 || GET_CODE (PATTERN (insn)) == CLOBBER)
2183 continue;
f324806d 2184
738f2522
BS
2185 qiregs.n_in_use = 0;
2186 hiregs.n_in_use = 0;
2187 siregs.n_in_use = 0;
2188 diregs.n_in_use = 0;
2189 extract_insn (insn);
2190 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2191
738f2522
BS
2192 for (int i = 0; i < recog_data.n_operands; i++)
2193 {
2194 rtx op = recog_data.operand[i];
2195 if (GET_CODE (op) != SUBREG)
2196 continue;
2197
2198 rtx inner = SUBREG_REG (op);
2199
2200 machine_mode outer_mode = GET_MODE (op);
2201 machine_mode inner_mode = GET_MODE (inner);
2202 gcc_assert (s_ok);
2203 if (s_ok
2204 && (GET_MODE_PRECISION (inner_mode)
2205 >= GET_MODE_PRECISION (outer_mode)))
2206 continue;
2207 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2208 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2209 : outer_mode == HImode ? &hiregs
2210 : outer_mode == SImode ? &siregs
2211 : &diregs);
2212 rtx new_reg = get_replacement (r);
2213
2214 if (recog_data.operand_type[i] != OP_OUT)
2215 {
2216 enum rtx_code code;
2217 if (GET_MODE_PRECISION (inner_mode)
2218 < GET_MODE_PRECISION (outer_mode))
2219 code = ZERO_EXTEND;
2220 else
2221 code = TRUNCATE;
2222
f7df4a84 2223 rtx pat = gen_rtx_SET (new_reg,
738f2522
BS
2224 gen_rtx_fmt_e (code, outer_mode, inner));
2225 emit_insn_before (pat, insn);
2226 }
2227
2228 if (recog_data.operand_type[i] != OP_IN)
2229 {
2230 enum rtx_code code;
2231 if (GET_MODE_PRECISION (inner_mode)
2232 < GET_MODE_PRECISION (outer_mode))
2233 code = TRUNCATE;
2234 else
2235 code = ZERO_EXTEND;
2236
f7df4a84 2237 rtx pat = gen_rtx_SET (inner,
738f2522
BS
2238 gen_rtx_fmt_e (code, inner_mode, new_reg));
2239 emit_insn_after (pat, insn);
2240 }
2241 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2242 }
2243 }
517665b3 2244}
738f2522 2245
d2d47a28
NS
2246/* Loop structure of the function. The entire function is described as
2247 a NULL loop. */
d88cd9c4
NS
2248
2249struct parallel
2250{
2251 /* Parent parallel. */
2252 parallel *parent;
2253
2254 /* Next sibling parallel. */
2255 parallel *next;
2256
2257 /* First child parallel. */
2258 parallel *inner;
2259
2260 /* Partitioning mask of the parallel. */
2261 unsigned mask;
2262
2263 /* Partitioning used within inner parallels. */
2264 unsigned inner_mask;
2265
2266 /* Location of parallel forked and join. The forked is the first
2267     block in the parallel and the join is the first block after
2268     the partition. */
2269 basic_block forked_block;
2270 basic_block join_block;
2271
2272 rtx_insn *forked_insn;
2273 rtx_insn *join_insn;
2274
2275 rtx_insn *fork_insn;
2276 rtx_insn *joining_insn;
2277
2278 /* Basic blocks in this parallel, but not in child parallels. The
2279 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2280 blocks are not. */
2281 auto_vec<basic_block> blocks;
2282
2283public:
2284 parallel (parallel *parent, unsigned mode);
2285 ~parallel ();
2286};
2287
2288/* Constructor links the new parallel into its parent's chain of
2289 children. */
2290
2291parallel::parallel (parallel *parent_, unsigned mask_)
2292 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2293{
2294 forked_block = join_block = 0;
2295 forked_insn = join_insn = 0;
2296 fork_insn = joining_insn = 0;
2297
2298 if (parent)
2299 {
2300 next = parent->inner;
2301 parent->inner = this;
2302 }
2303}
2304
2305parallel::~parallel ()
2306{
2307 delete inner;
2308 delete next;
2309}
2310
2311/* Map of basic blocks to insns.  */
2312typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2313
2314/* A tuple of an insn of interest and the BB in which it resides. */
2315typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2316typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2317
2318/* Split basic blocks such that each forked and join unspec is at
2319   the start of its basic block.  Thus afterwards each block will
2320   have a single partitioning mode.  We also do the same for return
2321   insns, as they are executed by every thread.  Populate MAP with
2322   head and tail blocks.  We also clear the BB visited flag, which
2323   is used when finding partitions.  */
2325
2326static void
2327nvptx_split_blocks (bb_insn_map_t *map)
2328{
2329 insn_bb_vec_t worklist;
2330 basic_block block;
2331 rtx_insn *insn;
2332
2333 /* Locate all the reorg instructions of interest. */
2334 FOR_ALL_BB_FN (block, cfun)
2335 {
2336 bool seen_insn = false;
2337
2338      /* Clear visited flag, for use by parallel locator.  */
2339 block->flags &= ~BB_VISITED;
2340
2341 FOR_BB_INSNS (block, insn)
2342 {
2343 if (!INSN_P (insn))
2344 continue;
2345 switch (recog_memoized (insn))
2346 {
2347 default:
2348 seen_insn = true;
2349 continue;
2350 case CODE_FOR_nvptx_forked:
2351 case CODE_FOR_nvptx_join:
2352 break;
2353
2354 case CODE_FOR_return:
2355 /* We also need to split just before return insns, as
2356 that insn needs executing by all threads, but the
2357 block it is in probably does not. */
2358 break;
2359 }
2360
2361 if (seen_insn)
2362 /* We've found an instruction that must be at the start of
2363 a block, but isn't. Add it to the worklist. */
2364 worklist.safe_push (insn_bb_t (insn, block));
2365 else
2366 /* It was already the first instruction. Just add it to
2367 the map. */
2368 map->get_or_insert (block) = insn;
2369 seen_insn = true;
2370 }
2371 }
2372
2373 /* Split blocks on the worklist. */
2374 unsigned ix;
2375 insn_bb_t *elt;
2376 basic_block remap = 0;
2377 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2378 {
2379 if (remap != elt->second)
2380 {
2381 block = elt->second;
2382 remap = block;
2383 }
2384
2385      /* Split block before insn.  The insn is in the new block.  */
2386 edge e = split_block (block, PREV_INSN (elt->first));
2387
2388 block = e->dest;
2389 map->get_or_insert (block) = elt->first;
2390 }
2391}
2392
2393/* BLOCK is a basic block containing a head or tail instruction.
2394 Locate the associated prehead or pretail instruction, which must be
2395 in the single predecessor block. */
2396
2397static rtx_insn *
2398nvptx_discover_pre (basic_block block, int expected)
2399{
2400 gcc_assert (block->preds->length () == 1);
2401 basic_block pre_block = (*block->preds)[0]->src;
2402 rtx_insn *pre_insn;
2403
2404 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2405 pre_insn = PREV_INSN (pre_insn))
2406 gcc_assert (pre_insn != BB_HEAD (pre_block));
2407
2408 gcc_assert (recog_memoized (pre_insn) == expected);
2409 return pre_insn;
2410}
2411
2412/* Dump this parallel and all its inner parallels. */
2413
2414static void
2415nvptx_dump_pars (parallel *par, unsigned depth)
2416{
2417 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2418 depth, par->mask,
2419 par->forked_block ? par->forked_block->index : -1,
2420 par->join_block ? par->join_block->index : -1);
2421
2422 fprintf (dump_file, " blocks:");
2423
2424 basic_block block;
2425 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2426 fprintf (dump_file, " %d", block->index);
2427 fprintf (dump_file, "\n");
2428 if (par->inner)
2429 nvptx_dump_pars (par->inner, depth + 1);
2430
2431 if (par->next)
2432 nvptx_dump_pars (par->next, depth);
2433}
2434
2435/* If BLOCK contains a fork/join marker, process it to create or
2436 terminate a loop structure. Add this block to the current loop,
2437 and then walk successor blocks. */
2438
2439static parallel *
2440nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2441{
2442 if (block->flags & BB_VISITED)
2443 return par;
2444 block->flags |= BB_VISITED;
2445
2446 if (rtx_insn **endp = map->get (block))
2447 {
2448 rtx_insn *end = *endp;
2449
2450 /* This is a block head or tail, or return instruction. */
2451 switch (recog_memoized (end))
2452 {
2453 case CODE_FOR_return:
2454 /* Return instructions are in their own block, and we
2455 don't need to do anything more. */
2456 return par;
2457
2458 case CODE_FOR_nvptx_forked:
2459 /* Loop head, create a new inner loop and add it into
2460 our parent's child list. */
2461 {
2462 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2463
2464 gcc_assert (mask);
2465 par = new parallel (par, mask);
2466 par->forked_block = block;
2467 par->forked_insn = end;
2468 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2469 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2470 par->fork_insn
2471 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2472 }
2473 break;
2474
2475 case CODE_FOR_nvptx_join:
2476 /* A loop tail. Finish the current loop and return to
2477 parent. */
2478 {
2479 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2480
2481 gcc_assert (par->mask == mask);
2482 par->join_block = block;
2483 par->join_insn = end;
2484 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2485 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2486 par->joining_insn
2487 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2488 par = par->parent;
2489 }
2490 break;
2491
2492 default:
2493 gcc_unreachable ();
2494 }
2495 }
2496
2497 if (par)
2498 /* Add this block onto the current loop's list of blocks. */
2499 par->blocks.safe_push (block);
2500 else
2501 /* This must be the entry block. Create a NULL parallel. */
2502 par = new parallel (0, 0);
2503
2504 /* Walk successor blocks. */
2505 edge e;
2506 edge_iterator ei;
2507
2508 FOR_EACH_EDGE (e, ei, block->succs)
2509 nvptx_find_par (map, par, e->dest);
2510
2511 return par;
2512}
2513
2514/* DFS walk the CFG looking for fork & join markers. Construct
2515 loop structures as we go. MAP is a mapping of basic blocks
2516 to head & tail markers, discovered when splitting blocks. This
2517 speeds up the discovery. We rely on the BB visited flag having
2518 been cleared when splitting blocks. */
2519
2520static parallel *
2521nvptx_discover_pars (bb_insn_map_t *map)
2522{
2523 basic_block block;
2524
2525 /* Mark exit blocks as visited. */
2526 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2527 block->flags |= BB_VISITED;
2528
2529 /* And entry block as not. */
2530 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2531 block->flags &= ~BB_VISITED;
2532
2533 parallel *par = nvptx_find_par (map, 0, block);
2534
2535 if (dump_file)
2536 {
2537 fprintf (dump_file, "\nLoops\n");
2538 nvptx_dump_pars (par, 0);
2539 fprintf (dump_file, "\n");
2540 }
2541
2542 return par;
2543}
2544
912442c2
NS
2545/* Analyse a group of BBs within a partitioned region and create N
2546 Single-Entry-Single-Exit regions. Some of those regions will be
2547 trivial ones consisting of a single BB. The blocks of a
2548 partitioned region might form a set of disjoint graphs -- because
2549   the region encloses a differently partitioned subregion.
2550
2551 We use the linear time algorithm described in 'Finding Regions Fast:
2552   Single Entry Single Exit and Control Regions in Linear Time'
2553 Johnson, Pearson & Pingali. That algorithm deals with complete
2554 CFGs, where a back edge is inserted from END to START, and thus the
2555 problem becomes one of finding equivalent loops.
2556
2557 In this case we have a partial CFG. We complete it by redirecting
2558 any incoming edge to the graph to be from an arbitrary external BB,
2559 and similarly redirecting any outgoing edge to be to that BB.
2560 Thus we end up with a closed graph.
2561
2562 The algorithm works by building a spanning tree of an undirected
2563 graph and keeping track of back edges from nodes further from the
2564 root in the tree to nodes nearer to the root in the tree. In the
2565 description below, the root is up and the tree grows downwards.
2566
2567 We avoid having to deal with degenerate back-edges to the same
2568 block, by splitting each BB into 3 -- one for input edges, one for
2569 the node itself and one for the output edges. Such back edges are
2570 referred to as 'Brackets'. Cycle equivalent nodes will have the
2571 same set of brackets.
2572
2573 Determining bracket equivalency is done by maintaining a list of
2574 brackets in such a manner that the list length and final bracket
2575 uniquely identify the set.
2576
2577 We use coloring to mark all BBs with cycle equivalency with the
2578 same color. This is the output of the 'Finding Regions Fast'
2579 algorithm. Notice it doesn't actually find the set of nodes within
2580   a particular region, just unordered sets of nodes that are the
2581 entries and exits of SESE regions.
2582
2583 After determining cycle equivalency, we need to find the minimal
2584 set of SESE regions. Do this with a DFS coloring walk of the
2585 complete graph. We're either 'looking' or 'coloring'. When
2586 looking, and we're in the subgraph, we start coloring the color of
2587 the current node, and remember that node as the start of the
2588 current color's SESE region. Every time we go to a new node, we
2589   decrement the count of nodes with that color.  If it reaches zero,
2590 we remember that node as the end of the current color's SESE region
2591 and return to 'looking'. Otherwise we color the node the current
2592 color.
2593
2594 This way we end up with coloring the inside of non-trivial SESE
2595 regions with the color of that region. */
2596
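/* A worked example of the final coloring walk (hypothetical blocks):
   if B1, B2, B3 and B4 all received cycle-equivalence color C, so
   color_counts[C] == 4, then a DFS arriving at B1 while 'looking'
   records B1 as the start of region C and begins coloring; the count
   drops to 3, 2, 1 across B1, B2 and B3, and on reaching B4 it hits
   zero, so B4 is recorded as the region's end and the walk returns
   to 'looking'.  */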
2597/* A pair of BBs. We use this to represent SESE regions. */
2598typedef std::pair<basic_block, basic_block> bb_pair_t;
2599typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2600
2601/* A node in the undirected CFG. The discriminator SECOND indicates just
2602   above or just below the BB indicated by FIRST.  */
2603typedef std::pair<basic_block, int> pseudo_node_t;
2604
2605/* A bracket indicates an edge towards the root of the spanning tree of the
2606 undirected graph. Each bracket has a color, determined
2607   from the current set of brackets.  */
2608struct bracket
2609{
2610 pseudo_node_t back; /* Back target */
2611
2612 /* Current color and size of set. */
2613 unsigned color;
2614 unsigned size;
2615
2616 bracket (pseudo_node_t back_)
2617 : back (back_), color (~0u), size (~0u)
2618 {
2619 }
2620
2621 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2622 {
2623 if (length != size)
2624 {
2625 size = length;
2626 color = color_counts.length ();
2627 color_counts.quick_push (0);
2628 }
2629 color_counts[color]++;
2630 return color;
2631 }
2632};
2633
2634typedef auto_vec<bracket> bracket_vec_t;
2635
2636/* Basic block info for finding SESE regions. */
2637
2638struct bb_sese
2639{
2640 int node; /* Node number in spanning tree. */
2641 int parent; /* Parent node number. */
2642
2643 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2644 edges arrive at pseudo-node Ai and the outgoing edges leave at
2645 pseudo-node Ao. We have to remember which way we arrived at a
2646 particular node when generating the spanning tree. dir > 0 means
2647 we arrived at Ai, dir < 0 means we arrived at Ao. */
2648 int dir;
2649
2650  /* Lowest numbered pseudo-node reached via a backedge from this
2651 node, or any descendant. */
2652 pseudo_node_t high;
2653
2654 int color; /* Cycle-equivalence color */
2655
2656 /* Stack of brackets for this node. */
2657 bracket_vec_t brackets;
2658
2659 bb_sese (unsigned node_, unsigned p, int dir_)
2660 :node (node_), parent (p), dir (dir_)
2661 {
2662 }
2663 ~bb_sese ();
2664
2665 /* Push a bracket ending at BACK. */
2666 void push (const pseudo_node_t &back)
2667 {
2668 if (dump_file)
2669 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2670 back.first ? back.first->index : 0, back.second);
2671 brackets.safe_push (bracket (back));
2672 }
2673
2674 void append (bb_sese *child);
2675 void remove (const pseudo_node_t &);
2676
2677 /* Set node's color. */
2678 void set_color (auto_vec<unsigned> &color_counts)
2679 {
2680 color = brackets.last ().get_color (color_counts, brackets.length ());
2681 }
2682};
2683
2684bb_sese::~bb_sese ()
2685{
2686}
2687
2688/* Destructively append CHILD's brackets. */
2689
2690void
2691bb_sese::append (bb_sese *child)
2692{
2693 if (int len = child->brackets.length ())
2694 {
2695 int ix;
2696
2697 if (dump_file)
2698 {
2699 for (ix = 0; ix < len; ix++)
2700 {
2701 const pseudo_node_t &pseudo = child->brackets[ix].back;
2702 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2703 child->node, pseudo.first ? pseudo.first->index : 0,
2704 pseudo.second);
2705 }
2706 }
2707 if (!brackets.length ())
2708 std::swap (brackets, child->brackets);
2709 else
2710 {
2711 brackets.reserve (len);
2712 for (ix = 0; ix < len; ix++)
2713 brackets.quick_push (child->brackets[ix]);
2714 }
2715 }
2716}
2717
2718/* Remove brackets that terminate at PSEUDO. */
2719
2720void
2721bb_sese::remove (const pseudo_node_t &pseudo)
2722{
2723 unsigned removed = 0;
2724 int len = brackets.length ();
2725
2726 for (int ix = 0; ix < len; ix++)
2727 {
2728 if (brackets[ix].back == pseudo)
2729 {
2730 if (dump_file)
2731 fprintf (dump_file, "Removing backedge %d:%+d\n",
2732 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2733 removed++;
2734 }
2735 else if (removed)
2736 brackets[ix-removed] = brackets[ix];
2737 }
2738 while (removed--)
2739 brackets.pop ();
2740}
2741
2742/* Accessors for BB's aux pointer. */
2743#define BB_SET_SESE(B, S) ((B)->aux = (S))
2744#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2745
2746/* DFS walk creating SESE data structures. Only cover nodes with
2747 BB_VISITED set. Append discovered blocks to LIST. We number in
2748 increments of 3 so that the above and below pseudo nodes can be
2749 implicitly numbered too. */
2750
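/* For example, the blocks visited by this walk are numbered 2, 5,
   8, ...; the pseudo-nodes just above and just below the block
   numbered N are then implicitly N-1 and N+1 (which is which depends
   on the orientation in which the block was reached).  */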
2751static int
2752nvptx_sese_number (int n, int p, int dir, basic_block b,
2753 auto_vec<basic_block> *list)
2754{
2755 if (BB_GET_SESE (b))
2756 return n;
2757
2758 if (dump_file)
2759 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2760 b->index, n, p, dir);
2761
2762 BB_SET_SESE (b, new bb_sese (n, p, dir));
2763 p = n;
2764
2765 n += 3;
2766 list->quick_push (b);
2767
2768 /* First walk the nodes on the 'other side' of this node, then walk
2769 the nodes on the same side. */
2770 for (unsigned ix = 2; ix; ix--)
2771 {
2772 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2773 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2774 : offsetof (edge_def, src));
2775 edge e;
2776      edge_iterator ei;
2777
2778 FOR_EACH_EDGE (e, ei, edges)
2779 {
2780 basic_block target = *(basic_block *)((char *)e + offset);
2781
2782 if (target->flags & BB_VISITED)
2783 n = nvptx_sese_number (n, p, dir, target, list);
2784 }
2785 dir = -dir;
2786 }
2787 return n;
2788}
2789
2790/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2791 EDGES are the outgoing edges and OFFSET is the offset to the src
2792 or dst block on the edges. */
2793
2794static void
2795nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2796 vec<edge, va_gc> *edges, size_t offset)
2797{
2798 edge e;
2799  edge_iterator ei;
2800 int hi_back = depth;
2801 pseudo_node_t node_back (0, depth);
2802 int hi_child = depth;
2803 pseudo_node_t node_child (0, depth);
2804 basic_block child = NULL;
2805 unsigned num_children = 0;
2806 int usd = -dir * sese->dir;
2807
2808 if (dump_file)
2809 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2810 me->index, sese->node, dir);
2811
2812 if (dir < 0)
2813 {
2814 /* This is the above pseudo-child. It has the BB itself as an
2815 additional child node. */
2816 node_child = sese->high;
2817 hi_child = node_child.second;
2818 if (node_child.first)
2819 hi_child += BB_GET_SESE (node_child.first)->node;
2820 num_children++;
2821 }
2822
2823 /* Examine each edge.
2824 - if it is a child (a) append its bracket list and (b) record
2825 whether it is the child with the highest reaching bracket.
2826 - if it is an edge to ancestor, record whether it's the highest
2827 reaching backlink. */
2828 FOR_EACH_EDGE (e, ei, edges)
2829 {
2830 basic_block target = *(basic_block *)((char *)e + offset);
2831
2832 if (bb_sese *t_sese = BB_GET_SESE (target))
2833 {
2834 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2835 {
2836 /* Child node. Append its bracket list. */
2837 num_children++;
2838 sese->append (t_sese);
2839
2840	      /* Compare its hi value.  */
2841 int t_hi = t_sese->high.second;
2842
2843 if (basic_block child_hi_block = t_sese->high.first)
2844 t_hi += BB_GET_SESE (child_hi_block)->node;
2845
2846 if (hi_child > t_hi)
2847 {
2848 hi_child = t_hi;
2849 node_child = t_sese->high;
2850 child = target;
2851 }
2852 }
2853 else if (t_sese->node < sese->node + dir
2854 && !(dir < 0 && sese->parent == t_sese->node))
2855 {
2856 /* Non-parental ancestor node -- a backlink. */
2857 int d = usd * t_sese->dir;
2858 int back = t_sese->node + d;
2859
2860 if (hi_back > back)
2861 {
2862 hi_back = back;
2863 node_back = pseudo_node_t (target, d);
2864 }
2865 }
2866 }
2867 else
2868 { /* Fallen off graph, backlink to entry node. */
2869 hi_back = 0;
2870 node_back = pseudo_node_t (0, 0);
2871 }
2872 }
2873
2874 /* Remove any brackets that terminate at this pseudo node. */
2875 sese->remove (pseudo_node_t (me, dir));
2876
2877 /* Now push any backlinks from this pseudo node. */
2878 FOR_EACH_EDGE (e, ei, edges)
2879 {
2880 basic_block target = *(basic_block *)((char *)e + offset);
2881 if (bb_sese *t_sese = BB_GET_SESE (target))
2882 {
2883 if (t_sese->node < sese->node + dir
2884 && !(dir < 0 && sese->parent == t_sese->node))
2885 /* Non-parental ancestor node - backedge from me. */
2886 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2887 }
2888 else
2889 {
2890	  /* Back edge to entry node.  */
2891 sese->push (pseudo_node_t (0, 0));
2892 }
2893 }
2894
2895 /* If this node leads directly or indirectly to a no-return region of
2896 the graph, then fake a backedge to entry node. */
2897 if (!sese->brackets.length () || !edges || !edges->length ())
2898 {
2899 hi_back = 0;
2900 node_back = pseudo_node_t (0, 0);
2901 sese->push (node_back);
2902 }
2903
2904 /* Record the highest reaching backedge from us or a descendant. */
2905 sese->high = hi_back < hi_child ? node_back : node_child;
2906
2907 if (num_children > 1)
2908 {
2909 /* There is more than one child -- this is a Y shaped piece of
2910 spanning tree. We have to insert a fake backedge from this
2911 node to the highest ancestor reached by not-the-highest
2912 reaching child. Note that there may be multiple children
2913 with backedges to the same highest node. That's ok and we
2914 insert the edge to that highest node. */
2915 hi_child = depth;
2916 if (dir < 0 && child)
2917 {
2918 node_child = sese->high;
2919 hi_child = node_child.second;
2920 if (node_child.first)
2921 hi_child += BB_GET_SESE (node_child.first)->node;
2922 }
2923
2924 FOR_EACH_EDGE (e, ei, edges)
2925 {
2926 basic_block target = *(basic_block *)((char *)e + offset);
2927
2928 if (target == child)
2929 /* Ignore the highest child. */
2930 continue;
2931
2932 bb_sese *t_sese = BB_GET_SESE (target);
2933 if (!t_sese)
2934 continue;
2935 if (t_sese->parent != sese->node)
2936 /* Not a child. */
2937 continue;
2938
2939 /* Compare its hi value. */
2940 int t_hi = t_sese->high.second;
2941
2942 if (basic_block child_hi_block = t_sese->high.first)
2943 t_hi += BB_GET_SESE (child_hi_block)->node;
2944
2945 if (hi_child > t_hi)
2946 {
2947 hi_child = t_hi;
2948 node_child = t_sese->high;
2949 }
2950 }
2951
2952 sese->push (node_child);
2953 }
2954}
2955
2956
2957/* DFS walk of BB graph. Color node BLOCK according to COLORING then
2958 proceed to successors. Set SESE entry and exit nodes of
2959 REGIONS. */
2960
2961static void
2962nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2963 basic_block block, int coloring)
2964{
2965 bb_sese *sese = BB_GET_SESE (block);
2966
2967 if (block->flags & BB_VISITED)
2968 {
2969 /* If we've already encountered this block, either we must not
2970 be coloring, or it must have been colored the current color. */
2971 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2972 return;
2973 }
2974
2975 block->flags |= BB_VISITED;
2976
2977 if (sese)
2978 {
2979 if (coloring < 0)
2980 {
2981 /* Start coloring a region. */
2982 regions[sese->color].first = block;
2983 coloring = sese->color;
2984 }
2985
2986 if (!--color_counts[sese->color] && sese->color == coloring)
2987 {
2988 /* Found final block of SESE region. */
2989 regions[sese->color].second = block;
2990 coloring = -1;
2991 }
2992 else
2993 /* Color the node, so we can assert on revisiting the node
2994 that the graph is indeed SESE. */
2995 sese->color = coloring;
2996 }
2997 else
2998 /* Fallen off the subgraph, we cannot be coloring. */
2999 gcc_assert (coloring < 0);
3000
3001 /* Walk each successor block. */
3002 if (block->succs && block->succs->length ())
3003 {
3004 edge e;
3005 edge_iterator ei;
3006
3007 FOR_EACH_EDGE (e, ei, block->succs)
3008 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3009 }
3010 else
3011 gcc_assert (coloring < 0);
3012}
3013
3014/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3015 end up with NULL entries in it. */
3016
3017static void
3018nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3019{
3020 basic_block block;
3021 int ix;
3022
3023 /* First clear each BB of the whole function. */
3024 FOR_EACH_BB_FN (block, cfun)
3025 {
3026 block->flags &= ~BB_VISITED;
3027 BB_SET_SESE (block, 0);
3028 }
3029 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3030 block->flags &= ~BB_VISITED;
3031 BB_SET_SESE (block, 0);
3032 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3033 block->flags &= ~BB_VISITED;
3034 BB_SET_SESE (block, 0);
3035
3036 /* Mark blocks in the function that are in this graph. */
3037 for (ix = 0; blocks.iterate (ix, &block); ix++)
3038 block->flags |= BB_VISITED;
3039
3040 /* Counts of nodes assigned to each color. There cannot be more
3041 colors than blocks (and hopefully there will be fewer). */
3042 auto_vec<unsigned> color_counts;
3043 color_counts.reserve (blocks.length ());
3044
3045 /* Worklist of nodes in the spanning tree. Again, there cannot be
3046 more nodes in the tree than blocks (there will be fewer if the
3047 CFG of blocks is disjoint). */
3048 auto_vec<basic_block> spanlist;
3049 spanlist.reserve (blocks.length ());
3050
3051 /* Make sure every block has its cycle class determined. */
3052 for (ix = 0; blocks.iterate (ix, &block); ix++)
3053 {
3054 if (BB_GET_SESE (block))
3055 /* We already met this block in an earlier graph solve. */
3056 continue;
3057
3058 if (dump_file)
3059 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3060
3061      /* Number the nodes reachable from block in initial DFS order.  */
3062 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3063
3064 /* Now walk in reverse DFS order to find cycle equivalents. */
3065 while (spanlist.length ())
3066 {
3067 block = spanlist.pop ();
3068 bb_sese *sese = BB_GET_SESE (block);
3069
3070 /* Do the pseudo node below. */
3071 nvptx_sese_pseudo (block, sese, depth, +1,
3072 sese->dir > 0 ? block->succs : block->preds,
3073 (sese->dir > 0 ? offsetof (edge_def, dest)
3074 : offsetof (edge_def, src)));
3075 sese->set_color (color_counts);
3076 /* Do the pseudo node above. */
3077 nvptx_sese_pseudo (block, sese, depth, -1,
3078 sese->dir < 0 ? block->succs : block->preds,
3079 (sese->dir < 0 ? offsetof (edge_def, dest)
3080 : offsetof (edge_def, src)));
3081 }
3082 if (dump_file)
3083 fprintf (dump_file, "\n");
3084 }
3085
3086 if (dump_file)
3087 {
3088 unsigned count;
3089 const char *comma = "";
3090
3091 fprintf (dump_file, "Found %d cycle equivalents\n",
3092 color_counts.length ());
3093 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3094 {
3095 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3096
3097 comma = "";
3098 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3099 if (BB_GET_SESE (block)->color == ix)
3100 {
3101 block->flags |= BB_VISITED;
3102 fprintf (dump_file, "%s%d", comma, block->index);
3103	      comma = ",";
3104 }
3105 fprintf (dump_file, "}");
3106 comma = ", ";
3107 }
3108 fprintf (dump_file, "\n");
3109 }
3110
3111 /* Now we've colored every block in the subgraph. We now need to
3112 determine the minimal set of SESE regions that cover that
3113 subgraph. Do this with a DFS walk of the complete function.
3114 During the walk we're either 'looking' or 'coloring'. When we
3115 reach the last node of a particular color, we stop coloring and
3116 return to looking. */
3117
3118 /* There cannot be more SESE regions than colors. */
3119 regions.reserve (color_counts.length ());
3120 for (ix = color_counts.length (); ix--;)
3121 regions.quick_push (bb_pair_t (0, 0));
3122
3123 for (ix = 0; blocks.iterate (ix, &block); ix++)
3124 block->flags &= ~BB_VISITED;
3125
3126 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3127
3128 if (dump_file)
3129 {
3130 const char *comma = "";
3131 int len = regions.length ();
3132
3133 fprintf (dump_file, "SESE regions:");
3134 for (ix = 0; ix != len; ix++)
3135 {
3136 basic_block from = regions[ix].first;
3137 basic_block to = regions[ix].second;
3138
3139 if (from)
3140 {
3141 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3142 if (to != from)
3143 fprintf (dump_file, "->%d", to->index);
3144
3145 int color = BB_GET_SESE (from)->color;
3146
3147 /* Print the blocks within the region (excluding ends). */
3148 FOR_EACH_BB_FN (block, cfun)
3149 {
3150 bb_sese *sese = BB_GET_SESE (block);
3151
3152 if (sese && sese->color == color
3153 && block != from && block != to)
3154 fprintf (dump_file, ".%d", block->index);
3155 }
3156 fprintf (dump_file, "}");
3157 }
3158 comma = ",";
3159 }
3160 fprintf (dump_file, "\n\n");
3161 }
3162
3163 for (ix = 0; blocks.iterate (ix, &block); ix++)
3164 delete BB_GET_SESE (block);
3165}
3166
3167#undef BB_SET_SESE
3168#undef BB_GET_SESE
3169
d88cd9c4
NS
3170/* Propagate live state at the start of a partitioned region. BLOCK
3171 provides the live register information, and might not contain
3172   INSN.  Propagation is inserted just after INSN.  RW indicates
3173   whether we are reading and/or writing state.  This
3174   separation is needed for worker-level propagation, where we
3175   essentially do a spill & fill.  FN is the underlying worker
3176   function generating the propagation instructions for a single
3177   register.  DATA is user data.
3178
3179 We propagate the live register set and the entire frame. We could
3180 do better by (a) propagating just the live set that is used within
3181 the partitioned regions and (b) only propagating stack entries that
3182 are used. The latter might be quite hard to determine. */
3183
3184typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3185
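/* The frame-copy sequence emitted below has roughly this shape
   (illustrative pseudo-RTL, not actual output; FS is the frame size
   in DImode words, and a one-word frame skips the loop machinery):

	ptr = frame_pointer
	idx = FS
	<fn: PM_loop_begin setup>
    loop:
	idx = idx - 1
	tmp = [ptr]		// if RW includes PM_read
	<fn: propagate tmp>
	[ptr] = tmp		// if RW includes PM_write
	pred = idx != 0
	ptr = ptr + 8
	if (pred) goto loop
	<fn: PM_loop_end teardown>  */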
3186static void
3187nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3188 propagator_fn fn, void *data)
3189{
3190 bitmap live = DF_LIVE_IN (block);
3191 bitmap_iterator iterator;
3192 unsigned ix;
3193
3194 /* Copy the frame array. */
3195 HOST_WIDE_INT fs = get_frame_size ();
3196 if (fs)
3197 {
3198 rtx tmp = gen_reg_rtx (DImode);
3199 rtx idx = NULL_RTX;
3200 rtx ptr = gen_reg_rtx (Pmode);
3201 rtx pred = NULL_RTX;
3202 rtx_code_label *label = NULL;
3203
3204 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3205 fs /= GET_MODE_SIZE (DImode);
3206 /* Detect single iteration loop. */
3207 if (fs == 1)
3208 fs = 0;
3209
3210 start_sequence ();
3211 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3212 if (fs)
3213 {
3214 idx = gen_reg_rtx (SImode);
3215 pred = gen_reg_rtx (BImode);
3216 label = gen_label_rtx ();
3217
3218 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3219 /* Allow worker function to initialize anything needed. */
3220 rtx init = fn (tmp, PM_loop_begin, fs, data);
3221 if (init)
3222 emit_insn (init);
3223 emit_label (label);
3224 LABEL_NUSES (label)++;
3225 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3226 }
3227 if (rw & PM_read)
3228 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3229 emit_insn (fn (tmp, rw, fs, data));
3230 if (rw & PM_write)
3231 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3232 if (fs)
3233 {
3234 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3235 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3236 emit_insn (gen_br_true_uni (pred, label));
3237 rtx fini = fn (tmp, PM_loop_end, fs, data);
3238 if (fini)
3239 emit_insn (fini);
3240 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3241 }
3242 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3243 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3244 rtx cpy = get_insns ();
3245 end_sequence ();
3246 insn = emit_insn_after (cpy, insn);
3247 }
3248
3249 /* Copy live registers. */
3250 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3251 {
3252 rtx reg = regno_reg_rtx[ix];
3253
3254 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3255 {
3256 rtx bcast = fn (reg, rw, 0, data);
3257
3258 insn = emit_insn_after (bcast, insn);
3259 }
3260 }
3261}
3262
3263/* Worker for nvptx_vpropagate. */
3264
3265static rtx
3266vprop_gen (rtx reg, propagate_mask pm,
3267 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3268{
3269 if (!(pm & PM_read_write))
3270 return 0;
3271
3272 return nvptx_gen_vcast (reg);
3273}
3274
3275/* Propagate state that is live at start of BLOCK across the vectors
3276 of a single warp. Propagation is inserted just after INSN. */
3277
3278static void
3279nvptx_vpropagate (basic_block block, rtx_insn *insn)
3280{
3281 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3282}
3283
3284/* Worker for nvptx_wpropagate. */
3285
3286static rtx
3287wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3288{
3289 wcast_data_t *data = (wcast_data_t *)data_;
3290
3291 if (pm & PM_loop_begin)
3292 {
3293 /* Starting a loop, initialize pointer. */
3294 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3295
3296 if (align > worker_bcast_align)
3297 worker_bcast_align = align;
3298 data->offset = (data->offset + align - 1) & ~(align - 1);
3299
3300 data->ptr = gen_reg_rtx (Pmode);
3301
3302 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3303 }
3304 else if (pm & PM_loop_end)
3305 {
3306 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3307 data->ptr = NULL_RTX;
3308 return clobber;
3309 }
3310 else
3311 return nvptx_gen_wcast (reg, pm, rep, data);
3312}
3313
3314/* Spill or fill live state that is live at start of BLOCK. PRE_P
3315 indicates if this is just before partitioned mode (do spill), or
3316 just after it starts (do fill). Sequence is inserted just after
3317 INSN. */
3318
3319static void
3320nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3321{
3322 wcast_data_t data;
3323
3324 data.base = gen_reg_rtx (Pmode);
3325 data.offset = 0;
3326 data.ptr = NULL_RTX;
3327
3328 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3329 if (data.offset)
3330 {
3331 /* Stuff was emitted, initialize the base pointer now. */
3332 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3333 emit_insn_after (init, insn);
3334
3335 if (worker_bcast_size < data.offset)
3336 worker_bcast_size = data.offset;
3337 }
3338}
3339
3340/* Emit a worker-level synchronization barrier. We use different
3341 markers for before and after synchronizations. */
3342
3343static rtx
3344nvptx_wsync (bool after)
3345{
3346 return gen_nvptx_barsync (GEN_INT (after));
3347}
3348
3349/* Single neutering according to MASK. FROM is the incoming block and
3350 TO is the outgoing block. These may be the same block. Insert at
3351 start of FROM:
3352
3353 if (tid.<axis>) goto end.
3354
3355 and insert before ending branch of TO (if there is such an insn):
3356
3357 end:
3358 <possibly-broadcast-cond>
3359 <branch>
3360
3361   We currently only use different FROM and TO when skipping an entire
3362 loop. We could do more if we detected superblocks. */
3363
3364static void
3365nvptx_single (unsigned mask, basic_block from, basic_block to)
3366{
3367 rtx_insn *head = BB_HEAD (from);
3368 rtx_insn *tail = BB_END (to);
3369 unsigned skip_mask = mask;
3370
3371  /* Find first insn of FROM block.  */
3372 while (head != BB_END (from) && !INSN_P (head))
3373 head = NEXT_INSN (head);
3374
3375  /* Find last insn of TO block.  */
3376 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3377 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3378 tail = PREV_INSN (tail);
3379
3380 /* Detect if tail is a branch. */
3381 rtx tail_branch = NULL_RTX;
3382 rtx cond_branch = NULL_RTX;
3383 if (tail && INSN_P (tail))
3384 {
3385 tail_branch = PATTERN (tail);
3386 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3387 tail_branch = NULL_RTX;
3388 else
3389 {
3390 cond_branch = SET_SRC (tail_branch);
3391 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3392 cond_branch = NULL_RTX;
3393 }
3394 }
3395
3396 if (tail == head)
3397 {
3398 /* If this is empty, do nothing. */
3399 if (!head || !INSN_P (head))
3400 return;
3401
3402 /* If this is a dummy insn, do nothing. */
3403 switch (recog_memoized (head))
3404 {
3405 default:
3406 break;
3407 case CODE_FOR_nvptx_fork:
3408 case CODE_FOR_nvptx_forked:
3409 case CODE_FOR_nvptx_joining:
3410 case CODE_FOR_nvptx_join:
3411 return;
3412 }
3413
3414 if (cond_branch)
3415 {
3416 /* If we're only doing vector single, there's no need to
3417 emit skip code because we'll not insert anything. */
3418 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3419 skip_mask = 0;
3420 }
3421 else if (tail_branch)
3422 /* Block with only unconditional branch. Nothing to do. */
3423 return;
3424 }
3425
3426 /* Insert the vector test inside the worker test. */
3427 unsigned mode;
3428 rtx_insn *before = tail;
3429 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3430 if (GOMP_DIM_MASK (mode) & skip_mask)
3431 {
3432 rtx_code_label *label = gen_label_rtx ();
3433 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3434
3435 if (!pred)
3436 {
3437 pred = gen_reg_rtx (BImode);
3438 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3439 }
3440
3441 rtx br;
3442 if (mode == GOMP_DIM_VECTOR)
3443 br = gen_br_true (pred, label);
3444 else
3445 br = gen_br_true_uni (pred, label);
3446 emit_insn_before (br, head);
3447
3448 LABEL_NUSES (label)++;
3449 if (tail_branch)
3450 before = emit_label_before (label, before);
3451 else
3452 emit_label_after (label, tail);
3453 }
3454
3455 /* Now deal with propagating the branch condition. */
3456 if (cond_branch)
3457 {
3458 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3459
3460 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3461 {
3462 /* Vector mode only, do a shuffle. */
3463 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3464 }
3465 else
3466 {
3467 /* Includes worker mode, do spill & fill. By construction
3468 we should never have worker mode only. */
3469 wcast_data_t data;
3470
3471 data.base = worker_bcast_sym;
3472 data.ptr = 0;
3473
3474 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3475 worker_bcast_size = GET_MODE_SIZE (SImode);
3476
3477 data.offset = 0;
3478 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3479 before);
3480 /* Barrier so other workers can see the write. */
3481 emit_insn_before (nvptx_wsync (false), tail);
3482 data.offset = 0;
3483 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3484 /* This barrier is needed to avoid worker zero clobbering
3485 the broadcast buffer before all the other workers have
3486 had a chance to read this instance of it. */
3487 emit_insn_before (nvptx_wsync (true), tail);
3488 }
3489
3490 extract_insn (tail);
3491 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3492 UNSPEC_BR_UNIFIED);
3493 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3494 }
3495}
3496
3497/* PAR is a parallel that is being skipped in its entirety according to
3498 MASK. Treat this as skipping a superblock starting at forked
3499 and ending at joining. */
3500
3501static void
3502nvptx_skip_par (unsigned mask, parallel *par)
3503{
3504 basic_block tail = par->join_block;
3505 gcc_assert (tail->preds->length () == 1);
3506
3507 basic_block pre_tail = (*tail->preds)[0]->src;
3508 gcc_assert (pre_tail->succs->length () == 1);
3509
3510 nvptx_single (mask, par->forked_block, pre_tail);
3511}
3512
dba619f3
NS
3513/* If PAR has a single inner parallel and PAR itself only contains
3514 empty entry and exit blocks, swallow the inner PAR. */
3515
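/* For example (an illustrative shape, not taken from real code): a
   worker-partitioned par whose only blocks are the forked and joining
   blocks wrapping a single vector-partitioned inner par collapses
   into one worker+vector par, so both levels are then skipped around
   a single region rather than around two nested ones.  */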
3516static void
3517nvptx_optimize_inner (parallel *par)
3518{
3519 parallel *inner = par->inner;
3520
3521 /* We mustn't be the outer dummy par. */
3522 if (!par->mask)
3523 return;
3524
3525 /* We must have a single inner par. */
3526 if (!inner || inner->next)
3527 return;
3528
3529 /* We must only contain 2 blocks ourselves -- the head and tail of
3530 the inner par. */
3531 if (par->blocks.length () != 2)
3532 return;
3533
3534  /* The two partitionings must be disjoint.  As we only have vector and
3535     worker partitioning, this is sufficient to guarantee the pars
3536     have adjacent partitioning.  */
3537 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3538 /* This indicates malformed code generation. */
3539 return;
3540
3541 /* The outer forked insn should be immediately followed by the inner
3542 fork insn. */
3543 rtx_insn *forked = par->forked_insn;
3544 rtx_insn *fork = BB_END (par->forked_block);
3545
3546 if (NEXT_INSN (forked) != fork)
3547 return;
3548 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3549
3550 /* The outer joining insn must immediately follow the inner join
3551 insn. */
3552 rtx_insn *joining = par->joining_insn;
3553 rtx_insn *join = inner->join_insn;
3554 if (NEXT_INSN (join) != joining)
3555 return;
3556
3557 /* Preconditions met. Swallow the inner par. */
3558 if (dump_file)
3559 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3560 inner->mask, inner->forked_block->index,
3561 inner->join_block->index,
3562 par->mask, par->forked_block->index, par->join_block->index);
3563
3564 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3565
3566 par->blocks.reserve (inner->blocks.length ());
3567 while (inner->blocks.length ())
3568 par->blocks.quick_push (inner->blocks.pop ());
3569
3570 par->inner = inner->inner;
3571 inner->inner = NULL;
3572
3573 delete inner;
3574}
3575
d88cd9c4
NS
3576/* Process the parallel PAR and all its contained
3577 parallels. We do everything but the neutering. Return mask of
3578 partitioned modes used within this parallel. */
3579
3580static unsigned
3581nvptx_process_pars (parallel *par)
3582{
dba619f3
NS
3583 if (nvptx_optimize)
3584 nvptx_optimize_inner (par);
3585
d88cd9c4
NS
3586 unsigned inner_mask = par->mask;
3587
3588 /* Do the inner parallels first. */
3589 if (par->inner)
3590 {
3591 par->inner_mask = nvptx_process_pars (par->inner);
3592 inner_mask |= par->inner_mask;
3593 }
3594
3595 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3596 /* No propagation needed for a call. */;
5d306e55 3597 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
d88cd9c4
NS
3598 {
3599 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3600 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3601 /* Insert begin and end synchronizations. */
3602 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3603 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3604 }
3605 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3606 nvptx_vpropagate (par->forked_block, par->forked_insn);
3607
3608 /* Now do siblings. */
3609 if (par->next)
3610 inner_mask |= nvptx_process_pars (par->next);
3611 return inner_mask;
3612}
3613
3614/* Neuter the parallel described by PAR. We recurse in depth-first
3615 order. MODES are the partitioning of the execution and OUTER is
3616 the partitioning of the parallels we are contained in. */
3617
3618static void
3619nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3620{
3621 unsigned me = (par->mask
3622 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3623 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3624 unsigned skip_mask = 0, neuter_mask = 0;
3625
3626 if (par->inner)
3627 nvptx_neuter_pars (par->inner, modes, outer | me);
3628
3629 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3630 {
3631 if ((outer | me) & GOMP_DIM_MASK (mode))
3632 {} /* Mode is partitioned: no neutering. */
3633 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 3634 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
3635 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3636 || !par->forked_insn)
3637	/* Partitioned in inner parallels, or we're not partitioned
3638	   at all: neuter individual blocks.  */
3639 neuter_mask |= GOMP_DIM_MASK (mode);
3640 else if (!par->parent || !par->parent->forked_insn
3641 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3642	/* Parent isn't a parallel, or already contains this partitioning:
3643	   skip the parallel at this level.  */
3644 skip_mask |= GOMP_DIM_MASK (mode);
3645 else
3646 {} /* Parent will skip this parallel itself. */
3647 }
3648
3649 if (neuter_mask)
3650 {
912442c2 3651 int ix, len;
d88cd9c4 3652
912442c2
NS
3653 if (nvptx_optimize)
3654 {
3655 /* Neuter whole SESE regions. */
3656 bb_pair_vec_t regions;
3657
3658 nvptx_find_sese (par->blocks, regions);
3659 len = regions.length ();
3660 for (ix = 0; ix != len; ix++)
3661 {
3662 basic_block from = regions[ix].first;
3663 basic_block to = regions[ix].second;
3664
3665 if (from)
3666 nvptx_single (neuter_mask, from, to);
3667 else
3668 gcc_assert (!to);
3669 }
3670 }
3671 else
d88cd9c4 3672 {
912442c2
NS
3673 /* Neuter each BB individually. */
3674 len = par->blocks.length ();
3675 for (ix = 0; ix != len; ix++)
3676 {
3677 basic_block block = par->blocks[ix];
d88cd9c4 3678
912442c2
NS
3679 nvptx_single (neuter_mask, block, block);
3680 }
d88cd9c4
NS
3681 }
3682 }
3683
3684 if (skip_mask)
3685 nvptx_skip_par (skip_mask, par);
3686
3687 if (par->next)
3688 nvptx_neuter_pars (par->next, modes, outer);
3689}
3690
517665b3 3691/* PTX-specific reorganization
d88cd9c4 3692 - Split blocks at fork and join instructions
c38f0d8c
NS
3693 - Compute live registers
3694 - Mark now-unused registers, so function begin doesn't declare
517665b3 3695 unused registers.
d88cd9c4
NS
3696 - Insert state propagation when entering partitioned mode
3697 - Insert neutering instructions when in single mode
c38f0d8c 3698 - Replace subregs with suitable sequences.
517665b3
NS
3699*/
3700
3701static void
3702nvptx_reorg (void)
3703{
517665b3
NS
3704 /* We are freeing block_for_insn in the toplev to keep compatibility
3705 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3706 compute_bb_for_insn ();
3707
3708 thread_prologue_and_epilogue_insns ();
3709
d88cd9c4
NS
3710 /* Split blocks and record interesting unspecs. */
3711 bb_insn_map_t bb_insn_map;
3712
3713 nvptx_split_blocks (&bb_insn_map);
3714
c38f0d8c 3715  /* Compute live regs.  */
517665b3
NS
3716 df_clear_flags (DF_LR_RUN_DCE);
3717 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
d88cd9c4
NS
3718 df_live_add_problem ();
3719 df_live_set_all_dirty ();
517665b3 3720 df_analyze ();
738f2522
BS
3721 regstat_init_n_sets_and_refs ();
3722
d88cd9c4
NS
3723 if (dump_file)
3724 df_dump (dump_file);
3725
517665b3 3726 /* Mark unused regs as unused. */
d88cd9c4 3727 int max_regs = max_reg_num ();
517665b3 3728 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
738f2522
BS
3729 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3730 regno_reg_rtx[i] = const0_rtx;
517665b3 3731
d88cd9c4
NS
3732 /* Determine launch dimensions of the function. If it is not an
3733 offloaded function (i.e. this is a regular compiler), the
3734 function has no neutering. */
3735 tree attr = get_oacc_fn_attrib (current_function_decl);
3736 if (attr)
3737 {
3738 /* If we determined this mask before RTL expansion, we could
3739 elide emission of some levels of forks and joins. */
3740 unsigned mask = 0;
3741 tree dims = TREE_VALUE (attr);
3742 unsigned ix;
3743
3744 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3745 {
3746 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3747 tree allowed = TREE_PURPOSE (dims);
3748
3749 if (size != 1 && !(allowed && integer_zerop (allowed)))
3750 mask |= GOMP_DIM_MASK (ix);
3751 }
3752 /* If there is worker neutering, there must be vector
3753 neutering. Otherwise the hardware will fail. */
3754 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3755 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3756
3757 /* Discover & process partitioned regions. */
3758 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3759 nvptx_process_pars (pars);
3760 nvptx_neuter_pars (pars, mask, 0);
3761 delete pars;
3762 }
3763
517665b3 3764 /* Replace subregs. */
c03b0416 3765 nvptx_reorg_subreg ();
517665b3 3766
738f2522 3767 regstat_free_n_sets_and_refs ();
517665b3
NS
3768
3769 df_finish_pass (true);
738f2522
BS
3770}
3771\f
/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }

  else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
\f
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be used with a
   host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
\f
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = get_oacc_fn_attrib (decl);
	tree dims = TREE_VALUE (attr);
	unsigned ix;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
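
/* Illustrative sketch only (the names "foo_" and "bar" are made up):
   for an offloaded function compiled with dims {1, 32, 32} the code
   above emits

     //:FUNC_MAP "foo_", 0x1, 0x20, 0x20

   and for an offloaded variable it emits

     //:VAR_MAP "bar"

   following the fprintf formats used above; mkoffload parses these
   comment lines to build the host-side mapping table.  */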

/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
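
/* For example, with a 64-bit Pmode the preamble emitted above is

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE

   while 32-bit configurations emit ".address_size 32" instead.  */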

/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    {
      /* Define the broadcast buffer.  */

      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
	& ~(worker_bcast_align - 1);

      write_var_marker (asm_out_file, true, false, worker_bcast_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_bcast_align,
	       worker_bcast_name, worker_bcast_size);
    }

  if (worker_red_size)
    {
      /* Define the reduction buffer.  */

      worker_red_size = ((worker_red_size + worker_red_align - 1)
			 & ~(worker_red_align - 1));

      write_var_marker (asm_out_file, true, false, worker_red_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_red_align,
	       worker_red_name, worker_red_size);
    }
}
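
/* Illustrative sketch only: each buffer's high-water mark is rounded up
   to a multiple of its alignment via (size + align - 1) & ~(align - 1),
   so a 12-byte requirement with 8-byte alignment produces

     .shared .align 8 .u8 <buffer-name>[16];

   where <buffer-name> stands for whatever worker_bcast_name or
   worker_red_name holds.  */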

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}

/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  emit_insn (gen_rtx_SET (target, worker_red_sym));

  if (offset)
    emit_insn (gen_rtx_SET (target,
			    gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));

  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
					  UNSPEC_FROM_SHARED)));

  return target;
}
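
/* Editorial note: besides computing the address, the expander above does
   the bookkeeping for the reduction buffer; worker_red_align and
   worker_red_size record the high-water mark across all worker_addr
   uses, and nvptx_file_end uses them to emit the .shared declaration of
   the buffer itself.  */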

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}


/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
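
/* Illustrative only: per the DEF invocations above (the first type in
   build_function_type_list is the return type), the builtins have these
   C-level signatures:

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
						   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
						    unsigned long long,
						    unsigned long long);

   Calls to them are constructed by the reduction expanders below rather
   than written by users.  */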

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
\f
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
	warning_at (DECL_SOURCE_LOCATION (decl), 0,
		    dims[GOMP_DIM_VECTOR]
		    ? "using vector_length (%d), ignoring %d"
		    : "using vector_length (%d), ignoring runtime setting",
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (DECL_SOURCE_LOCATION (decl), 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  return changed;
}
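
/* Illustrative only: an offloaded region requesting
   vector_length (64) num_workers (48) trips both checks above,

     warning: using vector_length (32), ignoring 64
     warning: using num_workers (32), ignoring 48

   after which dims[] holds the PTX_* limits and the hook returns true
   to signal the change.  */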

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
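
/* Illustrative only: for a 'double' reduction at buffer offset OFF the
   call built above is equivalent to

     (double *) __builtin_nvptx_worker_addr (OFF, 8, 8)

   i.e. OFFSET plus the mode's byte size and byte alignment, cast to a
   pointer to TYPE.  */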

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
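
/* Illustrative only: for a 'double' VAR and SHIFT of 16, the gimple
   built above is roughly

     tmp = VIEW_CONVERT_EXPR<unsigned long long> (var);
     tmp = __builtin_nvptx_shufflell (tmp, 16, SHUFFLE_DOWN);
     dest_var = VIEW_CONVERT_EXPR<double> (tmp);

   and a complex value is shuffled as two such transfers, one each for
   its real and imaginary parts.  */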

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize non-zero vector lanes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
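      /* For PTX_VECTOR_LENGTH of 32, the loop below runs with shifts
	 16, 8, 4, 2 and 1; each step combines a lane's value with the
	 one SHFL lanes above it, so lane 0 ends up holding the
	 reduction of all 32 lanes.  */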
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"