/* Target code for NVPTX.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->ret_reg_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}
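
/* For example: nvptx_ptx_type_from_mode (QImode, true) yields ".u32"
   while nvptx_ptx_type_from_mode (QImode, false) yields ".u8"; SFmode
   yields ".f32" in either case.  */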

/* Determine the address space to use for SYMBOL_REF SYM.  */

static addr_space_t
nvptx_addr_space_from_sym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);
  if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
    return ADDR_SPACE_GENERIC;

  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ADDR_SPACE_CONST;

  return ADDR_SPACE_GLOBAL;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
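
/* For example: maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, while maybe_split_mode (SImode)
   is VOIDmode, meaning no split is needed.  */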

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

#define PASS_IN_REG_P(MODE, TYPE)				\
  ((GET_MODE_CLASS (MODE) == MODE_INT				\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT		\
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)	\
	&& !AGGREGATE_TYPE_P (TYPE)))				\
   && (MODE) != TImode)

#define RETURN_IN_REG_P(MODE)			\
  ((GET_MODE_CLASS (MODE) == MODE_INT		\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT)	\
   && GET_MODE_SIZE (MODE) <= 8)

/* Perform a mode promotion for a function argument with MODE.  Return
   the promoted mode.  */

static machine_mode
arg_promotion (machine_mode mode)
{
  if (mode == QImode || mode == HImode)
    return SImode;
  return mode;
}
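
/* Combined with nvptx_ptx_type_from_mode, this means a char or short
   argument (QImode or HImode) ends up declared as a 32-bit ".u32"
   value.  */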

/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.  */

static int
write_one_arg (std::stringstream &s, tree type, int i, machine_mode mode,
	       bool no_arg_types)
{
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  machine_mode split = maybe_split_mode (mode);
  if (split != VOIDmode)
    {
      i = write_one_arg (s, NULL_TREE, i, split, false);
      i = write_one_arg (s, NULL_TREE, i, split, false);
      return i;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  if (i)
    s << ", ";
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << i << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i + 1;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Write a function decl for DECL to S, where NAME is the name to be used.
   This includes ptx .visible or .extern specifiers, .func or .kernel, and
   argument and return types.  */

static void
nvptx_write_function_decl (std::stringstream &s, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  tree args = TYPE_ARG_TYPES (fntype);
  tree attrs = DECL_ATTRIBUTES (decl);
  bool kernel = write_as_kernel (attrs);
  bool is_main = strcmp (name, "main") == 0;
  bool args_from_decl = false;

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  if (args == 0)
    {
      args = DECL_ARGUMENTS (decl);
      args_from_decl = true;
    }

  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");

  if (kernel)
    s << ".entry ";
  else
    s << ".func ";

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
	return_in_mem = true;
      else
	{
	  mode = arg_promotion (mode);
	  s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
	    << " %out_retval)";
	}
    }

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;

  /* Declare argument types.  */
  if ((args != NULL_TREE
       && !(TREE_CODE (args) == TREE_LIST
	    && TREE_VALUE (args) == void_type_node))
      || is_main
      || return_in_mem
      || DECL_STATIC_CHAIN (decl))
    {
      s << "(";
      int i = 0;

      if (return_in_mem)
	{
	  s << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar0";
	  i++;
	}
      while (args != NULL_TREE)
	{
	  tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
	  machine_mode mode = TYPE_MODE (type);

	  if (mode != VOIDmode)
	    i = write_one_arg (s, type, i, mode,
			       TYPE_ARG_TYPES (fntype) == 0);
	  args = TREE_CHAIN (args);
	}
      if (stdarg_p (fntype))
	{
	  gcc_assert (i > 0);
	  s << ", .param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
	}
      if (DECL_STATIC_CHAIN (decl))
	{
	  if (i)
	    s << ", ";
	  s << ".reg.u" << GET_MODE_BITSIZE (Pmode)
	    << reg_names [STATIC_CHAIN_REGNUM];
	}
      if (!i && is_main)
	s << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
	  << " %argv";
      s << ")";
    }
}
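
/* As an illustrative sketch (not verified output), a prototyped function
   "int f (int, float)" would be declared along the lines of:

     .visible .func (.param.u32 %out_retval) f
	 (.param.u32 %in_ar0, .param.f32 %in_ar1);  */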

/* Write a .func or .kernel declaration (not a definition) along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  */

static void
write_function_decl_and_comment (std::stringstream &s, const char *name, const_tree decl)
{
  s << "\n// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DECL: ";
  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";
  nvptx_write_function_decl (s, name, decl);
  s << ";\n";
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_func_decl_from_insn (std::stringstream &s, const char *name,
			   rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      s << "\n// BEGIN GLOBAL FUNCTION DECL: " << name << "\n";
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    s << "(.param"
      << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)), false)
      << " %rval) ";

  s << name;

  const char *sep = " (";
  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting here, as that was
	 already done when generating the call sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      s << sep
	<< ".param"
	<< nvptx_ptx_type_from_mode (mode, false)
	<< " %arg"
	<< i;
      if (mode == QImode || mode == HImode)
	s << "[1]";
      sep = ", ";
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
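
/* Illustrative sketch: for an indirect call taking one SImode argument
   and returning an SImode value, the prototype written above looks like

     .callprototype (.param.u32 %rval) _ (.param.u32 %arg1);  */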

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      write_function_decl_and_comment (func_decls, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      name = nvptx_name_replacement (name);
      write_func_decl_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
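
/* For instance, for REGNO 10 and NAME "y" this writes (illustrative):

	{
		.reg.u32	%y;
		mov.u32	%y, %tid.y;
		setp.ne.u32	%r10, %y, 0;
	}
*/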

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  name = nvptx_name_replacement (name);

  std::stringstream s;
  write_function_decl_and_comment (s, name, decl);
  s << "// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DEF: ";

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";

  nvptx_write_function_decl (s, name, decl);
  fprintf (file, "%s", s.str().c_str());

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));

  fprintf (file, "\n{\n");

  if (return_in_mem)
    {
      fprintf (file, "\t.reg.u%d %%ar%d;\n", GET_MODE_BITSIZE (Pmode), argno);
      fprintf (file, "\tld.param.u%d %%ar%d, [%%in_ar%d];\n",
	       GET_MODE_BITSIZE (Pmode), argno, argno);
      argno++;
    }

  /* Declare and initialize incoming arguments.  */
  tree args = DECL_ARGUMENTS (decl);
  bool prototyped = false;
  if (TYPE_ARG_TYPES (fntype))
    {
      args = TYPE_ARG_TYPES (fntype);
      prototyped = true;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
      machine_mode mode = TYPE_MODE (type);
      int count = 1;

      if (mode == VOIDmode)
	break;

      if (!PASS_IN_REG_P (mode, type))
	mode = Pmode;

      machine_mode split = maybe_split_mode (mode);
      if (split != VOIDmode)
	{
	  count = 2;
	  mode = split;
	}
      else if (!prototyped && !AGGREGATE_TYPE_P (type) && mode == SFmode)
	mode = DFmode;

      mode = arg_promotion (mode);
      while (count--)
	{
	  fprintf (file, "\t.reg%s %%ar%d;\n",
		   nvptx_ptx_type_from_mode (mode, false), argno);
	  fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
		   nvptx_ptx_type_from_mode (mode, false), argno, argno);
	  argno++;
	}
    }

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
	((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    {
      fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));
      fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
	       GET_MODE_BITSIZE (Pmode));
    }

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
	   reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);
	  if (split != VOIDmode)
	    {
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 0);
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 1);
	    }
	  else
	    fprintf (file, "\t.reg%s %%r%d;\n",
		     nvptx_ptx_type_from_mode (mode, true), i);
	}
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    {
      fprintf (file, "\t.reg.u%d %%outargs;\n"
	       "\t.local.align 8 .b8 %%outargs_ar["
	       HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, sz);
      fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
	       BITS_PER_WORD);
    }

  if (cfun->machine->punning_buffer_size > 0)
    {
      fprintf (file, "\t.reg.u%d %%punbuffer;\n"
	       "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
	       BITS_PER_WORD, cfun->machine->punning_buffer_size);
      fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
	       BITS_PER_WORD);
    }

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
	       BITS_PER_WORD);
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}
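
/* E.g. for an SImode return value this writes (illustrative)

	st.param.u32	[%out_retval], %retval;

   followed by the "ret;" template returned to the caller.  */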

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree funtype)
{
  if (cfun->machine->start_call == NULL_RTX)
    {
      cfun->machine->call_args = NULL;
      cfun->machine->funtype = funtype;
      cfun->machine->start_call = const0_rtx;
    }
  if (arg == pc_rtx)
    return;

  rtx_expr_list *args_so_far = cfun->machine->call_args;
  if (REG_P (arg))
    cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  int nargs = 0;
  rtx callee = XEXP (address, 0);
  rtx pat, t;
  rtvec vec;
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
    nargs++;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_call_with_sc = true;

	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  if (cfun->machine->funtype
      /* It's possible to construct testcases where we call a variable.
	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
	 in such a case.  */
      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
      && stdarg_p (cfun->machine->funtype))
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
      cfun->machine->has_call_with_varargs = true;
    }
  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
  pat = gen_rtx_PARALLEL (VOIDmode, vec);

  int vec_pos = 0;

  rtx tmp_retval = retval;
  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  if (retval != NULL_RTX)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      t = gen_rtx_SET (tmp_retval, t);
    }
  XVECEXP (pat, 0, vec_pos++) = t;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    {
      rtx this_arg = XEXP (arg, 0);
      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
    }

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode)
    return NULL_RTX;

  if (named)
    return gen_reg_rtx (mode);
  return NULL_RTX;
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
			    const_tree type ATTRIBUTE_UNUSED,
			    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == TImode)
    cum->count += 2;
  else
    cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);

  if (boundary > BITS_PER_WORD)
    return 2 * BITS_PER_WORD;

  if (mode == BLKmode)
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if (size > 4)
	return 2 * BITS_PER_WORD;
      if (boundary < BITS_PER_WORD)
	{
	  if (size >= 3)
	    return BITS_PER_WORD;
	  if (size >= 2)
	    return 2 * BITS_PER_UNIT;
	}
    }
  return boundary;
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
		      bool outgoing)
{
  int unsignedp = TYPE_UNSIGNED (type);
  machine_mode orig_mode = TYPE_MODE (type);
  machine_mode mode = promote_function_mode (type, orig_mode,
					     &unsignedp, NULL_TREE, 1);
  if (outgoing)
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
			 const_tree type, bool)
{
  return !PASS_IN_REG_P (mode, type);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  machine_mode mode = TYPE_MODE (type);
  if (!RETURN_IN_REG_P (mode))
    return true;
  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *punsignedp,
			     const_tree funtype, int for_return)
{
  if (type == NULL_TREE)
    return mode;
  if (for_return)
    return promote_mode (type, mode, punsignedp);
  /* For K&R-style functions, try to match the language promotion rules to
     minimize type mismatches at assembly time.  */
  if (TYPE_ARG_TYPES (funtype) == NULL_TREE
      && type != NULL_TREE
      && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  return mode;
}

/* Implement TARGET_STATIC_CHAIN.  */

static rtx
nvptx_static_chain (const_tree fndecl, bool incoming_p)
{
  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (incoming_p)
    return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
  else
    return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
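
/* That is, a warp-level broadcast is simply a SHUFFLE_IDX shuffle with
   index 0: every lane reads lane 0's copy of REG.  */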

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

/* When loading an operand OP, verify whether an address space
   conversion to generic is required, and if so, perform it.  Check
   for SYMBOL_REFs and record them if needed.  Return either the
   original operand, or the converted one.  */

rtx
nvptx_maybe_convert_symbolic_operand (rtx op)
{
  if (GET_MODE (op) != Pmode)
    return op;

  rtx sym = op;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (sym, 0);
  if (GET_CODE (sym) == PLUS)
    sym = XEXP (sym, 0);

  if (GET_CODE (sym) != SYMBOL_REF)
    return op;

  nvptx_maybe_record_fnsym (sym);

  addr_space_t as = nvptx_addr_space_from_sym (sym);
  if (as == ADDR_SPACE_GENERIC)
    return op;

  enum unspec code;
  code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
	  : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
	  : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
	  : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
	  : UNSPEC_FROM_PARAM);

  rtx dest = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (dest,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op), code)));
  return dest;
}

/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Implement HARD_REGNO_MODE_OK.  We barely use hard regs, but we want
   to ensure that the return register's mode isn't changed.  */

bool
nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
{
  if (regno != NVPTX_RETURN_REGNUM
      || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
    return true;
  return mode == cfun->machine->ret_reg_mode;
}

/* Convert an address space AS to the corresponding ptx string.  */

const char *
nvptx_section_from_addr_space (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_CONST:
      return ".const";

    case ADDR_SPACE_GLOBAL:
      return ".global";

    case ADDR_SPACE_SHARED:
      return ".shared";

    case ADDR_SPACE_GENERIC:
      return "";

    default:
      gcc_unreachable ();
    }
}

/* Determine whether DECL goes into .const or .global.  */

const char *
nvptx_section_for_decl (const_tree decl)
{
  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ".const";

  return ".global";
}

/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;

/* Write the necessary separator string to begin a new initializer value.  */

static void
begin_decl_field (void)
{
  /* We never see decl_offset at zero by the time we get here.  */
  if (decl_offset == decl_chunk_size)
    fprintf (asm_out_file, " = { ");
  else
    fprintf (asm_out_file, ", ");
}

/* Output the currently stored chunk as an initializer value.  */

static void
output_decl_chunk (void)
{
  begin_decl_field ();
  output_address (VOIDmode, gen_int_mode (init_part, decl_chunk_mode));
  init_part = 0;
}

/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
	this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      if (decl_offset % decl_chunk_size == 0)
	output_decl_chunk ();

      chunk_offset = 0;
    }
}

/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      gcc_unreachable ();

    case CONST_INT:
      val = INTVAL (x);
      nvptx_assemble_value (val, size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == decl_chunk_size);
      if (decl_offset % decl_chunk_size != 0)
	sorry ("cannot emit unaligned pointers in ptx assembly");
      decl_offset += size;
      begin_decl_field ();

      nvptx_maybe_record_fnsym (x);
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, x);
      fprintf (asm_out_file, ")");

      if (val)
	fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, val);
      break;
    }

  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  if (decl_offset + size >= object_size)
    {
      if (decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);
      object_finished = true;
      return;
    }

  while (size > decl_chunk_size)
    {
      nvptx_assemble_value (0, decl_chunk_size);
      size -= decl_chunk_size;
    }
  while (size-- > 0)
    nvptx_assemble_value (0, 1);
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (decl_offset != 0)
    {
      if (!object_finished && decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);

      fprintf (asm_out_file, " }");
    }
  fprintf (asm_out_file, ";\n");
}
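
/* Putting the machinery together, a definition such as
   "int arr[2] = { 1, 2 };" comes out in the ptx file roughly as
   (illustrative sketch):

     .visible .global .align 4 .u32 arr[2] = { 1, 2 };
*/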

/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
			 bool is_public)
{
  fprintf (file, "\n// BEGIN%s VAR DEF: ", is_public ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputc ('\n', file);

  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT size)
{
  tree type = TREE_TYPE (exp);
  init_output_initializer (file, name, type, false);
  fprintf (file, "\t.const .align %d .u%d ",
	   TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
	   decl_chunk_size * BITS_PER_UNIT);
  assemble_name (file, name);
  fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
	   (size + decl_chunk_size - 1) / decl_chunk_size);
  object_size = size;
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
	       !TREE_PUBLIC (decl) ? ""
	       : DECL_WEAK (decl) ? ".weak" : ".visible",
	       section, DECL_ALIGN (decl) / BITS_PER_UNIT,
	       decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
	fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
		 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
	object_finished = true;
      object_size = size;
    }
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;
  const char *section = nvptx_section_for_decl (decl);
  fprintf (file, "\n// BEGIN%s VAR DECL: ",
	   TREE_PUBLIC (decl) ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputs ("\n", file);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}

/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[16];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
	     nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				       false));

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (!decl
	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
	nvptx_record_libfunc (callee, result, pat);
      else if (DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_func_decl_from_insn (s, NULL, result, pat);
      fputs (s.str().c_str(), asm_out_file);
    }

  for (int argno = 1; argno < arg_end; argno++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
      machine_mode mode = GET_MODE (t);

      /* Mode splitting has already been done.  */
      fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
	       nvptx_ptx_type_from_mode (mode, false), argno,
	       mode == QImode || mode == HImode ? "[1]" : "");
      fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
	       nvptx_ptx_type_from_mode (mode, false), argno,
	       REGNO (t));
    }

  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%%retval_in), ");

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open,
	       reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  return result != NULL_RTX ? "\tld.param%t0\t%0, [%%retval_in];\n\t}" : "}";
}
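
/* Illustrative sketch: for a direct call "r = f (x)" with x in %r23 and
   r in %r24, the sequence printed above looks roughly like:

	{
		.param.u32 %retval_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%retval_in), f, (%out_arg1);
		ld.param.u32	%r24, [%retval_in];
	}
*/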
1852
1853/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1854
1855static bool
1856nvptx_print_operand_punct_valid_p (unsigned char c)
1857{
1858 return c == '.' || c== '#';
1859}
1860
1861static void nvptx_print_operand (FILE *, rtx, int);
1862
1863/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1864
1865static void
1866nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1867{
1868 rtx off;
1869 if (GET_CODE (x) == CONST)
1870 x = XEXP (x, 0);
1871 switch (GET_CODE (x))
1872 {
1873 case PLUS:
1874 off = XEXP (x, 1);
cc8ca59e 1875 output_address (VOIDmode, XEXP (x, 0));
738f2522 1876 fprintf (file, "+");
cc8ca59e 1877 output_address (VOIDmode, off);
738f2522
BS
1878 break;
1879
1880 case SYMBOL_REF:
1881 case LABEL_REF:
1882 output_addr_const (file, x);
1883 break;
1884
1885 default:
1886 gcc_assert (GET_CODE (x) != MEM);
1887 nvptx_print_operand (file, x, 0);
1888 break;
1889 }
1890}
1891
1892/* Write assembly language output for the address ADDR to FILE. */
1893
1894static void
cc8ca59e 1895nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 1896{
cc8ca59e 1897 nvptx_print_address_operand (file, addr, mode);
1898}
1899
1900/* Print an operand, X, to FILE, with an optional modifier in CODE.
1901
1902 Meaning of CODE:
1903 . -- print the predicate for the instruction or an empty string for an
1904 unconditional one.
1905 # -- print a rounding mode for the instruction
1906
1907 A -- print an address space identifier for a MEM
1908 c -- print an opcode suffix for a comparison operator, including a type code
738f2522 1909 f -- print a full reg even for something that must always be split
d88cd9c4 1910 S -- print a shuffle kind specified by CONST_INT
1911 t -- print a type opcode suffix, promoting QImode to 32 bits
1912 T -- print a type size in bits
1913 u -- print a type opcode suffix without promotions. */
1914
1915static void
1916nvptx_print_operand (FILE *file, rtx x, int code)
1917{
1918 rtx orig_x = x;
1919 machine_mode op_mode;
1920
1921 if (code == '.')
1922 {
1923 x = current_insn_predicate;
1924 if (x)
1925 {
1926 unsigned int regno = REGNO (XEXP (x, 0));
1927 fputs ("[", file);
1928 if (GET_CODE (x) == EQ)
1929 fputs ("!", file);
1930 fputs (reg_names [regno], file);
1931 fputs ("]", file);
1932 }
1933 return;
1934 }
1935 else if (code == '#')
1936 {
1937 fputs (".rn", file);
1938 return;
1939 }
1940
1941 enum rtx_code x_code = GET_CODE (x);
1942
1943 switch (code)
1944 {
1945 case 'A':
1946 {
1947 addr_space_t as = ADDR_SPACE_GENERIC;
1948 rtx sym = XEXP (x, 0);
1949
1950 if (GET_CODE (sym) == CONST)
1951 sym = XEXP (sym, 0);
1952 if (GET_CODE (sym) == PLUS)
1953 sym = XEXP (sym, 0);
1954
1955 if (GET_CODE (sym) == SYMBOL_REF)
1956 as = nvptx_addr_space_from_sym (sym);
1957
1958 fputs (nvptx_section_from_addr_space (as), file);
1959 }
1960 break;
1961
1962 case 't':
1963 op_mode = nvptx_underlying_object_mode (x);
1964 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
1965 break;
1966
1967 case 'u':
1968 op_mode = nvptx_underlying_object_mode (x);
1969 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
1970 break;
1971
1972 case 'S':
1973 {
1974 unsigned kind = UINTVAL (x);
1975 static const char *const kinds[] =
1976 {"up", "down", "bfly", "idx"};
1977 fprintf (file, ".%s", kinds[kind]);
1978 }
1979 break;
1980
1981 case 'T':
1982 fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
1983 break;
1984
1985 case 'j':
1986 fprintf (file, "@");
1987 goto common;
1988
1989 case 'J':
1990 fprintf (file, "@!");
1991 goto common;
1992
1993 case 'c':
1994 op_mode = GET_MODE (XEXP (x, 0));
1995 switch (x_code)
1996 {
1997 case EQ:
1998 fputs (".eq", file);
1999 break;
2000 case NE:
2001 if (FLOAT_MODE_P (op_mode))
2002 fputs (".neu", file);
2003 else
2004 fputs (".ne", file);
2005 break;
2006 case LE:
2007 fputs (".le", file);
2008 break;
2009 case GE:
2010 fputs (".ge", file);
2011 break;
2012 case LT:
2013 fputs (".lt", file);
2014 break;
2015 case GT:
2016 fputs (".gt", file);
2017 break;
2018 case LEU:
2019 fputs (".ls", file);
2020 break;
2021 case GEU:
2022 fputs (".hs", file);
2023 break;
2024 case LTU:
2025 fputs (".lo", file);
2026 break;
2027 case GTU:
2028 fputs (".hi", file);
2029 break;
2030 case LTGT:
2031 fputs (".ne", file);
2032 break;
2033 case UNEQ:
2034 fputs (".equ", file);
2035 break;
2036 case UNLE:
2037 fputs (".leu", file);
2038 break;
2039 case UNGE:
2040 fputs (".geu", file);
2041 break;
2042 case UNLT:
2043 fputs (".ltu", file);
2044 break;
2045 case UNGT:
2046 fputs (".gtu", file);
2047 break;
2048 case UNORDERED:
2049 fputs (".nan", file);
2050 break;
2051 case ORDERED:
2052 fputs (".num", file);
2053 break;
2054 default:
2055 gcc_unreachable ();
2056 }
2057 if (FLOAT_MODE_P (op_mode)
2058 || x_code == EQ || x_code == NE
2059 || x_code == GEU || x_code == GTU
2060 || x_code == LEU || x_code == LTU)
2061 fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
2062 else
2063 fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
2064 break;
2065 default:
2066 common:
2067 switch (x_code)
2068 {
2069 case SUBREG:
2070 x = SUBREG_REG (x);
2071 /* fall through */
2072
2073 case REG:
2074 if (HARD_REGISTER_P (x))
2075 fprintf (file, "%s", reg_names[REGNO (x)]);
2076 else
2077 fprintf (file, "%%r%d", REGNO (x));
d7479262 2078 if (code != 'f' && maybe_split_mode (GET_MODE (x)) != VOIDmode)
2079 {
2080 gcc_assert (GET_CODE (orig_x) == SUBREG
d7479262 2081 && maybe_split_mode (GET_MODE (orig_x)) == VOIDmode);
2082 fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
2083 }
2084 break;
2085
2086 case MEM:
2087 fputc ('[', file);
2088 nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
2089 fputc (']', file);
2090 break;
2091
2092 case CONST_INT:
2093 output_addr_const (file, x);
2094 break;
2095
2096 case CONST:
2097 case SYMBOL_REF:
2098 case LABEL_REF:
2099 /* We could use output_addr_const, but that can print things like
2100 "x-8", which breaks ptxas. Need to ensure it is output as
2101 "x+-8". */
2102 nvptx_print_address_operand (file, x, VOIDmode);
2103 break;
2104
2105 case CONST_DOUBLE:
2106 long vals[2];
34a72c33 2107 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
2108 vals[0] &= 0xffffffff;
2109 vals[1] &= 0xffffffff;
2110 if (GET_MODE (x) == SFmode)
2111 fprintf (file, "0f%08lx", vals[0]);
2112 else
2113 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2114 break;
2115
2116 default:
2117 output_addr_const (file, x);
2118 }
2119 }
2120}
2121\f
2122/* Record replacement regs used to deal with subreg operands. */
2123struct reg_replace
2124{
2125 rtx replacement[MAX_RECOG_OPERANDS];
2126 machine_mode mode;
2127 int n_allocated;
2128 int n_in_use;
2129};
2130
2131/* Allocate or reuse a replacement in R and return the rtx. */
2132
2133static rtx
2134get_replacement (struct reg_replace *r)
2135{
2136 if (r->n_allocated == r->n_in_use)
2137 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2138 return r->replacement[r->n_in_use++];
2139}
2140
2141/* Clean up subreg operands. In ptx assembly, everything is typed, and
2142 the presence of subregs would break the rules for most instructions.
2143 Replace them with a suitable new register of the right size, plus
2144 conversion copyin/copyout instructions. */
2145
2146static void
517665b3 2147nvptx_reorg_subreg (void)
2148{
2149 struct reg_replace qiregs, hiregs, siregs, diregs;
2150 rtx_insn *insn, *next;
2151
2152 qiregs.n_allocated = 0;
2153 hiregs.n_allocated = 0;
2154 siregs.n_allocated = 0;
2155 diregs.n_allocated = 0;
2156 qiregs.mode = QImode;
2157 hiregs.mode = HImode;
2158 siregs.mode = SImode;
2159 diregs.mode = DImode;
2160
2161 for (insn = get_insns (); insn; insn = next)
2162 {
2163 next = NEXT_INSN (insn);
2164 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2165 || asm_noperands (PATTERN (insn)) >= 0
2166 || GET_CODE (PATTERN (insn)) == USE
2167 || GET_CODE (PATTERN (insn)) == CLOBBER)
2168 continue;
f324806d 2169
2170 qiregs.n_in_use = 0;
2171 hiregs.n_in_use = 0;
2172 siregs.n_in_use = 0;
2173 diregs.n_in_use = 0;
2174 extract_insn (insn);
2175 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2176
2177 for (int i = 0; i < recog_data.n_operands; i++)
2178 {
2179 rtx op = recog_data.operand[i];
2180 if (GET_CODE (op) != SUBREG)
2181 continue;
2182
2183 rtx inner = SUBREG_REG (op);
2184
2185 machine_mode outer_mode = GET_MODE (op);
2186 machine_mode inner_mode = GET_MODE (inner);
2187 gcc_assert (s_ok);
2188 if (s_ok
2189 && (GET_MODE_PRECISION (inner_mode)
2190 >= GET_MODE_PRECISION (outer_mode)))
2191 continue;
2192 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2193 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2194 : outer_mode == HImode ? &hiregs
2195 : outer_mode == SImode ? &siregs
2196 : &diregs);
2197 rtx new_reg = get_replacement (r);
2198
2199 if (recog_data.operand_type[i] != OP_OUT)
2200 {
2201 enum rtx_code code;
2202 if (GET_MODE_PRECISION (inner_mode)
2203 < GET_MODE_PRECISION (outer_mode))
2204 code = ZERO_EXTEND;
2205 else
2206 code = TRUNCATE;
2207
f7df4a84 2208 rtx pat = gen_rtx_SET (new_reg,
2209 gen_rtx_fmt_e (code, outer_mode, inner));
2210 emit_insn_before (pat, insn);
2211 }
2212
2213 if (recog_data.operand_type[i] != OP_IN)
2214 {
2215 enum rtx_code code;
2216 if (GET_MODE_PRECISION (inner_mode)
2217 < GET_MODE_PRECISION (outer_mode))
2218 code = TRUNCATE;
2219 else
2220 code = ZERO_EXTEND;
2221
f7df4a84 2222 rtx pat = gen_rtx_SET (inner,
2223 gen_rtx_fmt_e (code, inner_mode, new_reg));
2224 emit_insn_after (pat, insn);
2225 }
2226 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2227 }
2228 }
517665b3 2229}
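/* Illustrative sketch (hypothetical RTL, not from a real dump): an
   insn using the paradoxical (subreg:SI (reg:QI 100) 0) is given a
   fresh (reg:SI 200).  An input use is preceded by

     (set (reg:SI 200) (zero_extend:SI (reg:QI 100)))

   an output use is followed by

     (set (reg:QI 100) (truncate:QI (reg:SI 200)))

   and the subreg operand itself is replaced by (reg:SI 200).  */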
738f2522 2230
2231/* Loop structure of the function. The entire function is described as
2232 a NULL loop. */
2233
2234struct parallel
2235{
2236 /* Parent parallel. */
2237 parallel *parent;
2238
2239 /* Next sibling parallel. */
2240 parallel *next;
2241
2242 /* First child parallel. */
2243 parallel *inner;
2244
2245 /* Partitioning mask of the parallel. */
2246 unsigned mask;
2247
2248 /* Partitioning used within inner parallels. */
2249 unsigned inner_mask;
2250
2251 /* Location of the parallel's forked and join.  The forked is the
2252    first block in the parallel and the join is the first block
2253    after the partition. */
2254 basic_block forked_block;
2255 basic_block join_block;
2256
2257 rtx_insn *forked_insn;
2258 rtx_insn *join_insn;
2259
2260 rtx_insn *fork_insn;
2261 rtx_insn *joining_insn;
2262
2263 /* Basic blocks in this parallel, but not in child parallels. The
2264 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2265 blocks are not. */
2266 auto_vec<basic_block> blocks;
2267
2268public:
2269 parallel (parallel *parent, unsigned mask);
2270 ~parallel ();
2271};
2272
2273 /* Constructor links the new parallel into its parent's chain of
2274 children. */
2275
2276parallel::parallel (parallel *parent_, unsigned mask_)
2277 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2278{
2279 forked_block = join_block = 0;
2280 forked_insn = join_insn = 0;
2281 fork_insn = joining_insn = 0;
2282
2283 if (parent)
2284 {
2285 next = parent->inner;
2286 parent->inner = this;
2287 }
2288}
2289
2290parallel::~parallel ()
2291{
2292 delete inner;
2293 delete next;
2294}
2295
2296 /* Map of basic blocks to insns. */
2297typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2298
2299/* A tuple of an insn of interest and the BB in which it resides. */
2300typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2301typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2302
2303 /* Split basic blocks so that each forked and join unspec is at the
2304    start of its basic block.  Afterwards each block will have a
2305    single partitioning mode.  We also do the same for return
2306 insns, as they are executed by every thread. Return the
2307 partitioning mode of the function as a whole. Populate MAP with
2308 head and tail blocks. We also clear the BB visited flag, which is
2309 used when finding partitions. */
2310
2311static void
2312nvptx_split_blocks (bb_insn_map_t *map)
2313{
2314 insn_bb_vec_t worklist;
2315 basic_block block;
2316 rtx_insn *insn;
2317
2318 /* Locate all the reorg instructions of interest. */
2319 FOR_ALL_BB_FN (block, cfun)
2320 {
2321 bool seen_insn = false;
2322
2323 /* Clear visited flag, for use by parallel locator. */
2324 block->flags &= ~BB_VISITED;
2325
2326 FOR_BB_INSNS (block, insn)
2327 {
2328 if (!INSN_P (insn))
2329 continue;
2330 switch (recog_memoized (insn))
2331 {
2332 default:
2333 seen_insn = true;
2334 continue;
2335 case CODE_FOR_nvptx_forked:
2336 case CODE_FOR_nvptx_join:
2337 break;
2338
2339 case CODE_FOR_return:
2340 /* We also need to split just before return insns, as
2341 that insn needs executing by all threads, but the
2342 block it is in probably does not. */
2343 break;
2344 }
2345
2346 if (seen_insn)
2347 /* We've found an instruction that must be at the start of
2348 a block, but isn't. Add it to the worklist. */
2349 worklist.safe_push (insn_bb_t (insn, block));
2350 else
2351 /* It was already the first instruction. Just add it to
2352 the map. */
2353 map->get_or_insert (block) = insn;
2354 seen_insn = true;
2355 }
2356 }
2357
2358 /* Split blocks on the worklist. */
2359 unsigned ix;
2360 insn_bb_t *elt;
2361 basic_block remap = 0;
2362 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2363 {
2364 if (remap != elt->second)
2365 {
2366 block = elt->second;
2367 remap = block;
2368 }
2369
2370 /* Split block before insn.  The insn is in the new block. */
2371 edge e = split_block (block, PREV_INSN (elt->first));
2372
2373 block = e->dest;
2374 map->get_or_insert (block) = elt->first;
2375 }
2376}
2377
2378/* BLOCK is a basic block containing a head or tail instruction.
2379 Locate the associated prehead or pretail instruction, which must be
2380 in the single predecessor block. */
2381
2382static rtx_insn *
2383nvptx_discover_pre (basic_block block, int expected)
2384{
2385 gcc_assert (block->preds->length () == 1);
2386 basic_block pre_block = (*block->preds)[0]->src;
2387 rtx_insn *pre_insn;
2388
2389 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2390 pre_insn = PREV_INSN (pre_insn))
2391 gcc_assert (pre_insn != BB_HEAD (pre_block));
2392
2393 gcc_assert (recog_memoized (pre_insn) == expected);
2394 return pre_insn;
2395}
2396
2397/* Dump this parallel and all its inner parallels. */
2398
2399static void
2400nvptx_dump_pars (parallel *par, unsigned depth)
2401{
2402 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2403 depth, par->mask,
2404 par->forked_block ? par->forked_block->index : -1,
2405 par->join_block ? par->join_block->index : -1);
2406
2407 fprintf (dump_file, " blocks:");
2408
2409 basic_block block;
2410 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2411 fprintf (dump_file, " %d", block->index);
2412 fprintf (dump_file, "\n");
2413 if (par->inner)
2414 nvptx_dump_pars (par->inner, depth + 1);
2415
2416 if (par->next)
2417 nvptx_dump_pars (par->next, depth);
2418}
2419
2420/* If BLOCK contains a fork/join marker, process it to create or
2421 terminate a loop structure. Add this block to the current loop,
2422 and then walk successor blocks. */
2423
2424static parallel *
2425nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2426{
2427 if (block->flags & BB_VISITED)
2428 return par;
2429 block->flags |= BB_VISITED;
2430
2431 if (rtx_insn **endp = map->get (block))
2432 {
2433 rtx_insn *end = *endp;
2434
2435 /* This is a block head or tail, or return instruction. */
2436 switch (recog_memoized (end))
2437 {
2438 case CODE_FOR_return:
2439 /* Return instructions are in their own block, and we
2440 don't need to do anything more. */
2441 return par;
2442
2443 case CODE_FOR_nvptx_forked:
2444 /* Loop head, create a new inner loop and add it into
2445 our parent's child list. */
2446 {
2447 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2448
2449 gcc_assert (mask);
2450 par = new parallel (par, mask);
2451 par->forked_block = block;
2452 par->forked_insn = end;
2453 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2454 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2455 par->fork_insn
2456 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2457 }
2458 break;
2459
2460 case CODE_FOR_nvptx_join:
2461 /* A loop tail. Finish the current loop and return to
2462 parent. */
2463 {
2464 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2465
2466 gcc_assert (par->mask == mask);
2467 par->join_block = block;
2468 par->join_insn = end;
2469 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2470 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2471 par->joining_insn
2472 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2473 par = par->parent;
2474 }
2475 break;
2476
2477 default:
2478 gcc_unreachable ();
2479 }
2480 }
2481
2482 if (par)
2483 /* Add this block onto the current loop's list of blocks. */
2484 par->blocks.safe_push (block);
2485 else
2486 /* This must be the entry block. Create a NULL parallel. */
2487 par = new parallel (0, 0);
2488
2489 /* Walk successor blocks. */
2490 edge e;
2491 edge_iterator ei;
2492
2493 FOR_EACH_EDGE (e, ei, block->succs)
2494 nvptx_find_par (map, par, e->dest);
2495
2496 return par;
2497}
2498
2499/* DFS walk the CFG looking for fork & join markers. Construct
2500 loop structures as we go. MAP is a mapping of basic blocks
2501 to head & tail markers, discovered when splitting blocks. This
2502 speeds up the discovery. We rely on the BB visited flag having
2503 been cleared when splitting blocks. */
2504
2505static parallel *
2506nvptx_discover_pars (bb_insn_map_t *map)
2507{
2508 basic_block block;
2509
2510 /* Mark exit blocks as visited. */
2511 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2512 block->flags |= BB_VISITED;
2513
2514 /* And entry block as not. */
2515 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2516 block->flags &= ~BB_VISITED;
2517
2518 parallel *par = nvptx_find_par (map, 0, block);
2519
2520 if (dump_file)
2521 {
2522 fprintf (dump_file, "\nLoops\n");
2523 nvptx_dump_pars (par, 0);
2524 fprintf (dump_file, "\n");
2525 }
2526
2527 return par;
2528}
2529
2530/* Analyse a group of BBs within a partitioned region and create N
2531 Single-Entry-Single-Exit regions. Some of those regions will be
2532 trivial ones consisting of a single BB. The blocks of a
2533 partitioned region might form a set of disjoint graphs -- because
2534 the region encloses a differently partitioned sub-region.
2535
2536 We use the linear time algorithm described in 'Finding Regions Fast:
2537 Single Entry Single Exit and control Regions in Linear Time'
2538 Johnson, Pearson & Pingali. That algorithm deals with complete
2539 CFGs, where a back edge is inserted from END to START, and thus the
2540 problem becomes one of finding equivalent loops.
2541
2542 In this case we have a partial CFG. We complete it by redirecting
2543 any incoming edge to the graph to be from an arbitrary external BB,
2544 and similarly redirecting any outgoing edge to be to that BB.
2545 Thus we end up with a closed graph.
2546
2547 The algorithm works by building a spanning tree of an undirected
2548 graph and keeping track of back edges from nodes further from the
2549 root in the tree to nodes nearer to the root in the tree. In the
2550 description below, the root is up and the tree grows downwards.
2551
2552 We avoid having to deal with degenerate back-edges to the same
2553 block, by splitting each BB into 3 -- one for input edges, one for
2554 the node itself and one for the output edges. Such back edges are
2555 referred to as 'Brackets'. Cycle equivalent nodes will have the
2556 same set of brackets.
2557
2558 Determining bracket equivalency is done by maintaining a list of
2559 brackets in such a manner that the list length and final bracket
2560 uniquely identify the set.
2561
2562 We use coloring to mark all BBs with cycle equivalency with the
2563 same color. This is the output of the 'Finding Regions Fast'
2564 algorithm. Notice it doesn't actually find the set of nodes within
2565 a particular region, just unordered sets of nodes that are the
2566 entries and exits of SESE regions.
2567
2568 After determining cycle equivalency, we need to find the minimal
2569 set of SESE regions. Do this with a DFS coloring walk of the
2570 complete graph. We're either 'looking' or 'coloring'. When
2571 looking, and we're in the subgraph, we start coloring the color of
2572 the current node, and remember that node as the start of the
2573 current color's SESE region. Every time we go to a new node, we
2574 decrement the count of nodes with that color. If it reaches zero,
2575 we remember that node as the end of the current color's SESE region
2576 and return to 'looking'. Otherwise we color the node the current
2577 color.
2578
2579 This way we end up with coloring the inside of non-trivial SESE
2580 regions with the color of that region. */
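/* As a toy illustration (not taken from any particular dump): in a
   diamond A->{B,C}->D, closed by the fake backedge D->A, only A and D
   are cycle equivalent.  The coloring walk opens a region at A,
   absorbs B and C into A's color, and closes the region at D, so the
   whole diamond becomes the single SESE region (A,D), while B's and
   C's own colors produce no region (NULL entries in the output).  */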
2581
2582/* A pair of BBs. We use this to represent SESE regions. */
2583typedef std::pair<basic_block, basic_block> bb_pair_t;
2584typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2585
2586/* A node in the undirected CFG. The discriminator SECOND indicates just
2587 above or just below the BB indicated by FIRST. */
2588typedef std::pair<basic_block, int> pseudo_node_t;
2589
2590/* A bracket indicates an edge towards the root of the spanning tree of the
2591 undirected graph. Each bracket has a color, determined
2592 from the current set of brackets. */
2593struct bracket
2594{
2595 pseudo_node_t back; /* Back target. */
2596
2597 /* Current color and size of set. */
2598 unsigned color;
2599 unsigned size;
2600
2601 bracket (pseudo_node_t back_)
2602 : back (back_), color (~0u), size (~0u)
2603 {
2604 }
2605
2606 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2607 {
2608 if (length != size)
2609 {
2610 size = length;
2611 color = color_counts.length ();
2612 color_counts.quick_push (0);
2613 }
2614 color_counts[color]++;
2615 return color;
2616 }
2617};
2618
2619typedef auto_vec<bracket> bracket_vec_t;
2620
2621/* Basic block info for finding SESE regions. */
2622
2623struct bb_sese
2624{
2625 int node; /* Node number in spanning tree. */
2626 int parent; /* Parent node number. */
2627
2628 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2629 edges arrive at pseudo-node Ai and the outgoing edges leave at
2630 pseudo-node Ao. We have to remember which way we arrived at a
2631 particular node when generating the spanning tree. dir > 0 means
2632 we arrived at Ai, dir < 0 means we arrived at Ao. */
2633 int dir;
2634
2635 /* Lowest numbered pseudo-node reached via a backedge from this
2636 node, or any descendant. */
2637 pseudo_node_t high;
2638
2639 int color; /* Cycle-equivalence color. */
2640
2641 /* Stack of brackets for this node. */
2642 bracket_vec_t brackets;
2643
2644 bb_sese (unsigned node_, unsigned p, int dir_)
2645 :node (node_), parent (p), dir (dir_)
2646 {
2647 }
2648 ~bb_sese ();
2649
2650 /* Push a bracket ending at BACK. */
2651 void push (const pseudo_node_t &back)
2652 {
2653 if (dump_file)
2654 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2655 back.first ? back.first->index : 0, back.second);
2656 brackets.safe_push (bracket (back));
2657 }
2658
2659 void append (bb_sese *child);
2660 void remove (const pseudo_node_t &);
2661
2662 /* Set node's color. */
2663 void set_color (auto_vec<unsigned> &color_counts)
2664 {
2665 color = brackets.last ().get_color (color_counts, brackets.length ());
2666 }
2667};
2668
2669bb_sese::~bb_sese ()
2670{
2671}
2672
2673/* Destructively append CHILD's brackets. */
2674
2675void
2676bb_sese::append (bb_sese *child)
2677{
2678 if (int len = child->brackets.length ())
2679 {
2680 int ix;
2681
2682 if (dump_file)
2683 {
2684 for (ix = 0; ix < len; ix++)
2685 {
2686 const pseudo_node_t &pseudo = child->brackets[ix].back;
2687 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2688 child->node, pseudo.first ? pseudo.first->index : 0,
2689 pseudo.second);
2690 }
2691 }
2692 if (!brackets.length ())
2693 std::swap (brackets, child->brackets);
2694 else
2695 {
2696 brackets.reserve (len);
2697 for (ix = 0; ix < len; ix++)
2698 brackets.quick_push (child->brackets[ix]);
2699 }
2700 }
2701}
2702
2703/* Remove brackets that terminate at PSEUDO. */
2704
2705void
2706bb_sese::remove (const pseudo_node_t &pseudo)
2707{
2708 unsigned removed = 0;
2709 int len = brackets.length ();
2710
2711 for (int ix = 0; ix < len; ix++)
2712 {
2713 if (brackets[ix].back == pseudo)
2714 {
2715 if (dump_file)
2716 fprintf (dump_file, "Removing backedge %d:%+d\n",
2717 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2718 removed++;
2719 }
2720 else if (removed)
2721 brackets[ix-removed] = brackets[ix];
2722 }
2723 while (removed--)
2724 brackets.pop ();
2725}
2726
2727/* Accessors for BB's aux pointer. */
2728#define BB_SET_SESE(B, S) ((B)->aux = (S))
2729#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2730
2731/* DFS walk creating SESE data structures. Only cover nodes with
2732 BB_VISITED set. Append discovered blocks to LIST. We number in
2733 increments of 3 so that the above and below pseudo nodes can be
2734 implicitly numbered too. */
2735
2736static int
2737nvptx_sese_number (int n, int p, int dir, basic_block b,
2738 auto_vec<basic_block> *list)
2739{
2740 if (BB_GET_SESE (b))
2741 return n;
2742
2743 if (dump_file)
2744 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2745 b->index, n, p, dir);
2746
2747 BB_SET_SESE (b, new bb_sese (n, p, dir));
2748 p = n;
2749
2750 n += 3;
2751 list->quick_push (b);
2752
2753 /* First walk the nodes on the 'other side' of this node, then walk
2754 the nodes on the same side. */
2755 for (unsigned ix = 2; ix; ix--)
2756 {
2757 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2758 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2759 : offsetof (edge_def, src));
2760 edge e;
2761 edge_iterator ei;
2762
2763 FOR_EACH_EDGE (e, ei, edges)
2764 {
2765 basic_block target = *(basic_block *)((char *)e + offset);
2766
2767 if (target->flags & BB_VISITED)
2768 n = nvptx_sese_number (n, p, dir, target, list);
2769 }
2770 dir = -dir;
2771 }
2772 return n;
2773}
2774
2775/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2776 EDGES are the outgoing edges and OFFSET is the offset to the src
2777 or dst block on the edges. */
2778
2779static void
2780nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2781 vec<edge, va_gc> *edges, size_t offset)
2782{
2783 edge e;
2784 edge_iterator ei;
2785 int hi_back = depth;
2786 pseudo_node_t node_back (0, depth);
2787 int hi_child = depth;
2788 pseudo_node_t node_child (0, depth);
2789 basic_block child = NULL;
2790 unsigned num_children = 0;
2791 int usd = -dir * sese->dir;
2792
2793 if (dump_file)
2794 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2795 me->index, sese->node, dir);
2796
2797 if (dir < 0)
2798 {
2799 /* This is the above pseudo-child. It has the BB itself as an
2800 additional child node. */
2801 node_child = sese->high;
2802 hi_child = node_child.second;
2803 if (node_child.first)
2804 hi_child += BB_GET_SESE (node_child.first)->node;
2805 num_children++;
2806 }
2807
2808 /* Examine each edge.
2809 - if it is a child (a) append its bracket list and (b) record
2810 whether it is the child with the highest reaching bracket.
2811 - if it is an edge to ancestor, record whether it's the highest
2812 reaching backlink. */
2813 FOR_EACH_EDGE (e, ei, edges)
2814 {
2815 basic_block target = *(basic_block *)((char *)e + offset);
2816
2817 if (bb_sese *t_sese = BB_GET_SESE (target))
2818 {
2819 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2820 {
2821 /* Child node. Append its bracket list. */
2822 num_children++;
2823 sese->append (t_sese);
2824
2825 /* Compare its hi value. */
2826 int t_hi = t_sese->high.second;
2827
2828 if (basic_block child_hi_block = t_sese->high.first)
2829 t_hi += BB_GET_SESE (child_hi_block)->node;
2830
2831 if (hi_child > t_hi)
2832 {
2833 hi_child = t_hi;
2834 node_child = t_sese->high;
2835 child = target;
2836 }
2837 }
2838 else if (t_sese->node < sese->node + dir
2839 && !(dir < 0 && sese->parent == t_sese->node))
2840 {
2841 /* Non-parental ancestor node -- a backlink. */
2842 int d = usd * t_sese->dir;
2843 int back = t_sese->node + d;
2844
2845 if (hi_back > back)
2846 {
2847 hi_back = back;
2848 node_back = pseudo_node_t (target, d);
2849 }
2850 }
2851 }
2852 else
2853 { /* Fallen off graph, backlink to entry node. */
2854 hi_back = 0;
2855 node_back = pseudo_node_t (0, 0);
2856 }
2857 }
2858
2859 /* Remove any brackets that terminate at this pseudo node. */
2860 sese->remove (pseudo_node_t (me, dir));
2861
2862 /* Now push any backlinks from this pseudo node. */
2863 FOR_EACH_EDGE (e, ei, edges)
2864 {
2865 basic_block target = *(basic_block *)((char *)e + offset);
2866 if (bb_sese *t_sese = BB_GET_SESE (target))
2867 {
2868 if (t_sese->node < sese->node + dir
2869 && !(dir < 0 && sese->parent == t_sese->node))
2870 /* Non-parental ancestor node - backedge from me. */
2871 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2872 }
2873 else
2874 {
2875 /* Back edge to entry node. */
2876 sese->push (pseudo_node_t (0, 0));
2877 }
2878 }
2879
2880 /* If this node leads directly or indirectly to a no-return region of
2881 the graph, then fake a backedge to entry node. */
2882 if (!sese->brackets.length () || !edges || !edges->length ())
2883 {
2884 hi_back = 0;
2885 node_back = pseudo_node_t (0, 0);
2886 sese->push (node_back);
2887 }
2888
2889 /* Record the highest reaching backedge from us or a descendant. */
2890 sese->high = hi_back < hi_child ? node_back : node_child;
2891
2892 if (num_children > 1)
2893 {
2894 /* There is more than one child -- this is a Y shaped piece of
2895 spanning tree. We have to insert a fake backedge from this
2896 node to the highest ancestor reached by not-the-highest
2897 reaching child. Note that there may be multiple children
2898 with backedges to the same highest node. That's ok and we
2899 insert the edge to that highest node. */
2900 hi_child = depth;
2901 if (dir < 0 && child)
2902 {
2903 node_child = sese->high;
2904 hi_child = node_child.second;
2905 if (node_child.first)
2906 hi_child += BB_GET_SESE (node_child.first)->node;
2907 }
2908
2909 FOR_EACH_EDGE (e, ei, edges)
2910 {
2911 basic_block target = *(basic_block *)((char *)e + offset);
2912
2913 if (target == child)
2914 /* Ignore the highest child. */
2915 continue;
2916
2917 bb_sese *t_sese = BB_GET_SESE (target);
2918 if (!t_sese)
2919 continue;
2920 if (t_sese->parent != sese->node)
2921 /* Not a child. */
2922 continue;
2923
2924 /* Compare its hi value. */
2925 int t_hi = t_sese->high.second;
2926
2927 if (basic_block child_hi_block = t_sese->high.first)
2928 t_hi += BB_GET_SESE (child_hi_block)->node;
2929
2930 if (hi_child > t_hi)
2931 {
2932 hi_child = t_hi;
2933 node_child = t_sese->high;
2934 }
2935 }
2936
2937 sese->push (node_child);
2938 }
2939}
2940
2941
2942/* DFS walk of BB graph. Color node BLOCK according to COLORING then
2943 proceed to successors. Set SESE entry and exit nodes of
2944 REGIONS. */
2945
2946static void
2947nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2948 basic_block block, int coloring)
2949{
2950 bb_sese *sese = BB_GET_SESE (block);
2951
2952 if (block->flags & BB_VISITED)
2953 {
2954 /* If we've already encountered this block, either we must not
2955 be coloring, or it must have been colored the current color. */
2956 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2957 return;
2958 }
2959
2960 block->flags |= BB_VISITED;
2961
2962 if (sese)
2963 {
2964 if (coloring < 0)
2965 {
2966 /* Start coloring a region. */
2967 regions[sese->color].first = block;
2968 coloring = sese->color;
2969 }
2970
2971 if (!--color_counts[sese->color] && sese->color == coloring)
2972 {
2973 /* Found final block of SESE region. */
2974 regions[sese->color].second = block;
2975 coloring = -1;
2976 }
2977 else
2978 /* Color the node, so we can assert on revisiting the node
2979 that the graph is indeed SESE. */
2980 sese->color = coloring;
2981 }
2982 else
2983 /* Fallen off the subgraph, we cannot be coloring. */
2984 gcc_assert (coloring < 0);
2985
2986 /* Walk each successor block. */
2987 if (block->succs && block->succs->length ())
2988 {
2989 edge e;
2990 edge_iterator ei;
2991
2992 FOR_EACH_EDGE (e, ei, block->succs)
2993 nvptx_sese_color (color_counts, regions, e->dest, coloring);
2994 }
2995 else
2996 gcc_assert (coloring < 0);
2997}
2998
2999/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3000 end up with NULL entries in it. */
3001
3002static void
3003nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3004{
3005 basic_block block;
3006 int ix;
3007
3008 /* First clear each BB of the whole function. */
3009 FOR_EACH_BB_FN (block, cfun)
3010 {
3011 block->flags &= ~BB_VISITED;
3012 BB_SET_SESE (block, 0);
3013 }
3014 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3015 block->flags &= ~BB_VISITED;
3016 BB_SET_SESE (block, 0);
3017 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3018 block->flags &= ~BB_VISITED;
3019 BB_SET_SESE (block, 0);
3020
3021 /* Mark blocks in the function that are in this graph. */
3022 for (ix = 0; blocks.iterate (ix, &block); ix++)
3023 block->flags |= BB_VISITED;
3024
3025 /* Counts of nodes assigned to each color. There cannot be more
3026 colors than blocks (and hopefully there will be fewer). */
3027 auto_vec<unsigned> color_counts;
3028 color_counts.reserve (blocks.length ());
3029
3030 /* Worklist of nodes in the spanning tree. Again, there cannot be
3031 more nodes in the tree than blocks (there will be fewer if the
3032 CFG of blocks is disjoint). */
3033 auto_vec<basic_block> spanlist;
3034 spanlist.reserve (blocks.length ());
3035
3036 /* Make sure every block has its cycle class determined. */
3037 for (ix = 0; blocks.iterate (ix, &block); ix++)
3038 {
3039 if (BB_GET_SESE (block))
3040 /* We already met this block in an earlier graph solve. */
3041 continue;
3042
3043 if (dump_file)
3044 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3045
3046 /* Number the nodes reachable from BLOCK in initial DFS order. */
3047 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3048
3049 /* Now walk in reverse DFS order to find cycle equivalents. */
3050 while (spanlist.length ())
3051 {
3052 block = spanlist.pop ();
3053 bb_sese *sese = BB_GET_SESE (block);
3054
3055 /* Do the pseudo node below. */
3056 nvptx_sese_pseudo (block, sese, depth, +1,
3057 sese->dir > 0 ? block->succs : block->preds,
3058 (sese->dir > 0 ? offsetof (edge_def, dest)
3059 : offsetof (edge_def, src)));
3060 sese->set_color (color_counts);
3061 /* Do the pseudo node above. */
3062 nvptx_sese_pseudo (block, sese, depth, -1,
3063 sese->dir < 0 ? block->succs : block->preds,
3064 (sese->dir < 0 ? offsetof (edge_def, dest)
3065 : offsetof (edge_def, src)));
3066 }
3067 if (dump_file)
3068 fprintf (dump_file, "\n");
3069 }
3070
3071 if (dump_file)
3072 {
3073 unsigned count;
3074 const char *comma = "";
3075
3076 fprintf (dump_file, "Found %d cycle equivalents\n",
3077 color_counts.length ());
3078 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3079 {
3080 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3081
3082 comma = "";
3083 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3084 if (BB_GET_SESE (block)->color == ix)
3085 {
3086 block->flags |= BB_VISITED;
3087 fprintf (dump_file, "%s%d", comma, block->index);
3088 comma = ",";
3089 }
3090 fprintf (dump_file, "}");
3091 comma = ", ";
3092 }
3093 fprintf (dump_file, "\n");
3094 }
3095
3096 /* Now we've colored every block in the subgraph. We now need to
3097 determine the minimal set of SESE regions that cover that
3098 subgraph. Do this with a DFS walk of the complete function.
3099 During the walk we're either 'looking' or 'coloring'. When we
3100 reach the last node of a particular color, we stop coloring and
3101 return to looking. */
3102
3103 /* There cannot be more SESE regions than colors. */
3104 regions.reserve (color_counts.length ());
3105 for (ix = color_counts.length (); ix--;)
3106 regions.quick_push (bb_pair_t (0, 0));
3107
3108 for (ix = 0; blocks.iterate (ix, &block); ix++)
3109 block->flags &= ~BB_VISITED;
3110
3111 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3112
3113 if (dump_file)
3114 {
3115 const char *comma = "";
3116 int len = regions.length ();
3117
3118 fprintf (dump_file, "SESE regions:");
3119 for (ix = 0; ix != len; ix++)
3120 {
3121 basic_block from = regions[ix].first;
3122 basic_block to = regions[ix].second;
3123
3124 if (from)
3125 {
3126 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3127 if (to != from)
3128 fprintf (dump_file, "->%d", to->index);
3129
3130 int color = BB_GET_SESE (from)->color;
3131
3132 /* Print the blocks within the region (excluding ends). */
3133 FOR_EACH_BB_FN (block, cfun)
3134 {
3135 bb_sese *sese = BB_GET_SESE (block);
3136
3137 if (sese && sese->color == color
3138 && block != from && block != to)
3139 fprintf (dump_file, ".%d", block->index);
3140 }
3141 fprintf (dump_file, "}");
3142 }
3143 comma = ",";
3144 }
3145 fprintf (dump_file, "\n\n");
3146 }
3147
3148 for (ix = 0; blocks.iterate (ix, &block); ix++)
3149 delete BB_GET_SESE (block);
3150}
3151
3152#undef BB_SET_SESE
3153#undef BB_GET_SESE
3154
3155/* Propagate live state at the start of a partitioned region. BLOCK
3156 provides the live register information, and might not contain
3157 INSN. Propagation is inserted just after INSN. RW indicates whether
3158    we are reading and/or writing state.  This separation is needed
3159    for worker-level propagation, where we essentially do a spill &
3160    fill.  FN is the underlying worker function to generate the
3161    propagation instructions for a single register.  DATA is user
3162    data.
3163
3164 We propagate the live register set and the entire frame. We could
3165 do better by (a) propagating just the live set that is used within
3166 the partitioned regions and (b) only propagating stack entries that
3167 are used. The latter might be quite hard to determine. */
3168
3169typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3170
3171static void
3172nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3173 propagator_fn fn, void *data)
3174{
3175 bitmap live = DF_LIVE_IN (block);
3176 bitmap_iterator iterator;
3177 unsigned ix;
3178
3179 /* Copy the frame array. */
3180 HOST_WIDE_INT fs = get_frame_size ();
3181 if (fs)
3182 {
3183 rtx tmp = gen_reg_rtx (DImode);
3184 rtx idx = NULL_RTX;
3185 rtx ptr = gen_reg_rtx (Pmode);
3186 rtx pred = NULL_RTX;
3187 rtx_code_label *label = NULL;
3188
3189 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3190 fs /= GET_MODE_SIZE (DImode);
3191 /* Detect single iteration loop. */
3192 if (fs == 1)
3193 fs = 0;
3194
3195 start_sequence ();
3196 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3197 if (fs)
3198 {
3199 idx = gen_reg_rtx (SImode);
3200 pred = gen_reg_rtx (BImode);
3201 label = gen_label_rtx ();
3202
3203 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3204 /* Allow worker function to initialize anything needed. */
3205 rtx init = fn (tmp, PM_loop_begin, fs, data);
3206 if (init)
3207 emit_insn (init);
3208 emit_label (label);
3209 LABEL_NUSES (label)++;
3210 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3211 }
3212 if (rw & PM_read)
3213 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3214 emit_insn (fn (tmp, rw, fs, data));
3215 if (rw & PM_write)
3216 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3217 if (fs)
3218 {
3219 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3220 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3221 emit_insn (gen_br_true_uni (pred, label));
3222 rtx fini = fn (tmp, PM_loop_end, fs, data);
3223 if (fini)
3224 emit_insn (fini);
3225 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3226 }
3227 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3228 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3229 rtx cpy = get_insns ();
3230 end_sequence ();
3231 insn = emit_insn_after (cpy, insn);
3232 }
3233
3234 /* Copy live registers. */
3235 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3236 {
3237 rtx reg = regno_reg_rtx[ix];
3238
3239 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3240 {
3241 rtx bcast = fn (reg, rw, 0, data);
3242
3243 insn = emit_insn_after (bcast, insn);
3244 }
3245 }
3246}
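/* Schematically, the frame-copying loop built above is (pseudo-code
   for a frame of FS DImode slots; a sketch, not the literal emitted
   RTL):

     ptr = frame_pointer;  idx = FS;
     <FN: PM_loop_begin setup>
   loop:
     idx -= 1;
     tmp = *ptr;                  // if RW includes PM_read
     <FN: broadcast/spill/fill of tmp>
     *ptr = tmp;                  // if RW includes PM_write
     ptr += 8;
     if (idx != 0) goto loop;
     <FN: PM_loop_end teardown>

   A single-slot frame (FS == 1) skips the loop machinery entirely.  */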
3247
3248/* Worker for nvptx_vpropagate. */
3249
3250static rtx
3251vprop_gen (rtx reg, propagate_mask pm,
3252 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3253{
3254 if (!(pm & PM_read_write))
3255 return 0;
3256
3257 return nvptx_gen_vcast (reg);
3258}
3259
3260/* Propagate state that is live at start of BLOCK across the vectors
3261 of a single warp. Propagation is inserted just after INSN. */
3262
3263static void
3264nvptx_vpropagate (basic_block block, rtx_insn *insn)
3265{
3266 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3267}
3268
3269/* Worker for nvptx_wpropagate. */
3270
3271static rtx
3272wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3273{
3274 wcast_data_t *data = (wcast_data_t *)data_;
3275
3276 if (pm & PM_loop_begin)
3277 {
3278 /* Starting a loop, initialize pointer. */
3279 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3280
3281 if (align > worker_bcast_align)
3282 worker_bcast_align = align;
3283 data->offset = (data->offset + align - 1) & ~(align - 1);
3284
3285 data->ptr = gen_reg_rtx (Pmode);
3286
3287 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3288 }
3289 else if (pm & PM_loop_end)
3290 {
3291 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3292 data->ptr = NULL_RTX;
3293 return clobber;
3294 }
3295 else
3296 return nvptx_gen_wcast (reg, pm, rep, data);
3297}
3298
3299/* Spill or fill live state that is live at start of BLOCK. PRE_P
3300 indicates if this is just before partitioned mode (do spill), or
3301 just after it starts (do fill). Sequence is inserted just after
3302 INSN. */
3303
3304static void
3305nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3306{
3307 wcast_data_t data;
3308
3309 data.base = gen_reg_rtx (Pmode);
3310 data.offset = 0;
3311 data.ptr = NULL_RTX;
3312
3313 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3314 if (data.offset)
3315 {
3316 /* Stuff was emitted, initialize the base pointer now. */
3317 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3318 emit_insn_after (init, insn);
3319
3320 if (worker_bcast_size < data.offset)
3321 worker_bcast_size = data.offset;
3322 }
3323}
3324
3325/* Emit a worker-level synchronization barrier. We use different
3326 markers for before and after synchronizations. */
3327
3328static rtx
3329nvptx_wsync (bool after)
3330{
3331 return gen_nvptx_barsync (GEN_INT (after));
3332}
3333
3334/* Single neutering according to MASK. FROM is the incoming block and
3335 TO is the outgoing block. These may be the same block. Insert at
3336 start of FROM:
3337
3338 if (tid.<axis>) goto end.
3339
3340 and insert before ending branch of TO (if there is such an insn):
3341
3342 end:
3343 <possibly-broadcast-cond>
3344 <branch>
3345
3346 We currently only use different FROM and TO when skipping an entire
3347 loop. We could do more if we detected superblocks. */
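/* For instance, neutering a worker-single region emits roughly
   (illustrative PTX; the predicate and label names are invented):

	@ %wpred bra.uni $L_skip;	// non-zero workers jump ahead
	...code now executed by worker zero only...
   $L_skip:

   with any branch condition broadcast to the other workers just
   before the final branch, as implemented below.  */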
3348
3349static void
3350nvptx_single (unsigned mask, basic_block from, basic_block to)
3351{
3352 rtx_insn *head = BB_HEAD (from);
3353 rtx_insn *tail = BB_END (to);
3354 unsigned skip_mask = mask;
3355
3356 /* Find first insn of FROM block. */
3357 while (head != BB_END (from) && !INSN_P (head))
3358 head = NEXT_INSN (head);
3359
3360 /* Find last insn of TO block. */
3361 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3362 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3363 tail = PREV_INSN (tail);
3364
3365 /* Detect if tail is a branch. */
3366 rtx tail_branch = NULL_RTX;
3367 rtx cond_branch = NULL_RTX;
3368 if (tail && INSN_P (tail))
3369 {
3370 tail_branch = PATTERN (tail);
3371 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3372 tail_branch = NULL_RTX;
3373 else
3374 {
3375 cond_branch = SET_SRC (tail_branch);
3376 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3377 cond_branch = NULL_RTX;
3378 }
3379 }
3380
3381 if (tail == head)
3382 {
3383 /* If this is empty, do nothing. */
3384 if (!head || !INSN_P (head))
3385 return;
3386
3387 /* If this is a dummy insn, do nothing. */
3388 switch (recog_memoized (head))
3389 {
3390 default:
3391 break;
3392 case CODE_FOR_nvptx_fork:
3393 case CODE_FOR_nvptx_forked:
3394 case CODE_FOR_nvptx_joining:
3395 case CODE_FOR_nvptx_join:
3396 return;
3397 }
3398
3399 if (cond_branch)
3400 {
3401 /* If we're only doing vector single, there's no need to
3402 emit skip code because we'll not insert anything. */
3403 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3404 skip_mask = 0;
3405 }
3406 else if (tail_branch)
3407 /* Block with only unconditional branch. Nothing to do. */
3408 return;
3409 }
3410
3411 /* Insert the vector test inside the worker test. */
3412 unsigned mode;
3413 rtx_insn *before = tail;
3414 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3415 if (GOMP_DIM_MASK (mode) & skip_mask)
3416 {
3417 rtx_code_label *label = gen_label_rtx ();
3418 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3419
3420 if (!pred)
3421 {
3422 pred = gen_reg_rtx (BImode);
3423 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3424 }
3425
3426 rtx br;
3427 if (mode == GOMP_DIM_VECTOR)
3428 br = gen_br_true (pred, label);
3429 else
3430 br = gen_br_true_uni (pred, label);
3431 emit_insn_before (br, head);
3432
3433 LABEL_NUSES (label)++;
3434 if (tail_branch)
3435 before = emit_label_before (label, before);
3436 else
3437 emit_label_after (label, tail);
3438 }
3439
3440 /* Now deal with propagating the branch condition. */
3441 if (cond_branch)
3442 {
3443 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3444
3445 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3446 {
3447 /* Vector mode only, do a shuffle. */
3448 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3449 }
3450 else
3451 {
3452 /* Includes worker mode, do spill & fill. By construction
3453 we should never have worker mode only. */
3454 wcast_data_t data;
3455
3456 data.base = worker_bcast_sym;
3457 data.ptr = 0;
3458
3459 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3460 worker_bcast_size = GET_MODE_SIZE (SImode);
3461
3462 data.offset = 0;
3463 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3464 before);
3465 /* Barrier so other workers can see the write. */
3466 emit_insn_before (nvptx_wsync (false), tail);
3467 data.offset = 0;
3468 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3469 /* This barrier is needed to avoid worker zero clobbering
3470 the broadcast buffer before all the other workers have
3471 had a chance to read this instance of it. */
3472 emit_insn_before (nvptx_wsync (true), tail);
3473 }
3474
3475 extract_insn (tail);
3476 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3477 UNSPEC_BR_UNIFIED);
3478 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3479 }
3480}
3481
3482/* PAR is a parallel that is being skipped in its entirety according to
3483 MASK. Treat this as skipping a superblock starting at forked
3484 and ending at joining. */
3485
3486static void
3487nvptx_skip_par (unsigned mask, parallel *par)
3488{
3489 basic_block tail = par->join_block;
3490 gcc_assert (tail->preds->length () == 1);
3491
3492 basic_block pre_tail = (*tail->preds)[0]->src;
3493 gcc_assert (pre_tail->succs->length () == 1);
3494
3495 nvptx_single (mask, par->forked_block, pre_tail);
3496}
3497
3498/* If PAR has a single inner parallel and PAR itself only contains
3499 empty entry and exit blocks, swallow the inner PAR. */
3500
3501static void
3502nvptx_optimize_inner (parallel *par)
3503{
3504 parallel *inner = par->inner;
3505
3506 /* We mustn't be the outer dummy par. */
3507 if (!par->mask)
3508 return;
3509
3510 /* We must have a single inner par. */
3511 if (!inner || inner->next)
3512 return;
3513
3514 /* We must only contain 2 blocks ourselves -- the head and tail of
3515 the inner par. */
3516 if (par->blocks.length () != 2)
3517 return;
3518
3519 /* The partitionings must be disjoint.  As we only have vector and
3520    worker partitioning, this is sufficient to guarantee the pars
3521    have adjacent partitioning. */
3522 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3523 /* This indicates malformed code generation. */
3524 return;
3525
3526 /* The outer forked insn should be immediately followed by the inner
3527 fork insn. */
3528 rtx_insn *forked = par->forked_insn;
3529 rtx_insn *fork = BB_END (par->forked_block);
3530
3531 if (NEXT_INSN (forked) != fork)
3532 return;
3533 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3534
3535 /* The outer joining insn must immediately follow the inner join
3536 insn. */
3537 rtx_insn *joining = par->joining_insn;
3538 rtx_insn *join = inner->join_insn;
3539 if (NEXT_INSN (join) != joining)
3540 return;
3541
3542 /* Preconditions met. Swallow the inner par. */
3543 if (dump_file)
3544 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3545 inner->mask, inner->forked_block->index,
3546 inner->join_block->index,
3547 par->mask, par->forked_block->index, par->join_block->index);
3548
3549 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3550
3551 par->blocks.reserve (inner->blocks.length ());
3552 while (inner->blocks.length ())
3553 par->blocks.quick_push (inner->blocks.pop ());
3554
3555 par->inner = inner->inner;
3556 inner->inner = NULL;
3557
3558 delete inner;
3559}
3560
3561/* Process the parallel PAR and all its contained
3562 parallels. We do everything but the neutering. Return mask of
3563 partitioned modes used within this parallel. */
3564
3565static unsigned
3566nvptx_process_pars (parallel *par)
3567{
3568 if (nvptx_optimize)
3569 nvptx_optimize_inner (par);
3570
3571 unsigned inner_mask = par->mask;
3572
3573 /* Do the inner parallels first. */
3574 if (par->inner)
3575 {
3576 par->inner_mask = nvptx_process_pars (par->inner);
3577 inner_mask |= par->inner_mask;
3578 }
3579
3580 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3581 /* No propagation needed for a call. */;
5d306e55 3582 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3583 {
3584 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3585 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3586 /* Insert begin and end synchronizations. */
3587 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3588 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3589 }
3590 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3591 nvptx_vpropagate (par->forked_block, par->forked_insn);
3592
3593 /* Now do siblings. */
3594 if (par->next)
3595 inner_mask |= nvptx_process_pars (par->next);
3596 return inner_mask;
3597}
3598
3599/* Neuter the parallel described by PAR. We recurse in depth-first
3600 order. MODES are the partitioning of the execution and OUTER is
3601 the partitioning of the parallels we are contained in. */
3602
3603static void
3604nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3605{
3606 unsigned me = (par->mask
3607 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3608 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3609 unsigned skip_mask = 0, neuter_mask = 0;
3610
3611 if (par->inner)
3612 nvptx_neuter_pars (par->inner, modes, outer | me);
3613
3614 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3615 {
3616 if ((outer | me) & GOMP_DIM_MASK (mode))
3617 {} /* Mode is partitioned: no neutering. */
3618 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 3619 {} /* Mode is not used: nothing to do. */
3620 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3621 || !par->forked_insn)
3622 /* Partitioned in inner parallels, or we're not partitioned
3623    at all: neuter individual blocks. */
3624 neuter_mask |= GOMP_DIM_MASK (mode);
3625 else if (!par->parent || !par->parent->forked_insn
3626 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3627 /* Parent isn't a parallel, or already contains this partitioning:
3628    skip the parallel at this level. */
3629 skip_mask |= GOMP_DIM_MASK (mode);
3630 else
3631 {} /* Parent will skip this parallel itself. */
3632 }
3633
3634 if (neuter_mask)
3635 {
912442c2 3636 int ix, len;
d88cd9c4 3637
3638 if (nvptx_optimize)
3639 {
3640 /* Neuter whole SESE regions. */
3641 bb_pair_vec_t regions;
3642
3643 nvptx_find_sese (par->blocks, regions);
3644 len = regions.length ();
3645 for (ix = 0; ix != len; ix++)
3646 {
3647 basic_block from = regions[ix].first;
3648 basic_block to = regions[ix].second;
3649
3650 if (from)
3651 nvptx_single (neuter_mask, from, to);
3652 else
3653 gcc_assert (!to);
3654 }
3655 }
3656 else
d88cd9c4 3657 {
3658 /* Neuter each BB individually. */
3659 len = par->blocks.length ();
3660 for (ix = 0; ix != len; ix++)
3661 {
3662 basic_block block = par->blocks[ix];
d88cd9c4 3663
3664 nvptx_single (neuter_mask, block, block);
3665 }
3666 }
3667 }
3668
3669 if (skip_mask)
3670 nvptx_skip_par (skip_mask, par);
3671
3672 if (par->next)
3673 nvptx_neuter_pars (par->next, modes, outer);
3674}
3675
517665b3 3676/* PTX-specific reorganization
d88cd9c4 3677 - Split blocks at fork and join instructions
3678 - Compute live registers
3679 - Mark now-unused registers, so function begin doesn't declare
517665b3 3680 unused registers.
3681 - Insert state propagation when entering partitioned mode
3682 - Insert neutering instructions when in single mode
c38f0d8c 3683 - Replace subregs with suitable sequences.
3684*/
3685
3686static void
3687nvptx_reorg (void)
3688{
3689 /* We are freeing block_for_insn in the toplev to keep compatibility
3690 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3691 compute_bb_for_insn ();
3692
3693 thread_prologue_and_epilogue_insns ();
3694
3695 /* Split blocks and record interesting unspecs. */
3696 bb_insn_map_t bb_insn_map;
3697
3698 nvptx_split_blocks (&bb_insn_map);
3699
c38f0d8c 3700 /* Compute live regs. */
3701 df_clear_flags (DF_LR_RUN_DCE);
3702 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3703 df_live_add_problem ();
3704 df_live_set_all_dirty ();
517665b3 3705 df_analyze ();
3706 regstat_init_n_sets_and_refs ();
3707
3708 if (dump_file)
3709 df_dump (dump_file);
3710
517665b3 3711 /* Mark unused regs as unused. */
d88cd9c4 3712 int max_regs = max_reg_num ();
517665b3 3713 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3714 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3715 regno_reg_rtx[i] = const0_rtx;
517665b3 3716
3717 /* Determine launch dimensions of the function. If it is not an
3718 offloaded function (i.e. this is a regular compiler), the
3719 function has no neutering. */
3720 tree attr = get_oacc_fn_attrib (current_function_decl);
3721 if (attr)
3722 {
3723 /* If we determined this mask before RTL expansion, we could
3724 elide emission of some levels of forks and joins. */
3725 unsigned mask = 0;
3726 tree dims = TREE_VALUE (attr);
3727 unsigned ix;
3728
3729 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3730 {
3731 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3732 tree allowed = TREE_PURPOSE (dims);
3733
3734 if (size != 1 && !(allowed && integer_zerop (allowed)))
3735 mask |= GOMP_DIM_MASK (ix);
3736 }
3737 /* If there is worker neutering, there must be vector
3738 neutering. Otherwise the hardware will fail. */
3739 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3740 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3741
3742 /* Discover & process partitioned regions. */
3743 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3744 nvptx_process_pars (pars);
3745 nvptx_neuter_pars (pars, mask, 0);
3746 delete pars;
3747 }
3748
517665b3 3749 /* Replace subregs. */
c03b0416 3750 nvptx_reorg_subreg ();
517665b3 3751
738f2522 3752 regstat_free_n_sets_and_refs ();
3753
3754 df_finish_pass (true);
3755}
3756\f
3757/* Handle a "kernel" attribute; arguments as in
3758 struct attribute_spec.handler. */
3759
3760static tree
3761nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3762 int ARG_UNUSED (flags), bool *no_add_attrs)
3763{
3764 tree decl = *node;
3765
3766 if (TREE_CODE (decl) != FUNCTION_DECL)
3767 {
3768 error ("%qE attribute only applies to functions", name);
3769 *no_add_attrs = true;
3770 }
3771
3772 else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
3773 {
3774 error ("%qE attribute requires a void return type", name);
3775 *no_add_attrs = true;
3776 }
3777
3778 return NULL_TREE;
3779}
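/* Minimal usage sketch (hypothetical user code):

     void __attribute__((kernel)) my_entry (void *args);

   Non-function declarations and non-void return types are rejected
   by the checks above.  */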

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
\f
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be paired with a
   host-side compiler that does support anchoring, and hence see the
   anchor flag set (as it's not recalculated).  So provide an
   implementation that denies anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
\f
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
               IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
        tree attr = get_oacc_fn_attrib (decl);
        tree dims = TREE_VALUE (attr);
        unsigned ix;

        fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
                 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

        for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
          {
            int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

            gcc_assert (!TREE_PURPOSE (dims));
            fprintf (asm_out_file, ", %#x", size);
          }

        fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
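
/* Output sketch (inferred from the fprintf formats above; the symbol
   names are hypothetical): a recorded variable emits

     //:VAR_MAP "my_var"

   and an offloaded function with launch dims (1, 32, 32) emits

     //:FUNC_MAP "my_fn", 0x1, 0x20, 0x20

   lines that mkoffload scans to build its host-side mapping table.  */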

/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
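
/* Output sketch (reading off the calls above, assuming a 64-bit
   Pmode): the emitted preamble is

     // BEGIN PREAMBLE
        .version 3.1
        .target sm_30
        .address_size 64
     // END PREAMBLE

   while a 32-bit configuration prints ".address_size 32".  */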

/* Write out the function declarations we've collected and declare
   storage for the broadcast and worker-reduction buffers.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    {
      /* Define the broadcast buffer.  */

      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
        & ~(worker_bcast_align - 1);

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_bcast_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
               worker_bcast_align,
               worker_bcast_name, worker_bcast_size);
    }

  if (worker_red_size)
    {
      /* Define the reduction buffer.  */

      worker_red_size = ((worker_red_size + worker_red_align - 1)
                         & ~(worker_red_align - 1));

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_red_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
               worker_red_align,
               worker_red_name, worker_red_size);
    }
}
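
/* Sizing sketch (illustrative numbers): with worker_red_align 8 and a
   high-water worker_red_size of 36, the round-up above computes
   (36 + 7) & ~7 = 40 and the emitted definition is

     .shared .align 8 .u8 <worker_red_name>[40];

   where <worker_red_name> stands for the buffer symbol declared
   elsewhere in this file.  */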

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
                         NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
                         NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
                        NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}

/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
                          machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  emit_insn (gen_rtx_SET (target, worker_red_sym));

  if (offset)
    emit_insn (gen_rtx_SET (target,
                            gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));

  emit_insn (gen_rtx_SET (target,
                          gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
                                          UNSPEC_FROM_SHARED)));

  return target;
}

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
                       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
                         NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
                         NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
                         NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}

/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)                                                \
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]                            \
   = add_builtin_function ("__builtin_nvptx_" NAME,                     \
                           build_function_type_list T,                  \
                           NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
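
/* Usage sketch (hypothetical calls illustrating the signatures
   registered above; the variable names are placeholders):

     unsigned v   = __builtin_nvptx_shuffle (x, 16, SHUFFLE_DOWN);
     void    *p   = __builtin_nvptx_worker_addr (offset, size, align);
     unsigned old = __builtin_nvptx_cmp_swap (p, expected, desired);

   In practice the compiler synthesizes these calls itself when
   lowering OpenACC reductions; see nvptx_get_worker_red_addr and
   nvptx_lockless_update below.  */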

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
                      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
\f
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
        warning_at (DECL_SOURCE_LOCATION (decl), 0,
                    dims[GOMP_DIM_VECTOR]
                    ? "using vector_length (%d), ignoring %d"
                    : "using vector_length (%d), ignoring runtime setting",
                    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check that num_workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (DECL_SOURCE_LOCATION (decl), 0,
                  "using num_workers (%d), ignoring %d",
                  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  return changed;
}
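
/* Worked example (illustrative values): a region requesting
   num_gangs (256) num_workers (64) vector_length (128) arrives as
   dims = {256, 64, 128}.  The checks above warn
   "using vector_length (32), ignoring 128" and
   "using num_workers (32), ignoring 64", and return with
   dims = {256, 32, 32}; the gang dimension is not limited here
   (see nvptx_dim_limit below).  */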

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
                       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
                              GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
                               tree dest_var, tree var, unsigned shift,
                               gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
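
/* Type-dispatch sketch for the function above (a descriptive note,
   not code): a float is VIEW_CONVERTed to unsigned and moved with the
   32-bit shuffle; a double travels through long long unsigned and the
   64-bit shuffle; a _Complex float is split with REALPART_EXPR and
   IMAGPART_EXPR, each half shuffled as a 32-bit value, and the halves
   recombined with COMPLEX_EXPR.  */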

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
                                        TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
     return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
                                        ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
                       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
                                   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
                                   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
                                     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
                        tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
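
/* Dispatch examples (illustrative): 32-bit types (int, unsigned,
   float) and 64-bit types (long long, double, _Complex float) match a
   cmp&swap width and take the lockless path; a 128-bit _Complex
   double does not, and falls back to the global-lock scheme.  */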

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
                                     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
                                                     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
                                             NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
        {
          /* If there's no receiver object, propagate the incoming VAR.  */
          tree ref_to_res = gimple_call_arg (call, 1);
          if (integer_zerop (ref_to_res))
            init = var;
        }

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO.  Emit this as an actual loop,
         but that requires a method of emitting a unified jump at the
         gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
        {
          tree other_var = make_ssa_name (TREE_TYPE (var));
          nvptx_generate_vector_shuffle (gimple_location (call),
                                         other_var, var, shfl, &seq);

          r = make_ssa_name (TREE_TYPE (var));
          gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
                                           var, other_var), &seq);
          var = r;
        }
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
        {
          /* Get reduction buffer address.  */
          tree offset = gimple_call_arg (call, 5);
          tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
          tree ptr = make_ssa_name (TREE_TYPE (call));

          gimplify_assign (ptr, call, &seq);
          accum = ptr;
        }
      else if (integer_zerop (ref_to_res))
        r = var;
      else
        accum = ref_to_res;

      if (accum)
        {
          /* UPDATE the accumulator.  */
          gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
          seq = NULL;
          r = nvptx_reduction_update (gimple_location (call), &gsi,
                                      accum, var, op);
        }
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
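
/* Shuffle-tree sketch for the vector case above (a descriptive note):
   with PTX_VECTOR_LENGTH 32 the loop runs shfl = 16, 8, 4, 2, 1,
   i.e. log2 (32) = 5 rounds of

     other_var = shuffle-down (var, shfl);  var = var OP other_var;

   after which lane 0 of the warp holds the fully reduced value.  */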

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
        gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}
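
/* Lifecycle sketch (an abbreviated, illustrative view of the calls
   this expander receives; see the handlers above for the arguments
   each phase actually reads):

     v = GOACC_REDUCTION (SETUP, ref_to_res, v, level, op, offset);
     v = GOACC_REDUCTION (INIT, ...);      // v = initval (op)
     ... partitioned loop accumulating into v ...
     v = GOACC_REDUCTION (FINI, ...);      // combine across LEVEL
     v = GOACC_REDUCTION (TEARDOWN, ...);  // write back the result
   */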

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"