/* Target code for NVPTX.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */
#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->ret_reg_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

/* Return the number of pieces to use when dealing with a pseudo of *PMODE.
   Alter *PMODE if we return a number greater than one.  */

static int
maybe_split_mode (machine_mode *pmode)
{
  machine_mode mode = *pmode;

  if (COMPLEX_MODE_P (mode))
    {
      *pmode = GET_MODE_INNER (mode);
      return 2;
    }
  else if (mode == TImode)
    {
      *pmode = DImode;
      return 2;
    }
  return 1;
}
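
/* A sketch of the resulting mapping: a DCmode (complex double) pseudo
   is handled as two DFmode pieces, and a TImode pseudo as two DImode
   pieces; anything else stays as a single piece of its original
   mode.  */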

/* Like maybe_split_mode, but only return whether or not the mode
   needs to be split.  */
static bool
nvptx_split_reg_p (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return true;
  if (mode == TImode)
    return true;
  return false;
}

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

#define PASS_IN_REG_P(MODE, TYPE)				\
  ((GET_MODE_CLASS (MODE) == MODE_INT				\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT		\
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)	\
	&& !AGGREGATE_TYPE_P (TYPE)))				\
   && (MODE) != TImode)

#define RETURN_IN_REG_P(MODE)			\
  ((GET_MODE_CLASS (MODE) == MODE_INT		\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT)	\
   && GET_MODE_SIZE (MODE) <= 8)
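
/* Worked example (a sketch): an int (SImode) or double (DFmode)
   argument satisfies PASS_IN_REG_P and travels in a register, while a
   TImode value or a BLKmode aggregate fails the test and is passed by
   reference instead.  Similarly a DImode result can be returned in a
   register, but a 16-byte TImode result cannot (size > 8) and goes in
   memory.  */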

/* Perform a mode promotion for a function argument with MODE.  Return
   the promoted mode.  */

static machine_mode
arg_promotion (machine_mode mode)
{
  if (mode == QImode || mode == HImode)
    return SImode;
  return mode;
}

/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.  */

static int
write_one_arg (std::stringstream &s, tree type, int i, machine_mode mode,
	       bool no_arg_types)
{
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  int count = maybe_split_mode (&mode);

  if (count == 2)
    {
      write_one_arg (s, NULL_TREE, i, mode, false);
      write_one_arg (s, NULL_TREE, i + 1, mode, false);
      return i + 1;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  if (i > 0)
    s << ", ";
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << (i + 1) << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i;
}
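
/* For instance, a prototyped function taking (int, double) gets
   something like ".param.u32 %in_ar1, .param.f64 %in_ar2" written to
   S, while a _Complex double argument is split by the recursive calls
   above into two consecutive .f64 params.  */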

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Write a function decl for DECL to S, where NAME is the name to be used.
   This includes ptx .visible or .extern specifiers, .func or .kernel, and
   argument and return types.  */

static void
nvptx_write_function_decl (std::stringstream &s, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  tree args = TYPE_ARG_TYPES (fntype);
  tree attrs = DECL_ATTRIBUTES (decl);
  bool kernel = write_as_kernel (attrs);
  bool is_main = strcmp (name, "main") == 0;
  bool args_from_decl = false;

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  if (args == 0)
    {
      args = DECL_ARGUMENTS (decl);
      args_from_decl = true;
    }

  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");

  if (kernel)
    s << ".entry ";
  else
    s << ".func ";

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
	return_in_mem = true;
      else
	{
	  mode = arg_promotion (mode);
	  s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
	    << " %out_retval)";
	}
    }

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;

  /* Declare argument types.  */
  if ((args != NULL_TREE
       && !(TREE_CODE (args) == TREE_LIST
	    && TREE_VALUE (args) == void_type_node))
      || is_main
      || return_in_mem
      || DECL_STATIC_CHAIN (decl))
    {
      s << "(";
      int i = 0;
      bool any_args = false;
      if (return_in_mem)
	{
	  s << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar1";
	  i++;
	}
      while (args != NULL_TREE)
	{
	  tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
	  machine_mode mode = TYPE_MODE (type);

	  if (mode != VOIDmode)
	    {
	      i = write_one_arg (s, type, i, mode,
				 TYPE_ARG_TYPES (fntype) == 0);
	      any_args = true;
	      i++;
	    }
	  args = TREE_CHAIN (args);
	}
      if (stdarg_p (fntype))
	{
	  gcc_assert (i > 0);
	  s << ", .param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
	}
      if (DECL_STATIC_CHAIN (decl))
	{
	  if (i > 0)
	    s << ", ";
	  s << ".reg.u" << GET_MODE_BITSIZE (Pmode)
	    << reg_names [STATIC_CHAIN_REGNUM];
	}
      if (!any_args && is_main)
	s << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
	  << " %argv";
      s << ")";
    }
}

/* Walk either ARGTYPES or ARGS if the former is null, and write out part of
   the function header to FILE.  If WRITE_COPY is false, write reg
   declarations, otherwise write the copy from the incoming argument to that
   reg.  RETURN_IN_MEM indicates whether to start counting arg numbers at 1
   instead of 0.  */

static void
walk_args_for_param (FILE *file, tree argtypes, tree args, bool write_copy,
		     bool return_in_mem)
{
  int i;

  bool args_from_decl = false;
  if (argtypes == 0)
    args_from_decl = true;
  else
    args = argtypes;

  for (i = return_in_mem ? 1 : 0; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
      machine_mode mode = TYPE_MODE (type);

      if (mode == VOIDmode)
	break;

      if (!PASS_IN_REG_P (mode, type))
	mode = Pmode;

      int count = maybe_split_mode (&mode);
      if (count == 1)
	{
	  if (argtypes == NULL && !AGGREGATE_TYPE_P (type))
	    {
	      if (mode == SFmode)
		mode = DFmode;
	    }
	}
      mode = arg_promotion (mode);
      while (count-- > 0)
	{
	  i++;
	  if (write_copy)
	    fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
		     nvptx_ptx_type_from_mode (mode, false), i, i);
	  else
	    fprintf (file, "\t.reg%s %%ar%d;\n",
		     nvptx_ptx_type_from_mode (mode, false), i);
	}
    }
}

/* Write a .func or .kernel declaration (not a definition) along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  */

static void
write_function_decl_and_comment (std::stringstream &s, const char *name, const_tree decl)
{
  s << "\n// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DECL: ";
  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";
  nvptx_write_function_decl (s, name, decl);
  s << ";\n";
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If DECL is a FUNCTION_DECL, check the hash table to see if we
   already encountered it, and if not, insert it and write a ptx
   declaration that will be output at the end of compilation.  */

static bool
nvptx_record_fndecl (tree decl, bool force = false)
{
  if (decl == NULL_TREE || TREE_CODE (decl) != FUNCTION_DECL
      || !DECL_EXTERNAL (decl))
    return true;

  if (!force && TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    return false;

  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      write_function_decl_and_comment (func_decls, name, decl);
    }
  return true;
}

/* Record that we need to emit a ptx decl for DECL.  Either do it now, or
   record it for later in case we have no argument information at this
   point.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (nvptx_record_fndecl (decl))
    return;

  tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    *slot = decl;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
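
/* For REGNO 5 on the "y" axis, for example, the code above emits
   (approximately):

	{
		.reg.u32	%y;
		mov.u32		%y, %tid.y;
		setp.ne.u32	%r5, %y, 0;
	}

   leaving predicate %r5 true for every thread that is not lane zero
   on that axis.  */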

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  name = nvptx_name_replacement (name);

  std::stringstream s;
  write_function_decl_and_comment (s, name, decl);
  s << "// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DEF: ";

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";

  nvptx_write_function_decl (s, name, decl);
  fprintf (file, "%s", s.str().c_str());

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));

  fprintf (file, "\n{\n");

  /* Ensure all arguments that should live in a register have one
     declared.  We'll emit the copies below.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       false, return_in_mem);
  if (return_in_mem)
    fprintf (file, "\t.reg.u%d %%ar1;\n", GET_MODE_BITSIZE (Pmode));

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
	((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
	   reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  int count = maybe_split_mode (&mode);
	  if (count > 1)
	    {
	      while (count-- > 0)
		fprintf (file, "\t.reg%s %%r%d$%d;\n",
			 nvptx_ptx_type_from_mode (mode, true),
			 i, count);
	    }
	  else
	    fprintf (file, "\t.reg%s %%r%d;\n",
		     nvptx_ptx_type_from_mode (mode, true),
		     i);
	}
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\t.reg.u%d %%outargs;\n"
	     "\t.local.align 8 .b8 %%outargs_ar[" HOST_WIDE_INT_PRINT_DEC"];\n",
	     BITS_PER_WORD, sz);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\t.reg.u%d %%punbuffer;\n"
	     "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
	     BITS_PER_WORD, cfun->machine->punning_buffer_size);

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
	       BITS_PER_WORD);
    }

  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
	     BITS_PER_WORD);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
	     BITS_PER_WORD);

  /* Now emit any copies necessary for arguments.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       true, return_in_mem);
  if (return_in_mem)
    fprintf (file, "\tld.param.u%d %%ar1, [%%in_ar1];\n",
	     GET_MODE_BITSIZE (Pmode));
  if (stdarg_p (fntype))
    fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
	     GET_MODE_BITSIZE (Pmode));

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_func_decl_from_insn (std::stringstream &s, rtx result, rtx pat,
			   rtx callee)
{
  bool callprototype = register_operand (callee, Pmode);
  const char *name = "_";
  if (!callprototype)
    {
      name = XSTR (callee, 0);
      name = nvptx_name_replacement (name);
      s << "\n// BEGIN GLOBAL FUNCTION DECL: " << name << "\n";
    }
  s << (callprototype ? "\t.callprototype\t" : "\t.extern .func ");

  if (result != NULL_RTX)
    {
      s << "(.param";
      s << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				     false);
      s << " ";
      if (callprototype)
	s << "_";
      else
	s << "%out_retval";
      s << ")";
    }

  s << name;

  int arg_end = XVECLEN (pat, 0);

  if (1 < arg_end)
    {
      const char *comma = "";
      s << " (";
      for (int i = 1; i < arg_end; i++)
	{
	  rtx t = XEXP (XVECEXP (pat, 0, i), 0);
	  machine_mode mode = GET_MODE (t);
	  int count = maybe_split_mode (&mode);

	  while (count--)
	    {
	      s << comma << ".param";
	      s << nvptx_ptx_type_from_mode (mode, false);
	      s << " ";
	      if (callprototype)
		s << "_";
	      else
		s << "%arg" << i - 1;
	      if (mode == QImode || mode == HImode)
		s << "[1]";
	      comma = ", ";
	    }
	}
      s << ")";
    }
  s << ";\n";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}
738f2522
BS
843/* Implement the TARGET_CALL_ARGS hook. Record information about one
844 argument to the next call. */
845
846static void
847nvptx_call_args (rtx arg, tree funtype)
848{
849 if (cfun->machine->start_call == NULL_RTX)
850 {
851 cfun->machine->call_args = NULL;
852 cfun->machine->funtype = funtype;
853 cfun->machine->start_call = const0_rtx;
854 }
855 if (arg == pc_rtx)
856 return;
857
858 rtx_expr_list *args_so_far = cfun->machine->call_args;
859 if (REG_P (arg))
860 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
861}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  int nargs = 0;
  rtx callee = XEXP (address, 0);
  rtx pat, t;
  rtvec vec;
  bool external_decl = false;
  rtx varargs = NULL_RTX;
  tree decl_type = NULL_TREE;
  unsigned parallel = 0;

  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
    nargs++;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  decl_type = TREE_TYPE (decl);
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_call_with_sc = true;
	  if (DECL_EXTERNAL (decl))
	    external_decl = true;
	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  if (cfun->machine->funtype
      /* It's possible to construct testcases where we call a variable.
	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
	 in such a case.  */
      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
      && stdarg_p (cfun->machine->funtype))
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
      cfun->machine->has_call_with_varargs = true;
    }
  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
  pat = gen_rtx_PARALLEL (VOIDmode, vec);

  int vec_pos = 0;

  rtx tmp_retval = retval;
  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  if (retval != NULL_RTX)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      t = gen_rtx_SET (tmp_retval, t);
    }
  XVECEXP (pat, 0, vec_pos++) = t;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    {
      rtx this_arg = XEXP (arg, 0);
      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
    }

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  /* If this is a libcall, decl_type is NULL.  For a call to a non-libcall
     undeclared function, we'll have an external decl without arg types.
     In either case we have to try to construct a ptx declaration from one of
     the calls to the function.  */
  if (!REG_P (callee)
      && (decl_type == NULL_TREE
	  || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
    {
      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
      if (*slot == NULL)
	{
	  *slot = callee;
	  write_func_decl_from_insn (func_decls, retval, pat, callee);
	}
    }

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode)
    return NULL_RTX;

  if (named)
    return gen_reg_rtx (mode);
  return NULL_RTX;
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (1 + cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
			    const_tree type ATTRIBUTE_UNUSED,
			    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == TImode)
    cum->count += 2;
  else
    cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);

  if (boundary > BITS_PER_WORD)
    return 2 * BITS_PER_WORD;

  if (mode == BLKmode)
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if (size > 4)
	return 2 * BITS_PER_WORD;
      if (boundary < BITS_PER_WORD)
	{
	  if (size >= 3)
	    return BITS_PER_WORD;
	  if (size >= 2)
	    return 2 * BITS_PER_UNIT;
	}
    }
  return boundary;
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
		      bool outgoing)
{
  int unsignedp = TYPE_UNSIGNED (type);
  machine_mode orig_mode = TYPE_MODE (type);
  machine_mode mode = promote_function_mode (type, orig_mode,
					     &unsignedp, NULL_TREE, 1);
  if (outgoing)
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
			 const_tree type, bool)
{
  return !PASS_IN_REG_P (mode, type);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  machine_mode mode = TYPE_MODE (type);
  if (!RETURN_IN_REG_P (mode))
    return true;
  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *punsignedp,
			     const_tree funtype, int for_return)
{
  if (type == NULL_TREE)
    return mode;
  if (for_return)
    return promote_mode (type, mode, punsignedp);
  /* For K&R-style functions, try to match the language promotion rules to
     minimize type mismatches at assembly time.  */
  if (TYPE_ARG_TYPES (funtype) == NULL_TREE
      && type != NULL_TREE
      && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  return mode;
}

/* Implement TARGET_STATIC_CHAIN.  */

static rtx
nvptx_static_chain (const_tree fndecl, bool incoming_p)
{
  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (incoming_p)
    return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
  else
    return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle register SRC into
   DST across the lanes of a single warp, using shuffle kind KIND and
   lane operand IDX.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
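
/* Note how the 64-bit cases above lean on the 32-bit ones: a DImode or
   DFmode value is unpacked into two SImode halves, each half is
   shuffled independently, and the results are packed back together.
   BImode predicates take a similar detour through SImode.  */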

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset;  /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

/* When loading an operand ORIG_OP, verify whether an address space
   conversion to generic is required, and if so, perform it.  Also
   check for SYMBOL_REFs for function decls and call
   nvptx_record_needed_fndecl as needed.
   Return either the original operand, or the converted one.  */

rtx
nvptx_maybe_convert_symbolic_operand (rtx orig_op)
{
  if (GET_MODE (orig_op) != Pmode)
    return orig_op;

  rtx op = orig_op;
  while (GET_CODE (op) == PLUS || GET_CODE (op) == CONST)
    op = XEXP (op, 0);
  if (GET_CODE (op) != SYMBOL_REF)
    return orig_op;

  tree decl = SYMBOL_REF_DECL (op);
  if (decl && TREE_CODE (decl) == FUNCTION_DECL)
    {
      nvptx_record_needed_fndecl (decl);
      return orig_op;
    }

  addr_space_t as = nvptx_addr_space_from_address (op);
  if (as == ADDR_SPACE_GENERIC)
    return orig_op;

  enum unspec code;
  code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
	  : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
	  : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
	  : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
	  : UNSPEC_FROM_PARAM);
  rtx dest = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, orig_op),
						code)));
  return dest;
}

/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Implement HARD_REGNO_MODE_OK.  We barely use hard regs, but we want
   to ensure that the return register's mode isn't changed.  */

bool
nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
{
  if (regno != NVPTX_RETURN_REGNUM
      || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
    return true;
  return mode == cfun->machine->ret_reg_mode;
}

/* Convert an address space AS to the corresponding ptx string.  */

const char *
nvptx_section_from_addr_space (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_CONST:
      return ".const";

    case ADDR_SPACE_GLOBAL:
      return ".global";

    case ADDR_SPACE_SHARED:
      return ".shared";

    case ADDR_SPACE_GENERIC:
      return "";

    default:
      gcc_unreachable ();
    }
}

/* Determine whether DECL goes into .const or .global.  */

const char *
nvptx_section_for_decl (const_tree decl)
{
  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ".const";

  return ".global";
}

/* Look for a SYMBOL_REF in ADDR and return the address space to be used
   for the insn referencing this address.  */

addr_space_t
nvptx_addr_space_from_address (rtx addr)
{
  while (GET_CODE (addr) == PLUS || GET_CODE (addr) == CONST)
    addr = XEXP (addr, 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return ADDR_SPACE_GENERIC;

  tree decl = SYMBOL_REF_DECL (addr);
  if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
    return ADDR_SPACE_GENERIC;

  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ADDR_SPACE_CONST;

  return ADDR_SPACE_GLOBAL;
}

/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */
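
/* As a sketch (assuming a 4-byte chunk size was chosen), the
   initializer for "int x[2] = { 1, 2 };" would be buffered and then
   flushed chunk by chunk, producing ptx along the lines of:

	.visible .global .align 4 .u32 x[2] = { 1, 2 };  */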

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;

/* Write the necessary separator string to begin a new initializer value.  */

static void
begin_decl_field (void)
{
  /* We never see decl_offset at zero by the time we get here.  */
  if (decl_offset == decl_chunk_size)
    fprintf (asm_out_file, " = { ");
  else
    fprintf (asm_out_file, ", ");
}

/* Output the currently stored chunk as an initializer value.  */

static void
output_decl_chunk (void)
{
  begin_decl_field ();
  output_address (VOIDmode, gen_int_mode (init_part, decl_chunk_mode));
  init_part = 0;
}

/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
	this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      if (decl_offset % decl_chunk_size == 0)
	output_decl_chunk ();

      chunk_offset = 0;
    }
}
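
/* A worked example, assuming a 4-byte chunk size and a chunk-aligned
   decl_offset: assembling the 8-byte value 0x1122334455667788 first
   deposits the low four bytes (0x55667788) into init_part and flushes
   that chunk, then deposits and flushes the high four bytes
   (0x11223344) as the following chunk, preserving little-endian
   byte order.  */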

/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  if (GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == CONST)
    {
      gcc_assert (size == decl_chunk_size);
      if (decl_offset % decl_chunk_size != 0)
	sorry ("cannot emit unaligned pointers in ptx assembly");
      decl_offset += size;
      begin_decl_field ();

      HOST_WIDE_INT off = 0;
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	{
	  off = INTVAL (XEXP (x, 1));
	  x = XEXP (x, 0);
	}
      if (GET_CODE (x) == SYMBOL_REF)
	{
	  nvptx_record_needed_fndecl (SYMBOL_REF_DECL (x));
	  fprintf (asm_out_file, "generic(");
	  output_address (VOIDmode, x);
	  fprintf (asm_out_file, ")");
	}
      if (off != 0)
	fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, off);
      return true;
    }

  HOST_WIDE_INT val;
  switch (GET_CODE (x))
    {
    case CONST_INT:
      val = INTVAL (x);
      break;
    case CONST_DOUBLE:
      gcc_unreachable ();
      break;
    default:
      gcc_unreachable ();
    }

  nvptx_assemble_value (val, size);
  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  if (decl_offset + size >= object_size)
    {
      if (decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);
      object_finished = true;
      return;
    }

  while (size > decl_chunk_size)
    {
      nvptx_assemble_value (0, decl_chunk_size);
      size -= decl_chunk_size;
    }
  while (size-- > 0)
    nvptx_assemble_value (0, 1);
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (decl_offset != 0)
    {
      if (!object_finished && decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);

      fprintf (asm_out_file, " }");
    }
  fprintf (asm_out_file, ";\n");
}

/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
			 bool is_public)
{
  fprintf (file, "\n// BEGIN%s VAR DEF: ", is_public ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputc ('\n', file);

  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT size)
{
  tree type = TREE_TYPE (exp);
  init_output_initializer (file, name, type, false);
  fprintf (file, "\t.const .align %d .u%d ",
	   TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
	   decl_chunk_size * BITS_PER_UNIT);
  assemble_name (file, name);
  fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
	   (size + decl_chunk_size - 1) / decl_chunk_size);
  object_size = size;
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
	       !TREE_PUBLIC (decl) ? ""
	       : DECL_WEAK (decl) ? ".weak" : ".visible",
	       section, DECL_ALIGN (decl) / BITS_PER_UNIT,
	       decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
	fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
		 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
	object_finished = true;
      object_size = size;
    }
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */
static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;
  const char *section = nvptx_section_for_decl (decl);
  fprintf (file, "\n// BEGIN%s VAR DECL: ",
	   TREE_PUBLIC (decl) ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputs ("\n", file);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}
1823
1824/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
1825 involves writing .param declarations and in/out copies into them. For
1826 indirect calls, also write the .callprototype. */
738f2522
BS
1827
1828const char *
1829nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1830{
1831 char buf[256];
1832 static int labelno;
1833 bool needs_tgt = register_operand (callee, Pmode);
1834 rtx pat = PATTERN (insn);
f324806d 1835 int arg_end = XVECLEN (pat, 0);
738f2522
BS
1836 tree decl = NULL_TREE;
1837
1838 fprintf (asm_out_file, "\t{\n");
1839 if (result != NULL)
f324806d
NS
1840 fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
1841 nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
1842 false));
738f2522 1843
ecf6e535 1844 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
1845 if (GET_CODE (callee) == SYMBOL_REF)
1846 {
1847 decl = SYMBOL_REF_DECL (callee);
1848 if (decl && DECL_EXTERNAL (decl))
1849 nvptx_record_fndecl (decl);
1850 }
1851
1852 if (needs_tgt)
1853 {
1854 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1855 labelno++;
1856 ASM_OUTPUT_LABEL (asm_out_file, buf);
1857 std::stringstream s;
1858 write_func_decl_from_insn (s, result, pat, callee);
1859 fputs (s.str().c_str(), asm_out_file);
1860 }
1861
f324806d 1862 for (int i = 1, argno = 0; i < arg_end; i++)
738f2522 1863 {
f324806d 1864 rtx t = XEXP (XVECEXP (pat, 0, i), 0);
738f2522
BS
1865 machine_mode mode = GET_MODE (t);
1866 int count = maybe_split_mode (&mode);
1867
f324806d 1868 while (count--)
738f2522
BS
1869 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
1870 nvptx_ptx_type_from_mode (mode, false), argno++,
1871 mode == QImode || mode == HImode ? "[1]" : "");
1872 }
f324806d 1873 for (int i = 1, argno = 0; i < arg_end; i++)
738f2522 1874 {
f324806d 1875 rtx t = XEXP (XVECEXP (pat, 0, i), 0);
738f2522
BS
1876 gcc_assert (REG_P (t));
1877 machine_mode mode = GET_MODE (t);
1878 int count = maybe_split_mode (&mode);
1879
1880 if (count == 1)
1881 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
1882 nvptx_ptx_type_from_mode (mode, false), argno++,
1883 REGNO (t));
1884 else
1885 {
1886 int n = 0;
f324806d 1887 while (count--)
738f2522
BS
1888 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d$%d;\n",
1889 nvptx_ptx_type_from_mode (mode, false), argno++,
1890 REGNO (t), n++);
1891 }
1892 }
1893
1894 fprintf (asm_out_file, "\t\tcall ");
1895 if (result != NULL_RTX)
1896 fprintf (asm_out_file, "(%%retval_in), ");
1897
1898 if (decl)
1899 {
1900 const char *name = get_fnname_from_decl (decl);
1901 name = nvptx_name_replacement (name);
1902 assemble_name (asm_out_file, name);
1903 }
1904 else
1905 output_address (VOIDmode, callee);
1906
1907 if (arg_end > 1 || (decl && DECL_STATIC_CHAIN (decl)))
1908 {
1909 const char *comma = "";
1910
1911 fprintf (asm_out_file, ", (");
1912 for (int i = 1, argno = 0; i < arg_end; i++)
1913 {
1914 rtx t = XEXP (XVECEXP (pat, 0, i), 0);
1915 machine_mode mode = GET_MODE (t);
1916 int count = maybe_split_mode (&mode);
1917
1918 while (count--)
1919 {
1920 fprintf (asm_out_file, "%s%%out_arg%d", comma, argno++);
1921 comma = ", ";
1922 }
1923 }
1924 if (decl && DECL_STATIC_CHAIN (decl))
1925 fprintf (asm_out_file, "%s%s", comma,
1926 reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
1927
1928 fprintf (asm_out_file, ")");
1929 }
1930
1931 if (needs_tgt)
1932 {
1933 fprintf (asm_out_file, ", ");
1934 assemble_name (asm_out_file, buf);
1935 }
1936 fprintf (asm_out_file, ";\n");
1937 if (result != NULL_RTX)
1938 return "\tld.param%t0\t%0, [%%retval_in];\n\t}";
1939
1940 return "}";
1941}
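/* As an illustrative sketch (register numbers invented), a direct call
   `res = f (a, b)` on SImode values comes out roughly as:

	{
		.param.u32 %retval_in;
		.param.u32 %out_arg0;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg0], %r30;
		st.param.u32 [%out_arg1], %r31;
		call (%retval_in), f, (%out_arg0, %out_arg1);
		ld.param.u32	%r32, [%retval_in];
	}
*/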
1942
1943/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1944
1945static bool
1946nvptx_print_operand_punct_valid_p (unsigned char c)
1947{
1948 return c == '.' || c == '#';
1949}
1950
1951static void nvptx_print_operand (FILE *, rtx, int);
1952
1953/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1954
1955static void
1956nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1957{
1958 rtx off;
1959 if (GET_CODE (x) == CONST)
1960 x = XEXP (x, 0);
1961 switch (GET_CODE (x))
1962 {
1963 case PLUS:
1964 off = XEXP (x, 1);
1965 output_address (VOIDmode, XEXP (x, 0));
1966 fprintf (file, "+");
1967 output_address (VOIDmode, off);
1968 break;
1969
1970 case SYMBOL_REF:
1971 case LABEL_REF:
1972 output_addr_const (file, x);
1973 break;
1974
1975 default:
1976 gcc_assert (GET_CODE (x) != MEM);
1977 nvptx_print_operand (file, x, 0);
1978 break;
1979 }
1980}
1981
1982/* Write assembly language output for the address ADDR to FILE. */
1983
1984static void
1985 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1986 {
1987 nvptx_print_address_operand (file, addr, mode);
1988}
1989
1990/* Print an operand, X, to FILE, with an optional modifier in CODE.
1991
1992 Meaning of CODE:
1993 . -- print the predicate for the instruction or an empty string for an
1994 unconditional one.
1995 # -- print a rounding mode for the instruction
1996
1997 A -- print an address space identifier for a MEM
1998 c -- print an opcode suffix for a comparison operator, including a type code
1999 f -- print a full reg even for something that must always be split
2000 S -- print a shuffle kind specified by CONST_INT
2001 t -- print a type opcode suffix, promoting QImode to 32 bits
2002 T -- print a type size in bits
2003 u -- print a type opcode suffix without promotions. */
2004
2005static void
2006nvptx_print_operand (FILE *file, rtx x, int code)
2007{
2008 rtx orig_x = x;
2009 machine_mode op_mode;
2010
2011 if (code == '.')
2012 {
2013 x = current_insn_predicate;
2014 if (x)
2015 {
2016 unsigned int regno = REGNO (XEXP (x, 0));
2017 fputs ("[", file);
2018 if (GET_CODE (x) == EQ)
2019 fputs ("!", file);
2020 fputs (reg_names [regno], file);
2021 fputs ("]", file);
2022 }
2023 return;
2024 }
2025 else if (code == '#')
2026 {
2027 fputs (".rn", file);
2028 return;
2029 }
2030
2031 enum rtx_code x_code = GET_CODE (x);
2032
2033 switch (code)
2034 {
2035 case 'A':
2036 {
2037 addr_space_t as = nvptx_addr_space_from_address (XEXP (x, 0));
2038 fputs (nvptx_section_from_addr_space (as), file);
2039 }
2040 break;
2041
2042 case 't':
2043 op_mode = nvptx_underlying_object_mode (x);
2044 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
2045 break;
2046
2047 case 'u':
2048 op_mode = nvptx_underlying_object_mode (x);
2049 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
2050 break;
2051
2052 case 'S':
2053 {
2054 unsigned kind = UINTVAL (x);
2055 static const char *const kinds[] =
2056 {"up", "down", "bfly", "idx"};
2057 fprintf (file, ".%s", kinds[kind]);
2058 }
2059 break;
2060
2061 case 'T':
2062 fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
2063 break;
2064
2065 case 'j':
2066 fprintf (file, "@");
2067 goto common;
2068
2069 case 'J':
2070 fprintf (file, "@!");
2071 goto common;
2072
2073 case 'c':
2074 op_mode = GET_MODE (XEXP (x, 0));
2075 switch (x_code)
2076 {
2077 case EQ:
2078 fputs (".eq", file);
2079 break;
2080 case NE:
2081 if (FLOAT_MODE_P (op_mode))
2082 fputs (".neu", file);
2083 else
2084 fputs (".ne", file);
2085 break;
2086 case LE:
2087 fputs (".le", file);
2088 break;
2089 case GE:
2090 fputs (".ge", file);
2091 break;
2092 case LT:
2093 fputs (".lt", file);
2094 break;
2095 case GT:
2096 fputs (".gt", file);
2097 break;
2098 case LEU:
2099 fputs (".ls", file);
2100 break;
2101 case GEU:
2102 fputs (".hs", file);
2103 break;
2104 case LTU:
2105 fputs (".lo", file);
2106 break;
2107 case GTU:
2108 fputs (".hi", file);
2109 break;
2110 case LTGT:
2111 fputs (".ne", file);
2112 break;
2113 case UNEQ:
2114 fputs (".equ", file);
2115 break;
2116 case UNLE:
2117 fputs (".leu", file);
2118 break;
2119 case UNGE:
2120 fputs (".geu", file);
2121 break;
2122 case UNLT:
2123 fputs (".ltu", file);
2124 break;
2125 case UNGT:
2126 fputs (".gtu", file);
2127 break;
2128 case UNORDERED:
2129 fputs (".nan", file);
2130 break;
2131 case ORDERED:
2132 fputs (".num", file);
2133 break;
2134 default:
2135 gcc_unreachable ();
2136 }
2137 if (FLOAT_MODE_P (op_mode)
2138 || x_code == EQ || x_code == NE
2139 || x_code == GEU || x_code == GTU
2140 || x_code == LEU || x_code == LTU)
2141 fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
2142 else
2143 fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
2144 break;
2145 default:
2146 common:
2147 switch (x_code)
2148 {
2149 case SUBREG:
2150 x = SUBREG_REG (x);
2151 /* fall through */
2152
2153 case REG:
2154 if (HARD_REGISTER_P (x))
2155 fprintf (file, "%s", reg_names[REGNO (x)]);
2156 else
2157 fprintf (file, "%%r%d", REGNO (x));
2158 if (code != 'f' && nvptx_split_reg_p (GET_MODE (x)))
2159 {
2160 gcc_assert (GET_CODE (orig_x) == SUBREG
2161 && !nvptx_split_reg_p (GET_MODE (orig_x)));
2162 fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
2163 }
2164 break;
2165
2166 case MEM:
2167 fputc ('[', file);
2168 nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
2169 fputc (']', file);
2170 break;
2171
2172 case CONST_INT:
2173 output_addr_const (file, x);
2174 break;
2175
2176 case CONST:
2177 case SYMBOL_REF:
2178 case LABEL_REF:
2179 /* We could use output_addr_const, but that can print things like
2180 "x-8", which breaks ptxas. Need to ensure it is output as
2181 "x+-8". */
2182 nvptx_print_address_operand (file, x, VOIDmode);
2183 break;
2184
2185 case CONST_DOUBLE:
2186 long vals[2];
2187 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
2188 vals[0] &= 0xffffffff;
2189 vals[1] &= 0xffffffff;
2190 if (GET_MODE (x) == SFmode)
2191 fprintf (file, "0f%08lx", vals[0]);
2192 else
2193 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2194 break;
2195
2196 default:
2197 output_addr_const (file, x);
2198 }
2199 }
2200}
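/* Some example expansions (sketches, assuming SImode operands): 'c' on
   (eq ...) prints ".eq.u32", 't' on an SImode register prints ".u32",
   and 'T' prints "32". */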
2201\f
2202/* Record replacement regs used to deal with subreg operands. */
2203struct reg_replace
2204{
2205 rtx replacement[MAX_RECOG_OPERANDS];
2206 machine_mode mode;
2207 int n_allocated;
2208 int n_in_use;
2209};
2210
2211/* Allocate or reuse a replacement in R and return the rtx. */
2212
2213static rtx
2214get_replacement (struct reg_replace *r)
2215{
2216 if (r->n_allocated == r->n_in_use)
2217 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2218 return r->replacement[r->n_in_use++];
2219}
2220
2221/* Clean up subreg operands. In ptx assembly, everything is typed, and
2222 the presence of subregs would break the rules for most instructions.
2223 Replace them with a suitable new register of the right size, plus
2224 conversion copyin/copyout instructions. */
2225
2226static void
2227 nvptx_reorg_subreg (void)
2228{
2229 struct reg_replace qiregs, hiregs, siregs, diregs;
2230 rtx_insn *insn, *next;
2231
2232 qiregs.n_allocated = 0;
2233 hiregs.n_allocated = 0;
2234 siregs.n_allocated = 0;
2235 diregs.n_allocated = 0;
2236 qiregs.mode = QImode;
2237 hiregs.mode = HImode;
2238 siregs.mode = SImode;
2239 diregs.mode = DImode;
2240
2241 for (insn = get_insns (); insn; insn = next)
2242 {
2243 next = NEXT_INSN (insn);
2244 if (!NONDEBUG_INSN_P (insn)
2245 || asm_noperands (PATTERN (insn)) >= 0
2246 || GET_CODE (PATTERN (insn)) == USE
2247 || GET_CODE (PATTERN (insn)) == CLOBBER)
2248 continue;
2249
2250 qiregs.n_in_use = 0;
2251 hiregs.n_in_use = 0;
2252 siregs.n_in_use = 0;
2253 diregs.n_in_use = 0;
2254 extract_insn (insn);
2255 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2256
2257 for (int i = 0; i < recog_data.n_operands; i++)
2258 {
2259 rtx op = recog_data.operand[i];
2260 if (GET_CODE (op) != SUBREG)
2261 continue;
2262
2263 rtx inner = SUBREG_REG (op);
2264
2265 machine_mode outer_mode = GET_MODE (op);
2266 machine_mode inner_mode = GET_MODE (inner);
2267 gcc_assert (s_ok);
2268 if (s_ok
2269 && (GET_MODE_PRECISION (inner_mode)
2270 >= GET_MODE_PRECISION (outer_mode)))
2271 continue;
2272 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2273 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2274 : outer_mode == HImode ? &hiregs
2275 : outer_mode == SImode ? &siregs
2276 : &diregs);
2277 rtx new_reg = get_replacement (r);
2278
2279 if (recog_data.operand_type[i] != OP_OUT)
2280 {
2281 enum rtx_code code;
2282 if (GET_MODE_PRECISION (inner_mode)
2283 < GET_MODE_PRECISION (outer_mode))
2284 code = ZERO_EXTEND;
2285 else
2286 code = TRUNCATE;
2287
2288 rtx pat = gen_rtx_SET (new_reg,
2289 gen_rtx_fmt_e (code, outer_mode, inner));
2290 emit_insn_before (pat, insn);
2291 }
2292
2293 if (recog_data.operand_type[i] != OP_IN)
2294 {
2295 enum rtx_code code;
2296 if (GET_MODE_PRECISION (inner_mode)
2297 < GET_MODE_PRECISION (outer_mode))
2298 code = TRUNCATE;
2299 else
2300 code = ZERO_EXTEND;
2301
2302 rtx pat = gen_rtx_SET (inner,
2303 gen_rtx_fmt_e (code, inner_mode, new_reg));
2304 emit_insn_after (pat, insn);
2305 }
2306 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2307 }
2308 }
2309}
2310
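/* A sketch of the replacement above (pseudo-RTL, register numbers
   invented): an input operand (subreg:SI (reg:DI 40) 0) is rewritten to
   a fresh (reg:SI 50), with (set (reg:SI 50) (truncate:SI (reg:DI 40)))
   emitted before the insn; an output operand additionally gets the
   widening copy back into the inner register emitted after the insn. */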
2311/* Loop structure of the function. The entire function is described as
2312 a NULL loop. */
2313
2314struct parallel
2315{
2316 /* Parent parallel. */
2317 parallel *parent;
2318
2319 /* Next sibling parallel. */
2320 parallel *next;
2321
2322 /* First child parallel. */
2323 parallel *inner;
2324
2325 /* Partitioning mask of the parallel. */
2326 unsigned mask;
2327
2328 /* Partitioning used within inner parallels. */
2329 unsigned inner_mask;
2330
2331 /* Location of parallel forked and join. The forked is the first
2332 block in the parallel and the join is the first block after
2333 the partition. */
2334 basic_block forked_block;
2335 basic_block join_block;
2336
2337 rtx_insn *forked_insn;
2338 rtx_insn *join_insn;
2339
2340 rtx_insn *fork_insn;
2341 rtx_insn *joining_insn;
2342
2343 /* Basic blocks in this parallel, but not in child parallels. The
2344 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2345 blocks are not. */
2346 auto_vec<basic_block> blocks;
2347
2348public:
2349 parallel (parallel *parent, unsigned mask);
2350 ~parallel ();
2351};
2352
2353/* Constructor links the new parallel into its parent's chain of
2354 children. */
2355
2356parallel::parallel (parallel *parent_, unsigned mask_)
2357 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2358{
2359 forked_block = join_block = 0;
2360 forked_insn = join_insn = 0;
2361 fork_insn = joining_insn = 0;
2362
2363 if (parent)
2364 {
2365 next = parent->inner;
2366 parent->inner = this;
2367 }
2368}
2369
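/* Note that deleting a parallel recursively deletes its inner parallels
   and its next siblings, so deleting the outermost parallel frees the
   whole structure. */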
2370parallel::~parallel ()
2371{
2372 delete inner;
2373 delete next;
2374}
2375
2376/* Map of basic blocks to insns */
2377typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2378
2379/* A tuple of an insn of interest and the BB in which it resides. */
2380typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2381typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2382
2383/* Split basic blocks such that the forked and join unspecs are at
2384 the start of their basic blocks. Thus afterwards each block will
2385 have a single partitioning mode. We also do the same for return
2386 insns, as they are executed by every thread. Populate MAP with
2388 head and tail blocks. We also clear the BB visited flag, which is
2389 used when finding partitions. */
2390
2391static void
2392nvptx_split_blocks (bb_insn_map_t *map)
2393{
2394 insn_bb_vec_t worklist;
2395 basic_block block;
2396 rtx_insn *insn;
2397
2398 /* Locate all the reorg instructions of interest. */
2399 FOR_ALL_BB_FN (block, cfun)
2400 {
2401 bool seen_insn = false;
2402
2403 /* Clear visited flag, for use by parallel locator */
2404 block->flags &= ~BB_VISITED;
2405
2406 FOR_BB_INSNS (block, insn)
2407 {
2408 if (!INSN_P (insn))
2409 continue;
2410 switch (recog_memoized (insn))
2411 {
2412 default:
2413 seen_insn = true;
2414 continue;
2415 case CODE_FOR_nvptx_forked:
2416 case CODE_FOR_nvptx_join:
2417 break;
2418
2419 case CODE_FOR_return:
2420 /* We also need to split just before return insns, as
2421 that insn needs executing by all threads, but the
2422 block it is in probably does not. */
2423 break;
2424 }
2425
2426 if (seen_insn)
2427 /* We've found an instruction that must be at the start of
2428 a block, but isn't. Add it to the worklist. */
2429 worklist.safe_push (insn_bb_t (insn, block));
2430 else
2431 /* It was already the first instruction. Just add it to
2432 the map. */
2433 map->get_or_insert (block) = insn;
2434 seen_insn = true;
2435 }
2436 }
2437
2438 /* Split blocks on the worklist. */
2439 unsigned ix;
2440 insn_bb_t *elt;
2441 basic_block remap = 0;
2442 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2443 {
2444 if (remap != elt->second)
2445 {
2446 block = elt->second;
2447 remap = block;
2448 }
2449
2450 /* Split block before insn. The insn is in the new block */
2451 edge e = split_block (block, PREV_INSN (elt->first));
2452
2453 block = e->dest;
2454 map->get_or_insert (block) = elt->first;
2455 }
2456}
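/* For example, a block containing "i1; forked; i2" is split so that
   "forked" starts a new block, and MAP records that new block's first
   interesting insn. */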
2457
2458/* BLOCK is a basic block containing a head or tail instruction.
2459 Locate the associated prehead or pretail instruction, which must be
2460 in the single predecessor block. */
2461
2462static rtx_insn *
2463nvptx_discover_pre (basic_block block, int expected)
2464{
2465 gcc_assert (block->preds->length () == 1);
2466 basic_block pre_block = (*block->preds)[0]->src;
2467 rtx_insn *pre_insn;
2468
2469 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2470 pre_insn = PREV_INSN (pre_insn))
2471 gcc_assert (pre_insn != BB_HEAD (pre_block));
2472
2473 gcc_assert (recog_memoized (pre_insn) == expected);
2474 return pre_insn;
2475}
2476
2477/* Dump this parallel and all its inner parallels. */
2478
2479static void
2480nvptx_dump_pars (parallel *par, unsigned depth)
2481{
2482 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2483 depth, par->mask,
2484 par->forked_block ? par->forked_block->index : -1,
2485 par->join_block ? par->join_block->index : -1);
2486
2487 fprintf (dump_file, " blocks:");
2488
2489 basic_block block;
2490 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2491 fprintf (dump_file, " %d", block->index);
2492 fprintf (dump_file, "\n");
2493 if (par->inner)
2494 nvptx_dump_pars (par->inner, depth + 1);
2495
2496 if (par->next)
2497 nvptx_dump_pars (par->next, depth);
2498}
2499
2500/* If BLOCK contains a fork/join marker, process it to create or
2501 terminate a loop structure. Add this block to the current loop,
2502 and then walk successor blocks. */
2503
2504static parallel *
2505nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2506{
2507 if (block->flags & BB_VISITED)
2508 return par;
2509 block->flags |= BB_VISITED;
2510
2511 if (rtx_insn **endp = map->get (block))
2512 {
2513 rtx_insn *end = *endp;
2514
2515 /* This is a block head or tail, or return instruction. */
2516 switch (recog_memoized (end))
2517 {
2518 case CODE_FOR_return:
2519 /* Return instructions are in their own block, and we
2520 don't need to do anything more. */
2521 return par;
2522
2523 case CODE_FOR_nvptx_forked:
2524 /* Loop head, create a new inner loop and add it into
2525 our parent's child list. */
2526 {
2527 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2528
2529 gcc_assert (mask);
2530 par = new parallel (par, mask);
2531 par->forked_block = block;
2532 par->forked_insn = end;
2533 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2534 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2535 par->fork_insn
2536 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2537 }
2538 break;
2539
2540 case CODE_FOR_nvptx_join:
2541 /* A loop tail. Finish the current loop and return to
2542 parent. */
2543 {
2544 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2545
2546 gcc_assert (par->mask == mask);
2547 par->join_block = block;
2548 par->join_insn = end;
2549 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2550 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2551 par->joining_insn
2552 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2553 par = par->parent;
2554 }
2555 break;
2556
2557 default:
2558 gcc_unreachable ();
2559 }
2560 }
2561
2562 if (par)
2563 /* Add this block onto the current loop's list of blocks. */
2564 par->blocks.safe_push (block);
2565 else
2566 /* This must be the entry block. Create a NULL parallel. */
2567 par = new parallel (0, 0);
2568
2569 /* Walk successor blocks. */
2570 edge e;
2571 edge_iterator ei;
2572
2573 FOR_EACH_EDGE (e, ei, block->succs)
2574 nvptx_find_par (map, par, e->dest);
2575
2576 return par;
2577}
2578
2579/* DFS walk the CFG looking for fork & join markers. Construct
2580 loop structures as we go. MAP is a mapping of basic blocks
2581 to head & tail markers, discovered when splitting blocks. This
2582 speeds up the discovery. We rely on the BB visited flag having
2583 been cleared when splitting blocks. */
2584
2585static parallel *
2586nvptx_discover_pars (bb_insn_map_t *map)
2587{
2588 basic_block block;
2589
2590 /* Mark exit blocks as visited. */
2591 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2592 block->flags |= BB_VISITED;
2593
2594 /* And entry block as not. */
2595 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2596 block->flags &= ~BB_VISITED;
2597
2598 parallel *par = nvptx_find_par (map, 0, block);
2599
2600 if (dump_file)
2601 {
2602 fprintf (dump_file, "\nLoops\n");
2603 nvptx_dump_pars (par, 0);
2604 fprintf (dump_file, "\n");
2605 }
2606
2607 return par;
2608}
2609
2610/* Analyse a group of BBs within a partitioned region and create N
2611 Single-Entry-Single-Exit regions. Some of those regions will be
2612 trivial ones consisting of a single BB. The blocks of a
2613 partitioned region might form a set of disjoint graphs -- because
2614 the region encloses a differently partitioned sub-region.
2615
2616 We use the linear time algorithm described in 'Finding Regions Fast:
2617 Single Entry Single Exit and control Regions in Linear Time'
2618 Johnson, Pearson & Pingali. That algorithm deals with complete
2619 CFGs, where a back edge is inserted from END to START, and thus the
2620 problem becomes one of finding equivalent loops.
2621
2622 In this case we have a partial CFG. We complete it by redirecting
2623 any incoming edge to the graph to be from an arbitrary external BB,
2624 and similarly redirecting any outgoing edge to be to that BB.
2625 Thus we end up with a closed graph.
2626
2627 The algorithm works by building a spanning tree of an undirected
2628 graph and keeping track of back edges from nodes further from the
2629 root in the tree to nodes nearer to the root in the tree. In the
2630 description below, the root is up and the tree grows downwards.
2631
2632 We avoid having to deal with degenerate back-edges to the same
2633 block, by splitting each BB into 3 -- one for input edges, one for
2634 the node itself and one for the output edges. Such back edges are
2635 referred to as 'Brackets'. Cycle equivalent nodes will have the
2636 same set of brackets.
2637
2638 Determining bracket equivalency is done by maintaining a list of
2639 brackets in such a manner that the list length and final bracket
2640 uniquely identify the set.
2641
2642 We use coloring to mark all BBs with cycle equivalency with the
2643 same color. This is the output of the 'Finding Regions Fast'
2644 algorithm. Notice it doesn't actually find the set of nodes within
2645 a particular region, just unordered sets of nodes that are the
2646 entries and exits of SESE regions.
2647
2648 After determining cycle equivalency, we need to find the minimal
2649 set of SESE regions. Do this with a DFS coloring walk of the
2650 complete graph. We're either 'looking' or 'coloring'. When
2651 looking, and we're in the subgraph, we start coloring the color of
2652 the current node, and remember that node as the start of the
2653 current color's SESE region. Every time we go to a new node, we
2654 decrement the count of nodes with that color. If it reaches zero,
2655 we remember that node as the end of the current color's SESE region
2656 and return to 'looking'. Otherwise we color the node the current
2657 color.
2658
2659 This way we end up with coloring the inside of non-trivial SESE
2660 regions with the color of that region. */
2661
2662/* A pair of BBs. We use this to represent SESE regions. */
2663typedef std::pair<basic_block, basic_block> bb_pair_t;
2664typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2665
2666/* A node in the undirected CFG. The discriminator SECOND indicates just
2667 above or just below the BB indicated by FIRST. */
2668typedef std::pair<basic_block, int> pseudo_node_t;
2669
2670/* A bracket indicates an edge towards the root of the spanning tree of the
2671 undirected graph. Each bracket has a color, determined
2672 from the current set of brackets. */
2673struct bracket
2674{
2675 pseudo_node_t back; /* Back target */
2676
2677 /* Current color and size of set. */
2678 unsigned color;
2679 unsigned size;
2680
2681 bracket (pseudo_node_t back_)
2682 : back (back_), color (~0u), size (~0u)
2683 {
2684 }
2685
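  /* Get the color of the bracket set ending in this bracket: the set is
     uniquely identified by its length and final bracket, so a fresh
     color is allocated the first time this bracket is seen atop a list
     of LENGTH brackets. Also count the node being colored. */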
2686 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2687 {
2688 if (length != size)
2689 {
2690 size = length;
2691 color = color_counts.length ();
2692 color_counts.quick_push (0);
2693 }
2694 color_counts[color]++;
2695 return color;
2696 }
2697};
2698
2699typedef auto_vec<bracket> bracket_vec_t;
2700
2701/* Basic block info for finding SESE regions. */
2702
2703struct bb_sese
2704{
2705 int node; /* Node number in spanning tree. */
2706 int parent; /* Parent node number. */
2707
2708 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2709 edges arrive at pseudo-node Ai and the outgoing edges leave at
2710 pseudo-node Ao. We have to remember which way we arrived at a
2711 particular node when generating the spanning tree. dir > 0 means
2712 we arrived at Ai, dir < 0 means we arrived at Ao. */
2713 int dir;
2714
2715 /* Lowest numbered pseudo-node reached via a backedge from this
2716 node, or any descendant. */
2717 pseudo_node_t high;
2718
2719 int color; /* Cycle-equivalence color */
2720
2721 /* Stack of brackets for this node. */
2722 bracket_vec_t brackets;
2723
2724 bb_sese (unsigned node_, unsigned p, int dir_)
2725 :node (node_), parent (p), dir (dir_)
2726 {
2727 }
2728 ~bb_sese ();
2729
2730 /* Push a bracket ending at BACK. */
2731 void push (const pseudo_node_t &back)
2732 {
2733 if (dump_file)
2734 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2735 back.first ? back.first->index : 0, back.second);
2736 brackets.safe_push (bracket (back));
2737 }
2738
2739 void append (bb_sese *child);
2740 void remove (const pseudo_node_t &);
2741
2742 /* Set node's color. */
2743 void set_color (auto_vec<unsigned> &color_counts)
2744 {
2745 color = brackets.last ().get_color (color_counts, brackets.length ());
2746 }
2747};
2748
2749bb_sese::~bb_sese ()
2750{
2751}
2752
2753/* Destructively append CHILD's brackets. */
2754
2755void
2756bb_sese::append (bb_sese *child)
2757{
2758 if (int len = child->brackets.length ())
2759 {
2760 int ix;
2761
2762 if (dump_file)
2763 {
2764 for (ix = 0; ix < len; ix++)
2765 {
2766 const pseudo_node_t &pseudo = child->brackets[ix].back;
2767 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2768 child->node, pseudo.first ? pseudo.first->index : 0,
2769 pseudo.second);
2770 }
2771 }
2772 if (!brackets.length ())
2773 std::swap (brackets, child->brackets);
2774 else
2775 {
2776 brackets.reserve (len);
2777 for (ix = 0; ix < len; ix++)
2778 brackets.quick_push (child->brackets[ix]);
2779 }
2780 }
2781}
2782
2783/* Remove brackets that terminate at PSEUDO. */
2784
2785void
2786bb_sese::remove (const pseudo_node_t &pseudo)
2787{
2788 unsigned removed = 0;
2789 int len = brackets.length ();
2790
2791 for (int ix = 0; ix < len; ix++)
2792 {
2793 if (brackets[ix].back == pseudo)
2794 {
2795 if (dump_file)
2796 fprintf (dump_file, "Removing backedge %d:%+d\n",
2797 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2798 removed++;
2799 }
2800 else if (removed)
2801 brackets[ix-removed] = brackets[ix];
2802 }
2803 while (removed--)
2804 brackets.pop ();
2805}
2806
2807/* Accessors for BB's aux pointer. */
2808#define BB_SET_SESE(B, S) ((B)->aux = (S))
2809#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2810
2811/* DFS walk creating SESE data structures. Only cover nodes with
2812 BB_VISITED set. Append discovered blocks to LIST. We number in
2813 increments of 3 so that the above and below pseudo nodes can be
2814 implicitly numbered too. */
2815
2816static int
2817nvptx_sese_number (int n, int p, int dir, basic_block b,
2818 auto_vec<basic_block> *list)
2819{
2820 if (BB_GET_SESE (b))
2821 return n;
2822
2823 if (dump_file)
2824 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2825 b->index, n, p, dir);
2826
2827 BB_SET_SESE (b, new bb_sese (n, p, dir));
2828 p = n;
2829
2830 n += 3;
2831 list->quick_push (b);
2832
2833 /* First walk the nodes on the 'other side' of this node, then walk
2834 the nodes on the same side. */
2835 for (unsigned ix = 2; ix; ix--)
2836 {
2837 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2838 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2839 : offsetof (edge_def, src));
2840 edge e;
2841 edge_iterator (ei);
2842
2843 FOR_EACH_EDGE (e, ei, edges)
2844 {
2845 basic_block target = *(basic_block *)((char *)e + offset);
2846
2847 if (target->flags & BB_VISITED)
2848 n = nvptx_sese_number (n, p, dir, target, list);
2849 }
2850 dir = -dir;
2851 }
2852 return n;
2853}
2854
2855/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2856 EDGES are the outgoing edges and OFFSET is the offset to the src
2857 or dst block on the edges. */
2858
2859static void
2860nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2861 vec<edge, va_gc> *edges, size_t offset)
2862{
2863 edge e;
2864 edge_iterator (ei);
2865 int hi_back = depth;
2866 pseudo_node_t node_back (0, depth);
2867 int hi_child = depth;
2868 pseudo_node_t node_child (0, depth);
2869 basic_block child = NULL;
2870 unsigned num_children = 0;
2871 int usd = -dir * sese->dir;
2872
2873 if (dump_file)
2874 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2875 me->index, sese->node, dir);
2876
2877 if (dir < 0)
2878 {
2879 /* This is the above pseudo-child. It has the BB itself as an
2880 additional child node. */
2881 node_child = sese->high;
2882 hi_child = node_child.second;
2883 if (node_child.first)
2884 hi_child += BB_GET_SESE (node_child.first)->node;
2885 num_children++;
2886 }
2887
2888 /* Examine each edge.
2889 - if it is a child (a) append its bracket list and (b) record
2890 whether it is the child with the highest reaching bracket.
2891 - if it is an edge to ancestor, record whether it's the highest
2892 reaching backlink. */
2893 FOR_EACH_EDGE (e, ei, edges)
2894 {
2895 basic_block target = *(basic_block *)((char *)e + offset);
2896
2897 if (bb_sese *t_sese = BB_GET_SESE (target))
2898 {
2899 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2900 {
2901 /* Child node. Append its bracket list. */
2902 num_children++;
2903 sese->append (t_sese);
2904
2905 /* Compare its hi value. */
2906 int t_hi = t_sese->high.second;
2907
2908 if (basic_block child_hi_block = t_sese->high.first)
2909 t_hi += BB_GET_SESE (child_hi_block)->node;
2910
2911 if (hi_child > t_hi)
2912 {
2913 hi_child = t_hi;
2914 node_child = t_sese->high;
2915 child = target;
2916 }
2917 }
2918 else if (t_sese->node < sese->node + dir
2919 && !(dir < 0 && sese->parent == t_sese->node))
2920 {
2921 /* Non-parental ancestor node -- a backlink. */
2922 int d = usd * t_sese->dir;
2923 int back = t_sese->node + d;
2924
2925 if (hi_back > back)
2926 {
2927 hi_back = back;
2928 node_back = pseudo_node_t (target, d);
2929 }
2930 }
2931 }
2932 else
2933 { /* Fallen off graph, backlink to entry node. */
2934 hi_back = 0;
2935 node_back = pseudo_node_t (0, 0);
2936 }
2937 }
2938
2939 /* Remove any brackets that terminate at this pseudo node. */
2940 sese->remove (pseudo_node_t (me, dir));
2941
2942 /* Now push any backlinks from this pseudo node. */
2943 FOR_EACH_EDGE (e, ei, edges)
2944 {
2945 basic_block target = *(basic_block *)((char *)e + offset);
2946 if (bb_sese *t_sese = BB_GET_SESE (target))
2947 {
2948 if (t_sese->node < sese->node + dir
2949 && !(dir < 0 && sese->parent == t_sese->node))
2950 /* Non-parental ancestor node - backedge from me. */
2951 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2952 }
2953 else
2954 {
2955 /* back edge to entry node */
2956 sese->push (pseudo_node_t (0, 0));
2957 }
2958 }
2959
2960 /* If this node leads directly or indirectly to a no-return region of
2961 the graph, then fake a backedge to entry node. */
2962 if (!sese->brackets.length () || !edges || !edges->length ())
2963 {
2964 hi_back = 0;
2965 node_back = pseudo_node_t (0, 0);
2966 sese->push (node_back);
2967 }
2968
2969 /* Record the highest reaching backedge from us or a descendant. */
2970 sese->high = hi_back < hi_child ? node_back : node_child;
2971
2972 if (num_children > 1)
2973 {
2974 /* There is more than one child -- this is a Y shaped piece of
2975 spanning tree. We have to insert a fake backedge from this
2976 node to the highest ancestor reached by not-the-highest
2977 reaching child. Note that there may be multiple children
2978 with backedges to the same highest node. That's ok and we
2979 insert the edge to that highest node. */
2980 hi_child = depth;
2981 if (dir < 0 && child)
2982 {
2983 node_child = sese->high;
2984 hi_child = node_child.second;
2985 if (node_child.first)
2986 hi_child += BB_GET_SESE (node_child.first)->node;
2987 }
2988
2989 FOR_EACH_EDGE (e, ei, edges)
2990 {
2991 basic_block target = *(basic_block *)((char *)e + offset);
2992
2993 if (target == child)
2994 /* Ignore the highest child. */
2995 continue;
2996
2997 bb_sese *t_sese = BB_GET_SESE (target);
2998 if (!t_sese)
2999 continue;
3000 if (t_sese->parent != sese->node)
3001 /* Not a child. */
3002 continue;
3003
3004 /* Compare its hi value. */
3005 int t_hi = t_sese->high.second;
3006
3007 if (basic_block child_hi_block = t_sese->high.first)
3008 t_hi += BB_GET_SESE (child_hi_block)->node;
3009
3010 if (hi_child > t_hi)
3011 {
3012 hi_child = t_hi;
3013 node_child = t_sese->high;
3014 }
3015 }
3016
3017 sese->push (node_child);
3018 }
3019}
3020
3021
3022/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3023 proceed to successors. Set SESE entry and exit nodes of
3024 REGIONS. */
3025
3026static void
3027nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3028 basic_block block, int coloring)
3029{
3030 bb_sese *sese = BB_GET_SESE (block);
3031
3032 if (block->flags & BB_VISITED)
3033 {
3034 /* If we've already encountered this block, either we must not
3035 be coloring, or it must have been colored the current color. */
3036 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3037 return;
3038 }
3039
3040 block->flags |= BB_VISITED;
3041
3042 if (sese)
3043 {
3044 if (coloring < 0)
3045 {
3046 /* Start coloring a region. */
3047 regions[sese->color].first = block;
3048 coloring = sese->color;
3049 }
3050
3051 if (!--color_counts[sese->color] && sese->color == coloring)
3052 {
3053 /* Found final block of SESE region. */
3054 regions[sese->color].second = block;
3055 coloring = -1;
3056 }
3057 else
3058 /* Color the node, so we can assert on revisiting the node
3059 that the graph is indeed SESE. */
3060 sese->color = coloring;
3061 }
3062 else
3063 /* Fallen off the subgraph, we cannot be coloring. */
3064 gcc_assert (coloring < 0);
3065
3066 /* Walk each successor block. */
3067 if (block->succs && block->succs->length ())
3068 {
3069 edge e;
3070 edge_iterator ei;
3071
3072 FOR_EACH_EDGE (e, ei, block->succs)
3073 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3074 }
3075 else
3076 gcc_assert (coloring < 0);
3077}
3078
3079/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3080 end up with NULL entries in it. */
3081
3082static void
3083nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3084{
3085 basic_block block;
3086 int ix;
3087
3088 /* First clear each BB of the whole function. */
3089 FOR_EACH_BB_FN (block, cfun)
3090 {
3091 block->flags &= ~BB_VISITED;
3092 BB_SET_SESE (block, 0);
3093 }
3094 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3095 block->flags &= ~BB_VISITED;
3096 BB_SET_SESE (block, 0);
3097 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3098 block->flags &= ~BB_VISITED;
3099 BB_SET_SESE (block, 0);
3100
3101 /* Mark blocks in the function that are in this graph. */
3102 for (ix = 0; blocks.iterate (ix, &block); ix++)
3103 block->flags |= BB_VISITED;
3104
3105 /* Counts of nodes assigned to each color. There cannot be more
3106 colors than blocks (and hopefully there will be fewer). */
3107 auto_vec<unsigned> color_counts;
3108 color_counts.reserve (blocks.length ());
3109
3110 /* Worklist of nodes in the spanning tree. Again, there cannot be
3111 more nodes in the tree than blocks (there will be fewer if the
3112 CFG of blocks is disjoint). */
3113 auto_vec<basic_block> spanlist;
3114 spanlist.reserve (blocks.length ());
3115
3116 /* Make sure every block has its cycle class determined. */
3117 for (ix = 0; blocks.iterate (ix, &block); ix++)
3118 {
3119 if (BB_GET_SESE (block))
3120 /* We already met this block in an earlier graph solve. */
3121 continue;
3122
3123 if (dump_file)
3124 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3125
3126 /* Number the nodes reachable from block in initial DFS order. */
3127 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3128
3129 /* Now walk in reverse DFS order to find cycle equivalents. */
3130 while (spanlist.length ())
3131 {
3132 block = spanlist.pop ();
3133 bb_sese *sese = BB_GET_SESE (block);
3134
3135 /* Do the pseudo node below. */
3136 nvptx_sese_pseudo (block, sese, depth, +1,
3137 sese->dir > 0 ? block->succs : block->preds,
3138 (sese->dir > 0 ? offsetof (edge_def, dest)
3139 : offsetof (edge_def, src)));
3140 sese->set_color (color_counts);
3141 /* Do the pseudo node above. */
3142 nvptx_sese_pseudo (block, sese, depth, -1,
3143 sese->dir < 0 ? block->succs : block->preds,
3144 (sese->dir < 0 ? offsetof (edge_def, dest)
3145 : offsetof (edge_def, src)));
3146 }
3147 if (dump_file)
3148 fprintf (dump_file, "\n");
3149 }
3150
3151 if (dump_file)
3152 {
3153 unsigned count;
3154 const char *comma = "";
3155
3156 fprintf (dump_file, "Found %d cycle equivalents\n",
3157 color_counts.length ());
3158 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3159 {
3160 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3161
3162 comma = "";
3163 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3164 if (BB_GET_SESE (block)->color == ix)
3165 {
3166 block->flags |= BB_VISITED;
3167 fprintf (dump_file, "%s%d", comma, block->index);
3168 comma=",";
3169 }
3170 fprintf (dump_file, "}");
3171 comma = ", ";
3172 }
3173 fprintf (dump_file, "\n");
3174 }
3175
3176 /* Now we've colored every block in the subgraph. We now need to
3177 determine the minimal set of SESE regions that cover that
3178 subgraph. Do this with a DFS walk of the complete function.
3179 During the walk we're either 'looking' or 'coloring'. When we
3180 reach the last node of a particular color, we stop coloring and
3181 return to looking. */
3182
3183 /* There cannot be more SESE regions than colors. */
3184 regions.reserve (color_counts.length ());
3185 for (ix = color_counts.length (); ix--;)
3186 regions.quick_push (bb_pair_t (0, 0));
3187
3188 for (ix = 0; blocks.iterate (ix, &block); ix++)
3189 block->flags &= ~BB_VISITED;
3190
3191 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3192
3193 if (dump_file)
3194 {
3195 const char *comma = "";
3196 int len = regions.length ();
3197
3198 fprintf (dump_file, "SESE regions:");
3199 for (ix = 0; ix != len; ix++)
3200 {
3201 basic_block from = regions[ix].first;
3202 basic_block to = regions[ix].second;
3203
3204 if (from)
3205 {
3206 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3207 if (to != from)
3208 fprintf (dump_file, "->%d", to->index);
3209
3210 int color = BB_GET_SESE (from)->color;
3211
3212 /* Print the blocks within the region (excluding ends). */
3213 FOR_EACH_BB_FN (block, cfun)
3214 {
3215 bb_sese *sese = BB_GET_SESE (block);
3216
3217 if (sese && sese->color == color
3218 && block != from && block != to)
3219 fprintf (dump_file, ".%d", block->index);
3220 }
3221 fprintf (dump_file, "}");
3222 }
3223 comma = ",";
3224 }
3225 fprintf (dump_file, "\n\n");
3226 }
3227
3228 for (ix = 0; blocks.iterate (ix, &block); ix++)
3229 delete BB_GET_SESE (block);
3230}
3231
3232#undef BB_SET_SESE
3233#undef BB_GET_SESE
3234
3235/* Propagate live state at the start of a partitioned region. BLOCK
3236 provides the live register information, and might not contain
3237 INSN. Propagation is inserted just after INSN. RW indicates whether
3238 we are reading and/or writing state. This
3239 separation is needed for worker-level propagation where we
3240 essentially do a spill & fill. FN is the underlying worker
3241 function to generate the propagation instructions for a single
3242 register. DATA is user data.
3243
3244 We propagate the live register set and the entire frame. We could
3245 do better by (a) propagating just the live set that is used within
3246 the partitioned regions and (b) only propagating stack entries that
3247 are used. The latter might be quite hard to determine. */
3248
3249typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3250
3251static void
3252nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3253 propagator_fn fn, void *data)
3254{
3255 bitmap live = DF_LIVE_IN (block);
3256 bitmap_iterator iterator;
3257 unsigned ix;
3258
3259 /* Copy the frame array. */
3260 HOST_WIDE_INT fs = get_frame_size ();
3261 if (fs)
3262 {
3263 rtx tmp = gen_reg_rtx (DImode);
3264 rtx idx = NULL_RTX;
3265 rtx ptr = gen_reg_rtx (Pmode);
3266 rtx pred = NULL_RTX;
3267 rtx_code_label *label = NULL;
3268
3269 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3270 fs /= GET_MODE_SIZE (DImode);
3271 /* Detect single iteration loop. */
3272 if (fs == 1)
3273 fs = 0;
3274
3275 start_sequence ();
3276 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3277 if (fs)
3278 {
3279 idx = gen_reg_rtx (SImode);
3280 pred = gen_reg_rtx (BImode);
3281 label = gen_label_rtx ();
3282
3283 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3284 /* Allow worker function to initialize anything needed. */
3285 rtx init = fn (tmp, PM_loop_begin, fs, data);
3286 if (init)
3287 emit_insn (init);
3288 emit_label (label);
3289 LABEL_NUSES (label)++;
3290 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3291 }
3292 if (rw & PM_read)
3293 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3294 emit_insn (fn (tmp, rw, fs, data));
3295 if (rw & PM_write)
3296 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3297 if (fs)
3298 {
3299 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3300 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3301 emit_insn (gen_br_true_uni (pred, label));
3302 rtx fini = fn (tmp, PM_loop_end, fs, data);
3303 if (fini)
3304 emit_insn (fini);
3305 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3306 }
3307 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3308 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3309 rtx cpy = get_insns ();
3310 end_sequence ();
3311 insn = emit_insn_after (cpy, insn);
3312 }
3313
3314 /* Copy live registers. */
3315 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3316 {
3317 rtx reg = regno_reg_rtx[ix];
3318
3319 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3320 {
3321 rtx bcast = fn (reg, rw, 0, data);
3322
3323 insn = emit_insn_after (bcast, insn);
3324 }
3325 }
3326}
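/* The frame copy above amounts to this loop (a sketch, in pseudo-C):

	ptr = frame_pointer; idx = fs;
	do { idx--; tmp = *ptr; fn (tmp); *ptr = tmp; ptr++; } while (idx);

   with the read and write each conditional on RW, and a single-word
   frame (fs == 1) degenerating to one unlooped copy. */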
3327
3328/* Worker for nvptx_vpropagate. */
3329
3330static rtx
3331vprop_gen (rtx reg, propagate_mask pm,
3332 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3333{
3334 if (!(pm & PM_read_write))
3335 return 0;
3336
3337 return nvptx_gen_vcast (reg);
3338}
3339
3340/* Propagate state that is live at start of BLOCK across the vectors
3341 of a single warp. Propagation is inserted just after INSN. */
3342
3343static void
3344nvptx_vpropagate (basic_block block, rtx_insn *insn)
3345{
3346 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3347}
3348
3349/* Worker for nvptx_wpropagate. */
3350
3351static rtx
3352wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3353{
3354 wcast_data_t *data = (wcast_data_t *)data_;
3355
3356 if (pm & PM_loop_begin)
3357 {
3358 /* Starting a loop, initialize pointer. */
3359 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3360
3361 if (align > worker_bcast_align)
3362 worker_bcast_align = align;
3363 data->offset = (data->offset + align - 1) & ~(align - 1);
3364
3365 data->ptr = gen_reg_rtx (Pmode);
3366
3367 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3368 }
3369 else if (pm & PM_loop_end)
3370 {
3371 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3372 data->ptr = NULL_RTX;
3373 return clobber;
3374 }
3375 else
3376 return nvptx_gen_wcast (reg, pm, rep, data);
3377}
3378
3379/* Spill or fill state that is live at the start of BLOCK. PRE_P
3380 indicates if this is just before partitioned mode (do spill), or
3381 just after it starts (do fill). Sequence is inserted just after
3382 INSN. */
3383
3384static void
3385nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3386{
3387 wcast_data_t data;
3388
3389 data.base = gen_reg_rtx (Pmode);
3390 data.offset = 0;
3391 data.ptr = NULL_RTX;
3392
3393 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3394 if (data.offset)
3395 {
3396 /* Stuff was emitted, initialize the base pointer now. */
3397 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3398 emit_insn_after (init, insn);
3399
3400 if (worker_bcast_size < data.offset)
3401 worker_bcast_size = data.offset;
3402 }
3403}
3404
3405/* Emit a worker-level synchronization barrier. We use different
3406 markers for before and after synchronizations. */
3407
3408static rtx
3409nvptx_wsync (bool after)
3410{
3411 return gen_nvptx_barsync (GEN_INT (after));
3412}
3413
3414/* Single neutering according to MASK. FROM is the incoming block and
3415 TO is the outgoing block. These may be the same block. Insert at
3416 start of FROM:
3417
3418 if (tid.<axis>) goto end.
3419
3420 and insert before ending branch of TO (if there is such an insn):
3421
3422 end:
3423 <possibly-broadcast-cond>
3424 <branch>
3425
3426 We currently only use different FROM and TO when skipping an entire
3427 loop. We could do more if we detected superblocks. */
3428
3429static void
3430nvptx_single (unsigned mask, basic_block from, basic_block to)
3431{
3432 rtx_insn *head = BB_HEAD (from);
3433 rtx_insn *tail = BB_END (to);
3434 unsigned skip_mask = mask;
3435
3436 /* Find first insn of from block */
3437 while (head != BB_END (from) && !INSN_P (head))
3438 head = NEXT_INSN (head);
3439
3440 /* Find last insn of to block */
3441 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3442 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3443 tail = PREV_INSN (tail);
3444
3445 /* Detect if tail is a branch. */
3446 rtx tail_branch = NULL_RTX;
3447 rtx cond_branch = NULL_RTX;
3448 if (tail && INSN_P (tail))
3449 {
3450 tail_branch = PATTERN (tail);
3451 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3452 tail_branch = NULL_RTX;
3453 else
3454 {
3455 cond_branch = SET_SRC (tail_branch);
3456 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3457 cond_branch = NULL_RTX;
3458 }
3459 }
3460
3461 if (tail == head)
3462 {
3463 /* If this is empty, do nothing. */
3464 if (!head || !INSN_P (head))
3465 return;
3466
3467 /* If this is a dummy insn, do nothing. */
3468 switch (recog_memoized (head))
3469 {
3470 default:
3471 break;
3472 case CODE_FOR_nvptx_fork:
3473 case CODE_FOR_nvptx_forked:
3474 case CODE_FOR_nvptx_joining:
3475 case CODE_FOR_nvptx_join:
3476 return;
3477 }
3478
3479 if (cond_branch)
3480 {
3481 /* If we're only doing vector single, there's no need to
3482 emit skip code because we'll not insert anything. */
3483 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3484 skip_mask = 0;
3485 }
3486 else if (tail_branch)
3487 /* Block with only unconditional branch. Nothing to do. */
3488 return;
3489 }
3490
3491 /* Insert the vector test inside the worker test. */
3492 unsigned mode;
3493 rtx_insn *before = tail;
3494 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3495 if (GOMP_DIM_MASK (mode) & skip_mask)
3496 {
3497 rtx_code_label *label = gen_label_rtx ();
3498 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3499
3500 if (!pred)
3501 {
3502 pred = gen_reg_rtx (BImode);
3503 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3504 }
3505
3506 rtx br;
3507 if (mode == GOMP_DIM_VECTOR)
3508 br = gen_br_true (pred, label);
3509 else
3510 br = gen_br_true_uni (pred, label);
3511 emit_insn_before (br, head);
3512
3513 LABEL_NUSES (label)++;
3514 if (tail_branch)
3515 before = emit_label_before (label, before);
3516 else
3517 emit_label_after (label, tail);
3518 }
3519
3520 /* Now deal with propagating the branch condition. */
3521 if (cond_branch)
3522 {
3523 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3524
3525 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3526 {
3527 /* Vector mode only, do a shuffle. */
3528 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3529 }
3530 else
3531 {
3532 /* Includes worker mode, do spill & fill. By construction
3533 we should never have worker mode only. */
3534 wcast_data_t data;
3535
3536 data.base = worker_bcast_sym;
3537 data.ptr = 0;
3538
3539 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3540 worker_bcast_size = GET_MODE_SIZE (SImode);
3541
3542 data.offset = 0;
3543 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3544 before);
3545 /* Barrier so other workers can see the write. */
3546 emit_insn_before (nvptx_wsync (false), tail);
3547 data.offset = 0;
3548 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3549 /* This barrier is needed to avoid worker zero clobbering
3550 the broadcast buffer before all the other workers have
3551 had a chance to read this instance of it. */
3552 emit_insn_before (nvptx_wsync (true), tail);
3553 }
3554
3555 extract_insn (tail);
3556 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3557 UNSPEC_BR_UNIFIED);
3558 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3559 }
3560}
3561
3562/* PAR is a parallel that is being skipped in its entirety according to
3563 MASK. Treat this as skipping a superblock starting at forked
3564 and ending at joining. */
3565
3566static void
3567nvptx_skip_par (unsigned mask, parallel *par)
3568{
3569 basic_block tail = par->join_block;
3570 gcc_assert (tail->preds->length () == 1);
3571
3572 basic_block pre_tail = (*tail->preds)[0]->src;
3573 gcc_assert (pre_tail->succs->length () == 1);
3574
3575 nvptx_single (mask, par->forked_block, pre_tail);
3576}
3577
3578/* If PAR has a single inner parallel and PAR itself only contains
3579 empty entry and exit blocks, swallow the inner PAR. */
3580
3581static void
3582nvptx_optimize_inner (parallel *par)
3583{
3584 parallel *inner = par->inner;
3585
3586 /* We mustn't be the outer dummy par. */
3587 if (!par->mask)
3588 return;
3589
3590 /* We must have a single inner par. */
3591 if (!inner || inner->next)
3592 return;
3593
3594 /* We must only contain 2 blocks ourselves -- the head and tail of
3595 the inner par. */
3596 if (par->blocks.length () != 2)
3597 return;
3598
3599 /* We must have disjoint partitioning. As we only have vector and
3600 worker partitioning, this is sufficient to guarantee the pars
3601 have adjacent partitioning. */
3602 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3603 /* This indicates malformed code generation. */
3604 return;
3605
3606 /* The outer forked insn should be immediately followed by the inner
3607 fork insn. */
3608 rtx_insn *forked = par->forked_insn;
3609 rtx_insn *fork = BB_END (par->forked_block);
3610
3611 if (NEXT_INSN (forked) != fork)
3612 return;
3613 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3614
3615 /* The outer joining insn must immediately follow the inner join
3616 insn. */
3617 rtx_insn *joining = par->joining_insn;
3618 rtx_insn *join = inner->join_insn;
3619 if (NEXT_INSN (join) != joining)
3620 return;
3621
3622 /* Preconditions met. Swallow the inner par. */
3623 if (dump_file)
3624 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3625 inner->mask, inner->forked_block->index,
3626 inner->join_block->index,
3627 par->mask, par->forked_block->index, par->join_block->index);
3628
3629 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3630
3631 par->blocks.reserve (inner->blocks.length ());
3632 while (inner->blocks.length ())
3633 par->blocks.quick_push (inner->blocks.pop ());
3634
3635 par->inner = inner->inner;
3636 inner->inner = NULL;
3637
3638 delete inner;
3639}
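/* For instance (sketch): a worker-partitioned parallel whose only
   content is a vector-partitioned inner parallel collapses into a
   single worker+vector parallel, so it is neutered and skipped once
   rather than level by level. */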
3640
3641/* Process the parallel PAR and all its contained
3642 parallels. We do everything but the neutering. Return mask of
3643 partitioned modes used within this parallel. */
3644
3645static unsigned
3646nvptx_process_pars (parallel *par)
3647{
3648 if (nvptx_optimize)
3649 nvptx_optimize_inner (par);
3650
3651 unsigned inner_mask = par->mask;
3652
3653 /* Do the inner parallels first. */
3654 if (par->inner)
3655 {
3656 par->inner_mask = nvptx_process_pars (par->inner);
3657 inner_mask |= par->inner_mask;
3658 }
3659
3660 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3661 /* No propagation needed for a call. */;
3662 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3663 {
3664 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3665 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3666 /* Insert begin and end synchronizations. */
3667 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3668 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3669 }
3670 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3671 nvptx_vpropagate (par->forked_block, par->forked_insn);
3672
3673 /* Now do siblings. */
3674 if (par->next)
3675 inner_mask |= nvptx_process_pars (par->next);
3676 return inner_mask;
3677}
3678
3679/* Neuter the parallel described by PAR. We recurse in depth-first
3680 order. MODES are the partitioning of the execution and OUTER is
3681 the partitioning of the parallels we are contained in. */
3682
3683static void
3684nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3685{
3686 unsigned me = (par->mask
3687 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3688 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3689 unsigned skip_mask = 0, neuter_mask = 0;
3690
3691 if (par->inner)
3692 nvptx_neuter_pars (par->inner, modes, outer | me);
3693
3694 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3695 {
3696 if ((outer | me) & GOMP_DIM_MASK (mode))
3697 {} /* Mode is partitioned: no neutering. */
3698 else if (!(modes & GOMP_DIM_MASK (mode)))
3699 {} /* Mode is not used: nothing to do. */
3700 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3701 || !par->forked_insn)
3702 /* Partitioned in inner parallels, or we're not partitioned
3703 at all: neuter individual blocks. */
3704 neuter_mask |= GOMP_DIM_MASK (mode);
3705 else if (!par->parent || !par->parent->forked_insn
3706 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3707 /* Parent isn't a parallel, or already contains this partitioning:
3708 skip the parallel at this level. */
3709 skip_mask |= GOMP_DIM_MASK (mode);
3710 else
3711 {} /* Parent will skip this parallel itself. */
3712 }
3713
3714 if (neuter_mask)
3715 {
3716 int ix, len;
3717
3718 if (nvptx_optimize)
3719 {
3720 /* Neuter whole SESE regions. */
3721 bb_pair_vec_t regions;
3722
3723 nvptx_find_sese (par->blocks, regions);
3724 len = regions.length ();
3725 for (ix = 0; ix != len; ix++)
3726 {
3727 basic_block from = regions[ix].first;
3728 basic_block to = regions[ix].second;
3729
3730 if (from)
3731 nvptx_single (neuter_mask, from, to);
3732 else
3733 gcc_assert (!to);
3734 }
3735 }
3736 else
3737 {
3738 /* Neuter each BB individually. */
3739 len = par->blocks.length ();
3740 for (ix = 0; ix != len; ix++)
3741 {
3742 basic_block block = par->blocks[ix];
3743
3744 nvptx_single (neuter_mask, block, block);
3745 }
3746 }
3747 }
3748
3749 if (skip_mask)
3750 nvptx_skip_par (skip_mask, par);
3751
3752 if (par->next)
3753 nvptx_neuter_pars (par->next, modes, outer);
3754}
3755
3756/* PTX-specific reorganization
3757 - Split blocks at fork and join instructions
3758 - Compute live registers
3759 - Mark now-unused registers, so function begin doesn't declare
3760 unused registers.
3761 - Insert state propagation when entering partitioned mode
3762 - Insert neutering instructions when in single mode
3763 - Replace subregs with suitable sequences.
3764*/
3765
3766static void
3767nvptx_reorg (void)
3768{
3769 /* We are freeing block_for_insn in the toplev to keep compatibility
3770 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3771 compute_bb_for_insn ();
3772
3773 thread_prologue_and_epilogue_insns ();
3774
3775 /* Split blocks and record interesting unspecs. */
3776 bb_insn_map_t bb_insn_map;
3777
3778 nvptx_split_blocks (&bb_insn_map);
3779
3780 /* Compute live regs */
3781 df_clear_flags (DF_LR_RUN_DCE);
3782 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3783 df_live_add_problem ();
3784 df_live_set_all_dirty ();
3785 df_analyze ();
3786 regstat_init_n_sets_and_refs ();
3787
3788 if (dump_file)
3789 df_dump (dump_file);
3790
3791 /* Mark unused regs as unused. */
3792 int max_regs = max_reg_num ();
3793 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3794 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3795 regno_reg_rtx[i] = const0_rtx;
3796
3797 /* Determine launch dimensions of the function. If it is not an
3798 offloaded function (i.e. this is a regular compiler), the
3799 function has no neutering. */
3800 tree attr = get_oacc_fn_attrib (current_function_decl);
3801 if (attr)
3802 {
3803 /* If we determined this mask before RTL expansion, we could
3804 elide emission of some levels of forks and joins. */
3805 unsigned mask = 0;
3806 tree dims = TREE_VALUE (attr);
3807 unsigned ix;
3808
3809 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3810 {
3811 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3812 tree allowed = TREE_PURPOSE (dims);
3813
3814 if (size != 1 && !(allowed && integer_zerop (allowed)))
3815 mask |= GOMP_DIM_MASK (ix);
3816 }
3817 /* If there is worker neutering, there must be vector
3818 neutering. Otherwise the hardware will fail. */
3819 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3820 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3821
3822 /* Discover & process partitioned regions. */
3823 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3824 nvptx_process_pars (pars);
3825 nvptx_neuter_pars (pars, mask, 0);
3826 delete pars;
3827 }
3828
517665b3 3829 /* Replace subregs. */
c03b0416 3830 nvptx_reorg_subreg ();
517665b3 3831
738f2522 3832 regstat_free_n_sets_and_refs ();
517665b3
NS
3833
3834 df_finish_pass (true);
738f2522
BS
3835}
3836\f
/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }
  else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}
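
/* For illustration (an editorial note, not part of the original
   sources): user code compiled for nvptx could mark an entry point as

     __attribute__((kernel)) void entry (int *data);

   The handler above rejects the attribute on non-functions and on
   functions with a non-void return type.  */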

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
\f
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchors
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be being used with
   a host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
\f
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = get_oacc_fn_attrib (decl);
	tree dims = TREE_VALUE (attr);
	unsigned ix;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
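
/* For illustration (an editorial note, not part of the original
   sources): given a hypothetical offloaded function "foo" with all
   three launch dimensions equal to 32, the hook above would emit

     //:FUNC_MAP "foo", 0x20, 0x20, 0x20

   and for a mapped variable "bar"

     //:VAR_MAP "bar"

   which mkoffload scans to build the offload mapping table.  */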

/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
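
/* For illustration (an editorial note, not part of the original
   sources): with a 64-bit Pmode the preamble emitted above reads

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE
*/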

/* Write out the function declarations we've collected and declare storage
   for the broadcast and reduction buffers.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl, true);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    {
      /* Define the broadcast buffer.  */

      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
	& ~(worker_bcast_align - 1);

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_bcast_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_bcast_align,
	       worker_bcast_name, worker_bcast_size);
    }

  if (worker_red_size)
    {
      /* Define the reduction buffer.  */

      worker_red_size = ((worker_red_size + worker_red_align - 1)
			 & ~(worker_red_align - 1));

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_red_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_red_align,
	       worker_red_name, worker_red_size);
    }
}
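
/* For illustration (an editorial note, not part of the original
   sources): the round-up above is the usual power-of-two trick, e.g.
   a 13-byte buffer with 8-byte alignment becomes (13 + 7) & ~7 == 16
   and would be emitted as

     .shared .align 8 .u8 <buffer>[16];

   where <buffer> stands for worker_bcast_name or worker_red_name.  */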

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}

/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  emit_insn (gen_rtx_SET (target, worker_red_sym));

  if (offset)
    emit_insn (gen_rtx_SET (target,
			    gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));

  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
					  UNSPEC_FROM_SHARED)));

  return target;
}
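
/* For illustration (an editorial note, not part of the original
   sources): two 8-byte reductions at offsets 0 and 8, each with
   8-byte alignment, leave worker_red_align at 8 and grow
   worker_red_size to 16.  The value returned is worker_red_sym plus
   OFFSET, converted from the .shared space to a generic address by
   the UNSPEC_FROM_SHARED cast above.  */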

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}


/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
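
/* For illustration (an editorial note, not part of the original
   sources): the DEF lines above correspond to these C-level
   signatures, where the first type in each list is the return type:

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
						   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
						    unsigned long long,
						    unsigned long long);
*/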

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
\f
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
	warning_at (DECL_SOURCE_LOCATION (decl), 0,
		    dims[GOMP_DIM_VECTOR]
		    ? "using vector_length (%d), ignoring %d"
		    : "using vector_length (%d), ignoring runtime setting",
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check that num_workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (DECL_SOURCE_LOCATION (decl), 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  return changed;
}
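
/* For illustration (an editorial note, not part of the original
   sources): a hypothetical construct such as

     #pragma acc parallel vector_length (64) num_workers (64)

   would be diagnosed by the hook above as

     warning: using vector_length (32), ignoring 64
     warning: using num_workers (32), ignoring 64

   with both dimensions clamped to the PTX_*_LENGTH values.  */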

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}
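
/* For illustration (an editorial note, not part of the original
   sources): the decl built above behaves like a C declaration of

     extern volatile unsigned __reduction_lock;

   shared by all functions in the offloaded image, with the hook
   returning &__reduction_lock.  */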

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
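
/* For illustration (an editorial note, not part of the original
   sources): for a float "+" reduction the generated loop behaves
   like the following sketch, where as_uint/as_float stand for the
   VIEW_CONVERT_EXPRs built above:

     unsigned guess, actual = as_uint (0.0f);
     do
       {
	 guess = actual;
	 unsigned write = as_uint (as_float (guess) + myval);
	 actual = __builtin_nvptx_cmp_swap (ptr, guess, write);
       }
     while (actual != guess);
*/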

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
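
/* For illustration (an editorial note, not part of the original
   sources): an "int", "float" or "double" reduction matches one of
   the two size checks above and takes the lockless cmp&swap path,
   whereas a 128-bit type such as "_Complex double" falls through to
   the lock-based path.  */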

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
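
/* For illustration (an editorial note, not part of the original
   sources): with PTX_VECTOR_LENGTH == 32 the vector case above
   combines lane values at SHFL.DOWN distances 16, 8, 4, 2 and 1, so
   after five steps lane 0 holds the reduction of all 32 lanes.  */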

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"