/* Target code for NVPTX.
   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

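/* Shuffle kinds used by nvptx_gen_shuffle below; these values are
   believed to correspond to the PTX shfl.up, shfl.down, shfl.bfly and
   shfl.idx source-lane selection modes implemented by the shuffle
   patterns in nvptx.md.  */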
#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
#define worker_bcast_name "__worker_bcast"
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
#define worker_red_name "__worker_red"
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->ret_reg_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

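/* A sketch of the resulting mapping: nvptx_ptx_type_from_mode (QImode, true)
   yields ".u32", since PTX prefers to compute on sub-word values in 32-bit
   registers, while nvptx_ptx_type_from_mode (QImode, false) yields ".u8",
   which suits memory and .param declarations where the exact size matters.  */
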
/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

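/* Illustrative values for the mapping above:
     maybe_split_mode (TImode) == DImode    128-bit int -> two 64-bit regs
     maybe_split_mode (DCmode) == DFmode    complex double -> two doubles
     maybe_split_mode (SImode) == VOIDmode  no split required  */
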
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

#define PASS_IN_REG_P(MODE, TYPE)				\
  ((GET_MODE_CLASS (MODE) == MODE_INT				\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT			\
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT		\
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT)	\
	&& !AGGREGATE_TYPE_P (TYPE)))				\
   && (MODE) != TImode)

#define RETURN_IN_REG_P(MODE)			\
  ((GET_MODE_CLASS (MODE) == MODE_INT		\
    || GET_MODE_CLASS (MODE) == MODE_FLOAT)	\
   && GET_MODE_SIZE (MODE) <= 8)

/* Perform a mode promotion for a function argument with MODE.  Return
   the promoted mode.  */

static machine_mode
arg_promotion (machine_mode mode)
{
  if (mode == QImode || mode == HImode)
    return SImode;
  return mode;
}

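/* Taken together: a TImode or aggregate BLKmode value fails PASS_IN_REG_P
   and is passed via a Pmode pointer instead, while QImode and HImode
   scalars that do pass the test are widened by arg_promotion to SImode,
   the narrowest mode the PTX instruction set computes in.  */
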
/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.  */

static int
write_one_arg (std::stringstream &s, tree type, int i, machine_mode mode,
	       bool no_arg_types)
{
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  machine_mode split = maybe_split_mode (mode);
  if (split != VOIDmode)
    {
      write_one_arg (s, NULL_TREE, i, split, false);
      write_one_arg (s, NULL_TREE, i + 1, split, false);
      return i + 1;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  if (i > 0)
    s << ", ";
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << (i + 1) << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Write a function decl for DECL to S, where NAME is the name to be used.
   This includes ptx .visible or .extern specifiers, .func or .kernel, and
   argument and return types.  */

static void
nvptx_write_function_decl (std::stringstream &s, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  tree args = TYPE_ARG_TYPES (fntype);
  tree attrs = DECL_ATTRIBUTES (decl);
  bool kernel = write_as_kernel (attrs);
  bool is_main = strcmp (name, "main") == 0;
  bool args_from_decl = false;

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  if (args == 0)
    {
      args = DECL_ARGUMENTS (decl);
      args_from_decl = true;
    }

  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");

  if (kernel)
    s << ".entry ";
  else
    s << ".func ";

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
	return_in_mem = true;
      else
	{
	  mode = arg_promotion (mode);
	  s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
	    << " %out_retval)";
	}
    }

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;

  /* Declare argument types.  */
  if ((args != NULL_TREE
       && !(TREE_CODE (args) == TREE_LIST
	    && TREE_VALUE (args) == void_type_node))
      || is_main
      || return_in_mem
      || DECL_STATIC_CHAIN (decl))
    {
      s << "(";
      int i = 0;
      bool any_args = false;
      if (return_in_mem)
	{
	  s << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar1";
	  i++;
	}
      while (args != NULL_TREE)
	{
	  tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
	  machine_mode mode = TYPE_MODE (type);

	  if (mode != VOIDmode)
	    {
	      i = write_one_arg (s, type, i, mode,
				 TYPE_ARG_TYPES (fntype) == 0);
	      any_args = true;
	      i++;
	    }
	  args = TREE_CHAIN (args);
	}
      if (stdarg_p (fntype))
	{
	  gcc_assert (i > 0);
	  s << ", .param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
	}
      if (DECL_STATIC_CHAIN (decl))
	{
	  if (i > 0)
	    s << ", ";
	  s << ".reg.u" << GET_MODE_BITSIZE (Pmode)
	    << reg_names [STATIC_CHAIN_REGNUM];
	}
      if (!any_args && is_main)
	s << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
	  << " %argv";
      s << ")";
    }
}

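/* For illustration only (the exact output depends on ABI details and
   target flags), a declaration such as

       extern int f (int x);

   is written by the code above along the lines of

       .extern .func (.param.u32 %out_retval) f (.param.u32 %in_ar1);  */
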
/* Walk either ARGTYPES or ARGS if the former is null, and write out part of
   the function header to FILE.  If WRITE_COPY is false, write reg
   declarations, otherwise write the copy from the incoming argument to that
   reg.  RETURN_IN_MEM indicates whether to start counting arg numbers at 1
   instead of 0.  */

static void
walk_args_for_param (FILE *file, tree argtypes, tree args, bool write_copy,
		     bool return_in_mem)
{
  int i;

  bool args_from_decl = false;
  if (argtypes == 0)
    args_from_decl = true;
  else
    args = argtypes;

  for (i = return_in_mem ? 1 : 0; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
      machine_mode mode = TYPE_MODE (type);
      int count = 1;

      if (mode == VOIDmode)
	break;

      if (!PASS_IN_REG_P (mode, type))
	mode = Pmode;

      machine_mode split = maybe_split_mode (mode);
      if (split != VOIDmode)
	{
	  count = 2;
	  mode = split;
	}
      else if (argtypes == NULL && !AGGREGATE_TYPE_P (type) && mode == SFmode)
	mode = DFmode;

      mode = arg_promotion (mode);
      while (count--)
	{
	  i++;
	  if (write_copy)
	    fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
		     nvptx_ptx_type_from_mode (mode, false), i, i);
	  else
	    fprintf (file, "\t.reg%s %%ar%d;\n",
		     nvptx_ptx_type_from_mode (mode, false), i);
	}
    }
}

/* Write a .func or .kernel declaration (not a definition) along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  */

static void
write_function_decl_and_comment (std::stringstream &s, const char *name, const_tree decl)
{
  s << "\n// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DECL: ";
  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";
  nvptx_write_function_decl (s, name, decl);
  s << ";\n";
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If DECL is a FUNCTION_DECL, check the hash table to see if we
   already encountered it, and if not, insert it and write a ptx
   declaration that will be output at the end of compilation.  */

static bool
nvptx_record_fndecl (tree decl, bool force = false)
{
  if (decl == NULL_TREE || TREE_CODE (decl) != FUNCTION_DECL
      || !DECL_EXTERNAL (decl))
    return true;

  if (!force && TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    return false;

  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      write_function_decl_and_comment (func_decls, name, decl);
    }
  return true;
}

/* Record that we need to emit a ptx decl for DECL.  Either do it now, or
   record it for later in case we have no argument information at this
   point.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (nvptx_record_fndecl (decl))
    return;

  tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    *slot = decl;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

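/* Given, say, regno 5 and name "y", the function above emits:

	{
		.reg.u32	%y;
		mov.u32	%y, %tid.y;
		setp.ne.u32	%r5, %y, 0;
	}
*/
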
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  name = nvptx_name_replacement (name);

  std::stringstream s;
  write_function_decl_and_comment (s, name, decl);
  s << "// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DEF: ";

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";

  nvptx_write_function_decl (s, name, decl);
  fprintf (file, "%s", s.str().c_str());

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));

  fprintf (file, "\n{\n");

  /* Ensure all arguments that should live in a register have one
     declared.  We'll emit the copies below.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       false, return_in_mem);
  if (return_in_mem)
    fprintf (file, "\t.reg.u%d %%ar1;\n", GET_MODE_BITSIZE (Pmode));

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
	((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
	   reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);
	  if (split != VOIDmode)
	    {
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 0);
	      fprintf (file, "\t.reg%s %%r%d$%d;\n",
		       nvptx_ptx_type_from_mode (split, true), i, 1);
	    }
	  else
	    fprintf (file, "\t.reg%s %%r%d;\n",
		     nvptx_ptx_type_from_mode (mode, true), i);
	}
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\t.reg.u%d %%outargs;\n"
	     "\t.local.align 8 .b8 %%outargs_ar[" HOST_WIDE_INT_PRINT_DEC"];\n",
	     BITS_PER_WORD, sz);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\t.reg.u%d %%punbuffer;\n"
	     "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
	     BITS_PER_WORD, cfun->machine->punning_buffer_size);

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
	       BITS_PER_WORD);
    }

  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
	     BITS_PER_WORD);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
	     BITS_PER_WORD);

  /* Now emit any copies necessary for arguments.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       true, return_in_mem);
  if (return_in_mem)
    fprintf (file, "\tld.param.u%d %%ar1, [%%in_ar1];\n",
	     GET_MODE_BITSIZE (Pmode));
  if (stdarg_p (fntype))
    fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
	     GET_MODE_BITSIZE (Pmode));

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_func_decl_from_insn (std::stringstream &s, rtx result, rtx pat,
			   rtx callee)
{
  bool callprototype = register_operand (callee, Pmode);
  const char *name = "_";
  if (!callprototype)
    {
      name = XSTR (callee, 0);
      name = nvptx_name_replacement (name);
      s << "\n// BEGIN GLOBAL FUNCTION DECL: " << name << "\n";
    }
  s << (callprototype ? "\t.callprototype\t" : "\t.extern .func ");

  if (result != NULL_RTX)
    {
      s << "(.param";
      s << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				     false);
      s << " ";
      if (callprototype)
	s << "_";
      else
	s << "%out_retval";
      s << ")";
    }

  s << name;

  int arg_end = XVECLEN (pat, 0);

  if (1 < arg_end)
    {
      const char *comma = "";
      s << " (";
      for (int i = 1; i < arg_end; i++)
	{
	  rtx t = XEXP (XVECEXP (pat, 0, i), 0);
	  machine_mode mode = GET_MODE (t);
	  machine_mode split = maybe_split_mode (mode);
	  int count = 1;

	  if (split != VOIDmode)
	    {
	      mode = split;
	      count = 2;
	    }

	  while (count--)
	    {
	      s << comma << ".param";
	      s << nvptx_ptx_type_from_mode (mode, false);
	      s << " ";
	      if (callprototype)
		s << "_";
	      else
		s << "%arg" << i - 1;
	      if (mode == QImode || mode == HImode)
		s << "[1]";
	      comma = ", ";
	    }
	}
      s << ")";
    }
  s << ";\n";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}

/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return the Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree funtype)
{
  if (cfun->machine->start_call == NULL_RTX)
    {
      cfun->machine->call_args = NULL;
      cfun->machine->funtype = funtype;
      cfun->machine->start_call = const0_rtx;
    }
  if (arg == pc_rtx)
    return;

  rtx_expr_list *args_so_far = cfun->machine->call_args;
  if (REG_P (arg))
    cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  int nargs = 0;
  rtx callee = XEXP (address, 0);
  rtx pat, t;
  rtvec vec;
  bool external_decl = false;
  rtx varargs = NULL_RTX;
  tree decl_type = NULL_TREE;
  unsigned parallel = 0;

  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
    nargs++;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  decl_type = TREE_TYPE (decl);
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_call_with_sc = true;
	  if (DECL_EXTERNAL (decl))
	    external_decl = true;
	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  if (cfun->machine->funtype
      /* It's possible to construct testcases where we call a variable.
	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
	 in such a case.  */
      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
      && stdarg_p (cfun->machine->funtype))
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
      cfun->machine->has_call_with_varargs = true;
    }
  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
  pat = gen_rtx_PARALLEL (VOIDmode, vec);

  int vec_pos = 0;

  rtx tmp_retval = retval;
  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  if (retval != NULL_RTX)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      t = gen_rtx_SET (tmp_retval, t);
    }
  XVECEXP (pat, 0, vec_pos++) = t;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    {
      rtx this_arg = XEXP (arg, 0);
      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
    }

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  /* If this is a libcall, decl_type is NULL.  For a call to a non-libcall
     undeclared function, we'll have an external decl without arg types.
     In either case we have to try to construct a ptx declaration from one of
     the calls to the function.  */
  if (!REG_P (callee)
      && (decl_type == NULL_TREE
	  || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
    {
      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
      if (*slot == NULL)
	{
	  *slot = callee;
	  write_func_decl_from_insn (func_decls, retval, pat, callee);
	}
    }

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode)
    return NULL_RTX;

  if (named)
    return gen_reg_rtx (mode);
  return NULL_RTX;
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (1 + cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
			    const_tree type ATTRIBUTE_UNUSED,
			    bool named ATTRIBUTE_UNUSED)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == TImode)
    cum->count += 2;
  else
    cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */

static unsigned int
nvptx_function_arg_boundary (machine_mode mode, const_tree type)
{
  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);

  if (boundary > BITS_PER_WORD)
    return 2 * BITS_PER_WORD;

  if (mode == BLKmode)
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if (size > 4)
	return 2 * BITS_PER_WORD;
      if (boundary < BITS_PER_WORD)
	{
	  if (size >= 3)
	    return BITS_PER_WORD;
	  if (size >= 2)
	    return 2 * BITS_PER_UNIT;
	}
    }
  return boundary;
}

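/* Worked examples of the rules above, assuming BITS_PER_WORD == 32:
   a 16-byte BLKmode aggregate is aligned to 2 * BITS_PER_WORD = 64 bits;
   a 3-byte aggregate with byte alignment is raised to the 32-bit word
   boundary; and a 2-byte one to 2 * BITS_PER_UNIT = 16 bits.  */
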
/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
		      bool outgoing)
{
  int unsignedp = TYPE_UNSIGNED (type);
  machine_mode orig_mode = TYPE_MODE (type);
  machine_mode mode = promote_function_mode (type, orig_mode,
					     &unsignedp, NULL_TREE, 1);
  if (outgoing)
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (cfun->machine->start_call == NULL_RTX)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
			 const_tree type, bool)
{
  return !PASS_IN_REG_P (mode, type);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  machine_mode mode = TYPE_MODE (type);
  if (!RETURN_IN_REG_P (mode))
    return true;
  return false;
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *punsignedp,
			     const_tree funtype, int for_return)
{
  if (type == NULL_TREE)
    return mode;
  if (for_return)
    return promote_mode (type, mode, punsignedp);
  /* For K&R-style functions, try to match the language promotion rules to
     minimize type mismatches at assembly time.  */
  if (TYPE_ARG_TYPES (funtype) == NULL_TREE
      && type != NULL_TREE
      && !AGGREGATE_TYPE_P (type))
    {
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  return mode;
}

/* Implement TARGET_STATIC_CHAIN.  */

static rtx
nvptx_static_chain (const_tree fndecl, bool incoming_p)
{
  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (incoming_p)
    return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
  else
    return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle register SRC into DST
   across the lanes of a single warp, using shuffle kind KIND and lane
   selector IDX.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

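/* For example, broadcasting an SImode register with nvptx_gen_vcast
   becomes a single shfl.idx from lane 0, whereas a DImode or DFmode
   value is unpacked into two SImode halves, each half shuffled
   separately, and the pair repacked, per the recursion in
   nvptx_gen_shuffle above.  */
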
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

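/* Usage sketch for the flags above: a spill into the broadcast buffer
   is generated with PM_read (the register is read and stored to the
   array), the matching fill with PM_write, and PM_loop_begin and
   PM_loop_end bracket the sequence when it must iterate using the PTR
   field of wcast_data_t.  */
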
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}

/* When loading an operand ORIG_OP, verify whether an address space
   conversion to generic is required, and if so, perform it.  Also
   check for SYMBOL_REFs for function decls and call
   nvptx_record_needed_fndecl as needed.
   Return either the original operand, or the converted one.  */

rtx
nvptx_maybe_convert_symbolic_operand (rtx orig_op)
{
  if (GET_MODE (orig_op) != Pmode)
    return orig_op;

  rtx op = orig_op;
  while (GET_CODE (op) == PLUS || GET_CODE (op) == CONST)
    op = XEXP (op, 0);
  if (GET_CODE (op) != SYMBOL_REF)
    return orig_op;

  tree decl = SYMBOL_REF_DECL (op);
  if (decl && TREE_CODE (decl) == FUNCTION_DECL)
    {
      nvptx_record_needed_fndecl (decl);
      return orig_op;
    }

  addr_space_t as = nvptx_addr_space_from_address (op);
  if (as == ADDR_SPACE_GENERIC)
    return orig_op;

  enum unspec code;
  code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
	  : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
	  : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
	  : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
	  : UNSPEC_FROM_PARAM);
  rtx dest = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, orig_op),
						code)));
  return dest;
}

/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}

/* Implement HARD_REGNO_MODE_OK.  We barely use hard regs, but we want
   to ensure that the return register's mode isn't changed.  */

bool
nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
{
  if (regno != NVPTX_RETURN_REGNUM
      || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
    return true;
  return mode == cfun->machine->ret_reg_mode;
}

/* Convert an address space AS to the corresponding ptx string.  */

const char *
nvptx_section_from_addr_space (addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_CONST:
      return ".const";

    case ADDR_SPACE_GLOBAL:
      return ".global";

    case ADDR_SPACE_SHARED:
      return ".shared";

    case ADDR_SPACE_GENERIC:
      return "";

    default:
      gcc_unreachable ();
    }
}

/* Determine whether DECL goes into .const or .global.  */

const char *
nvptx_section_for_decl (const_tree decl)
{
  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ".const";

  return ".global";
}

/* Look for a SYMBOL_REF in ADDR and return the address space to be used
   for the insn referencing this address.  */

addr_space_t
nvptx_addr_space_from_address (rtx addr)
{
  while (GET_CODE (addr) == PLUS || GET_CODE (addr) == CONST)
    addr = XEXP (addr, 0);
  if (GET_CODE (addr) != SYMBOL_REF)
    return ADDR_SPACE_GENERIC;

  tree decl = SYMBOL_REF_DECL (addr);
  if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
    return ADDR_SPACE_GENERIC;

  bool is_const = (CONSTANT_CLASS_P (decl)
		   || TREE_CODE (decl) == CONST_DECL
		   || TREE_READONLY (decl));
  if (is_const)
    return ADDR_SPACE_CONST;

  return ADDR_SPACE_GLOBAL;
}

/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;

/* Write the necessary separator string to begin a new initializer value.  */

static void
begin_decl_field (void)
{
  /* We never see decl_offset at zero by the time we get here.  */
  if (decl_offset == decl_chunk_size)
    fprintf (asm_out_file, " = { ");
  else
    fprintf (asm_out_file, ", ");
}

/* Output the currently stored chunk as an initializer value.  */

static void
output_decl_chunk (void)
{
  begin_decl_field ();
  output_address (VOIDmode, gen_int_mode (init_part, decl_chunk_mode));
  init_part = 0;
}

/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
	this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      if (decl_offset % decl_chunk_size == 0)
	output_decl_chunk ();

      chunk_offset = 0;
    }
}

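/* A worked example of the chunking above: with a 4-byte chunk size,
   assembling the bytes 0x11 0x22 0x33 0x44 accumulates
   init_part == 0x44332211 (each byte shifted into place at
   BITS_PER_UNIT * chunk_offset) and flushes one .u32 initializer value
   once the chunk fills.  */
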
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  if (GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == CONST)
    {
      gcc_assert (size == decl_chunk_size);
      if (decl_offset % decl_chunk_size != 0)
	sorry ("cannot emit unaligned pointers in ptx assembly");
      decl_offset += size;
      begin_decl_field ();

      HOST_WIDE_INT off = 0;
      if (GET_CODE (x) == CONST)
	x = XEXP (x, 0);
      if (GET_CODE (x) == PLUS)
	{
	  off = INTVAL (XEXP (x, 1));
	  x = XEXP (x, 0);
	}
      if (GET_CODE (x) == SYMBOL_REF)
	{
	  nvptx_record_needed_fndecl (SYMBOL_REF_DECL (x));
	  fprintf (asm_out_file, "generic(");
	  output_address (VOIDmode, x);
	  fprintf (asm_out_file, ")");
	}
      if (off != 0)
	fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, off);
      return true;
    }

  HOST_WIDE_INT val;
  switch (GET_CODE (x))
    {
    case CONST_INT:
      val = INTVAL (x);
      break;
    case CONST_DOUBLE:
      gcc_unreachable ();
      break;
    default:
      gcc_unreachable ();
    }

  nvptx_assemble_value (val, size);
  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  if (decl_offset + size >= object_size)
    {
      if (decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);
      object_finished = true;
      return;
    }

  while (size > decl_chunk_size)
    {
      nvptx_assemble_value (0, decl_chunk_size);
      size -= decl_chunk_size;
    }
  while (size-- > 0)
    nvptx_assemble_value (0, 1);
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (decl_offset != 0)
    {
      if (!object_finished && decl_offset % decl_chunk_size != 0)
	nvptx_assemble_value (0, decl_chunk_size);

      fprintf (asm_out_file, " }");
    }
  fprintf (asm_out_file, ";\n");
}

/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
			 bool is_public)
{
  fprintf (file, "\n// BEGIN%s VAR DEF: ", is_public ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputc ('\n', file);

  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT size)
{
  tree type = TREE_TYPE (exp);
  init_output_initializer (file, name, type, false);
  fprintf (file, "\t.const .align %d .u%d ",
	   TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
	   decl_chunk_size * BITS_PER_UNIT);
  assemble_name (file, name);
  fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
	   (size + decl_chunk_size - 1) / decl_chunk_size);
  object_size = size;
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
	       !TREE_PUBLIC (decl) ? ""
	       : DECL_WEAK (decl) ? ".weak" : ".visible",
	       section, DECL_ALIGN (decl) / BITS_PER_UNIT,
	       decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
	fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
		 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
	object_finished = true;
      object_size = size;
    }
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;
  const char *section = nvptx_section_for_decl (decl);
  fprintf (file, "\n// BEGIN%s VAR DECL: ",
	   TREE_PUBLIC (decl) ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputs ("\n", file);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}

1811/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
1812 involves writing .param declarations and in/out copies into them. For
1813 indirect calls, also write the .callprototype. */
738f2522
BS
1814
1815const char *
1816nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1817{
1818 char buf[256];
1819 static int labelno;
1820 bool needs_tgt = register_operand (callee, Pmode);
1821 rtx pat = PATTERN (insn);
f324806d 1822 int arg_end = XVECLEN (pat, 0);
738f2522
BS
1823 tree decl = NULL_TREE;
1824
1825 fprintf (asm_out_file, "\t{\n");
1826 if (result != NULL)
f324806d
NS
1827 fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
1828 nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
1829 false));
738f2522 1830
ecf6e535 1831 /* Ensure we have a ptx declaration in the output if necessary. */
738f2522
BS
1832 if (GET_CODE (callee) == SYMBOL_REF)
1833 {
1834 decl = SYMBOL_REF_DECL (callee);
1835 if (decl && DECL_EXTERNAL (decl))
1836 nvptx_record_fndecl (decl);
1837 }
1838
1839 if (needs_tgt)
1840 {
1841 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1842 labelno++;
1843 ASM_OUTPUT_LABEL (asm_out_file, buf);
1844 std::stringstream s;
1845 write_func_decl_from_insn (s, result, pat, callee);
1846 fputs (s.str().c_str(), asm_out_file);
1847 }
1848
f324806d 1849 for (int i = 1, argno = 0; i < arg_end; i++)
738f2522 1850 {
f324806d 1851 rtx t = XEXP (XVECEXP (pat, 0, i), 0);
738f2522 1852 machine_mode mode = GET_MODE (t);
d7479262
NS
1853 machine_mode split = maybe_split_mode (mode);
1854 int count = 1;
1855
1856 if (split != VOIDmode)
1857 {
1858 mode = split;
1859 count = 2;
1860 }
738f2522 1861
d7479262 1862 for (int n = 0; n != count; n++)
738f2522 1863 {
d7479262
NS
1864 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
1865 nvptx_ptx_type_from_mode (mode, false), argno,
1866 mode == QImode || mode == HImode ? "[1]" : "");
1867 fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d",
1868 nvptx_ptx_type_from_mode (mode, false), argno,
1869 REGNO (t));
1870 if (split != VOIDmode)
1871 fprintf (asm_out_file, "$%d", n);
1872 fprintf (asm_out_file, ";\n");
1873 argno++;
738f2522 1874 }
d7479262 1875
738f2522
BS
1876 }
1877
1878 fprintf (asm_out_file, "\t\tcall ");
1879 if (result != NULL_RTX)
1880 fprintf (asm_out_file, "(%%retval_in), ");
1881
1882 if (decl)
1883 {
1884 const char *name = get_fnname_from_decl (decl);
1885 name = nvptx_name_replacement (name);
1886 assemble_name (asm_out_file, name);
1887 }
1888 else
cc8ca59e 1889 output_address (VOIDmode, callee);
738f2522 1890
f324806d 1891 if (arg_end > 1 || (decl && DECL_STATIC_CHAIN (decl)))
738f2522 1892 {
1893 const char *comma = "";
1894
738f2522 1895 fprintf (asm_out_file, ", (");
f324806d 1896 for (int i = 1, argno = 0; i < arg_end; i++)
738f2522 1897 {
f324806d 1898 rtx t = XEXP (XVECEXP (pat, 0, i), 0);
738f2522 1899 machine_mode mode = GET_MODE (t);
d7479262 1900 machine_mode split = maybe_split_mode (mode);
738f2522 1901
d7479262 1902 if (split != VOIDmode)
738f2522 1903 {
1904 fprintf (asm_out_file, "%s%%out_arg%d", comma, argno++);
1905 comma = ", ";
738f2522 1906 }
1907 fprintf (asm_out_file, "%s%%out_arg%d", comma, argno++);
1908 comma = ", ";
1909 }
1910 if (decl && DECL_STATIC_CHAIN (decl))
1911 fprintf (asm_out_file, "%s%s", comma,
1912 reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);
1913
1914 fprintf (asm_out_file, ")");
1915 }
f324806d 1916
1917 if (needs_tgt)
1918 {
1919 fprintf (asm_out_file, ", ");
1920 assemble_name (asm_out_file, buf);
1921 }
1922 fprintf (asm_out_file, ";\n");
1923 if (result != NULL_RTX)
cf08c344 1924 return "\tld.param%t0\t%0, [%%retval_in];\n\t}";
1925
1926 return "}";
1927}
1928
1929/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1930
1931static bool
1932nvptx_print_operand_punct_valid_p (unsigned char c)
1933{
1934 return c == '.' || c == '#';
1935}
1936
1937static void nvptx_print_operand (FILE *, rtx, int);
1938
1939/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1940
1941static void
1942nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1943{
1944 rtx off;
1945 if (GET_CODE (x) == CONST)
1946 x = XEXP (x, 0);
1947 switch (GET_CODE (x))
1948 {
1949 case PLUS:
1950 off = XEXP (x, 1);
cc8ca59e 1951 output_address (VOIDmode, XEXP (x, 0));
738f2522 1952 fprintf (file, "+");
cc8ca59e 1953 output_address (VOIDmode, off);
1954 break;
1955
1956 case SYMBOL_REF:
1957 case LABEL_REF:
1958 output_addr_const (file, x);
1959 break;
1960
1961 default:
1962 gcc_assert (GET_CODE (x) != MEM);
1963 nvptx_print_operand (file, x, 0);
1964 break;
1965 }
1966}
1967
1968/* Write assembly language output for the address ADDR to FILE. */
1969
1970static void
cc8ca59e 1971nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
738f2522 1972{
cc8ca59e 1973 nvptx_print_address_operand (file, addr, mode);
1974}
1975
1976/* Print an operand, X, to FILE, with an optional modifier in CODE.
1977
1978 Meaning of CODE:
1979 . -- print the predicate for the instruction or an empty string for an
1980 unconditional one.
1981 # -- print a rounding mode for the instruction
1982
1983 A -- print an address space identifier for a MEM
1984 c -- print an opcode suffix for a comparison operator, including a type code
738f2522 1985 f -- print a full reg even for something that must always be split
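 j -- print "@" and then the operand (predicate of a conditional insn)
 J -- print "@!" and then the operand (negated predicate)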
d88cd9c4 1986 S -- print a shuffle kind specified by CONST_INT
738f2522
BS
1987 t -- print a type opcode suffix, promoting QImode to 32 bits
1988 T -- print a type size in bits
1989 u -- print a type opcode suffix without promotions. */
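/* For example, the template "\tld.param%t0\t%0, [%%retval_in];" used
   in nvptx_output_call_insn above prints "ld.param.u32" for an SImode
   operand 0, since 't' selects a promoted type suffix. */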
1990
1991static void
1992nvptx_print_operand (FILE *file, rtx x, int code)
1993{
1994 rtx orig_x = x;
1995 machine_mode op_mode;
1996
1997 if (code == '.')
1998 {
1999 x = current_insn_predicate;
2000 if (x)
2001 {
2002 unsigned int regno = REGNO (XEXP (x, 0));
2003 fputs ("[", file);
2004 if (GET_CODE (x) == EQ)
2005 fputs ("!", file);
2006 fputs (reg_names [regno], file);
2007 fputs ("]", file);
2008 }
2009 return;
2010 }
2011 else if (code == '#')
2012 {
2013 fputs (".rn", file);
2014 return;
2015 }
2016
2017 enum rtx_code x_code = GET_CODE (x);
2018
2019 switch (code)
2020 {
2021 case 'A':
2022 {
2023 addr_space_t as = nvptx_addr_space_from_address (XEXP (x, 0));
2024 fputs (nvptx_section_from_addr_space (as), file);
2025 }
2026 break;
2027
2028 case 't':
2029 op_mode = nvptx_underlying_object_mode (x);
2030 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
2031 break;
2032
2033 case 'u':
2034 op_mode = nvptx_underlying_object_mode (x);
2035 fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
2036 break;
2037
2038 case 'S':
2039 {
2040 unsigned kind = UINTVAL (x);
2041 static const char *const kinds[] =
2042 {"up", "down", "bfly", "idx"};
2043 fprintf (file, ".%s", kinds[kind]);
2044 }
2045 break;
2046
2047 case 'T':
2048 fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
2049 break;
2050
2051 case 'j':
2052 fprintf (file, "@");
2053 goto common;
2054
2055 case 'J':
2056 fprintf (file, "@!");
2057 goto common;
2058
2059 case 'c':
2060 op_mode = GET_MODE (XEXP (x, 0));
2061 switch (x_code)
2062 {
2063 case EQ:
2064 fputs (".eq", file);
2065 break;
2066 case NE:
2067 if (FLOAT_MODE_P (op_mode))
2068 fputs (".neu", file);
2069 else
2070 fputs (".ne", file);
2071 break;
2072 case LE:
2073 fputs (".le", file);
2074 break;
2075 case GE:
2076 fputs (".ge", file);
2077 break;
2078 case LT:
2079 fputs (".lt", file);
2080 break;
2081 case GT:
2082 fputs (".gt", file);
2083 break;
2084 case LEU:
2085 fputs (".ls", file);
2086 break;
2087 case GEU:
2088 fputs (".hs", file);
2089 break;
2090 case LTU:
2091 fputs (".lo", file);
2092 break;
2093 case GTU:
2094 fputs (".hi", file);
2095 break;
2096 case LTGT:
2097 fputs (".ne", file);
2098 break;
2099 case UNEQ:
2100 fputs (".equ", file);
2101 break;
2102 case UNLE:
2103 fputs (".leu", file);
2104 break;
2105 case UNGE:
2106 fputs (".geu", file);
2107 break;
2108 case UNLT:
2109 fputs (".ltu", file);
2110 break;
2111 case UNGT:
2112 fputs (".gtu", file);
2113 break;
2114 case UNORDERED:
2115 fputs (".nan", file);
2116 break;
2117 case ORDERED:
2118 fputs (".num", file);
2119 break;
2120 default:
2121 gcc_unreachable ();
2122 }
2123 if (FLOAT_MODE_P (op_mode)
2124 || x_code == EQ || x_code == NE
2125 || x_code == GEU || x_code == GTU
2126 || x_code == LEU || x_code == LTU)
2127 fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
2128 else
2129 fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
2130 break;
2131 default:
2132 common:
2133 switch (x_code)
2134 {
2135 case SUBREG:
2136 x = SUBREG_REG (x);
2137 /* fall through */
2138
2139 case REG:
2140 if (HARD_REGISTER_P (x))
2141 fprintf (file, "%s", reg_names[REGNO (x)]);
2142 else
2143 fprintf (file, "%%r%d", REGNO (x));
d7479262 2144 if (code != 'f' && maybe_split_mode (GET_MODE (x)) != VOIDmode)
2145 {
2146 gcc_assert (GET_CODE (orig_x) == SUBREG
d7479262 2147 && maybe_split_mode (GET_MODE (orig_x)) == VOIDmode);
2148 fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
2149 }
2150 break;
2151
2152 case MEM:
2153 fputc ('[', file);
2154 nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
2155 fputc (']', file);
2156 break;
2157
2158 case CONST_INT:
2159 output_addr_const (file, x);
2160 break;
2161
2162 case CONST:
2163 case SYMBOL_REF:
2164 case LABEL_REF:
2165 /* We could use output_addr_const, but that can print things like
2166 "x-8", which breaks ptxas. Need to ensure it is output as
2167 "x+-8". */
2168 nvptx_print_address_operand (file, x, VOIDmode);
2169 break;
2170
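	  /* A worked example: 1.0 in SFmode has the IEEE bit pattern
	     0x3f800000 and so is printed as "0f3f800000". */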
2171 case CONST_DOUBLE:
2172 long vals[2];
34a72c33 2173 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
2174 vals[0] &= 0xffffffff;
2175 vals[1] &= 0xffffffff;
2176 if (GET_MODE (x) == SFmode)
2177 fprintf (file, "0f%08lx", vals[0]);
2178 else
2179 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2180 break;
2181
2182 default:
2183 output_addr_const (file, x);
2184 }
2185 }
2186}
2187\f
2188/* Record replacement regs used to deal with subreg operands. */
2189struct reg_replace
2190{
2191 rtx replacement[MAX_RECOG_OPERANDS];
2192 machine_mode mode;
2193 int n_allocated;
2194 int n_in_use;
2195};
2196
2197/* Allocate or reuse a replacement in R and return the rtx. */
2198
2199static rtx
2200get_replacement (struct reg_replace *r)
2201{
2202 if (r->n_allocated == r->n_in_use)
2203 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2204 return r->replacement[r->n_in_use++];
2205}
2206
2207/* Clean up subreg operands. In ptx assembly, everything is typed, and
2208 the presence of subregs would break the rules for most instructions.
2209 Replace them with a suitable new register of the right size, plus
2210 conversion copyin/copyout instructions. */
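/* For example (a sketch of the transformation): an input operand
   (subreg:SI (reg:QI 30) 0) is replaced by a fresh SImode register R,
   with the copy-in (set R (zero_extend:SI (reg:QI 30))) emitted before
   the insn, while an output operand instead gets the inverse copy-out
   (set (reg:QI 30) (truncate:QI R)) emitted after it. */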
2211
2212static void
517665b3 2213nvptx_reorg_subreg (void)
2214{
2215 struct reg_replace qiregs, hiregs, siregs, diregs;
2216 rtx_insn *insn, *next;
2217
2218 qiregs.n_allocated = 0;
2219 hiregs.n_allocated = 0;
2220 siregs.n_allocated = 0;
2221 diregs.n_allocated = 0;
2222 qiregs.mode = QImode;
2223 hiregs.mode = HImode;
2224 siregs.mode = SImode;
2225 diregs.mode = DImode;
2226
2227 for (insn = get_insns (); insn; insn = next)
2228 {
2229 next = NEXT_INSN (insn);
2230 if (!NONDEBUG_INSN_P (insn)
1fe6befc 2231 || asm_noperands (PATTERN (insn)) >= 0
2232 || GET_CODE (PATTERN (insn)) == USE
2233 || GET_CODE (PATTERN (insn)) == CLOBBER)
2234 continue;
f324806d 2235
2236 qiregs.n_in_use = 0;
2237 hiregs.n_in_use = 0;
2238 siregs.n_in_use = 0;
2239 diregs.n_in_use = 0;
2240 extract_insn (insn);
2241 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
f324806d 2242
2243 for (int i = 0; i < recog_data.n_operands; i++)
2244 {
2245 rtx op = recog_data.operand[i];
2246 if (GET_CODE (op) != SUBREG)
2247 continue;
2248
2249 rtx inner = SUBREG_REG (op);
2250
2251 machine_mode outer_mode = GET_MODE (op);
2252 machine_mode inner_mode = GET_MODE (inner);
2253 gcc_assert (s_ok);
2254 if (s_ok
2255 && (GET_MODE_PRECISION (inner_mode)
2256 >= GET_MODE_PRECISION (outer_mode)))
2257 continue;
2258 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2259 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2260 : outer_mode == HImode ? &hiregs
2261 : outer_mode == SImode ? &siregs
2262 : &diregs);
2263 rtx new_reg = get_replacement (r);
2264
2265 if (recog_data.operand_type[i] != OP_OUT)
2266 {
2267 enum rtx_code code;
2268 if (GET_MODE_PRECISION (inner_mode)
2269 < GET_MODE_PRECISION (outer_mode))
2270 code = ZERO_EXTEND;
2271 else
2272 code = TRUNCATE;
2273
f7df4a84 2274 rtx pat = gen_rtx_SET (new_reg,
2275 gen_rtx_fmt_e (code, outer_mode, inner));
2276 emit_insn_before (pat, insn);
2277 }
2278
2279 if (recog_data.operand_type[i] != OP_IN)
2280 {
2281 enum rtx_code code;
2282 if (GET_MODE_PRECISION (inner_mode)
2283 < GET_MODE_PRECISION (outer_mode))
2284 code = TRUNCATE;
2285 else
2286 code = ZERO_EXTEND;
2287
f7df4a84 2288 rtx pat = gen_rtx_SET (inner,
2289 gen_rtx_fmt_e (code, inner_mode, new_reg));
2290 emit_insn_after (pat, insn);
2291 }
2292 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2293 }
2294 }
517665b3 2295}
738f2522 2296
2297/* Loop structure of the function. The entire function is described as
2298 a NULL loop. */
2299
2300struct parallel
2301{
2302 /* Parent parallel. */
2303 parallel *parent;
2304
2305 /* Next sibling parallel. */
2306 parallel *next;
2307
2308 /* First child parallel. */
2309 parallel *inner;
2310
2311 /* Partitioning mask of the parallel. */
2312 unsigned mask;
2313
2314 /* Partitioning used within inner parallels. */
2315 unsigned inner_mask;
2316
2317 /* Location of parallel forked and join. The forked is the first
2318 block in the parallel and the join is the first block after
2319 the partition. */
2320 basic_block forked_block;
2321 basic_block join_block;
2322
2323 rtx_insn *forked_insn;
2324 rtx_insn *join_insn;
2325
2326 rtx_insn *fork_insn;
2327 rtx_insn *joining_insn;
2328
2329 /* Basic blocks in this parallel, but not in child parallels. The
2330 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2331 blocks are not. */
2332 auto_vec<basic_block> blocks;
2333
2334public:
2335 parallel (parallel *parent, unsigned mode);
2336 ~parallel ();
2337};
2338
2339/* Constructor links the new parallel into its parent's chain of
2340 children. */
2341
2342parallel::parallel (parallel *parent_, unsigned mask_)
2343 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2344{
2345 forked_block = join_block = 0;
2346 forked_insn = join_insn = 0;
2347 fork_insn = joining_insn = 0;
2348
2349 if (parent)
2350 {
2351 next = parent->inner;
2352 parent->inner = this;
2353 }
2354}
2355
2356parallel::~parallel ()
2357{
2358 delete inner;
2359 delete next;
2360}
2361
2362/* Map of basic blocks to insns */
2363typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2364
2365/* A tuple of an insn of interest and the BB in which it resides. */
2366typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2367typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2368
2369/* Split basic blocks so that each forked and join unspec is at
2370 the start of its basic block. Thus afterwards each block will
2371 have a single partitioning mode. We also do the same for return
2372 insns, as they are executed by every thread. Return the
2373 partitioning mode of the function as a whole. Populate MAP with
2374 head and tail blocks. We also clear the BB visited flag, which is
2375 used when finding partitions. */
2376
2377static void
2378nvptx_split_blocks (bb_insn_map_t *map)
2379{
2380 insn_bb_vec_t worklist;
2381 basic_block block;
2382 rtx_insn *insn;
2383
2384 /* Locate all the reorg instructions of interest. */
2385 FOR_ALL_BB_FN (block, cfun)
2386 {
2387 bool seen_insn = false;
2388
2389 /* Clear visited flag, for use by parallel locator */
2390 block->flags &= ~BB_VISITED;
2391
2392 FOR_BB_INSNS (block, insn)
2393 {
2394 if (!INSN_P (insn))
2395 continue;
2396 switch (recog_memoized (insn))
2397 {
2398 default:
2399 seen_insn = true;
2400 continue;
2401 case CODE_FOR_nvptx_forked:
2402 case CODE_FOR_nvptx_join:
2403 break;
2404
2405 case CODE_FOR_return:
2406 /* We also need to split just before return insns, as
2407 that insn needs executing by all threads, but the
2408 block it is in probably does not. */
2409 break;
2410 }
2411
2412 if (seen_insn)
2413 /* We've found an instruction that must be at the start of
2414 a block, but isn't. Add it to the worklist. */
2415 worklist.safe_push (insn_bb_t (insn, block));
2416 else
2417 /* It was already the first instruction. Just add it to
2418 the map. */
2419 map->get_or_insert (block) = insn;
2420 seen_insn = true;
2421 }
2422 }
2423
2424 /* Split blocks on the worklist. */
2425 unsigned ix;
2426 insn_bb_t *elt;
2427 basic_block remap = 0;
2428 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2429 {
2430 if (remap != elt->second)
2431 {
2432 block = elt->second;
2433 remap = block;
2434 }
2435
2436 /* Split block before insn. The insn is in the new block */
2437 edge e = split_block (block, PREV_INSN (elt->first));
2438
2439 block = e->dest;
2440 map->get_or_insert (block) = elt->first;
2441 }
2442}
2443
2444/* BLOCK is a basic block containing a head or tail instruction.
2445 Locate the associated prehead or pretail instruction, which must be
2446 in the single predecessor block. */
2447
2448static rtx_insn *
2449nvptx_discover_pre (basic_block block, int expected)
2450{
2451 gcc_assert (block->preds->length () == 1);
2452 basic_block pre_block = (*block->preds)[0]->src;
2453 rtx_insn *pre_insn;
2454
2455 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2456 pre_insn = PREV_INSN (pre_insn))
2457 gcc_assert (pre_insn != BB_HEAD (pre_block));
2458
2459 gcc_assert (recog_memoized (pre_insn) == expected);
2460 return pre_insn;
2461}
2462
2463/* Dump this parallel and all its inner parallels. */
2464
2465static void
2466nvptx_dump_pars (parallel *par, unsigned depth)
2467{
2468 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2469 depth, par->mask,
2470 par->forked_block ? par->forked_block->index : -1,
2471 par->join_block ? par->join_block->index : -1);
2472
2473 fprintf (dump_file, " blocks:");
2474
2475 basic_block block;
2476 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2477 fprintf (dump_file, " %d", block->index);
2478 fprintf (dump_file, "\n");
2479 if (par->inner)
2480 nvptx_dump_pars (par->inner, depth + 1);
2481
2482 if (par->next)
2483 nvptx_dump_pars (par->next, depth);
2484}
2485
2486/* If BLOCK contains a fork/join marker, process it to create or
2487 terminate a loop structure. Add this block to the current loop,
2488 and then walk successor blocks. */
2489
2490static parallel *
2491nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2492{
2493 if (block->flags & BB_VISITED)
2494 return par;
2495 block->flags |= BB_VISITED;
2496
2497 if (rtx_insn **endp = map->get (block))
2498 {
2499 rtx_insn *end = *endp;
2500
2501 /* This is a block head or tail, or return instruction. */
2502 switch (recog_memoized (end))
2503 {
2504 case CODE_FOR_return:
2505 /* Return instructions are in their own block, and we
2506 don't need to do anything more. */
2507 return par;
2508
2509 case CODE_FOR_nvptx_forked:
2510 /* Loop head, create a new inner loop and add it into
2511 our parent's child list. */
2512 {
2513 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2514
2515 gcc_assert (mask);
2516 par = new parallel (par, mask);
2517 par->forked_block = block;
2518 par->forked_insn = end;
2519 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2520 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2521 par->fork_insn
2522 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2523 }
2524 break;
2525
2526 case CODE_FOR_nvptx_join:
2527 /* A loop tail. Finish the current loop and return to
2528 parent. */
2529 {
2530 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2531
2532 gcc_assert (par->mask == mask);
2533 par->join_block = block;
2534 par->join_insn = end;
2535 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2536 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2537 par->joining_insn
2538 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2539 par = par->parent;
2540 }
2541 break;
2542
2543 default:
2544 gcc_unreachable ();
2545 }
2546 }
2547
2548 if (par)
2549 /* Add this block onto the current loop's list of blocks. */
2550 par->blocks.safe_push (block);
2551 else
2552 /* This must be the entry block. Create a NULL parallel. */
2553 par = new parallel (0, 0);
2554
2555 /* Walk successor blocks. */
2556 edge e;
2557 edge_iterator ei;
2558
2559 FOR_EACH_EDGE (e, ei, block->succs)
2560 nvptx_find_par (map, par, e->dest);
2561
2562 return par;
2563}
2564
2565/* DFS walk the CFG looking for fork & join markers. Construct
2566 loop structures as we go. MAP is a mapping of basic blocks
2567 to head & tail markers, discovered when splitting blocks. This
2568 speeds up the discovery. We rely on the BB visited flag having
2569 been cleared when splitting blocks. */
2570
2571static parallel *
2572nvptx_discover_pars (bb_insn_map_t *map)
2573{
2574 basic_block block;
2575
2576 /* Mark exit blocks as visited. */
2577 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2578 block->flags |= BB_VISITED;
2579
2580 /* And entry block as not. */
2581 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2582 block->flags &= ~BB_VISITED;
2583
2584 parallel *par = nvptx_find_par (map, 0, block);
2585
2586 if (dump_file)
2587 {
2588 fprintf (dump_file, "\nLoops\n");
2589 nvptx_dump_pars (par, 0);
2590 fprintf (dump_file, "\n");
2591 }
2592
2593 return par;
2594}
2595
2596/* Analyse a group of BBs within a partitioned region and create N
2597 Single-Entry-Single-Exit regions. Some of those regions will be
2598 trivial ones consisting of a single BB. The blocks of a
2599 partitioned region might form a set of disjoint graphs -- because
2600 the region encloses a differently partitoned sub region.
2601 the region encloses a differently partitioned sub region.
2602 We use the linear time algorithm described in 'Finding Regions Fast:
2603 Single Entry Single Exit and control Regions in Linear Time'
2604 Johnson, Pearson & Pingali. That algorithm deals with complete
2605 CFGs, where a back edge is inserted from END to START, and thus the
2606 problem becomes one of finding equivalent loops.
2607
2608 In this case we have a partial CFG. We complete it by redirecting
2609 any incoming edge to the graph to be from an arbitrary external BB,
2610 and similarly redirecting any outgoing edge to be to that BB.
2611 Thus we end up with a closed graph.
2612
2613 The algorithm works by building a spanning tree of an undirected
2614 graph and keeping track of back edges from nodes further from the
2615 root in the tree to nodes nearer to the root in the tree. In the
2616 description below, the root is up and the tree grows downwards.
2617
2618 We avoid having to deal with degenerate back-edges to the same
2619 block, by splitting each BB into 3 -- one for input edges, one for
2620 the node itself and one for the output edges. Such back edges are
2621 referred to as 'Brackets'. Cycle equivalent nodes will have the
2622 same set of brackets.
2623
2624 Determining bracket equivalency is done by maintaining a list of
2625 brackets in such a manner that the list length and final bracket
2626 uniquely identify the set.
2627
2628 We use coloring to mark all BBs with cycle equivalency with the
2629 same color. This is the output of the 'Finding Regions Fast'
2630 algorithm. Notice it doesn't actually find the set of nodes within
2631 a particular region, just unordered sets of nodes that are the
2632 entries and exits of SESE regions.
2633
2634 After determining cycle equivalency, we need to find the minimal
2635 set of SESE regions. Do this with a DFS coloring walk of the
2636 complete graph. We're either 'looking' or 'coloring'. When
2637 looking, and we're in the subgraph, we start coloring the color of
2638 the current node, and remember that node as the start of the
2639 current color's SESE region. Every time we go to a new node, we
2640 decrement the count of nodes with that color. If it reaches zero,
2641 we remember that node as the end of the current color's SESE region
2642 and return to 'looking'. Otherwise we color the node the current
2643 color.
2644
2645 This way we end up with coloring the inside of non-trivial SESE
2646 regions with the color of that region. */
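/* A small worked example (not from the paper): in a diamond CFG
   A->B, A->C, B->D, C->D, completed with a back edge D->A, the
   entry A and exit D end up with identical bracket sets and hence
   one color, so (A, D) is discovered as a single SESE region, while
   B and C form trivial single-block regions. */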
2647
2648/* A pair of BBs. We use this to represent SESE regions. */
2649typedef std::pair<basic_block, basic_block> bb_pair_t;
2650typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2651
2652/* A node in the undirected CFG. The discriminator SECOND indicates just
2653 above or just below the BB indicated by FIRST. */
2654typedef std::pair<basic_block, int> pseudo_node_t;
2655
2656/* A bracket indicates an edge towards the root of the spanning tree of the
2657 undirected graph. Each bracket has a color, determined
2658 from the current set of brackets. */
2659struct bracket
2660{
2661 pseudo_node_t back; /* Back target */
2662
2663 /* Current color and size of set. */
2664 unsigned color;
2665 unsigned size;
2666
2667 bracket (pseudo_node_t back_)
2668 : back (back_), color (~0u), size (~0u)
2669 {
2670 }
2671
2672 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2673 {
2674 if (length != size)
2675 {
2676 size = length;
2677 color = color_counts.length ();
2678 color_counts.quick_push (0);
2679 }
2680 color_counts[color]++;
2681 return color;
2682 }
2683};
2684
2685typedef auto_vec<bracket> bracket_vec_t;
2686
2687/* Basic block info for finding SESE regions. */
2688
2689struct bb_sese
2690{
2691 int node; /* Node number in spanning tree. */
2692 int parent; /* Parent node number. */
2693
2694 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2695 edges arrive at pseudo-node Ai and the outgoing edges leave at
2696 pseudo-node Ao. We have to remember which way we arrived at a
2697 particular node when generating the spanning tree. dir > 0 means
2698 we arrived at Ai, dir < 0 means we arrived at Ao. */
2699 int dir;
2700
2701 /* Lowest numbered pseudo-node reached via a backedge from this
2702 node, or any descendant. */
2703 pseudo_node_t high;
2704
2705 int color; /* Cycle-equivalence color */
2706
2707 /* Stack of brackets for this node. */
2708 bracket_vec_t brackets;
2709
2710 bb_sese (unsigned node_, unsigned p, int dir_)
2711 :node (node_), parent (p), dir (dir_)
2712 {
2713 }
2714 ~bb_sese ();
2715
2716 /* Push a bracket ending at BACK. */
2717 void push (const pseudo_node_t &back)
2718 {
2719 if (dump_file)
2720 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2721 back.first ? back.first->index : 0, back.second);
2722 brackets.safe_push (bracket (back));
2723 }
2724
2725 void append (bb_sese *child);
2726 void remove (const pseudo_node_t &);
2727
2728 /* Set node's color. */
2729 void set_color (auto_vec<unsigned> &color_counts)
2730 {
2731 color = brackets.last ().get_color (color_counts, brackets.length ());
2732 }
2733};
2734
2735bb_sese::~bb_sese ()
2736{
2737}
2738
2739/* Destructively append CHILD's brackets. */
2740
2741void
2742bb_sese::append (bb_sese *child)
2743{
2744 if (int len = child->brackets.length ())
2745 {
2746 int ix;
2747
2748 if (dump_file)
2749 {
2750 for (ix = 0; ix < len; ix++)
2751 {
2752 const pseudo_node_t &pseudo = child->brackets[ix].back;
2753 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2754 child->node, pseudo.first ? pseudo.first->index : 0,
2755 pseudo.second);
2756 }
2757 }
2758 if (!brackets.length ())
2759 std::swap (brackets, child->brackets);
2760 else
2761 {
2762 brackets.reserve (len);
2763 for (ix = 0; ix < len; ix++)
2764 brackets.quick_push (child->brackets[ix]);
2765 }
2766 }
2767}
2768
2769/* Remove brackets that terminate at PSEUDO. */
2770
2771void
2772bb_sese::remove (const pseudo_node_t &pseudo)
2773{
2774 unsigned removed = 0;
2775 int len = brackets.length ();
2776
2777 for (int ix = 0; ix < len; ix++)
2778 {
2779 if (brackets[ix].back == pseudo)
2780 {
2781 if (dump_file)
2782 fprintf (dump_file, "Removing backedge %d:%+d\n",
2783 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2784 removed++;
2785 }
2786 else if (removed)
2787 brackets[ix-removed] = brackets[ix];
2788 }
2789 while (removed--)
2790 brackets.pop ();
2791}
2792
2793/* Accessors for BB's aux pointer. */
2794#define BB_SET_SESE(B, S) ((B)->aux = (S))
2795#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2796
2797/* DFS walk creating SESE data structures. Only cover nodes with
2798 BB_VISITED set. Append discovered blocks to LIST. We number in
2799 increments of 3 so that the above and below pseudo nodes can be
2800 implicitly numbered too. */
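/* Hence a node numbered N can use N-1 and N+1 as the numbers of its
   two pseudo-nodes; nvptx_sese_pseudo below reconstructs them as
   node + dir without storing them explicitly. */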
2801
2802static int
2803nvptx_sese_number (int n, int p, int dir, basic_block b,
2804 auto_vec<basic_block> *list)
2805{
2806 if (BB_GET_SESE (b))
2807 return n;
2808
2809 if (dump_file)
2810 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2811 b->index, n, p, dir);
2812
2813 BB_SET_SESE (b, new bb_sese (n, p, dir));
2814 p = n;
2815
2816 n += 3;
2817 list->quick_push (b);
2818
2819 /* First walk the nodes on the 'other side' of this node, then walk
2820 the nodes on the same side. */
2821 for (unsigned ix = 2; ix; ix--)
2822 {
2823 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2824 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2825 : offsetof (edge_def, src));
2826 edge e;
2827 edge_iterator ei;
2828
2829 FOR_EACH_EDGE (e, ei, edges)
2830 {
2831 basic_block target = *(basic_block *)((char *)e + offset);
2832
2833 if (target->flags & BB_VISITED)
2834 n = nvptx_sese_number (n, p, dir, target, list);
2835 }
2836 dir = -dir;
2837 }
2838 return n;
2839}
2840
2841/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2842 EDGES are the outgoing edges and OFFSET is the offset to the src
2843 or dst block on the edges. */
2844
2845static void
2846nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2847 vec<edge, va_gc> *edges, size_t offset)
2848{
2849 edge e;
2850 edge_iterator ei;
2851 int hi_back = depth;
2852 pseudo_node_t node_back (0, depth);
2853 int hi_child = depth;
2854 pseudo_node_t node_child (0, depth);
2855 basic_block child = NULL;
2856 unsigned num_children = 0;
2857 int usd = -dir * sese->dir;
2858
2859 if (dump_file)
2860 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2861 me->index, sese->node, dir);
2862
2863 if (dir < 0)
2864 {
2865 /* This is the above pseudo-child. It has the BB itself as an
2866 additional child node. */
2867 node_child = sese->high;
2868 hi_child = node_child.second;
2869 if (node_child.first)
2870 hi_child += BB_GET_SESE (node_child.first)->node;
2871 num_children++;
2872 }
2873
2874 /* Examine each edge.
2875 - if it is a child (a) append its bracket list and (b) record
2876 whether it is the child with the highest reaching bracket.
2877 - if it is an edge to an ancestor, record whether it's the highest
2878 reaching backlink. */
2879 FOR_EACH_EDGE (e, ei, edges)
2880 {
2881 basic_block target = *(basic_block *)((char *)e + offset);
2882
2883 if (bb_sese *t_sese = BB_GET_SESE (target))
2884 {
2885 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2886 {
2887 /* Child node. Append its bracket list. */
2888 num_children++;
2889 sese->append (t_sese);
2890
2891 /* Compare its hi value. */
2892 int t_hi = t_sese->high.second;
2893
2894 if (basic_block child_hi_block = t_sese->high.first)
2895 t_hi += BB_GET_SESE (child_hi_block)->node;
2896
2897 if (hi_child > t_hi)
2898 {
2899 hi_child = t_hi;
2900 node_child = t_sese->high;
2901 child = target;
2902 }
2903 }
2904 else if (t_sese->node < sese->node + dir
2905 && !(dir < 0 && sese->parent == t_sese->node))
2906 {
2907 /* Non-parental ancestor node -- a backlink. */
2908 int d = usd * t_sese->dir;
2909 int back = t_sese->node + d;
2910
2911 if (hi_back > back)
2912 {
2913 hi_back = back;
2914 node_back = pseudo_node_t (target, d);
2915 }
2916 }
2917 }
2918 else
2919 { /* Fallen off graph, backlink to entry node. */
2920 hi_back = 0;
2921 node_back = pseudo_node_t (0, 0);
2922 }
2923 }
2924
2925 /* Remove any brackets that terminate at this pseudo node. */
2926 sese->remove (pseudo_node_t (me, dir));
2927
2928 /* Now push any backlinks from this pseudo node. */
2929 FOR_EACH_EDGE (e, ei, edges)
2930 {
2931 basic_block target = *(basic_block *)((char *)e + offset);
2932 if (bb_sese *t_sese = BB_GET_SESE (target))
2933 {
2934 if (t_sese->node < sese->node + dir
2935 && !(dir < 0 && sese->parent == t_sese->node))
2936 /* Non-parental ancestor node - backedge from me. */
2937 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2938 }
2939 else
2940 {
2941 /* back edge to entry node */
2942 sese->push (pseudo_node_t (0, 0));
2943 }
2944 }
2945
2946 /* If this node leads directly or indirectly to a no-return region of
2947 the graph, then fake a backedge to entry node. */
2948 if (!sese->brackets.length () || !edges || !edges->length ())
2949 {
2950 hi_back = 0;
2951 node_back = pseudo_node_t (0, 0);
2952 sese->push (node_back);
2953 }
2954
2955 /* Record the highest reaching backedge from us or a descendant. */
2956 sese->high = hi_back < hi_child ? node_back : node_child;
2957
2958 if (num_children > 1)
2959 {
2960 /* There is more than one child -- this is a Y shaped piece of
2961 spanning tree. We have to insert a fake backedge from this
2962 node to the highest ancestor reached by not-the-highest
2963 reaching child. Note that there may be multiple children
2964 with backedges to the same highest node. That's ok and we
2965 insert the edge to that highest node. */
2966 hi_child = depth;
2967 if (dir < 0 && child)
2968 {
2969 node_child = sese->high;
2970 hi_child = node_child.second;
2971 if (node_child.first)
2972 hi_child += BB_GET_SESE (node_child.first)->node;
2973 }
2974
2975 FOR_EACH_EDGE (e, ei, edges)
2976 {
2977 basic_block target = *(basic_block *)((char *)e + offset);
2978
2979 if (target == child)
2980 /* Ignore the highest child. */
2981 continue;
2982
2983 bb_sese *t_sese = BB_GET_SESE (target);
2984 if (!t_sese)
2985 continue;
2986 if (t_sese->parent != sese->node)
2987 /* Not a child. */
2988 continue;
2989
2990 /* Compare its hi value. */
2991 int t_hi = t_sese->high.second;
2992
2993 if (basic_block child_hi_block = t_sese->high.first)
2994 t_hi += BB_GET_SESE (child_hi_block)->node;
2995
2996 if (hi_child > t_hi)
2997 {
2998 hi_child = t_hi;
2999 node_child = t_sese->high;
3000 }
3001 }
3002
3003 sese->push (node_child);
3004 }
3005}
3006
3007
3008/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3009 proceed to successors. Set SESE entry and exit nodes of
3010 REGIONS. */
3011
3012static void
3013nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3014 basic_block block, int coloring)
3015{
3016 bb_sese *sese = BB_GET_SESE (block);
3017
3018 if (block->flags & BB_VISITED)
3019 {
3020 /* If we've already encountered this block, either we must not
3021 be coloring, or it must have been colored the current color. */
3022 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3023 return;
3024 }
3025
3026 block->flags |= BB_VISITED;
3027
3028 if (sese)
3029 {
3030 if (coloring < 0)
3031 {
3032 /* Start coloring a region. */
3033 regions[sese->color].first = block;
3034 coloring = sese->color;
3035 }
3036
3037 if (!--color_counts[sese->color] && sese->color == coloring)
3038 {
3039 /* Found final block of SESE region. */
3040 regions[sese->color].second = block;
3041 coloring = -1;
3042 }
3043 else
3044 /* Color the node, so we can assert on revisiting the node
3045 that the graph is indeed SESE. */
3046 sese->color = coloring;
3047 }
3048 else
3049 /* Fallen off the subgraph, we cannot be coloring. */
3050 gcc_assert (coloring < 0);
3051
3052 /* Walk each successor block. */
3053 if (block->succs && block->succs->length ())
3054 {
3055 edge e;
3056 edge_iterator ei;
3057
3058 FOR_EACH_EDGE (e, ei, block->succs)
3059 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3060 }
3061 else
3062 gcc_assert (coloring < 0);
3063}
3064
3065/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3066 end up with NULL entries in it. */
3067
3068static void
3069nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3070{
3071 basic_block block;
3072 int ix;
3073
3074 /* First clear each BB of the whole function. */
3075 FOR_EACH_BB_FN (block, cfun)
3076 {
3077 block->flags &= ~BB_VISITED;
3078 BB_SET_SESE (block, 0);
3079 }
3080 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3081 block->flags &= ~BB_VISITED;
3082 BB_SET_SESE (block, 0);
3083 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3084 block->flags &= ~BB_VISITED;
3085 BB_SET_SESE (block, 0);
3086
3087 /* Mark blocks in the function that are in this graph. */
3088 for (ix = 0; blocks.iterate (ix, &block); ix++)
3089 block->flags |= BB_VISITED;
3090
3091 /* Counts of nodes assigned to each color. There cannot be more
3092 colors than blocks (and hopefully there will be fewer). */
3093 auto_vec<unsigned> color_counts;
3094 color_counts.reserve (blocks.length ());
3095
3096 /* Worklist of nodes in the spanning tree. Again, there cannot be
3097 more nodes in the tree than blocks (there will be fewer if the
3098 CFG of blocks is disjoint). */
3099 auto_vec<basic_block> spanlist;
3100 spanlist.reserve (blocks.length ());
3101
3102 /* Make sure every block has its cycle class determined. */
3103 for (ix = 0; blocks.iterate (ix, &block); ix++)
3104 {
3105 if (BB_GET_SESE (block))
3106 /* We already met this block in an earlier graph solve. */
3107 continue;
3108
3109 if (dump_file)
3110 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3111
3112 /* Number the nodes reachable from block in initial DFS order. */
3113 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3114
3115 /* Now walk in reverse DFS order to find cycle equivalents. */
3116 while (spanlist.length ())
3117 {
3118 block = spanlist.pop ();
3119 bb_sese *sese = BB_GET_SESE (block);
3120
3121 /* Do the pseudo node below. */
3122 nvptx_sese_pseudo (block, sese, depth, +1,
3123 sese->dir > 0 ? block->succs : block->preds,
3124 (sese->dir > 0 ? offsetof (edge_def, dest)
3125 : offsetof (edge_def, src)));
3126 sese->set_color (color_counts);
3127 /* Do the pseudo node above. */
3128 nvptx_sese_pseudo (block, sese, depth, -1,
3129 sese->dir < 0 ? block->succs : block->preds,
3130 (sese->dir < 0 ? offsetof (edge_def, dest)
3131 : offsetof (edge_def, src)));
3132 }
3133 if (dump_file)
3134 fprintf (dump_file, "\n");
3135 }
3136
3137 if (dump_file)
3138 {
3139 unsigned count;
3140 const char *comma = "";
3141
3142 fprintf (dump_file, "Found %d cycle equivalents\n",
3143 color_counts.length ());
3144 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3145 {
3146 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3147
3148 comma = "";
3149 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3150 if (BB_GET_SESE (block)->color == ix)
3151 {
3152 block->flags |= BB_VISITED;
3153 fprintf (dump_file, "%s%d", comma, block->index);
3154 comma = ",";
3155 }
3156 fprintf (dump_file, "}");
3157 comma = ", ";
3158 }
3159 fprintf (dump_file, "\n");
3160 }
3161
3162 /* Now we've colored every block in the subgraph. We now need to
3163 determine the minimal set of SESE regions that cover that
3164 subgraph. Do this with a DFS walk of the complete function.
3165 During the walk we're either 'looking' or 'coloring'. When we
3166 reach the last node of a particular color, we stop coloring and
3167 return to looking. */
3168
3169 /* There cannot be more SESE regions than colors. */
3170 regions.reserve (color_counts.length ());
3171 for (ix = color_counts.length (); ix--;)
3172 regions.quick_push (bb_pair_t (0, 0));
3173
3174 for (ix = 0; blocks.iterate (ix, &block); ix++)
3175 block->flags &= ~BB_VISITED;
3176
3177 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3178
3179 if (dump_file)
3180 {
3181 const char *comma = "";
3182 int len = regions.length ();
3183
3184 fprintf (dump_file, "SESE regions:");
3185 for (ix = 0; ix != len; ix++)
3186 {
3187 basic_block from = regions[ix].first;
3188 basic_block to = regions[ix].second;
3189
3190 if (from)
3191 {
3192 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3193 if (to != from)
3194 fprintf (dump_file, "->%d", to->index);
3195
3196 int color = BB_GET_SESE (from)->color;
3197
3198 /* Print the blocks within the region (excluding ends). */
3199 FOR_EACH_BB_FN (block, cfun)
3200 {
3201 bb_sese *sese = BB_GET_SESE (block);
3202
3203 if (sese && sese->color == color
3204 && block != from && block != to)
3205 fprintf (dump_file, ".%d", block->index);
3206 }
3207 fprintf (dump_file, "}");
3208 }
3209 comma = ",";
3210 }
3211 fprintf (dump_file, "\n\n");
3212 }
3213
3214 for (ix = 0; blocks.iterate (ix, &block); ix++)
3215 delete BB_GET_SESE (block);
3216}
3217
3218#undef BB_SET_SESE
3219#undef BB_GET_SESE
3220
3221/* Propagate live state at the start of a partitioned region. BLOCK
3222 provides the live register information, and might not contain
3223 INSN. Propagation is inserted just after INSN. RW indicates whether
3224 we are reading and/or writing state. This separation is needed
3225 for worker-level propagation, where we essentially do a spill &
3226 fill. FN is the underlying worker function that generates the
3227 propagation instructions for a single register. DATA is user
3228 data.
3229
3230 We propagate the live register set and the entire frame. We could
3231 do better by (a) propagating just the live set that is used within
3232 the partitioned regions and (b) only propagating stack entries that
3233 are used. The latter might be quite hard to determine. */
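/* A sketch of the frame-copying loop this emits (assuming a frame of
   FS DImode words, FS > 1; see the sequence built below):

     ptr = frame_pointer;  idx = FS;
   loop:
     idx -= 1;
     tmp = *ptr;      // only if RW includes PM_read
     <emit FN (tmp)>  // shuffle, or spill/fill of the broadcast buffer
     *ptr = tmp;      // only if RW includes PM_write
     ptr += 8;
     if (idx != 0) goto loop;
*/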
3234
3235typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3236
3237static void
3238nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3239 propagator_fn fn, void *data)
3240{
3241 bitmap live = DF_LIVE_IN (block);
3242 bitmap_iterator iterator;
3243 unsigned ix;
3244
3245 /* Copy the frame array. */
3246 HOST_WIDE_INT fs = get_frame_size ();
3247 if (fs)
3248 {
3249 rtx tmp = gen_reg_rtx (DImode);
3250 rtx idx = NULL_RTX;
3251 rtx ptr = gen_reg_rtx (Pmode);
3252 rtx pred = NULL_RTX;
3253 rtx_code_label *label = NULL;
3254
3255 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3256 fs /= GET_MODE_SIZE (DImode);
3257 /* Detect single iteration loop. */
3258 if (fs == 1)
3259 fs = 0;
3260
3261 start_sequence ();
3262 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3263 if (fs)
3264 {
3265 idx = gen_reg_rtx (SImode);
3266 pred = gen_reg_rtx (BImode);
3267 label = gen_label_rtx ();
3268
3269 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3270 /* Allow worker function to initialize anything needed. */
3271 rtx init = fn (tmp, PM_loop_begin, fs, data);
3272 if (init)
3273 emit_insn (init);
3274 emit_label (label);
3275 LABEL_NUSES (label)++;
3276 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3277 }
3278 if (rw & PM_read)
3279 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3280 emit_insn (fn (tmp, rw, fs, data));
3281 if (rw & PM_write)
3282 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3283 if (fs)
3284 {
3285 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3286 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3287 emit_insn (gen_br_true_uni (pred, label));
3288 rtx fini = fn (tmp, PM_loop_end, fs, data);
3289 if (fini)
3290 emit_insn (fini);
3291 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3292 }
3293 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3294 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3295 rtx cpy = get_insns ();
3296 end_sequence ();
3297 insn = emit_insn_after (cpy, insn);
3298 }
3299
3300 /* Copy live registers. */
3301 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3302 {
3303 rtx reg = regno_reg_rtx[ix];
3304
3305 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3306 {
3307 rtx bcast = fn (reg, rw, 0, data);
3308
3309 insn = emit_insn_after (bcast, insn);
3310 }
3311 }
3312}
3313
3314/* Worker for nvptx_vpropagate. */
3315
3316static rtx
3317vprop_gen (rtx reg, propagate_mask pm,
3318 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3319{
3320 if (!(pm & PM_read_write))
3321 return 0;
3322
3323 return nvptx_gen_vcast (reg);
3324}
3325
3326/* Propagate state that is live at start of BLOCK across the vectors
3327 of a single warp. Propagation is inserted just after INSN. */
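/* A sketch of the effect: each live register R gets a warp shuffle
   such as "shfl.idx.b32 %r, %r, 0, 31;" (via nvptx_gen_vcast),
   broadcasting lane 0's value to all lanes; the exact template lives
   in the machine description. */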
3328
3329static void
3330nvptx_vpropagate (basic_block block, rtx_insn *insn)
3331{
3332 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3333}
3334
3335/* Worker for nvptx_wpropagate. */
3336
3337static rtx
3338wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3339{
3340 wcast_data_t *data = (wcast_data_t *)data_;
3341
3342 if (pm & PM_loop_begin)
3343 {
3344 /* Starting a loop, initialize pointer. */
3345 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3346
3347 if (align > worker_bcast_align)
3348 worker_bcast_align = align;
3349 data->offset = (data->offset + align - 1) & ~(align - 1);
3350
3351 data->ptr = gen_reg_rtx (Pmode);
3352
3353 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3354 }
3355 else if (pm & PM_loop_end)
3356 {
3357 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3358 data->ptr = NULL_RTX;
3359 return clobber;
3360 }
3361 else
3362 return nvptx_gen_wcast (reg, pm, rep, data);
3363}
3364
3365/* Spill or fill live state that is live at start of BLOCK. PRE_P
3366 indicates if this is just before partitioned mode (do spill), or
3367 just after it starts (do fill). Sequence is inserted just after
3368 INSN. */
3369
3370static void
3371nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3372{
3373 wcast_data_t data;
3374
3375 data.base = gen_reg_rtx (Pmode);
3376 data.offset = 0;
3377 data.ptr = NULL_RTX;
3378
3379 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3380 if (data.offset)
3381 {
3382 /* Stuff was emitted, initialize the base pointer now. */
3383 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3384 emit_insn_after (init, insn);
3385
3386 if (worker_bcast_size < data.offset)
3387 worker_bcast_size = data.offset;
3388 }
3389}
3390
3391/* Emit a worker-level synchronization barrier. We use different
3392 markers for before and after synchronizations. */
3393
3394static rtx
3395nvptx_wsync (bool after)
3396{
3397 return gen_nvptx_barsync (GEN_INT (after));
3398}
3399
3400/* Single neutering according to MASK. FROM is the incoming block and
3401 TO is the outgoing block. These may be the same block. Insert at
3402 start of FROM:
3403
3404 if (tid.<axis>) goto end.
3405
3406 and insert before ending branch of TO (if there is such an insn):
3407
3408 end:
3409 <possibly-broadcast-cond>
3410 <branch>
3411
3412 We currently only use different FROM and TO when skipping an entire
3413 loop. We could do more if we detected superblocks. */
3414
3415static void
3416nvptx_single (unsigned mask, basic_block from, basic_block to)
3417{
3418 rtx_insn *head = BB_HEAD (from);
3419 rtx_insn *tail = BB_END (to);
3420 unsigned skip_mask = mask;
3421
3422 /* Find first insn of from block */
3423 while (head != BB_END (from) && !INSN_P (head))
3424 head = NEXT_INSN (head);
3425
3426 /* Find last insn of to block */
3427 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3428 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3429 tail = PREV_INSN (tail);
3430
3431 /* Detect if tail is a branch. */
3432 rtx tail_branch = NULL_RTX;
3433 rtx cond_branch = NULL_RTX;
3434 if (tail && INSN_P (tail))
3435 {
3436 tail_branch = PATTERN (tail);
3437 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3438 tail_branch = NULL_RTX;
3439 else
3440 {
3441 cond_branch = SET_SRC (tail_branch);
3442 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3443 cond_branch = NULL_RTX;
3444 }
3445 }
3446
3447 if (tail == head)
3448 {
3449 /* If this is empty, do nothing. */
3450 if (!head || !INSN_P (head))
3451 return;
3452
3453 /* If this is a dummy insn, do nothing. */
3454 switch (recog_memoized (head))
3455 {
3456 default:
3457 break;
3458 case CODE_FOR_nvptx_fork:
3459 case CODE_FOR_nvptx_forked:
3460 case CODE_FOR_nvptx_joining:
3461 case CODE_FOR_nvptx_join:
3462 return;
3463 }
3464
3465 if (cond_branch)
3466 {
3467 /* If we're only doing vector single, there's no need to
3468 emit skip code because we'll not insert anything. */
3469 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3470 skip_mask = 0;
3471 }
3472 else if (tail_branch)
3473 /* Block with only unconditional branch. Nothing to do. */
3474 return;
3475 }
3476
3477 /* Insert the vector test inside the worker test. */
3478 unsigned mode;
3479 rtx_insn *before = tail;
3480 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3481 if (GOMP_DIM_MASK (mode) & skip_mask)
3482 {
3483 rtx_code_label *label = gen_label_rtx ();
3484 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3485
3486 if (!pred)
3487 {
3488 pred = gen_reg_rtx (BImode);
3489 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3490 }
3491
3492 rtx br;
3493 if (mode == GOMP_DIM_VECTOR)
3494 br = gen_br_true (pred, label);
3495 else
3496 br = gen_br_true_uni (pred, label);
3497 emit_insn_before (br, head);
3498
3499 LABEL_NUSES (label)++;
3500 if (tail_branch)
3501 before = emit_label_before (label, before);
3502 else
3503 emit_label_after (label, tail);
3504 }
3505
3506 /* Now deal with propagating the branch condition. */
3507 if (cond_branch)
3508 {
3509 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3510
3511 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3512 {
3513 /* Vector mode only, do a shuffle. */
3514 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3515 }
3516 else
3517 {
3518 /* Includes worker mode, do spill & fill. By construction
3519 we should never have worker mode only. */
3520 wcast_data_t data;
3521
3522 data.base = worker_bcast_sym;
3523 data.ptr = 0;
3524
3525 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3526 worker_bcast_size = GET_MODE_SIZE (SImode);
3527
3528 data.offset = 0;
3529 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3530 before);
3531 /* Barrier so other workers can see the write. */
3532 emit_insn_before (nvptx_wsync (false), tail);
3533 data.offset = 0;
3534 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3535 /* This barrier is needed to avoid worker zero clobbering
3536 the broadcast buffer before all the other workers have
3537 had a chance to read this instance of it. */
3538 emit_insn_before (nvptx_wsync (true), tail);
3539 }
3540
3541 extract_insn (tail);
3542 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3543 UNSPEC_BR_UNIFIED);
3544 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3545 }
3546}
3547
3548/* PAR is a parallel that is being skipped in its entirety according to
3549 MASK. Treat this as skipping a superblock starting at forked
3550 and ending at joining. */
3551
3552static void
3553nvptx_skip_par (unsigned mask, parallel *par)
3554{
3555 basic_block tail = par->join_block;
3556 gcc_assert (tail->preds->length () == 1);
3557
3558 basic_block pre_tail = (*tail->preds)[0]->src;
3559 gcc_assert (pre_tail->succs->length () == 1);
3560
3561 nvptx_single (mask, par->forked_block, pre_tail);
3562}
3563
3564/* If PAR has a single inner parallel and PAR itself only contains
3565 empty entry and exit blocks, swallow the inner PAR. */
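/* For example, a worker-partitioned loop whose body is immediately a
   vector-partitioned loop collapses into a single worker+vector
   parallel, removing one redundant fork/join level (see the mask
   merge below). */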
3566
3567static void
3568nvptx_optimize_inner (parallel *par)
3569{
3570 parallel *inner = par->inner;
3571
3572 /* We mustn't be the outer dummy par. */
3573 if (!par->mask)
3574 return;
3575
3576 /* We must have a single inner par. */
3577 if (!inner || inner->next)
3578 return;
3579
3580 /* We must only contain 2 blocks ourselves -- the head and tail of
3581 the inner par. */
3582 if (par->blocks.length () != 2)
3583 return;
3584
3585 /* The partitioning must be disjoint. As we only have vector and
3586 worker partitioning, this is sufficient to guarantee the pars
3587 have adjacent partitioning. */
3588 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3589 /* This indicates malformed code generation. */
3590 return;
3591
3592 /* The outer forked insn should be immediately followed by the inner
3593 fork insn. */
3594 rtx_insn *forked = par->forked_insn;
3595 rtx_insn *fork = BB_END (par->forked_block);
3596
3597 if (NEXT_INSN (forked) != fork)
3598 return;
3599 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3600
3601 /* The outer joining insn must immediately follow the inner join
3602 insn. */
3603 rtx_insn *joining = par->joining_insn;
3604 rtx_insn *join = inner->join_insn;
3605 if (NEXT_INSN (join) != joining)
3606 return;
3607
3608 /* Preconditions met. Swallow the inner par. */
3609 if (dump_file)
3610 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3611 inner->mask, inner->forked_block->index,
3612 inner->join_block->index,
3613 par->mask, par->forked_block->index, par->join_block->index);
3614
3615 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3616
3617 par->blocks.reserve (inner->blocks.length ());
3618 while (inner->blocks.length ())
3619 par->blocks.quick_push (inner->blocks.pop ());
3620
3621 par->inner = inner->inner;
3622 inner->inner = NULL;
3623
3624 delete inner;
3625}
3626
3627/* Process the parallel PAR and all its contained
3628 parallels. We do everything but the neutering. Return mask of
3629 partitioned modes used within this parallel. */
3630
3631static unsigned
3632nvptx_process_pars (parallel *par)
3633{
3634 if (nvptx_optimize)
3635 nvptx_optimize_inner (par);
3636
3637 unsigned inner_mask = par->mask;
3638
3639 /* Do the inner parallels first. */
3640 if (par->inner)
3641 {
3642 par->inner_mask = nvptx_process_pars (par->inner);
3643 inner_mask |= par->inner_mask;
3644 }
3645
3646 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3647 /* No propagation needed for a call. */;
5d306e55 3648 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3649 {
3650 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3651 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3652 /* Insert begin and end synchronizations. */
3653 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3654 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3655 }
3656 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3657 nvptx_vpropagate (par->forked_block, par->forked_insn);
3658
3659 /* Now do siblings. */
3660 if (par->next)
3661 inner_mask |= nvptx_process_pars (par->next);
3662 return inner_mask;
3663}
3664
3665/* Neuter the parallel described by PAR. We recurse in depth-first
3666 order. MODES are the partitioning of the execution and OUTER is
3667 the partitioning of the parallels we are contained in. */
3668
3669static void
3670nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3671{
3672 unsigned me = (par->mask
3673 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3674 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3675 unsigned skip_mask = 0, neuter_mask = 0;
3676
3677 if (par->inner)
3678 nvptx_neuter_pars (par->inner, modes, outer | me);
3679
3680 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3681 {
3682 if ((outer | me) & GOMP_DIM_MASK (mode))
3683 {} /* Mode is partitioned: no neutering. */
3684 else if (!(modes & GOMP_DIM_MASK (mode)))
5d306e55 3685 {} /* Mode is not used: nothing to do. */
d88cd9c4
NS
3686 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3687 || !par->forked_insn)
3688 /* Partitioned in inner parallels, or we're not partitioned
3689 at all: neuter individual blocks. */
3690 neuter_mask |= GOMP_DIM_MASK (mode);
3691 else if (!par->parent || !par->parent->forked_insn
3692 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3693 /* Parent isn't a parallel, or this mode is already partitioned
3694 within it: skip the parallel at this level. */
3695 skip_mask |= GOMP_DIM_MASK (mode);
3696 else
3697 {} /* Parent will skip this parallel itself. */
3698 }
3699
3700 if (neuter_mask)
3701 {
912442c2 3702 int ix, len;
d88cd9c4 3703
3704 if (nvptx_optimize)
3705 {
3706 /* Neuter whole SESE regions. */
3707 bb_pair_vec_t regions;
3708
3709 nvptx_find_sese (par->blocks, regions);
3710 len = regions.length ();
3711 for (ix = 0; ix != len; ix++)
3712 {
3713 basic_block from = regions[ix].first;
3714 basic_block to = regions[ix].second;
3715
3716 if (from)
3717 nvptx_single (neuter_mask, from, to);
3718 else
3719 gcc_assert (!to);
3720 }
3721 }
3722 else
d88cd9c4 3723 {
3724 /* Neuter each BB individually. */
3725 len = par->blocks.length ();
3726 for (ix = 0; ix != len; ix++)
3727 {
3728 basic_block block = par->blocks[ix];
d88cd9c4 3729
3730 nvptx_single (neuter_mask, block, block);
3731 }
3732 }
3733 }
3734
3735 if (skip_mask)
3736 nvptx_skip_par (skip_mask, par);
3737
3738 if (par->next)
3739 nvptx_neuter_pars (par->next, modes, outer);
3740}
3741
517665b3 3742/* PTX-specific reorganization
d88cd9c4 3743 - Split blocks at fork and join instructions
3744 - Compute live registers
3745 - Mark now-unused registers, so function begin doesn't declare
517665b3 3746 unused registers.
3747 - Insert state propagation when entering partitioned mode
3748 - Insert neutering instructions when in single mode
c38f0d8c 3749 - Replace subregs with suitable sequences.
3750*/
3751
3752static void
3753nvptx_reorg (void)
3754{
3755 /* We are freeing block_for_insn in the toplev to keep compatibility
3756 with old MDEP_REORGS that are not CFG based. Recompute it now. */
3757 compute_bb_for_insn ();
3758
3759 thread_prologue_and_epilogue_insns ();
3760
3761 /* Split blocks and record interesting unspecs. */
3762 bb_insn_map_t bb_insn_map;
3763
3764 nvptx_split_blocks (&bb_insn_map);
3765
c38f0d8c 3766 /* Compute live regs */
3767 df_clear_flags (DF_LR_RUN_DCE);
3768 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
3769 df_live_add_problem ();
3770 df_live_set_all_dirty ();
517665b3 3771 df_analyze ();
3772 regstat_init_n_sets_and_refs ();
3773
3774 if (dump_file)
3775 df_dump (dump_file);
3776
517665b3 3777 /* Mark unused regs as unused. */
d88cd9c4 3778 int max_regs = max_reg_num ();
517665b3 3779 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
3780 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
3781 regno_reg_rtx[i] = const0_rtx;
517665b3 3782
3783 /* Determine launch dimensions of the function. If it is not an
3784 offloaded function (i.e. this is a regular compiler), the
3785 function has no neutering. */
3786 tree attr = get_oacc_fn_attrib (current_function_decl);
3787 if (attr)
3788 {
3789 /* If we determined this mask before RTL expansion, we could
3790 elide emission of some levels of forks and joins. */
3791 unsigned mask = 0;
3792 tree dims = TREE_VALUE (attr);
3793 unsigned ix;
3794
3795 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
3796 {
3797 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
3798 tree allowed = TREE_PURPOSE (dims);
3799
3800 if (size != 1 && !(allowed && integer_zerop (allowed)))
3801 mask |= GOMP_DIM_MASK (ix);
3802 }
3803 /* If there is worker neutering, there must be vector
3804 neutering. Otherwise the hardware will fail. */
3805 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3806 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3807
3808 /* Discover & process partitioned regions. */
3809 parallel *pars = nvptx_discover_pars (&bb_insn_map);
3810 nvptx_process_pars (pars);
3811 nvptx_neuter_pars (pars, mask, 0);
3812 delete pars;
3813 }
3814
517665b3 3815 /* Replace subregs. */
c03b0416 3816 nvptx_reorg_subreg ();
517665b3 3817
738f2522 3818 regstat_free_n_sets_and_refs ();
517665b3
NS
3819
3820 df_finish_pass (true);
738f2522
BS
3821}
3822\f
/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }
  else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
\f
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be used with a
   host-side compiler that does support anchoring, and hence see the
   anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
\f
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = get_oacc_fn_attrib (decl);
	tree dims = TREE_VALUE (attr);
	unsigned ix;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}
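
/* As an illustration, assuming the usual 64-bit Pmode, the preamble
   emitted above reads:

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE  */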

/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl, true);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    {
      /* Define the broadcast buffer.  */

      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
	& ~(worker_bcast_align - 1);

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_bcast_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_bcast_align,
	       worker_bcast_name, worker_bcast_size);
    }

  if (worker_red_size)
    {
      /* Define the reduction buffer.  */

      worker_red_size = ((worker_red_size + worker_red_align - 1)
			 & ~(worker_red_align - 1));

      fprintf (asm_out_file, "\n// BEGIN VAR DEF: %s\n", worker_red_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_red_align,
	       worker_red_name, worker_red_size);
    }
}
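
/* For instance, with worker_bcast_align == 8 and 6 bytes in use,
   worker_bcast_size rounds up to 8 and the emitted definition has the
   form

     .shared .align 8 .u8 <worker_bcast_name>[8];

   where <worker_bcast_name> stands for the buffer's symbol name.  */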

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}
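
/* The insn emitted here ultimately becomes a PTX warp shuffle,
   approximately of the form "shfl.down.b32 %dst, %src, %idx, 31;" for
   a 32-bit SHUFFLE_DOWN operation.  */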

/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  emit_insn (gen_rtx_SET (target, worker_red_sym));

  if (offset)
    emit_insn (gen_rtx_SET (target,
			    gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));

  emit_insn (gen_rtx_SET (target,
			  gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
					  UNSPEC_FROM_SHARED)));

  return target;
}

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}
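
/* The generated insn expands to a PTX atomic compare-and-swap, roughly
   "atom.cas.b32 %dst, [%mem], %cmp, %src;" in the SImode case, and the
   .b64 form for DImode.  */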

/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
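
/* In C terms (the first type in each list is the return type, and ST
   reads as size_t), the declarations created above are:

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
						   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
						    unsigned long long,
						    unsigned long long);  */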

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
\f
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
	warning_at (DECL_SOURCE_LOCATION (decl), 0,
		    dims[GOMP_DIM_VECTOR]
		    ? "using vector_length (%d), ignoring %d"
		    : "using vector_length (%d), ignoring runtime setting",
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check that num_workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (DECL_SOURCE_LOCATION (decl), 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  return changed;
}
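
/* For example, an offload region requesting vector_length (64) and
   num_workers (64) is clamped to 32 in both axes, with the warnings
   "using vector_length (32), ignoring 64" and
   "using num_workers (32), ignoring 64".  */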

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}
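
/* E.g. with validated dims {G, 1, 32}: a gang-axis marker returns
   false (axis < GOMP_DIM_WORKER), the worker axis is size 1 and so
   needs no fork & join, and only the vector axis (size 32) keeps its
   markers.  */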

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
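
/* For a 4-byte int at offset OFF this builds, in effect,
   (int *) __builtin_nvptx_worker_addr (OFF, 4, 4).  */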

/* Emit a SHFL.DOWN using shift amount SHIFT of VAR into DEST_VAR.  This
   function will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}
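
/* For example, shuffling a double VAR down by SHIFT lanes builds the
   expression

     DEST_VAR = VIEW_CONVERT_EXPR<double>
		  (__builtin_nvptx_shufflell
		     (VIEW_CONVERT_EXPR<long long unsigned> (VAR),
		      SHIFT, SHUFFLE_DOWN));  */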

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
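
/* As a concrete case, for a float addition the loop body built above
   computes

     write = VC<unsigned> (VC<float> (expect) + var);
     actual = __builtin_nvptx_cmp_swap (ptr, expect, write);

   (VC = VIEW_CONVERT_EXPR), looping while ACTUAL differs from EXPECT.
   Comparing the int-ified values makes the latch test a plain bitwise
   equality, sidestepping floating-point equality issues such as
   NaN != NaN.  */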

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
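
/* E.g. int, float and double accumulators match one of the two sizes
   and take the lockless path; a _Complex double (128 bits) falls back
   to the lock-based scheme.  */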

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector lanes other than lane 0 to INIT_VAL (OP);
	 lane 0 keeps the incoming VAR.  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}
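
/* Taken together, for a worker-level reduction: SETUP copies the
   incoming value into the worker buffer, INIT establishes the identity
   value for each participant, FINI folds the partial values into the
   buffer via nvptx_reduction_update, and TEARDOWN reads the final
   value back and stores it to the receiver object, if any.  */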

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"