/* Target code for NVPTX.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"

/* This file should be included last.  */
#include "target-def.h"

/* The kind of shuffle instruction.  */
enum nvptx_shuffle_kind
{
  SHUFFLE_UP,
  SHUFFLE_DOWN,
  SHUFFLE_BFLY,
  SHUFFLE_IDX,
  SHUFFLE_MAX
};

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

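/* Worked example (editorial sketch, not part of the original source):
   the area occupies three bits starting at SYMBOL_FLAG_MACH_DEP_SHIFT,
   so one round-trip looks like

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   Note that SET_SYMBOL_DATA_AREA only ORs bits in; it relies on the
   field being zero beforehand, which holds because each symbol's area
   is encoded exactly once.  */
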
/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (write_symbols == DBX_DEBUG)
    /* The stabs testcases want to know stabs isn't supported.  */
    sorry ("stabs debug format not supported");

  /* Actually we don't have any debug format, but don't be
     unnecessarily noisy.  */
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      gcc_unreachable ();
    }
}

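/* For example (illustrative): nvptx_ptx_type_from_mode (QImode, true)
   yields ".u32", since PTX has no 8-bit arithmetic registers, while
   nvptx_ptx_type_from_mode (QImode, false) yields ".u8", as used for
   memory and parameter layout.  */
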
/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	/* TODO: This would be a good place to check for a .shared or
	   other section name.  */
	area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

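/* Illustratively: maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, so both are handled as a pair
   of registers of the inner mode, whereas maybe_split_mode (SImode)
   is VOIDmode and the register is emitted as-is.  */
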
/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

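/* For a hypothetical TImode pseudo 23 split into DImode halves,
   printing the operand (subreg_offset of -1, the default) emits
   "{%r23$8,%r23$0}"; the '$' suffix selects the word at the given
   byte offset within the split register pair.  */
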
/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}

\f
/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}

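/* As a sketch of the two shapes (not part of the original source):
   for argno 0 of SImode, a prototype (FOR_REG < 0) appends
     " (.param.u32 %in_ar0"
   while a prologue copy with FOR_REG == 0 appends
     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];  */
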
/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, -1, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

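/* Illustrative output (a sketch assuming the conventional "%value"
   return register name) for a public definition of "int f (int x)":

     // BEGIN GLOBAL FUNCTION DECL: f
     .visible .func (.param.u32 %value_out) f (.param.u32 %in_ar0);
     // BEGIN GLOBAL FUNCTION DEF: f
     .visible .func (.param.u32 %value_out) f (.param.u32 %in_ar0)
*/
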
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}

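/* For instance (a sketch assuming the 64-bit ABI and the conventional
   "%frame" register name), a 16-byte frame aligned to 8 produces

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   while a zero-sized frame emits "mov.u64 %frame, 0;" instead of the
   array and the cvta.  */
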
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}

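/* E.g. nvptx_init_axis_predicate (file, 7, "y") emits

     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r7, %y, 0;
     }

   (illustrative; the predicate register number comes from the pseudo
   recorded in cfun->machine->axis_predicate).  */
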
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Declare a local var for outgoing varargs.  */
  if (cfun->machine->has_varadic)
    init_frame (file, STACK_POINTER_REGNUM,
		UNITS_PER_WORD, crtl->outgoing_args_size);

  /* Declare a local variable for the frame.  */
  HOST_WIDE_INT sz = get_frame_size ();
  if (sz || cfun->machine->has_chain)
    init_frame (file, FRAME_POINTER_REGNUM,
		crtl->stack_alignment_needed / BITS_PER_UNIT, sz);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split != VOIDmode)
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}

/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fprintf (file, "}\n");
}
\f
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}

/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}

/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}

/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}

/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
   track of whether calls involving static chains or varargs were seen
   in the current function.
   For libcalls, maintain a hash table of decls we have seen, and
   record a function decl for later when encountering a new one.  */

void
nvptx_expand_call (rtx retval, rtx address)
{
  rtx callee = XEXP (address, 0);
  rtx varargs = NULL_RTX;
  unsigned parallel = 0;

  if (!call_insn_operand (callee, Pmode))
    {
      callee = force_reg (Pmode, callee);
      address = change_address (address, QImode, callee);
    }

  if (GET_CODE (callee) == SYMBOL_REF)
    {
      tree decl = SYMBOL_REF_DECL (callee);
      if (decl != NULL_TREE)
	{
	  if (DECL_STATIC_CHAIN (decl))
	    cfun->machine->has_chain = true;

	  tree attr = get_oacc_fn_attrib (decl);
	  if (attr)
	    {
	      tree dims = TREE_VALUE (attr);

	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
		{
		  if (TREE_PURPOSE (dims)
		      && !integer_zerop (TREE_PURPOSE (dims)))
		    break;
		  /* Not on this axis.  */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle SRC into DST across
   the lanes of a single warp, according to lane selector IDX and
   shuffle kind KIND.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}

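/* Illustratively, an SImode SHUFFLE_IDX with index 0 -- a broadcast
   from lane 0 -- should come out as approximately

     shfl.idx.b32 %r_dst, %r_src, 0, 31;

   (a sketch; the exact template lives in nvptx.md).  Wider modes are
   handled by the unpack/shuffle/pack recursion above.  */
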
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}

/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };

/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
\f
/* Returns true if X is a valid address for use in a memory reference.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
\f
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining fragments to be written
				       out.  */
  unsigned size;  /* Fragment size to accumulate.  */
  unsigned offset;  /* Offset within current fragment.  */
  bool started;   /* Whether we've output any initializer.  */
} init_frag;

/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      fprintf (asm_out_file, val ? ") + " : ")");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}

/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      if (part > size)
	part = size;

      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}

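/* Sketch of the buffering (editorial, not part of the original
   source): with init_frag.size == 4, assembling the bytes 1, 2, 3, 4
   one at a time accumulates the little-endian value 0x04030201 and
   emits a single fragment, printed in decimal as 67305985, once the
   fourth byte completes it.  */
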
/* Target hook for assembling integer object X of size SIZE.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}

/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      if (part > size)
	part = (unsigned) size;
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      if (size)
	nvptx_assemble_value (0, size);
    }
}

/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  */

void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}

/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round from what PTX wants!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align)
{
  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
}

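/* For instance, a file-scope "int arr[3] = {1, 2, 3};" begins as
   roughly

     .visible .global .align 4 .u32 arr[3]

   (linkage marker from the caller), after which the machinery above
   appends " = { 1, 2, 3 };" fragment by fragment.  */
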
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}

/* Output an uninitialized common or file-scope variable.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}

/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}

/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}

/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}

/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl));
  nvptx_assemble_decl_end ();
}

/* Output a pattern for a move instruction.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    return "%.\tmov.b%T0\t%0, %1;";

  return "%.\tcvt%t0%t1\t%0, %1;";
}

738f2522 1771/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
ecf6e535
BS
1772 involves writing .param declarations and in/out copies into them. For
1773 indirect calls, also write the .callprototype. */
738f2522
BS
1774
1775const char *
1776nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
1777{
863af9a4 1778 char buf[16];
738f2522
BS
1779 static int labelno;
1780 bool needs_tgt = register_operand (callee, Pmode);
1781 rtx pat = PATTERN (insn);
f324806d 1782 int arg_end = XVECLEN (pat, 0);
738f2522
BS
1783 tree decl = NULL_TREE;
1784
1785 fprintf (asm_out_file, "\t{\n");
1786 if (result != NULL)
1787 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
1788 nvptx_ptx_type_from_mode (GET_MODE (result), false),
1789 reg_names[NVPTX_RETURN_REGNUM]);
1790
1791  /* Ensure we have a ptx declaration in the output if necessary. */
1792 if (GET_CODE (callee) == SYMBOL_REF)
1793 {
1794 decl = SYMBOL_REF_DECL (callee);
1795 if (!decl
1796 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
1797 nvptx_record_libfunc (callee, result, pat);
1798 else if (DECL_EXTERNAL (decl))
1799 nvptx_record_fndecl (decl);
1800 }
1801
1802 if (needs_tgt)
1803 {
1804 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
1805 labelno++;
1806 ASM_OUTPUT_LABEL (asm_out_file, buf);
1807 std::stringstream s;
1808      write_fn_proto_from_insn (s, NULL, result, pat);
1809 fputs (s.str().c_str(), asm_out_file);
1810 }
1811
1812  for (int argno = 1; argno < arg_end; argno++)
1813    {
1814      rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
1815      machine_mode mode = GET_MODE (t);
1816      const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
1817
1818      /* Mode splitting has already been done. */
1819 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
1820 "\t\tst.param%s [%%out_arg%d], ",
1821 ptx_type, argno, ptx_type, argno);
1822 output_reg (asm_out_file, REGNO (t), VOIDmode);
1823 fprintf (asm_out_file, ";\n");
1824 }
1825
1826 fprintf (asm_out_file, "\t\tcall ");
1827 if (result != NULL_RTX)
1828 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
1829
1830 if (decl)
1831 {
1832 const char *name = get_fnname_from_decl (decl);
1833 name = nvptx_name_replacement (name);
1834 assemble_name (asm_out_file, name);
1835 }
1836 else
1837    output_address (VOIDmode, callee);
1838
1839 const char *open = "(";
1840 for (int argno = 1; argno < arg_end; argno++)
1841    {
1842 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
1843 open = "";
1844    }
1845 if (decl && DECL_STATIC_CHAIN (decl))
1846 {
1847      fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
1848 open = "";
1849 }
1850 if (!open[0])
1851 fprintf (asm_out_file, ")");
1852
1853 if (needs_tgt)
1854 {
1855 fprintf (asm_out_file, ", ");
1856 assemble_name (asm_out_file, buf);
1857 }
1858 fprintf (asm_out_file, ";\n");
1859
1860 if (find_reg_note (insn, REG_NORETURN, NULL))
1861    /* No-return functions confuse the PTX JIT, as it doesn't realize
1862 the flow control barrier they imply. It can seg fault if it
1863 encounters what looks like an unexitable loop. Emit a trailing
1864 trap, which it does grok. */
1865 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
1866
1867 if (result)
1868 {
1869 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
1870
1871 if (!rval[0])
1872 /* We must escape the '%' that starts RETURN_REGNUM. */
1873 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
1874 reg_names[NVPTX_RETURN_REGNUM]);
1875 return rval;
1876 }
1877
1878 return "}";
1879}
1880
1881/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1882
1883static bool
1884nvptx_print_operand_punct_valid_p (unsigned char c)
1885{
1886  return c == '.' || c == '#';
1887}
1888
1889static void nvptx_print_operand (FILE *, rtx, int);
1890
1891/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1892
1893static void
1894nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1895{
1896 rtx off;
1897 if (GET_CODE (x) == CONST)
1898 x = XEXP (x, 0);
1899 switch (GET_CODE (x))
1900 {
1901 case PLUS:
1902 off = XEXP (x, 1);
1903      output_address (VOIDmode, XEXP (x, 0));
1904      fprintf (file, "+");
1905      output_address (VOIDmode, off);
1906 break;
1907
1908 case SYMBOL_REF:
1909 case LABEL_REF:
1910 output_addr_const (file, x);
1911 break;
1912
1913 default:
1914 gcc_assert (GET_CODE (x) != MEM);
1915 nvptx_print_operand (file, x, 0);
1916 break;
1917 }
1918}
1919
1920/* Write assembly language output for the address ADDR to FILE. */
1921
1922static void
1923nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
1924{
1925  nvptx_print_address_operand (file, addr, mode);
1926}
1927
1928/* Print an operand, X, to FILE, with an optional modifier in CODE.
1929
1930 Meaning of CODE:
1931   . -- print the predicate for the instruction or an empty string for an
1932 unconditional one.
1933 # -- print a rounding mode for the instruction
1934
1935   A -- print a data area for a MEM
1936   c -- print an opcode suffix for a comparison operator, including a type code
1937   D -- print a data area for a MEM operand
1938   S -- print a shuffle kind specified by CONST_INT
1939 t -- print a type opcode suffix, promoting QImode to 32 bits
1940 T -- print a type size in bits
1941 u -- print a type opcode suffix without promotions. */
1942
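/* For example (purely illustrative register names): in a template such
   as "%.\tmov%t0\t%0, %1;" operating on predicated SImode operands,
   "%." prints "[%r1]" (or "[!%r1]" for an inverted predicate) and
   "%t0" prints ".u32", giving "[%r1] mov.u32 %r2, %r3;".  */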
1943static void
1944nvptx_print_operand (FILE *file, rtx x, int code)
1945{
1946 if (code == '.')
1947 {
1948 x = current_insn_predicate;
1949 if (x)
1950 {
1951 unsigned int regno = REGNO (XEXP (x, 0));
1952 fputs ("[", file);
1953 if (GET_CODE (x) == EQ)
1954 fputs ("!", file);
1955 fputs (reg_names [regno], file);
1956 fputs ("]", file);
1957 }
1958 return;
1959 }
1960 else if (code == '#')
1961 {
1962 fputs (".rn", file);
1963 return;
1964 }
1965
1966 enum rtx_code x_code = GET_CODE (x);
1967  machine_mode mode = GET_MODE (x);
1968
1969 switch (code)
1970 {
1971 case 'A':
1972 x = XEXP (x, 0);
1973 /* FALLTHROUGH. */
1974
1975 case 'D':
1976 if (GET_CODE (x) == CONST)
1977 x = XEXP (x, 0);
1978 if (GET_CODE (x) == PLUS)
1979 x = XEXP (x, 0);
1980
1981 if (GET_CODE (x) == SYMBOL_REF)
1982 fputs (section_for_sym (x), file);
1983 break;
1984
1985    case 't':
1986    case 'u':
1987 if (x_code == SUBREG)
1988 {
1989 mode = GET_MODE (SUBREG_REG (x));
1990 if (mode == TImode)
1991 mode = DImode;
1992 else if (COMPLEX_MODE_P (mode))
1993 mode = GET_MODE_INNER (mode);
1994 }
1995 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
1996 break;
1997
1998 case 'S':
1999 {
2000 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2001 /* Same order as nvptx_shuffle_kind. */
2002	static const char *const kinds[] =
2003 {".up", ".down", ".bfly", ".idx"};
2004 fputs (kinds[kind], file);
2005 }
2006 break;
2007
2008    case 'T':
2009      fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2010 break;
2011
2012 case 'j':
2013 fprintf (file, "@");
2014 goto common;
2015
2016 case 'J':
2017 fprintf (file, "@!");
2018 goto common;
2019
2020 case 'c':
2021      mode = GET_MODE (XEXP (x, 0));
2022 switch (x_code)
2023 {
2024 case EQ:
2025 fputs (".eq", file);
2026 break;
2027 case NE:
2028	  if (FLOAT_MODE_P (mode))
2029 fputs (".neu", file);
2030 else
2031 fputs (".ne", file);
2032 break;
2033 case LE:
2034	case LEU:
2035 fputs (".le", file);
2036 break;
2037 case GE:
2038	case GEU:
2039 fputs (".ge", file);
2040 break;
2041 case LT:
2042	case LTU:
2043 fputs (".lt", file);
2044 break;
2045 case GT:
2046	case GTU:
2047	  fputs (".gt", file);
2048 break;
2049 case LTGT:
2050 fputs (".ne", file);
2051 break;
2052 case UNEQ:
2053 fputs (".equ", file);
2054 break;
2055 case UNLE:
2056 fputs (".leu", file);
2057 break;
2058 case UNGE:
2059 fputs (".geu", file);
2060 break;
2061 case UNLT:
2062 fputs (".ltu", file);
2063 break;
2064 case UNGT:
2065 fputs (".gtu", file);
2066 break;
2067 case UNORDERED:
2068 fputs (".nan", file);
2069 break;
2070 case ORDERED:
2071 fputs (".num", file);
2072 break;
2073 default:
2074 gcc_unreachable ();
2075 }
2076      if (FLOAT_MODE_P (mode)
2077 || x_code == EQ || x_code == NE
2078 || x_code == GEU || x_code == GTU
2079 || x_code == LEU || x_code == LTU)
2080	fputs (nvptx_ptx_type_from_mode (mode, true), file);
2081      else
2082	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2083 break;
2084 default:
2085 common:
2086 switch (x_code)
2087 {
2088 case SUBREG:
2089 {
2090 rtx inner_x = SUBREG_REG (x);
2091 machine_mode inner_mode = GET_MODE (inner_x);
2092 machine_mode split = maybe_split_mode (inner_mode);
2093
2094 if (split != VOIDmode
2095 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2096 output_reg (file, REGNO (inner_x), split);
2097 else
2098 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2099 }
2100 break;
2101
2102 case REG:
2103	  output_reg (file, REGNO (x), maybe_split_mode (mode));
2104 break;
2105
2106 case MEM:
2107 fputc ('[', file);
2108	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
2109 fputc (']', file);
2110 break;
2111
2112 case CONST_INT:
2113 output_addr_const (file, x);
2114 break;
2115
2116 case CONST:
2117 case SYMBOL_REF:
2118 case LABEL_REF:
2119 /* We could use output_addr_const, but that can print things like
2120 "x-8", which breaks ptxas. Need to ensure it is output as
2121 "x+-8". */
2122 nvptx_print_address_operand (file, x, VOIDmode);
2123 break;
2124
2125 case CONST_DOUBLE:
2126 long vals[2];
2127	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2128 vals[0] &= 0xffffffff;
2129 vals[1] &= 0xffffffff;
2130	  if (mode == SFmode)
2131 fprintf (file, "0f%08lx", vals[0]);
2132 else
2133 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2134 break;
2135
2136 default:
2137 output_addr_const (file, x);
2138 }
2139 }
2140}
2141\f
2142/* Record replacement regs used to deal with subreg operands. */
2143struct reg_replace
2144{
2145 rtx replacement[MAX_RECOG_OPERANDS];
2146 machine_mode mode;
2147 int n_allocated;
2148 int n_in_use;
2149};
2150
2151/* Allocate or reuse a replacement in R and return the rtx. */
2152
2153static rtx
2154get_replacement (struct reg_replace *r)
2155{
2156 if (r->n_allocated == r->n_in_use)
2157 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2158 return r->replacement[r->n_in_use++];
2159}
2160
2161/* Clean up subreg operands. In ptx assembly, everything is typed, and
2162 the presence of subregs would break the rules for most instructions.
2163 Replace them with a suitable new register of the right size, plus
2164 conversion copyin/copyout instructions. */
2165
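/* As an illustrative example: an input operand (subreg:SI (reg:QI R))
   is replaced below by a fresh SImode register NEW, preceded by the
   copy NEW = (zero_extend:SI (reg:QI R)); for an output operand the
   copy-out (reg:QI R) = (truncate:QI NEW) is emitted after the insn
   instead.  */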
2166static void
2167nvptx_reorg_subreg (void)
2168{
2169 struct reg_replace qiregs, hiregs, siregs, diregs;
2170 rtx_insn *insn, *next;
2171
738f2522
BS
2172 qiregs.n_allocated = 0;
2173 hiregs.n_allocated = 0;
2174 siregs.n_allocated = 0;
2175 diregs.n_allocated = 0;
2176 qiregs.mode = QImode;
2177 hiregs.mode = HImode;
2178 siregs.mode = SImode;
2179 diregs.mode = DImode;
2180
2181 for (insn = get_insns (); insn; insn = next)
2182 {
2183 next = NEXT_INSN (insn);
2184 if (!NONDEBUG_INSN_P (insn)
2185	  || asm_noperands (PATTERN (insn)) >= 0
2186 || GET_CODE (PATTERN (insn)) == USE
2187 || GET_CODE (PATTERN (insn)) == CLOBBER)
2188 continue;
2189
2190 qiregs.n_in_use = 0;
2191 hiregs.n_in_use = 0;
2192 siregs.n_in_use = 0;
2193 diregs.n_in_use = 0;
2194 extract_insn (insn);
2195 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2196
2197 for (int i = 0; i < recog_data.n_operands; i++)
2198 {
2199 rtx op = recog_data.operand[i];
2200 if (GET_CODE (op) != SUBREG)
2201 continue;
2202
2203 rtx inner = SUBREG_REG (op);
2204
2205 machine_mode outer_mode = GET_MODE (op);
2206 machine_mode inner_mode = GET_MODE (inner);
2207 gcc_assert (s_ok);
2208 if (s_ok
2209 && (GET_MODE_PRECISION (inner_mode)
2210 >= GET_MODE_PRECISION (outer_mode)))
2211 continue;
2212 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2213 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2214 : outer_mode == HImode ? &hiregs
2215 : outer_mode == SImode ? &siregs
2216 : &diregs);
2217 rtx new_reg = get_replacement (r);
2218
2219 if (recog_data.operand_type[i] != OP_OUT)
2220 {
2221 enum rtx_code code;
2222 if (GET_MODE_PRECISION (inner_mode)
2223 < GET_MODE_PRECISION (outer_mode))
2224 code = ZERO_EXTEND;
2225 else
2226 code = TRUNCATE;
2227
2228	      rtx pat = gen_rtx_SET (new_reg,
2229 gen_rtx_fmt_e (code, outer_mode, inner));
2230 emit_insn_before (pat, insn);
2231 }
2232
2233 if (recog_data.operand_type[i] != OP_IN)
2234 {
2235 enum rtx_code code;
2236 if (GET_MODE_PRECISION (inner_mode)
2237 < GET_MODE_PRECISION (outer_mode))
2238 code = TRUNCATE;
2239 else
2240 code = ZERO_EXTEND;
2241
2242	      rtx pat = gen_rtx_SET (inner,
2243 gen_rtx_fmt_e (code, inner_mode, new_reg));
2244 emit_insn_after (pat, insn);
2245 }
2246 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2247 }
2248 }
2249}
2250
2251/* Loop structure of the function. The entire function is described as
2252 a NULL loop. */
2253
2254struct parallel
2255{
2256 /* Parent parallel. */
2257 parallel *parent;
2258
2259 /* Next sibling parallel. */
2260 parallel *next;
2261
2262 /* First child parallel. */
2263 parallel *inner;
2264
2265 /* Partitioning mask of the parallel. */
2266 unsigned mask;
2267
2268 /* Partitioning used within inner parallels. */
2269 unsigned inner_mask;
2270
2271 /* Location of parallel forked and join. The forked is the first
2272   block in the parallel and the join is the first block after
2273   the partition. */
2274 basic_block forked_block;
2275 basic_block join_block;
2276
2277 rtx_insn *forked_insn;
2278 rtx_insn *join_insn;
2279
2280 rtx_insn *fork_insn;
2281 rtx_insn *joining_insn;
2282
2283 /* Basic blocks in this parallel, but not in child parallels. The
2284 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2285 blocks are not. */
2286 auto_vec<basic_block> blocks;
2287
2288public:
2289 parallel (parallel *parent, unsigned mode);
2290 ~parallel ();
2291};
2292
2293/* Constructor links the new parallel into its parent's chain of
2294 children. */
2295
2296parallel::parallel (parallel *parent_, unsigned mask_)
2297 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2298{
2299 forked_block = join_block = 0;
2300 forked_insn = join_insn = 0;
2301 fork_insn = joining_insn = 0;
2302
2303 if (parent)
2304 {
2305 next = parent->inner;
2306 parent->inner = this;
2307 }
2308}
2309
2310parallel::~parallel ()
2311{
2312 delete inner;
2313 delete next;
2314}
2315
2316/* Map of basic blocks to insns */
2317typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2318
2319/* A tuple of an insn of interest and the BB in which it resides. */
2320typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2321typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2322
2323/* Split basic blocks so that each forked and join unspec is at
2324   the start of its basic block. Thus afterwards each block will
2325 have a single partitioning mode. We also do the same for return
2326 insns, as they are executed by every thread. Return the
2327 partitioning mode of the function as a whole. Populate MAP with
2328 head and tail blocks. We also clear the BB visited flag, which is
2329 used when finding partitions. */
2330
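/* For instance (illustrative): a block containing

	insn_A; nvptx_forked; insn_B; return;

   is split so that the forked unspec and the return each start a block
   of their own, leaving every resulting block with a single
   partitioning mode.  */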
2331static void
2332nvptx_split_blocks (bb_insn_map_t *map)
2333{
2334 insn_bb_vec_t worklist;
2335 basic_block block;
2336 rtx_insn *insn;
2337
2338 /* Locate all the reorg instructions of interest. */
2339 FOR_ALL_BB_FN (block, cfun)
2340 {
2341 bool seen_insn = false;
2342
2343 /* Clear visited flag, for use by parallel locator */
2344 block->flags &= ~BB_VISITED;
2345
2346 FOR_BB_INSNS (block, insn)
2347 {
2348 if (!INSN_P (insn))
2349 continue;
2350 switch (recog_memoized (insn))
2351 {
2352 default:
2353 seen_insn = true;
2354 continue;
2355 case CODE_FOR_nvptx_forked:
2356 case CODE_FOR_nvptx_join:
2357 break;
2358
2359 case CODE_FOR_return:
2360 /* We also need to split just before return insns, as
2361 that insn needs executing by all threads, but the
2362 block it is in probably does not. */
2363 break;
2364 }
2365
2366 if (seen_insn)
2367 /* We've found an instruction that must be at the start of
2368 a block, but isn't. Add it to the worklist. */
2369 worklist.safe_push (insn_bb_t (insn, block));
2370 else
2371 /* It was already the first instruction. Just add it to
2372 the map. */
2373 map->get_or_insert (block) = insn;
2374 seen_insn = true;
2375 }
2376 }
2377
2378 /* Split blocks on the worklist. */
2379 unsigned ix;
2380 insn_bb_t *elt;
2381 basic_block remap = 0;
2382 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2383 {
2384 if (remap != elt->second)
2385 {
2386 block = elt->second;
2387 remap = block;
2388 }
2389
2390 /* Split block before insn. The insn is in the new block */
2391 edge e = split_block (block, PREV_INSN (elt->first));
2392
2393 block = e->dest;
2394 map->get_or_insert (block) = elt->first;
2395 }
2396}
2397
2398/* BLOCK is a basic block containing a head or tail instruction.
2399 Locate the associated prehead or pretail instruction, which must be
2400 in the single predecessor block. */
2401
2402static rtx_insn *
2403nvptx_discover_pre (basic_block block, int expected)
2404{
2405 gcc_assert (block->preds->length () == 1);
2406 basic_block pre_block = (*block->preds)[0]->src;
2407 rtx_insn *pre_insn;
2408
2409 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2410 pre_insn = PREV_INSN (pre_insn))
2411 gcc_assert (pre_insn != BB_HEAD (pre_block));
2412
2413 gcc_assert (recog_memoized (pre_insn) == expected);
2414 return pre_insn;
2415}
2416
2417/* Dump this parallel and all its inner parallels. */
2418
2419static void
2420nvptx_dump_pars (parallel *par, unsigned depth)
2421{
2422 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2423 depth, par->mask,
2424 par->forked_block ? par->forked_block->index : -1,
2425 par->join_block ? par->join_block->index : -1);
2426
2427 fprintf (dump_file, " blocks:");
2428
2429 basic_block block;
2430 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2431 fprintf (dump_file, " %d", block->index);
2432 fprintf (dump_file, "\n");
2433 if (par->inner)
2434 nvptx_dump_pars (par->inner, depth + 1);
2435
2436 if (par->next)
2437 nvptx_dump_pars (par->next, depth);
2438}
2439
2440/* If BLOCK contains a fork/join marker, process it to create or
2441 terminate a loop structure. Add this block to the current loop,
2442 and then walk successor blocks. */
2443
2444static parallel *
2445nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
2446{
2447 if (block->flags & BB_VISITED)
2448 return par;
2449 block->flags |= BB_VISITED;
2450
2451 if (rtx_insn **endp = map->get (block))
2452 {
2453 rtx_insn *end = *endp;
2454
2455 /* This is a block head or tail, or return instruction. */
2456 switch (recog_memoized (end))
2457 {
2458 case CODE_FOR_return:
2459 /* Return instructions are in their own block, and we
2460 don't need to do anything more. */
2461 return par;
2462
2463 case CODE_FOR_nvptx_forked:
2464 /* Loop head, create a new inner loop and add it into
2465 our parent's child list. */
2466 {
2467 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2468
2469 gcc_assert (mask);
2470 par = new parallel (par, mask);
2471 par->forked_block = block;
2472 par->forked_insn = end;
2473 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2474 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2475 par->fork_insn
2476 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
2477 }
2478 break;
2479
2480 case CODE_FOR_nvptx_join:
2481 /* A loop tail. Finish the current loop and return to
2482 parent. */
2483 {
2484 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
2485
2486 gcc_assert (par->mask == mask);
2487 par->join_block = block;
2488 par->join_insn = end;
2489 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2490 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
2491 par->joining_insn
2492 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
2493 par = par->parent;
2494 }
2495 break;
2496
2497 default:
2498 gcc_unreachable ();
2499 }
2500 }
2501
2502 if (par)
2503 /* Add this block onto the current loop's list of blocks. */
2504 par->blocks.safe_push (block);
2505 else
2506 /* This must be the entry block. Create a NULL parallel. */
2507 par = new parallel (0, 0);
2508
2509 /* Walk successor blocks. */
2510 edge e;
2511 edge_iterator ei;
2512
2513 FOR_EACH_EDGE (e, ei, block->succs)
2514 nvptx_find_par (map, par, e->dest);
2515
2516 return par;
2517}
2518
2519/* DFS walk the CFG looking for fork & join markers. Construct
2520 loop structures as we go. MAP is a mapping of basic blocks
2521 to head & tail markers, discovered when splitting blocks. This
2522 speeds up the discovery. We rely on the BB visited flag having
2523 been cleared when splitting blocks. */
2524
2525static parallel *
2526nvptx_discover_pars (bb_insn_map_t *map)
2527{
2528 basic_block block;
2529
2530 /* Mark exit blocks as visited. */
2531 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2532 block->flags |= BB_VISITED;
2533
2534 /* And entry block as not. */
2535 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2536 block->flags &= ~BB_VISITED;
2537
2538 parallel *par = nvptx_find_par (map, 0, block);
2539
2540 if (dump_file)
2541 {
2542 fprintf (dump_file, "\nLoops\n");
2543 nvptx_dump_pars (par, 0);
2544 fprintf (dump_file, "\n");
2545 }
2546
2547 return par;
2548}
2549
2550/* Analyse a group of BBs within a partitioned region and create N
2551 Single-Entry-Single-Exit regions. Some of those regions will be
2552 trivial ones consisting of a single BB. The blocks of a
2553 partitioned region might form a set of disjoint graphs -- because
2554   the region encloses a differently partitioned sub-region.
2555
2556 We use the linear time algorithm described in 'Finding Regions Fast:
2557   Single Entry Single Exit and Control Regions in Linear Time'
2558 Johnson, Pearson & Pingali. That algorithm deals with complete
2559 CFGs, where a back edge is inserted from END to START, and thus the
2560 problem becomes one of finding equivalent loops.
2561
2562 In this case we have a partial CFG. We complete it by redirecting
2563 any incoming edge to the graph to be from an arbitrary external BB,
2564 and similarly redirecting any outgoing edge to be to that BB.
2565 Thus we end up with a closed graph.
2566
2567 The algorithm works by building a spanning tree of an undirected
2568 graph and keeping track of back edges from nodes further from the
2569 root in the tree to nodes nearer to the root in the tree. In the
2570 description below, the root is up and the tree grows downwards.
2571
2572 We avoid having to deal with degenerate back-edges to the same
2573 block, by splitting each BB into 3 -- one for input edges, one for
2574 the node itself and one for the output edges. Such back edges are
2575 referred to as 'Brackets'. Cycle equivalent nodes will have the
2576 same set of brackets.
2577
2578 Determining bracket equivalency is done by maintaining a list of
2579 brackets in such a manner that the list length and final bracket
2580 uniquely identify the set.
2581
2582 We use coloring to mark all BBs with cycle equivalency with the
2583 same color. This is the output of the 'Finding Regions Fast'
2584 algorithm. Notice it doesn't actually find the set of nodes within
2585   a particular region, just unordered sets of nodes that are the
2586 entries and exits of SESE regions.
2587
2588 After determining cycle equivalency, we need to find the minimal
2589 set of SESE regions. Do this with a DFS coloring walk of the
2590 complete graph. We're either 'looking' or 'coloring'. When
2591 looking, and we're in the subgraph, we start coloring the color of
2592 the current node, and remember that node as the start of the
2593 current color's SESE region. Every time we go to a new node, we
2594   decrement the count of nodes with that color. If it reaches zero,
2595 we remember that node as the end of the current color's SESE region
2596 and return to 'looking'. Otherwise we color the node the current
2597 color.
2598
2599 This way we end up with coloring the inside of non-trivial SESE
2600 regions with the color of that region. */
2601
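/* A small worked example (illustrative): take the diamond CFG

	A -> B,  A -> C,  B -> D,  C -> D

   closed by the back edge D -> A. The cycle-equivalence classes are
   {A, D}, {B} and {C}: every cycle through A also passes through D,
   whereas B and C lie on different cycles. The minimal covering set of
   SESE regions is therefore the region spanning A..D plus the trivial
   single-block regions {B} and {C}.  */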
2602/* A pair of BBs. We use this to represent SESE regions. */
2603typedef std::pair<basic_block, basic_block> bb_pair_t;
2604typedef auto_vec<bb_pair_t> bb_pair_vec_t;
2605
2606/* A node in the undirected CFG. The discriminator SECOND indicates just
2607   above or just below the BB indicated by FIRST. */
2608typedef std::pair<basic_block, int> pseudo_node_t;
2609
2610/* A bracket indicates an edge towards the root of the spanning tree of the
2611 undirected graph. Each bracket has a color, determined
2612   from the current set of brackets. */
2613struct bracket
2614{
2615 pseudo_node_t back; /* Back target */
2616
2617 /* Current color and size of set. */
2618 unsigned color;
2619 unsigned size;
2620
2621 bracket (pseudo_node_t back_)
2622 : back (back_), color (~0u), size (~0u)
2623 {
2624 }
2625
2626 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
2627 {
2628 if (length != size)
2629 {
2630 size = length;
2631 color = color_counts.length ();
2632 color_counts.quick_push (0);
2633 }
2634 color_counts[color]++;
2635 return color;
2636 }
2637};
2638
2639typedef auto_vec<bracket> bracket_vec_t;
2640
2641/* Basic block info for finding SESE regions. */
2642
2643struct bb_sese
2644{
2645 int node; /* Node number in spanning tree. */
2646 int parent; /* Parent node number. */
2647
2648 /* The algorithm splits each node A into Ai, A', Ao. The incoming
2649 edges arrive at pseudo-node Ai and the outgoing edges leave at
2650 pseudo-node Ao. We have to remember which way we arrived at a
2651 particular node when generating the spanning tree. dir > 0 means
2652 we arrived at Ai, dir < 0 means we arrived at Ao. */
2653 int dir;
2654
2655  /* Lowest numbered pseudo-node reached via a backedge from this
2656 node, or any descendant. */
2657 pseudo_node_t high;
2658
2659 int color; /* Cycle-equivalence color */
2660
2661 /* Stack of brackets for this node. */
2662 bracket_vec_t brackets;
2663
2664 bb_sese (unsigned node_, unsigned p, int dir_)
2665 :node (node_), parent (p), dir (dir_)
2666 {
2667 }
2668 ~bb_sese ();
2669
2670 /* Push a bracket ending at BACK. */
2671 void push (const pseudo_node_t &back)
2672 {
2673 if (dump_file)
2674 fprintf (dump_file, "Pushing backedge %d:%+d\n",
2675 back.first ? back.first->index : 0, back.second);
2676 brackets.safe_push (bracket (back));
2677 }
2678
2679 void append (bb_sese *child);
2680 void remove (const pseudo_node_t &);
2681
2682 /* Set node's color. */
2683 void set_color (auto_vec<unsigned> &color_counts)
2684 {
2685 color = brackets.last ().get_color (color_counts, brackets.length ());
2686 }
2687};
2688
2689bb_sese::~bb_sese ()
2690{
2691}
2692
2693/* Destructively append CHILD's brackets. */
2694
2695void
2696bb_sese::append (bb_sese *child)
2697{
2698 if (int len = child->brackets.length ())
2699 {
2700 int ix;
2701
2702 if (dump_file)
2703 {
2704 for (ix = 0; ix < len; ix++)
2705 {
2706 const pseudo_node_t &pseudo = child->brackets[ix].back;
2707 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
2708 child->node, pseudo.first ? pseudo.first->index : 0,
2709 pseudo.second);
2710 }
2711 }
2712 if (!brackets.length ())
2713 std::swap (brackets, child->brackets);
2714 else
2715 {
2716 brackets.reserve (len);
2717 for (ix = 0; ix < len; ix++)
2718 brackets.quick_push (child->brackets[ix]);
2719 }
2720 }
2721}
2722
2723/* Remove brackets that terminate at PSEUDO. */
2724
2725void
2726bb_sese::remove (const pseudo_node_t &pseudo)
2727{
2728 unsigned removed = 0;
2729 int len = brackets.length ();
2730
2731 for (int ix = 0; ix < len; ix++)
2732 {
2733 if (brackets[ix].back == pseudo)
2734 {
2735 if (dump_file)
2736 fprintf (dump_file, "Removing backedge %d:%+d\n",
2737 pseudo.first ? pseudo.first->index : 0, pseudo.second);
2738 removed++;
2739 }
2740 else if (removed)
2741 brackets[ix-removed] = brackets[ix];
2742 }
2743 while (removed--)
2744 brackets.pop ();
2745}
2746
2747/* Accessors for BB's aux pointer. */
2748#define BB_SET_SESE(B, S) ((B)->aux = (S))
2749#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
2750
2751/* DFS walk creating SESE data structures. Only cover nodes with
2752 BB_VISITED set. Append discovered blocks to LIST. We number in
2753 increments of 3 so that the above and below pseudo nodes can be
2754 implicitly numbered too. */
2755
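/* Informal gloss (not an additional invariant): numbering in steps of
   3 leaves a block numbered N with its two pseudo nodes implicitly at
   N-1 and N+1, and starting the count at 2 keeps 0 free to act as the
   synthetic entry pseudo node.  */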
2756static int
2757nvptx_sese_number (int n, int p, int dir, basic_block b,
2758 auto_vec<basic_block> *list)
2759{
2760 if (BB_GET_SESE (b))
2761 return n;
2762
2763 if (dump_file)
2764 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
2765 b->index, n, p, dir);
2766
2767 BB_SET_SESE (b, new bb_sese (n, p, dir));
2768 p = n;
2769
2770 n += 3;
2771 list->quick_push (b);
2772
2773 /* First walk the nodes on the 'other side' of this node, then walk
2774 the nodes on the same side. */
2775 for (unsigned ix = 2; ix; ix--)
2776 {
2777 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
2778 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
2779 : offsetof (edge_def, src));
2780 edge e;
2781 edge_iterator (ei);
2782
2783 FOR_EACH_EDGE (e, ei, edges)
2784 {
2785 basic_block target = *(basic_block *)((char *)e + offset);
2786
2787 if (target->flags & BB_VISITED)
2788 n = nvptx_sese_number (n, p, dir, target, list);
2789 }
2790 dir = -dir;
2791 }
2792 return n;
2793}
2794
2795/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
2796 EDGES are the outgoing edges and OFFSET is the offset to the src
2797 or dst block on the edges. */
2798
2799static void
2800nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
2801 vec<edge, va_gc> *edges, size_t offset)
2802{
2803 edge e;
2804 edge_iterator (ei);
2805 int hi_back = depth;
2806 pseudo_node_t node_back (0, depth);
2807 int hi_child = depth;
2808 pseudo_node_t node_child (0, depth);
2809 basic_block child = NULL;
2810 unsigned num_children = 0;
2811 int usd = -dir * sese->dir;
2812
2813 if (dump_file)
2814 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
2815 me->index, sese->node, dir);
2816
2817 if (dir < 0)
2818 {
2819 /* This is the above pseudo-child. It has the BB itself as an
2820 additional child node. */
2821 node_child = sese->high;
2822 hi_child = node_child.second;
2823 if (node_child.first)
2824 hi_child += BB_GET_SESE (node_child.first)->node;
2825 num_children++;
2826 }
2827
2828 /* Examine each edge.
2829 - if it is a child (a) append its bracket list and (b) record
2830 whether it is the child with the highest reaching bracket.
2831 - if it is an edge to ancestor, record whether it's the highest
2832 reaching backlink. */
2833 FOR_EACH_EDGE (e, ei, edges)
2834 {
2835 basic_block target = *(basic_block *)((char *)e + offset);
2836
2837 if (bb_sese *t_sese = BB_GET_SESE (target))
2838 {
2839 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
2840 {
2841 /* Child node. Append its bracket list. */
2842 num_children++;
2843 sese->append (t_sese);
2844
2845	      /* Compare its hi value. */
2846 int t_hi = t_sese->high.second;
2847
2848 if (basic_block child_hi_block = t_sese->high.first)
2849 t_hi += BB_GET_SESE (child_hi_block)->node;
2850
2851 if (hi_child > t_hi)
2852 {
2853 hi_child = t_hi;
2854 node_child = t_sese->high;
2855 child = target;
2856 }
2857 }
2858 else if (t_sese->node < sese->node + dir
2859 && !(dir < 0 && sese->parent == t_sese->node))
2860 {
2861 /* Non-parental ancestor node -- a backlink. */
2862 int d = usd * t_sese->dir;
2863 int back = t_sese->node + d;
2864
2865 if (hi_back > back)
2866 {
2867 hi_back = back;
2868 node_back = pseudo_node_t (target, d);
2869 }
2870 }
2871 }
2872 else
2873 { /* Fallen off graph, backlink to entry node. */
2874 hi_back = 0;
2875 node_back = pseudo_node_t (0, 0);
2876 }
2877 }
2878
2879 /* Remove any brackets that terminate at this pseudo node. */
2880 sese->remove (pseudo_node_t (me, dir));
2881
2882 /* Now push any backlinks from this pseudo node. */
2883 FOR_EACH_EDGE (e, ei, edges)
2884 {
2885 basic_block target = *(basic_block *)((char *)e + offset);
2886 if (bb_sese *t_sese = BB_GET_SESE (target))
2887 {
2888 if (t_sese->node < sese->node + dir
2889 && !(dir < 0 && sese->parent == t_sese->node))
2890 /* Non-parental ancestor node - backedge from me. */
2891 sese->push (pseudo_node_t (target, usd * t_sese->dir));
2892 }
2893 else
2894 {
2895 /* back edge to entry node */
2896 sese->push (pseudo_node_t (0, 0));
2897 }
2898 }
2899
2900 /* If this node leads directly or indirectly to a no-return region of
2901 the graph, then fake a backedge to entry node. */
2902 if (!sese->brackets.length () || !edges || !edges->length ())
2903 {
2904 hi_back = 0;
2905 node_back = pseudo_node_t (0, 0);
2906 sese->push (node_back);
2907 }
2908
2909 /* Record the highest reaching backedge from us or a descendant. */
2910 sese->high = hi_back < hi_child ? node_back : node_child;
2911
2912 if (num_children > 1)
2913 {
2914 /* There is more than one child -- this is a Y shaped piece of
2915 spanning tree. We have to insert a fake backedge from this
2916 node to the highest ancestor reached by not-the-highest
2917 reaching child. Note that there may be multiple children
2918 with backedges to the same highest node. That's ok and we
2919 insert the edge to that highest node. */
2920 hi_child = depth;
2921 if (dir < 0 && child)
2922 {
2923 node_child = sese->high;
2924 hi_child = node_child.second;
2925 if (node_child.first)
2926 hi_child += BB_GET_SESE (node_child.first)->node;
2927 }
2928
2929 FOR_EACH_EDGE (e, ei, edges)
2930 {
2931 basic_block target = *(basic_block *)((char *)e + offset);
2932
2933 if (target == child)
2934 /* Ignore the highest child. */
2935 continue;
2936
2937 bb_sese *t_sese = BB_GET_SESE (target);
2938 if (!t_sese)
2939 continue;
2940 if (t_sese->parent != sese->node)
2941 /* Not a child. */
2942 continue;
2943
2944 /* Compare its hi value. */
2945 int t_hi = t_sese->high.second;
2946
2947 if (basic_block child_hi_block = t_sese->high.first)
2948 t_hi += BB_GET_SESE (child_hi_block)->node;
2949
2950 if (hi_child > t_hi)
2951 {
2952 hi_child = t_hi;
2953 node_child = t_sese->high;
2954 }
2955 }
2956
2957 sese->push (node_child);
2958 }
2959}
2960
2961
2962/* DFS walk of BB graph. Color node BLOCK according to COLORING then
2963 proceed to successors. Set SESE entry and exit nodes of
2964 REGIONS. */
2965
2966static void
2967nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
2968 basic_block block, int coloring)
2969{
2970 bb_sese *sese = BB_GET_SESE (block);
2971
2972 if (block->flags & BB_VISITED)
2973 {
2974 /* If we've already encountered this block, either we must not
2975 be coloring, or it must have been colored the current color. */
2976 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
2977 return;
2978 }
2979
2980 block->flags |= BB_VISITED;
2981
2982 if (sese)
2983 {
2984 if (coloring < 0)
2985 {
2986 /* Start coloring a region. */
2987 regions[sese->color].first = block;
2988 coloring = sese->color;
2989 }
2990
2991 if (!--color_counts[sese->color] && sese->color == coloring)
2992 {
2993 /* Found final block of SESE region. */
2994 regions[sese->color].second = block;
2995 coloring = -1;
2996 }
2997 else
2998 /* Color the node, so we can assert on revisiting the node
2999 that the graph is indeed SESE. */
3000 sese->color = coloring;
3001 }
3002 else
3003 /* Fallen off the subgraph, we cannot be coloring. */
3004 gcc_assert (coloring < 0);
3005
3006 /* Walk each successor block. */
3007 if (block->succs && block->succs->length ())
3008 {
3009 edge e;
3010 edge_iterator ei;
3011
3012 FOR_EACH_EDGE (e, ei, block->succs)
3013 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3014 }
3015 else
3016 gcc_assert (coloring < 0);
3017}
3018
3019/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3020 end up with NULL entries in it. */
3021
3022static void
3023nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3024{
3025 basic_block block;
3026 int ix;
3027
3028 /* First clear each BB of the whole function. */
3029 FOR_EACH_BB_FN (block, cfun)
3030 {
3031 block->flags &= ~BB_VISITED;
3032 BB_SET_SESE (block, 0);
3033 }
3034 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3035 block->flags &= ~BB_VISITED;
3036 BB_SET_SESE (block, 0);
3037 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3038 block->flags &= ~BB_VISITED;
3039 BB_SET_SESE (block, 0);
3040
3041 /* Mark blocks in the function that are in this graph. */
3042 for (ix = 0; blocks.iterate (ix, &block); ix++)
3043 block->flags |= BB_VISITED;
3044
3045 /* Counts of nodes assigned to each color. There cannot be more
3046 colors than blocks (and hopefully there will be fewer). */
3047 auto_vec<unsigned> color_counts;
3048 color_counts.reserve (blocks.length ());
3049
3050 /* Worklist of nodes in the spanning tree. Again, there cannot be
3051 more nodes in the tree than blocks (there will be fewer if the
3052 CFG of blocks is disjoint). */
3053 auto_vec<basic_block> spanlist;
3054 spanlist.reserve (blocks.length ());
3055
3056 /* Make sure every block has its cycle class determined. */
3057 for (ix = 0; blocks.iterate (ix, &block); ix++)
3058 {
3059 if (BB_GET_SESE (block))
3060 /* We already met this block in an earlier graph solve. */
3061 continue;
3062
3063 if (dump_file)
3064 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3065
3066 /* Number the nodes reachable from block initial DFS order. */
3067 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3068
3069 /* Now walk in reverse DFS order to find cycle equivalents. */
3070 while (spanlist.length ())
3071 {
3072 block = spanlist.pop ();
3073 bb_sese *sese = BB_GET_SESE (block);
3074
3075 /* Do the pseudo node below. */
3076 nvptx_sese_pseudo (block, sese, depth, +1,
3077 sese->dir > 0 ? block->succs : block->preds,
3078 (sese->dir > 0 ? offsetof (edge_def, dest)
3079 : offsetof (edge_def, src)));
3080 sese->set_color (color_counts);
3081 /* Do the pseudo node above. */
3082 nvptx_sese_pseudo (block, sese, depth, -1,
3083 sese->dir < 0 ? block->succs : block->preds,
3084 (sese->dir < 0 ? offsetof (edge_def, dest)
3085 : offsetof (edge_def, src)));
3086 }
3087 if (dump_file)
3088 fprintf (dump_file, "\n");
3089 }
3090
3091 if (dump_file)
3092 {
3093 unsigned count;
3094 const char *comma = "";
3095
3096 fprintf (dump_file, "Found %d cycle equivalents\n",
3097 color_counts.length ());
3098 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3099 {
3100 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3101
3102 comma = "";
3103 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3104 if (BB_GET_SESE (block)->color == ix)
3105 {
3106 block->flags |= BB_VISITED;
3107 fprintf (dump_file, "%s%d", comma, block->index);
3108 comma=",";
3109 }
3110 fprintf (dump_file, "}");
3111 comma = ", ";
3112 }
3113 fprintf (dump_file, "\n");
3114 }
3115
3116 /* Now we've colored every block in the subgraph. We now need to
3117 determine the minimal set of SESE regions that cover that
3118 subgraph. Do this with a DFS walk of the complete function.
3119 During the walk we're either 'looking' or 'coloring'. When we
3120 reach the last node of a particular color, we stop coloring and
3121 return to looking. */
3122
3123 /* There cannot be more SESE regions than colors. */
3124 regions.reserve (color_counts.length ());
3125 for (ix = color_counts.length (); ix--;)
3126 regions.quick_push (bb_pair_t (0, 0));
3127
3128 for (ix = 0; blocks.iterate (ix, &block); ix++)
3129 block->flags &= ~BB_VISITED;
3130
3131 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3132
3133 if (dump_file)
3134 {
3135 const char *comma = "";
3136 int len = regions.length ();
3137
3138 fprintf (dump_file, "SESE regions:");
3139 for (ix = 0; ix != len; ix++)
3140 {
3141 basic_block from = regions[ix].first;
3142 basic_block to = regions[ix].second;
3143
3144 if (from)
3145 {
3146 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3147 if (to != from)
3148 fprintf (dump_file, "->%d", to->index);
3149
3150 int color = BB_GET_SESE (from)->color;
3151
3152 /* Print the blocks within the region (excluding ends). */
3153 FOR_EACH_BB_FN (block, cfun)
3154 {
3155 bb_sese *sese = BB_GET_SESE (block);
3156
3157 if (sese && sese->color == color
3158 && block != from && block != to)
3159 fprintf (dump_file, ".%d", block->index);
3160 }
3161 fprintf (dump_file, "}");
3162 }
3163 comma = ",";
3164 }
3165 fprintf (dump_file, "\n\n");
3166 }
3167
3168 for (ix = 0; blocks.iterate (ix, &block); ix++)
3169 delete BB_GET_SESE (block);
3170}
3171
3172#undef BB_SET_SESE
3173#undef BB_GET_SESE
3174
3175/* Propagate live state at the start of a partitioned region. BLOCK
3176 provides the live register information, and might not contain
3177 INSN. Propagation is inserted just after INSN. RW indicates whether
3178 we are reading and/or writing state. This
3179   separation is needed for worker-level propagation where we
3180   essentially do a spill & fill. FN is the underlying worker
3181   function to generate the propagation instructions for a single
3182   register. DATA is user data.
3183
3184 We propagate the live register set and the entire frame. We could
3185 do better by (a) propagating just the live set that is used within
3186 the partitioned regions and (b) only propagating stack entries that
3187 are used. The latter might be quite hard to determine. */
3188
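/* Schematically, the frame-copying loop built below looks like this
   (illustrative register names; FS is the frame size in DImode units,
   and the load/store lines are conditional on PM_read/PM_write):

	%ptr = %frame;  %idx = FS;
     loop:
	%idx -= 1;
	%tmp = [%ptr];			// PM_read
	<FN: shuffle or spill/fill %tmp>
	[%ptr] = %tmp;			// PM_write
	%ptr += 8;
	if (%idx != 0) goto loop;  */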
3189typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3190
3191static void
3192nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3193 propagator_fn fn, void *data)
3194{
3195 bitmap live = DF_LIVE_IN (block);
3196 bitmap_iterator iterator;
3197 unsigned ix;
3198
3199 /* Copy the frame array. */
3200 HOST_WIDE_INT fs = get_frame_size ();
3201 if (fs)
3202 {
3203 rtx tmp = gen_reg_rtx (DImode);
3204 rtx idx = NULL_RTX;
3205 rtx ptr = gen_reg_rtx (Pmode);
3206 rtx pred = NULL_RTX;
3207 rtx_code_label *label = NULL;
3208
3209 gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
3210 fs /= GET_MODE_SIZE (DImode);
3211 /* Detect single iteration loop. */
3212 if (fs == 1)
3213 fs = 0;
3214
3215 start_sequence ();
3216 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3217 if (fs)
3218 {
3219 idx = gen_reg_rtx (SImode);
3220 pred = gen_reg_rtx (BImode);
3221 label = gen_label_rtx ();
3222
3223 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3224 /* Allow worker function to initialize anything needed. */
3225 rtx init = fn (tmp, PM_loop_begin, fs, data);
3226 if (init)
3227 emit_insn (init);
3228 emit_label (label);
3229 LABEL_NUSES (label)++;
3230 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3231 }
3232 if (rw & PM_read)
3233 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3234 emit_insn (fn (tmp, rw, fs, data));
3235 if (rw & PM_write)
3236 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3237 if (fs)
3238 {
3239 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3240 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3241 emit_insn (gen_br_true_uni (pred, label));
3242 rtx fini = fn (tmp, PM_loop_end, fs, data);
3243 if (fini)
3244 emit_insn (fini);
3245 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3246 }
3247 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3248 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3249 rtx cpy = get_insns ();
3250 end_sequence ();
3251 insn = emit_insn_after (cpy, insn);
3252 }
3253
3254 /* Copy live registers. */
3255 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3256 {
3257 rtx reg = regno_reg_rtx[ix];
3258
3259 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3260 {
3261 rtx bcast = fn (reg, rw, 0, data);
3262
3263 insn = emit_insn_after (bcast, insn);
3264 }
3265 }
3266}
3267
3268/* Worker for nvptx_vpropagate. */
3269
3270static rtx
3271vprop_gen (rtx reg, propagate_mask pm,
3272 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3273{
3274 if (!(pm & PM_read_write))
3275 return 0;
3276
3277 return nvptx_gen_vcast (reg);
3278}
3279
3280/* Propagate state that is live at start of BLOCK across the vectors
3281 of a single warp. Propagation is inserted just after INSN. */
3282
3283static void
3284nvptx_vpropagate (basic_block block, rtx_insn *insn)
3285{
3286 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3287}
3288
3289/* Worker for nvptx_wpropagate. */
3290
3291static rtx
3292wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3293{
3294 wcast_data_t *data = (wcast_data_t *)data_;
3295
3296 if (pm & PM_loop_begin)
3297 {
3298 /* Starting a loop, initialize pointer. */
3299 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3300
3301 if (align > worker_bcast_align)
3302 worker_bcast_align = align;
3303 data->offset = (data->offset + align - 1) & ~(align - 1);
3304
3305 data->ptr = gen_reg_rtx (Pmode);
3306
3307 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3308 }
3309 else if (pm & PM_loop_end)
3310 {
3311 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3312 data->ptr = NULL_RTX;
3313 return clobber;
3314 }
3315 else
3316 return nvptx_gen_wcast (reg, pm, rep, data);
3317}
3318
3319/* Spill or fill the state that is live at the start of BLOCK. PRE_P
3320 indicates if this is just before partitioned mode (do spill), or
3321 just after it starts (do fill). Sequence is inserted just after
3322 INSN. */
3323
3324static void
3325nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3326{
3327 wcast_data_t data;
3328
3329 data.base = gen_reg_rtx (Pmode);
3330 data.offset = 0;
3331 data.ptr = NULL_RTX;
3332
3333 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3334 if (data.offset)
3335 {
3336 /* Stuff was emitted, initialize the base pointer now. */
3337      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3338      emit_insn_after (init, insn);
3339
3340 if (worker_bcast_size < data.offset)
3341 worker_bcast_size = data.offset;
3342 }
3343}
3344
3345/* Emit a worker-level synchronization barrier. We use different
3346 markers for before and after synchronizations. */
3347
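/* Assuming the nvptx_barsync pattern assembles to a PTX "bar.sync"
   instruction, the AFTER operand simply distinguishes the barrier
   placed before a partitioned region from the one placed after it.  */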
3348static rtx
3349nvptx_wsync (bool after)
3350{
3351 return gen_nvptx_barsync (GEN_INT (after));
3352}
3353
3354/* Single neutering according to MASK. FROM is the incoming block and
3355 TO is the outgoing block. These may be the same block. Insert at
3356 start of FROM:
3357
3358 if (tid.<axis>) goto end.
3359
3360 and insert before ending branch of TO (if there is such an insn):
3361
3362 end:
3363 <possibly-broadcast-cond>
3364 <branch>
3365
3366   We currently only use different FROM and TO when skipping an entire
3367 loop. We could do more if we detected superblocks. */
3368
3369static void
3370nvptx_single (unsigned mask, basic_block from, basic_block to)
3371{
3372 rtx_insn *head = BB_HEAD (from);
3373 rtx_insn *tail = BB_END (to);
3374 unsigned skip_mask = mask;
3375
3376 /* Find first insn of from block */
3377 while (head != BB_END (from) && !INSN_P (head))
3378 head = NEXT_INSN (head);
3379
3380 /* Find last insn of to block */
3381 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3382 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3383 tail = PREV_INSN (tail);
3384
3385 /* Detect if tail is a branch. */
3386 rtx tail_branch = NULL_RTX;
3387 rtx cond_branch = NULL_RTX;
3388 if (tail && INSN_P (tail))
3389 {
3390 tail_branch = PATTERN (tail);
3391 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3392 tail_branch = NULL_RTX;
3393 else
3394 {
3395 cond_branch = SET_SRC (tail_branch);
3396 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3397 cond_branch = NULL_RTX;
3398 }
3399 }
3400
3401 if (tail == head)
3402 {
3403 /* If this is empty, do nothing. */
3404 if (!head || !INSN_P (head))
3405 return;
3406
3407 /* If this is a dummy insn, do nothing. */
3408 switch (recog_memoized (head))
3409 {
3410 default:
3411 break;
3412 case CODE_FOR_nvptx_fork:
3413 case CODE_FOR_nvptx_forked:
3414 case CODE_FOR_nvptx_joining:
3415 case CODE_FOR_nvptx_join:
3416 return;
3417 }
3418
3419 if (cond_branch)
3420 {
3421 /* If we're only doing vector single, there's no need to
3422 emit skip code because we'll not insert anything. */
3423 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
3424 skip_mask = 0;
3425 }
3426 else if (tail_branch)
3427 /* Block with only unconditional branch. Nothing to do. */
3428 return;
3429 }
3430
3431 /* Insert the vector test inside the worker test. */
3432 unsigned mode;
3433 rtx_insn *before = tail;
3434 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3435 if (GOMP_DIM_MASK (mode) & skip_mask)
3436 {
3437 rtx_code_label *label = gen_label_rtx ();
3438 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
3439
3440 if (!pred)
3441 {
3442 pred = gen_reg_rtx (BImode);
3443 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
3444 }
3445
3446 rtx br;
3447 if (mode == GOMP_DIM_VECTOR)
3448 br = gen_br_true (pred, label);
3449 else
3450 br = gen_br_true_uni (pred, label);
3451 emit_insn_before (br, head);
3452
3453 LABEL_NUSES (label)++;
3454 if (tail_branch)
3455 before = emit_label_before (label, before);
3456 else
3457 emit_label_after (label, tail);
3458 }
3459
3460 /* Now deal with propagating the branch condition. */
3461 if (cond_branch)
3462 {
3463 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
3464
3465 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
3466 {
3467 /* Vector mode only, do a shuffle. */
3468 emit_insn_before (nvptx_gen_vcast (pvar), tail);
3469 }
3470 else
3471 {
3472 /* Includes worker mode, do spill & fill. By construction
3473 we should never have worker mode only. */
3474 wcast_data_t data;
3475
3476 data.base = worker_bcast_sym;
3477 data.ptr = 0;
3478
3479 if (worker_bcast_size < GET_MODE_SIZE (SImode))
3480 worker_bcast_size = GET_MODE_SIZE (SImode);
3481
3482 data.offset = 0;
3483 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
3484 before);
3485 /* Barrier so other workers can see the write. */
3486 emit_insn_before (nvptx_wsync (false), tail);
3487 data.offset = 0;
3488 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
3489 /* This barrier is needed to avoid worker zero clobbering
3490 the broadcast buffer before all the other workers have
3491 had a chance to read this instance of it. */
3492 emit_insn_before (nvptx_wsync (true), tail);
3493 }
3494
3495 extract_insn (tail);
3496 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
3497 UNSPEC_BR_UNIFIED);
3498 validate_change (tail, recog_data.operand_loc[0], unsp, false);
3499 }
3500}
3501
3502/* PAR is a parallel that is being skipped in its entirety according to
3503 MASK. Treat this as skipping a superblock starting at forked
3504 and ending at joining. */
3505
3506static void
3507nvptx_skip_par (unsigned mask, parallel *par)
3508{
3509 basic_block tail = par->join_block;
3510 gcc_assert (tail->preds->length () == 1);
3511
3512 basic_block pre_tail = (*tail->preds)[0]->src;
3513 gcc_assert (pre_tail->succs->length () == 1);
3514
3515 nvptx_single (mask, par->forked_block, pre_tail);
3516}
3517
3518/* If PAR has a single inner parallel and PAR itself only contains
3519 empty entry and exit blocks, swallow the inner PAR. */
3520
3521static void
3522nvptx_optimize_inner (parallel *par)
3523{
3524 parallel *inner = par->inner;
3525
3526 /* We mustn't be the outer dummy par. */
3527 if (!par->mask)
3528 return;
3529
3530 /* We must have a single inner par. */
3531 if (!inner || inner->next)
3532 return;
3533
3534 /* We must only contain 2 blocks ourselves -- the head and tail of
3535 the inner par. */
3536 if (par->blocks.length () != 2)
3537 return;
3538
3539  /* The partitionings must be disjoint. As we only have vector and
3540 worker partitioning, this is sufficient to guarantee the pars
3541 have adjacent partitioning. */
3542 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
3543 /* This indicates malformed code generation. */
3544 return;
3545
3546 /* The outer forked insn should be immediately followed by the inner
3547 fork insn. */
3548 rtx_insn *forked = par->forked_insn;
3549 rtx_insn *fork = BB_END (par->forked_block);
3550
3551 if (NEXT_INSN (forked) != fork)
3552 return;
3553 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
3554
3555 /* The outer joining insn must immediately follow the inner join
3556 insn. */
3557 rtx_insn *joining = par->joining_insn;
3558 rtx_insn *join = inner->join_insn;
3559 if (NEXT_INSN (join) != joining)
3560 return;
3561
3562 /* Preconditions met. Swallow the inner par. */
3563 if (dump_file)
3564 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
3565 inner->mask, inner->forked_block->index,
3566 inner->join_block->index,
3567 par->mask, par->forked_block->index, par->join_block->index);
3568
3569 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
3570
3571 par->blocks.reserve (inner->blocks.length ());
3572 while (inner->blocks.length ())
3573 par->blocks.quick_push (inner->blocks.pop ());
3574
3575 par->inner = inner->inner;
3576 inner->inner = NULL;
3577
3578 delete inner;
3579}
3580
3581/* Process the parallel PAR and all its contained
3582 parallels. We do everything but the neutering. Return mask of
3583 partitioned modes used within this parallel. */
3584
3585static unsigned
3586nvptx_process_pars (parallel *par)
3587{
3588 if (nvptx_optimize)
3589 nvptx_optimize_inner (par);
3590
3591 unsigned inner_mask = par->mask;
3592
3593 /* Do the inner parallels first. */
3594 if (par->inner)
3595 {
3596 par->inner_mask = nvptx_process_pars (par->inner);
3597 inner_mask |= par->inner_mask;
3598 }
3599
3600 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3601 /* No propagation needed for a call. */;
3602  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3603 {
3604 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
3605 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
3606 /* Insert begin and end synchronizations. */
3607 emit_insn_after (nvptx_wsync (false), par->forked_insn);
3608 emit_insn_before (nvptx_wsync (true), par->joining_insn);
3609 }
3610 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3611 nvptx_vpropagate (par->forked_block, par->forked_insn);
3612
3613 /* Now do siblings. */
3614 if (par->next)
3615 inner_mask |= nvptx_process_pars (par->next);
3616 return inner_mask;
3617}
3618
3619/* Neuter the parallel described by PAR. We recurse in depth-first
3620 order. MODES are the partitioning of the execution and OUTER is
3621 the partitioning of the parallels we are contained in. */
3622
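/* An illustrative reading of the case analysis below: if a mode is
   already partitioned by this parallel's inner parallels, the blocks
   here must be neutered one by one for that mode, since the region
   cannot be skipped as a whole; if instead the mode is unused inside
   this parallel and the parent cannot skip it (there is no parent, or
   the parent itself contains that partitioning), the entire parallel
   is skipped via nvptx_skip_par.  */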
3623static void
3624nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
3625{
3626 unsigned me = (par->mask
3627 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
3628 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
3629 unsigned skip_mask = 0, neuter_mask = 0;
3630
3631 if (par->inner)
3632 nvptx_neuter_pars (par->inner, modes, outer | me);
3633
3634 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
3635 {
3636 if ((outer | me) & GOMP_DIM_MASK (mode))
3637 {} /* Mode is partitioned: no neutering. */
3638 else if (!(modes & GOMP_DIM_MASK (mode)))
3639	{} /* Mode is not used: nothing to do. */
3640 else if (par->inner_mask & GOMP_DIM_MASK (mode)
3641 || !par->forked_insn)
3643	/* Partitioned in inner parallels, or we're not partitioned
3644	   at all: neuter individual blocks. */
3644 neuter_mask |= GOMP_DIM_MASK (mode);
3645 else if (!par->parent || !par->parent->forked_insn
3646 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
3647	/* Parent isn't a parallel, or already contains this partitioning:
3648	   skip the parallel at this level. */
3649 skip_mask |= GOMP_DIM_MASK (mode);
3650 else
3651 {} /* Parent will skip this parallel itself. */
3652 }
3653
3654 if (neuter_mask)
3655 {
3656      int ix, len;
3657
3658 if (nvptx_optimize)
3659 {
3660 /* Neuter whole SESE regions. */
3661 bb_pair_vec_t regions;
3662
3663 nvptx_find_sese (par->blocks, regions);
3664 len = regions.length ();
3665 for (ix = 0; ix != len; ix++)
3666 {
3667 basic_block from = regions[ix].first;
3668 basic_block to = regions[ix].second;
3669
3670 if (from)
3671 nvptx_single (neuter_mask, from, to);
3672 else
3673 gcc_assert (!to);
3674 }
3675 }
3676 else
d88cd9c4 3677 {
912442c2
NS
3678 /* Neuter each BB individually. */
3679 len = par->blocks.length ();
3680 for (ix = 0; ix != len; ix++)
3681 {
3682 basic_block block = par->blocks[ix];
d88cd9c4 3683
912442c2
NS
3684 nvptx_single (neuter_mask, block, block);
3685 }
d88cd9c4
NS
3686 }
3687 }
3688
3689 if (skip_mask)
3690 nvptx_skip_par (skip_mask, par);
3691
3692 if (par->next)
3693 nvptx_neuter_pars (par->next, modes, outer);
3694}
3695
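/* Editor's note, an illustrative walk-through rather than original
   text: consider a worker-partitioned parallel whose inner parallel
   is vector-partitioned.  For GOMP_DIM_WORKER, (OUTER | ME) has the
   bit set, so nothing happens.  For GOMP_DIM_VECTOR, PAR->inner_mask
   has the bit, so NEUTER_MASK gets it and every block (or SESE
   region) outside the inner parallel is wrapped by nvptx_single to
   execute on a single vector lane.  Neutering predicates code to one
   thread of a mode; skipping (nvptx_skip_par) instead branches around
   an entire contained parallel.  */
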
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
     unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs.  */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = get_oacc_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      unsigned mask = 0;
      tree dims = TREE_VALUE (attr);
      unsigned ix;

      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	{
	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
	  tree allowed = TREE_PURPOSE (dims);

	  if (size != 1 && !(allowed && integer_zerop (allowed)))
	    mask |= GOMP_DIM_MASK (ix);
	}
      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
\f
/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }
  else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
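
/* Editor's note, a usage sketch that is not part of the original
   source: per the handler above, "kernel" only attaches to function
   declarations with a void return type.  */
#if 0
__attribute__((kernel)) void entry_point (int *out);	/* Accepted.  */
__attribute__((kernel)) int broken (void);	/* error: requires a void
						   return type.  */
__attribute__((kernel)) int not_a_fn;	/* error: only applies to
					   functions.  */
#endif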
\f
/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));

  return MIN (align, BIGGEST_ALIGNMENT);
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be used with a
   host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}
\f
/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = get_oacc_fn_attrib (decl);
	tree dims = TREE_VALUE (attr);
	unsigned ix;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}

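/* Editor's note, sample output assembled from the format strings
   above (the symbol names are invented): with GOMP_DIM_MAX == 3 a
   function gets one hex launch-dimension size per axis, e.g.

     //:VAR_MAP "some_global"
     //:FUNC_MAP "offloaded_fn", 0x20, 0x20, 0x20

   mkoffload scans these comment lines in the generated assembly to
   build its mapping table.  */
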
/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);
  fputs ("\t.version\t3.1\n", asm_out_file);
  fputs ("\t.target\tsm_30\n", asm_out_file);
  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
  fputs ("// END PREAMBLE\n", asm_out_file);
}

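/* Editor's note, the resulting preamble, taken directly from the
   fputs/fprintf calls above and assuming a 64-bit Pmode:

     // BEGIN PREAMBLE
	.version	3.1
	.target	sm_30
	.address_size 64
     // END PREAMBLE
*/
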
/* Emit a declaration for a worker-level buffer in .shared memory.  */

static void
write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}

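/* Editor's note, a sample emitted line (the buffer name and sizes are
   illustrative; the format string is the one above):

     .shared .align 8 .u8 __worker_bcast[8];

   i.e. an untyped byte array in .shared memory, sized and aligned to
   the largest use recorded during compilation.  */
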
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    write_worker_buffer (asm_out_file, worker_bcast_sym,
			 worker_bcast_align, worker_bcast_size);

  if (worker_red_size)
    write_worker_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);
}

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}

/* Worker reduction address expander.  */

static rtx
nvptx_expand_worker_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  if (align > worker_red_align)
    worker_red_align = align;

  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  if (size + offset > worker_red_size)
    worker_red_size = size + offset;

  rtx addr = worker_red_sym;
  if (offset)
    {
      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
      addr = gen_rtx_CONST (Pmode, addr);
    }

  emit_move_insn (target, addr);

  return target;
}

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}

/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}

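/* Editor's note, a sketch (not original source) of the builtins as
   they would appear after lowering; in each DEF list above the first
   type is the return type, the rest are parameters.  Shuffle kind 1
   is assumed to be SHUFFLE_DOWN from the enum near the top of the
   file, and cmp_swap is assumed to return the prior contents of the
   cell, matching the cmp&swap pseudocode later in this file.  */
#if 0
unsigned x = __builtin_nvptx_shuffle (v, 16, 1);  /* shfl.down by 16 lanes.  */
void *slot = __builtin_nvptx_worker_addr (0 /* offset */, 4 /* size */,
					  4 /* align */);
unsigned prev = __builtin_nvptx_cmp_swap (slot, expected, desired);
#endif
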
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}
\f
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT 32

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		    dims[GOMP_DIM_VECTOR]
		    ? "using vector_length (%d), ignoring %d"
		    : "using vector_length (%d), ignoring runtime setting",
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check the num workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
      changed = true;
    }

  return changed;
}

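/* Editor's note, a worked example (not original text) of the clamping
   above.  An offloaded region compiled with explicit clauses

     #pragma acc parallel num_workers(64) vector_length(64)

   produces "using vector_length (32), ignoring 64" and "using
   num_workers (32), ignoring 64", and DIMS becomes {gang, 32, 32};
   when validating the global defaults (DECL null), the remaining
   unset axes are filled in with the {32, 32, 32} fallback.  */
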
/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}

/* Emit a SHFL.DOWN of VAR into DEST_VAR using shift amount SHIFT.
   This function will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
     return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

     while (cmp&swap (&lock_var, 0, 1))
       continue;
     T accum = *ptr;
     accum = accum OP var;
     *ptr = accum;
     cmp&swap (&lock_var, 1, 0);
     return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}

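/* Editor's note, a dispatch summary (not original text): a 32-bit
   int or float reduction takes the lockless cmp&swap path, a 64-bit
   double likewise via the 64-bit builtin, while e.g. complex double
   (128 bits) has no cmp&swap of its own and falls back to the
   global-lock scheme in nvptx_lockfull_update.  */
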
/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

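/* Editor's note, a sketch (not original source) of the shuffle tree
   the vector case above unrolls for PTX_VECTOR_LENGTH == 32, taking
   addition as the reduction operator; the names are invented:  */
#if 0
t = shuffle_down (var, 16); var = var + t;  /* lanes 0-15 += lanes 16-31 */
t = shuffle_down (var, 8);  var = var + t;
t = shuffle_down (var, 4);  var = var + t;
t = shuffle_down (var, 2);  var = var + t;
t = shuffle_down (var, 1);  var = var + t;  /* lane 0 now holds the sum */
#endif
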
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

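/* Editor's note, the call shape the four handlers above decode, as
   implied by the gimple_call_arg indices they use (a summary, not
   original text):

     lhs = GOACC_REDUCTION (code, ref_to_res, var, level, op, offset)

   Arg 0 selects SETUP/INIT/FINI/TEARDOWN, arg 1 is the receiver
   object (or 0), arg 2 the local value, arg 3 the GOMP_DIM_* level,
   arg 4 the reduction tree_code, and arg 5 the worker-buffer
   offset.  */
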
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"