8ce80784 1/* Target code for NVPTX.
aad93da1 2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
8ce80784 3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21#include "config.h"
6401d4b8 22#include <sstream>
8ce80784 23#include "system.h"
24#include "coretypes.h"
9ef16211 25#include "backend.h"
c1eb80de 26#include "target.h"
8ce80784 27#include "rtl.h"
c1eb80de 28#include "tree.h"
29#include "cfghooks.h"
9ef16211 30#include "df.h"
ad7b10a2 31#include "memmodel.h"
c1eb80de 32#include "tm_p.h"
33#include "expmed.h"
34#include "optabs.h"
35#include "regs.h"
36#include "emit-rtl.h"
37#include "recog.h"
38#include "diagnostic.h"
b20a8bb4 39#include "alias.h"
8ce80784 40#include "insn-flags.h"
41#include "output.h"
42#include "insn-attr.h"
d53441c8 43#include "flags.h"
d53441c8 44#include "dojump.h"
45#include "explow.h"
46#include "calls.h"
d53441c8 47#include "varasm.h"
48#include "stmt.h"
8ce80784 49#include "expr.h"
8ce80784 50#include "tm-preds.h"
51#include "tm-constrs.h"
8ce80784 52#include "langhooks.h"
53#include "dbxout.h"
8ce80784 54#include "cfgrtl.h"
b3787ae4 55#include "gimple.h"
8ce80784 56#include "stor-layout.h"
8ce80784 57#include "builtins.h"
4954efd4 58#include "omp-general.h"
e561d5e1 59#include "omp-low.h"
60#include "gomp-constants.h"
b3787ae4 61#include "dumpfile.h"
78a78aac 62#include "internal-fn.h"
63#include "gimple-iterator.h"
64#include "stringpool.h"
30a86690 65#include "attribs.h"
bd02df23 66#include "tree-vrp.h"
78a78aac 67#include "tree-ssa-operands.h"
68#include "tree-ssanames.h"
69#include "gimplify.h"
70#include "tree-phinodes.h"
71#include "cfgloop.h"
72#include "fold-const.h"
6ee7a985 73#include "intl.h"
8ce80784 74
0c71fb4f 75/* This file should be included last. */
4b498588 76#include "target-def.h"
77
8b73a457 78#define WORKAROUND_PTXJIT_BUG 1
79
d7ed88be 80/* The various PTX memory areas an object might reside in. */
81enum nvptx_data_area
82{
83 DATA_AREA_GENERIC,
84 DATA_AREA_GLOBAL,
85 DATA_AREA_SHARED,
86 DATA_AREA_LOCAL,
87 DATA_AREA_CONST,
88 DATA_AREA_PARAM,
89 DATA_AREA_MAX
90};
91
92/* We record the data area in the target symbol flags. */
93#define SYMBOL_DATA_AREA(SYM) \
94 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
95 & 7)
96#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
97 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
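/* For example (illustrative, not from the sources): since
   DATA_AREA_SHARED is 2 in the enum above,
   SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED) ORs
   2 << SYMBOL_FLAG_MACH_DEP_SHIFT into SYMBOL_REF_FLAGS (sym), and
   SYMBOL_DATA_AREA recovers it by shifting back down and masking
   with 7, a mask wide enough for all values below DATA_AREA_MAX.  */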
98
8ce80784 99/* Record the function decls we've written, and the libfuncs and function
100 decls corresponding to them. */
101static std::stringstream func_decls;
b0c5be65 102
eae1ecb4 103struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
b0c5be65 104{
105 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
106 static bool equal (rtx a, rtx b) { return a == b; }
107};
108
109static GTY((cache))
110 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
111
eae1ecb4 112struct tree_hasher : ggc_cache_ptr_hash<tree_node>
b0c5be65 113{
114 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
115 static bool equal (tree a, tree b) { return a == b; }
116};
117
118static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
119static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
8ce80784 120
78a78aac 121/* Buffer needed to broadcast across workers. This is used for both
122 worker-neutering and worker broadcasting. It is shared by all
123 functions emitted. The buffer is placed in shared memory. It'd be
124 nice if PTX supported common blocks, because then this could be
125 shared across TUs (taking the largest size). */
b3787ae4 126static unsigned worker_bcast_size;
127static unsigned worker_bcast_align;
b3787ae4 128static GTY(()) rtx worker_bcast_sym;
129
78a78aac 130/* Buffer needed for worker reductions. This has to be distinct from
131 the worker broadcast array, as both may be live concurrently. */
132static unsigned worker_red_size;
133static unsigned worker_red_align;
78a78aac 134static GTY(()) rtx worker_red_sym;
135
1927fff5 136/* Global lock variable, needed for 128-bit worker & gang reductions. */
137static GTY(()) tree global_lock_var;
138
7fce8768 139/* True if any function references __nvptx_stacks. */
140static bool need_softstack_decl;
141
142/* True if any function references __nvptx_uni. */
143static bool need_unisimt_decl;
144
8ce80784 145/* Allocate a new, cleared machine_function structure. */
146
147static struct machine_function *
148nvptx_init_machine_status (void)
149{
150 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
6e71bbf2 151 p->return_mode = VOIDmode;
8ce80784 152 return p;
153}
154
7fce8768 155/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
156 and -fopenacc is also enabled. */
157
158static void
159diagnose_openacc_conflict (bool optval, const char *optname)
160{
161 if (flag_openacc && optval)
162 error ("option %s is not supported together with -fopenacc", optname);
163}
164
8ce80784 165/* Implement TARGET_OPTION_OVERRIDE. */
166
167static void
168nvptx_option_override (void)
169{
170 init_machine_status = nvptx_init_machine_status;
c68e61d8 171
5e95fd5f 172 /* Set toplevel_reorder, unless explicitly disabled. We need
173 reordering so that we emit necessary assembler decls of
174 undeclared variables. */
175 if (!global_options_set.x_flag_toplevel_reorder)
176 flag_toplevel_reorder = 1;
177
c68e61d8 178 /* Set flag_no_common, unless explicitly disabled. We fake common
179 using .weak, and that's not entirely accurate, so avoid it
180 unless forced. */
181 if (!global_options_set.x_flag_no_common)
182 flag_no_common = 1;
183
bf37bfae 184 /* The patch area requires nops, which we don't have. */
185 if (function_entry_patch_area_size > 0)
186 sorry ("not generating patch area, nops not supported");
187
8ce80784 188 /* Assumes that it will see only hard registers. */
189 flag_var_tracking = 0;
fecf1848 190
8b921b21 191 if (nvptx_optimize < 0)
192 nvptx_optimize = optimize > 0;
193
b0c5be65 194 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
195 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
8ce80784 196 declared_libfuncs_htab
b0c5be65 197 = hash_table<declared_libfunc_hasher>::create_ggc (17);
b3787ae4 198
31a633e4 199 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
d7ed88be 200 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
b3787ae4 201 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
78a78aac 202
31a633e4 203 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
d7ed88be 204 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
78a78aac 205 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
7fce8768 206
207 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
208 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
209 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
210
211 if (TARGET_GOMP)
212 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
8ce80784 213}
214
8ce80784 215/* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
 216 deal with ptx idiosyncrasies. */
217
218const char *
219nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
220{
221 switch (mode)
222 {
916ace94 223 case E_BLKmode:
8ce80784 224 return ".b8";
916ace94 225 case E_BImode:
8ce80784 226 return ".pred";
916ace94 227 case E_QImode:
8ce80784 228 if (promote)
229 return ".u32";
230 else
231 return ".u8";
916ace94 232 case E_HImode:
8ce80784 233 return ".u16";
916ace94 234 case E_SImode:
8ce80784 235 return ".u32";
916ace94 236 case E_DImode:
8ce80784 237 return ".u64";
238
916ace94 239 case E_SFmode:
8ce80784 240 return ".f32";
916ace94 241 case E_DFmode:
8ce80784 242 return ".f64";
243
916ace94 244 case E_V2SImode:
fcac805e 245 return ".v2.u32";
916ace94 246 case E_V2DImode:
ffaae5bd 247 return ".v2.u64";
fcac805e 248
8ce80784 249 default:
250 gcc_unreachable ();
251 }
252}
253
d7ed88be 254/* Encode the PTX data area that DECL (which might not actually be a
255 _DECL) should reside in. */
ef33ea8e 256
d7ed88be 257static void
258nvptx_encode_section_info (tree decl, rtx rtl, int first)
ef33ea8e 259{
d7ed88be 260 default_encode_section_info (decl, rtl, first);
261 if (first && MEM_P (rtl))
262 {
263 nvptx_data_area area = DATA_AREA_GENERIC;
ef33ea8e 264
d7ed88be 265 if (TREE_CONSTANT (decl))
266 area = DATA_AREA_CONST;
267 else if (TREE_CODE (decl) == VAR_DECL)
7fce8768 268 {
269 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
270 {
271 area = DATA_AREA_SHARED;
272 if (DECL_INITIAL (decl))
273 error ("static initialization of variable %q+D in %<.shared%>"
274 " memory is not supported", decl);
275 }
276 else
277 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
278 }
ef33ea8e 279
d7ed88be 280 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
281 }
282}
283
284/* Return the PTX name of the data area in which SYM should be
285 placed. The symbol must have already been processed by
 286 nvptx_encode_section_info, or equivalent. */
287
288static const char *
289section_for_sym (rtx sym)
290{
291 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
292 /* Same order as nvptx_data_area enum. */
293 static char const *const areas[] =
294 {"", ".global", ".shared", ".local", ".const", ".param"};
295
296 return areas[area];
297}
298
299/* Similarly for a decl. */
300
301static const char *
302section_for_decl (const_tree decl)
303{
304 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
ef33ea8e 305}
306
16e75570 307/* Check NAME for special function names and redirect them by returning a
308 replacement. This applies to malloc, free and realloc, for which we
309 want to use libgcc wrappers, and call, which triggers a bug in
310 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
311 not active in an offload compiler -- the names are all set by the
312 host-side compiler. */
313
314static const char *
315nvptx_name_replacement (const char *name)
316{
317 if (strcmp (name, "call") == 0)
318 return "__nvptx_call";
319 if (strcmp (name, "malloc") == 0)
320 return "__nvptx_malloc";
321 if (strcmp (name, "free") == 0)
322 return "__nvptx_free";
323 if (strcmp (name, "realloc") == 0)
324 return "__nvptx_realloc";
325 return name;
326}
327
6526ac4a 328/* If MODE should be treated as two registers of an inner mode, return
329 that inner mode. Otherwise return VOIDmode. */
8ce80784 330
6526ac4a 331static machine_mode
332maybe_split_mode (machine_mode mode)
8ce80784 333{
8ce80784 334 if (COMPLEX_MODE_P (mode))
6526ac4a 335 return GET_MODE_INNER (mode);
8ce80784 336
8ce80784 337 if (mode == TImode)
6526ac4a 338 return DImode;
339
340 return VOIDmode;
8ce80784 341}
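/* For example, TImode values are handled as two DImode registers and
   DCmode values as two DFmode registers, while SImode is not split
   and yields VOIDmode here.  */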
342
9f547971 343/* Return true if mode should be treated as two registers. */
344
345static bool
346split_mode_p (machine_mode mode)
347{
348 return maybe_split_mode (mode) != VOIDmode;
349}
350
6196ad64 351/* Output a register, subreg, or register pair (with optional
352 enclosing braces). */
353
354static void
355output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
356 int subreg_offset = -1)
357{
358 if (inner_mode == VOIDmode)
359 {
360 if (HARD_REGISTER_NUM_P (regno))
361 fprintf (file, "%s", reg_names[regno]);
362 else
363 fprintf (file, "%%r%d", regno);
364 }
365 else if (subreg_offset >= 0)
366 {
367 output_reg (file, regno, VOIDmode);
368 fprintf (file, "$%d", subreg_offset);
369 }
370 else
371 {
372 if (subreg_offset == -1)
373 fprintf (file, "{");
374 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
375 fprintf (file, ",");
376 output_reg (file, regno, inner_mode, 0);
377 if (subreg_offset == -1)
378 fprintf (file, "}");
379 }
380}
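/* A sketch of the resulting output (register number illustrative):
   for pseudo 23 holding a TImode value split into DImode halves,
   output_reg (f, 23, DImode) prints "{%r23$8,%r23$0}" -- the two
   sub-register halves wrapped in braces -- while passing -2 as
   SUBREG_OFFSET, as the register-declaration code below does, prints
   the same pair without the enclosing braces.  */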
381
b3787ae4 382/* Emit forking instructions for MASK. */
383
384static void
385nvptx_emit_forking (unsigned mask, bool is_call)
386{
387 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
388 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
389 if (mask)
390 {
391 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
392
393 /* Emit fork at all levels. This helps form SESE regions, as
394 it creates a block with a single successor before entering a
 395 partitioned region. That is a good candidate for the end of
396 an SESE region. */
397 if (!is_call)
398 emit_insn (gen_nvptx_fork (op));
399 emit_insn (gen_nvptx_forked (op));
400 }
401}
402
403/* Emit joining instructions for MASK. */
404
405static void
406nvptx_emit_joining (unsigned mask, bool is_call)
407{
408 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
409 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
410 if (mask)
411 {
412 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
413
414 /* Emit joining for all non-call pars to ensure there's a single
415 predecessor for the block the join insn ends up in. This is
416 needed for skipping entire loops. */
417 if (!is_call)
418 emit_insn (gen_nvptx_joining (op));
419 emit_insn (gen_nvptx_join (op));
420 }
421}
422
8ce80784 423\f
df931be4 424/* Determine whether MODE and TYPE (possibly NULL) should be passed or
425 returned in memory. Integer and floating types supported by the
426 machine are passed in registers, everything else is passed in
427 memory. Complex types are split. */
428
429static bool
430pass_in_memory (machine_mode mode, const_tree type, bool for_return)
431{
432 if (type)
433 {
434 if (AGGREGATE_TYPE_P (type))
435 return true;
436 if (TREE_CODE (type) == VECTOR_TYPE)
437 return true;
438 }
439
440 if (!for_return && COMPLEX_MODE_P (mode))
441 /* Complex types are passed as two underlying args. */
442 mode = GET_MODE_INNER (mode);
443
444 if (GET_MODE_CLASS (mode) != MODE_INT
445 && GET_MODE_CLASS (mode) != MODE_FLOAT)
446 return true;
447
448 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
449 return true;
450
451 return false;
452}
453
454/* A non-memory argument of mode MODE is being passed, determine the mode it
455 should be promoted to. This is also used for determining return
456 type promotion. */
457
458static machine_mode
459promote_arg (machine_mode mode, bool prototyped)
460{
461 if (!prototyped && mode == SFmode)
462 /* K&R float promotion for unprototyped functions. */
463 mode = DFmode;
464 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
465 mode = SImode;
466
467 return mode;
468}
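/* Thus an unprototyped SFmode argument is passed as DFmode (the K&R
   default promotion), sub-word integer arguments are widened to
   SImode, and anything at least word-sized is passed unchanged.  */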
469
470/* A non-memory return type of MODE is being returned. Determine the
471 mode it should be promoted to. */
472
473static machine_mode
474promote_return (machine_mode mode)
475{
476 return promote_arg (mode, true);
477}
478
fcfe0df4 479/* Implement TARGET_FUNCTION_ARG. */
e78f526f 480
fcfe0df4 481static rtx
c666c7b6 482nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
fcfe0df4 483 const_tree, bool named)
484{
c666c7b6 485 if (mode == VOIDmode || !named)
fcfe0df4 486 return NULL_RTX;
8ce80784 487
c666c7b6 488 return gen_reg_rtx (mode);
fcfe0df4 489}
490
491/* Implement TARGET_FUNCTION_INCOMING_ARG. */
492
493static rtx
494nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
495 const_tree, bool named)
8ce80784 496{
fcfe0df4 497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
e78f526f 498
c666c7b6 499 if (mode == VOIDmode || !named)
fcfe0df4 500 return NULL_RTX;
8ce80784 501
fcfe0df4 502 /* No need to deal with split modes here, the only case that can
503 happen is complex modes and those are dealt with by
504 TARGET_SPLIT_COMPLEX_ARG. */
505 return gen_rtx_UNSPEC (mode,
506 gen_rtvec (1, GEN_INT (cum->count)),
507 UNSPEC_ARG_REG);
508}
509
510/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
511
512static void
513nvptx_function_arg_advance (cumulative_args_t cum_v,
514 machine_mode ARG_UNUSED (mode),
515 const_tree ARG_UNUSED (type),
516 bool ARG_UNUSED (named))
517{
518 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
c666c7b6 519
fcfe0df4 520 cum->count++;
521}
522
a2246979 523/* Implement TARGET_FUNCTION_ARG_BOUNDARY.
524
 525 For nvptx, this is only used for variadic args. The type has already
526 been promoted and/or converted to invisible reference. */
527
528static unsigned
529nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
530{
531 return GET_MODE_ALIGNMENT (mode);
532}
533
fcfe0df4 534/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
535
536 For nvptx, we know how to handle functions declared as stdarg: by
537 passing an extra pointer to the unnamed arguments. However, the
538 Fortran frontend can produce a different situation, where a
539 function pointer is declared with no arguments, but the actual
540 function and calls to it take more arguments. In that case, we
541 want to ensure the call matches the definition of the function. */
542
543static bool
544nvptx_strict_argument_naming (cumulative_args_t cum_v)
545{
546 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
c666c7b6 547
fcfe0df4 548 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
549}
550
fcfe0df4 551/* Implement TARGET_LIBCALL_VALUE. */
552
553static rtx
554nvptx_libcall_value (machine_mode mode, const_rtx)
555{
5fbb6177 556 if (!cfun || !cfun->machine->doing_call)
fcfe0df4 557 /* Pretend to return in a hard reg for early uses before pseudos can be
558 generated. */
559 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
c666c7b6 560
fcfe0df4 561 return gen_reg_rtx (mode);
562}
563
564/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
565 where function FUNC returns or receives a value of data type TYPE. */
566
567static rtx
df931be4 568nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
fcfe0df4 569 bool outgoing)
570{
df931be4 571 machine_mode mode = promote_return (TYPE_MODE (type));
572
fcfe0df4 573 if (outgoing)
8f46e324 574 {
5fbb6177 575 gcc_assert (cfun);
6e71bbf2 576 cfun->machine->return_mode = mode;
8f46e324 577 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
578 }
c666c7b6 579
580 return nvptx_libcall_value (mode, NULL_RTX);
fcfe0df4 581}
582
583/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
584
585static bool
586nvptx_function_value_regno_p (const unsigned int regno)
587{
588 return regno == NVPTX_RETURN_REGNUM;
589}
590
591/* Types with a mode other than those supported by the machine are passed by
592 reference in memory. */
593
594static bool
7bb66bb9 595nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
596 machine_mode mode, const_tree type,
597 bool ARG_UNUSED (named))
fcfe0df4 598{
df931be4 599 return pass_in_memory (mode, type, false);
fcfe0df4 600}
601
602/* Implement TARGET_RETURN_IN_MEMORY. */
603
604static bool
605nvptx_return_in_memory (const_tree type, const_tree)
606{
df931be4 607 return pass_in_memory (TYPE_MODE (type), type, true);
fcfe0df4 608}
609
610/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
611
612static machine_mode
613nvptx_promote_function_mode (const_tree type, machine_mode mode,
df931be4 614 int *ARG_UNUSED (punsignedp),
fcfe0df4 615 const_tree funtype, int for_return)
616{
df931be4 617 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
fcfe0df4 618}
619
fcfe0df4 620/* Helper for write_arg. Emit a single PTX argument of MODE, either
621 in a prototype, or as copy in a function prologue. ARGNO is the
622 index of this argument in the PTX function. FOR_REG is negative,
623 if we're emitting the PTX prototype. It is zero if we're copying
624 to an argument register and it is greater than zero if we're
625 copying to a specific hard register. */
626
627static int
ffd95e04 628write_arg_mode (std::stringstream &s, int for_reg, int argno,
629 machine_mode mode)
fcfe0df4 630{
631 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
632
e78f526f 633 if (for_reg < 0)
634 {
635 /* Writing PTX prototype. */
636 s << (argno ? ", " : " (");
fcfe0df4 637 s << ".param" << ptx_type << " %in_ar" << argno;
e78f526f 638 }
639 else
640 {
fcfe0df4 641 s << "\t.reg" << ptx_type << " ";
e78f526f 642 if (for_reg)
643 s << reg_names[for_reg];
644 else
645 s << "%ar" << argno;
646 s << ";\n";
7bb66bb9 647 if (argno >= 0)
648 {
649 s << "\tld.param" << ptx_type << " ";
650 if (for_reg)
651 s << reg_names[for_reg];
652 else
653 s << "%ar" << argno;
654 s << ", [%in_ar" << argno << "];\n";
655 }
e78f526f 656 }
657 return argno + 1;
8ce80784 658}
659
fcfe0df4 660/* Process function parameter TYPE to emit one or more PTX
ffd95e04 661 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
df931be4 662 is true, if this is a prototyped function, rather than an old-style
663 C declaration. Returns the next argument number to use.
fcfe0df4 664
67cf9b55 665 The promotion behavior here must match the regular GCC function
fcfe0df4 666 parameter marshalling machinery. */
667
668static int
ffd95e04 669write_arg_type (std::stringstream &s, int for_reg, int argno,
670 tree type, bool prototyped)
fcfe0df4 671{
672 machine_mode mode = TYPE_MODE (type);
673
674 if (mode == VOIDmode)
675 return argno;
676
df931be4 677 if (pass_in_memory (mode, type, false))
fcfe0df4 678 mode = Pmode;
df931be4 679 else
680 {
681 bool split = TREE_CODE (type) == COMPLEX_TYPE;
fcfe0df4 682
df931be4 683 if (split)
684 {
685 /* Complex types are sent as two separate args. */
686 type = TREE_TYPE (type);
7bb66bb9 687 mode = TYPE_MODE (type);
df931be4 688 prototyped = true;
689 }
fcfe0df4 690
df931be4 691 mode = promote_arg (mode, prototyped);
692 if (split)
ffd95e04 693 argno = write_arg_mode (s, for_reg, argno, mode);
fcfe0df4 694 }
fcfe0df4 695
ffd95e04 696 return write_arg_mode (s, for_reg, argno, mode);
697}
698
699/* Emit a PTX return as a prototype or function prologue declaration
700 for MODE. */
701
702static void
703write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
704{
705 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
706 const char *pfx = "\t.reg";
707 const char *sfx = ";\n";
708
709 if (for_proto)
710 pfx = "(.param", sfx = "_out) ";
711
712 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
fcfe0df4 713}
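/* Example of what this emits, assuming reg_names[NVPTX_RETURN_REGNUM]
   is "%value": for a DImode return it writes
   "(.param.u64 %value_out) " into a prototype, or a
   ".reg.u64 %value;" declaration into a function prologue.  */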
714
df931be4 715/* Process a function return TYPE to emit a PTX return as a prototype
ffd95e04 716 or function prologue declaration. Returns true if return is via an
67cf9b55 717 additional pointer parameter. The promotion behavior here must
ffd95e04 718 match the regular GCC function return marshalling. */
df931be4 719
a01b98fc 720static bool
ffd95e04 721write_return_type (std::stringstream &s, bool for_proto, tree type)
a01b98fc 722{
723 machine_mode mode = TYPE_MODE (type);
a01b98fc 724
df931be4 725 if (mode == VOIDmode)
726 return false;
727
728 bool return_in_mem = pass_in_memory (mode, type, true);
729
730 if (return_in_mem)
a01b98fc 731 {
df931be4 732 if (for_proto)
733 return return_in_mem;
734
735 /* Named return values can cause us to return a pointer as well
736 as expect an argument for the return location. This is
737 optimization-level specific, so no caller can make use of
738 this data, but more importantly for us, we must ensure it
739 doesn't change the PTX prototype. */
6e71bbf2 740 mode = (machine_mode) cfun->machine->return_mode;
8f46e324 741
df931be4 742 if (mode == VOIDmode)
743 return return_in_mem;
744
6e71bbf2 745 /* Clear return_mode to inhibit copy of retval to non-existent
df931be4 746 retval parameter. */
6e71bbf2 747 cfun->machine->return_mode = VOIDmode;
a01b98fc 748 }
749 else
df931be4 750 mode = promote_return (mode);
751
ffd95e04 752 write_return_mode (s, for_proto, mode);
a01b98fc 753
754 return return_in_mem;
755}
756
8ce80784 757/* Look for attributes in ATTRS that would indicate we must write a function
758 as a .entry kernel rather than a .func. Return true if one is found. */
759
760static bool
761write_as_kernel (tree attrs)
762{
763 return (lookup_attribute ("kernel", attrs) != NULL_TREE
7fce8768 764 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
765 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
766 /* For OpenMP target regions, the corresponding kernel entry is emitted from
767 write_omp_entry as a separate function. */
8ce80784 768}
769
c0ddd9a0 770/* Emit a linker marker for a function decl or defn. */
771
772static void
773write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
774 const char *name)
775{
776 s << "\n// BEGIN";
777 if (globalize)
778 s << " GLOBAL";
779 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
780 s << name << "\n";
781}
782
783/* Emit a linker marker for a variable decl or defn. */
784
785static void
786write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
787{
788 fprintf (file, "\n// BEGIN%s VAR %s: ",
789 globalize ? " GLOBAL" : "",
790 is_defn ? "DEF" : "DECL");
791 assemble_name_raw (file, name);
792 fputs ("\n", file);
793}
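/* These markers come out as, e.g. (illustrative names):
     // BEGIN GLOBAL FUNCTION DECL: foo
     // BEGIN VAR DEF: bar
   so that the nvptx linker can locate the pieces of PTX it needs by
   scanning for the comments.  */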
794
087b2f04 795/* Write a .func or .kernel declaration or definition along with
796 a helper comment for use by ld. S is the stream to write to, DECL
797 the decl for the function with name NAME. For definitions, emit
798 a declaration too. */
8ce80784 799
087b2f04 800static const char *
801write_fn_proto (std::stringstream &s, bool is_defn,
802 const char *name, const_tree decl)
8ce80784 803{
087b2f04 804 if (is_defn)
805 /* Emit a declaration. The PTX assembler gets upset without it. */
806 name = write_fn_proto (s, false, name, decl);
16e75570 807 else
808 {
809 /* Avoid repeating the name replacement. */
810 name = nvptx_name_replacement (name);
811 if (name[0] == '*')
812 name++;
813 }
8ce80784 814
c0ddd9a0 815 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
087b2f04 816
817 /* PTX declaration. */
8ce80784 818 if (DECL_EXTERNAL (decl))
819 s << ".extern ";
820 else if (TREE_PUBLIC (decl))
c40a4143 821 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
087b2f04 822 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
8ce80784 823
087b2f04 824 tree fntype = TREE_TYPE (decl);
825 tree result_type = TREE_TYPE (fntype);
8ce80784 826
a39a0392 827 /* atomic_compare_exchange_$n builtins have an exceptional calling
828 convention. */
829 int not_atomic_weak_arg = -1;
830 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
831 switch (DECL_FUNCTION_CODE (decl))
832 {
833 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
834 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
835 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
836 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
838 /* These atomics skip the 'weak' parm in an actual library
839 call. We must skip it in the prototype too. */
840 not_atomic_weak_arg = 3;
841 break;
842
843 default:
844 break;
845 }
846
8ce80784 847 /* Declare the result. */
ffd95e04 848 bool return_in_mem = write_return_type (s, true, result_type);
8ce80784 849
087b2f04 850 s << name;
851
e78f526f 852 int argno = 0;
087b2f04 853
854 /* Emit argument list. */
855 if (return_in_mem)
ffd95e04 856 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
18cefec0 857
087b2f04 858 /* We get:
859 NULL in TYPE_ARG_TYPES, for old-style functions
860 NULL in DECL_ARGUMENTS, for builtin functions without another
861 declaration.
862 So we have to pick the best one we have. */
863 tree args = TYPE_ARG_TYPES (fntype);
e78f526f 864 bool prototyped = true;
865 if (!args)
866 {
867 args = DECL_ARGUMENTS (decl);
868 prototyped = false;
869 }
8ce80784 870
a39a0392 871 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
087b2f04 872 {
e78f526f 873 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
a39a0392 874
875 if (not_atomic_weak_arg)
876 argno = write_arg_type (s, -1, argno, type, prototyped);
877 else
878 gcc_assert (type == boolean_type_node);
8ce80784 879 }
8ce80784 880
087b2f04 881 if (stdarg_p (fntype))
ffd95e04 882 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
8ce80784 883
087b2f04 884 if (DECL_STATIC_CHAIN (decl))
ffd95e04 885 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
087b2f04 886
e78f526f 887 if (!argno && strcmp (name, "main") == 0)
087b2f04 888 {
ffd95e04 889 argno = write_arg_type (s, -1, argno, integer_type_node, true);
890 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
087b2f04 891 }
892
e78f526f 893 if (argno)
087b2f04 894 s << ")";
895
896 s << (is_defn ? "\n" : ";\n");
897
898 return name;
8ce80784 899}
900
2583dd18 901/* Construct a function declaration from a call insn. This can be
902 necessary for two reasons - either we have an indirect call which
903 requires a .callprototype declaration, or we have a libcall
904 generated by emit_library_call for which no decl exists. */
905
906static void
087b2f04 907write_fn_proto_from_insn (std::stringstream &s, const char *name,
908 rtx result, rtx pat)
2583dd18 909{
910 if (!name)
911 {
912 s << "\t.callprototype ";
913 name = "_";
914 }
915 else
916 {
16e75570 917 name = nvptx_name_replacement (name);
c0ddd9a0 918 write_fn_marker (s, false, true, name);
2583dd18 919 s << "\t.extern .func ";
920 }
921
922 if (result != NULL_RTX)
ffd95e04 923 write_return_mode (s, true, GET_MODE (result));
2583dd18 924
925 s << name;
926
2583dd18 927 int arg_end = XVECLEN (pat, 0);
928 for (int i = 1; i < arg_end; i++)
929 {
ffd95e04 930 /* We don't have to deal with mode splitting & promotion here,
931 as that was already done when generating the call
932 sequence. */
2583dd18 933 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
934
ffd95e04 935 write_arg_mode (s, -1, i - 1, mode);
2583dd18 936 }
937 if (arg_end != 1)
938 s << ")";
939 s << ";\n";
940}
941
2583dd18 942/* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
 943 table and write a ptx prototype. These are emitted at end of
944 compilation. */
8ce80784 945
2583dd18 946static void
947nvptx_record_fndecl (tree decl)
8ce80784 948{
b0c5be65 949 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
8ce80784 950 if (*slot == NULL)
951 {
952 *slot = decl;
953 const char *name = get_fnname_from_decl (decl);
087b2f04 954 write_fn_proto (func_decls, false, name, decl);
8ce80784 955 }
8ce80784 956}
957
2583dd18 958/* Record a libcall or unprototyped external function. CALLEE is the
959 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
960 declaration for it. */
961
962static void
963nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
964{
965 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
966 if (*slot == NULL)
967 {
968 *slot = callee;
969
970 const char *name = XSTR (callee, 0);
087b2f04 971 write_fn_proto_from_insn (func_decls, name, retval, pat);
2583dd18 972 }
973}
974
975/* DECL is an external FUNCTION_DECL, that we're referencing. If it
976 is prototyped, record it now. Otherwise record it as needed at end
977 of compilation, when we might have more information about it. */
8ce80784 978
979void
980nvptx_record_needed_fndecl (tree decl)
981{
2583dd18 982 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
983 {
984 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
985 if (*slot == NULL)
986 *slot = decl;
987 }
988 else
989 nvptx_record_fndecl (decl);
990}
8ce80784 991
2583dd18 992/* SYM is a SYMBOL_REF. If it refers to an external function, record
993 it as needed. */
994
995static void
996nvptx_maybe_record_fnsym (rtx sym)
997{
998 tree decl = SYMBOL_REF_DECL (sym);
999
1000 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1001 nvptx_record_needed_fndecl (decl);
8ce80784 1002}
1003
ffd95e04 1004/* Emit a local array to hold some part of a conventional stack frame
7bb66bb9 1005 and initialize REGNO to point to it. If the size is zero, it'll
1006 never be valid to dereference, so we can simply initialize to
1007 zero. */
ffd95e04 1008
1009static void
1010init_frame (FILE *file, int regno, unsigned align, unsigned size)
1011{
7bb66bb9 1012 if (size)
1013 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1014 align, reg_names[regno], size);
1015 fprintf (file, "\t.reg.u%d %s;\n",
1016 POINTER_SIZE, reg_names[regno]);
1017 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1018 : "\tmov.u%d %s, 0;\n"),
ffd95e04 1019 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1020}
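/* A sketch of the PTX this emits for a 16-byte frame with 8-byte
   alignment on a 64-bit target, assuming reg_names[regno] is
   "%frame":
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;
   With SIZE == 0 the array is omitted and the register is simply
   zeroed with "mov.u64 %frame, 0;".  */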
1021
7fce8768 1022/* Emit soft stack frame setup sequence. */
1023
1024static void
1025init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1026{
1027 /* Maintain 64-bit stack alignment. */
1028 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1029 size = ROUND_UP (size, keep_align);
1030 int bits = POINTER_SIZE;
1031 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1032 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1033 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1034 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1035 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1036 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1037 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1038 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1039 fprintf (file, "\t{\n");
1040 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1041 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1042 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1043 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1044 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1045 bits == 64 ? ".wide" : ".lo", bits / 8);
1046 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1047
1048 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1049 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1050
1051 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1052 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1053 bits, reg_sspprev, reg_sspslot);
1054
1055 /* Initialize %frame = %sspprev - size. */
1056 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1057 bits, reg_frame, reg_sspprev, size);
1058
1059 /* Apply alignment, if larger than 64. */
1060 if (alignment > keep_align)
1061 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1062 bits, reg_frame, reg_frame, -alignment);
1063
1064 size = crtl->outgoing_args_size;
1065 gcc_assert (size % keep_align == 0);
1066
1067 /* Initialize %stack. */
1068 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1069 bits, reg_stack, reg_frame, size);
1070
7fce8768 1071 if (!crtl->is_leaf)
1072 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1073 bits, reg_sspslot, reg_stack);
1074 fprintf (file, "\t}\n");
1075 cfun->machine->has_softstack = true;
1076 need_softstack_decl = true;
1077}
1078
b3787ae4 1079/* Emit code to initialize the REGNO predicate register to indicate
1080 whether we are not lane zero on the NAME axis. */
1081
1082static void
1083nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1084{
1085 fprintf (file, "\t{\n");
1086 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1087 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1088 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1089 fprintf (file, "\t}\n");
1090}
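/* For the "y" axis and predicate register 42 (register number
   illustrative), the emitted sequence looks like:
       {
               .reg.u32        %y;
               mov.u32 %y, %tid.y;
               setp.ne.u32     %r42, %y, 0;
       }
   i.e. the predicate becomes true for every thread whose lane index
   on that axis is non-zero.  */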
1091
7fce8768 1092/* Emit code to initialize predicate and master lane index registers for
1093 -muniform-simt code generation variant. */
1094
1095static void
1096nvptx_init_unisimt_predicate (FILE *file)
1097{
1b576300 1098 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1099 int loc = REGNO (cfun->machine->unisimt_location);
7fce8768 1100 int bits = POINTER_SIZE;
1b576300 1101 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
7fce8768 1102 fprintf (file, "\t{\n");
1103 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1104 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
7fce8768 1105 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1106 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1107 bits == 64 ? ".wide" : ".lo");
1b576300 1108 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1109 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1110 if (cfun->machine->unisimt_predicate)
1111 {
1112 int master = REGNO (cfun->machine->unisimt_master);
1113 int pred = REGNO (cfun->machine->unisimt_predicate);
1114 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1115 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1116 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1117 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1118 /* Compute predicate as 'tid.x == master'. */
1119 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1120 }
7fce8768 1121 fprintf (file, "\t}\n");
1122 need_unisimt_decl = true;
1123}
1124
1125/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1126
1127 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1128 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1129 {
1130 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1131 __nvptx_uni[tid.y] = 0;
1132 gomp_nvptx_main (ORIG, arg);
1133 }
1134 ORIG itself should not be emitted as a PTX .entry function. */
1135
1136static void
1137write_omp_entry (FILE *file, const char *name, const char *orig)
1138{
1139 static bool gomp_nvptx_main_declared;
1140 if (!gomp_nvptx_main_declared)
1141 {
1142 gomp_nvptx_main_declared = true;
1143 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1144 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1145 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1146 }
74a4a36d 1147 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1148#define NTID_Y "%ntid.y"
7fce8768 1149#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1150 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1151{\n\
1152 .reg.u32 %r<3>;\n\
1153 .reg.u" PS " %R<4>;\n\
1154 mov.u32 %r0, %tid.y;\n\
74a4a36d 1155 mov.u32 %r1, " NTID_Y ";\n\
7fce8768 1156 mov.u32 %r2, %ctaid.x;\n\
1157 cvt.u" PS ".u32 %R1, %r0;\n\
1158 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1159 mov.u" PS " %R0, __nvptx_stacks;\n\
1160 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1161 ld.param.u" PS " %R2, [%stack];\n\
1162 ld.param.u" PS " %R3, [%sz];\n\
1163 add.u" PS " %R2, %R2, %R3;\n\
1164 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1165 st.shared.u" PS " [%R0], %R2;\n\
1166 mov.u" PS " %R0, __nvptx_uni;\n\
1167 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1168 mov.u32 %r0, 0;\n\
1169 st.shared.u32 [%R0], %r0;\n\
1170 mov.u" PS " %R0, \0;\n\
1171 ld.param.u" PS " %R1, [%arg];\n\
1172 {\n\
1173 .param.u" PS " %P<2>;\n\
1174 st.param.u" PS " [%P0], %R0;\n\
1175 st.param.u" PS " [%P1], %R1;\n\
1176 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1177 }\n\
1178 ret.uni;\n\
1179}\n"
1180 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1181 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1182#undef ENTRY_TEMPLATE
74a4a36d 1183#undef NTID_Y
7fce8768 1184 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1185 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1186 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1187 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1188 need_softstack_decl = need_unisimt_decl = true;
1189}
1190
8ce80784 1191/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1192 function, including local var decls and copies from the arguments to
1193 local regs. */
1194
1195void
1196nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1197{
1198 tree fntype = TREE_TYPE (decl);
1199 tree result_type = TREE_TYPE (fntype);
e78f526f 1200 int argno = 0;
8ce80784 1201
7fce8768 1202 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1203 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1204 {
1205 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1206 sprintf (buf, "%s$impl", name);
1207 write_omp_entry (file, name, buf);
1208 name = buf;
1209 }
e78f526f 1210 /* We construct the initial part of the function into a string
1211 stream, in order to share the prototype writing code. */
8ce80784 1212 std::stringstream s;
087b2f04 1213 write_fn_proto (s, true, name, decl);
e78f526f 1214 s << "{\n";
8ce80784 1215
ffd95e04 1216 bool return_in_mem = write_return_type (s, false, result_type);
8ce80784 1217 if (return_in_mem)
ffd95e04 1218 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
e78f526f 1219
c8649f54 1220 /* Declare and initialize incoming arguments. */
e78f526f 1221 tree args = TYPE_ARG_TYPES (fntype);
1222 bool prototyped = true;
1223 if (!args)
c8649f54 1224 {
e78f526f 1225 args = DECL_ARGUMENTS (decl);
1226 prototyped = false;
c8649f54 1227 }
1228
1229 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1230 {
1231 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
c8649f54 1232
ffd95e04 1233 argno = write_arg_type (s, 0, argno, type, prototyped);
e78f526f 1234 }
c8649f54 1235
e78f526f 1236 if (stdarg_p (fntype))
7bb66bb9 1237 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
ffd95e04 1238 true);
c8649f54 1239
7bb66bb9 1240 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1241 write_arg_type (s, STATIC_CHAIN_REGNUM,
1242 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1243 true);
1244
e78f526f 1245 fprintf (file, "%s", s.str().c_str());
420aadad 1246
1b576300 1247 /* Usually 'crtl->is_leaf' is computed during register allocator
1248 initialization (which is not done on NVPTX) or for pressure-sensitive
1249 optimizations. Initialize it here, except if already set. */
1250 if (!crtl->is_leaf)
1251 crtl->is_leaf = leaf_function_p ();
1252
6e71bbf2 1253 HOST_WIDE_INT sz = get_frame_size ();
7fce8768 1254 bool need_frameptr = sz || cfun->machine->has_chain;
1255 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1256 if (!TARGET_SOFT_STACK)
1257 {
1258 /* Declare a local var for outgoing varargs. */
1259 if (cfun->machine->has_varadic)
1260 init_frame (file, STACK_POINTER_REGNUM,
1261 UNITS_PER_WORD, crtl->outgoing_args_size);
1262
1263 /* Declare a local variable for the frame. Force its size to be
1264 DImode-compatible. */
1265 if (need_frameptr)
1266 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1267 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1268 }
1b576300 1269 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1270 || (cfun->machine->has_simtreg && !crtl->is_leaf))
7fce8768 1271 init_softstack_frame (file, alignment, sz);
6e71bbf2 1272
1b576300 1273 if (cfun->machine->has_simtreg)
1274 {
1275 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1276 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1277 align = MAX (align, GET_MODE_SIZE (DImode));
1278 if (!crtl->is_leaf || cfun->calls_alloca)
1279 simtsz = HOST_WIDE_INT_M1U;
1280 if (simtsz == HOST_WIDE_INT_M1U)
1281 simtsz = nvptx_softstack_size;
1282 if (cfun->machine->has_softstack)
1283 simtsz += POINTER_SIZE / 8;
1284 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1285 if (align > GET_MODE_SIZE (DImode))
1286 simtsz += align - GET_MODE_SIZE (DImode);
1287 if (simtsz)
1288 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1289 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1290 }
8ce80784 1291 /* Declare the pseudos we have as ptx registers. */
1292 int maxregs = max_reg_num ();
1293 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1294 {
1295 if (regno_reg_rtx[i] != const0_rtx)
1296 {
1297 machine_mode mode = PSEUDO_REGNO_MODE (i);
6526ac4a 1298 machine_mode split = maybe_split_mode (mode);
6196ad64 1299
9f547971 1300 if (split_mode_p (mode))
6196ad64 1301 mode = split;
1302 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1303 output_reg (file, i, split, -2);
1304 fprintf (file, ";\n");
8ce80784 1305 }
1306 }
1307
b3787ae4 1308 /* Emit axis predicates. */
1309 if (cfun->machine->axis_predicate[0])
1310 nvptx_init_axis_predicate (file,
1311 REGNO (cfun->machine->axis_predicate[0]), "y");
1312 if (cfun->machine->axis_predicate[1])
1313 nvptx_init_axis_predicate (file,
1314 REGNO (cfun->machine->axis_predicate[1]), "x");
1b576300 1315 if (cfun->machine->unisimt_predicate
1316 || (cfun->machine->has_simtreg && !crtl->is_leaf))
7fce8768 1317 nvptx_init_unisimt_predicate (file);
8ce80784 1318}
1319
1b576300 1320/* Output code for switching uniform-simt state. ENTERING indicates whether
1321 we are entering or leaving non-uniform execution region. */
1322
1323static void
1324nvptx_output_unisimt_switch (FILE *file, bool entering)
1325{
1326 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1327 return;
1328 fprintf (file, "\t{\n");
1329 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1330 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1331 if (!crtl->is_leaf)
1332 {
1333 int loc = REGNO (cfun->machine->unisimt_location);
1334 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1335 }
1336 if (cfun->machine->unisimt_predicate)
1337 {
1338 int master = REGNO (cfun->machine->unisimt_master);
1339 int pred = REGNO (cfun->machine->unisimt_predicate);
1340 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1341 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1342 master, entering ? "%ustmp2" : "0");
1343 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1344 }
1345 fprintf (file, "\t}\n");
1346}
1347
1348/* Output code for allocating per-lane storage and switching soft-stack pointer.
1349 ENTERING indicates whether we are entering or leaving non-uniform execution.
1350 PTR is the register pointing to allocated storage, it is assigned to on
1351 entering and used to restore state on leaving. SIZE and ALIGN are used only
1352 on entering. */
1353
1354static void
1355nvptx_output_softstack_switch (FILE *file, bool entering,
1356 rtx ptr, rtx size, rtx align)
1357{
1358 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1359 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1360 return;
1361 int bits = POINTER_SIZE, regno = REGNO (ptr);
1362 fprintf (file, "\t{\n");
1363 if (entering)
1364 {
1365 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1366 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1367 cfun->machine->simt_stack_size);
1368 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1369 if (CONST_INT_P (size))
1370 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1371 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1372 else
1373 output_reg (file, REGNO (size), VOIDmode);
1374 fputs (";\n", file);
1375 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
9e0805b2 1376 fprintf (file,
1377 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1b576300 1378 bits, regno, regno, UINTVAL (align));
1379 }
1380 if (cfun->machine->has_softstack)
1381 {
1382 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1383 if (entering)
1384 {
1385 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1386 bits, regno, bits / 8, reg_stack);
1387 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1388 bits, reg_stack, regno, bits / 8);
1389 }
1390 else
1391 {
1392 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1393 bits, reg_stack, regno, bits / 8);
1394 }
1395 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1396 }
1397 fprintf (file, "\t}\n");
1398}
1399
1400/* Output code to enter non-uniform execution region. DEST is a register
1401 to hold a per-lane allocation given by SIZE and ALIGN. */
1402
1403const char *
1404nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1405{
1406 nvptx_output_unisimt_switch (asm_out_file, true);
1407 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1408 return "";
1409}
1410
1411/* Output code to leave non-uniform execution region. SRC is the register
1412 holding per-lane storage previously allocated by omp_simt_enter insn. */
1413
1414const char *
1415nvptx_output_simt_exit (rtx src)
1416{
1417 nvptx_output_unisimt_switch (asm_out_file, false);
1418 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1419 return "";
1420}
1421
7fce8768 1422/* Output instruction that sets soft stack pointer in shared memory to the
1423 value in register given by SRC_REGNO. */
1424
1425const char *
1426nvptx_output_set_softstack (unsigned src_regno)
1427{
1428 if (cfun->machine->has_softstack && !crtl->is_leaf)
1429 {
1430 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1431 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1432 output_reg (asm_out_file, src_regno, VOIDmode);
1433 fprintf (asm_out_file, ";\n");
1434 }
1435 return "";
1436}
8ce80784 1437/* Output a return instruction. Also copy the return value to its outgoing
1438 location. */
1439
1440const char *
1441nvptx_output_return (void)
1442{
6e71bbf2 1443 machine_mode mode = (machine_mode)cfun->machine->return_mode;
420aadad 1444
1445 if (mode != VOIDmode)
ffd95e04 1446 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1447 nvptx_ptx_type_from_mode (mode, false),
1448 reg_names[NVPTX_RETURN_REGNUM],
1449 reg_names[NVPTX_RETURN_REGNUM]);
8ce80784 1450
1451 return "ret;";
1452}
1453
8ce80784 1454/* Terminate a function by writing a closing brace to FILE. */
1455
1456void
1457nvptx_function_end (FILE *file)
1458{
a259e35c 1459 fprintf (file, "}\n");
8ce80784 1460}
1461\f
1462/* Decide whether we can make a sibling call to a function. For ptx, we
1463 can't. */
1464
1465static bool
1466nvptx_function_ok_for_sibcall (tree, tree)
1467{
1468 return false;
1469}
1470
f289122f 1471/* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1472
1473static rtx
1474nvptx_get_drap_rtx (void)
1475{
7fce8768 1476 if (TARGET_SOFT_STACK && stack_realign_drap)
1477 return arg_pointer_rtx;
f289122f 1478 return NULL_RTX;
1479}
1480
8ce80784 1481/* Implement the TARGET_CALL_ARGS hook. Record information about one
1482 argument to the next call. */
1483
1484static void
6e71bbf2 1485nvptx_call_args (rtx arg, tree fntype)
8ce80784 1486{
6e71bbf2 1487 if (!cfun->machine->doing_call)
8ce80784 1488 {
6e71bbf2 1489 cfun->machine->doing_call = true;
1490 cfun->machine->is_varadic = false;
1491 cfun->machine->num_args = 0;
1492
1493 if (fntype && stdarg_p (fntype))
1494 {
1495 cfun->machine->is_varadic = true;
1496 cfun->machine->has_varadic = true;
1497 cfun->machine->num_args++;
1498 }
8ce80784 1499 }
8ce80784 1500
6e71bbf2 1501 if (REG_P (arg) && arg != pc_rtx)
1502 {
1503 cfun->machine->num_args++;
1504 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1505 cfun->machine->call_args);
1506 }
8ce80784 1507}
1508
1509/* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1510 information we recorded. */
1511
1512static void
1513nvptx_end_call_args (void)
1514{
6e71bbf2 1515 cfun->machine->doing_call = false;
8ce80784 1516 free_EXPR_LIST_list (&cfun->machine->call_args);
1517}
1518
7794f2c9 1519/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1520 track of whether calls involving static chains or varargs were seen
1521 in the current function.
1522 For libcalls, maintain a hash table of decls we have seen, and
1523 record a function decl for later when encountering a new one. */
8ce80784 1524
1525void
1526nvptx_expand_call (rtx retval, rtx address)
1527{
8ce80784 1528 rtx callee = XEXP (address, 0);
b27697ca 1529 rtx varargs = NULL_RTX;
b3787ae4 1530 unsigned parallel = 0;
8ce80784 1531
8ce80784 1532 if (!call_insn_operand (callee, Pmode))
1533 {
1534 callee = force_reg (Pmode, callee);
1535 address = change_address (address, QImode, callee);
1536 }
1537
1538 if (GET_CODE (callee) == SYMBOL_REF)
1539 {
1540 tree decl = SYMBOL_REF_DECL (callee);
1541 if (decl != NULL_TREE)
1542 {
8ce80784 1543 if (DECL_STATIC_CHAIN (decl))
6e71bbf2 1544 cfun->machine->has_chain = true;
2583dd18 1545
4954efd4 1546 tree attr = oacc_get_fn_attrib (decl);
b3787ae4 1547 if (attr)
1548 {
1549 tree dims = TREE_VALUE (attr);
1550
1551 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1552 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1553 {
1554 if (TREE_PURPOSE (dims)
1555 && !integer_zerop (TREE_PURPOSE (dims)))
1556 break;
1557 /* Not on this axis. */
1558 parallel ^= GOMP_DIM_MASK (ix);
1559 dims = TREE_CHAIN (dims);
1560 }
1561 }
8ce80784 1562 }
1563 }
857788d2 1564
6e71bbf2 1565 unsigned nargs = cfun->machine->num_args;
1566 if (cfun->machine->is_varadic)
8ce80784 1567 {
b27697ca 1568 varargs = gen_reg_rtx (Pmode);
f09b32f3 1569 emit_move_insn (varargs, stack_pointer_rtx);
8ce80784 1570 }
1571
6e71bbf2 1572 rtvec vec = rtvec_alloc (nargs + 1);
1573 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
b27697ca 1574 int vec_pos = 0;
6e71bbf2 1575
1576 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
8ce80784 1577 rtx tmp_retval = retval;
6e71bbf2 1578 if (retval)
8ce80784 1579 {
1580 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1581 tmp_retval = gen_reg_rtx (GET_MODE (retval));
6e71bbf2 1582 call = gen_rtx_SET (tmp_retval, call);
8ce80784 1583 }
6e71bbf2 1584 XVECEXP (pat, 0, vec_pos++) = call;
b27697ca 1585
1586 /* Construct the call insn, including a USE for each argument pseudo
1587 register. These will be used when printing the insn. */
1588 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
6e71bbf2 1589 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
b27697ca 1590
1591 if (varargs)
a259e35c 1592 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
b27697ca 1593
 1594 gcc_assert (vec_pos == XVECLEN (pat, 0));
7794f2c9 1595
b3787ae4 1596 nvptx_emit_forking (parallel, true);
8ce80784 1597 emit_call_insn (pat);
b3787ae4 1598 nvptx_emit_joining (parallel, true);
1599
8ce80784 1600 if (tmp_retval != retval)
1601 emit_move_insn (retval, tmp_retval);
1602}
df931be4 1603
8ce80784 1604/* Emit a comparison COMPARE, and return the new test to be used in the
1605 jump. */
1606
1607rtx
1608nvptx_expand_compare (rtx compare)
1609{
1610 rtx pred = gen_reg_rtx (BImode);
1611 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1612 XEXP (compare, 0), XEXP (compare, 1));
d1f9b275 1613 emit_insn (gen_rtx_SET (pred, cmp));
8ce80784 1614 return gen_rtx_NE (BImode, pred, const0_rtx);
1615}
1616
b3787ae4 1617/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1618
1619void
1620nvptx_expand_oacc_fork (unsigned mode)
1621{
1622 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1623}
1624
1625void
1626nvptx_expand_oacc_join (unsigned mode)
1627{
1628 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1629}
1630
1631/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1632 objects. */
1633
1634static rtx
1635nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1636{
1637 rtx res;
1638
1639 switch (GET_MODE (src))
1640 {
916ace94 1641 case E_DImode:
b3787ae4 1642 res = gen_unpackdisi2 (dst0, dst1, src);
1643 break;
916ace94 1644 case E_DFmode:
b3787ae4 1645 res = gen_unpackdfsi2 (dst0, dst1, src);
1646 break;
1647 default: gcc_unreachable ();
1648 }
1649 return res;
1650}
1651
1652/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1653 object. */
1654
1655static rtx
1656nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1657{
1658 rtx res;
1659
1660 switch (GET_MODE (dst))
1661 {
916ace94 1662 case E_DImode:
b3787ae4 1663 res = gen_packsidi2 (dst, src0, src1);
1664 break;
916ace94 1665 case E_DFmode:
b3787ae4 1666 res = gen_packsidf2 (dst, src0, src1);
1667 break;
1668 default: gcc_unreachable ();
1669 }
1670 return res;
1671}
1672
 1673/* Generate an instruction or sequence to shuffle register SRC into
 1674 DST across the lanes of a single warp, as selected by KIND and IDX. */
1675
7fce8768 1676rtx
8702ba1e 1677nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
b3787ae4 1678{
1679 rtx res;
1680
1681 switch (GET_MODE (dst))
1682 {
916ace94 1683 case E_SImode:
b3787ae4 1684 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1685 break;
916ace94 1686 case E_SFmode:
b3787ae4 1687 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1688 break;
916ace94 1689 case E_DImode:
1690 case E_DFmode:
b3787ae4 1691 {
1692 rtx tmp0 = gen_reg_rtx (SImode);
1693 rtx tmp1 = gen_reg_rtx (SImode);
1694
1695 start_sequence ();
1696 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1697 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1698 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1699 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1700 res = get_insns ();
1701 end_sequence ();
1702 }
1703 break;
916ace94 1704 case E_BImode:
b3787ae4 1705 {
1706 rtx tmp = gen_reg_rtx (SImode);
1707
1708 start_sequence ();
1709 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1710 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1711 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1712 res = get_insns ();
1713 end_sequence ();
1714 }
1715 break;
916ace94 1716 case E_QImode:
1717 case E_HImode:
51ce1c6e 1718 {
1719 rtx tmp = gen_reg_rtx (SImode);
1720
1721 start_sequence ();
1722 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1723 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1724 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1725 tmp)));
1726 res = get_insns ();
1727 end_sequence ();
1728 }
1729 break;
b3787ae4 1730
1731 default:
1732 gcc_unreachable ();
1733 }
1734 return res;
1735}
1736
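/* A sketch of the SImode case (lane and register numbers invented):
   a SHUFFLE_DOWN of %r1 by %r2 lanes prints roughly as
	shfl.down.b32 %r0, %r1, %r2, 31;
   DImode/DFmode values take the unpack/shuffle/pack route above,
   i.e. two independent 32-bit shuffles.  */
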
1737/* Generate an instruction or sequence to broadcast register REG
1738 across the vectors of a single warp. */
1739
1740static rtx
1741nvptx_gen_vcast (rtx reg)
1742{
1743 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1744}
1745
1746/* Structure used when generating a worker-level spill or fill. */
1747
1748struct wcast_data_t
1749{
1750 rtx base; /* Register holding base addr of buffer. */
1751 rtx ptr; /* Iteration var, if needed. */
1752 unsigned offset; /* Offset into worker buffer. */
1753};
1754
1755/* Direction of the spill/fill and looping setup/teardown indicator. */
1756
1757enum propagate_mask
1758 {
1759 PM_read = 1 << 0,
1760 PM_write = 1 << 1,
1761 PM_loop_begin = 1 << 2,
1762 PM_loop_end = 1 << 3,
1763
1764 PM_read_write = PM_read | PM_write
1765 };
1766
1767/* Generate instruction(s) to spill or fill register REG to/from the
1768 worker broadcast array. PM indicates what is to be done, REP
1769 how many loop iterations will be executed (0 for not a loop). */
1770
1771static rtx
1772nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1773{
1774 rtx res;
1775 machine_mode mode = GET_MODE (reg);
1776
1777 switch (mode)
1778 {
916ace94 1779 case E_BImode:
b3787ae4 1780 {
1781 rtx tmp = gen_reg_rtx (SImode);
1782
1783 start_sequence ();
1784 if (pm & PM_read)
1785 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1786 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1787 if (pm & PM_write)
1788 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1789 res = get_insns ();
1790 end_sequence ();
1791 }
1792 break;
1793
1794 default:
1795 {
1796 rtx addr = data->ptr;
1797
1798 if (!addr)
1799 {
1800 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1801
1802 if (align > worker_bcast_align)
1803 worker_bcast_align = align;
1804 data->offset = (data->offset + align - 1) & ~(align - 1);
1805 addr = data->base;
1806 if (data->offset)
1807 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1808 }
1809
1810 addr = gen_rtx_MEM (mode, addr);
b3787ae4 1811 if (pm == PM_read)
1812 res = gen_rtx_SET (addr, reg);
1813 else if (pm == PM_write)
1814 res = gen_rtx_SET (reg, addr);
1815 else
1816 gcc_unreachable ();
1817
1818 if (data->ptr)
1819 {
1820 /* We're using a ptr, increment it. */
1821 start_sequence ();
1822
1823 emit_insn (res);
1824 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1825 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1826 res = get_insns ();
1827 end_sequence ();
1828 }
1829 else
1830 rep = 1;
1831 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1832 }
1833 break;
1834 }
1835 return res;
1836}
8ce80784 1837\f
1838/* Returns true if X is a valid address for use in a memory reference. */
1839
1840static bool
1841nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1842{
1843 enum rtx_code code = GET_CODE (x);
1844
1845 switch (code)
1846 {
1847 case REG:
1848 return true;
1849
1850 case PLUS:
1851 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1852 return true;
1853 return false;
1854
1855 case CONST:
1856 case SYMBOL_REF:
1857 case LABEL_REF:
1858 return true;
1859
1860 default:
1861 return false;
1862 }
1863}
8ce80784 1864\f
48effc50 1865/* Machinery to output constant initializers. When beginning an
1866 initializer, we decide on a fragment size (which is visible in ptx
1867 in the type used), and then all initializer data is buffered until
1868 a fragment is filled and ready to be written out. */
1869
1870static struct
1871{
1872 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1873 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1874 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1875 out. */
1876 unsigned size; /* Fragment size to accumulate. */
1877 unsigned offset; /* Offset within current fragment. */
1878 bool started; /* Whether we've output any initializer. */
1879} init_frag;
1880
1881/* The current fragment is full, write it out. SYM may provide a
1882 symbolic reference we should output, in which case the fragment
1883 value is the addend. */
8ce80784 1884
1885static void
48effc50 1886output_init_frag (rtx sym)
8ce80784 1887{
48effc50 1888 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1889 unsigned HOST_WIDE_INT val = init_frag.val;
8ce80784 1890
48effc50 1891 init_frag.started = true;
1892 init_frag.val = 0;
1893 init_frag.offset = 0;
1894 init_frag.remaining--;
1895
1896 if (sym)
1897 {
1898 fprintf (asm_out_file, "generic(");
1899 output_address (VOIDmode, sym);
1900 fprintf (asm_out_file, val ? ") + " : ")");
1901 }
8ce80784 1902
48effc50 1903 if (!sym || val)
1904 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
8ce80784 1905}
1906
48effc50 1907/* Add value VAL of size SIZE to the data we're emitting, and keep
1908 writing out chunks as they fill up. */
8ce80784 1909
1910static void
48effc50 1911nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
8ce80784 1912{
48effc50 1913 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1914
1915 for (unsigned part = 0; size; size -= part)
8ce80784 1916 {
48effc50 1917 val >>= part * BITS_PER_UNIT;
1918 part = init_frag.size - init_frag.offset;
1919 if (part > size)
1920 part = size;
1921
1922 unsigned HOST_WIDE_INT partial
1923 = val << (init_frag.offset * BITS_PER_UNIT);
1924 init_frag.val |= partial & init_frag.mask;
1925 init_frag.offset += part;
1926
1927 if (init_frag.offset == init_frag.size)
1928 output_init_frag (NULL);
8ce80784 1929 }
1930}
1931
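/* A worked example (byte values invented): with init_frag.size == 4,
   assembling the bytes 0x11, 0x22, 0x33, 0x44 one at a time
   accumulates init_frag.val = 0x44332211 and then flushes the full
   fragment, emitting the element 1144201745 (decimal for
   0x44332211).  */
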
1932/* Target hook for assembling integer object X of size SIZE. */
1933
1934static bool
1935nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1936{
2583dd18 1937 HOST_WIDE_INT val = 0;
1938
1939 switch (GET_CODE (x))
8ce80784 1940 {
2583dd18 1941 default:
8c3f7986 1942 /* Let the generic machinery figure it out, usually for a
1943 CONST_WIDE_INT. */
1944 return false;
2583dd18 1945
1946 case CONST_INT:
48effc50 1947 nvptx_assemble_value (INTVAL (x), size);
2583dd18 1948 break;
1949
1950 case CONST:
1951 x = XEXP (x, 0);
1952 gcc_assert (GET_CODE (x) == PLUS);
1953 val = INTVAL (XEXP (x, 1));
1954 x = XEXP (x, 0);
1955 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1956 /* FALLTHROUGH */
1957
1958 case SYMBOL_REF:
48effc50 1959 gcc_assert (size == init_frag.size);
1960 if (init_frag.offset)
8ce80784 1961 sorry ("cannot emit unaligned pointers in ptx assembly");
8ce80784 1962
2583dd18 1963 nvptx_maybe_record_fnsym (x);
48effc50 1964 init_frag.val = val;
1965 output_init_frag (x);
8ce80784 1966 break;
8ce80784 1967 }
1968
8ce80784 1969 return true;
1970}
1971
1972/* Output SIZE zero bytes. We ignore the FILE argument since the
1973 functions we're calling to perform the output just use
1974 asm_out_file. */
1975
1976void
1977nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1978{
48effc50 1979 /* Finish the current fragment, if it's started. */
1980 if (init_frag.offset)
8ce80784 1981 {
48effc50 1982 unsigned part = init_frag.size - init_frag.offset;
1983 if (part > size)
1984 part = (unsigned) size;
1985 size -= part;
1986 nvptx_assemble_value (0, part);
8ce80784 1987 }
1988
48effc50 1989 /* If this skip doesn't terminate the initializer, write as many
1990 remaining pieces as possible directly. */
1991 if (size < init_frag.remaining * init_frag.size)
8ce80784 1992 {
48effc50 1993 while (size >= init_frag.size)
1994 {
1995 size -= init_frag.size;
1996 output_init_frag (NULL_RTX);
1997 }
1998 if (size)
1999 nvptx_assemble_value (0, size);
8ce80784 2000 }
8ce80784 2001}
2002
2003/* Output a string STR with length SIZE. As in nvptx_output_skip we
2004 ignore the FILE arg. */
2005
2006void
2007nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2008{
2009 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2010 nvptx_assemble_value (str[i], 1);
2011}
2012
2f67d2d7 2013/* Emit a PTX variable decl and prepare for emission of its
 2014	   initializer.  NAME is the symbol name and SECTION the PTX data
2015 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2016 The caller has already emitted any indentation and linkage
2017 specifier. It is responsible for any initializer, terminating ;
2018 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
 2019	   this is the opposite way round to what PTX wants!  */
2020
2021static void
2022nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2023 const_tree type, HOST_WIDE_INT size, unsigned align)
2024{
2025 while (TREE_CODE (type) == ARRAY_TYPE)
2026 type = TREE_TYPE (type);
2027
7e6e9bff 2028 if (TREE_CODE (type) == VECTOR_TYPE
2029 || TREE_CODE (type) == COMPLEX_TYPE)
2030 /* Neither vector nor complex types can contain the other. */
2031 type = TREE_TYPE (type);
2032
2f67d2d7 2033 unsigned elt_size = int_size_in_bytes (type);
7e6e9bff 2034
2035 /* Largest mode we're prepared to accept. For BLKmode types we
2036 don't know if it'll contain pointer constants, so have to choose
2037 pointer size, otherwise we can choose DImode. */
2038 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2039
2040 elt_size |= GET_MODE_SIZE (elt_mode);
2041 elt_size &= -elt_size; /* Extract LSB set. */
2f67d2d7 2042
48effc50 2043 init_frag.size = elt_size;
67cf9b55 2044 /* Avoid undefined shift behavior by using '2'. */
48effc50 2045 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2046 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2047 init_frag.val = 0;
2048 init_frag.offset = 0;
2049 init_frag.started = false;
2050 /* Size might not be a multiple of elt size, if there's an
2051 initialized trailing struct array with smaller type than
2052 elt_size. */
2053 init_frag.remaining = (size + elt_size - 1) / elt_size;
2f67d2d7 2054
2055 fprintf (file, "%s .align %d .u%d ",
2056 section, align / BITS_PER_UNIT,
2057 elt_size * BITS_PER_UNIT);
2058 assemble_name (file, name);
2059
2060 if (size)
2061 /* We make everything an array, to simplify any initialization
2062 emission. */
48effc50 2063 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2f67d2d7 2064}
2065
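/* For instance (names invented), a public file-scope
   "int x[3] = {1,2,3};" placed in the global data area would begin as
	.visible .global .align 4 .u32 x[3]
   with " = { 1, 2, 3 };" appended as the initializer is assembled by
   the machinery above.  */
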
8ce80784 2066/* Called when the initializer for a decl has been completely output through
2067 combinations of the three functions above. */
2068
2069static void
2070nvptx_assemble_decl_end (void)
2071{
48effc50 2072 if (init_frag.offset)
2073 /* This can happen with a packed struct with trailing array member. */
2074 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2075 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
8ce80784 2076}
2077
c0ddd9a0 2078/* Output an uninitialized common or file-scope variable. */
2079
2080void
2081nvptx_output_aligned_decl (FILE *file, const char *name,
2082 const_tree decl, HOST_WIDE_INT size, unsigned align)
2083{
2084 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2085
2086 /* If this is public, it is common. The nearest thing we have to
2087 common is weak. */
2f67d2d7 2088 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2089
2090 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2091 TREE_TYPE (decl), size, align);
48effc50 2092 nvptx_assemble_decl_end ();
c0ddd9a0 2093}
2094
8ce80784 2095/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2096 writing a constant variable EXP with NAME and SIZE and its
2097 initializer to FILE. */
2098
2099static void
2100nvptx_asm_declare_constant_name (FILE *file, const char *name,
2f67d2d7 2101 const_tree exp, HOST_WIDE_INT obj_size)
8ce80784 2102{
2f67d2d7 2103 write_var_marker (file, true, false, name);
2104
2105 fprintf (file, "\t");
2106
8ce80784 2107 tree type = TREE_TYPE (exp);
2f67d2d7 2108 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2109 TYPE_ALIGN (type));
8ce80784 2110}
2111
2112/* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2113 a variable DECL with NAME to FILE. */
2114
2115void
2116nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2117{
2f67d2d7 2118 write_var_marker (file, true, TREE_PUBLIC (decl), name);
d7ed88be 2119
2f67d2d7 2120 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2121 : DECL_WEAK (decl) ? ".weak " : ".visible "));
d7ed88be 2122
2f67d2d7 2123 tree type = TREE_TYPE (decl);
2124 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2125 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2126 type, obj_size, DECL_ALIGN (decl));
8ce80784 2127}
2128
2129/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2130
2131static void
2132nvptx_globalize_label (FILE *, const char *)
2133{
2134}
2135
2136/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2137 declaration only for variable DECL with NAME to FILE. */
6196ad64 2138
8ce80784 2139static void
2140nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2141{
037c2abc 2142 /* The middle end can place constant pool decls into the varpool as
2143 undefined. Until that is fixed, catch the problem here. */
2144 if (DECL_IN_CONSTANT_POOL (decl))
2145 return;
2146
d27a79f6 2147	  /* We support weak definitions, and hence have the right
2148 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2149 if (DECL_WEAK (decl))
2150 error_at (DECL_SOURCE_LOCATION (decl),
2151 "PTX does not support weak declarations"
2152 " (only weak definitions)");
c0ddd9a0 2153 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2154
2f67d2d7 2155 fprintf (file, "\t.extern ");
2156 tree size = DECL_SIZE_UNIT (decl);
2157 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2158 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2159 DECL_ALIGN (decl));
fecf1848 2160 nvptx_assemble_decl_end ();
8ce80784 2161}
2162
6196ad64 2163/* Output a pattern for a move instruction. */
2164
2165const char *
2166nvptx_output_mov_insn (rtx dst, rtx src)
2167{
2168 machine_mode dst_mode = GET_MODE (dst);
2169 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2170 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2171 machine_mode src_inner = (GET_CODE (src) == SUBREG
2172 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2173
6bd291cd 2174 rtx sym = src;
2175 if (GET_CODE (sym) == CONST)
2176 sym = XEXP (XEXP (sym, 0), 0);
9224dd1f 2177 if (SYMBOL_REF_P (sym))
2178 {
2179 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2180 return "%.\tcvta%D1%t0\t%0, %1;";
2181 nvptx_maybe_record_fnsym (sym);
2182 }
6bd291cd 2183
6196ad64 2184 if (src_inner == dst_inner)
2185 return "%.\tmov%t0\t%0, %1;";
2186
2187 if (CONSTANT_P (src))
2188 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2189 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2190 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2191
2192 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
ffaae5bd 2193 {
2194 if (GET_MODE_BITSIZE (dst_mode) == 128
2195 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2196 {
2197 /* mov.b128 is not supported. */
2198 if (dst_inner == V2DImode && src_inner == TImode)
2199 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2200 else if (dst_inner == TImode && src_inner == V2DImode)
2201 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2202
2203 gcc_unreachable ();
2204 }
2205 return "%.\tmov.b%T0\t%0, %1;";
2206 }
6196ad64 2207
2208 return "%.\tcvt%t0%t1\t%0, %1;";
2209}
2210
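/* A few illustrative outputs (operand names invented): a same-mode
   copy prints as "mov.u32 %r0, %r1;", a same-size bit-cast as
   "mov.b32 %f0, %r1;", a widening integer move as
   "cvt.u32.u16 %r0, %h1;", and a symbol in a non-generic data area
   as "cvta.global.u64 %r0, sym;".  */
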
7fce8768 2211static void nvptx_print_operand (FILE *, rtx, int);
2212
8ce80784 2213/* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
7794f2c9 2214 involves writing .param declarations and in/out copies into them. For
2215 indirect calls, also write the .callprototype. */
8ce80784 2216
2217const char *
2218nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2219{
f09b32f3 2220 char buf[16];
8ce80784 2221 static int labelno;
2222 bool needs_tgt = register_operand (callee, Pmode);
2223 rtx pat = PATTERN (insn);
7fce8768 2224 if (GET_CODE (pat) == COND_EXEC)
2225 pat = COND_EXEC_CODE (pat);
b27697ca 2226 int arg_end = XVECLEN (pat, 0);
8ce80784 2227 tree decl = NULL_TREE;
2228
2229 fprintf (asm_out_file, "\t{\n");
2230 if (result != NULL)
ffd95e04 2231 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2232 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2233 reg_names[NVPTX_RETURN_REGNUM]);
8ce80784 2234
7794f2c9 2235 /* Ensure we have a ptx declaration in the output if necessary. */
8ce80784 2236 if (GET_CODE (callee) == SYMBOL_REF)
2237 {
2238 decl = SYMBOL_REF_DECL (callee);
2583dd18 2239 if (!decl
2240 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2241 nvptx_record_libfunc (callee, result, pat);
2242 else if (DECL_EXTERNAL (decl))
8ce80784 2243 nvptx_record_fndecl (decl);
2244 }
2245
2246 if (needs_tgt)
2247 {
2248 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2249 labelno++;
2250 ASM_OUTPUT_LABEL (asm_out_file, buf);
2251 std::stringstream s;
087b2f04 2252 write_fn_proto_from_insn (s, NULL, result, pat);
8ce80784 2253 fputs (s.str().c_str(), asm_out_file);
2254 }
2255
f09b32f3 2256 for (int argno = 1; argno < arg_end; argno++)
8ce80784 2257 {
f09b32f3 2258 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
8ce80784 2259 machine_mode mode = GET_MODE (t);
50ad9277 2260 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
8ce80784 2261
f09b32f3 2262 /* Mode splitting has already been done. */
50ad9277 2263 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2264 "\t\tst.param%s [%%out_arg%d], ",
2265 ptx_type, argno, ptx_type, argno);
2266 output_reg (asm_out_file, REGNO (t), VOIDmode);
2267 fprintf (asm_out_file, ";\n");
8ce80784 2268 }
2269
7fce8768 2270 /* The '.' stands for the call's predicate, if any. */
2271 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
8ce80784 2272 fprintf (asm_out_file, "\t\tcall ");
2273 if (result != NULL_RTX)
ffd95e04 2274 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2275
8ce80784 2276 if (decl)
2277 {
2278 const char *name = get_fnname_from_decl (decl);
16e75570 2279 name = nvptx_name_replacement (name);
8ce80784 2280 assemble_name (asm_out_file, name);
2281 }
2282 else
3c047fe9 2283 output_address (VOIDmode, callee);
8ce80784 2284
f09b32f3 2285 const char *open = "(";
2286 for (int argno = 1; argno < arg_end; argno++)
8ce80784 2287 {
f09b32f3 2288 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2289 open = "";
8ce80784 2290 }
f09b32f3 2291 if (decl && DECL_STATIC_CHAIN (decl))
2292 {
7bb66bb9 2293 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
f09b32f3 2294 open = "";
2295 }
2296 if (!open[0])
2297 fprintf (asm_out_file, ")");
b27697ca 2298
8ce80784 2299 if (needs_tgt)
2300 {
2301 fprintf (asm_out_file, ", ");
2302 assemble_name (asm_out_file, buf);
2303 }
2304 fprintf (asm_out_file, ";\n");
8ce80784 2305
64204dac 2306 if (find_reg_note (insn, REG_NORETURN, NULL))
 2307	    /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2308 the flow control barrier they imply. It can seg fault if it
2309 encounters what looks like an unexitable loop. Emit a trailing
2310 trap, which it does grok. */
2311 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2312
ffd95e04 2313 if (result)
2314 {
2315 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2316
2317 if (!rval[0])
2318 /* We must escape the '%' that starts RETURN_REGNUM. */
2319 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2320 reg_names[NVPTX_RETURN_REGNUM]);
2321 return rval;
2322 }
2323
2324 return "}";
8ce80784 2325}
2326
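/* Putting the above together, a simple call "x = f (y)" comes out
   approximately as (register numbers invented):
	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), f, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
   with an "@%p0" prefix on the call when it is predicated.  */
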
2327/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2328
2329static bool
2330nvptx_print_operand_punct_valid_p (unsigned char c)
2331{
 2332	  return c == '.' || c == '#';
2333}
2334
8ce80784 2335/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2336
2337static void
2338nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2339{
2340 rtx off;
2341 if (GET_CODE (x) == CONST)
2342 x = XEXP (x, 0);
2343 switch (GET_CODE (x))
2344 {
2345 case PLUS:
2346 off = XEXP (x, 1);
3c047fe9 2347 output_address (VOIDmode, XEXP (x, 0));
8ce80784 2348 fprintf (file, "+");
3c047fe9 2349 output_address (VOIDmode, off);
8ce80784 2350 break;
2351
2352 case SYMBOL_REF:
2353 case LABEL_REF:
2354 output_addr_const (file, x);
2355 break;
2356
2357 default:
2358 gcc_assert (GET_CODE (x) != MEM);
2359 nvptx_print_operand (file, x, 0);
2360 break;
2361 }
2362}
2363
2364/* Write assembly language output for the address ADDR to FILE. */
2365
2366static void
3c047fe9 2367nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
8ce80784 2368{
3c047fe9 2369 nvptx_print_address_operand (file, addr, mode);
8ce80784 2370}
2371
2372/* Print an operand, X, to FILE, with an optional modifier in CODE.
2373
2374 Meaning of CODE:
 2375	   . -- print the predicate for the instruction or an empty string for an
2376 unconditional one.
2377 # -- print a rounding mode for the instruction
2378
d7ed88be 2379 A -- print a data area for a MEM
8ce80784 2380 c -- print an opcode suffix for a comparison operator, including a type code
d7ed88be 2381	   D -- print a data area for an address operand
b3787ae4 2382 S -- print a shuffle kind specified by CONST_INT
8ce80784 2383 t -- print a type opcode suffix, promoting QImode to 32 bits
2384 T -- print a type size in bits
2385 u -- print a type opcode suffix without promotions. */
2386
2387static void
2388nvptx_print_operand (FILE *file, rtx x, int code)
2389{
8ce80784 2390 if (code == '.')
2391 {
2392 x = current_insn_predicate;
2393 if (x)
2394 {
7fce8768 2395 fputs ("@", file);
8ce80784 2396 if (GET_CODE (x) == EQ)
2397 fputs ("!", file);
7fce8768 2398 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
8ce80784 2399 }
2400 return;
2401 }
2402 else if (code == '#')
2403 {
2404 fputs (".rn", file);
2405 return;
2406 }
2407
2408 enum rtx_code x_code = GET_CODE (x);
6196ad64 2409 machine_mode mode = GET_MODE (x);
8ce80784 2410
2411 switch (code)
2412 {
2413 case 'A':
d7ed88be 2414 x = XEXP (x, 0);
2415 /* FALLTHROUGH. */
ef33ea8e 2416
d7ed88be 2417 case 'D':
2418 if (GET_CODE (x) == CONST)
2419 x = XEXP (x, 0);
2420 if (GET_CODE (x) == PLUS)
2421 x = XEXP (x, 0);
ef33ea8e 2422
d7ed88be 2423 if (GET_CODE (x) == SYMBOL_REF)
2424 fputs (section_for_sym (x), file);
8ce80784 2425 break;
2426
8ce80784 2427 case 't':
8ce80784 2428 case 'u':
6196ad64 2429 if (x_code == SUBREG)
2430 {
557d7041 2431 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2432 if (VECTOR_MODE_P (inner_mode)
2433 && (GET_MODE_SIZE (mode)
2434 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2435 mode = GET_MODE_INNER (inner_mode);
2436 else if (split_mode_p (inner_mode))
2437 mode = maybe_split_mode (inner_mode);
2438 else
2439 mode = inner_mode;
6196ad64 2440 }
2441 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
8ce80784 2442 break;
2443
ffaae5bd 2444 case 'H':
2445 case 'L':
2446 {
2447 rtx inner_x = SUBREG_REG (x);
2448 machine_mode inner_mode = GET_MODE (inner_x);
2449 machine_mode split = maybe_split_mode (inner_mode);
2450
2451 output_reg (file, REGNO (inner_x), split,
2452 (code == 'H'
2453 ? GET_MODE_SIZE (inner_mode) / 2
2454 : 0));
2455 }
2456 break;
2457
b3787ae4 2458 case 'S':
2459 {
8702ba1e 2460 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2461 /* Same order as nvptx_shuffle_kind. */
b3787ae4 2462 static const char *const kinds[] =
8702ba1e 2463 {".up", ".down", ".bfly", ".idx"};
2464 fputs (kinds[kind], file);
b3787ae4 2465 }
2466 break;
2467
8ce80784 2468 case 'T':
6196ad64 2469 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
8ce80784 2470 break;
2471
2472 case 'j':
2473 fprintf (file, "@");
2474 goto common;
2475
2476 case 'J':
2477 fprintf (file, "@!");
2478 goto common;
2479
2480 case 'c':
6196ad64 2481 mode = GET_MODE (XEXP (x, 0));
8ce80784 2482 switch (x_code)
2483 {
2484 case EQ:
2485 fputs (".eq", file);
2486 break;
2487 case NE:
6196ad64 2488 if (FLOAT_MODE_P (mode))
8ce80784 2489 fputs (".neu", file);
2490 else
2491 fputs (".ne", file);
2492 break;
2493 case LE:
b10e3d47 2494 case LEU:
8ce80784 2495 fputs (".le", file);
2496 break;
2497 case GE:
b10e3d47 2498 case GEU:
8ce80784 2499 fputs (".ge", file);
2500 break;
2501 case LT:
b10e3d47 2502 case LTU:
8ce80784 2503 fputs (".lt", file);
2504 break;
2505 case GT:
8ce80784 2506 case GTU:
b10e3d47 2507 fputs (".gt", file);
8ce80784 2508 break;
2509 case LTGT:
2510 fputs (".ne", file);
2511 break;
2512 case UNEQ:
2513 fputs (".equ", file);
2514 break;
2515 case UNLE:
2516 fputs (".leu", file);
2517 break;
2518 case UNGE:
2519 fputs (".geu", file);
2520 break;
2521 case UNLT:
2522 fputs (".ltu", file);
2523 break;
2524 case UNGT:
2525 fputs (".gtu", file);
2526 break;
2527 case UNORDERED:
2528 fputs (".nan", file);
2529 break;
2530 case ORDERED:
2531 fputs (".num", file);
2532 break;
2533 default:
2534 gcc_unreachable ();
2535 }
6196ad64 2536 if (FLOAT_MODE_P (mode)
8ce80784 2537 || x_code == EQ || x_code == NE
2538 || x_code == GEU || x_code == GTU
2539 || x_code == LEU || x_code == LTU)
6196ad64 2540 fputs (nvptx_ptx_type_from_mode (mode, true), file);
8ce80784 2541 else
6196ad64 2542 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
8ce80784 2543 break;
2544 default:
2545 common:
2546 switch (x_code)
2547 {
2548 case SUBREG:
6196ad64 2549 {
2550 rtx inner_x = SUBREG_REG (x);
2551 machine_mode inner_mode = GET_MODE (inner_x);
2552 machine_mode split = maybe_split_mode (inner_mode);
2553
557d7041 2554 if (VECTOR_MODE_P (inner_mode)
2555 && (GET_MODE_SIZE (mode)
2556 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2557 {
2558 output_reg (file, REGNO (inner_x), VOIDmode);
2559 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2560 }
2561 else if (split_mode_p (inner_mode)
6196ad64 2562 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2563 output_reg (file, REGNO (inner_x), split);
2564 else
2565 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2566 }
2567 break;
8ce80784 2568
2569 case REG:
6196ad64 2570 output_reg (file, REGNO (x), maybe_split_mode (mode));
8ce80784 2571 break;
2572
2573 case MEM:
2574 fputc ('[', file);
6196ad64 2575 nvptx_print_address_operand (file, XEXP (x, 0), mode);
8ce80784 2576 fputc (']', file);
2577 break;
2578
2579 case CONST_INT:
2580 output_addr_const (file, x);
2581 break;
2582
2583 case CONST:
2584 case SYMBOL_REF:
2585 case LABEL_REF:
2586 /* We could use output_addr_const, but that can print things like
2587 "x-8", which breaks ptxas. Need to ensure it is output as
2588 "x+-8". */
2589 nvptx_print_address_operand (file, x, VOIDmode);
2590 break;
2591
2592 case CONST_DOUBLE:
2593 long vals[2];
6196ad64 2594 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
8ce80784 2595 vals[0] &= 0xffffffff;
2596 vals[1] &= 0xffffffff;
6196ad64 2597 if (mode == SFmode)
8ce80784 2598 fprintf (file, "0f%08lx", vals[0]);
2599 else
2600 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2601 break;
2602
557d7041 2603 case CONST_VECTOR:
2604 {
2605 unsigned n = CONST_VECTOR_NUNITS (x);
2606 fprintf (file, "{ ");
2607 for (unsigned i = 0; i < n; ++i)
2608 {
2609 if (i != 0)
2610 fprintf (file, ", ");
2611
2612 rtx elem = CONST_VECTOR_ELT (x, i);
2613 output_addr_const (file, elem);
2614 }
2615 fprintf (file, " }");
2616 }
2617 break;
2618
8ce80784 2619 default:
2620 output_addr_const (file, x);
2621 }
2622 }
2623}
2624\f
2625/* Record replacement regs used to deal with subreg operands. */
2626struct reg_replace
2627{
2628 rtx replacement[MAX_RECOG_OPERANDS];
2629 machine_mode mode;
2630 int n_allocated;
2631 int n_in_use;
2632};
2633
2634/* Allocate or reuse a replacement in R and return the rtx. */
2635
2636static rtx
2637get_replacement (struct reg_replace *r)
2638{
2639 if (r->n_allocated == r->n_in_use)
2640 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2641 return r->replacement[r->n_in_use++];
2642}
2643
2644/* Clean up subreg operands. In ptx assembly, everything is typed, and
2645 the presence of subregs would break the rules for most instructions.
2646 Replace them with a suitable new register of the right size, plus
2647 conversion copyin/copyout instructions. */
2648
2649static void
f8cec994 2650nvptx_reorg_subreg (void)
8ce80784 2651{
2652 struct reg_replace qiregs, hiregs, siregs, diregs;
2653 rtx_insn *insn, *next;
2654
8ce80784 2655 qiregs.n_allocated = 0;
2656 hiregs.n_allocated = 0;
2657 siregs.n_allocated = 0;
2658 diregs.n_allocated = 0;
2659 qiregs.mode = QImode;
2660 hiregs.mode = HImode;
2661 siregs.mode = SImode;
2662 diregs.mode = DImode;
2663
2664 for (insn = get_insns (); insn; insn = next)
2665 {
2666 next = NEXT_INSN (insn);
2667 if (!NONDEBUG_INSN_P (insn)
de0c53e1 2668 || asm_noperands (PATTERN (insn)) >= 0
8ce80784 2669 || GET_CODE (PATTERN (insn)) == USE
2670 || GET_CODE (PATTERN (insn)) == CLOBBER)
2671 continue;
b27697ca 2672
8ce80784 2673 qiregs.n_in_use = 0;
2674 hiregs.n_in_use = 0;
2675 siregs.n_in_use = 0;
2676 diregs.n_in_use = 0;
2677 extract_insn (insn);
2678 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
b27697ca 2679
8ce80784 2680 for (int i = 0; i < recog_data.n_operands; i++)
2681 {
2682 rtx op = recog_data.operand[i];
2683 if (GET_CODE (op) != SUBREG)
2684 continue;
2685
2686 rtx inner = SUBREG_REG (op);
2687
2688 machine_mode outer_mode = GET_MODE (op);
2689 machine_mode inner_mode = GET_MODE (inner);
2690 gcc_assert (s_ok);
2691 if (s_ok
2692 && (GET_MODE_PRECISION (inner_mode)
2693 >= GET_MODE_PRECISION (outer_mode)))
2694 continue;
2695 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2696 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2697 : outer_mode == HImode ? &hiregs
2698 : outer_mode == SImode ? &siregs
2699 : &diregs);
2700 rtx new_reg = get_replacement (r);
2701
2702 if (recog_data.operand_type[i] != OP_OUT)
2703 {
2704 enum rtx_code code;
2705 if (GET_MODE_PRECISION (inner_mode)
2706 < GET_MODE_PRECISION (outer_mode))
2707 code = ZERO_EXTEND;
2708 else
2709 code = TRUNCATE;
2710
d1f9b275 2711 rtx pat = gen_rtx_SET (new_reg,
8ce80784 2712 gen_rtx_fmt_e (code, outer_mode, inner));
2713 emit_insn_before (pat, insn);
2714 }
2715
2716 if (recog_data.operand_type[i] != OP_IN)
2717 {
2718 enum rtx_code code;
2719 if (GET_MODE_PRECISION (inner_mode)
2720 < GET_MODE_PRECISION (outer_mode))
2721 code = TRUNCATE;
2722 else
2723 code = ZERO_EXTEND;
2724
d1f9b275 2725 rtx pat = gen_rtx_SET (inner,
8ce80784 2726 gen_rtx_fmt_e (code, inner_mode, new_reg));
2727 emit_insn_after (pat, insn);
2728 }
2729 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2730 }
2731 }
f8cec994 2732}
8ce80784 2733
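/* A sketch of the effect (pseudo register numbers invented): an input
   operand (subreg:SI (reg:QI 30) 0) is replaced by a fresh
   (reg:SI 40), preceded by
	(set (reg:SI 40) (zero_extend:SI (reg:QI 30)))
   and an output operand gets the inverse truncating copy emitted
   after the insn.  */
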
7fce8768 2734/* Return a SImode "master lane index" register for uniform-simt, allocating on
2735 first use. */
2736
2737static rtx
2738nvptx_get_unisimt_master ()
2739{
2740 rtx &master = cfun->machine->unisimt_master;
2741 return master ? master : master = gen_reg_rtx (SImode);
2742}
2743
2744/* Return a BImode "predicate" register for uniform-simt, similar to above. */
2745
2746static rtx
2747nvptx_get_unisimt_predicate ()
2748{
2749 rtx &pred = cfun->machine->unisimt_predicate;
2750 return pred ? pred : pred = gen_reg_rtx (BImode);
2751}
2752
 2753	/* Return true if the given call insn references one of the functions provided by
2754 the CUDA runtime: malloc, free, vprintf. */
2755
2756static bool
2757nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2758{
2759 rtx pat = PATTERN (insn);
2760 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2761 pat = XVECEXP (pat, 0, 0);
2762 if (GET_CODE (pat) == SET)
2763 pat = SET_SRC (pat);
2764 gcc_checking_assert (GET_CODE (pat) == CALL
2765 && GET_CODE (XEXP (pat, 0)) == MEM);
2766 rtx addr = XEXP (XEXP (pat, 0), 0);
2767 if (GET_CODE (addr) != SYMBOL_REF)
2768 return false;
2769 const char *name = XSTR (addr, 0);
 2770	  /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2771 references with forced assembler name refer to PTX syscalls. For vprintf,
2772 accept both normal and forced-assembler-name references. */
2773 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2774 || !strcmp (name, "*malloc")
2775 || !strcmp (name, "*free"));
2776}
2777
2778/* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2779 propagate its value from lane MASTER to current lane. */
2780
2781static void
2782nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2783{
2784 rtx reg;
2785 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2786 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2787}
2788
2789/* Adjust code for uniform-simt code generation variant by making atomics and
2790 "syscalls" conditionally executed, and inserting shuffle-based propagation
2791 for registers being set. */
2792
2793static void
2794nvptx_reorg_uniform_simt ()
2795{
2796 rtx_insn *insn, *next;
2797
2798 for (insn = get_insns (); insn; insn = next)
2799 {
2800 next = NEXT_INSN (insn);
2801 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2802 && !(NONJUMP_INSN_P (insn)
2803 && GET_CODE (PATTERN (insn)) == PARALLEL
2804 && get_attr_atomic (insn)))
2805 continue;
2806 rtx pat = PATTERN (insn);
2807 rtx master = nvptx_get_unisimt_master ();
2808 for (int i = 0; i < XVECLEN (pat, 0); i++)
2809 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2810 rtx pred = nvptx_get_unisimt_predicate ();
2811 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2812 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2813 validate_change (insn, &PATTERN (insn), pat, false);
2814 }
2815}
2816
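/* Schematically (using the master and predicate registers allocated
   above): an atomic insn A is rewritten as
	(cond_exec (ne %pred 0) A)
   so that only the master lane executes it, and every register A sets
   is then re-broadcast from the master lane with a shfl.idx.  */
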
5f05c4a3 2817/* Loop structure of the function. The entire function is described as
2818 a NULL loop. */
b3787ae4 2819
2820struct parallel
2821{
2822 /* Parent parallel. */
2823 parallel *parent;
2824
2825 /* Next sibling parallel. */
2826 parallel *next;
2827
2828 /* First child parallel. */
2829 parallel *inner;
2830
2831 /* Partitioning mask of the parallel. */
2832 unsigned mask;
2833
2834 /* Partitioning used within inner parallels. */
2835 unsigned inner_mask;
2836
2837 /* Location of parallel forked and join. The forked is the first
 2838	     block in the parallel and the join is the first block after
2839 the partition. */
2840 basic_block forked_block;
2841 basic_block join_block;
2842
2843 rtx_insn *forked_insn;
2844 rtx_insn *join_insn;
2845
2846 rtx_insn *fork_insn;
2847 rtx_insn *joining_insn;
2848
2849 /* Basic blocks in this parallel, but not in child parallels. The
2850 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2851 blocks are not. */
2852 auto_vec<basic_block> blocks;
2853
2854public:
2855 parallel (parallel *parent, unsigned mode);
2856 ~parallel ();
2857};
2858
 2859	/* Constructor links the new parallel into its parent's chain of
2860 children. */
2861
2862parallel::parallel (parallel *parent_, unsigned mask_)
2863 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2864{
2865 forked_block = join_block = 0;
2866 forked_insn = join_insn = 0;
2867 fork_insn = joining_insn = 0;
2868
2869 if (parent)
2870 {
2871 next = parent->inner;
2872 parent->inner = this;
2873 }
2874}
2875
2876parallel::~parallel ()
2877{
2878 delete inner;
2879 delete next;
2880}
2881
2882/* Map of basic blocks to insns */
2883typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2884
2885/* A tuple of an insn of interest and the BB in which it resides. */
2886typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2887typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2888
 2889	/* Split basic blocks such that the forked and join unspecs are at
 2890	   the start of their basic blocks.  Thus afterwards each block will
 2891	   have a single partitioning mode.  We also do the same for return
 2892	   insns, as they are executed by every thread.  Populate MAP with
2894 head and tail blocks. We also clear the BB visited flag, which is
2895 used when finding partitions. */
2896
2897static void
2898nvptx_split_blocks (bb_insn_map_t *map)
2899{
2900 insn_bb_vec_t worklist;
2901 basic_block block;
2902 rtx_insn *insn;
2903
2904 /* Locate all the reorg instructions of interest. */
2905 FOR_ALL_BB_FN (block, cfun)
2906 {
2907 bool seen_insn = false;
2908
2909 /* Clear visited flag, for use by parallel locator */
2910 block->flags &= ~BB_VISITED;
2911
2912 FOR_BB_INSNS (block, insn)
2913 {
2914 if (!INSN_P (insn))
2915 continue;
2916 switch (recog_memoized (insn))
2917 {
2918 default:
2919 seen_insn = true;
2920 continue;
2921 case CODE_FOR_nvptx_forked:
2922 case CODE_FOR_nvptx_join:
2923 break;
2924
2925 case CODE_FOR_return:
2926 /* We also need to split just before return insns, as
2927 that insn needs executing by all threads, but the
2928 block it is in probably does not. */
2929 break;
2930 }
2931
2932 if (seen_insn)
2933 /* We've found an instruction that must be at the start of
2934 a block, but isn't. Add it to the worklist. */
2935 worklist.safe_push (insn_bb_t (insn, block));
2936 else
2937 /* It was already the first instruction. Just add it to
2938 the map. */
2939 map->get_or_insert (block) = insn;
2940 seen_insn = true;
2941 }
2942 }
2943
2944 /* Split blocks on the worklist. */
2945 unsigned ix;
2946 insn_bb_t *elt;
2947 basic_block remap = 0;
2948 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2949 {
2950 if (remap != elt->second)
2951 {
2952 block = elt->second;
2953 remap = block;
2954 }
2955
2956 /* Split block before insn. The insn is in the new block */
2957 edge e = split_block (block, PREV_INSN (elt->first));
2958
2959 block = e->dest;
2960 map->get_or_insert (block) = elt->first;
2961 }
2962}
2963
2964/* BLOCK is a basic block containing a head or tail instruction.
2965 Locate the associated prehead or pretail instruction, which must be
2966 in the single predecessor block. */
2967
2968static rtx_insn *
2969nvptx_discover_pre (basic_block block, int expected)
2970{
2971 gcc_assert (block->preds->length () == 1);
2972 basic_block pre_block = (*block->preds)[0]->src;
2973 rtx_insn *pre_insn;
2974
2975 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2976 pre_insn = PREV_INSN (pre_insn))
2977 gcc_assert (pre_insn != BB_HEAD (pre_block));
2978
2979 gcc_assert (recog_memoized (pre_insn) == expected);
2980 return pre_insn;
2981}
2982
2983/* Dump this parallel and all its inner parallels. */
2984
2985static void
2986nvptx_dump_pars (parallel *par, unsigned depth)
2987{
2988 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2989 depth, par->mask,
2990 par->forked_block ? par->forked_block->index : -1,
2991 par->join_block ? par->join_block->index : -1);
2992
2993 fprintf (dump_file, " blocks:");
2994
2995 basic_block block;
2996 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2997 fprintf (dump_file, " %d", block->index);
2998 fprintf (dump_file, "\n");
2999 if (par->inner)
3000 nvptx_dump_pars (par->inner, depth + 1);
3001
3002 if (par->next)
3003 nvptx_dump_pars (par->next, depth);
3004}
3005
3006/* If BLOCK contains a fork/join marker, process it to create or
3007 terminate a loop structure. Add this block to the current loop,
3008 and then walk successor blocks. */
3009
3010static parallel *
3011nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3012{
3013 if (block->flags & BB_VISITED)
3014 return par;
3015 block->flags |= BB_VISITED;
3016
3017 if (rtx_insn **endp = map->get (block))
3018 {
3019 rtx_insn *end = *endp;
3020
3021 /* This is a block head or tail, or return instruction. */
3022 switch (recog_memoized (end))
3023 {
3024 case CODE_FOR_return:
3025 /* Return instructions are in their own block, and we
3026 don't need to do anything more. */
3027 return par;
3028
3029 case CODE_FOR_nvptx_forked:
3030 /* Loop head, create a new inner loop and add it into
3031 our parent's child list. */
3032 {
3033 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3034
3035 gcc_assert (mask);
3036 par = new parallel (par, mask);
3037 par->forked_block = block;
3038 par->forked_insn = end;
3039 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3040 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3041 par->fork_insn
3042 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3043 }
3044 break;
3045
3046 case CODE_FOR_nvptx_join:
3047 /* A loop tail. Finish the current loop and return to
3048 parent. */
3049 {
3050 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3051
3052 gcc_assert (par->mask == mask);
3053 par->join_block = block;
3054 par->join_insn = end;
3055 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
3056 && (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
3057 par->joining_insn
3058 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3059 par = par->parent;
3060 }
3061 break;
3062
3063 default:
3064 gcc_unreachable ();
3065 }
3066 }
3067
3068 if (par)
3069 /* Add this block onto the current loop's list of blocks. */
3070 par->blocks.safe_push (block);
3071 else
3072 /* This must be the entry block. Create a NULL parallel. */
3073 par = new parallel (0, 0);
3074
3075 /* Walk successor blocks. */
3076 edge e;
3077 edge_iterator ei;
3078
3079 FOR_EACH_EDGE (e, ei, block->succs)
3080 nvptx_find_par (map, par, e->dest);
3081
3082 return par;
3083}
3084
3085/* DFS walk the CFG looking for fork & join markers. Construct
3086 loop structures as we go. MAP is a mapping of basic blocks
3087 to head & tail markers, discovered when splitting blocks. This
3088 speeds up the discovery. We rely on the BB visited flag having
3089 been cleared when splitting blocks. */
3090
3091static parallel *
3092nvptx_discover_pars (bb_insn_map_t *map)
3093{
3094 basic_block block;
3095
3096 /* Mark exit blocks as visited. */
3097 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3098 block->flags |= BB_VISITED;
3099
3100 /* And entry block as not. */
3101 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3102 block->flags &= ~BB_VISITED;
3103
3104 parallel *par = nvptx_find_par (map, 0, block);
3105
3106 if (dump_file)
3107 {
3108 fprintf (dump_file, "\nLoops\n");
3109 nvptx_dump_pars (par, 0);
3110 fprintf (dump_file, "\n");
3111 }
3112
3113 return par;
3114}
3115
29f8c37e 3116/* Analyse a group of BBs within a partitioned region and create N
3117 Single-Entry-Single-Exit regions. Some of those regions will be
3118 trivial ones consisting of a single BB. The blocks of a
3119 partitioned region might form a set of disjoint graphs -- because
3120 the region encloses a differently partitoned sub region.
3121
3122 We use the linear time algorithm described in 'Finding Regions Fast:
 3123	   Single Entry Single Exit and Control Regions in Linear Time'
3124 Johnson, Pearson & Pingali. That algorithm deals with complete
3125 CFGs, where a back edge is inserted from END to START, and thus the
3126 problem becomes one of finding equivalent loops.
3127
3128 In this case we have a partial CFG. We complete it by redirecting
3129 any incoming edge to the graph to be from an arbitrary external BB,
3130 and similarly redirecting any outgoing edge to be to that BB.
3131 Thus we end up with a closed graph.
3132
3133 The algorithm works by building a spanning tree of an undirected
3134 graph and keeping track of back edges from nodes further from the
3135 root in the tree to nodes nearer to the root in the tree. In the
3136 description below, the root is up and the tree grows downwards.
3137
3138 We avoid having to deal with degenerate back-edges to the same
3139 block, by splitting each BB into 3 -- one for input edges, one for
3140 the node itself and one for the output edges. Such back edges are
3141 referred to as 'Brackets'. Cycle equivalent nodes will have the
3142 same set of brackets.
3143
3144 Determining bracket equivalency is done by maintaining a list of
3145 brackets in such a manner that the list length and final bracket
3146 uniquely identify the set.
3147
3148 We use coloring to mark all BBs with cycle equivalency with the
3149 same color. This is the output of the 'Finding Regions Fast'
3150 algorithm. Notice it doesn't actually find the set of nodes within
 3151	   a particular region, just unordered sets of nodes that are the
3152 entries and exits of SESE regions.
3153
3154 After determining cycle equivalency, we need to find the minimal
3155 set of SESE regions. Do this with a DFS coloring walk of the
3156 complete graph. We're either 'looking' or 'coloring'. When
3157 looking, and we're in the subgraph, we start coloring the color of
3158 the current node, and remember that node as the start of the
3159 current color's SESE region. Every time we go to a new node, we
 3160	   decrement the count of nodes with that color.  If it reaches zero,
3161 we remember that node as the end of the current color's SESE region
3162 and return to 'looking'. Otherwise we color the node the current
3163 color.
3164
3165 This way we end up with coloring the inside of non-trivial SESE
3166 regions with the color of that region. */
3167
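/* As a tiny worked example (block names invented): in a diamond
	A -> B -> D,  A -> C -> D
   completed with the implicit back edge D -> A, every cycle through A
   also passes through D, so A and D end up with the same bracket set
   and color.  The coloring walk then yields the single non-trivial
   SESE region (A, D), with B and C colored as its interior.  */
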
3168/* A pair of BBs. We use this to represent SESE regions. */
3169typedef std::pair<basic_block, basic_block> bb_pair_t;
3170typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3171
3172/* A node in the undirected CFG. The discriminator SECOND indicates just
 3173	   above or just below the BB indicated by FIRST.  */
3174typedef std::pair<basic_block, int> pseudo_node_t;
3175
3176/* A bracket indicates an edge towards the root of the spanning tree of the
3177 undirected graph. Each bracket has a color, determined
 3178	   from the current set of brackets.  */
3179struct bracket
3180{
3181 pseudo_node_t back; /* Back target */
3182
3183 /* Current color and size of set. */
3184 unsigned color;
3185 unsigned size;
3186
3187 bracket (pseudo_node_t back_)
3188 : back (back_), color (~0u), size (~0u)
3189 {
3190 }
3191
3192 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3193 {
3194 if (length != size)
3195 {
3196 size = length;
3197 color = color_counts.length ();
3198 color_counts.quick_push (0);
3199 }
3200 color_counts[color]++;
3201 return color;
3202 }
3203};
3204
3205typedef auto_vec<bracket> bracket_vec_t;
3206
3207/* Basic block info for finding SESE regions. */
3208
3209struct bb_sese
3210{
3211 int node; /* Node number in spanning tree. */
3212 int parent; /* Parent node number. */
3213
3214 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3215 edges arrive at pseudo-node Ai and the outgoing edges leave at
3216 pseudo-node Ao. We have to remember which way we arrived at a
3217 particular node when generating the spanning tree. dir > 0 means
3218 we arrived at Ai, dir < 0 means we arrived at Ao. */
3219 int dir;
3220
 3221	  /* Lowest numbered pseudo-node reached via a backedge from this
3222 node, or any descendant. */
3223 pseudo_node_t high;
3224
3225 int color; /* Cycle-equivalence color */
3226
3227 /* Stack of brackets for this node. */
3228 bracket_vec_t brackets;
3229
3230 bb_sese (unsigned node_, unsigned p, int dir_)
3231 :node (node_), parent (p), dir (dir_)
3232 {
3233 }
3234 ~bb_sese ();
3235
3236 /* Push a bracket ending at BACK. */
3237 void push (const pseudo_node_t &back)
3238 {
3239 if (dump_file)
3240 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3241 back.first ? back.first->index : 0, back.second);
3242 brackets.safe_push (bracket (back));
3243 }
3244
3245 void append (bb_sese *child);
3246 void remove (const pseudo_node_t &);
3247
3248 /* Set node's color. */
3249 void set_color (auto_vec<unsigned> &color_counts)
3250 {
3251 color = brackets.last ().get_color (color_counts, brackets.length ());
3252 }
3253};
3254
3255bb_sese::~bb_sese ()
3256{
3257}
3258
3259/* Destructively append CHILD's brackets. */
3260
3261void
3262bb_sese::append (bb_sese *child)
3263{
3264 if (int len = child->brackets.length ())
3265 {
3266 int ix;
3267
3268 if (dump_file)
3269 {
3270 for (ix = 0; ix < len; ix++)
3271 {
3272 const pseudo_node_t &pseudo = child->brackets[ix].back;
3273 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3274 child->node, pseudo.first ? pseudo.first->index : 0,
3275 pseudo.second);
3276 }
3277 }
3278 if (!brackets.length ())
3279 std::swap (brackets, child->brackets);
3280 else
3281 {
3282 brackets.reserve (len);
3283 for (ix = 0; ix < len; ix++)
3284 brackets.quick_push (child->brackets[ix]);
3285 }
3286 }
3287}
3288
3289/* Remove brackets that terminate at PSEUDO. */
3290
3291void
3292bb_sese::remove (const pseudo_node_t &pseudo)
3293{
3294 unsigned removed = 0;
3295 int len = brackets.length ();
3296
3297 for (int ix = 0; ix < len; ix++)
3298 {
3299 if (brackets[ix].back == pseudo)
3300 {
3301 if (dump_file)
3302 fprintf (dump_file, "Removing backedge %d:%+d\n",
3303 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3304 removed++;
3305 }
3306 else if (removed)
3307 brackets[ix-removed] = brackets[ix];
3308 }
3309 while (removed--)
3310 brackets.pop ();
3311}
3312
3313/* Accessors for BB's aux pointer. */
3314#define BB_SET_SESE(B, S) ((B)->aux = (S))
3315#define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3316
3317/* DFS walk creating SESE data structures. Only cover nodes with
3318 BB_VISITED set. Append discovered blocks to LIST. We number in
3319 increments of 3 so that the above and below pseudo nodes can be
3320 implicitly numbered too. */
3321
3322static int
3323nvptx_sese_number (int n, int p, int dir, basic_block b,
3324 auto_vec<basic_block> *list)
3325{
3326 if (BB_GET_SESE (b))
3327 return n;
3328
3329 if (dump_file)
3330 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3331 b->index, n, p, dir);
3332
3333 BB_SET_SESE (b, new bb_sese (n, p, dir));
3334 p = n;
3335
3336 n += 3;
3337 list->quick_push (b);
3338
3339 /* First walk the nodes on the 'other side' of this node, then walk
3340 the nodes on the same side. */
3341 for (unsigned ix = 2; ix; ix--)
3342 {
3343 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3344 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3345 : offsetof (edge_def, src));
3346 edge e;
 3347	      edge_iterator ei;
3348
3349 FOR_EACH_EDGE (e, ei, edges)
3350 {
3351 basic_block target = *(basic_block *)((char *)e + offset);
3352
3353 if (target->flags & BB_VISITED)
3354 n = nvptx_sese_number (n, p, dir, target, list);
3355 }
3356 dir = -dir;
3357 }
3358 return n;
3359}
3360
3361/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3362 EDGES are the outgoing edges and OFFSET is the offset to the src
3363 or dst block on the edges. */
3364
3365static void
3366nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3367 vec<edge, va_gc> *edges, size_t offset)
3368{
3369 edge e;
 3370	  edge_iterator ei;
3371 int hi_back = depth;
3372 pseudo_node_t node_back (0, depth);
3373 int hi_child = depth;
3374 pseudo_node_t node_child (0, depth);
3375 basic_block child = NULL;
3376 unsigned num_children = 0;
3377 int usd = -dir * sese->dir;
3378
3379 if (dump_file)
3380 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3381 me->index, sese->node, dir);
3382
3383 if (dir < 0)
3384 {
3385 /* This is the above pseudo-child. It has the BB itself as an
3386 additional child node. */
3387 node_child = sese->high;
3388 hi_child = node_child.second;
3389 if (node_child.first)
3390 hi_child += BB_GET_SESE (node_child.first)->node;
3391 num_children++;
3392 }
3393
3394 /* Examine each edge.
3395 - if it is a child (a) append its bracket list and (b) record
3396 whether it is the child with the highest reaching bracket.
3397 - if it is an edge to ancestor, record whether it's the highest
3398 reaching backlink. */
3399 FOR_EACH_EDGE (e, ei, edges)
3400 {
3401 basic_block target = *(basic_block *)((char *)e + offset);
3402
3403 if (bb_sese *t_sese = BB_GET_SESE (target))
3404 {
3405 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3406 {
3407 /* Child node. Append its bracket list. */
3408 num_children++;
3409 sese->append (t_sese);
3410
 3411	      /* Compare its hi value.  */
3412 int t_hi = t_sese->high.second;
3413
3414 if (basic_block child_hi_block = t_sese->high.first)
3415 t_hi += BB_GET_SESE (child_hi_block)->node;
3416
3417 if (hi_child > t_hi)
3418 {
3419 hi_child = t_hi;
3420 node_child = t_sese->high;
3421 child = target;
3422 }
3423 }
3424 else if (t_sese->node < sese->node + dir
3425 && !(dir < 0 && sese->parent == t_sese->node))
3426 {
3427 /* Non-parental ancestor node -- a backlink. */
3428 int d = usd * t_sese->dir;
3429 int back = t_sese->node + d;
3430
3431 if (hi_back > back)
3432 {
3433 hi_back = back;
3434 node_back = pseudo_node_t (target, d);
3435 }
3436 }
3437 }
3438 else
3439 { /* Fallen off graph, backlink to entry node. */
3440 hi_back = 0;
3441 node_back = pseudo_node_t (0, 0);
3442 }
3443 }
3444
3445 /* Remove any brackets that terminate at this pseudo node. */
3446 sese->remove (pseudo_node_t (me, dir));
3447
3448 /* Now push any backlinks from this pseudo node. */
3449 FOR_EACH_EDGE (e, ei, edges)
3450 {
3451 basic_block target = *(basic_block *)((char *)e + offset);
3452 if (bb_sese *t_sese = BB_GET_SESE (target))
3453 {
3454 if (t_sese->node < sese->node + dir
3455 && !(dir < 0 && sese->parent == t_sese->node))
3456 /* Non-parental ancestor node - backedge from me. */
3457 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3458 }
3459 else
3460 {
3461 /* back edge to entry node */
3462 sese->push (pseudo_node_t (0, 0));
3463 }
3464 }
3465
3466 /* If this node leads directly or indirectly to a no-return region of
3467 the graph, then fake a backedge to entry node. */
3468 if (!sese->brackets.length () || !edges || !edges->length ())
3469 {
3470 hi_back = 0;
3471 node_back = pseudo_node_t (0, 0);
3472 sese->push (node_back);
3473 }
3474
3475 /* Record the highest reaching backedge from us or a descendant. */
3476 sese->high = hi_back < hi_child ? node_back : node_child;
3477
3478 if (num_children > 1)
3479 {
3480 /* There is more than one child -- this is a Y shaped piece of
3481 spanning tree. We have to insert a fake backedge from this
3482 node to the highest ancestor reached by not-the-highest
3483 reaching child. Note that there may be multiple children
3484 with backedges to the same highest node. That's ok and we
3485 insert the edge to that highest node. */
3486 hi_child = depth;
3487 if (dir < 0 && child)
3488 {
3489 node_child = sese->high;
3490 hi_child = node_child.second;
3491 if (node_child.first)
3492 hi_child += BB_GET_SESE (node_child.first)->node;
3493 }
3494
3495 FOR_EACH_EDGE (e, ei, edges)
3496 {
3497 basic_block target = *(basic_block *)((char *)e + offset);
3498
3499 if (target == child)
3500 /* Ignore the highest child. */
3501 continue;
3502
3503 bb_sese *t_sese = BB_GET_SESE (target);
3504 if (!t_sese)
3505 continue;
3506 if (t_sese->parent != sese->node)
3507 /* Not a child. */
3508 continue;
3509
3510 /* Compare its hi value. */
3511 int t_hi = t_sese->high.second;
3512
3513 if (basic_block child_hi_block = t_sese->high.first)
3514 t_hi += BB_GET_SESE (child_hi_block)->node;
3515
3516 if (hi_child > t_hi)
3517 {
3518 hi_child = t_hi;
3519 node_child = t_sese->high;
3520 }
3521 }
3522
3523 sese->push (node_child);
3524 }
3525}
3526
3527
3528/* DFS walk of BB graph. Color node BLOCK according to COLORING then
3529 proceed to successors. Set SESE entry and exit nodes of
3530 REGIONS. */
3531
3532static void
3533nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3534 basic_block block, int coloring)
3535{
3536 bb_sese *sese = BB_GET_SESE (block);
3537
3538 if (block->flags & BB_VISITED)
3539 {
3540 /* If we've already encountered this block, either we must not
3541 be coloring, or it must have been colored the current color. */
3542 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3543 return;
3544 }
3545
3546 block->flags |= BB_VISITED;
3547
3548 if (sese)
3549 {
3550 if (coloring < 0)
3551 {
3552 /* Start coloring a region. */
3553 regions[sese->color].first = block;
3554 coloring = sese->color;
3555 }
3556
3557 if (!--color_counts[sese->color] && sese->color == coloring)
3558 {
3559 /* Found final block of SESE region. */
3560 regions[sese->color].second = block;
3561 coloring = -1;
3562 }
3563 else
3564 /* Color the node, so we can assert on revisiting the node
3565 that the graph is indeed SESE. */
3566 sese->color = coloring;
3567 }
3568 else
3569 /* Fallen off the subgraph, we cannot be coloring. */
3570 gcc_assert (coloring < 0);
3571
3572 /* Walk each successor block. */
3573 if (block->succs && block->succs->length ())
3574 {
3575 edge e;
3576 edge_iterator ei;
3577
3578 FOR_EACH_EDGE (e, ei, block->succs)
3579 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3580 }
3581 else
3582 gcc_assert (coloring < 0);
3583}
3584
3585/* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3586 end up with NULL entries in it. */
3587
3588static void
3589nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3590{
3591 basic_block block;
3592 int ix;
3593
3594 /* First clear each BB of the whole function. */
19591744 3595 FOR_ALL_BB_FN (block, cfun)
29f8c37e 3596 {
3597 block->flags &= ~BB_VISITED;
3598 BB_SET_SESE (block, 0);
3599 }
29f8c37e 3600
3601 /* Mark blocks in the function that are in this graph. */
3602 for (ix = 0; blocks.iterate (ix, &block); ix++)
3603 block->flags |= BB_VISITED;
3604
3605 /* Counts of nodes assigned to each color. There cannot be more
3606 colors than blocks (and hopefully there will be fewer). */
3607 auto_vec<unsigned> color_counts;
3608 color_counts.reserve (blocks.length ());
3609
3610 /* Worklist of nodes in the spanning tree. Again, there cannot be
3611 more nodes in the tree than blocks (there will be fewer if the
3612 CFG of blocks is disjoint). */
3613 auto_vec<basic_block> spanlist;
3614 spanlist.reserve (blocks.length ());
3615
3616 /* Make sure every block has its cycle class determined. */
3617 for (ix = 0; blocks.iterate (ix, &block); ix++)
3618 {
3619 if (BB_GET_SESE (block))
3620 /* We already met this block in an earlier graph solve. */
3621 continue;
3622
3623 if (dump_file)
3624 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3625
3626 /* Number the nodes reachable from block in initial DFS order. */
3627 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3628
3629 /* Now walk in reverse DFS order to find cycle equivalents. */
3630 while (spanlist.length ())
3631 {
3632 block = spanlist.pop ();
3633 bb_sese *sese = BB_GET_SESE (block);
3634
3635 /* Do the pseudo node below. */
3636 nvptx_sese_pseudo (block, sese, depth, +1,
3637 sese->dir > 0 ? block->succs : block->preds,
3638 (sese->dir > 0 ? offsetof (edge_def, dest)
3639 : offsetof (edge_def, src)));
3640 sese->set_color (color_counts);
3641 /* Do the pseudo node above. */
3642 nvptx_sese_pseudo (block, sese, depth, -1,
3643 sese->dir < 0 ? block->succs : block->preds,
3644 (sese->dir < 0 ? offsetof (edge_def, dest)
3645 : offsetof (edge_def, src)));
3646 }
3647 if (dump_file)
3648 fprintf (dump_file, "\n");
3649 }
3650
3651 if (dump_file)
3652 {
3653 unsigned count;
3654 const char *comma = "";
3655
3656 fprintf (dump_file, "Found %d cycle equivalents\n",
3657 color_counts.length ());
3658 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3659 {
3660 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3661
3662 comma = "";
3663 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3664 if (BB_GET_SESE (block)->color == ix)
3665 {
3666 block->flags |= BB_VISITED;
3667 fprintf (dump_file, "%s%d", comma, block->index);
3668 comma = ",";
3669 }
3670 fprintf (dump_file, "}");
3671 comma = ", ";
3672 }
3673 fprintf (dump_file, "\n");
3674 }
3675
3676 /* Now we've colored every block in the subgraph. We now need to
3677 determine the minimal set of SESE regions that cover that
3678 subgraph. Do this with a DFS walk of the complete function.
3679 During the walk we're either 'looking' or 'coloring'. When we
3680 reach the last node of a particular color, we stop coloring and
3681 return to looking. */
3682
3683 /* There cannot be more SESE regions than colors. */
3684 regions.reserve (color_counts.length ());
3685 for (ix = color_counts.length (); ix--;)
3686 regions.quick_push (bb_pair_t (0, 0));
3687
3688 for (ix = 0; blocks.iterate (ix, &block); ix++)
3689 block->flags &= ~BB_VISITED;
3690
3691 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3692
3693 if (dump_file)
3694 {
3695 const char *comma = "";
3696 int len = regions.length ();
3697
3698 fprintf (dump_file, "SESE regions:");
3699 for (ix = 0; ix != len; ix++)
3700 {
3701 basic_block from = regions[ix].first;
3702 basic_block to = regions[ix].second;
3703
3704 if (from)
3705 {
3706 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3707 if (to != from)
3708 fprintf (dump_file, "->%d", to->index);
3709
3710 int color = BB_GET_SESE (from)->color;
3711
3712 /* Print the blocks within the region (excluding ends). */
3713 FOR_EACH_BB_FN (block, cfun)
3714 {
3715 bb_sese *sese = BB_GET_SESE (block);
3716
3717 if (sese && sese->color == color
3718 && block != from && block != to)
3719 fprintf (dump_file, ".%d", block->index);
3720 }
3721 fprintf (dump_file, "}");
3722 }
3723 comma = ",";
3724 }
3725 fprintf (dump_file, "\n\n");
3726 }
3727
3728 for (ix = 0; blocks.iterate (ix, &block); ix++)
3729 delete BB_GET_SESE (block);
3730}
3731
3732#undef BB_SET_SESE
3733#undef BB_GET_SESE
3734
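
/* For illustration (this example is not part of the original source):
   given the diamond-shaped sub-CFG

	    A
	   / \
	  B   C
	   \ /
	    D

   the cycle-equivalence numbering above puts A and D in one class and
   B and C each in a class of their own.  The covering walk in
   nvptx_sese_color then recolors the interior blocks to A's color,
   yielding the single region {A -> D} and leaving the entries for B's
   and C's colors NULL -- hence the caveat about NULL entries in
   REGIONS.  Neutering can then branch around the whole diamond at
   once rather than around each block individually.  */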
b3787ae4 3735/* Propagate live state at the start of a partitioned region. BLOCK
3736 provides the live register information, and might not contain
3737 INSN. Propagation is inserted just after INSN. RW indicates whether
3738 we are reading and/or writing state. This separation is
3739 needed for worker-level propagation, where we essentially do a
3740 spill & fill. FN is the underlying worker function used to
3741 generate the propagation instructions for a single register.
3742 DATA is user data.
3743
3744 We propagate the live register set and the entire frame. We could
3745 do better by (a) propagating just the live set that is used within
3746 the partitioned regions and (b) only propagating stack entries that
3747 are used. The latter might be quite hard to determine. */
3748
3749typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3750
3751static void
3752nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
3753 propagator_fn fn, void *data)
3754{
3755 bitmap live = DF_LIVE_IN (block);
3756 bitmap_iterator iterator;
3757 unsigned ix;
3758
3759 /* Copy the frame array. */
3760 HOST_WIDE_INT fs = get_frame_size ();
3761 if (fs)
3762 {
3763 rtx tmp = gen_reg_rtx (DImode);
3764 rtx idx = NULL_RTX;
3765 rtx ptr = gen_reg_rtx (Pmode);
3766 rtx pred = NULL_RTX;
3767 rtx_code_label *label = NULL;
3768
90099972 3769 /* The frame size might not be DImode compatible, but the frame
3770 array's declaration will be. So it's ok to round up here. */
3771 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
b3787ae4 3772 /* Detect single iteration loop. */
3773 if (fs == 1)
3774 fs = 0;
3775
3776 start_sequence ();
3777 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3778 if (fs)
3779 {
3780 idx = gen_reg_rtx (SImode);
3781 pred = gen_reg_rtx (BImode);
3782 label = gen_label_rtx ();
3783
3784 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3785 /* Allow worker function to initialize anything needed. */
3786 rtx init = fn (tmp, PM_loop_begin, fs, data);
3787 if (init)
3788 emit_insn (init);
3789 emit_label (label);
3790 LABEL_NUSES (label)++;
3791 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3792 }
3793 if (rw & PM_read)
3794 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3795 emit_insn (fn (tmp, rw, fs, data));
3796 if (rw & PM_write)
3797 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3798 if (fs)
3799 {
3800 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3801 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3802 emit_insn (gen_br_true_uni (pred, label));
3803 rtx fini = fn (tmp, PM_loop_end, fs, data);
3804 if (fini)
3805 emit_insn (fini);
3806 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3807 }
3808 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3809 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3810 rtx cpy = get_insns ();
3811 end_sequence ();
3812 insn = emit_insn_after (cpy, insn);
3813 }
3814
3815 /* Copy live registers. */
3816 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3817 {
3818 rtx reg = regno_reg_rtx[ix];
3819
3820 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3821 {
3822 rtx bcast = fn (reg, rw, 0, data);
3823
3824 insn = emit_insn_after (bcast, insn);
3825 }
3826 }
3827}
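
/* For illustration (not part of the original source), the frame-copy
   sequence emitted above for a frame of FS doublewords expands
   roughly to:

	ptr = frame_pointer;
	idx = FS;
     L:
	idx -= 1;
	tmp = *(uint64_t *) ptr;	// if RW includes PM_read
	<fn (tmp, rw, fs, data)>	// spill/fill/broadcast one slot
	*(uint64_t *) ptr = tmp;	// if RW includes PM_write
	ptr += 8;
	if (idx != 0) goto L;		// uniform branch

   with the loop structure omitted entirely when the frame is a single
   doubleword.  */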
3828
3829/* Worker for nvptx_vpropagate. */
3830
3831static rtx
3832vprop_gen (rtx reg, propagate_mask pm,
3833 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3834{
3835 if (!(pm & PM_read_write))
3836 return 0;
3837
3838 return nvptx_gen_vcast (reg);
3839}
3840
3841/* Propagate state that is live at start of BLOCK across the vectors
3842 of a single warp. Propagation is inserted just after INSN. */
3843
3844static void
3845nvptx_vpropagate (basic_block block, rtx_insn *insn)
3846{
3847 nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
3848}
3849
3850/* Worker for nvptx_wpropagate. */
3851
3852static rtx
3853wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3854{
3855 wcast_data_t *data = (wcast_data_t *)data_;
3856
3857 if (pm & PM_loop_begin)
3858 {
3859 /* Starting a loop, initialize pointer. */
3860 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3861
3862 if (align > worker_bcast_align)
3863 worker_bcast_align = align;
3864 data->offset = (data->offset + align - 1) & ~(align - 1);
3865
3866 data->ptr = gen_reg_rtx (Pmode);
3867
3868 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3869 }
3870 else if (pm & PM_loop_end)
3871 {
3872 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3873 data->ptr = NULL_RTX;
3874 return clobber;
3875 }
3876 else
3877 return nvptx_gen_wcast (reg, pm, rep, data);
3878}
3879
3880/* Spill or fill live state that is live at start of BLOCK. PRE_P
3881 indicates if this is just before partitioned mode (do spill), or
3882 just after it starts (do fill). Sequence is inserted just after
3883 INSN. */
3884
3885static void
3886nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
3887{
3888 wcast_data_t data;
3889
3890 data.base = gen_reg_rtx (Pmode);
3891 data.offset = 0;
3892 data.ptr = NULL_RTX;
3893
3894 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
3895 if (data.offset)
3896 {
3897 /* Stuff was emitted, initialize the base pointer now. */
9224dd1f 3898 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
b3787ae4 3899 emit_insn_after (init, insn);
31a633e4 3900
b3787ae4 3901 if (worker_bcast_size < data.offset)
3902 worker_bcast_size = data.offset;
3903 }
3904}
3905
3906/* Emit a worker-level synchronization barrier. We use different
3907 markers for before and after synchronizations. */
3908
3909static rtx
3910nvptx_wsync (bool after)
3911{
3912 return gen_nvptx_barsync (GEN_INT (after));
3913}
3914
8b73a457 3915#if WORKAROUND_PTXJIT_BUG
3916/* Return first real insn in BB, or return NULL_RTX if BB does not contain
3917 real insns. */
3918
3919static rtx_insn *
3920bb_first_real_insn (basic_block bb)
3921{
3922 rtx_insn *insn;
3923
3924 /* Find the first real insn of BB. */
3925 FOR_BB_INSNS (bb, insn)
3926 if (INSN_P (insn))
3927 return insn;
3928
3929 return 0;
3930}
3931#endif
3932
b3787ae4 3933/* Single neutering according to MASK. FROM is the incoming block and
3934 TO is the outgoing block. These may be the same block. Insert at
3935 start of FROM:
3936
3937 if (tid.<axis>) goto end.
3938
3939 and insert before ending branch of TO (if there is such an insn):
3940
3941 end:
3942 <possibly-broadcast-cond>
3943 <branch>
3944
3945 We currently only use different FROM and TO when skipping an entire
3946 loop. We could do more if we detected superblocks. */
3947
3948static void
3949nvptx_single (unsigned mask, basic_block from, basic_block to)
3950{
3951 rtx_insn *head = BB_HEAD (from);
3952 rtx_insn *tail = BB_END (to);
3953 unsigned skip_mask = mask;
3954
33834752 3955 while (true)
3956 {
3957 /* Find first insn of from block. */
3958 while (head != BB_END (from) && !INSN_P (head))
3959 head = NEXT_INSN (head);
3960
3961 if (from == to)
3962 break;
3963
3964 if (!(JUMP_P (head) && single_succ_p (from)))
3965 break;
3966
3967 basic_block jump_target = single_succ (from);
3968 if (!single_pred_p (jump_target))
3969 break;
3970
3971 from = jump_target;
3972 head = BB_HEAD (from);
3973 }
b3787ae4 3974
3975 /* Find last insn of to block. */
3976 rtx_insn *limit = from == to ? head : BB_HEAD (to);
3977 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
3978 tail = PREV_INSN (tail);
3979
3980 /* Detect if tail is a branch. */
3981 rtx tail_branch = NULL_RTX;
3982 rtx cond_branch = NULL_RTX;
3983 if (tail && INSN_P (tail))
3984 {
3985 tail_branch = PATTERN (tail);
3986 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
3987 tail_branch = NULL_RTX;
3988 else
3989 {
3990 cond_branch = SET_SRC (tail_branch);
3991 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
3992 cond_branch = NULL_RTX;
3993 }
3994 }
3995
3996 if (tail == head)
3997 {
3998 /* If this is empty, do nothing. */
3999 if (!head || !INSN_P (head))
4000 return;
4001
4002 /* If this is a dummy insn, do nothing. */
4003 switch (recog_memoized (head))
4004 {
4005 default:
4006 break;
4007 case CODE_FOR_nvptx_fork:
4008 case CODE_FOR_nvptx_forked:
4009 case CODE_FOR_nvptx_joining:
4010 case CODE_FOR_nvptx_join:
4011 return;
4012 }
4013
4014 if (cond_branch)
4015 {
4016 /* If we're only doing vector single, there's no need to
4017 emit skip code because we'll not insert anything. */
4018 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4019 skip_mask = 0;
4020 }
4021 else if (tail_branch)
4022 /* Block with only unconditional branch. Nothing to do. */
4023 return;
4024 }
4025
4026 /* Insert the vector test inside the worker test. */
4027 unsigned mode;
4028 rtx_insn *before = tail;
4029 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4030 if (GOMP_DIM_MASK (mode) & skip_mask)
4031 {
4032 rtx_code_label *label = gen_label_rtx ();
4033 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4034
4035 if (!pred)
4036 {
4037 pred = gen_reg_rtx (BImode);
4038 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4039 }
4040
4041 rtx br;
4042 if (mode == GOMP_DIM_VECTOR)
4043 br = gen_br_true (pred, label);
4044 else
4045 br = gen_br_true_uni (pred, label);
4046 emit_insn_before (br, head);
4047
4048 LABEL_NUSES (label)++;
4049 if (tail_branch)
4050 before = emit_label_before (label, before);
4051 else
4052 emit_label_after (label, tail);
4053 }
4054
4055 /* Now deal with propagating the branch condition. */
4056 if (cond_branch)
4057 {
4058 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4059
4060 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4061 {
4062 /* Vector mode only, do a shuffle. */
8b73a457 4063#if WORKAROUND_PTXJIT_BUG
4064 /* The branch condition %rcond is propagated like this:
4065
4066 {
4067 .reg .u32 %x;
4068 mov.u32 %x,%tid.x;
4069 setp.ne.u32 %rnotvzero,%x,0;
4070 }
4071
4072 @%rnotvzero bra Lskip;
4073 setp.<op>.<type> %rcond,op1,op2;
4074 Lskip:
4075 selp.u32 %rcondu32,1,0,%rcond;
4076 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4077 setp.ne.u32 %rcond,%rcondu32,0;
4078
4079 There seems to be a bug in the ptx JIT compiler (observed at driver
4080 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4081 unless %rcond is initialized to something before 'bra Lskip'. The
4082 bug is not observed with ptxas from cuda 8.0.61.
4083
4084 It is true that the code is non-trivial: at Lskip, %rcond is
4085 uninitialized in threads 1-31, and after the selp the same holds
4086 for %rcondu32. But shfl propagates the defined value in thread 0
4087 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4088 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4089
4090 There is nothing in the PTX spec to suggest that this is wrong, or
4091 to explain why the extra initialization is needed. So, we classify
4092 it as a JIT bug, and the extra initialization as workaround. */
4093 emit_insn_before (gen_movbi (pvar, const0_rtx),
4094 bb_first_real_insn (from));
4095#endif
b3787ae4 4096 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4097 }
4098 else
4099 {
4100 /* Includes worker mode, do spill & fill. By construction
4101 we should never have worker mode only. */
4102 wcast_data_t data;
4103
4104 data.base = worker_bcast_sym;
4105 data.ptr = 0;
4106
4107 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4108 worker_bcast_size = GET_MODE_SIZE (SImode);
4109
4110 data.offset = 0;
4111 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4112 before);
4113 /* Barrier so other workers can see the write. */
4114 emit_insn_before (nvptx_wsync (false), tail);
4115 data.offset = 0;
4116 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4117 /* This barrier is needed to avoid worker zero clobbering
4118 the broadcast buffer before all the other workers have
4119 had a chance to read this instance of it. */
4120 emit_insn_before (nvptx_wsync (true), tail);
4121 }
4122
4123 extract_insn (tail);
4124 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4125 UNSPEC_BR_UNIFIED);
4126 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4127 }
4128}
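
/* As a rough sketch (illustrative; label names invented), a block
   neutered at both worker and vector level comes out of nvptx_single
   as:

	@%notworker0 bra Lw;	// uniform branches around the
	@%notvector0 bra Lv;	// single-threaded code
	  ...single-threaded code...
     Lv:
     Lw:
	  <spill / bar.sync / fill of the branch condition, if any>
	  <unified branch>
  */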
4129
4130/* PAR is a parallel that is being skipped in its entirety according to
4131 MASK. Treat this as skipping a superblock starting at forked
4132 and ending at joining. */
4133
4134static void
4135nvptx_skip_par (unsigned mask, parallel *par)
4136{
4137 basic_block tail = par->join_block;
4138 gcc_assert (tail->preds->length () == 1);
4139
4140 basic_block pre_tail = (*tail->preds)[0]->src;
4141 gcc_assert (pre_tail->succs->length () == 1);
4142
4143 nvptx_single (mask, par->forked_block, pre_tail);
4144}
4145
8b921b21 4146/* If PAR has a single inner parallel and PAR itself only contains
4147 empty entry and exit blocks, swallow the inner PAR. */
4148
4149static void
4150nvptx_optimize_inner (parallel *par)
4151{
4152 parallel *inner = par->inner;
4153
4154 /* We mustn't be the outer dummy par. */
4155 if (!par->mask)
4156 return;
4157
4158 /* We must have a single inner par. */
4159 if (!inner || inner->next)
4160 return;
4161
4162 /* We must only contain 2 blocks ourselves -- the head and tail of
4163 the inner par. */
4164 if (par->blocks.length () != 2)
4165 return;
4166
4167 /* The two partitionings must be disjoint. As we only have vector
4168 and worker partitioning, this is sufficient to guarantee the
4169 pars have adjacent partitioning. */
4170 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4171 /* This indicates malformed code generation. */
4172 return;
4173
4174 /* The outer forked insn should be immediately followed by the inner
4175 fork insn. */
4176 rtx_insn *forked = par->forked_insn;
4177 rtx_insn *fork = BB_END (par->forked_block);
4178
4179 if (NEXT_INSN (forked) != fork)
4180 return;
4181 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4182
4183 /* The outer joining insn must immediately follow the inner join
4184 insn. */
4185 rtx_insn *joining = par->joining_insn;
4186 rtx_insn *join = inner->join_insn;
4187 if (NEXT_INSN (join) != joining)
4188 return;
4189
4190 /* Preconditions met. Swallow the inner par. */
4191 if (dump_file)
4192 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4193 inner->mask, inner->forked_block->index,
4194 inner->join_block->index,
4195 par->mask, par->forked_block->index, par->join_block->index);
4196
4197 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4198
4199 par->blocks.reserve (inner->blocks.length ());
4200 while (inner->blocks.length ())
4201 par->blocks.quick_push (inner->blocks.pop ());
4202
4203 par->inner = inner->inner;
4204 inner->inner = NULL;
4205
4206 delete inner;
4207}
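
/* For example (illustrative): a worker par whose two blocks merely
   wrap a single vector par,

	fork.worker / forked.worker
	  fork.vector ... join.vector
	joining.worker / join.worker

   collapses into one combined worker+vector par, saving a level of
   fork/join and a level of neutering.  */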
4208
b3787ae4 4209/* Process the parallel PAR and all its contained
4210 parallels. We do everything but the neutering. Return mask of
4211 partitioned modes used within this parallel. */
4212
4213static unsigned
4214nvptx_process_pars (parallel *par)
4215{
8b921b21 4216 if (nvptx_optimize)
4217 nvptx_optimize_inner (par);
4218
b3787ae4 4219 unsigned inner_mask = par->mask;
4220
4221 /* Do the inner parallels first. */
4222 if (par->inner)
4223 {
4224 par->inner_mask = nvptx_process_pars (par->inner);
4225 inner_mask |= par->inner_mask;
4226 }
4227
4228 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
4229 /* No propagation needed for a call. */;
6e90d066 4230 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
b3787ae4 4231 {
4232 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
4233 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
4234 /* Insert begin and end synchronizations. */
4235 emit_insn_after (nvptx_wsync (false), par->forked_insn);
4236 emit_insn_before (nvptx_wsync (true), par->joining_insn);
4237 }
4238 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4239 nvptx_vpropagate (par->forked_block, par->forked_insn);
4240
4241 /* Now do siblings. */
4242 if (par->next)
4243 inner_mask |= nvptx_process_pars (par->next);
4244 return inner_mask;
4245}
4246
4247/* Neuter the parallel described by PAR. We recurse in depth-first
4248 order. MODES are the partitioning of the execution and OUTER is
4249 the partitioning of the parallels we are contained in. */
4250
4251static void
4252nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4253{
4254 unsigned me = (par->mask
4255 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4256 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4257 unsigned skip_mask = 0, neuter_mask = 0;
4258
4259 if (par->inner)
4260 nvptx_neuter_pars (par->inner, modes, outer | me);
4261
4262 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4263 {
4264 if ((outer | me) & GOMP_DIM_MASK (mode))
4265 {} /* Mode is partitioned: no neutering. */
4266 else if (!(modes & GOMP_DIM_MASK (mode)))
6e90d066 4267 {} /* Mode is not used: nothing to do. */
b3787ae4 4268 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4269 || !par->forked_insn)
4270 /* Partitioned in inner parallels, or we're not partitioned
4271 at all: neuter individual blocks. */
4272 neuter_mask |= GOMP_DIM_MASK (mode);
4273 else if (!par->parent || !par->parent->forked_insn
4274 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4275 /* Parent isn't a true parallel, or it already contains this
4276 partitioning: skip the parallel at this level ourselves. */
4277 skip_mask |= GOMP_DIM_MASK (mode);
4278 else
4279 {} /* Parent will skip this parallel itself. */
4280 }
4281
4282 if (neuter_mask)
4283 {
29f8c37e 4284 int ix, len;
b3787ae4 4285
29f8c37e 4286 if (nvptx_optimize)
4287 {
4288 /* Neuter whole SESE regions. */
4289 bb_pair_vec_t regions;
4290
4291 nvptx_find_sese (par->blocks, regions);
4292 len = regions.length ();
4293 for (ix = 0; ix != len; ix++)
4294 {
4295 basic_block from = regions[ix].first;
4296 basic_block to = regions[ix].second;
4297
4298 if (from)
4299 nvptx_single (neuter_mask, from, to);
4300 else
4301 gcc_assert (!to);
4302 }
4303 }
4304 else
b3787ae4 4305 {
29f8c37e 4306 /* Neuter each BB individually. */
4307 len = par->blocks.length ();
4308 for (ix = 0; ix != len; ix++)
4309 {
4310 basic_block block = par->blocks[ix];
b3787ae4 4311
29f8c37e 4312 nvptx_single (neuter_mask, block, block);
4313 }
b3787ae4 4314 }
4315 }
4316
4317 if (skip_mask)
4318 nvptx_skip_par (skip_mask, par);
4319
4320 if (par->next)
4321 nvptx_neuter_pars (par->next, modes, outer);
4322}
4323
f8cec994 4324/* PTX-specific reorganization
b3787ae4 4325 - Split blocks at fork and join instructions
857788d2 4326 - Compute live registers
4327 - Mark now-unused registers, so function begin doesn't declare
f8cec994 4328 unused registers.
b3787ae4 4329 - Insert state propagation when entering partitioned mode
4330 - Insert neutering instructions when in single mode
857788d2 4331 - Replace subregs with suitable sequences.
f8cec994 4332*/
4333
4334static void
4335nvptx_reorg (void)
4336{
f8cec994 4337 /* We are freeing block_for_insn in the toplev to keep compatibility
4338 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4339 compute_bb_for_insn ();
4340
4341 thread_prologue_and_epilogue_insns ();
4342
b3787ae4 4343 /* Split blocks and record interesting unspecs. */
4344 bb_insn_map_t bb_insn_map;
4345
4346 nvptx_split_blocks (&bb_insn_map);
4347
857788d2 4348 /* Compute live regs */
f8cec994 4349 df_clear_flags (DF_LR_RUN_DCE);
4350 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
b3787ae4 4351 df_live_add_problem ();
4352 df_live_set_all_dirty ();
f8cec994 4353 df_analyze ();
8ce80784 4354 regstat_init_n_sets_and_refs ();
4355
b3787ae4 4356 if (dump_file)
4357 df_dump (dump_file);
4358
f8cec994 4359 /* Mark unused regs as unused. */
b3787ae4 4360 int max_regs = max_reg_num ();
6e71bbf2 4361 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
8ce80784 4362 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4363 regno_reg_rtx[i] = const0_rtx;
f8cec994 4364
b3787ae4 4365 /* Determine launch dimensions of the function. If it is not an
4366 offloaded function (i.e. this is a regular compiler), the
4367 function has no neutering. */
4954efd4 4368 tree attr = oacc_get_fn_attrib (current_function_decl);
b3787ae4 4369 if (attr)
4370 {
4371 /* If we determined this mask before RTL expansion, we could
4372 elide emission of some levels of forks and joins. */
4373 unsigned mask = 0;
4374 tree dims = TREE_VALUE (attr);
4375 unsigned ix;
4376
4377 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4378 {
4379 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4380 tree allowed = TREE_PURPOSE (dims);
4381
4382 if (size != 1 && !(allowed && integer_zerop (allowed)))
4383 mask |= GOMP_DIM_MASK (ix);
4384 }
4385 /* If there is worker neutering, there must be vector
4386 neutering. Otherwise the hardware will fail. */
4387 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4388 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4389
4390 /* Discover & process partitioned regions. */
4391 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4392 nvptx_process_pars (pars);
4393 nvptx_neuter_pars (pars, mask, 0);
4394 delete pars;
4395 }
4396
f8cec994 4397 /* Replace subregs. */
67842e8e 4398 nvptx_reorg_subreg ();
f8cec994 4399
7fce8768 4400 if (TARGET_UNIFORM_SIMT)
4401 nvptx_reorg_uniform_simt ();
4402
8ce80784 4403 regstat_free_n_sets_and_refs ();
f8cec994 4404
4405 df_finish_pass (true);
8ce80784 4406}
4407\f
4408/* Handle a "kernel" attribute; arguments as in
4409 struct attribute_spec.handler. */
4410
4411static tree
4412nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4413 int ARG_UNUSED (flags), bool *no_add_attrs)
4414{
4415 tree decl = *node;
4416
4417 if (TREE_CODE (decl) != FUNCTION_DECL)
4418 {
4419 error ("%qE attribute only applies to functions", name);
4420 *no_add_attrs = true;
4421 }
c666c7b6 4422 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
8ce80784 4423 {
4424 error ("%qE attribute requires a void return type", name);
4425 *no_add_attrs = true;
4426 }
4427
4428 return NULL_TREE;
4429}
4430
7fce8768 4431/* Handle a "shared" attribute; arguments as in
4432 struct attribute_spec.handler. */
4433
4434static tree
4435nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4436 int ARG_UNUSED (flags), bool *no_add_attrs)
4437{
4438 tree decl = *node;
4439
4440 if (TREE_CODE (decl) != VAR_DECL)
4441 {
4442 error ("%qE attribute only applies to variables", name);
4443 *no_add_attrs = true;
4444 }
4445 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4446 {
4447 error ("%qE attribute not allowed with auto storage class", name);
4448 *no_add_attrs = true;
4449 }
4450
4451 return NULL_TREE;
4452}
4453
8ce80784 4454/* Table of valid machine attributes. */
4455static const struct attribute_spec nvptx_attribute_table[] =
4456{
4457 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
4458 affects_type_identity } */
4459 { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
7fce8768 4460 { "shared", 0, 0, true, false, false, nvptx_handle_shared_attribute, false },
8ce80784 4461 { NULL, 0, 0, false, false, false, NULL, false }
4462};
4463\f
4464/* Limit vector alignments to BIGGEST_ALIGNMENT. */
4465
4466static HOST_WIDE_INT
4467nvptx_vector_alignment (const_tree type)
4468{
4469 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4470
4471 return MIN (align, BIGGEST_ALIGNMENT);
4472}
b3787ae4 4473
4474/* Indicate that INSN cannot be duplicated. */
4475
4476static bool
4477nvptx_cannot_copy_insn_p (rtx_insn *insn)
4478{
4479 switch (recog_memoized (insn))
4480 {
4481 case CODE_FOR_nvptx_shufflesi:
4482 case CODE_FOR_nvptx_shufflesf:
4483 case CODE_FOR_nvptx_barsync:
4484 case CODE_FOR_nvptx_fork:
4485 case CODE_FOR_nvptx_forked:
4486 case CODE_FOR_nvptx_joining:
4487 case CODE_FOR_nvptx_join:
4488 return true;
4489 default:
4490 return false;
4491 }
4492}
c33494f0 4493
4494 /* Section anchors do not work. Initialization for flag_section_anchors
4495 probes the existence of the anchoring target hooks and prevents
4496 anchoring if they don't exist. However, we may be being used with
4497 a host-side compiler that does support anchoring, and hence see
4498 the anchor flag set (as it's not recalculated). So provide an
4499 implementation denying anchoring. */
4500
4501static bool
4502nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4503{
4504 return false;
4505}
8ce80784 4506\f
9c08fbb3 4507/* Record a symbol for mkoffload to enter into the mapping table. */
4508
4509static void
4510nvptx_record_offload_symbol (tree decl)
4511{
e561d5e1 4512 switch (TREE_CODE (decl))
4513 {
4514 case VAR_DECL:
4515 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4516 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4517 break;
4518
4519 case FUNCTION_DECL:
4520 {
4954efd4 4521 tree attr = oacc_get_fn_attrib (decl);
7fce8768 4522 /* OpenMP offloading does not set this attribute. */
4523 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
e561d5e1 4524
e561d5e1 4525 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4526 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4527
7fce8768 4528 for (; dims; dims = TREE_CHAIN (dims))
e561d5e1 4529 {
6e90d066 4530 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
e561d5e1 4531
6e90d066 4532 gcc_assert (!TREE_PURPOSE (dims));
e561d5e1 4533 fprintf (asm_out_file, ", %#x", size);
4534 }
5f05c4a3 4535
e561d5e1 4536 fprintf (asm_out_file, "\n");
4537 }
4538 break;
5f05c4a3 4539
e561d5e1 4540 default:
4541 gcc_unreachable ();
4542 }
9c08fbb3 4543}
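
/* E.g. (with illustrative names) a variable "bar" and an OpenACC
   function "foo._omp_fn.0" with launch dimensions {1, 32, 32}
   produce:

	//:VAR_MAP "bar"
	//:FUNC_MAP "foo._omp_fn.0", 0x1, 0x20, 0x20

   which mkoffload parses to build its mapping table.  */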
4544
8ce80784 4545/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4546 at the start of a file. */
4547
4548static void
4549nvptx_file_start (void)
4550{
4551 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4552 fputs ("\t.version\t3.1\n", asm_out_file);
4553 fputs ("\t.target\tsm_30\n", asm_out_file);
4554 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4555 fputs ("// END PREAMBLE\n", asm_out_file);
4556}
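
/* With a 64-bit Pmode the preamble above comes out as:

	// BEGIN PREAMBLE
		.version	3.1
		.target	sm_30
		.address_size 64
	// END PREAMBLE
  */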
4557
31a633e4 4558/* Emit a declaration for a worker-level buffer in .shared memory. */
4559
4560static void
4561write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4562{
4563 const char *name = XSTR (sym, 0);
4564
4565 write_var_marker (file, true, false, name);
4566 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4567 align, name, size);
4568}
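
/* E.g. a 16-byte buffer aligned to 8 bytes and named "__worker_bcast"
   (name illustrative) is emitted as:

	.shared .align 8 .u8 __worker_bcast[16];
  */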
4569
7794f2c9 4570/* Write out the function declarations we've collected and declare storage
4571 for the broadcast buffer. */
8ce80784 4572
4573static void
4574nvptx_file_end (void)
4575{
b0c5be65 4576 hash_table<tree_hasher>::iterator iter;
4577 tree decl;
4578 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
2583dd18 4579 nvptx_record_fndecl (decl);
8ce80784 4580 fputs (func_decls.str().c_str(), asm_out_file);
b3787ae4 4581
4582 if (worker_bcast_size)
31a633e4 4583 write_worker_buffer (asm_out_file, worker_bcast_sym,
4584 worker_bcast_align, worker_bcast_size);
78a78aac 4585
4586 if (worker_red_size)
31a633e4 4587 write_worker_buffer (asm_out_file, worker_red_sym,
4588 worker_red_align, worker_red_size);
7fce8768 4589
4590 if (need_softstack_decl)
4591 {
4592 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4593 /* 32 is the maximum number of warps in a block. Even though it's an
4594 external declaration, emit the array size explicitly; otherwise, it
4595 may fail at PTX JIT time if the definition is later in link order. */
4596 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4597 POINTER_SIZE);
4598 }
4599 if (need_unisimt_decl)
4600 {
4601 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4602 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4603 }
78a78aac 4604}
4605
4606/* Expander for the shuffle builtins. */
4607
4608static rtx
4609nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4610{
4611 if (ignore)
4612 return target;
4613
4614 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4615 NULL_RTX, mode, EXPAND_NORMAL);
4616 if (!REG_P (src))
4617 src = copy_to_mode_reg (mode, src);
4618
4619 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4620 NULL_RTX, SImode, EXPAND_NORMAL);
4621 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4622 NULL_RTX, SImode, EXPAND_NORMAL);
4623
4624 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4625 idx = copy_to_mode_reg (SImode, idx);
4626
8702ba1e 4627 rtx pat = nvptx_gen_shuffle (target, src, idx,
4628 (nvptx_shuffle_kind) INTVAL (op));
78a78aac 4629 if (pat)
4630 emit_insn (pat);
4631
4632 return target;
4633}
4634
4635/* Worker reduction address expander. */
4636
4637static rtx
4638nvptx_expand_worker_addr (tree exp, rtx target,
4639 machine_mode ARG_UNUSED (mode), int ignore)
4640{
4641 if (ignore)
4642 return target;
4643
4644 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4645 if (align > worker_red_align)
4646 worker_red_align = align;
4647
4648 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4649 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4650 if (size + offset > worker_red_size)
4651 worker_red_size = size + offset;
4652
d7ed88be 4653 rtx addr = worker_red_sym;
78a78aac 4654 if (offset)
d7ed88be 4655 {
4656 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4657 addr = gen_rtx_CONST (Pmode, addr);
4658 }
78a78aac 4659
d7ed88be 4660 emit_move_insn (target, addr);
78a78aac 4661
4662 return target;
4663}
4664
4665/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4666 not require taking the address of any object, other than the memory
4667 cell being operated on. */
4668
4669static rtx
4670nvptx_expand_cmp_swap (tree exp, rtx target,
4671 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4672{
4673 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4674
4675 if (!target)
4676 target = gen_reg_rtx (mode);
4677
4678 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4679 NULL_RTX, Pmode, EXPAND_NORMAL);
4680 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4681 NULL_RTX, mode, EXPAND_NORMAL);
4682 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4683 NULL_RTX, mode, EXPAND_NORMAL);
4684 rtx pat;
4685
4686 mem = gen_rtx_MEM (mode, mem);
4687 if (!REG_P (cmp))
4688 cmp = copy_to_mode_reg (mode, cmp);
4689 if (!REG_P (src))
4690 src = copy_to_mode_reg (mode, src);
4691
4692 if (mode == SImode)
4693 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4694 else
4695 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4696
4697 emit_insn (pat);
4698
4699 return target;
4700}
4701
4702
4703/* Codes for all the NVPTX builtins. */
4704enum nvptx_builtins
4705{
4706 NVPTX_BUILTIN_SHUFFLE,
4707 NVPTX_BUILTIN_SHUFFLELL,
4708 NVPTX_BUILTIN_WORKER_ADDR,
4709 NVPTX_BUILTIN_CMP_SWAP,
4710 NVPTX_BUILTIN_CMP_SWAPLL,
4711 NVPTX_BUILTIN_MAX
4712};
4713
4714static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4715
4716/* Return the NVPTX builtin for CODE. */
4717
4718static tree
4719nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4720{
4721 if (code >= NVPTX_BUILTIN_MAX)
4722 return error_mark_node;
4723
4724 return nvptx_builtin_decls[code];
4725}
4726
4727/* Set up all builtin functions for this target. */
4728
4729static void
4730nvptx_init_builtins (void)
4731{
4732#define DEF(ID, NAME, T) \
4733 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4734 = add_builtin_function ("__builtin_nvptx_" NAME, \
4735 build_function_type_list T, \
4736 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4737#define ST sizetype
4738#define UINT unsigned_type_node
4739#define LLUINT long_long_unsigned_type_node
4740#define PTRVOID ptr_type_node
4741
4742 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4743 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4744 DEF (WORKER_ADDR, "worker_addr",
4745 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4746 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4747 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4748
4749#undef DEF
4750#undef ST
4751#undef UINT
4752#undef LLUINT
4753#undef PTRVOID
4754}
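
/* For illustration only -- these builtins are internal and normally
   emitted by the compiler itself; the argument names below are
   invented:

	unsigned v2 = __builtin_nvptx_shuffle (v, 16, kind);
	void *red = __builtin_nvptx_worker_addr (offset, size, align);
	unsigned old = __builtin_nvptx_cmp_swap (ptr, expected, desired);
  */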
4755
4756/* Expand an expression EXP that calls a built-in function,
4757 with result going to TARGET if that's convenient
4758 (and in mode MODE if that's convenient).
4759 SUBTARGET may be used as the target for computing one of EXP's operands.
4760 IGNORE is nonzero if the value is to be ignored. */
4761
4762static rtx
4763nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4764 machine_mode mode, int ignore)
4765{
4766 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4767 switch (DECL_FUNCTION_CODE (fndecl))
4768 {
4769 case NVPTX_BUILTIN_SHUFFLE:
4770 case NVPTX_BUILTIN_SHUFFLELL:
4771 return nvptx_expand_shuffle (exp, target, mode, ignore);
4772
4773 case NVPTX_BUILTIN_WORKER_ADDR:
4774 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4775
4776 case NVPTX_BUILTIN_CMP_SWAP:
4777 case NVPTX_BUILTIN_CMP_SWAPLL:
4778 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4779
4780 default: gcc_unreachable ();
4781 }
8ce80784 4782}
4783\f
78a78aac 4784/* Define dimension sizes for known hardware. */
4785#define PTX_VECTOR_LENGTH 32
4786#define PTX_WORKER_LENGTH 32
b7aef2f7 4787#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
78a78aac 4788
7fce8768 4789/* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4790
4791static int
4792nvptx_simt_vf ()
4793{
4794 return PTX_VECTOR_LENGTH;
4795}
4796
0bb0f256 4797/* Validate compute dimensions of an OpenACC offload or routine, fill
4798 in non-unity defaults. FN_LEVEL indicates the level at which a
948eee2f 4799 routine might spawn a loop. It is negative for non-routines. If
4800 DECL is null, we are validating the default dimensions. */
0bb0f256 4801
4802static bool
6e90d066 4803nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
0bb0f256 4804{
4805 bool changed = false;
4806
6255cae7 4807 /* The vector size must be 32, unless this is a SEQ routine. */
948eee2f 4808 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
4809 && dims[GOMP_DIM_VECTOR] >= 0
6255cae7 4810 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
4811 {
948eee2f 4812 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
4813 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6255cae7 4814 dims[GOMP_DIM_VECTOR]
1caf9cb4 4815 ? G_("using vector_length (%d), ignoring %d")
4816 : G_("using vector_length (%d), ignoring runtime setting"),
6255cae7 4817 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
4818 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4819 changed = true;
4820 }
4821
4822 /* Check that num_workers is not too large. */
4823 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
4824 {
948eee2f 4825 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6255cae7 4826 "using num_workers (%d), ignoring %d",
4827 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
4828 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4829 changed = true;
4830 }
0bb0f256 4831
948eee2f 4832 if (!decl)
4833 {
4834 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
4835 if (dims[GOMP_DIM_WORKER] < 0)
4836 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
4837 if (dims[GOMP_DIM_GANG] < 0)
4838 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
4839 changed = true;
4840 }
4841
0bb0f256 4842 return changed;
4843}
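
/* For example (illustrative), "#pragma acc parallel vector_length (64)
   num_workers (64)" is diagnosed with "using vector_length (32),
   ignoring 64" and "using num_workers (32), ignoring 64", and the
   region is compiled with both dimensions clamped to 32.  */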
b3787ae4 4844
e1037942 4845/* Return maximum dimension size, or zero for unbounded. */
4846
4847static int
4848nvptx_dim_limit (int axis)
4849{
4850 switch (axis)
4851 {
4852 case GOMP_DIM_WORKER:
4853 return PTX_WORKER_LENGTH;
4854
4855 case GOMP_DIM_VECTOR:
4856 return PTX_VECTOR_LENGTH;
4857
4858 default:
4859 break;
4860 }
4861 return 0;
4862}
4863
b3787ae4 4864/* Determine whether fork & joins are needed. */
4865
4866static bool
4867nvptx_goacc_fork_join (gcall *call, const int dims[],
4868 bool ARG_UNUSED (is_fork))
4869{
4870 tree arg = gimple_call_arg (call, 2);
4871 unsigned axis = TREE_INT_CST_LOW (arg);
4872
4873 /* We only care about worker and vector partitioning. */
4874 if (axis < GOMP_DIM_WORKER)
4875 return false;
4876
4877 /* If the size is 1, there's no partitioning. */
4878 if (dims[axis] == 1)
4879 return false;
4880
4881 return true;
4882}
4883
78a78aac 4884/* Generate a PTX builtin function call that returns the address in
4885 the worker reduction buffer at OFFSET. TYPE is the type of the
4886 data at that location. */
4887
4888static tree
4889nvptx_get_worker_red_addr (tree type, tree offset)
4890{
4891 machine_mode mode = TYPE_MODE (type);
4892 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
4893 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
4894 tree align = build_int_cst (unsigned_type_node,
4895 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
4896 tree call = build_call_expr (fndecl, 3, offset, size, align);
4897
4898 return fold_convert (build_pointer_type (type), call);
4899}
4900
4901 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR. This function
4902 will cast the variable if necessary. */
4903
4904static void
4905nvptx_generate_vector_shuffle (location_t loc,
4906 tree dest_var, tree var, unsigned shift,
4907 gimple_seq *seq)
4908{
4909 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
4910 tree_code code = NOP_EXPR;
bde24c35 4911 tree arg_type = unsigned_type_node;
4912 tree var_type = TREE_TYPE (var);
4913 tree dest_type = var_type;
78a78aac 4914
bde24c35 4915 if (TREE_CODE (var_type) == COMPLEX_TYPE)
4916 var_type = TREE_TYPE (var_type);
4917
4918 if (TREE_CODE (var_type) == REAL_TYPE)
78a78aac 4919 code = VIEW_CONVERT_EXPR;
bde24c35 4920
4921 if (TYPE_SIZE (var_type)
4922 == TYPE_SIZE (long_long_unsigned_type_node))
78a78aac 4923 {
4924 fn = NVPTX_BUILTIN_SHUFFLELL;
bde24c35 4925 arg_type = long_long_unsigned_type_node;
78a78aac 4926 }
bde24c35 4927
78a78aac 4928 tree call = nvptx_builtin_decl (fn, true);
bde24c35 4929 tree bits = build_int_cst (unsigned_type_node, shift);
4930 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
4931 tree expr;
4932
4933 if (var_type != dest_type)
4934 {
4935 /* Do real and imaginary parts separately. */
4936 tree real = fold_build1 (REALPART_EXPR, var_type, var);
4937 real = fold_build1 (code, arg_type, real);
4938 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
4939 real = fold_build1 (code, var_type, real);
78a78aac 4940
bde24c35 4941 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
4942 imag = fold_build1 (code, arg_type, imag);
4943 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
4944 imag = fold_build1 (code, var_type, imag);
4945
4946 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
4947 }
4948 else
4949 {
4950 expr = fold_build1 (code, arg_type, var);
4951 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
4952 expr = fold_build1 (code, dest_type, expr);
4953 }
78a78aac 4954
bde24c35 4955 gimplify_assign (dest_var, expr, seq);
78a78aac 4956}
4957
1927fff5 4958/* Lazily generate the global lock var decl and return its address. */
4959
4960static tree
4961nvptx_global_lock_addr ()
4962{
4963 tree v = global_lock_var;
4964
4965 if (!v)
4966 {
4967 tree name = get_identifier ("__reduction_lock");
4968 tree type = build_qualified_type (unsigned_type_node,
4969 TYPE_QUAL_VOLATILE);
4970 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
4971 global_lock_var = v;
4972 DECL_ARTIFICIAL (v) = 1;
4973 DECL_EXTERNAL (v) = 1;
4974 TREE_STATIC (v) = 1;
4975 TREE_PUBLIC (v) = 1;
4976 TREE_USED (v) = 1;
4977 mark_addressable (v);
4978 mark_decl_referenced (v);
4979 }
4980
4981 return build_fold_addr_expr (v);
4982}
4983
4984/* Insert code to locklessly update *PTR with *PTR OP VAR just before
4985 GSI. We use a lockless scheme for nearly all cases, which looks
4986 like:
4987 actual = initval(OP);
4988 do {
4989 guess = actual;
4990 write = guess OP myval;
4991 actual = cmp&swap (ptr, guess, write)
4992 } while (actual bit-different-to guess);
4993 return write;
4994
4995 This relies on a cmp&swap instruction, which is available for 32-
4996 and 64-bit types. Larger types must use a locking scheme. */
78a78aac 4997
4998static tree
4999nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5000 tree ptr, tree var, tree_code op)
5001{
5002 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5003 tree_code code = NOP_EXPR;
1927fff5 5004 tree arg_type = unsigned_type_node;
5005 tree var_type = TREE_TYPE (var);
78a78aac 5006
1927fff5 5007 if (TREE_CODE (var_type) == COMPLEX_TYPE
5008 || TREE_CODE (var_type) == REAL_TYPE)
78a78aac 5009 code = VIEW_CONVERT_EXPR;
1927fff5 5010
5011 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
78a78aac 5012 {
1927fff5 5013 arg_type = long_long_unsigned_type_node;
78a78aac 5014 fn = NVPTX_BUILTIN_CMP_SWAPLL;
78a78aac 5015 }
5016
1927fff5 5017 tree swap_fn = nvptx_builtin_decl (fn, true);
5018
78a78aac 5019 gimple_seq init_seq = NULL;
1927fff5 5020 tree init_var = make_ssa_name (arg_type);
5021 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5022 init_expr = fold_build1 (code, arg_type, init_expr);
78a78aac 5023 gimplify_assign (init_var, init_expr, &init_seq);
5024 gimple *init_end = gimple_seq_last (init_seq);
5025
5026 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5027
78a78aac 5028 /* Split the block just after the init stmts. */
5029 basic_block pre_bb = gsi_bb (*gsi);
5030 edge pre_edge = split_block (pre_bb, init_end);
5031 basic_block loop_bb = pre_edge->dest;
5032 pre_bb = pre_edge->src;
5033 /* Reset the iterator. */
5034 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5035
1927fff5 5036 tree expect_var = make_ssa_name (arg_type);
5037 tree actual_var = make_ssa_name (arg_type);
5038 tree write_var = make_ssa_name (arg_type);
5039
5040 /* Build and insert the reduction calculation. */
5041 gimple_seq red_seq = NULL;
5042 tree write_expr = fold_build1 (code, var_type, expect_var);
5043 write_expr = fold_build2 (op, var_type, write_expr, var);
5044 write_expr = fold_build1 (code, arg_type, write_expr);
5045 gimplify_assign (write_var, write_expr, &red_seq);
5046
5047 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5048
5049 /* Build & insert the cmp&swap sequence. */
5050 gimple_seq latch_seq = NULL;
5051 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5052 ptr, expect_var, write_var);
5053 gimplify_assign (actual_var, swap_expr, &latch_seq);
5054
5055 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5056 NULL_TREE, NULL_TREE);
5057 gimple_seq_add_stmt (&latch_seq, cond);
5058
5059 gimple *latch_end = gimple_seq_last (latch_seq);
5060 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
78a78aac 5061
1927fff5 5062 /* Split the block just after the latch stmts. */
5063 edge post_edge = split_block (loop_bb, latch_end);
78a78aac 5064 basic_block post_bb = post_edge->dest;
5065 loop_bb = post_edge->src;
5066 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5067
5068 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
446e05fd 5069 post_edge->probability = profile_probability::even ();
78a78aac 5070 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
446e05fd 5071 loop_edge->probability = profile_probability::even ();
78a78aac 5072 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5073 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5074
5075 gphi *phi = create_phi_node (expect_var, loop_bb);
5076 add_phi_arg (phi, init_var, pre_edge, loc);
5077 add_phi_arg (phi, actual_var, loop_edge, loc);
5078
5079 loop *loop = alloc_loop ();
5080 loop->header = loop_bb;
5081 loop->latch = loop_bb;
5082 add_loop (loop, loop_bb->loop_father);
5083
1927fff5 5084 return fold_build1 (code, var_type, write_var);
5085}
5086
5087/* Insert code to lockfully update *PTR with *PTR OP VAR just before
5088 GSI. This is necessary for types larger than 64 bits, where there
5089 is no cmp&swap instruction to implement a lockless scheme. We use
5090 a lock variable in global memory.
5091
5092 while (cmp&swap (&lock_var, 0, 1))
5093 continue;
5094 T accum = *ptr;
5095 accum = accum OP var;
5096 *ptr = accum;
5097 cmp&swap (&lock_var, 1, 0);
5098 return accum;
5099
5100 A lock in global memory is necessary to force execution engine
5101 descheduling and avoid resource starvation that can occur if the
5102 lock is in .shared memory. */
5103
5104static tree
5105nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5106 tree ptr, tree var, tree_code op)
5107{
5108 tree var_type = TREE_TYPE (var);
5109 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5110 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5111 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5112
5113 /* Split the block just before the gsi. Insert a gimple nop to make
5114 this easier. */
5115 gimple *nop = gimple_build_nop ();
5116 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5117 basic_block entry_bb = gsi_bb (*gsi);
5118 edge entry_edge = split_block (entry_bb, nop);
5119 basic_block lock_bb = entry_edge->dest;
5120 /* Reset the iterator. */
5121 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5122
5123 /* Build and insert the locking sequence. */
5124 gimple_seq lock_seq = NULL;
5125 tree lock_var = make_ssa_name (unsigned_type_node);
5126 tree lock_expr = nvptx_global_lock_addr ();
5127 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5128 uns_unlocked, uns_locked);
5129 gimplify_assign (lock_var, lock_expr, &lock_seq);
5130 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5131 NULL_TREE, NULL_TREE);
5132 gimple_seq_add_stmt (&lock_seq, cond);
5133 gimple *lock_end = gimple_seq_last (lock_seq);
5134 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5135
5136 /* Split the block just after the lock sequence. */
5137 edge locked_edge = split_block (lock_bb, lock_end);
5138 basic_block update_bb = locked_edge->dest;
5139 lock_bb = locked_edge->src;
5140 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5141
5142 /* Create the lock loop ... */
5143 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
446e05fd 5144 locked_edge->probability = profile_probability::even ();
5145 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5146 loop_edge->probability = profile_probability::even ();
1927fff5 5147 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5148 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5149
5150 /* ... and the loop structure. */
5151 loop *lock_loop = alloc_loop ();
5152 lock_loop->header = lock_bb;
5153 lock_loop->latch = lock_bb;
5154 lock_loop->nb_iterations_estimate = 1;
5155 lock_loop->any_estimate = true;
5156 add_loop (lock_loop, entry_bb->loop_father);
5157
5158 /* Build and insert the reduction calculation. */
5159 gimple_seq red_seq = NULL;
5160 tree acc_in = make_ssa_name (var_type);
5161 tree ref_in = build_simple_mem_ref (ptr);
5162 TREE_THIS_VOLATILE (ref_in) = 1;
5163 gimplify_assign (acc_in, ref_in, &red_seq);
5164
5165 tree acc_out = make_ssa_name (var_type);
5166 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5167 gimplify_assign (acc_out, update_expr, &red_seq);
5168
5169 tree ref_out = build_simple_mem_ref (ptr);
5170 TREE_THIS_VOLATILE (ref_out) = 1;
5171 gimplify_assign (ref_out, acc_out, &red_seq);
5172
5173 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5174
5175 /* Build & insert the unlock sequence. */
5176 gimple_seq unlock_seq = NULL;
5177 tree unlock_expr = nvptx_global_lock_addr ();
5178 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5179 uns_locked, uns_unlocked);
5180 gimplify_and_add (unlock_expr, &unlock_seq);
5181 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5182
5183 return acc_out;
5184}
5185
5186 /* Emit a sequence to update a reduction accumulator at *PTR with the
5187 value held in VAR using operator OP. Return the updated value.
5188
5189 TODO: optimize for atomic ops and independent complex ops. */
5190
5191static tree
5192nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5193 tree ptr, tree var, tree_code op)
5194{
5195 tree type = TREE_TYPE (var);
5196 tree size = TYPE_SIZE (type);
5197
5198 if (size == TYPE_SIZE (unsigned_type_node)
5199 || size == TYPE_SIZE (long_long_unsigned_type_node))
5200 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5201 else
5202 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
78a78aac 5203}
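
/* So, for instance, int, float, long long and double (and even
   _Complex float, being 64 bits wide) take the lockless cmp&swap
   path, whereas a 16-byte type such as _Complex double falls back to
   the global lock.  */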
5204
5205/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
5206
5207static void
5208nvptx_goacc_reduction_setup (gcall *call)
5209{
5210 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5211 tree lhs = gimple_call_lhs (call);
5212 tree var = gimple_call_arg (call, 2);
5213 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5214 gimple_seq seq = NULL;
5215
5216 push_gimplify_context (true);
5217
5218 if (level != GOMP_DIM_GANG)
5219 {
5220 /* Copy the receiver object. */
5221 tree ref_to_res = gimple_call_arg (call, 1);
5222
5223 if (!integer_zerop (ref_to_res))
5224 var = build_simple_mem_ref (ref_to_res);
5225 }
5226
5227 if (level == GOMP_DIM_WORKER)
5228 {
5229 /* Store incoming value to worker reduction buffer. */
5230 tree offset = gimple_call_arg (call, 5);
5231 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5232 tree ptr = make_ssa_name (TREE_TYPE (call));
5233
5234 gimplify_assign (ptr, call, &seq);
5235 tree ref = build_simple_mem_ref (ptr);
5236 TREE_THIS_VOLATILE (ref) = 1;
5237 gimplify_assign (ref, var, &seq);
5238 }
5239
5240 if (lhs)
5241 gimplify_assign (lhs, var, &seq);
5242
5243 pop_gimplify_context (NULL);
5244 gsi_replace_with_seq (&gsi, seq, true);
5245}
5246
5247/* NVPTX implementation of GOACC_REDUCTION_INIT. */
5248
5249static void
5250nvptx_goacc_reduction_init (gcall *call)
5251{
5252 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5253 tree lhs = gimple_call_lhs (call);
5254 tree var = gimple_call_arg (call, 2);
5255 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5256 enum tree_code rcode
5257 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5258 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5259 TREE_TYPE (var));
5260 gimple_seq seq = NULL;
5261
5262 push_gimplify_context (true);
5263
5264 if (level == GOMP_DIM_VECTOR)
5265 {
5266 /* Initialize the non-zero vector lanes to INIT_VAL (OP). */
5267 tree tid = make_ssa_name (integer_type_node);
5268 tree dim_vector = gimple_call_arg (call, 3);
5269 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5270 dim_vector);
5271 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5272 NULL_TREE, NULL_TREE);
5273
5274 gimple_call_set_lhs (tid_call, tid);
5275 gimple_seq_add_stmt (&seq, tid_call);
5276 gimple_seq_add_stmt (&seq, cond_stmt);
5277
5278 /* Split the block just after the call. */
5279 edge init_edge = split_block (gsi_bb (gsi), call);
5280 basic_block init_bb = init_edge->dest;
5281 basic_block call_bb = init_edge->src;
5282
5283 /* Fixup flags from call_bb to init_bb. */
5284 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
8aaedbe7 5285 init_edge->probability = profile_probability::even ();
78a78aac 5286
5287 /* Set the initialization stmts. */
5288 gimple_seq init_seq = NULL;
5289 tree init_var = make_ssa_name (TREE_TYPE (var));
5290 gimplify_assign (init_var, init, &init_seq);
5291 gsi = gsi_start_bb (init_bb);
5292 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5293
5294 /* Split block just after the init stmt. */
5295 gsi_prev (&gsi);
5296 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5297 basic_block dst_bb = inited_edge->dest;
5298
5299 /* Create false edge from call_bb to dst_bb. */
5300 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
8aaedbe7 5301 nop_edge->probability = profile_probability::even ();
78a78aac 5302
5303 /* Create phi node in dst block. */
5304 gphi *phi = create_phi_node (lhs, dst_bb);
5305 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5306 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5307
5308 /* Reset dominator of dst bb. */
5309 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5310
5311 /* Reset the gsi. */
5312 gsi = gsi_for_stmt (call);
5313 }
5314 else
5315 {
5316 if (level == GOMP_DIM_GANG)
5317 {
5318 /* If there's no receiver object, propagate the incoming VAR. */
5319 tree ref_to_res = gimple_call_arg (call, 1);
5320 if (integer_zerop (ref_to_res))
5321 init = var;
5322 }
5323
5324 gimplify_assign (lhs, init, &seq);
5325 }
5326
5327 pop_gimplify_context (NULL);
5328 gsi_replace_with_seq (&gsi, seq, true);
5329}
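
/* The vector-level case above builds a small CFG diamond.  A sketch of
   the result (assuming the usual internal-fn spelling):

     call_bb:
       tid = IFN_GOACC_DIM_POS (vector);
       if (tid != 0) goto init_bb; else goto dst_bb;
     init_bb:
       init_var = <identity value of RCODE>;
     dst_bb:
       lhs = PHI <init_var (init_bb), var (call_bb)>;

   Only lane zero keeps the incoming value; every other lane starts
   from the reduction operation's identity element.  */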
5330
5331/* NVPTX implementation of GOACC_REDUCTION_FINI. */
5332
5333static void
5334nvptx_goacc_reduction_fini (gcall *call)
5335{
5336 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5337 tree lhs = gimple_call_lhs (call);
5338 tree ref_to_res = gimple_call_arg (call, 1);
5339 tree var = gimple_call_arg (call, 2);
5340 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5341 enum tree_code op
 5342    = (enum tree_code) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5343 gimple_seq seq = NULL;
 5344  tree r = NULL_TREE;
5345
5346 push_gimplify_context (true);
5347
5348 if (level == GOMP_DIM_VECTOR)
5349 {
 5350      /* Emit a binary shuffle tree.  TODO: emit this as an actual loop,
 5351	 but that requires a method of emitting a unified jump at the
 5352	 gimple level.  */
5353 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5354 {
5355 tree other_var = make_ssa_name (TREE_TYPE (var));
5356 nvptx_generate_vector_shuffle (gimple_location (call),
5357 other_var, var, shfl, &seq);
5358
5359 r = make_ssa_name (TREE_TYPE (var));
5360 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5361 var, other_var), &seq);
5362 var = r;
5363 }
5364 }
5365 else
5366 {
5367 tree accum = NULL_TREE;
5368
5369 if (level == GOMP_DIM_WORKER)
5370 {
5371 /* Get reduction buffer address. */
5372 tree offset = gimple_call_arg (call, 5);
5373 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5374 tree ptr = make_ssa_name (TREE_TYPE (call));
5375
5376 gimplify_assign (ptr, call, &seq);
5377 accum = ptr;
5378 }
5379 else if (integer_zerop (ref_to_res))
5380 r = var;
5381 else
5382 accum = ref_to_res;
5383
5384 if (accum)
5385 {
1927fff5 5386 /* UPDATE the accumulator. */
78a78aac 5387 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5388 seq = NULL;
1927fff5 5389 r = nvptx_reduction_update (gimple_location (call), &gsi,
5390 accum, var, op);
78a78aac 5391 }
5392 }
5393
5394 if (lhs)
5395 gimplify_assign (lhs, r, &seq);
5396 pop_gimplify_context (NULL);
5397
5398 gsi_replace_with_seq (&gsi, seq, true);
5399}
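
/* The vector-level loop above expands into a logarithmic shuffle
   butterfly.  With PTX_VECTOR_LENGTH == 32 the emitted sequence is
   roughly (a sketch, not literal output):

     t = <var shuffled down by 16>;  var = var OP t;
     t = <var shuffled down by 8>;   var = var OP t;
     t = <var shuffled down by 4>;   var = var OP t;
     t = <var shuffled down by 2>;   var = var OP t;
     t = <var shuffled down by 1>;   var = var OP t;

   after which lane zero holds the reduction over all lanes.  */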
5400
5401/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
5402
5403static void
5404nvptx_goacc_reduction_teardown (gcall *call)
5405{
5406 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5407 tree lhs = gimple_call_lhs (call);
5408 tree var = gimple_call_arg (call, 2);
5409 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5410 gimple_seq seq = NULL;
5411
5412 push_gimplify_context (true);
5413 if (level == GOMP_DIM_WORKER)
5414 {
5415 /* Read the worker reduction buffer. */
5416 tree offset = gimple_call_arg (call, 5);
 5417      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5418 tree ptr = make_ssa_name (TREE_TYPE (call));
5419
5420 gimplify_assign (ptr, call, &seq);
5421 var = build_simple_mem_ref (ptr);
5422 TREE_THIS_VOLATILE (var) = 1;
5423 }
5424
5425 if (level != GOMP_DIM_GANG)
5426 {
5427 /* Write to the receiver object. */
5428 tree ref_to_res = gimple_call_arg (call, 1);
5429
5430 if (!integer_zerop (ref_to_res))
5431 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5432 }
5433
5434 if (lhs)
5435 gimplify_assign (lhs, var, &seq);
5436
5437 pop_gimplify_context (NULL);
5438
5439 gsi_replace_with_seq (&gsi, seq, true);
5440}
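
/* Taken together, the four expanders above implement the call sequence
   the middle end emits around each OpenACC reduction (a sketch of the
   usual shape):

     v = GOACC_REDUCTION (SETUP, ...);     <-- stage incoming value
     v = GOACC_REDUCTION (INIT, ...);      <-- seed with OP's identity
     ... partitioned loop body updating v ...
     v = GOACC_REDUCTION (FINI, ...);      <-- combine partial results
     v = GOACC_REDUCTION (TEARDOWN, ...);  <-- write result back

   nvptx_goacc_reduction below dispatches on the first argument.  */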
5441
5442/* NVPTX reduction expander. */
5443
7bb66bb9 5444static void
78a78aac 5445nvptx_goacc_reduction (gcall *call)
5446{
 5447  unsigned code = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5448
5449 switch (code)
5450 {
5451 case IFN_GOACC_REDUCTION_SETUP:
5452 nvptx_goacc_reduction_setup (call);
5453 break;
5454
5455 case IFN_GOACC_REDUCTION_INIT:
5456 nvptx_goacc_reduction_init (call);
5457 break;
5458
5459 case IFN_GOACC_REDUCTION_FINI:
5460 nvptx_goacc_reduction_fini (call);
5461 break;
5462
5463 case IFN_GOACC_REDUCTION_TEARDOWN:
5464 nvptx_goacc_reduction_teardown (call);
5465 break;
5466
5467 default:
5468 gcc_unreachable ();
5469 }
5470}
5471
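/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  We never want constants
   forced into (constant pool) memory on this target.  */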
5759d97e 5472static bool
5473nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5474 rtx x ATTRIBUTE_UNUSED)
5475{
5476 return true;
5477}
5478
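/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only the two vector modes
   the backend actually handles are reported as supported.  */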
fcac805e 5479static bool
5480nvptx_vector_mode_supported (machine_mode mode)
5481{
ffaae5bd 5482 return (mode == V2SImode
5483 || mode == V2DImode);
5484}
5485
5486/* Return the preferred mode for vectorizing scalar MODE. */
5487
5488static machine_mode
4c1a1be2 5489nvptx_preferred_simd_mode (scalar_mode mode)
ffaae5bd 5490{
5491 switch (mode)
5492 {
916ace94 5493 case E_DImode:
ffaae5bd 5494 return V2DImode;
916ace94 5495 case E_SImode:
ffaae5bd 5496 return V2SImode;
5497
5498 default:
5499 return default_preferred_simd_mode (mode);
5500 }
5501}
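
/* So, for instance, a loop over 64-bit elements is vectorized two
   lanes at a time as V2DImode; element modes other than SImode and
   DImode fall back to the default choice.  */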
5502
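/* Adjust the alignment of data of TYPE (presumably wired up as the
   DATA_ALIGNMENT macro).  TImode objects are accessed as two halves
   of the mode TImode splits into, so e.g. a 16-byte integer only
   needs the 64-bit alignment of that split mode, not 128 bits.  */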
5503unsigned int
5504nvptx_data_alignment (const_tree type, unsigned int basic_align)
5505{
5506 if (TREE_CODE (type) == INTEGER_TYPE)
5507 {
5508 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5509 if (size == GET_MODE_SIZE (TImode))
5510 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5511 }
5512
5513 return basic_align;
fcac805e 5514}
5515
5f6dcf1a 5516/* Implement TARGET_MODES_TIEABLE_P. */
5517
5518static bool
5519nvptx_modes_tieable_p (machine_mode, machine_mode)
5520{
5521 return false;
5522}
5523
74f68e49 5524/* Implement TARGET_HARD_REGNO_NREGS. */
5525
5526static unsigned int
5527nvptx_hard_regno_nregs (unsigned int, machine_mode)
5528{
5529 return 1;
5530}
5531
b56a9dbc 5532/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5533
5534static bool
5535nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5536{
5537 return false;
5538}
5539
8ce80784 5540#undef TARGET_OPTION_OVERRIDE
5541#define TARGET_OPTION_OVERRIDE nvptx_option_override
5542
5543#undef TARGET_ATTRIBUTE_TABLE
5544#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5545
e46fbef5 5546#undef TARGET_LRA_P
5547#define TARGET_LRA_P hook_bool_void_false
5548
8ce80784 5549#undef TARGET_LEGITIMATE_ADDRESS_P
5550#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5551
5552#undef TARGET_PROMOTE_FUNCTION_MODE
5553#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5554
5555#undef TARGET_FUNCTION_ARG
5556#define TARGET_FUNCTION_ARG nvptx_function_arg
5557#undef TARGET_FUNCTION_INCOMING_ARG
5558#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5559#undef TARGET_FUNCTION_ARG_ADVANCE
5560#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
a2246979 5561#undef TARGET_FUNCTION_ARG_BOUNDARY
5562#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
8ce80784 5563#undef TARGET_PASS_BY_REFERENCE
5564#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5565#undef TARGET_FUNCTION_VALUE_REGNO_P
5566#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5567#undef TARGET_FUNCTION_VALUE
5568#define TARGET_FUNCTION_VALUE nvptx_function_value
5569#undef TARGET_LIBCALL_VALUE
5570#define TARGET_LIBCALL_VALUE nvptx_libcall_value
5571#undef TARGET_FUNCTION_OK_FOR_SIBCALL
5572#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
f289122f 5573#undef TARGET_GET_DRAP_RTX
5574#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
8ce80784 5575#undef TARGET_SPLIT_COMPLEX_ARG
5576#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5577#undef TARGET_RETURN_IN_MEMORY
5578#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5579#undef TARGET_OMIT_STRUCT_RETURN_REG
5580#define TARGET_OMIT_STRUCT_RETURN_REG true
5581#undef TARGET_STRICT_ARGUMENT_NAMING
5582#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
8ce80784 5583#undef TARGET_CALL_ARGS
5584#define TARGET_CALL_ARGS nvptx_call_args
5585#undef TARGET_END_CALL_ARGS
5586#define TARGET_END_CALL_ARGS nvptx_end_call_args
5587
5588#undef TARGET_ASM_FILE_START
5589#define TARGET_ASM_FILE_START nvptx_file_start
5590#undef TARGET_ASM_FILE_END
5591#define TARGET_ASM_FILE_END nvptx_file_end
5592#undef TARGET_ASM_GLOBALIZE_LABEL
5593#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5594#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5595#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5596#undef TARGET_PRINT_OPERAND
5597#define TARGET_PRINT_OPERAND nvptx_print_operand
5598#undef TARGET_PRINT_OPERAND_ADDRESS
5599#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5600#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5601#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5602#undef TARGET_ASM_INTEGER
5603#define TARGET_ASM_INTEGER nvptx_assemble_integer
5604#undef TARGET_ASM_DECL_END
5605#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5606#undef TARGET_ASM_DECLARE_CONSTANT_NAME
5607#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5608#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5609#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5610#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5611#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5612
5613#undef TARGET_MACHINE_DEPENDENT_REORG
5614#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5615#undef TARGET_NO_REGISTER_ALLOCATION
5616#define TARGET_NO_REGISTER_ALLOCATION true
5617
d7ed88be 5618#undef TARGET_ENCODE_SECTION_INFO
5619#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
9c08fbb3 5620#undef TARGET_RECORD_OFFLOAD_SYMBOL
5621#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5622
8ce80784 5623#undef TARGET_VECTOR_ALIGNMENT
5624#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5625
b3787ae4 5626#undef TARGET_CANNOT_COPY_INSN_P
5627#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5628
c33494f0 5629#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5630#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5631
78a78aac 5632#undef TARGET_INIT_BUILTINS
5633#define TARGET_INIT_BUILTINS nvptx_init_builtins
5634#undef TARGET_EXPAND_BUILTIN
5635#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5636#undef TARGET_BUILTIN_DECL
5637#define TARGET_BUILTIN_DECL nvptx_builtin_decl
5638
7fce8768 5639#undef TARGET_SIMT_VF
5640#define TARGET_SIMT_VF nvptx_simt_vf
5641
0bb0f256 5642#undef TARGET_GOACC_VALIDATE_DIMS
5643#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5644
e1037942 5645#undef TARGET_GOACC_DIM_LIMIT
5646#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5647
b3787ae4 5648#undef TARGET_GOACC_FORK_JOIN
5649#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5650
78a78aac 5651#undef TARGET_GOACC_REDUCTION
5652#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5653
5759d97e 5654#undef TARGET_CANNOT_FORCE_CONST_MEM
5655#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5656
fcac805e 5657#undef TARGET_VECTOR_MODE_SUPPORTED_P
5658#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5659
ffaae5bd 5660#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5661#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5662 nvptx_preferred_simd_mode
5663
5f6dcf1a 5664#undef TARGET_MODES_TIEABLE_P
5665#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5666
74f68e49 5667#undef TARGET_HARD_REGNO_NREGS
5668#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5669
b56a9dbc 5670#undef TARGET_CAN_CHANGE_MODE_CLASS
5671#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5672
8ce80784 5673struct gcc_target targetm = TARGET_INITIALIZER;
5674
5675#include "gt-nvptx.h"