]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/nvptx/nvptx.c
tree-vect-generic.c (do_compare): Use -1 for true result instead of 1.
[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
CommitLineData
738f2522 1/* Target code for NVPTX.
5624e564 2 Copyright (C) 2014-2015 Free Software Foundation, Inc.
738f2522
BS
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21#include "config.h"
3a4d1cb1 22#include <sstream>
738f2522
BS
23#include "system.h"
24#include "coretypes.h"
c7131fb2 25#include "backend.h"
e11c4407 26#include "target.h"
738f2522 27#include "rtl.h"
e11c4407
AM
28#include "tree.h"
29#include "cfghooks.h"
c7131fb2 30#include "df.h"
e11c4407
AM
31#include "tm_p.h"
32#include "expmed.h"
33#include "optabs.h"
34#include "regs.h"
35#include "emit-rtl.h"
36#include "recog.h"
37#include "diagnostic.h"
40e23961 38#include "alias.h"
738f2522
BS
39#include "insn-flags.h"
40#include "output.h"
41#include "insn-attr.h"
36566b39 42#include "flags.h"
36566b39
PK
43#include "dojump.h"
44#include "explow.h"
45#include "calls.h"
36566b39
PK
46#include "varasm.h"
47#include "stmt.h"
738f2522 48#include "expr.h"
738f2522
BS
49#include "tm-preds.h"
50#include "tm-constrs.h"
738f2522
BS
51#include "langhooks.h"
52#include "dbxout.h"
738f2522 53#include "cfgrtl.h"
d88cd9c4 54#include "gimple.h"
738f2522 55#include "stor-layout.h"
738f2522 56#include "builtins.h"
3e32ee19
NS
57#include "omp-low.h"
58#include "gomp-constants.h"
d88cd9c4 59#include "dumpfile.h"
f3552158
NS
60#include "internal-fn.h"
61#include "gimple-iterator.h"
62#include "stringpool.h"
63#include "tree-ssa-operands.h"
64#include "tree-ssanames.h"
65#include "gimplify.h"
66#include "tree-phinodes.h"
67#include "cfgloop.h"
68#include "fold-const.h"
738f2522 69
994c5d85 70/* This file should be included last. */
d58627a0
RS
71#include "target-def.h"
72
d88cd9c4
NS
73#define SHUFFLE_UP 0
74#define SHUFFLE_DOWN 1
75#define SHUFFLE_BFLY 2
76#define SHUFFLE_IDX 3
77
738f2522
BS
78/* Record the function decls we've written, and the libfuncs and function
79 decls corresponding to them. */
80static std::stringstream func_decls;
f3dba894 81
6c907cff 82struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
f3dba894
TS
83{
84 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
85 static bool equal (rtx a, rtx b) { return a == b; }
86};
87
88static GTY((cache))
89 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
90
6c907cff 91struct tree_hasher : ggc_cache_ptr_hash<tree_node>
f3dba894
TS
92{
93 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
94 static bool equal (tree a, tree b) { return a == b; }
95};
96
97static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
98static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
738f2522 99
f3552158
NS
100/* Buffer needed to broadcast across workers. This is used for both
101 worker-neutering and worker broadcasting. It is shared by all
102 functions emitted. The buffer is placed in shared memory. It'd be
103 nice if PTX supported common blocks, because then this could be
104 shared across TUs (taking the largest size). */
d88cd9c4
NS
105static unsigned worker_bcast_size;
106static unsigned worker_bcast_align;
107#define worker_bcast_name "__worker_bcast"
108static GTY(()) rtx worker_bcast_sym;
109
f3552158
NS
110/* Buffer needed for worker reductions. This has to be distinct from
111 the worker broadcast array, as both may be live concurrently. */
112static unsigned worker_red_size;
113static unsigned worker_red_align;
114#define worker_red_name "__worker_red"
115static GTY(()) rtx worker_red_sym;
116
738f2522
BS
/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  /* VOIDmode marks "no return value recorded yet"; consumers such as
     nvptx_output_return treat it as "function returns nothing".  */
  p->ret_reg_mode = VOIDmode;
  return p;
}
126
/* Implement TARGET_OPTION_OVERRIDE.  Force the option settings this
   backend requires and set up the per-compilation state (hash tables
   and the worker broadcast/reduction symbols).  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;
  /* Gives us a predictable order, which we need especially for variables.  */
  flag_toplevel_reorder = 1;
  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;
  /* Debug info is disabled entirely for this target.  */
  write_symbols = NO_DEBUG;
  debug_info_level = DINFO_LEVEL_NONE;

  /* GC-rooted hash tables used to track which fndecls/libfuncs have
     already had PTX declarations written for them.  */
  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  /* Symbols for the shared-memory worker broadcast and reduction
     buffers; sizes grow on demand and the buffers are emitted at the
     end of compilation.  */
  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, worker_red_name);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
}
151
/* Return the mode to be used when declaring a ptx object for OBJ.
   For objects with subparts such as complex modes this is the mode
   of the subpart.  */

machine_mode
nvptx_underlying_object_mode (rtx obj)
{
  /* Look through subregs to the underlying register.  */
  if (GET_CODE (obj) == SUBREG)
    obj = SUBREG_REG (obj);
  machine_mode mode = GET_MODE (obj);
  /* TImode values are represented as a pair of DImode registers.  */
  if (mode == TImode)
    return DImode;
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);
  return mode;
}
168
/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case BLKmode:
      return ".b8";
    case BImode:
      return ".pred";
    case QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case HImode:
      return ".u16";
    case SImode:
      return ".u32";
    case DImode:
      return ".u64";

    case SFmode:
      return ".f32";
    case DFmode:
      return ".f64";

    default:
      /* All other modes must have been lowered before output time.  */
      gcc_unreachable ();
    }
}
202
203/* Return the number of pieces to use when dealing with a pseudo of *PMODE.
204 Alter *PMODE if we return a number greater than one. */
205
206static int
207maybe_split_mode (machine_mode *pmode)
208{
209 machine_mode mode = *pmode;
210
211 if (COMPLEX_MODE_P (mode))
212 {
213 *pmode = GET_MODE_INNER (mode);
214 return 2;
215 }
216 else if (mode == TImode)
217 {
218 *pmode = DImode;
219 return 2;
220 }
221 return 1;
222}
223
224/* Like maybe_split_mode, but only return whether or not the mode
225 needs to be split. */
226static bool
227nvptx_split_reg_p (machine_mode mode)
228{
229 if (COMPLEX_MODE_P (mode))
230 return true;
231 if (mode == TImode)
232 return true;
233 return false;
234}
235
d88cd9c4
NS
/* Emit forking instructions for MASK.  IS_CALL is true when the fork
   is being emitted around a call rather than a partitioned region.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  /* Only the worker and vector axes need explicit fork instructions.  */
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      /* Encode the is_call flag above the dimension bits.  */
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      if (!is_call)
	emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}
256
/* Emit joining instructions for MASK, mirroring nvptx_emit_forking.
   IS_CALL is true when emitted around a call.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  /* Only the worker and vector axes need explicit join instructions.  */
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      /* Encode the is_call flag above the dimension bits.  */
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      if (!is_call)
	emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}
276
738f2522
BS
/* True if an argument of mode MODE and type TYPE is passed in a
   register: integer and float scalars, plus complex values that are
   not aggregates.  TImode is excluded because it is split into two
   DImode pieces instead.  */
#define PASS_IN_REG_P(MODE, TYPE) \
  ((GET_MODE_CLASS (MODE) == MODE_INT \
    || GET_MODE_CLASS (MODE) == MODE_FLOAT \
    || ((GET_MODE_CLASS (MODE) == MODE_COMPLEX_INT \
	 || GET_MODE_CLASS (MODE) == MODE_COMPLEX_FLOAT) \
	&& !AGGREGATE_TYPE_P (TYPE))) \
   && (MODE) != TImode)

/* True if a value of mode MODE is returned in a register: integer or
   float scalars no wider than 8 bytes.  Everything else is returned
   in memory.  */
#define RETURN_IN_REG_P(MODE) \
  ((GET_MODE_CLASS (MODE) == MODE_INT \
    || GET_MODE_CLASS (MODE) == MODE_FLOAT) \
   && GET_MODE_SIZE (MODE) <= 8)
289\f
290/* Perform a mode promotion for a function argument with MODE. Return
291 the promoted mode. */
292
293static machine_mode
294arg_promotion (machine_mode mode)
295{
296 if (mode == QImode || mode == HImode)
297 return SImode;
298 return mode;
299}
300
/* Write the declaration of a function arg of TYPE to S.  I is the index
   of the argument, MODE its mode.  NO_ARG_TYPES is true if this is for
   a decl with zero TYPE_ARG_TYPES, i.e. an old-style C decl.
   Returns the index of the last argument slot written, so the caller
   can account for modes that occupy two slots.  */

static int
write_one_arg (std::stringstream &s, tree type, int i, machine_mode mode,
	       bool no_arg_types)
{
  /* Values not passed in registers go on the stack via a pointer.  */
  if (!PASS_IN_REG_P (mode, type))
    mode = Pmode;

  int count = maybe_split_mode (&mode);

  if (count == 2)
    {
      /* Complex or TImode arguments recurse as two piece-mode args.  */
      write_one_arg (s, NULL_TREE, i, mode, false);
      write_one_arg (s, NULL_TREE, i + 1, mode, false);
      return i + 1;
    }

  if (no_arg_types && !AGGREGATE_TYPE_P (type))
    {
      /* Unprototyped calls apply default argument promotions:
	 float -> double, narrow ints -> int.  */
      if (mode == SFmode)
	mode = DFmode;
      mode = arg_promotion (mode);
    }

  if (i > 0)
    s << ", ";
  /* QImode/HImode params are declared as one-byte arrays; see the PTX
     argument-passing conventions used elsewhere in this file.  */
  s << ".param" << nvptx_ptx_type_from_mode (mode, false) << " %in_ar"
    << (i + 1) << (mode == QImode || mode == HImode ? "[1]" : "");
  if (mode == BLKmode)
    s << "[" << int_size_in_bytes (type) << "]";
  return i;
}
336
337/* Look for attributes in ATTRS that would indicate we must write a function
338 as a .entry kernel rather than a .func. Return true if one is found. */
339
340static bool
341write_as_kernel (tree attrs)
342{
343 return (lookup_attribute ("kernel", attrs) != NULL_TREE
344 || lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE);
345}
346
ecf6e535
BS
/* Write a function decl for DECL to S, where NAME is the name to be used.
   This includes ptx .visible or .extern specifiers, .func or .kernel, and
   argument and return types.  */

static void
nvptx_write_function_decl (std::stringstream &s, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  tree args = TYPE_ARG_TYPES (fntype);
  tree attrs = DECL_ATTRIBUTES (decl);
  bool kernel = write_as_kernel (attrs);
  bool is_main = strcmp (name, "main") == 0;
  bool args_from_decl = false;

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  if (args == 0)
    {
      args = DECL_ARGUMENTS (decl);
      args_from_decl = true;
    }

  /* Linkage specifier.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << ".visible ";

  if (kernel)
    s << ".entry ";
  else
    s << ".func ";

  /* Declare the result.  */
  bool return_in_mem = false;
  if (TYPE_MODE (result_type) != VOIDmode)
    {
      machine_mode mode = TYPE_MODE (result_type);
      if (!RETURN_IN_REG_P (mode))
	return_in_mem = true;
      else
	{
	  mode = arg_promotion (mode);
	  s << "(.param" << nvptx_ptx_type_from_mode (mode, false)
	    << " %out_retval)";
	}
    }

  /* A leading '*' marks an already-mangled assembler name.  */
  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;

  /* Declare argument types.  */
  if ((args != NULL_TREE
       && !(TREE_CODE (args) == TREE_LIST
	    && TREE_VALUE (args) == void_type_node))
      || is_main
      || return_in_mem
      || DECL_STATIC_CHAIN (decl))
    {
      s << "(";
      int i = 0;
      bool any_args = false;
      /* A memory return is passed as a hidden first pointer argument.  */
      if (return_in_mem)
	{
	  s << ".param.u" << GET_MODE_BITSIZE (Pmode) << " %in_ar1";
	  i++;
	}
      while (args != NULL_TREE)
	{
	  tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
	  machine_mode mode = TYPE_MODE (type);

	  if (mode != VOIDmode)
	    {
	      i = write_one_arg (s, type, i, mode,
				 TYPE_ARG_TYPES (fntype) == 0);
	      any_args = true;
	      i++;
	    }
	  args = TREE_CHAIN (args);
	}
      /* Variadic functions get a trailing pointer to the stacked
	 unnamed arguments.  */
      if (stdarg_p (fntype))
	{
	  gcc_assert (i > 0);
	  s << ", .param.u" << GET_MODE_BITSIZE (Pmode) << " %in_argp";
	}
      if (DECL_STATIC_CHAIN (decl))
	{
	  if (i > 0)
	    s << ", ";
	  s << ".reg.u" << GET_MODE_BITSIZE (Pmode)
	    << reg_names [STATIC_CHAIN_REGNUM];
	}
      /* main gets conventional argc/argv parameters even when declared
	 without them.  */
      if (!any_args && is_main)
	s << ".param.u32 %argc, .param.u" << GET_MODE_BITSIZE (Pmode)
	  << " %argv";
      s << ")";
    }
}
451
/* Walk either ARGTYPES or ARGS if the former is null, and write out part of
   the function header to FILE.  If WRITE_COPY is false, write reg
   declarations, otherwise write the copy from the incoming argument to that
   reg.  RETURN_IN_MEM indicates whether to start counting arg numbers at 1
   instead of 0.  */

static void
walk_args_for_param (FILE *file, tree argtypes, tree args, bool write_copy,
		     bool return_in_mem)
{
  int i;

  bool args_from_decl = false;
  if (argtypes == 0)
    args_from_decl = true;
  else
    args = argtypes;

  /* Slot 0 (ar1) is the hidden return pointer when return_in_mem.  */
  for (i = return_in_mem ? 1 : 0; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = args_from_decl ? TREE_TYPE (args) : TREE_VALUE (args);
      machine_mode mode = TYPE_MODE (type);

      if (mode == VOIDmode)
	break;

      /* Non-register arguments are passed as pointers.  */
      if (!PASS_IN_REG_P (mode, type))
	mode = Pmode;

      int count = maybe_split_mode (&mode);
      if (count == 1)
	{
	  /* For an unprototyped decl apply the default float promotion,
	     mirroring write_one_arg.  */
	  if (argtypes == NULL && !AGGREGATE_TYPE_P (type))
	    {
	      if (mode == SFmode)
		mode = DFmode;

	    }
	}
      mode = arg_promotion (mode);
      while (count-- > 0)
	{
	  i++;
	  if (write_copy)
	    fprintf (file, "\tld.param%s %%ar%d, [%%in_ar%d];\n",
		     nvptx_ptx_type_from_mode (mode, false), i, i);
	  else
	    fprintf (file, "\t.reg%s %%ar%d;\n",
		     nvptx_ptx_type_from_mode (mode, false), i);
	}
    }
}
504
/* Write a .func or .kernel declaration (not a definition) along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  */

static void
write_function_decl_and_comment (std::stringstream &s, const char *name, const_tree decl)
{
  /* The "// BEGIN ... FUNCTION DECL" marker is parsed by the nvptx
     linking tools; keep its format stable.  */
  s << "// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DECL: ";
  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";
  nvptx_write_function_decl (s, name, decl);
  s << ";\n";
}
524
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in ptxas.  */

static const char *
nvptx_name_replacement (const char *name)
{
  static const struct
  {
    const char *orig;
    const char *replacement;
  } map[] =
    {
      { "call",    "__nvptx_call" },
      { "malloc",  "__nvptx_malloc" },
      { "free",    "__nvptx_free" },
      { "realloc", "__nvptx_realloc" }
    };

  for (unsigned ix = 0; ix < sizeof (map) / sizeof (map[0]); ix++)
    if (strcmp (name, map[ix].orig) == 0)
      return map[ix].replacement;

  return name;
}
542
/* If DECL is a FUNCTION_DECL, check the hash table to see if we
   already encountered it, and if not, insert it and write a ptx
   declaration that will be output at the end of compilation.
   Returns false only when DECL lacks argument type information and
   FORCE is not set, i.e. the declaration must be deferred.  */

static bool
nvptx_record_fndecl (tree decl, bool force = false)
{
  /* Non-external functions get their declaration from their
     definition; nothing to record.  */
  if (decl == NULL_TREE || TREE_CODE (decl) != FUNCTION_DECL
      || !DECL_EXTERNAL (decl))
    return true;

  /* Without arg types we cannot write a correct prototype yet.  */
  if (!force && TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    return false;

  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      write_function_decl_and_comment (func_decls, name, decl);
    }
  return true;
}
567
/* Record that we need to emit a ptx decl for DECL.  Either do it now, or
   record it for later in case we have no argument information at this
   point.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  /* nvptx_record_fndecl returns true when it handled (or needn't
     handle) the decl immediately.  */
  if (nvptx_record_fndecl (decl))
    return;

  tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    *slot = decl;
}
582
d88cd9c4
NS
/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  The sequence reads
   %tid.NAME into a scratch register scoped to the inner block and sets
   the predicate when it is nonzero.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file,
	   "\t{\n"
	   "\t\t.reg.u32\t%%%s;\n"
	   "\t\tmov.u32\t%%%s, %%tid.%s;\n"
	   "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n"
	   "\t}\n",
	   name, name, name, regno, name);
}
595
738f2522
BS
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  name = nvptx_name_replacement (name);

  /* Build the "// BEGIN ... FUNCTION DEF" marker and the declaration
     header in a stream, then emit it in one go.  */
  std::stringstream s;
  write_function_decl_and_comment (s, name, decl);
  s << "// BEGIN";
  if (TREE_PUBLIC (decl))
    s << " GLOBAL";
  s << " FUNCTION DEF: ";

  if (name[0] == '*')
    s << (name + 1);
  else
    s << name;
  s << "\n";

  nvptx_write_function_decl (s, name, decl);
  fprintf (file, "%s", s.str().c_str());

  bool return_in_mem = (TYPE_MODE (result_type) != VOIDmode
			&& !RETURN_IN_REG_P (TYPE_MODE (result_type)));

  fprintf (file, "\n{\n");

  /* Ensure all arguments that should live in a register have one
     declared.  We'll emit the copies below.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       false, return_in_mem);
  /* The hidden return pointer occupies argument slot 1.  */
  if (return_in_mem)
    fprintf (file, "\t.reg.u%d %%ar1;\n", GET_MODE_BITSIZE (Pmode));

  /* C++11 ABI causes us to return a reference to the passed in
     pointer for return_in_mem.  */
  if (cfun->machine->ret_reg_mode != VOIDmode)
    {
      machine_mode mode = arg_promotion
	((machine_mode)cfun->machine->ret_reg_mode);
      fprintf (file, "\t.reg%s %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  if (stdarg_p (fntype))
    fprintf (file, "\t.reg.u%d %%argp;\n", GET_MODE_BITSIZE (Pmode));

  fprintf (file, "\t.reg.u%d %s;\n", GET_MODE_BITSIZE (Pmode),
	   reg_names[OUTGOING_STATIC_CHAIN_REGNUM]);

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  int count = maybe_split_mode (&mode);
	  /* Split pseudos are declared as %rN$0 and %rN$1 pieces.  */
	  if (count > 1)
	    {
	      while (count-- > 0)
		fprintf (file, "\t.reg%s %%r%d$%d;\n",
			 nvptx_ptx_type_from_mode (mode, true),
			 i, count);
	    }
	  else
	    fprintf (file, "\t.reg%s %%r%d;\n",
		     nvptx_ptx_type_from_mode (mode, true),
		     i);
	}
    }

  /* The only reason we might be using outgoing args is if we call a stdargs
     function.  Allocate the space for this.  If we called varargs functions
     without passing any variadic arguments, we'll see a reference to outargs
     even with a zero outgoing_args_size.  */
  HOST_WIDE_INT sz = crtl->outgoing_args_size;
  if (sz == 0)
    sz = 1;
  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\t.reg.u%d %%outargs;\n"
	     "\t.local.align 8 .b8 %%outargs_ar[" HOST_WIDE_INT_PRINT_DEC"];\n",
	     BITS_PER_WORD, sz);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\t.reg.u%d %%punbuffer;\n"
	     "\t.local.align 8 .b8 %%punbuffer_ar[%d];\n",
	     BITS_PER_WORD, cfun->machine->punning_buffer_size);

  /* Declare a local variable for the frame.  */
  sz = get_frame_size ();
  if (sz > 0 || cfun->machine->has_call_with_sc)
    {
      int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;

      fprintf (file, "\t.reg.u%d %%frame;\n"
	       "\t.local.align %d .b8 %%farray[" HOST_WIDE_INT_PRINT_DEC"];\n",
	       BITS_PER_WORD, alignment, sz == 0 ? 1 : sz);
      /* Convert the local address to a generic one so ordinary loads
	 and stores can use it.  */
      fprintf (file, "\tcvta.local.u%d %%frame, %%farray;\n",
	       BITS_PER_WORD);
    }

  if (cfun->machine->has_call_with_varargs)
    fprintf (file, "\tcvta.local.u%d %%outargs, %%outargs_ar;\n",
	     BITS_PER_WORD);
  if (cfun->machine->punning_buffer_size > 0)
    fprintf (file, "\tcvta.local.u%d %%punbuffer, %%punbuffer_ar;\n",
	     BITS_PER_WORD);

  /* Now emit any copies necessary for arguments.  */
  walk_args_for_param (file, TYPE_ARG_TYPES (fntype), DECL_ARGUMENTS (decl),
		       true, return_in_mem);
  if (return_in_mem)
    fprintf (file, "\tld.param.u%d %%ar1, [%%in_ar1];\n",
	     GET_MODE_BITSIZE (Pmode));
  if (stdarg_p (fntype))
    fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
	     GET_MODE_BITSIZE (Pmode));

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
}
728
/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->ret_reg_mode;

  /* VOIDmode means the function returns no value (see
     nvptx_init_machine_status).  */
  if (mode != VOIDmode)
    {
      mode = arg_promotion (mode);
      fprintf (asm_out_file, "\tst.param%s\t[%%out_retval], %%retval;\n",
	       nvptx_ptx_type_from_mode (mode, false));
    }

  return "ret;";
}
746
/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  S is the
   stream to write to, RESULT the return-value rtx (or NULL), PAT the
   call PARALLEL, and CALLEE the called address.  */

static void
write_func_decl_from_insn (std::stringstream &s, rtx result, rtx pat,
			   rtx callee)
{
  /* A register callee means an indirect call, which needs a
     .callprototype rather than a named .extern .func.  */
  bool callprototype = register_operand (callee, Pmode);
  const char *name = "_";
  if (!callprototype)
    {
      name = XSTR (callee, 0);
      name = nvptx_name_replacement (name);
      s << "// BEGIN GLOBAL FUNCTION DECL: " << name << "\n";
    }
  s << (callprototype ? "\t.callprototype\t" : "\t.extern .func ");

  if (result != NULL_RTX)
    {
      s << "(.param";
      s << nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				     false);
      s << " ";
      /* Prototypes use "_" placeholders instead of named params.  */
      if (callprototype)
	s << "_";
      else
	s << "%out_retval";
      s << ")";
    }

  s << name;

  int arg_end = XVECLEN (pat, 0);

  /* Element 0 of the PARALLEL is the call itself; elements 1..N are
     USEs of the argument registers.  */
  if (1 < arg_end)
    {
      const char *comma = "";
      s << " (";
      for (int i = 1; i < arg_end; i++)
	{
	  rtx t = XEXP (XVECEXP (pat, 0, i), 0);
	  machine_mode mode = GET_MODE (t);
	  int count = maybe_split_mode (&mode);

	  while (count--)
	    {
	      s << comma << ".param";
	      s << nvptx_ptx_type_from_mode (mode, false);
	      s << " ";
	      if (callprototype)
		s << "_";
	      else
		s << "%arg" << i - 1;
	      /* Sub-word params are declared as one-byte arrays.  */
	      if (mode == QImode || mode == HImode)
		s << "[1]";
	      comma = ", ";
	    }
	}
      s << ")";
    }
  s << ";\n";
}
811
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("\t}\n", file);
}
819\f
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't: PTX has no tail-call instruction.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}
828
18c05628
NS
/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */

static rtx
nvptx_get_drap_rtx (void)
{
  return NULL_RTX;
}
836
738f2522
BS
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  */

static void
nvptx_call_args (rtx arg, tree funtype)
{
  /* start_call doubles as an "inside a call sequence" flag; the first
     argument of a sequence resets the accumulated state.  */
  if (cfun->machine->start_call == NULL_RTX)
    {
      cfun->machine->call_args = NULL;
      cfun->machine->funtype = funtype;
      cfun->machine->start_call = const0_rtx;
    }
  /* pc_rtx is the end-of-arguments marker.  */
  if (arg == pc_rtx)
    return;

  /* Only register arguments are recorded; the list is consumed by
     nvptx_expand_call.  */
  rtx_expr_list *args_so_far = cfun->machine->call_args;
  if (REG_P (arg))
    cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, args_so_far);
}
856
/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->start_call = NULL_RTX;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
866
ecf6e535
BS
867/* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
868 track of whether calls involving static chains or varargs were seen
869 in the current function.
870 For libcalls, maintain a hash table of decls we have seen, and
871 record a function decl for later when encountering a new one. */
738f2522
BS
872
873void
874nvptx_expand_call (rtx retval, rtx address)
875{
f324806d 876 int nargs = 0;
738f2522
BS
877 rtx callee = XEXP (address, 0);
878 rtx pat, t;
879 rtvec vec;
880 bool external_decl = false;
f324806d
NS
881 rtx varargs = NULL_RTX;
882 tree decl_type = NULL_TREE;
d88cd9c4 883 unsigned parallel = 0;
738f2522 884
738f2522
BS
885 for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
886 nargs++;
887
738f2522
BS
888 if (!call_insn_operand (callee, Pmode))
889 {
890 callee = force_reg (Pmode, callee);
891 address = change_address (address, QImode, callee);
892 }
893
894 if (GET_CODE (callee) == SYMBOL_REF)
895 {
896 tree decl = SYMBOL_REF_DECL (callee);
897 if (decl != NULL_TREE)
898 {
899 decl_type = TREE_TYPE (decl);
900 if (DECL_STATIC_CHAIN (decl))
901 cfun->machine->has_call_with_sc = true;
902 if (DECL_EXTERNAL (decl))
903 external_decl = true;
d88cd9c4
NS
904 tree attr = get_oacc_fn_attrib (decl);
905 if (attr)
906 {
907 tree dims = TREE_VALUE (attr);
908
909 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
910 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
911 {
912 if (TREE_PURPOSE (dims)
913 && !integer_zerop (TREE_PURPOSE (dims)))
914 break;
915 /* Not on this axis. */
916 parallel ^= GOMP_DIM_MASK (ix);
917 dims = TREE_CHAIN (dims);
918 }
919 }
738f2522
BS
920 }
921 }
c38f0d8c 922
738f2522
BS
923 if (cfun->machine->funtype
924 /* It's possible to construct testcases where we call a variable.
925 See compile/20020129-1.c. stdarg_p will crash so avoid calling it
926 in such a case. */
927 && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
928 || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
929 && stdarg_p (cfun->machine->funtype))
930 {
f324806d 931 varargs = gen_reg_rtx (Pmode);
738f2522 932 if (Pmode == DImode)
f324806d 933 emit_move_insn (varargs, stack_pointer_rtx);
738f2522 934 else
f324806d
NS
935 emit_move_insn (varargs, stack_pointer_rtx);
936 cfun->machine->has_call_with_varargs = true;
738f2522 937 }
f324806d
NS
938 vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
939 pat = gen_rtx_PARALLEL (VOIDmode, vec);
738f2522 940
f324806d
NS
941 int vec_pos = 0;
942
738f2522
BS
943 rtx tmp_retval = retval;
944 t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
945 if (retval != NULL_RTX)
946 {
947 if (!nvptx_register_operand (retval, GET_MODE (retval)))
948 tmp_retval = gen_reg_rtx (GET_MODE (retval));
f7df4a84 949 t = gen_rtx_SET (tmp_retval, t);
738f2522 950 }
f324806d
NS
951 XVECEXP (pat, 0, vec_pos++) = t;
952
953 /* Construct the call insn, including a USE for each argument pseudo
954 register. These will be used when printing the insn. */
955 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
956 {
957 rtx this_arg = XEXP (arg, 0);
958 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
959 }
960
961 if (varargs)
962 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
963
964 gcc_assert (vec_pos = XVECLEN (pat, 0));
ecf6e535
BS
965
966 /* If this is a libcall, decl_type is NULL. For a call to a non-libcall
967 undeclared function, we'll have an external decl without arg types.
968 In either case we have to try to construct a ptx declaration from one of
969 the calls to the function. */
738f2522
BS
970 if (!REG_P (callee)
971 && (decl_type == NULL_TREE
972 || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
973 {
f3dba894 974 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
738f2522
BS
975 if (*slot == NULL)
976 {
977 *slot = callee;
978 write_func_decl_from_insn (func_decls, retval, pat, callee);
979 }
980 }
d88cd9c4
NS
981
982 nvptx_emit_forking (parallel, true);
738f2522 983 emit_call_insn (pat);
d88cd9c4
NS
984 nvptx_emit_joining (parallel, true);
985
738f2522
BS
986 if (tmp_retval != retval)
987 emit_move_insn (retval, tmp_retval);
988}
989
990/* Implement TARGET_FUNCTION_ARG. */
991
992static rtx
993nvptx_function_arg (cumulative_args_t, machine_mode mode,
994 const_tree, bool named)
995{
996 if (mode == VOIDmode)
997 return NULL_RTX;
998
999 if (named)
1000 return gen_reg_rtx (mode);
1001 return NULL_RTX;
1002}
1003
/* Implement TARGET_FUNCTION_INCOMING_ARG.  Incoming arguments are
   represented as an UNSPEC_ARG_REG carrying the 1-based argument
   index, printed later as the %arN register.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  if (mode == VOIDmode)
    return NULL_RTX;

  if (!named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (1 + cum->count)),
			 UNSPEC_ARG_REG);
}
1024
1025/* Implement TARGET_FUNCTION_ARG_ADVANCE. */
1026
1027static void
1028nvptx_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
1029 const_tree type ATTRIBUTE_UNUSED,
1030 bool named ATTRIBUTE_UNUSED)
1031{
1032 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
1033 if (mode == TImode)
1034 cum->count += 2;
1035 else
1036 cum->count++;
1037}
1038
/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
  /* Be strict when there is no function type to inspect, or for real
     stdarg functions.  */
  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}
1054
1055/* Implement TARGET_FUNCTION_ARG_BOUNDARY. */
1056
1057static unsigned int
1058nvptx_function_arg_boundary (machine_mode mode, const_tree type)
1059{
1060 unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);
1061
1062 if (boundary > BITS_PER_WORD)
1063 return 2 * BITS_PER_WORD;
1064
1065 if (mode == BLKmode)
1066 {
1067 HOST_WIDE_INT size = int_size_in_bytes (type);
1068 if (size > 4)
1069 return 2 * BITS_PER_WORD;
1070 if (boundary < BITS_PER_WORD)
1071 {
1072 if (size >= 3)
1073 return BITS_PER_WORD;
1074 if (size >= 2)
1075 return 2 * BITS_PER_UNIT;
1076 }
1077 }
1078 return boundary;
1079}
1080
1081/* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
1082 where function FUNC returns or receives a value of data type TYPE. */
1083
1084static rtx
1085nvptx_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED,
1086 bool outgoing)
1087{
1088 int unsignedp = TYPE_UNSIGNED (type);
1089 machine_mode orig_mode = TYPE_MODE (type);
1090 machine_mode mode = promote_function_mode (type, orig_mode,
1091 &unsignedp, NULL_TREE, 1);
1092 if (outgoing)
1093 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
1094 if (cfun->machine->start_call == NULL_RTX)
1095 /* Pretend to return in a hard reg for early uses before pseudos can be
1096 generated. */
1097 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
1098 return gen_reg_rtx (mode);
1099}
1100
1101/* Implement TARGET_LIBCALL_VALUE. */
1102
1103static rtx
1104nvptx_libcall_value (machine_mode mode, const_rtx)
1105{
1106 if (cfun->machine->start_call == NULL_RTX)
1107 /* Pretend to return in a hard reg for early uses before pseudos can be
1108 generated. */
1109 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
1110 return gen_reg_rtx (mode);
1111}
1112
1113/* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
1114
1115static bool
1116nvptx_function_value_regno_p (const unsigned int regno)
1117{
1118 return regno == NVPTX_RETURN_REGNUM;
1119}
1120
1121/* Types with a mode other than those supported by the machine are passed by
1122 reference in memory. */
1123
1124static bool
1125nvptx_pass_by_reference (cumulative_args_t, machine_mode mode,
1126 const_tree type, bool)
1127{
1128 return !PASS_IN_REG_P (mode, type);
1129}
1130
1131/* Implement TARGET_RETURN_IN_MEMORY. */
1132
1133static bool
1134nvptx_return_in_memory (const_tree type, const_tree)
1135{
1136 machine_mode mode = TYPE_MODE (type);
1137 if (!RETURN_IN_REG_P (mode))
1138 return true;
1139 return false;
1140}
1141
1142/* Implement TARGET_PROMOTE_FUNCTION_MODE. */
1143
1144static machine_mode
1145nvptx_promote_function_mode (const_tree type, machine_mode mode,
1146 int *punsignedp,
1147 const_tree funtype, int for_return)
1148{
1149 if (type == NULL_TREE)
1150 return mode;
1151 if (for_return)
1152 return promote_mode (type, mode, punsignedp);
1153 /* For K&R-style functions, try to match the language promotion rules to
1154 minimize type mismatches at assembly time. */
1155 if (TYPE_ARG_TYPES (funtype) == NULL_TREE
1156 && type != NULL_TREE
1157 && !AGGREGATE_TYPE_P (type))
1158 {
1159 if (mode == SFmode)
1160 mode = DFmode;
1161 mode = arg_promotion (mode);
1162 }
1163
1164 return mode;
1165}
1166
1167/* Implement TARGET_STATIC_CHAIN. */
1168
1169static rtx
1170nvptx_static_chain (const_tree fndecl, bool incoming_p)
1171{
1172 if (!DECL_STATIC_CHAIN (fndecl))
1173 return NULL;
1174
1175 if (incoming_p)
1176 return gen_rtx_REG (Pmode, STATIC_CHAIN_REGNUM);
1177 else
1178 return gen_rtx_REG (Pmode, OUTGOING_STATIC_CHAIN_REGNUM);
1179}
1180\f
1181/* Emit a comparison COMPARE, and return the new test to be used in the
1182 jump. */
1183
1184rtx
1185nvptx_expand_compare (rtx compare)
1186{
1187 rtx pred = gen_reg_rtx (BImode);
1188 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1189 XEXP (compare, 0), XEXP (compare, 1));
f7df4a84 1190 emit_insn (gen_rtx_SET (pred, cmp));
738f2522
BS
1191 return gen_rtx_NE (BImode, pred, const0_rtx);
1192}
1193
d88cd9c4
NS
1194/* Expand the oacc fork & join primitive into ptx-required unspecs. */
1195
1196void
1197nvptx_expand_oacc_fork (unsigned mode)
1198{
1199 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1200}
1201
1202void
1203nvptx_expand_oacc_join (unsigned mode)
1204{
1205 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1206}
1207
1208/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1209 objects. */
1210
1211static rtx
1212nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1213{
1214 rtx res;
1215
1216 switch (GET_MODE (src))
1217 {
1218 case DImode:
1219 res = gen_unpackdisi2 (dst0, dst1, src);
1220 break;
1221 case DFmode:
1222 res = gen_unpackdfsi2 (dst0, dst1, src);
1223 break;
1224 default: gcc_unreachable ();
1225 }
1226 return res;
1227}
1228
1229/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1230 object. */
1231
1232static rtx
1233nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1234{
1235 rtx res;
1236
1237 switch (GET_MODE (dst))
1238 {
1239 case DImode:
1240 res = gen_packsidi2 (dst, src0, src1);
1241 break;
1242 case DFmode:
1243 res = gen_packsidf2 (dst, src0, src1);
1244 break;
1245 default: gcc_unreachable ();
1246 }
1247 return res;
1248}
1249
/* Generate an instruction or sequence that moves SRC into DST via a
   warp shuffle of kind KIND with lane selector IDX.  (The original
   header comment here was a copy-paste of nvptx_gen_vcast's; this
   routine emits a shuffle, of which broadcast is one use.)  64-bit and
   predicate modes are handled by recursing on 32-bit pieces.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case DImode:
    case DFmode:
      {
	/* 64-bit values: unpack into two SImode halves, shuffle each
	   half, and pack the results back together.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case BImode:
      {
	/* Predicates: widen to an SImode 0/1 value, shuffle that, and
	   compare the result back against zero to recover a predicate.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
1299
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  This is an idx-mode shuffle
   reading from lane 0.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
1308
/* Structure used when generating a worker-level spill or fill.  */

struct wcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.
   PM_read/PM_write name the direction relative to the register being
   propagated; the loop bits bracket a looping copy sequence.  */

enum propagate_mask
  {
    PM_read = 1 << 0,
    PM_write = 1 << 1,
    PM_loop_begin = 1 << 2,
    PM_loop_end = 1 << 3,

    PM_read_write = PM_read | PM_write
  };
1329
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   Note: PM_read generates a store into the buffer (the register is
   read), PM_write generates a load from it (the register is
   written).  */

static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case BImode:
      {
	/* Predicates are transferred as an SImode 0/1 value and
	   reconstituted on the other side.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iteration pointer: address the buffer directly at a
	       suitably aligned offset, widening the buffer's overall
	       alignment requirement if needed.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    if (align > worker_bcast_align)
	      worker_bcast_align = align;
	    data->offset = (data->offset + align - 1) & ~(align - 1);
	    addr = data->base;
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	/* Wrap the access so it is recognized as touching shared
	   memory.  */
	addr = gen_rtx_MEM (mode, addr);
	addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	/* Advance the running buffer offset by what the (possibly
	   repeated) copy consumed.  */
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
1401
738f2522
BS
1402/* When loading an operand ORIG_OP, verify whether an address space
1403 conversion to generic is required, and if so, perform it. Also
1404 check for SYMBOL_REFs for function decls and call
1405 nvptx_record_needed_fndecl as needed.
1406 Return either the original operand, or the converted one. */
1407
1408rtx
1409nvptx_maybe_convert_symbolic_operand (rtx orig_op)
1410{
1411 if (GET_MODE (orig_op) != Pmode)
1412 return orig_op;
1413
1414 rtx op = orig_op;
1415 while (GET_CODE (op) == PLUS || GET_CODE (op) == CONST)
1416 op = XEXP (op, 0);
1417 if (GET_CODE (op) != SYMBOL_REF)
1418 return orig_op;
1419
1420 tree decl = SYMBOL_REF_DECL (op);
1421 if (decl && TREE_CODE (decl) == FUNCTION_DECL)
1422 {
1423 nvptx_record_needed_fndecl (decl);
1424 return orig_op;
1425 }
1426
1427 addr_space_t as = nvptx_addr_space_from_address (op);
1428 if (as == ADDR_SPACE_GENERIC)
1429 return orig_op;
1430
1431 enum unspec code;
1432 code = (as == ADDR_SPACE_GLOBAL ? UNSPEC_FROM_GLOBAL
1433 : as == ADDR_SPACE_LOCAL ? UNSPEC_FROM_LOCAL
1434 : as == ADDR_SPACE_SHARED ? UNSPEC_FROM_SHARED
1435 : as == ADDR_SPACE_CONST ? UNSPEC_FROM_CONST
1436 : UNSPEC_FROM_PARAM);
1437 rtx dest = gen_reg_rtx (Pmode);
f7df4a84
RS
1438 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, orig_op),
1439 code)));
738f2522
BS
1440 return dest;
1441}
1442\f
1443/* Returns true if X is a valid address for use in a memory reference. */
1444
1445static bool
1446nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1447{
1448 enum rtx_code code = GET_CODE (x);
1449
1450 switch (code)
1451 {
1452 case REG:
1453 return true;
1454
1455 case PLUS:
1456 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1457 return true;
1458 return false;
1459
1460 case CONST:
1461 case SYMBOL_REF:
1462 case LABEL_REF:
1463 return true;
1464
1465 default:
1466 return false;
1467 }
1468}
1469
1470/* Implement HARD_REGNO_MODE_OK. We barely use hard regs, but we want
1471 to ensure that the return register's mode isn't changed. */
1472
1473bool
1474nvptx_hard_regno_mode_ok (int regno, machine_mode mode)
1475{
1476 if (regno != NVPTX_RETURN_REGNUM
1477 || cfun == NULL || cfun->machine->ret_reg_mode == VOIDmode)
1478 return true;
1479 return mode == cfun->machine->ret_reg_mode;
1480}
1481\f
1482/* Convert an address space AS to the corresponding ptx string. */
1483
1484const char *
1485nvptx_section_from_addr_space (addr_space_t as)
1486{
1487 switch (as)
1488 {
1489 case ADDR_SPACE_CONST:
1490 return ".const";
1491
1492 case ADDR_SPACE_GLOBAL:
1493 return ".global";
1494
1495 case ADDR_SPACE_SHARED:
1496 return ".shared";
1497
1498 case ADDR_SPACE_GENERIC:
1499 return "";
1500
1501 default:
1502 gcc_unreachable ();
1503 }
1504}
1505
1506/* Determine whether DECL goes into .const or .global. */
1507
1508const char *
1509nvptx_section_for_decl (const_tree decl)
1510{
1511 bool is_const = (CONSTANT_CLASS_P (decl)
1512 || TREE_CODE (decl) == CONST_DECL
1513 || TREE_READONLY (decl));
1514 if (is_const)
1515 return ".const";
1516
1517 return ".global";
1518}
1519
1520/* Look for a SYMBOL_REF in ADDR and return the address space to be used
1521 for the insn referencing this address. */
1522
1523addr_space_t
1524nvptx_addr_space_from_address (rtx addr)
1525{
1526 while (GET_CODE (addr) == PLUS || GET_CODE (addr) == CONST)
1527 addr = XEXP (addr, 0);
1528 if (GET_CODE (addr) != SYMBOL_REF)
1529 return ADDR_SPACE_GENERIC;
1530
1531 tree decl = SYMBOL_REF_DECL (addr);
1532 if (decl == NULL_TREE || TREE_CODE (decl) == FUNCTION_DECL)
1533 return ADDR_SPACE_GENERIC;
1534
1535 bool is_const = (CONSTANT_CLASS_P (decl)
1536 || TREE_CODE (decl) == CONST_DECL
1537 || TREE_READONLY (decl));
1538 if (is_const)
1539 return ADDR_SPACE_CONST;
1540
1541 return ADDR_SPACE_GLOBAL;
1542}
1543\f
/* Machinery to output constant initializers.  When beginning an initializer,
   we decide on a chunk size (which is visible in ptx in the type used), and
   then all initializer data is buffered until a chunk is filled and ready to
   be written out.  */

/* Used when assembling integers to ensure data is emitted in
   pieces whose size matches the declaration we printed.  */
static unsigned int decl_chunk_size;
static machine_mode decl_chunk_mode;
/* Used in the same situation, to keep track of the byte offset
   into the initializer.  */
static unsigned HOST_WIDE_INT decl_offset;
/* The initializer part we are currently processing.  */
static HOST_WIDE_INT init_part;
/* The total size of the object.  */
static unsigned HOST_WIDE_INT object_size;
/* True if we found a skip extending to the end of the object.  Used to
   assert that no data follows.  */
static bool object_finished;
1564/* Write the necessary separator string to begin a new initializer value. */
1565
1566static void
1567begin_decl_field (void)
1568{
1569 /* We never see decl_offset at zero by the time we get here. */
1570 if (decl_offset == decl_chunk_size)
1571 fprintf (asm_out_file, " = { ");
1572 else
1573 fprintf (asm_out_file, ", ");
1574}
1575
1576/* Output the currently stored chunk as an initializer value. */
1577
1578static void
1579output_decl_chunk (void)
1580{
1581 begin_decl_field ();
1582 output_address (gen_int_mode (init_part, decl_chunk_mode));
1583 init_part = 0;
1584}
1585
/* Add value VAL sized SIZE to the data we're emitting, and keep writing
   out chunks as they fill up.  */

static void
nvptx_assemble_value (HOST_WIDE_INT val, unsigned int size)
{
  unsigned HOST_WIDE_INT chunk_offset = decl_offset % decl_chunk_size;
  gcc_assert (!object_finished);
  while (size > 0)
    {
      /* Emit at most the bytes remaining in the current chunk.  */
      int this_part = size;
      if (chunk_offset + this_part > decl_chunk_size)
	this_part = decl_chunk_size - chunk_offset;
      HOST_WIDE_INT val_part;
      /* Build a mask of this_part * BITS_PER_UNIT low bits.  */
      HOST_WIDE_INT mask = 2;
      mask <<= this_part * BITS_PER_UNIT - 1;
      val_part = val & (mask - 1);
      /* Merge the extracted bytes into the chunk at the right offset.  */
      init_part |= val_part << (BITS_PER_UNIT * chunk_offset);
      val >>= BITS_PER_UNIT * this_part;
      size -= this_part;
      decl_offset += this_part;
      /* Flush a completed chunk.  */
      if (decl_offset % decl_chunk_size == 0)
	output_decl_chunk ();

      chunk_offset = 0;
    }
}
1613
1614/* Target hook for assembling integer object X of size SIZE. */
1615
1616static bool
1617nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1618{
1619 if (GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == CONST)
1620 {
1621 gcc_assert (size = decl_chunk_size);
1622 if (decl_offset % decl_chunk_size != 0)
1623 sorry ("cannot emit unaligned pointers in ptx assembly");
1624 decl_offset += size;
1625 begin_decl_field ();
1626
1627 HOST_WIDE_INT off = 0;
1628 if (GET_CODE (x) == CONST)
1629 x = XEXP (x, 0);
1630 if (GET_CODE (x) == PLUS)
1631 {
1632 off = INTVAL (XEXP (x, 1));
1633 x = XEXP (x, 0);
1634 }
1635 if (GET_CODE (x) == SYMBOL_REF)
1636 {
1637 nvptx_record_needed_fndecl (SYMBOL_REF_DECL (x));
1638 fprintf (asm_out_file, "generic(");
1639 output_address (x);
1640 fprintf (asm_out_file, ")");
1641 }
1642 if (off != 0)
1643 fprintf (asm_out_file, " + " HOST_WIDE_INT_PRINT_DEC, off);
1644 return true;
1645 }
1646
1647 HOST_WIDE_INT val;
1648 switch (GET_CODE (x))
1649 {
1650 case CONST_INT:
1651 val = INTVAL (x);
1652 break;
1653 case CONST_DOUBLE:
1654 gcc_unreachable ();
1655 break;
1656 default:
1657 gcc_unreachable ();
1658 }
1659
1660 nvptx_assemble_value (val, size);
1661 return true;
1662}
1663
1664/* Output SIZE zero bytes. We ignore the FILE argument since the
1665 functions we're calling to perform the output just use
1666 asm_out_file. */
1667
1668void
1669nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1670{
1671 if (decl_offset + size >= object_size)
1672 {
1673 if (decl_offset % decl_chunk_size != 0)
1674 nvptx_assemble_value (0, decl_chunk_size);
1675 object_finished = true;
1676 return;
1677 }
1678
1679 while (size > decl_chunk_size)
1680 {
1681 nvptx_assemble_value (0, decl_chunk_size);
1682 size -= decl_chunk_size;
1683 }
1684 while (size-- > 0)
1685 nvptx_assemble_value (0, 1);
1686}
1687
1688/* Output a string STR with length SIZE. As in nvptx_output_skip we
1689 ignore the FILE arg. */
1690
1691void
1692nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
1693{
1694 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
1695 nvptx_assemble_value (str[i], 1);
1696}
1697
1698/* Called when the initializer for a decl has been completely output through
1699 combinations of the three functions above. */
1700
1701static void
1702nvptx_assemble_decl_end (void)
1703{
1704 if (decl_offset != 0)
1705 {
1706 if (!object_finished && decl_offset % decl_chunk_size != 0)
1707 nvptx_assemble_value (0, decl_chunk_size);
1708
1709 fprintf (asm_out_file, " }");
1710 }
1711 fprintf (asm_out_file, ";\n");
1712}
1713
/* Start a declaration of a variable of TYPE with NAME to
   FILE.  IS_PUBLIC says whether this will be externally visible.
   Here we just write the linker hint and decide on the chunk size
   to use.  */

static void
init_output_initializer (FILE *file, const char *name, const_tree type,
			 bool is_public)
{
  fprintf (file, "// BEGIN%s VAR DEF: ", is_public ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputc ('\n', file);

  /* For arrays, chunk by the element type.  */
  if (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);
  int sz = int_size_in_bytes (type);
  /* Anything that is not a simple scalar of a size we can hold in a
     HOST_WIDE_INT is chunked as pointer-sized words instead.  */
  if ((TREE_CODE (type) != INTEGER_TYPE
       && TREE_CODE (type) != ENUMERAL_TYPE
       && TREE_CODE (type) != REAL_TYPE)
      || sz < 0
      || sz > HOST_BITS_PER_WIDE_INT)
    type = ptr_type_node;
  decl_chunk_size = int_size_in_bytes (type);
  decl_chunk_mode = int_mode_for_mode (TYPE_MODE (type));
  /* Reset the buffered-initializer state for the new object.  */
  decl_offset = 0;
  init_part = 0;
  object_finished = false;
}
1742
1743/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
1744 writing a constant variable EXP with NAME and SIZE and its
1745 initializer to FILE. */
1746
1747static void
1748nvptx_asm_declare_constant_name (FILE *file, const char *name,
1749 const_tree exp, HOST_WIDE_INT size)
1750{
1751 tree type = TREE_TYPE (exp);
1752 init_output_initializer (file, name, type, false);
1753 fprintf (file, "\t.const .align %d .u%d ",
1754 TYPE_ALIGN (TREE_TYPE (exp)) / BITS_PER_UNIT,
1755 decl_chunk_size * BITS_PER_UNIT);
1756 assemble_name (file, name);
1757 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
1758 (size + decl_chunk_size - 1) / decl_chunk_size);
1759 object_size = size;
1760}
1761
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  if (decl && DECL_SIZE (decl))
    {
      tree type = TREE_TYPE (decl);
      unsigned HOST_WIDE_INT size;

      init_output_initializer (file, name, type, TREE_PUBLIC (decl));
      size = tree_to_uhwi (DECL_SIZE_UNIT (decl));
      const char *section = nvptx_section_for_decl (decl);
      fprintf (file, "\t%s%s .align %d .u%d ",
	       TREE_PUBLIC (decl) ? " .visible" : "", section,
	       DECL_ALIGN (decl) / BITS_PER_UNIT,
	       decl_chunk_size * BITS_PER_UNIT);
      assemble_name (file, name);
      if (size > 0)
	/* Array length rounded up to whole chunks.  */
	fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]",
		 (size + decl_chunk_size - 1) / decl_chunk_size);
      else
	/* Zero-sized object: there is no data to follow.  */
	object_finished = true;
      object_size = size;
    }
}
1789
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Visibility is
   handled when the declaration itself is emitted.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
1796
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */
static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  if (TREE_CODE (decl) != VAR_DECL)
    return;
  const char *section = nvptx_section_for_decl (decl);
  fprintf (file, "// BEGIN%s VAR DECL: ", TREE_PUBLIC (decl) ? " GLOBAL" : "");
  assemble_name_raw (file, name);
  fputs ("\n", file);
  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (decl));
  /* Externs are declared as byte arrays of the decl's size.  */
  fprintf (file, ".extern %s .b8 ", section);
  assemble_name_raw (file, name);
  if (size > 0)
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC"]", size);
  fprintf (file, ";\n\n");
}
1815
/* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
   involves writing .param declarations and in/out copies into them.  For
   indirect calls, also write the .callprototype.  */

const char *
nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
{
  char buf[256];
  static int labelno;
  bool needs_tgt = register_operand (callee, Pmode);
  rtx pat = PATTERN (insn);
  int arg_end = XVECLEN (pat, 0);
  tree decl = NULL_TREE;

  /* Open the call block and declare a .param to receive the return
     value, if there is one.  */
  fprintf (asm_out_file, "\t{\n");
  if (result != NULL)
    fprintf (asm_out_file, "\t\t.param%s %%retval_in;\n",
	     nvptx_ptx_type_from_mode (arg_promotion (GET_MODE (result)),
				       false));

  /* Ensure we have a ptx declaration in the output if necessary.  */
  if (GET_CODE (callee) == SYMBOL_REF)
    {
      decl = SYMBOL_REF_DECL (callee);
      if (decl && DECL_EXTERNAL (decl))
	nvptx_record_fndecl (decl);
    }

  /* Indirect call: emit a .callprototype label for the target.  */
  if (needs_tgt)
    {
      ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
      labelno++;
      ASM_OUTPUT_LABEL (asm_out_file, buf);
      std::stringstream s;
      write_func_decl_from_insn (s, result, pat, callee);
      fputs (s.str().c_str(), asm_out_file);
    }

  /* Declare one outgoing .param per (possibly split) argument.  */
  for (int i = 1, argno = 0; i < arg_end; i++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, i), 0);
      machine_mode mode = GET_MODE (t);
      int count = maybe_split_mode (&mode);

      while (count--)
	fprintf (asm_out_file, "\t\t.param%s %%out_arg%d%s;\n",
		 nvptx_ptx_type_from_mode (mode, false), argno++,
		 mode == QImode || mode == HImode ? "[1]" : "");
    }
  /* Copy each argument register into its .param.  */
  for (int i = 1, argno = 0; i < arg_end; i++)
    {
      rtx t = XEXP (XVECEXP (pat, 0, i), 0);
      gcc_assert (REG_P (t));
      machine_mode mode = GET_MODE (t);
      int count = maybe_split_mode (&mode);

      if (count == 1)
	fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d;\n",
		 nvptx_ptx_type_from_mode (mode, false), argno++,
		 REGNO (t));
      else
	{
	  /* Split registers are named with a $<piece> suffix.  */
	  int n = 0;
	  while (count--)
	    fprintf (asm_out_file, "\t\tst.param%s [%%out_arg%d], %%r%d$%d;\n",
		     nvptx_ptx_type_from_mode (mode, false), argno++,
		     REGNO (t), n++);
	}
    }

  /* Emit the call itself.  */
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%%retval_in), ");

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (callee);

  /* The argument list, including the static chain if the callee needs
     one.  */
  if (arg_end > 1 || (decl && DECL_STATIC_CHAIN (decl)))
    {
      const char *comma = "";

      fprintf (asm_out_file, ", (");
      for (int i = 1, argno = 0; i < arg_end; i++)
	{
	  rtx t = XEXP (XVECEXP (pat, 0, i), 0);
	  machine_mode mode = GET_MODE (t);
	  int count = maybe_split_mode (&mode);

	  while (count--)
	    {
	      fprintf (asm_out_file, "%s%%out_arg%d", comma, argno++);
	      comma = ", ";
	    }
	}
      if (decl && DECL_STATIC_CHAIN (decl))
	fprintf (asm_out_file, "%s%s", comma,
		 reg_names [OUTGOING_STATIC_CHAIN_REGNUM]);

      fprintf (asm_out_file, ")");
    }

  /* For indirect calls, reference the prototype label emitted above.  */
  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");
  /* The returned template copies the result out of the .param and
     closes the block.  */
  if (result != NULL_RTX)
    return "ld.param%t0\t%0, [%%retval_in];\n\t}";

  return "}";
}
1934
1935/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
1936
1937static bool
1938nvptx_print_operand_punct_valid_p (unsigned char c)
1939{
1940 return c == '.' || c== '#';
1941}
1942
1943static void nvptx_print_operand (FILE *, rtx, int);
1944
1945/* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
1946
1947static void
1948nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
1949{
1950 rtx off;
1951 if (GET_CODE (x) == CONST)
1952 x = XEXP (x, 0);
1953 switch (GET_CODE (x))
1954 {
1955 case PLUS:
1956 off = XEXP (x, 1);
1957 output_address (XEXP (x, 0));
1958 fprintf (file, "+");
1959 output_address (off);
1960 break;
1961
1962 case SYMBOL_REF:
1963 case LABEL_REF:
1964 output_addr_const (file, x);
1965 break;
1966
1967 default:
1968 gcc_assert (GET_CODE (x) != MEM);
1969 nvptx_print_operand (file, x, 0);
1970 break;
1971 }
1972}
1973
1974/* Write assembly language output for the address ADDR to FILE. */
1975
1976static void
1977nvptx_print_operand_address (FILE *file, rtx addr)
1978{
1979 nvptx_print_address_operand (file, addr, VOIDmode);
1980}
1981
/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
	unconditional one.
   # -- print a rounding mode for the instruction

   A -- print an address space identifier for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   f -- print a full reg even for something that must always be split
   j -- print "@" before the operand (branch-if-true predicate prefix)
   J -- print "@!" before the operand (branch-if-false predicate prefix)
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.  */

static void
nvptx_print_operand (FILE *file, rtx x, int code)
{
  rtx orig_x = x;
  machine_mode op_mode;

  if (code == '.')
    {
      /* Print the current insn's predicate, e.g. "[!%r1]", or nothing
	 when the insn is unconditional.  */
      x = current_insn_predicate;
      if (x)
	{
	  unsigned int regno = REGNO (XEXP (x, 0));
	  fputs ("[", file);
	  if (GET_CODE (x) == EQ)
	    fputs ("!", file);
	  fputs (reg_names [regno], file);
	  fputs ("]", file);
	}
      return;
    }
  else if (code == '#')
    {
      /* Round-to-nearest is the only rounding mode emitted.  */
      fputs (".rn", file);
      return;
    }

  enum rtx_code x_code = GET_CODE (x);

  switch (code)
    {
    case 'A':
      {
	addr_space_t as = nvptx_addr_space_from_address (XEXP (x, 0));
	fputs (nvptx_section_from_addr_space (as), file);
      }
      break;

    case 't':
      op_mode = nvptx_underlying_object_mode (x);
      fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
      break;

    case 'u':
      op_mode = nvptx_underlying_object_mode (x);
      fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
      break;

    case 'S':
      {
	/* Shuffle kind; indexes the SHUFFLE_* enumeration order.  */
	unsigned kind = UINTVAL (x);
	static const char *const kinds[] =
	  {"up", "down", "bfly", "idx"};
	fprintf (file, ".%s", kinds[kind]);
      }
      break;

    case 'T':
      fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
      break;

    case 'j':
      fprintf (file, "@");
      goto common;

    case 'J':
      fprintf (file, "@!");
      goto common;

    case 'c':
      /* Comparison opcode suffix, followed by the operand type.  */
      op_mode = GET_MODE (XEXP (x, 0));
      switch (x_code)
	{
	case EQ:
	  fputs (".eq", file);
	  break;
	case NE:
	  if (FLOAT_MODE_P (op_mode))
	    fputs (".neu", file);
	  else
	    fputs (".ne", file);
	  break;
	case LE:
	  fputs (".le", file);
	  break;
	case GE:
	  fputs (".ge", file);
	  break;
	case LT:
	  fputs (".lt", file);
	  break;
	case GT:
	  fputs (".gt", file);
	  break;
	case LEU:
	  fputs (".ls", file);
	  break;
	case GEU:
	  fputs (".hs", file);
	  break;
	case LTU:
	  fputs (".lo", file);
	  break;
	case GTU:
	  fputs (".hi", file);
	  break;
	case LTGT:
	  fputs (".ne", file);
	  break;
	case UNEQ:
	  fputs (".equ", file);
	  break;
	case UNLE:
	  fputs (".leu", file);
	  break;
	case UNGE:
	  fputs (".geu", file);
	  break;
	case UNLT:
	  fputs (".ltu", file);
	  break;
	case UNGT:
	  fputs (".gtu", file);
	  break;
	case UNORDERED:
	  fputs (".nan", file);
	  break;
	case ORDERED:
	  fputs (".num", file);
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Float and equality/unsigned comparisons use the plain type
	 suffix; signed orderings use an explicit .s<bits> suffix.  */
      if (FLOAT_MODE_P (op_mode)
	  || x_code == EQ || x_code == NE
	  || x_code == GEU || x_code == GTU
	  || x_code == LEU || x_code == LTU)
	fputs (nvptx_ptx_type_from_mode (op_mode, true), file);
      else
	fprintf (file, ".s%d", GET_MODE_BITSIZE (op_mode));
      break;
    default:
    common:
      switch (x_code)
	{
	case SUBREG:
	  x = SUBREG_REG (x);
	  /* fall through */

	case REG:
	  if (HARD_REGISTER_P (x))
	    fprintf (file, "%s", reg_names[REGNO (x)]);
	  else
	    fprintf (file, "%%r%d", REGNO (x));
	  /* Registers of split modes get a $<piece> suffix unless the
	     'f' (full register) modifier was given.  */
	  if (code != 'f' && nvptx_split_reg_p (GET_MODE (x)))
	    {
	      gcc_assert (GET_CODE (orig_x) == SUBREG
			  && !nvptx_split_reg_p (GET_MODE (orig_x)));
	      fprintf (file, "$%d", SUBREG_BYTE (orig_x) / UNITS_PER_WORD);
	    }
	  break;

	case MEM:
	  fputc ('[', file);
	  nvptx_print_address_operand (file, XEXP (x, 0), GET_MODE (x));
	  fputc (']', file);
	  break;

	case CONST_INT:
	  output_addr_const (file, x);
	  break;

	case CONST:
	case SYMBOL_REF:
	case LABEL_REF:
	  /* We could use output_addr_const, but that can print things like
	     "x-8", which breaks ptxas.  Need to ensure it is output as
	     "x+-8".  */
	  nvptx_print_address_operand (file, x, VOIDmode);
	  break;

	case CONST_DOUBLE:
	  /* Floats are printed in ptx's raw hex form: 0f<32 bits> for
	     SFmode, 0d<64 bits> otherwise.  */
	  long vals[2];
	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), GET_MODE (x));
	  vals[0] &= 0xffffffff;
	  vals[1] &= 0xffffffff;
	  if (GET_MODE (x) == SFmode)
	    fprintf (file, "0f%08lx", vals[0]);
	  else
	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
	  break;

	default:
	  output_addr_const (file, x);
	}
    }
}
2193\f
/* Record replacement regs used to deal with subreg operands.  One
   instance exists per machine mode; replacements are reused across
   insns (n_in_use is reset per insn, n_allocated persists).  */
struct reg_replace
{
  rtx replacement[MAX_RECOG_OPERANDS]; /* Pseudo registers available for reuse.  */
  machine_mode mode;                   /* Mode all replacements share.  */
  int n_allocated;                     /* How many pseudos have been created.  */
  int n_in_use;                        /* How many are claimed for the current insn.  */
};
2202
2203/* Allocate or reuse a replacement in R and return the rtx. */
2204
2205static rtx
2206get_replacement (struct reg_replace *r)
2207{
2208 if (r->n_allocated == r->n_in_use)
2209 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2210 return r->replacement[r->n_in_use++];
2211}
2212
/* Clean up subreg operands.  In ptx assembly, everything is typed, and
   the presence of subregs would break the rules for most instructions.
   Replace them with a suitable new register of the right size, plus
   conversion copyin/copyout instructions.  */

static void
nvptx_reorg_subreg (void)
{
  /* One replacement pool per scalar integer mode we may need.  */
  struct reg_replace qiregs, hiregs, siregs, diregs;
  rtx_insn *insn, *next;

  qiregs.n_allocated = 0;
  hiregs.n_allocated = 0;
  siregs.n_allocated = 0;
  diregs.n_allocated = 0;
  qiregs.mode = QImode;
  hiregs.mode = HImode;
  siregs.mode = SImode;
  diregs.mode = DImode;

  for (insn = get_insns (); insn; insn = next)
    {
      next = NEXT_INSN (insn);
      /* Skip insns we cannot or need not rewrite: debug insns, asms,
	 and bare USE/CLOBBER markers.  */
      if (!NONDEBUG_INSN_P (insn)
	  || asm_noperands (PATTERN (insn)) >= 0
	  || GET_CODE (PATTERN (insn)) == USE
	  || GET_CODE (PATTERN (insn)) == CLOBBER)
	continue;

      /* Replacement registers may be reused from insn to insn.  */
      qiregs.n_in_use = 0;
      hiregs.n_in_use = 0;
      siregs.n_in_use = 0;
      diregs.n_in_use = 0;
      extract_insn (insn);
      enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);

      for (int i = 0; i < recog_data.n_operands; i++)
	{
	  rtx op = recog_data.operand[i];
	  if (GET_CODE (op) != SUBREG)
	    continue;

	  rtx inner = SUBREG_REG (op);

	  machine_mode outer_mode = GET_MODE (op);
	  machine_mode inner_mode = GET_MODE (inner);
	  gcc_assert (s_ok);
	  /* Narrowing subregs of a wider reg are acceptable when the
	     insn allows subregs; only widening ones must be fixed.  */
	  if (s_ok
	      && (GET_MODE_PRECISION (inner_mode)
		  >= GET_MODE_PRECISION (outer_mode)))
	    continue;
	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
				   : outer_mode == HImode ? &hiregs
				   : outer_mode == SImode ? &siregs
				   : &diregs);
	  rtx new_reg = get_replacement (r);

	  /* Copy the value in before the insn, unless the operand is
	     write-only.  */
	  if (recog_data.operand_type[i] != OP_OUT)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = ZERO_EXTEND;
	      else
		code = TRUNCATE;

	      rtx pat = gen_rtx_SET (new_reg,
				     gen_rtx_fmt_e (code, outer_mode, inner));
	      emit_insn_before (pat, insn);
	    }

	  /* Copy the result back out after the insn, unless the
	     operand is read-only.  */
	  if (recog_data.operand_type[i] != OP_IN)
	    {
	      enum rtx_code code;
	      if (GET_MODE_PRECISION (inner_mode)
		  < GET_MODE_PRECISION (outer_mode))
		code = TRUNCATE;
	      else
		code = ZERO_EXTEND;

	      rtx pat = gen_rtx_SET (inner,
				     gen_rtx_fmt_e (code, inner_mode, new_reg));
	      emit_insn_after (pat, insn);
	    }
	  /* Finally substitute the subreg operand itself.  */
	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
	}
    }
}
738f2522 2302
d2d47a28
NS
/* Loop structure of the function.  The entire function is described as
   a NULL loop.  */

struct parallel
{
  /* Parent parallel.  */
  parallel *parent;

  /* Next sibling parallel.  */
  parallel *next;

  /* First child parallel.  */
  parallel *inner;

  /* Partitioning mask of the parallel.  */
  unsigned mask;

  /* Partitioning used within inner parallels.  */
  unsigned inner_mask;

  /* Location of parallel forked and join.  The forked is the first
     block in the parallel and the join is the first block after
     the partition.  */
  basic_block forked_block;
  basic_block join_block;

  /* The head insns of the forked/join blocks, and the fork/joining
     markers in the respective predecessor blocks (worker level).  */
  rtx_insn *forked_insn;
  rtx_insn *join_insn;

  rtx_insn *fork_insn;
  rtx_insn *joining_insn;

  /* Basic blocks in this parallel, but not in child parallels.  The
     FORKED and JOINING blocks are in the partition.  The FORK and JOIN
     blocks are not.  */
  auto_vec<basic_block> blocks;

public:
  parallel (parallel *parent, unsigned mode);
  ~parallel ();
};
2344
2345/* Constructor links the new parallel into it's parent's chain of
2346 children. */
2347
2348parallel::parallel (parallel *parent_, unsigned mask_)
2349 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2350{
2351 forked_block = join_block = 0;
2352 forked_insn = join_insn = 0;
2353 fork_insn = joining_insn = 0;
2354
2355 if (parent)
2356 {
2357 next = parent->inner;
2358 parent->inner = this;
2359 }
2360}
2361
/* Destructor frees the entire child subtree and all following
   siblings.  NOTE(review): deleting NEXT recurses along the sibling
   chain, so an extremely long chain implies deep recursion.  */

parallel::~parallel ()
{
  delete inner;
  delete next;
}
2367
/* Map of basic blocks to the interesting (head/tail marker) insn that
   starts them.  */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;

/* A tuple of an insn of interest and the BB in which it resides.  */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
/* Worklist of such tuples, gathered while scanning the CFG.  */
typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2374
/* Split basic blocks such that each forked and join unspecs are at
   the start of their basic blocks.  Thus afterwards each block will
   have a single partitioning mode.  We also do the same for return
   insns, as they are executed by every thread.  Return the
   partitioning mode of the function as a whole.  Populate MAP with
   head and tail blocks.  We also clear the BB visited flag, which is
   used when finding partitions.  */

static void
nvptx_split_blocks (bb_insn_map_t *map)
{
  insn_bb_vec_t worklist;
  basic_block block;
  rtx_insn *insn;

  /* Locate all the reorg instructions of interest.  */
  FOR_ALL_BB_FN (block, cfun)
    {
      /* Whether a real insn precedes the current one in this block.  */
      bool seen_insn = false;

      /* Clear visited flag, for use by parallel locator  */
      block->flags &= ~BB_VISITED;

      FOR_BB_INSNS (block, insn)
	{
	  if (!INSN_P (insn))
	    continue;
	  switch (recog_memoized (insn))
	    {
	    default:
	      seen_insn = true;
	      continue;
	    case CODE_FOR_nvptx_forked:
	    case CODE_FOR_nvptx_join:
	      break;

	    case CODE_FOR_return:
	      /* We also need to split just before return insns, as
		 that insn needs executing by all threads, but the
		 block it is in probably does not.  */
	      break;
	    }

	  if (seen_insn)
	    /* We've found an instruction that must be at the start of
	       a block, but isn't.  Add it to the worklist.  */
	    worklist.safe_push (insn_bb_t (insn, block));
	  else
	    /* It was already the first instruction.  Just add it to
	       the map.  */
	    map->get_or_insert (block) = insn;
	  seen_insn = true;
	}
    }

  /* Split blocks on the worklist.  */
  unsigned ix;
  insn_bb_t *elt;
  /* REMAP tracks the block we last split, so successive splits within
     the same original block chain correctly.  */
  basic_block remap = 0;
  for (ix = 0; worklist.iterate (ix, &elt); ix++)
    {
      if (remap != elt->second)
	{
	  block = elt->second;
	  remap = block;
	}

      /* Split block before insn.  The insn is in the new block  */
      edge e = split_block (block, PREV_INSN (elt->first));

      block = e->dest;
      map->get_or_insert (block) = elt->first;
    }
}
2449
2450/* BLOCK is a basic block containing a head or tail instruction.
2451 Locate the associated prehead or pretail instruction, which must be
2452 in the single predecessor block. */
2453
2454static rtx_insn *
2455nvptx_discover_pre (basic_block block, int expected)
2456{
2457 gcc_assert (block->preds->length () == 1);
2458 basic_block pre_block = (*block->preds)[0]->src;
2459 rtx_insn *pre_insn;
2460
2461 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
2462 pre_insn = PREV_INSN (pre_insn))
2463 gcc_assert (pre_insn != BB_HEAD (pre_block));
2464
2465 gcc_assert (recog_memoized (pre_insn) == expected);
2466 return pre_insn;
2467}
2468
2469/* Dump this parallel and all its inner parallels. */
2470
2471static void
2472nvptx_dump_pars (parallel *par, unsigned depth)
2473{
2474 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
2475 depth, par->mask,
2476 par->forked_block ? par->forked_block->index : -1,
2477 par->join_block ? par->join_block->index : -1);
2478
2479 fprintf (dump_file, " blocks:");
2480
2481 basic_block block;
2482 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
2483 fprintf (dump_file, " %d", block->index);
2484 fprintf (dump_file, "\n");
2485 if (par->inner)
2486 nvptx_dump_pars (par->inner, depth + 1);
2487
2488 if (par->next)
2489 nvptx_dump_pars (par->next, depth);
2490}
2491
/* If BLOCK contains a fork/join marker, process it to create or
   terminate a loop structure.  Add this block to the current loop,
   and then walk successor blocks.   */

static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
  /* Each block is visited at most once; BB_VISITED was cleared by
     nvptx_split_blocks.  */
  if (block->flags & BB_VISITED)
    return par;
  block->flags |= BB_VISITED;

  if (rtx_insn **endp = map->get (block))
    {
      rtx_insn *end = *endp;

      /* This is a block head or tail, or return instruction.  */
      switch (recog_memoized (end))
	{
	case CODE_FOR_return:
	  /* Return instructions are in their own block, and we
	     don't need to do anything more.  */
	  return par;

	case CODE_FOR_nvptx_forked:
	  /* Loop head, create a new inner loop and add it into
	     our parent's child list.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    gcc_assert (mask);
	    par = new parallel (par, mask);
	    par->forked_block = block;
	    par->forked_insn = end;
	    /* Worker-level (non-call) partitions also record the fork
	       marker in the predecessor block.  */
	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
	      par->fork_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
	  }
	  break;

	case CODE_FOR_nvptx_join:
	  /* A loop tail.  Finish the current loop and return to
	     parent.  */
	  {
	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));

	    /* Forks and joins must nest properly.  */
	    gcc_assert (par->mask == mask);
	    par->join_block = block;
	    par->join_insn = end;
	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
	      par->joining_insn
		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
	    par = par->parent;
	  }
	  break;

	default:
	  gcc_unreachable ();
	}
    }

  if (par)
    /* Add this block onto the current loop's list of blocks.  */
    par->blocks.safe_push (block);
  else
    /* This must be the entry block.  Create a NULL parallel.  */
    par = new parallel (0, 0);

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, block->succs)
    nvptx_find_par (map, par, e->dest);

  return par;
}
2570
2571/* DFS walk the CFG looking for fork & join markers. Construct
2572 loop structures as we go. MAP is a mapping of basic blocks
2573 to head & tail markers, discovered when splitting blocks. This
2574 speeds up the discovery. We rely on the BB visited flag having
2575 been cleared when splitting blocks. */
2576
2577static parallel *
2578nvptx_discover_pars (bb_insn_map_t *map)
2579{
2580 basic_block block;
2581
2582 /* Mark exit blocks as visited. */
2583 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
2584 block->flags |= BB_VISITED;
2585
2586 /* And entry block as not. */
2587 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
2588 block->flags &= ~BB_VISITED;
2589
2590 parallel *par = nvptx_find_par (map, 0, block);
2591
2592 if (dump_file)
2593 {
2594 fprintf (dump_file, "\nLoops\n");
2595 nvptx_dump_pars (par, 0);
2596 fprintf (dump_file, "\n");
2597 }
2598
2599 return par;
2600}
2601
/* Propagate live state at the start of a partitioned region.  BLOCK
   provides the live register information, and might not contain
   INSN.  Propagation is inserted just after INSN.  RW indicates whether
   we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

   We propagate the live register set and the entire frame.  We could
   do better by (a) propagating just the live set that is used within
   the partitioned regions and (b) only propagating stack entries that
   are used.  The latter might be quite hard to determine.  */

typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);

static void
nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
		 propagator_fn fn, void *data)
{
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
  if (fs)
    {
      rtx tmp = gen_reg_rtx (DImode);
      rtx idx = NULL_RTX;
      rtx ptr = gen_reg_rtx (Pmode);
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

      /* The frame is copied DImode-chunk at a time; it must be a
	 whole number of chunks.  */
      gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
      fs /= GET_MODE_SIZE (DImode);
      /* Detect single iteration loop.  */
      if (fs == 1)
	fs = 0;

      start_sequence ();
      emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
      if (fs)
	{
	  /* Multi-chunk frame: emit a counted copy loop.  IDX counts
	     down from FS; PTR advances one chunk per iteration.  */
	  idx = gen_reg_rtx (SImode);
	  pred = gen_reg_rtx (BImode);
	  label = gen_label_rtx ();

	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
	  /* Allow worker function to initialize anything needed.  */
	  rtx init = fn (tmp, PM_loop_begin, fs, data);
	  if (init)
	    emit_insn (init);
	  emit_label (label);
	  LABEL_NUSES (label)++;
	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
	}
      if (rw & PM_read)
	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
      emit_insn (fn (tmp, rw, fs, data));
      if (rw & PM_write)
	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
      if (fs)
	{
	  /* Loop back-edge: continue while IDX != 0.  */
	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
	  emit_insn (gen_br_true_uni (pred, label));
	  rtx fini = fn (tmp, PM_loop_end, fs, data);
	  if (fini)
	    emit_insn (fini);
	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
	}
      /* Clobbers keep the temporaries from looking live afterwards.  */
      emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
      emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
      rtx cpy = get_insns ();
      end_sequence ();
      insn = emit_insn_after (cpy, insn);
    }

  /* Copy live registers.  */
  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
    {
      rtx reg = regno_reg_rtx[ix];

      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
	{
	  rtx bcast = fn (reg, rw, 0, data);

	  insn = emit_insn_after (bcast, insn);
	}
    }
}
2694
2695/* Worker for nvptx_vpropagate. */
2696
2697static rtx
2698vprop_gen (rtx reg, propagate_mask pm,
2699 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
2700{
2701 if (!(pm & PM_read_write))
2702 return 0;
2703
2704 return nvptx_gen_vcast (reg);
2705}
2706
/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.
   Thin wrapper: reads and writes in one pass via warp shuffles.  */

static void
nvptx_vpropagate (basic_block block, rtx_insn *insn)
{
  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
}
2715
2716/* Worker for nvptx_wpropagate. */
2717
2718static rtx
2719wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
2720{
2721 wcast_data_t *data = (wcast_data_t *)data_;
2722
2723 if (pm & PM_loop_begin)
2724 {
2725 /* Starting a loop, initialize pointer. */
2726 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
2727
2728 if (align > worker_bcast_align)
2729 worker_bcast_align = align;
2730 data->offset = (data->offset + align - 1) & ~(align - 1);
2731
2732 data->ptr = gen_reg_rtx (Pmode);
2733
2734 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
2735 }
2736 else if (pm & PM_loop_end)
2737 {
2738 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
2739 data->ptr = NULL_RTX;
2740 return clobber;
2741 }
2742 else
2743 return nvptx_gen_wcast (reg, pm, rep, data);
2744}
2745
2746/* Spill or fill live state that is live at start of BLOCK. PRE_P
2747 indicates if this is just before partitioned mode (do spill), or
2748 just after it starts (do fill). Sequence is inserted just after
2749 INSN. */
2750
2751static void
2752nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
2753{
2754 wcast_data_t data;
2755
2756 data.base = gen_reg_rtx (Pmode);
2757 data.offset = 0;
2758 data.ptr = NULL_RTX;
2759
2760 nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
2761 if (data.offset)
2762 {
2763 /* Stuff was emitted, initialize the base pointer now. */
2764 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
2765 emit_insn_after (init, insn);
2766
2767 if (worker_bcast_size < data.offset)
2768 worker_bcast_size = data.offset;
2769 }
2770}
2771
2772/* Emit a worker-level synchronization barrier. We use different
2773 markers for before and after synchronizations. */
2774
2775static rtx
2776nvptx_wsync (bool after)
2777{
2778 return gen_nvptx_barsync (GEN_INT (after));
2779}
2780
/* Single neutering according to MASK.  FROM is the incoming block and
   TO is the outgoing block.  These may be the same block.  Insert at
   start of FROM:

     if (tid.<axis>) goto end.

   and insert before ending branch of TO (if there is such an insn):

     end:
     <possibly-broadcast-cond>
     <branch>

   We currently only use differnt FROM and TO when skipping an entire
   loop.  We could do more if we detected superblocks.  */

static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
  rtx_insn *head = BB_HEAD (from);
  rtx_insn *tail = BB_END (to);
  unsigned skip_mask = mask;

  /* Find first insn of from block  */
  while (head != BB_END (from) && !INSN_P (head))
    head = NEXT_INSN (head);

  /* Find last insn of to block  */
  rtx_insn *limit = from == to ? head : BB_HEAD (to);
  while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
    tail = PREV_INSN (tail);

  /* Detect if tail is a branch.  TAIL_BRANCH is the whole SET to pc;
     COND_BRANCH is set only when that SET is conditional.  */
  rtx tail_branch = NULL_RTX;
  rtx cond_branch = NULL_RTX;
  if (tail && INSN_P (tail))
    {
      tail_branch = PATTERN (tail);
      if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
	tail_branch = NULL_RTX;
      else
	{
	  cond_branch = SET_SRC (tail_branch);
	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
	    cond_branch = NULL_RTX;
	}
    }

  if (tail == head)
    {
      /* If this is empty, do nothing.  */
      if (!head || !INSN_P (head))
	return;

      /* If this is a dummy insn, do nothing.  */
      switch (recog_memoized (head))
	{
	default:
	  break;
	case CODE_FOR_nvptx_fork:
	case CODE_FOR_nvptx_forked:
	case CODE_FOR_nvptx_joining:
	case CODE_FOR_nvptx_join:
	  return;
	}

      if (cond_branch)
	{
	  /* If we're only doing vector single, there's no need to
	     emit skip code because we'll not insert anything.  */
	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
	    skip_mask = 0;
	}
      else if (tail_branch)
	/* Block with only unconditional branch.  Nothing to do.  */
	return;
    }

  /* Insert the vector test inside the worker test.  */
  unsigned mode;
  rtx_insn *before = tail;
  for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    if (GOMP_DIM_MASK (mode) & skip_mask)
      {
	rtx_code_label *label = gen_label_rtx ();
	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];

	/* Lazily create the per-axis "am I thread zero" predicate.  */
	if (!pred)
	  {
	    pred = gen_reg_rtx (BImode);
	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
	  }

	rtx br;
	if (mode == GOMP_DIM_VECTOR)
	  br = gen_br_true (pred, label);
	else
	  br = gen_br_true_uni (pred, label);
	emit_insn_before (br, head);

	LABEL_NUSES (label)++;
	/* The skip-target label goes before the tail branch (so the
	   branch itself is still executed by all threads), or at the
	   very end when there is no branch.  */
	if (tail_branch)
	  before = emit_label_before (label, before);
	else
	  emit_label_after (label, tail);
      }

  /* Now deal with propagating the branch condition.  */
  if (cond_branch)
    {
      rtx pvar = XEXP (XEXP (cond_branch, 0), 0);

      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
	{
	  /* Vector mode only, do a shuffle.  */
	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
	}
      else
	{
	  /* Includes worker mode, do spill & fill.  By construction
	     we should never have worker mode only.  */
	  wcast_data_t data;

	  data.base = worker_bcast_sym;
	  data.ptr = 0;

	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
	    worker_bcast_size = GET_MODE_SIZE (SImode);

	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
			    before);
	  /* Barrier so other workers can see the write.  */
	  emit_insn_before (nvptx_wsync (false), tail);
	  data.offset = 0;
	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
	  /* This barrier is needed to avoid worker zero clobbering
	     the broadcast buffer before all the other workers have
	     had a chance to read this instance of it.  */
	  emit_insn_before (nvptx_wsync (true), tail);
	}

      /* Rewrite the branch to use the now-uniform condition.  */
      extract_insn (tail);
      rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
				 UNSPEC_BR_UNIFIED);
      validate_change (tail, recog_data.operand_loc[0], unsp, false);
    }
}
2928
2929/* PAR is a parallel that is being skipped in its entirety according to
2930 MASK. Treat this as skipping a superblock starting at forked
2931 and ending at joining. */
2932
2933static void
2934nvptx_skip_par (unsigned mask, parallel *par)
2935{
2936 basic_block tail = par->join_block;
2937 gcc_assert (tail->preds->length () == 1);
2938
2939 basic_block pre_tail = (*tail->preds)[0]->src;
2940 gcc_assert (pre_tail->succs->length () == 1);
2941
2942 nvptx_single (mask, par->forked_block, pre_tail);
2943}
2944
2945/* Process the parallel PAR and all its contained
2946 parallels. We do everything but the neutering. Return mask of
2947 partitioned modes used within this parallel. */
2948
2949static unsigned
2950nvptx_process_pars (parallel *par)
2951{
2952 unsigned inner_mask = par->mask;
2953
2954 /* Do the inner parallels first. */
2955 if (par->inner)
2956 {
2957 par->inner_mask = nvptx_process_pars (par->inner);
2958 inner_mask |= par->inner_mask;
2959 }
2960
2961 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
2962 /* No propagation needed for a call. */;
2963 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
2964 {
2965 nvptx_wpropagate (false, par->forked_block, par->forked_insn);
2966 nvptx_wpropagate (true, par->forked_block, par->fork_insn);
2967 /* Insert begin and end synchronizations. */
2968 emit_insn_after (nvptx_wsync (false), par->forked_insn);
2969 emit_insn_before (nvptx_wsync (true), par->joining_insn);
2970 }
2971 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
2972 nvptx_vpropagate (par->forked_block, par->forked_insn);
2973
2974 /* Now do siblings. */
2975 if (par->next)
2976 inner_mask |= nvptx_process_pars (par->next);
2977 return inner_mask;
2978}
2979
/* Neuter the parallel described by PAR.  We recurse in depth-first
   order.  MODES are the partitioning of the execution and OUTER is
   the partitioning of the parallels we are contained in.  */

static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  /* The worker/vector partitioning of this parallel itself.  */
  unsigned me = (par->mask
		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  unsigned skip_mask = 0, neuter_mask = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, outer | me);

  /* Decide, per axis, whether to neuter block-by-block, skip the
     whole parallel, or do nothing.  */
  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      if ((outer | me) & GOMP_DIM_MASK (mode))
	{} /* Mode is partitioned: no neutering.  */
      else if (!(modes & GOMP_DIM_MASK (mode)))
	{} /* Mode is not used: nothing to do.  */
      else if (par->inner_mask & GOMP_DIM_MASK (mode)
	       || !par->forked_insn)
	/* Partitioned in inner parallels, or we're not a partitioned
	   at all: neuter individual blocks.  */
	neuter_mask |= GOMP_DIM_MASK (mode);
      else if (!par->parent || !par->parent->forked_insn
	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
	/* Parent isn't a parallel or contains this paralleling: skip
	   parallel at this level.  */
	skip_mask |= GOMP_DIM_MASK (mode);
      else
	{} /* Parent will skip this parallel itself.  */
    }

  if (neuter_mask)
    {
      int ix;
      int len = par->blocks.length ();

      for (ix = 0; ix != len; ix++)
	{
	  basic_block block = par->blocks[ix];

	  nvptx_single (neuter_mask, block, block);
	}
    }

  if (skip_mask)
    nvptx_skip_par (skip_mask, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
3034
/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
   unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs  */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function  (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = get_oacc_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      unsigned mask = 0;
      tree dims = TREE_VALUE (attr);
      unsigned ix;

      /* An axis is partitioned if its size isn't 1 and it isn't
	 explicitly forced to 0.  */
      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
	{
	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
	  tree allowed = TREE_PURPOSE (dims);

	  if (size != 1 && !(allowed && integer_zerop (allowed)))
	    mask |= GOMP_DIM_MASK (ix);
	}
      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}
3115\f
3116/* Handle a "kernel" attribute; arguments as in
3117 struct attribute_spec.handler. */
3118
3119static tree
3120nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
3121 int ARG_UNUSED (flags), bool *no_add_attrs)
3122{
3123 tree decl = *node;
3124
3125 if (TREE_CODE (decl) != FUNCTION_DECL)
3126 {
3127 error ("%qE attribute only applies to functions", name);
3128 *no_add_attrs = true;
3129 }
3130
3131 else if (TREE_TYPE (TREE_TYPE (decl)) != void_type_node)
3132 {
3133 error ("%qE attribute requires a void return type", name);
3134 *no_add_attrs = true;
3135 }
3136
3137 return NULL_TREE;
3138}
3139
/* Table of valid machine attributes.  Terminated by an all-NULL
   sentinel entry.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
       affects_type_identity } */
  { "kernel", 0, 0, true, false, false, nvptx_handle_kernel_attribute, false },
  { NULL, 0, 0, false, false, false, NULL, false }
};
3148\f
3149/* Limit vector alignments to BIGGEST_ALIGNMENT. */
3150
3151static HOST_WIDE_INT
3152nvptx_vector_alignment (const_tree type)
3153{
3154 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
3155
3156 return MIN (align, BIGGEST_ALIGNMENT);
3157}
d88cd9c4
NS
3158
3159/* Indicate that INSN cannot be duplicated. */
3160
3161static bool
3162nvptx_cannot_copy_insn_p (rtx_insn *insn)
3163{
3164 switch (recog_memoized (insn))
3165 {
3166 case CODE_FOR_nvptx_shufflesi:
3167 case CODE_FOR_nvptx_shufflesf:
3168 case CODE_FOR_nvptx_barsync:
3169 case CODE_FOR_nvptx_fork:
3170 case CODE_FOR_nvptx_forked:
3171 case CODE_FOR_nvptx_joining:
3172 case CODE_FOR_nvptx_join:
3173 return true;
3174 default:
3175 return false;
3176 }
3177}
738f2522 3178\f
1f83528e
TS
/* Record a symbol for mkoffload to enter into the mapping table.
   Variables emit a //:VAR_MAP line; functions emit a //:FUNC_MAP
   line followed by one launch-dimension value per axis.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = get_oacc_fn_attrib (decl);
	tree dims = NULL_TREE;
	unsigned ix;

	if (attr)
	  dims = TREE_VALUE (attr);
	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (ix = 0; ix != GOMP_DIM_MAX; ix++)
	  {
	    /* Default each axis to 1 when no dimension info exists.  */
	    int size = 1;

	    /* TODO: This check can go away once the dimension default
	       machinery is merged to trunk.  */
	    if (dims)
	      {
		tree dim = TREE_VALUE (dims);

		if (dim)
		  size = TREE_INT_CST_LOW (dim);

		gcc_assert (!TREE_PURPOSE (dims));
		dims = TREE_CHAIN (dims);
	      }

	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}
3230
738f2522
BS
3231/* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
3232 at the start of a file. */
3233
3234static void
3235nvptx_file_start (void)
3236{
3237 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
3238 fputs ("\t.version\t3.1\n", asm_out_file);
3239 fputs ("\t.target\tsm_30\n", asm_out_file);
3240 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
3241 fputs ("// END PREAMBLE\n", asm_out_file);
3242}
3243
ecf6e535
BS
/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  NOTE(review): the broadcast and reduction
   buffer emissions below are structurally identical and could share a
   helper.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl, true);
  fputs (func_decls.str().c_str(), asm_out_file);

  if (worker_bcast_size)
    {
      /* Define the broadcast buffer.  */

      /* Round the size up to the buffer's alignment.  */
      worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
	& ~(worker_bcast_align - 1);

      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_bcast_align,
	       worker_bcast_name, worker_bcast_size);
    }

  if (worker_red_size)
    {
      /* Define the reduction buffer.  */

      worker_red_size = ((worker_red_size + worker_red_align - 1)
			 & ~(worker_red_align - 1));

      fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_red_name);
      fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
	       worker_red_align,
	       worker_red_name, worker_red_size);
    }
}
3282
3283/* Expander for the shuffle builtins. */
3284
3285static rtx
3286nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
3287{
3288 if (ignore)
3289 return target;
3290
3291 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
3292 NULL_RTX, mode, EXPAND_NORMAL);
3293 if (!REG_P (src))
3294 src = copy_to_mode_reg (mode, src);
3295
3296 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
3297 NULL_RTX, SImode, EXPAND_NORMAL);
3298 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
3299 NULL_RTX, SImode, EXPAND_NORMAL);
3300
3301 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
3302 idx = copy_to_mode_reg (SImode, idx);
3303
3304 rtx pat = nvptx_gen_shuffle (target, src, idx, INTVAL (op));
3305 if (pat)
3306 emit_insn (pat);
3307
3308 return target;
3309}
3310
3311/* Worker reduction address expander. */
3312
3313static rtx
3314nvptx_expand_worker_addr (tree exp, rtx target,
3315 machine_mode ARG_UNUSED (mode), int ignore)
3316{
3317 if (ignore)
3318 return target;
3319
3320 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
3321 if (align > worker_red_align)
3322 worker_red_align = align;
3323
3324 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
3325 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
3326 if (size + offset > worker_red_size)
3327 worker_red_size = size + offset;
3328
3329 emit_insn (gen_rtx_SET (target, worker_red_sym));
3330
3331 if (offset)
3332 emit_insn (gen_rtx_SET (target,
3333 gen_rtx_PLUS (Pmode, target, GEN_INT (offset))));
3334
3335 emit_insn (gen_rtx_SET (target,
3336 gen_rtx_UNSPEC (Pmode, gen_rtvec (1, target),
3337 UNSPEC_FROM_SHARED)));
3338
3339 return target;
3340}
3341
3342/* Expand the CMP_SWAP PTX builtins. We have our own versions that do
3343 not require taking the address of any object, other than the memory
3344 cell being operated on. */
3345
3346static rtx
3347nvptx_expand_cmp_swap (tree exp, rtx target,
3348 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
3349{
3350 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
3351
3352 if (!target)
3353 target = gen_reg_rtx (mode);
3354
3355 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
3356 NULL_RTX, Pmode, EXPAND_NORMAL);
3357 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
3358 NULL_RTX, mode, EXPAND_NORMAL);
3359 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
3360 NULL_RTX, mode, EXPAND_NORMAL);
3361 rtx pat;
3362
3363 mem = gen_rtx_MEM (mode, mem);
3364 if (!REG_P (cmp))
3365 cmp = copy_to_mode_reg (mode, cmp);
3366 if (!REG_P (src))
3367 src = copy_to_mode_reg (mode, src);
3368
3369 if (mode == SImode)
3370 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
3371 else
3372 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
3373
3374 emit_insn (pat);
3375
3376 return target;
3377}
3378
3379
/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,	/* 32-bit warp shuffle.  */
  NVPTX_BUILTIN_SHUFFLELL,	/* 64-bit (long long) warp shuffle.  */
  NVPTX_BUILTIN_WORKER_ADDR,	/* Address within worker reduction buffer.  */
  NVPTX_BUILTIN_CMP_SWAP,	/* 32-bit compare-and-swap.  */
  NVPTX_BUILTIN_CMP_SWAPLL,	/* 64-bit compare-and-swap.  */
  NVPTX_BUILTIN_MAX		/* Number of builtins; not itself one.  */
};

/* Builtin function decls, indexed by the enum above.  Populated by
   nvptx_init_builtins; GC-rooted via GTY.  */
static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
3392
3393/* Return the NVPTX builtin for CODE. */
3394
3395static tree
3396nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
3397{
3398 if (code >= NVPTX_BUILTIN_MAX)
3399 return error_mark_node;
3400
3401 return nvptx_builtin_decls[code];
3402}
3403
/* Set up all builtin functions for this target.  Implements
   TARGET_INIT_BUILTINS.  */

static void
nvptx_init_builtins (void)
{
  /* DEF registers __builtin_nvptx_NAME with the function type built
     from list T (return type first, NULL_TREE terminated) and records
     its decl in nvptx_builtin_decls under NVPTX_BUILTIN_ID.  */
#define DEF(ID, NAME, T) \
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
   = add_builtin_function ("__builtin_nvptx_" NAME, \
			   build_function_type_list T, \
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
  /* Shorthands for the node types used in the signatures below.  */
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}
3432
3433/* Expand an expression EXP that calls a built-in function,
3434 with result going to TARGET if that's convenient
3435 (and in mode MODE if that's convenient).
3436 SUBTARGET may be used as the target for computing one of EXP's operands.
3437 IGNORE is nonzero if the value is to be ignored. */
3438
3439static rtx
3440nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
3441 machine_mode mode, int ignore)
3442{
3443 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
3444 switch (DECL_FUNCTION_CODE (fndecl))
3445 {
3446 case NVPTX_BUILTIN_SHUFFLE:
3447 case NVPTX_BUILTIN_SHUFFLELL:
3448 return nvptx_expand_shuffle (exp, target, mode, ignore);
3449
3450 case NVPTX_BUILTIN_WORKER_ADDR:
3451 return nvptx_expand_worker_addr (exp, target, mode, ignore);
3452
3453 case NVPTX_BUILTIN_CMP_SWAP:
3454 case NVPTX_BUILTIN_CMP_SWAPLL:
3455 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
3456
3457 default: gcc_unreachable ();
3458 }
738f2522
BS
3459}
3460\f
f3552158
NS
/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32	/* vector_length is forced to this; see
				   nvptx_goacc_validate_dims.  */
#define PTX_WORKER_LENGTH 32	/* Upper bound on num_workers.  */
3464
94829f87
NS
3465/* Validate compute dimensions of an OpenACC offload or routine, fill
3466 in non-unity defaults. FN_LEVEL indicates the level at which a
3467 routine might spawn a loop. It is negative for non-routines. */
3468
3469static bool
3470nvptx_goacc_validate_dims (tree ARG_UNUSED (decl), int *ARG_UNUSED (dims),
3471 int ARG_UNUSED (fn_level))
3472{
3473 bool changed = false;
3474
ccc8282b
NS
3475 /* The vector size must be 32, unless this is a SEQ routine. */
3476 if (fn_level <= GOMP_DIM_VECTOR
3477 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
3478 {
3479 if (dims[GOMP_DIM_VECTOR] >= 0 && fn_level < 0)
3480 warning_at (DECL_SOURCE_LOCATION (decl), 0,
3481 dims[GOMP_DIM_VECTOR]
3482 ? "using vector_length (%d), ignoring %d"
3483 : "using vector_length (%d), ignoring runtime setting",
3484 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
3485 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
3486 changed = true;
3487 }
3488
3489 /* Check the num workers is not too large. */
3490 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
3491 {
3492 warning_at (DECL_SOURCE_LOCATION (decl), 0,
3493 "using num_workers (%d), ignoring %d",
3494 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
3495 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
3496 changed = true;
3497 }
94829f87
NS
3498
3499 return changed;
3500}
d88cd9c4
NS
3501
3502/* Determine whether fork & joins are needed. */
3503
3504static bool
3505nvptx_goacc_fork_join (gcall *call, const int dims[],
3506 bool ARG_UNUSED (is_fork))
3507{
3508 tree arg = gimple_call_arg (call, 2);
3509 unsigned axis = TREE_INT_CST_LOW (arg);
3510
3511 /* We only care about worker and vector partitioning. */
3512 if (axis < GOMP_DIM_WORKER)
3513 return false;
3514
3515 /* If the size is 1, there's no partitioning. */
3516 if (dims[axis] == 1)
3517 return false;
3518
3519 return true;
3520}
3521
f3552158
NS
3522/* Generate a PTX builtin function call that returns the address in
3523 the worker reduction buffer at OFFSET. TYPE is the type of the
3524 data at that location. */
3525
3526static tree
3527nvptx_get_worker_red_addr (tree type, tree offset)
3528{
3529 machine_mode mode = TYPE_MODE (type);
3530 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
3531 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
3532 tree align = build_int_cst (unsigned_type_node,
3533 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
3534 tree call = build_call_expr (fndecl, 3, offset, size, align);
3535
3536 return fold_convert (build_pointer_type (type), call);
3537}
3538
3539/* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
3540 will cast the variable if necessary. */
3541
3542static void
3543nvptx_generate_vector_shuffle (location_t loc,
3544 tree dest_var, tree var, unsigned shift,
3545 gimple_seq *seq)
3546{
3547 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
3548 tree_code code = NOP_EXPR;
3549 tree type = unsigned_type_node;
3550 enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
3551
3552 if (!INTEGRAL_MODE_P (mode))
3553 code = VIEW_CONVERT_EXPR;
3554 if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
3555 {
3556 fn = NVPTX_BUILTIN_SHUFFLELL;
3557 type = long_long_unsigned_type_node;
3558 }
3559
3560 tree call = nvptx_builtin_decl (fn, true);
3561 call = build_call_expr_loc
3562 (loc, call, 3, fold_build1 (code, type, var),
3563 build_int_cst (unsigned_type_node, shift),
3564 build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
3565
3566 call = fold_build1 (code, TREE_TYPE (dest_var), call);
3567
3568 gimplify_assign (dest_var, call, seq);
3569}
3570
/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  Emits a compare-and-swap loop of the shape:

     expect = <initial value of OP>;
   loop:
     write = expect OP var;
     actual = __builtin_nvptx_cmp_swap (ptr, expect, write);
     if (actual != expect)
       { expect = actual; goto loop; }

   and returns WRITE (converted back to VAR's type), i.e. the value
   stored by the successful iteration.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree type = unsigned_type_node;

  enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));

  /* Non-integral data is punned through a same-sized integer view;
     64-bit data uses the long long builtin and type.  */
  if (!INTEGRAL_MODE_P (mode))
    code = VIEW_CONVERT_EXPR;
  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
    {
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
      type = long_long_unsigned_type_node;
    }

  /* Build the initial EXPECT value from the reduction's identity
     element, converted to the builtin's integer type.  */
  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (type);
  tree init_expr = omp_reduction_init_op (loc, op, TREE_TYPE (var));
  init_expr = fold_build1 (code, type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Build the loop body: write = expect OP var (computed in VAR's
     type), then the cmp_swap call and the exit condition.  */
  gimple_seq loop_seq = NULL;
  tree expect_var = make_ssa_name (type);
  tree actual_var = make_ssa_name (type);
  tree write_var = make_ssa_name (type);

  tree write_expr = fold_build1 (code, TREE_TYPE (var), expect_var);
  write_expr = fold_build2 (op, TREE_TYPE (var), write_expr, var);
  write_expr = fold_build1 (code, type, write_expr);
  gimplify_assign (write_var, write_expr, &loop_seq);

  tree swap_expr = nvptx_builtin_decl (fn, true);
  swap_expr = build_call_expr_loc (loc, swap_expr, 3,
				   ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &loop_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&loop_seq, cond);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Insert the loop statements.  */
  gimple *loop_end = gimple_seq_last (loop_seq);
  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);

  /* Split the block just after the loop stmts.  */
  edge post_edge = split_block (loop_bb, loop_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* The exit edge is taken on success; the back edge retries with the
     actual value observed by the cmp_swap.  */
  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  /* EXPECT is INIT on entry and ACTUAL on each retry.  */
  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  /* Register the single-block loop with the loop tree.  */
  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, TREE_TYPE (var), write_var);
}
3654
3655/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
3656
3657static void
3658nvptx_goacc_reduction_setup (gcall *call)
3659{
3660 gimple_stmt_iterator gsi = gsi_for_stmt (call);
3661 tree lhs = gimple_call_lhs (call);
3662 tree var = gimple_call_arg (call, 2);
3663 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
3664 gimple_seq seq = NULL;
3665
3666 push_gimplify_context (true);
3667
3668 if (level != GOMP_DIM_GANG)
3669 {
3670 /* Copy the receiver object. */
3671 tree ref_to_res = gimple_call_arg (call, 1);
3672
3673 if (!integer_zerop (ref_to_res))
3674 var = build_simple_mem_ref (ref_to_res);
3675 }
3676
3677 if (level == GOMP_DIM_WORKER)
3678 {
3679 /* Store incoming value to worker reduction buffer. */
3680 tree offset = gimple_call_arg (call, 5);
3681 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
3682 tree ptr = make_ssa_name (TREE_TYPE (call));
3683
3684 gimplify_assign (ptr, call, &seq);
3685 tree ref = build_simple_mem_ref (ptr);
3686 TREE_THIS_VOLATILE (ref) = 1;
3687 gimplify_assign (ref, var, &seq);
3688 }
3689
3690 if (lhs)
3691 gimplify_assign (lhs, var, &seq);
3692
3693 pop_gimplify_context (NULL);
3694 gsi_replace_with_seq (&gsi, seq, true);
3695}
3696
/* NVPTX implementation of GOACC_REDUCTION_INIT.  At the vector level,
   only lane zero keeps the incoming VAR; all other lanes get the
   reduction's identity value, selected by a conditional built from
   IFN_GOACC_DIM_POS.  At other levels this reduces to a straight
   assignment of the identity (or of VAR, for a gang reduction with no
   receiver object).  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb: the split edge becomes
	 the tid != 0 (true) edge.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb, taken by lane
	 zero, which keeps VAR.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);

      /* Create phi node in dst block merging INIT_VAR and VAR.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
3778
3779/* NVPTX implementation of GOACC_REDUCTION_FINI. */
3780
3781static void
3782nvptx_goacc_reduction_fini (gcall *call)
3783{
3784 gimple_stmt_iterator gsi = gsi_for_stmt (call);
3785 tree lhs = gimple_call_lhs (call);
3786 tree ref_to_res = gimple_call_arg (call, 1);
3787 tree var = gimple_call_arg (call, 2);
3788 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
3789 enum tree_code op
3790 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
3791 gimple_seq seq = NULL;
3792 tree r = NULL_TREE;;
3793
3794 push_gimplify_context (true);
3795
3796 if (level == GOMP_DIM_VECTOR)
3797 {
3798 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
3799 but that requires a method of emitting a unified jump at the
3800 gimple level. */
3801 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
3802 {
3803 tree other_var = make_ssa_name (TREE_TYPE (var));
3804 nvptx_generate_vector_shuffle (gimple_location (call),
3805 other_var, var, shfl, &seq);
3806
3807 r = make_ssa_name (TREE_TYPE (var));
3808 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
3809 var, other_var), &seq);
3810 var = r;
3811 }
3812 }
3813 else
3814 {
3815 tree accum = NULL_TREE;
3816
3817 if (level == GOMP_DIM_WORKER)
3818 {
3819 /* Get reduction buffer address. */
3820 tree offset = gimple_call_arg (call, 5);
3821 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
3822 tree ptr = make_ssa_name (TREE_TYPE (call));
3823
3824 gimplify_assign (ptr, call, &seq);
3825 accum = ptr;
3826 }
3827 else if (integer_zerop (ref_to_res))
3828 r = var;
3829 else
3830 accum = ref_to_res;
3831
3832 if (accum)
3833 {
3834 /* Locklessly update the accumulator. */
3835 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
3836 seq = NULL;
3837 r = nvptx_lockless_update (gimple_location (call), &gsi,
3838 accum, var, op);
3839 }
3840 }
3841
3842 if (lhs)
3843 gimplify_assign (lhs, r, &seq);
3844 pop_gimplify_context (NULL);
3845
3846 gsi_replace_with_seq (&gsi, seq, true);
3847}
3848
3849/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
3850
3851static void
3852nvptx_goacc_reduction_teardown (gcall *call)
3853{
3854 gimple_stmt_iterator gsi = gsi_for_stmt (call);
3855 tree lhs = gimple_call_lhs (call);
3856 tree var = gimple_call_arg (call, 2);
3857 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
3858 gimple_seq seq = NULL;
3859
3860 push_gimplify_context (true);
3861 if (level == GOMP_DIM_WORKER)
3862 {
3863 /* Read the worker reduction buffer. */
3864 tree offset = gimple_call_arg (call, 5);
3865 tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
3866 tree ptr = make_ssa_name (TREE_TYPE (call));
3867
3868 gimplify_assign (ptr, call, &seq);
3869 var = build_simple_mem_ref (ptr);
3870 TREE_THIS_VOLATILE (var) = 1;
3871 }
3872
3873 if (level != GOMP_DIM_GANG)
3874 {
3875 /* Write to the receiver object. */
3876 tree ref_to_res = gimple_call_arg (call, 1);
3877
3878 if (!integer_zerop (ref_to_res))
3879 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
3880 }
3881
3882 if (lhs)
3883 gimplify_assign (lhs, var, &seq);
3884
3885 pop_gimplify_context (NULL);
3886
3887 gsi_replace_with_seq (&gsi, seq, true);
3888}
3889
3890/* NVPTX reduction expander. */
3891
3892void
3893nvptx_goacc_reduction (gcall *call)
3894{
3895 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
3896
3897 switch (code)
3898 {
3899 case IFN_GOACC_REDUCTION_SETUP:
3900 nvptx_goacc_reduction_setup (call);
3901 break;
3902
3903 case IFN_GOACC_REDUCTION_INIT:
3904 nvptx_goacc_reduction_init (call);
3905 break;
3906
3907 case IFN_GOACC_REDUCTION_FINI:
3908 nvptx_goacc_reduction_fini (call);
3909 break;
3910
3911 case IFN_GOACC_REDUCTION_TEARDOWN:
3912 nvptx_goacc_reduction_teardown (call);
3913 break;
3914
3915 default:
3916 gcc_unreachable ();
3917 }
3918}
3919
738f2522
BS
/* Target hook table.  Each hook is #undef'ed (the defaults come from
   target-def.h via TARGET_INITIALIZER) then redefined to the nvptx
   implementation.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Argument passing and function values.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_FUNCTION_ARG_ROUND_BOUNDARY
#define TARGET_FUNCTION_ARG_ROUND_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN nvptx_static_chain

#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly output.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

/* Builtins.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

/* OpenACC.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"