1 /* Target code for NVPTX.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
26 #include "coretypes.h"
40 #include "diagnostic.h"
42 #include "insn-flags.h"
44 #include "insn-attr.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
58 #include "stor-layout.h"
60 #include "omp-general.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
73 #include "tree-phinodes.h"
75 #include "fold-const.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
85 /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
88 #define PTX_CTA_SIZE 1024
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
103 /* The various PTX memory areas an object might reside in. */
115 /* We record the data area in the target symbol flags. */
116 #define SYMBOL_DATA_AREA(SYM) \
117 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
119 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
122 /* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124 static std::stringstream func_decls
;
126 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
128 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
129 static bool equal (rtx a
, rtx b
) { return a
== b
; }
133 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
135 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
137 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
138 static bool equal (tree a
, tree b
) { return a
== b
; }
141 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
142 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
144 /* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150 static unsigned oacc_bcast_size
;
151 static unsigned oacc_bcast_partition
;
152 static unsigned oacc_bcast_align
;
153 static GTY(()) rtx oacc_bcast_sym
;
155 /* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157 static unsigned worker_red_size
;
158 static unsigned worker_red_align
;
159 static GTY(()) rtx worker_red_sym
;
161 /* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164 static unsigned vector_red_size
;
165 static unsigned vector_red_align
;
166 static unsigned vector_red_partition
;
167 static GTY(()) rtx vector_red_sym
;
169 /* Global lock variable, needed for 128bit worker & gang reductions. */
170 static GTY(()) tree global_lock_var
;
172 /* True if any function references __nvptx_stacks. */
173 static bool need_softstack_decl
;
175 /* True if any function references __nvptx_uni. */
176 static bool need_unisimt_decl
;
178 static int nvptx_mach_max_workers ();
180 /* Allocate a new, cleared machine_function structure. */
182 static struct machine_function
*
183 nvptx_init_machine_status (void)
185 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
186 p
->return_mode
= VOIDmode
;
190 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
194 diagnose_openacc_conflict (bool optval
, const char *optname
)
196 if (flag_openacc
&& optval
)
197 error ("option %s is not supported together with %<-fopenacc%>", optname
);
200 /* Implement TARGET_OPTION_OVERRIDE. */
203 nvptx_option_override (void)
205 init_machine_status
= nvptx_init_machine_status
;
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set
.x_flag_toplevel_reorder
)
211 flag_toplevel_reorder
= 1;
213 debug_nonbind_markers_p
= 0;
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
218 if (!global_options_set
.x_flag_no_common
)
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size
> 0)
223 sorry ("not generating patch area, nops not supported");
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking
= 0;
228 if (nvptx_optimize
< 0)
229 nvptx_optimize
= optimize
> 0;
231 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
232 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
233 declared_libfuncs_htab
234 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
236 oacc_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym
, DATA_AREA_SHARED
);
238 oacc_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
239 oacc_bcast_partition
= 0;
241 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_red");
242 SET_SYMBOL_DATA_AREA (worker_red_sym
, DATA_AREA_SHARED
);
243 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
245 vector_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym
, DATA_AREA_SHARED
);
247 vector_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
248 vector_red_partition
= 0;
250 diagnose_openacc_conflict (TARGET_GOMP
, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK
, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT
, "-muniform-simt");
255 target_flags
|= MASK_SOFT_STACK
| MASK_UNIFORM_SIMT
;
258 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx idiosyncrasies. */
262 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
297 /* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
301 nvptx_encode_section_info (tree decl
, rtx rtl
, int first
)
303 default_encode_section_info (decl
, rtl
, first
);
304 if (first
&& MEM_P (rtl
))
306 nvptx_data_area area
= DATA_AREA_GENERIC
;
308 if (TREE_CONSTANT (decl
))
309 area
= DATA_AREA_CONST
;
310 else if (TREE_CODE (decl
) == VAR_DECL
)
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl
)))
314 area
= DATA_AREA_SHARED
;
315 if (DECL_INITIAL (decl
))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl
);
320 area
= TREE_READONLY (decl
) ? DATA_AREA_CONST
: DATA_AREA_GLOBAL
;
323 SET_SYMBOL_DATA_AREA (XEXP (rtl
, 0), area
);
327 /* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_section_info, or equivalent. */
332 section_for_sym (rtx sym
)
334 nvptx_data_area area
= SYMBOL_DATA_AREA (sym
);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas
[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
342 /* Similarly for a decl: return whatever section_for_sym yields for
   the symbol found in operand 0 of DECL's DECL_RTL.  DECL is
   const-qualified here, hence the CONST_CAST to tree before DECL_RTL
   is applied.  */
345 section_for_decl (const_tree decl
)
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree
, decl
)), 0));
350 /* Check NAME for special function names and redirect them by returning a
351 replacement. This applies to malloc, free and realloc, for which we
352 want to use libgcc wrappers, and call, which triggers a bug in
353 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
354 not active in an offload compiler -- the names are all set by the
355 host-side compiler. */
358 nvptx_name_replacement (const char *name
)
360 if (strcmp (name
, "call") == 0)
361 return "__nvptx_call";
362 if (strcmp (name
, "malloc") == 0)
363 return "__nvptx_malloc";
364 if (strcmp (name
, "free") == 0)
365 return "__nvptx_free";
366 if (strcmp (name
, "realloc") == 0)
367 return "__nvptx_realloc";
371 /* If MODE should be treated as two registers of an inner mode, return
372 that inner mode. Otherwise return VOIDmode. */
375 maybe_split_mode (machine_mode mode
)
377 if (COMPLEX_MODE_P (mode
))
378 return GET_MODE_INNER (mode
);
386 /* Return true if mode should be treated as two registers. */
389 split_mode_p (machine_mode mode
)
391 return maybe_split_mode (mode
) != VOIDmode
;
394 /* Output a register, subreg, or register pair (with optional
395 enclosing braces). */
398 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
399 int subreg_offset
= -1)
401 if (inner_mode
== VOIDmode
)
403 if (HARD_REGISTER_NUM_P (regno
))
404 fprintf (file
, "%s", reg_names
[regno
]);
406 fprintf (file
, "%%r%d", regno
);
408 else if (subreg_offset
>= 0)
410 output_reg (file
, regno
, VOIDmode
);
411 fprintf (file
, "$%d", subreg_offset
);
415 if (subreg_offset
== -1)
417 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
419 output_reg (file
, regno
, inner_mode
, 0);
420 if (subreg_offset
== -1)
425 /* Emit forking instructions for MASK. */
428 nvptx_emit_forking (unsigned mask
, bool is_call
)
430 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
431 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
434 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
436 /* Emit fork at all levels. This helps form SESE regions, as
437 it creates a block with a single successor before entering a
438 partitioned region. That is a good candidate for the end of
440 emit_insn (gen_nvptx_fork (op
));
441 emit_insn (gen_nvptx_forked (op
));
445 /* Emit joining instructions for MASK. */
448 nvptx_emit_joining (unsigned mask
, bool is_call
)
450 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
454 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
456 /* Emit joining for all non-call pars to ensure there's a single
457 predecessor for the block the join insn ends up in. This is
458 needed for skipping entire loops. */
459 emit_insn (gen_nvptx_joining (op
));
460 emit_insn (gen_nvptx_join (op
));
465 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
466 returned in memory. Integer and floating types supported by the
467 machine are passed in registers, everything else is passed in
468 memory. Complex types are split. */
471 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
475 if (AGGREGATE_TYPE_P (type
))
477 if (TREE_CODE (type
) == VECTOR_TYPE
)
481 if (!for_return
&& COMPLEX_MODE_P (mode
))
482 /* Complex types are passed as two underlying args. */
483 mode
= GET_MODE_INNER (mode
);
485 if (GET_MODE_CLASS (mode
) != MODE_INT
486 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
489 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
495 /* A non-memory argument of mode MODE is being passed, determine the mode it
496 should be promoted to. This is also used for determining return
500 promote_arg (machine_mode mode
, bool prototyped
)
502 if (!prototyped
&& mode
== SFmode
)
503 /* K&R float promotion for unprototyped functions. */
505 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
511 /* A non-memory return type of MODE is being returned. Determine the
512 mode it should be promoted to. */
515 promote_return (machine_mode mode
)
517 return promote_arg (mode
, true);
520 /* Implement TARGET_FUNCTION_ARG. */
523 nvptx_function_arg (cumulative_args_t
, const function_arg_info
&arg
)
525 if (arg
.end_marker_p () || !arg
.named
)
528 return gen_reg_rtx (arg
.mode
);
531 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
534 nvptx_function_incoming_arg (cumulative_args_t cum_v
,
535 const function_arg_info
&arg
)
537 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
539 if (arg
.end_marker_p () || !arg
.named
)
542 /* No need to deal with split modes here, the only case that can
543 happen is complex modes and those are dealt with by
544 TARGET_SPLIT_COMPLEX_ARG. */
545 return gen_rtx_UNSPEC (arg
.mode
,
546 gen_rtvec (1, GEN_INT (cum
->count
)),
550 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
553 nvptx_function_arg_advance (cumulative_args_t cum_v
, const function_arg_info
&)
555 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
560 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
562 For nvptx this is only used for variadic args. The type has already
563 been promoted and/or converted to invisible reference. */
566 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
568 return GET_MODE_ALIGNMENT (mode
);
571 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
573 For nvptx, we know how to handle functions declared as stdarg: by
574 passing an extra pointer to the unnamed arguments. However, the
575 Fortran frontend can produce a different situation, where a
576 function pointer is declared with no arguments, but the actual
577 function and calls to it take more arguments. In that case, we
578 want to ensure the call matches the definition of the function. */
581 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
583 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
585 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
588 /* Implement TARGET_LIBCALL_VALUE. */
591 nvptx_libcall_value (machine_mode mode
, const_rtx
)
593 if (!cfun
|| !cfun
->machine
->doing_call
)
594 /* Pretend to return in a hard reg for early uses before pseudos can be
596 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
598 return gen_reg_rtx (mode
);
601 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
602 where function FUNC returns or receives a value of data type TYPE. */
605 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
608 machine_mode mode
= promote_return (TYPE_MODE (type
));
613 cfun
->machine
->return_mode
= mode
;
614 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
617 return nvptx_libcall_value (mode
, NULL_RTX
);
620 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
623 nvptx_function_value_regno_p (const unsigned int regno
)
625 return regno
== NVPTX_RETURN_REGNUM
;
628 /* Types with a mode other than those supported by the machine are passed by
629 reference in memory. */
632 nvptx_pass_by_reference (cumulative_args_t
, const function_arg_info
&arg
)
634 return pass_in_memory (arg
.mode
, arg
.type
, false);
637 /* Implement TARGET_RETURN_IN_MEMORY. */
640 nvptx_return_in_memory (const_tree type
, const_tree
)
642 return pass_in_memory (TYPE_MODE (type
), type
, true);
645 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
648 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
649 int *ARG_UNUSED (punsignedp
),
650 const_tree funtype
, int for_return
)
652 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
655 /* Helper for write_arg. Emit a single PTX argument of MODE, either
656 in a prototype, or as copy in a function prologue. ARGNO is the
657 index of this argument in the PTX function. FOR_REG is negative,
658 if we're emitting the PTX prototype. It is zero if we're copying
659 to an argument register and it is greater than zero if we're
660 copying to a specific hard register. */
663 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
666 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
670 /* Writing PTX prototype. */
671 s
<< (argno
? ", " : " (");
672 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
676 s
<< "\t.reg" << ptx_type
<< " ";
678 s
<< reg_names
[for_reg
];
684 s
<< "\tld.param" << ptx_type
<< " ";
686 s
<< reg_names
[for_reg
];
689 s
<< ", [%in_ar" << argno
<< "];\n";
695 /* Process function parameter TYPE to emit one or more PTX
696 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
697 is true, if this is a prototyped function, rather than an old-style
698 C declaration. Returns the next argument number to use.
700 The promotion behavior here must match the regular GCC function
701 parameter marshalling machinery. */
704 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
705 tree type
, bool prototyped
)
707 machine_mode mode
= TYPE_MODE (type
);
709 if (mode
== VOIDmode
)
712 if (pass_in_memory (mode
, type
, false))
716 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
720 /* Complex types are sent as two separate args. */
721 type
= TREE_TYPE (type
);
722 mode
= TYPE_MODE (type
);
726 mode
= promote_arg (mode
, prototyped
);
728 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
731 return write_arg_mode (s
, for_reg
, argno
, mode
);
734 /* Emit a PTX return as a prototype or function prologue declaration
738 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
740 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
741 const char *pfx
= "\t.reg";
742 const char *sfx
= ";\n";
745 pfx
= "(.param", sfx
= "_out) ";
747 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
750 /* Process a function return TYPE to emit a PTX return as a prototype
751 or function prologue declaration. Returns true if return is via an
752 additional pointer parameter. The promotion behavior here must
753 match the regular GCC function return marshalling. */
756 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
758 machine_mode mode
= TYPE_MODE (type
);
760 if (mode
== VOIDmode
)
763 bool return_in_mem
= pass_in_memory (mode
, type
, true);
768 return return_in_mem
;
770 /* Named return values can cause us to return a pointer as well
771 as expect an argument for the return location. This is
772 optimization-level specific, so no caller can make use of
773 this data, but more importantly for us, we must ensure it
774 doesn't change the PTX prototype. */
775 mode
= (machine_mode
) cfun
->machine
->return_mode
;
777 if (mode
== VOIDmode
)
778 return return_in_mem
;
780 /* Clear return_mode to inhibit copy of retval to non-existent
782 cfun
->machine
->return_mode
= VOIDmode
;
785 mode
= promote_return (mode
);
787 write_return_mode (s
, for_proto
, mode
);
789 return return_in_mem
;
792 /* Look for attributes in ATTRS that would indicate we must write a function
793 as a .entry kernel rather than a .func. Return true if one is found. */
796 write_as_kernel (tree attrs
)
798 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
799 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
800 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
801 /* For OpenMP target regions, the corresponding kernel entry is emitted from
802 write_omp_entry as a separate function. */
805 /* Emit a linker marker for a function decl or defn. */
808 write_fn_marker (std::stringstream
&s
, bool is_defn
, bool globalize
,
814 s
<< " FUNCTION " << (is_defn
? "DEF: " : "DECL: ");
818 /* Emit a linker marker for a variable decl or defn.  The marker is a
   "// BEGIN" line written to FILE on a fresh line: " GLOBAL" is
   added when GLOBALIZE is set, " VAR DEF: " or " VAR DECL: " is
   chosen by IS_DEFN, and NAME is then appended through
   assemble_name_raw.  */
821 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
823 fprintf (file
, "\n// BEGIN%s VAR %s: ",
824 globalize
? " GLOBAL" : "",
825 is_defn
? "DEF" : "DECL");
826 assemble_name_raw (file
, name
);
830 /* Write a .func or .entry declaration or definition along with
831 a helper comment for use by ld. S is the stream to write to, DECL
832 the decl for the function with name NAME. For definitions, emit
833 a declaration too. */
836 write_fn_proto (std::stringstream
&s
, bool is_defn
,
837 const char *name
, const_tree decl
)
840 /* Emit a declaration. The PTX assembler gets upset without it. */
841 name
= write_fn_proto (s
, false, name
, decl
);
844 /* Avoid repeating the name replacement. */
845 name
= nvptx_name_replacement (name
);
850 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
), name
);
852 /* PTX declaration. */
853 if (DECL_EXTERNAL (decl
))
855 else if (TREE_PUBLIC (decl
))
856 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
857 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
859 tree fntype
= TREE_TYPE (decl
);
860 tree result_type
= TREE_TYPE (fntype
);
862 /* atomic_compare_exchange_$n builtins have an exceptional calling
864 int not_atomic_weak_arg
= -1;
865 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
866 switch (DECL_FUNCTION_CODE (decl
))
868 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
869 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
870 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
871 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
872 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
873 /* These atomics skip the 'weak' parm in an actual library
874 call. We must skip it in the prototype too. */
875 not_atomic_weak_arg
= 3;
882 /* Declare the result. */
883 bool return_in_mem
= write_return_type (s
, true, result_type
);
889 /* Emit argument list. */
891 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
894 NULL in TYPE_ARG_TYPES, for old-style functions
895 NULL in DECL_ARGUMENTS, for builtin functions without another
897 So we have to pick the best one we have. */
898 tree args
= TYPE_ARG_TYPES (fntype
);
899 bool prototyped
= true;
902 args
= DECL_ARGUMENTS (decl
);
906 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
908 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
910 if (not_atomic_weak_arg
)
911 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
913 gcc_assert (type
== boolean_type_node
);
916 if (stdarg_p (fntype
))
917 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
919 if (DECL_STATIC_CHAIN (decl
))
920 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
922 if (!argno
&& strcmp (name
, "main") == 0)
924 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
925 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
931 s
<< (is_defn
? "\n" : ";\n");
936 /* Construct a function declaration from a call insn. This can be
937 necessary for two reasons - either we have an indirect call which
938 requires a .callprototype declaration, or we have a libcall
939 generated by emit_library_call for which no decl exists. */
942 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
947 s
<< "\t.callprototype ";
952 name
= nvptx_name_replacement (name
);
953 write_fn_marker (s
, false, true, name
);
954 s
<< "\t.extern .func ";
957 if (result
!= NULL_RTX
)
958 write_return_mode (s
, true, GET_MODE (result
));
962 int arg_end
= XVECLEN (pat
, 0);
963 for (int i
= 1; i
< arg_end
; i
++)
965 /* We don't have to deal with mode splitting & promotion here,
966 as that was already done when generating the call
968 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
970 write_arg_mode (s
, -1, i
- 1, mode
);
977 /* DECL is an external FUNCTION_DECL, make sure it's in the fndecl hash
978 table and write a ptx prototype. These are emitted at end of
982 nvptx_record_fndecl (tree decl
)
984 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
988 const char *name
= get_fnname_from_decl (decl
);
989 write_fn_proto (func_decls
, false, name
, decl
);
993 /* Record a libcall or unprototyped external function. CALLEE is the
994 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
995 declaration for it. */
998 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
1000 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
1005 const char *name
= XSTR (callee
, 0);
1006 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
1010 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1011 is prototyped, record it now. Otherwise record it as needed at end
1012 of compilation, when we might have more information about it. */
1015 nvptx_record_needed_fndecl (tree decl
)
1017 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
1019 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
1024 nvptx_record_fndecl (decl
);
1027 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1031 nvptx_maybe_record_fnsym (rtx sym
)
1033 tree decl
= SYMBOL_REF_DECL (sym
);
1035 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1036 nvptx_record_needed_fndecl (decl
);
1039 /* Emit a local array to hold some part of a conventional stack frame
1040 and initialize REGNO to point to it. If the size is zero, it'll
1041 never be valid to dereference, so we can simply initialize to
1045 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1048 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1049 align
, reg_names
[regno
], size
);
1050 fprintf (file
, "\t.reg.u%d %s;\n",
1051 POINTER_SIZE
, reg_names
[regno
]);
1052 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1053 : "\tmov.u%d %s, 0;\n"),
1054 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
1057 /* Emit soft stack frame setup sequence. */
1060 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1062 /* Maintain 64-bit stack alignment. */
1063 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1064 size
= ROUND_UP (size
, keep_align
);
1065 int bits
= POINTER_SIZE
;
1066 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1067 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1068 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1069 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1070 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1071 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1072 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1073 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1074 fprintf (file
, "\t{\n");
1075 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1076 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1077 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1078 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1079 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1080 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1081 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1083 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1084 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1086 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1087 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1088 bits
, reg_sspprev
, reg_sspslot
);
1090 /* Initialize %frame = %sspprev - size. */
1091 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1092 bits
, reg_frame
, reg_sspprev
, size
);
1094 /* Apply alignment, if larger than 64. */
1095 if (alignment
> keep_align
)
1096 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1097 bits
, reg_frame
, reg_frame
, -alignment
);
1099 size
= crtl
->outgoing_args_size
;
1100 gcc_assert (size
% keep_align
== 0);
1102 /* Initialize %stack. */
1103 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1104 bits
, reg_stack
, reg_frame
, size
);
1107 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1108 bits
, reg_sspslot
, reg_stack
);
1109 fprintf (file
, "\t}\n");
1110 cfun
->machine
->has_softstack
= true;
1111 need_softstack_decl
= true;
1114 /* Emit code to initialize the REGNO predicate register to indicate
1115 whether we are not lane zero on the NAME axis. */
/* The emitted PTX is wrapped in its own "{ ... }" group and uses a
   temporary register named after NAME.  FILE receives the text.  */
1118 nvptx_init_axis_predicate (FILE *file
, int regno
, const char *name
)
1120 fprintf (file
, "\t{\n");
1121 fprintf (file
, "\t\t.reg.u32\t%%%s;\n", name
);
/* Two extra 64-bit temporaries are declared only for the "x" axis, and
   only when the function has a red_partition register.  */
1122 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1124 fprintf (file
, "\t\t.reg.u64\t%%t_red;\n");
1125 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
/* Copy %tid.NAME into the temporary, then set predicate %rREGNO when
   that value is nonzero.  */
1127 fprintf (file
, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name
, name
);
1128 fprintf (file
, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno
, name
);
/* Same condition as above: load the red_partition register with
   %t_red + %tid.y * vector_red_partition, where %t_red is obtained
   from __vector_red by cvta.shared.  */
1129 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1131 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1132 fprintf (file
, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1133 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1134 "// vector reduction buffer\n",
1135 REGNO (cfun
->machine
->red_partition
),
1136 vector_red_partition
);
1138 /* Verify vector_red_size. */
/* That is, one vector_red_partition per worker, for the count given by
   nvptx_mach_max_workers, must fit within vector_red_size.  */
1139 gcc_assert (vector_red_partition
* nvptx_mach_max_workers ()
1140 <= vector_red_size
);
1141 fprintf (file
, "\t}\n");
1144 /* Emit code to initialize OpenACC worker broadcast and synchronization
1148 nvptx_init_oacc_workers (FILE *file
)
1150 fprintf (file
, "\t{\n");
1151 fprintf (file
, "\t\t.reg.u32\t%%tidy;\n");
1152 if (cfun
->machine
->bcast_partition
)
1154 fprintf (file
, "\t\t.reg.u64\t%%t_bcast;\n");
1155 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1157 fprintf (file
, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1158 if (cfun
->machine
->bcast_partition
)
1160 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1161 fprintf (file
, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1162 fprintf (file
, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1163 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1164 "// vector broadcast offset\n",
1165 REGNO (cfun
->machine
->bcast_partition
),
1166 oacc_bcast_partition
);
1168 /* Verify oacc_bcast_size. */
1169 gcc_assert (oacc_bcast_partition
* (nvptx_mach_max_workers () + 1)
1170 <= oacc_bcast_size
);
1171 if (cfun
->machine
->sync_bar
)
1172 fprintf (file
, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1173 "// vector synchronization barrier\n",
1174 REGNO (cfun
->machine
->sync_bar
));
1175 fprintf (file
, "\t}\n");
1178 /* Emit code to initialize predicate and master lane index registers for
1179 -muniform-simt code generation variant. */
/* Allocate a Pmode pseudo for cfun->machine->unisimt_location and print to
   FILE the PTX that sets it to the address __nvptx_uni + %tid.y * 4.
   If cfun->machine->unisimt_predicate is set, also print the code that
   initializes the master and predicate registers.  Sets need_unisimt_decl.  */
1182 nvptx_init_unisimt_predicate (FILE *file
)
1184 cfun
->machine
->unisimt_location
= gen_reg_rtx (Pmode
);
1185 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1186 int bits
= POINTER_SIZE
;
1187 fprintf (file
, "\t.reg.u%d %%r%d;\n", bits
, loc
);
1188 fprintf (file
, "\t{\n");
1189 fprintf (file
, "\t\t.reg.u32 %%ustmp0;\n");
1190 fprintf (file
, "\t\t.reg.u%d %%ustmp1;\n", bits
);
1191 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
/* %ustmp1 is BITS wide, so a 64-bit pointer needs the widening multiply.  */
1192 fprintf (file
, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1193 bits
== 64 ? ".wide" : ".lo");
1194 fprintf (file
, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits
, loc
);
1195 fprintf (file
, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits
, loc
, loc
);
1196 if (cfun
->machine
->unisimt_predicate
)
1198 int master
= REGNO (cfun
->machine
->unisimt_master
);
1199 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
/* MASTER first receives the 32-bit value stored at the computed slot.  */
1200 fprintf (file
, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master
, loc
);
1201 fprintf (file
, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1202 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1203 fprintf (file
, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master
, master
);
1204 /* Compute predicate as 'tid.x == master'. */
1205 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred
, master
);
1207 fprintf (file
, "\t}\n");
1208 need_unisimt_decl
= true;
1211 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1213 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1214 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1216 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1217 __nvptx_uni[tid.y] = 0;
1218 gomp_nvptx_main (ORIG, arg);
1220 ORIG itself should not be emitted as a PTX .entry function. */
/* Print to FILE the PTX .entry kernel NAME.  The emitted code stores this
   thread's stack pointer into its __nvptx_stacks slot, stores %r0 into its
   __nvptx_uni slot, and calls gomp_nvptx_main with ORIG and the kernel's
   %arg parameter.  The first call also appends an extern declaration of
   gomp_nvptx_main to func_decls.  Sets need_softstack_decl and
   need_unisimt_decl.  */
1223 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1225 static bool gomp_nvptx_main_declared
;
1226 if (!gomp_nvptx_main_declared
)
1228 gomp_nvptx_main_declared
= true;
1229 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1230 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1231 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1233 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1234 #define NTID_Y "%ntid.y"
1235 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1236 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1239 .reg.u" PS " %R<4>;\n\
1240 mov.u32 %r0, %tid.y;\n\
1241 mov.u32 %r1, " NTID_Y ";\n\
1242 mov.u32 %r2, %ctaid.x;\n\
1243 cvt.u" PS ".u32 %R1, %r0;\n\
1244 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1245 mov.u" PS " %R0, __nvptx_stacks;\n\
1246 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1247 ld.param.u" PS " %R2, [%stack];\n\
1248 ld.param.u" PS " %R3, [%sz];\n\
1249 add.u" PS " %R2, %R2, %R3;\n\
1250 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1251 st.shared.u" PS " [%R0], %R2;\n\
1252 mov.u" PS " %R0, __nvptx_uni;\n\
1253 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1255 st.shared.u32 [%R0], %r0;\n\
1256 mov.u" PS " %R0, \0;\n\
1257 ld.param.u" PS " %R1, [%arg];\n\
1259 .param.u" PS " %P<2>;\n\
1260 st.param.u" PS " [%P0], %R0;\n\
1261 st.param.u" PS " [%P1], %R1;\n\
1262 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1266 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1267 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1268 #undef ENTRY_TEMPLATE
/* Each template holds an embedded nul: the text before it is printed ahead
   of ORIG and the text after it is printed behind ORIG.  */
1270 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1271 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
/* Measure the template that was actually selected, so the offset stays
   right even if the two prefixes are not padded to the same length.  */
1272 const char *entry_2
= entry_1
+ strlen (entry_1
) + 1;
1273 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1274 need_softstack_decl
= need_unisimt_decl
= true;
1277 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1278 function, including local var decls and copies from the arguments to
/* Print to FILE the opening of the PTX function NAME for DECL: the
   prototype and argument declarations (built in a string stream), the
   frame or soft-stack setup, the optional %simtstack_ar array, a .reg
   declaration for each pseudo, and finally the axis-predicate, unisimt
   and OpenACC worker initialization code.  */
1282 nvptx_declare_function_name (FILE *file
, const char *name
, const_tree decl
)
1284 tree fntype
= TREE_TYPE (decl
);
1285 tree result_type
= TREE_TYPE (fntype
);
/* A function marked "omp target entrypoint" but not "oacc function" gets a
   kernel wrapper; write_omp_entry is given NAME and NAME$impl.  */
1288 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl
))
1289 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl
)))
/* sizeof ("$impl") counts the terminating nul, so BUF is exactly big enough.  */
1291 char *buf
= (char *) alloca (strlen (name
) + sizeof ("$impl"));
1292 sprintf (buf
, "%s$impl", name
);
1293 write_omp_entry (file
, name
, buf
);
1296 /* We construct the initial part of the function into a string
1297 stream, in order to share the prototype writing code. */
1298 std::stringstream s
;
1299 write_fn_proto (s
, true, name
, decl
);
1302 bool return_in_mem
= write_return_type (s
, false, result_type
);
1304 argno
= write_arg_type (s
, 0, argno
, ptr_type_node
, true);
1306 /* Declare and initialize incoming arguments. */
1307 tree args
= TYPE_ARG_TYPES (fntype
);
1308 bool prototyped
= true;
1311 args
= DECL_ARGUMENTS (decl
);
1315 for (; args
!= NULL_TREE
; args
= TREE_CHAIN (args
))
/* A prototyped list holds types in TREE_VALUE; otherwise ARGS are the
   argument decls themselves and their TREE_TYPE is used.  */
1317 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
1319 argno
= write_arg_type (s
, 0, argno
, type
, prototyped
);
1322 if (stdarg_p (fntype
))
1323 argno
= write_arg_type (s
, ARG_POINTER_REGNUM
, argno
, ptr_type_node
,
1326 if (DECL_STATIC_CHAIN (decl
) || cfun
->machine
->has_chain
)
1327 write_arg_type (s
, STATIC_CHAIN_REGNUM
,
1328 DECL_STATIC_CHAIN (decl
) ? argno
: -1, ptr_type_node
,
1331 fprintf (file
, "%s", s
.str().c_str());
1333 /* Usually 'crtl->is_leaf' is computed during register allocator
1334 initialization (which is not done on NVPTX) or for pressure-sensitive
1335 optimizations. Initialize it here, except if already set. */
1337 crtl
->is_leaf
= leaf_function_p ();
1339 HOST_WIDE_INT sz
= get_frame_size ();
1340 bool need_frameptr
= sz
|| cfun
->machine
->has_chain
;
/* stack_alignment_needed is divided by BITS_PER_UNIT to get bytes.  */
1341 int alignment
= crtl
->stack_alignment_needed
/ BITS_PER_UNIT
;
1342 if (!TARGET_SOFT_STACK
)
1344 /* Declare a local var for outgoing varargs. */
1345 if (cfun
->machine
->has_varadic
)
1346 init_frame (file
, STACK_POINTER_REGNUM
,
1347 UNITS_PER_WORD
, crtl
->outgoing_args_size
);
1349 /* Declare a local variable for the frame. Force its size to be
1350 DImode-compatible. */
1352 init_frame (file
, FRAME_POINTER_REGNUM
, alignment
,
1353 ROUND_UP (sz
, GET_MODE_SIZE (DImode
)));
1355 else if (need_frameptr
|| cfun
->machine
->has_varadic
|| cfun
->calls_alloca
1356 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1357 init_softstack_frame (file
, alignment
, sz
);
1359 if (cfun
->machine
->has_simtreg
)
/* SIMTSZ and ALIGN are references: the adjustments below are written back
   into cfun->machine.  */
1361 unsigned HOST_WIDE_INT
&simtsz
= cfun
->machine
->simt_stack_size
;
1362 unsigned HOST_WIDE_INT
&align
= cfun
->machine
->simt_stack_align
;
1363 align
= MAX (align
, GET_MODE_SIZE (DImode
));
/* HOST_WIDE_INT_M1U acts as a marker that is replaced by
   nvptx_softstack_size just below.  */
1364 if (!crtl
->is_leaf
|| cfun
->calls_alloca
)
1365 simtsz
= HOST_WIDE_INT_M1U
;
1366 if (simtsz
== HOST_WIDE_INT_M1U
)
1367 simtsz
= nvptx_softstack_size
;
/* Add room for one pointer when the function uses the soft stack.  */
1368 if (cfun
->machine
->has_softstack
)
1369 simtsz
+= POINTER_SIZE
/ 8;
1370 simtsz
= ROUND_UP (simtsz
, GET_MODE_SIZE (DImode
));
/* The array is declared with .align 8 only, so add slack when a larger
   alignment was requested.  */
1371 if (align
> GET_MODE_SIZE (DImode
))
1372 simtsz
+= align
- GET_MODE_SIZE (DImode
);
1374 fprintf (file
, "\t.local.align 8 .b8 %%simtstack_ar["
1375 HOST_WIDE_INT_PRINT_DEC
"];\n", simtsz
);
1378 /* Restore the vector reduction partition register, if necessary.
1379 FIXME: Find out when and why this is necessary, and fix it. */
1380 if (cfun
->machine
->red_partition
)
1381 regno_reg_rtx
[REGNO (cfun
->machine
->red_partition
)]
1382 = cfun
->machine
->red_partition
;
1384 /* Declare the pseudos we have as ptx registers. */
1385 int maxregs
= max_reg_num ();
1386 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< maxregs
; i
++)
/* Entries equal to const0_rtx get no declaration.  */
1388 if (regno_reg_rtx
[i
] != const0_rtx
)
1390 machine_mode mode
= PSEUDO_REGNO_MODE (i
);
1391 machine_mode split
= maybe_split_mode (mode
);
1393 if (split_mode_p (mode
))
1395 fprintf (file
, "\t.reg%s ", nvptx_ptx_type_from_mode (mode
, true));
1396 output_reg (file
, i
, split
, -2);
1397 fprintf (file
, ";\n");
1401 /* Emit axis predicates. */
1402 if (cfun
->machine
->axis_predicate
[0])
1403 nvptx_init_axis_predicate (file
,
1404 REGNO (cfun
->machine
->axis_predicate
[0]), "y");
1405 if (cfun
->machine
->axis_predicate
[1])
1406 nvptx_init_axis_predicate (file
,
1407 REGNO (cfun
->machine
->axis_predicate
[1]), "x");
1408 if (cfun
->machine
->unisimt_predicate
1409 || (cfun
->machine
->has_simtreg
&& !crtl
->is_leaf
))
1410 nvptx_init_unisimt_predicate (file
);
1411 if (cfun
->machine
->bcast_partition
|| cfun
->machine
->sync_bar
)
1412 nvptx_init_oacc_workers (file
);
1415 /* Output code for switching uniform-simt state. ENTERING indicates whether
1416 we are entering or leaving non-uniform execution region. */
/* Print to FILE a braced PTX sequence that uses the scratch register
   %ustmp2.  It is loaded with -1 when ENTERING and 0 otherwise and stored
   to the shared-memory word addressed by unisimt_location.  When a
   unisimt predicate register exists, the master register is set to
   %laneid (entering) or 0 (leaving) and the predicate is recomputed.  */
1419 nvptx_output_unisimt_switch (FILE *file
, bool entering
)
/* A leaf function without a unisimt predicate is tested for first.  */
1421 if (crtl
->is_leaf
&& !cfun
->machine
->unisimt_predicate
)
1423 fprintf (file
, "\t{\n");
1424 fprintf (file
, "\t\t.reg.u32 %%ustmp2;\n");
1425 fprintf (file
, "\t\tmov.u32 %%ustmp2, %d;\n", entering
? -1 : 0);
1428 int loc
= REGNO (cfun
->machine
->unisimt_location
);
1429 fprintf (file
, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc
);
1431 if (cfun
->machine
->unisimt_predicate
)
1433 int master
= REGNO (cfun
->machine
->unisimt_master
);
1434 int pred
= REGNO (cfun
->machine
->unisimt_predicate
);
/* %ustmp2 is reused here to hold the lane id.  */
1435 fprintf (file
, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1436 fprintf (file
, "\t\tmov.u32 %%r%d, %s;\n",
1437 master
, entering
? "%ustmp2" : "0");
1438 fprintf (file
, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred
, master
);
1440 fprintf (file
, "\t}\n");
1443 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1444 ENTERING indicates whether we are entering or leaving non-uniform execution.
1445 PTR is the register pointing to allocated storage, it is assigned to on
1446 entering and used to restore state on leaving. SIZE and ALIGN are used only
/* Print to FILE a braced PTX sequence that computes in PTR an address
   carved from the top of %simtstack_ar (SIZE bytes below its end, masked
   down to ALIGN where needed) and, when the function uses the soft stack,
   saves or reloads the stack pointer in the word just below PTR.  PTR must
   be a pseudo register.  */
1450 nvptx_output_softstack_switch (FILE *file
, bool entering
,
1451 rtx ptr
, rtx size
, rtx align
)
1453 gcc_assert (REG_P (ptr
) && !HARD_REGISTER_P (ptr
));
1454 if (crtl
->is_leaf
&& !cfun
->machine
->simt_stack_size
)
1456 int bits
= POINTER_SIZE
, regno
= REGNO (ptr
);
1457 fprintf (file
, "\t{\n");
/* Start from the generic address of the end of the array.  */
1460 fprintf (file
, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1461 HOST_WIDE_INT_PRINT_DEC
";\n", bits
, regno
,
1462 cfun
->machine
->simt_stack_size
);
1463 fprintf (file
, "\t\tsub.u%d %%r%d, %%r%d, ", bits
, regno
, regno
);
/* A constant SIZE is rounded up to a DImode multiple; a register SIZE is
   printed as is.  */
1464 if (CONST_INT_P (size
))
1465 fprintf (file
, HOST_WIDE_INT_PRINT_DEC
,
1466 ROUND_UP (UINTVAL (size
), GET_MODE_SIZE (DImode
)));
1468 output_reg (file
, REGNO (size
), VOIDmode
);
1469 fputs (";\n", file
);
/* Mask the address down when SIZE is not constant or ALIGN exceeds the
   DImode size.  */
1470 if (!CONST_INT_P (size
) || UINTVAL (align
) > GET_MODE_SIZE (DImode
))
1472 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC
";\n",
1473 bits
, regno
, regno
, UINTVAL (align
));
1475 if (cfun
->machine
->has_softstack
)
1477 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
/* Store the current stack pointer one pointer below PTR, then move the
   stack pointer to that slot.  */
1480 fprintf (file
, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1481 bits
, regno
, bits
/ 8, reg_stack
);
1482 fprintf (file
, "\t\tsub.u%d %s, %%r%d, %d;\n",
1483 bits
, reg_stack
, regno
, bits
/ 8);
/* Reload the stack pointer from the slot below PTR.  */
1487 fprintf (file
, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1488 bits
, reg_stack
, regno
, bits
/ 8);
1490 nvptx_output_set_softstack (REGNO (stack_pointer_rtx
));
1492 fprintf (file
, "\t}\n");
1495 /* Output code to enter non-uniform execution region. DEST is a register
1496 to hold a per-lane allocation given by SIZE and ALIGN. */
/* Write to asm_out_file the unisimt switch followed by the soft-stack
   switch, both in their 'entering' form.  DEST, SIZE and ALIGN are passed
   on to nvptx_output_softstack_switch unchanged.  */
1499 nvptx_output_simt_enter (rtx dest
, rtx size
, rtx align
)
1501 nvptx_output_unisimt_switch (asm_out_file
, true);
1502 nvptx_output_softstack_switch (asm_out_file
, true, dest
, size
, align
);
1506 /* Output code to leave non-uniform execution region. SRC is the register
1507 holding per-lane storage previously allocated by omp_simt_enter insn. */
/* Write to asm_out_file the unisimt switch followed by the soft-stack
   switch, both in their 'leaving' form.  No size or alignment is needed
   on this path, so NULL_RTX is passed for both.  */
1510 nvptx_output_simt_exit (rtx src
)
1512 nvptx_output_unisimt_switch (asm_out_file
, false);
1513 nvptx_output_softstack_switch (asm_out_file
, false, src
, NULL_RTX
, NULL_RTX
);
1517 /* Output instruction that sets soft stack pointer in shared memory to the
1518 value in register given by SRC_REGNO. */
/* Write to asm_out_file a pointer-sized st.shared of register SRC_REGNO
   to the address held in the SOFTSTACK_SLOT_REGNUM register.  Nothing is
   written unless the function uses the soft stack and is not a leaf.  */
1521 nvptx_output_set_softstack (unsigned src_regno
)
1523 if (cfun
->machine
->has_softstack
&& !crtl
->is_leaf
)
1525 fprintf (asm_out_file
, "\tst.shared.u%d\t[%s], ",
1526 POINTER_SIZE
, reg_names
[SOFTSTACK_SLOT_REGNUM
]);
1527 output_reg (asm_out_file
, src_regno
, VOIDmode
);
1528 fprintf (asm_out_file
, ";\n");
1532 /* Output a return instruction. Also copy the return value to its outgoing
/* When the function's recorded return mode is not VOIDmode, write to
   asm_out_file an st.param that copies the return register into the
   parameter named after it with an "_out" suffix.  */
1536 nvptx_output_return (void)
1538 machine_mode mode
= (machine_mode
)cfun
->machine
->return_mode
;
1540 if (mode
!= VOIDmode
)
1541 fprintf (asm_out_file
, "\tst.param%s\t[%s_out], %s;\n",
1542 nvptx_ptx_type_from_mode (mode
, false),
1543 reg_names
[NVPTX_RETURN_REGNUM
],
1544 reg_names
[NVPTX_RETURN_REGNUM
]);
1549 /* Terminate a function by writing a closing brace to FILE. */
/* Print the closing brace of the function body, followed by a newline,
   to FILE.  */
1552 nvptx_function_end (FILE *file
)
1554 fprintf (file
, "}\n");
1557 /* Decide whether we can make a sibling call to a function. For ptx, we
/* Both tree parameters are unnamed, so the result cannot depend on them.
   NOTE(review): the body is not visible in this excerpt -- confirm the
   returned value against the full file.  */
1561 nvptx_function_ok_for_sibcall (tree
, tree
)
1566 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
/* With the soft stack enabled and stack_realign_drap set, the argument
   pointer is returned as the DRAP.  */
1569 nvptx_get_drap_rtx (void)
1571 if (TARGET_SOFT_STACK
&& stack_realign_drap
)
1572 return arg_pointer_rtx
;
1576 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1577 argument to the next call. */
/* Record ARG for the call being built.  The first invocation for a call
   resets the per-call state in cfun->machine; a stdarg FNTYPE marks the
   call as variadic and reserves one extra argument slot.  Each register
   ARG other than pc_rtx is counted and pushed onto the front of
   cfun->machine->call_args.  */
1580 nvptx_call_args (rtx arg
, tree fntype
)
1582 if (!cfun
->machine
->doing_call
)
1584 cfun
->machine
->doing_call
= true;
1585 cfun
->machine
->is_varadic
= false;
1586 cfun
->machine
->num_args
= 0;
1588 if (fntype
&& stdarg_p (fntype
))
/* is_varadic describes this call; has_varadic is set here for the whole
   function and is not cleared in this hook.  */
1590 cfun
->machine
->is_varadic
= true;
1591 cfun
->machine
->has_varadic
= true;
1592 cfun
->machine
->num_args
++;
1596 if (REG_P (arg
) && arg
!= pc_rtx
)
1598 cfun
->machine
->num_args
++;
1599 cfun
->machine
->call_args
= alloc_EXPR_LIST (VOIDmode
, arg
,
1600 cfun
->machine
->call_args
);
1604 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1605 information we recorded. */
/* Mark the current call as finished and release the EXPR_LIST that
   nvptx_call_args accumulated in cfun->machine->call_args.  */
1608 nvptx_end_call_args (void)
1610 cfun
->machine
->doing_call
= false;
1611 free_EXPR_LIST_list (&cfun
->machine
->call_args
);
1614 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1615 track of whether calls involving static chains or varargs were seen
1616 in the current function.
1617 For libcalls, maintain a hash table of decls we have seen, and
1618 record a function decl for later when encountering a new one. */
/* Build and emit the call insn for ADDRESS as a PARALLEL whose first
   element is the CALL (wrapped in a SET of the result register when one
   is needed), followed by one USE per recorded argument register and a
   USE of the varargs pointer for variadic calls.  The call is bracketed
   by nvptx_emit_forking and nvptx_emit_joining for the OpenACC axes in
   PARALLEL.  If RETVAL is not a valid register operand the result goes
   through a fresh pseudo and is copied to RETVAL afterwards.  */
1621 nvptx_expand_call (rtx retval
, rtx address
)
1623 rtx callee
= XEXP (address
, 0);
1624 rtx varargs
= NULL_RTX
;
1625 unsigned parallel
= 0;
/* A callee that is not a valid call operand is forced into a register.  */
1627 if (!call_insn_operand (callee
, Pmode
))
1629 callee
= force_reg (Pmode
, callee
);
1630 address
= change_address (address
, QImode
, callee
);
1633 if (GET_CODE (callee
) == SYMBOL_REF
)
1635 tree decl
= SYMBOL_REF_DECL (callee
);
1636 if (decl
!= NULL_TREE
)
1638 if (DECL_STATIC_CHAIN (decl
))
1639 cfun
->machine
->has_chain
= true;
1641 tree attr
= oacc_get_fn_attrib (decl
);
1644 tree dims
= TREE_VALUE (attr
);
/* Start with every axis set and clear axes while walking DIMS.  */
1646 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1647 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1649 if (TREE_PURPOSE (dims
)
1650 && !integer_zerop (TREE_PURPOSE (dims
)))
1652 /* Not on this axis. */
1653 parallel
^= GOMP_DIM_MASK (ix
);
1654 dims
= TREE_CHAIN (dims
);
1660 unsigned nargs
= cfun
->machine
->num_args
;
1661 if (cfun
->machine
->is_varadic
)
/* Variadic calls pass a copy of the stack pointer as an extra operand.  */
1663 varargs
= gen_reg_rtx (Pmode
);
1664 emit_move_insn (varargs
, stack_pointer_rtx
);
/* One slot for the call itself plus one per counted argument.  */
1667 rtvec vec
= rtvec_alloc (nargs
+ 1);
1668 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1671 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1672 rtx tmp_retval
= retval
;
1675 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1676 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1677 call
= gen_rtx_SET (tmp_retval
, call
);
1679 XVECEXP (pat
, 0, vec_pos
++) = call
;
1681 /* Construct the call insn, including a USE for each argument pseudo
1682 register. These will be used when printing the insn. */
1683 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1684 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1687 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
/* Every slot of the PARALLEL must have been filled.  This has to be a
   comparison: an assignment would overwrite vec_pos and succeed for any
   non-empty vector, so the check could never fail.  */
1689 gcc_assert (vec_pos
== XVECLEN (pat
, 0));
1691 nvptx_emit_forking (parallel
, true);
1692 emit_call_insn (pat
);
1693 nvptx_emit_joining (parallel
, true);
1695 if (tmp_retval
!= retval
)
1696 emit_move_insn (retval
, tmp_retval
);
1699 /* Emit a comparison COMPARE, and return the new test to be used in the
/* Emit an insn that sets a fresh BImode pseudo to the result of COMPARE's
   comparison (same code and operands, in BImode) and return the rtx
   'pseudo != 0' in BImode.  */
1703 nvptx_expand_compare (rtx compare
)
1705 rtx pred
= gen_reg_rtx (BImode
);
1706 rtx cmp
= gen_rtx_fmt_ee (GET_CODE (compare
), BImode
,
1707 XEXP (compare
, 0), XEXP (compare
, 1));
1708 emit_insn (gen_rtx_SET (pred
, cmp
));
1709 return gen_rtx_NE (BImode
, pred
, const0_rtx
);
1712 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
/* Emit forking for the single axis MODE, converted to a dimension mask;
   the second argument to nvptx_emit_forking is false here.  */
1715 nvptx_expand_oacc_fork (unsigned mode
)
1717 nvptx_emit_forking (GOMP_DIM_MASK (mode
), false);
/* Emit joining for the single axis MODE, converted to a dimension mask;
   the second argument to nvptx_emit_joining is false here.  */
1721 nvptx_expand_oacc_join (unsigned mode
)
1723 nvptx_emit_joining (GOMP_DIM_MASK (mode
), false);
1726 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
/* Select the unpack pattern from the mode of SRC: gen_unpackdisi2 or
   gen_unpackdfsi2, with DST0 and DST1 as the two destinations.  Any other
   mode is a compiler bug (gcc_unreachable).  */
1730 nvptx_gen_unpack (rtx dst0
, rtx dst1
, rtx src
)
1734 switch (GET_MODE (src
))
1737 res
= gen_unpackdisi2 (dst0
, dst1
, src
);
1740 res
= gen_unpackdfsi2 (dst0
, dst1
, src
);
1742 default: gcc_unreachable ();
1747 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
/* Select the pack pattern from the mode of DST: gen_packsidi2 or
   gen_packsidf2, with SRC0 and SRC1 as the two sources.  Any other mode
   is a compiler bug (gcc_unreachable).  */
1751 nvptx_gen_pack (rtx dst
, rtx src0
, rtx src1
)
1755 switch (GET_MODE (dst
))
1758 res
= gen_packsidi2 (dst
, src0
, src1
);
1761 res
= gen_packsidf2 (dst
, src0
, src1
);
1763 default: gcc_unreachable ();
1768 /* Generate an instruction or sequence that shuffles register SRC into
1769 DST within a single warp, using lane operand IDX and shuffle kind KIND. */
/* Dispatch on the mode of DST.  Two modes map directly onto the
   nvptx_shufflesi / nvptx_shufflesf patterns.  The remaining visible
   cases go through SImode temporaries and call this function
   recursively: one unpacks SRC into two halves, shuffles each and packs
   them into DST; one converts a predicate to 0/1, shuffles and compares
   against zero; one zero-extends, shuffles and truncates to DST's mode.  */
1772 nvptx_gen_shuffle (rtx dst
, rtx src
, rtx idx
, nvptx_shuffle_kind kind
)
1776 switch (GET_MODE (dst
))
1779 res
= gen_nvptx_shufflesi (dst
, src
, idx
, GEN_INT (kind
));
1782 res
= gen_nvptx_shufflesf (dst
, src
, idx
, GEN_INT (kind
));
/* Split into two SImode halves, shuffle each, then recombine.  */
1787 rtx tmp0
= gen_reg_rtx (SImode
);
1788 rtx tmp1
= gen_reg_rtx (SImode
);
1791 emit_insn (nvptx_gen_unpack (tmp0
, tmp1
, src
));
1792 emit_insn (nvptx_gen_shuffle (tmp0
, tmp0
, idx
, kind
));
1793 emit_insn (nvptx_gen_shuffle (tmp1
, tmp1
, idx
, kind
));
1794 emit_insn (nvptx_gen_pack (dst
, tmp0
, tmp1
));
/* Turn the predicate into an SImode 1 or 0, shuffle that, and test it
   against zero to rebuild the BImode value.  */
1801 rtx tmp
= gen_reg_rtx (SImode
);
1804 emit_insn (gen_sel_truesi (tmp
, src
, GEN_INT (1), const0_rtx
));
1805 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1806 emit_insn (gen_rtx_SET (dst
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
/* Zero-extend to SImode, shuffle, and truncate back to DST's mode.  */
1814 rtx tmp
= gen_reg_rtx (SImode
);
1817 emit_insn (gen_rtx_SET (tmp
, gen_rtx_fmt_e (ZERO_EXTEND
, SImode
, src
)));
1818 emit_insn (nvptx_gen_shuffle (tmp
, tmp
, idx
, kind
));
1819 emit_insn (gen_rtx_SET (dst
, gen_rtx_fmt_e (TRUNCATE
, GET_MODE (dst
),
1832 /* Generate an instruction or sequence to broadcast register REG
1833 across the vectors of a single warp. */
/* Shuffle REG onto itself with index const0_rtx and kind SHUFFLE_IDX, and
   return whatever nvptx_gen_shuffle produces.  */
1836 nvptx_gen_warp_bcast (rtx reg
)
1838 return nvptx_gen_shuffle (reg
, reg
, const0_rtx
, SHUFFLE_IDX
);
1841 /* Structure used when generating a worker-level spill or fill. */
/* State shared between successive nvptx_gen_shared_bcast calls, which
   asserts BASE is non-null, uses and advances PTR, and aligns and
   advances OFFSET.  */
1843 struct broadcast_data_t
1845 rtx base
; /* Register holding base addr of buffer. */
1846 rtx ptr
; /* Iteration var, if needed. */
1847 unsigned offset
; /* Offset into worker buffer. */
1850 /* Direction of the spill/fill and looping setup/teardown indicator. */
1856 PM_loop_begin
= 1 << 2,
1857 PM_loop_end
= 1 << 3,
1859 PM_read_write
= PM_read
| PM_write
1862 /* Generate instruction(s) to spill or fill register REG to/from the
1863 worker broadcast array. PM indicates what is to be done, REP
1864 how many loop iterations will be executed (0 for not a loop). */
/* Build the insn(s) that copy REG to or from the broadcast buffer
   described by DATA, in the direction given by PM.  One visible path
   converts a predicate REG to an SImode 0/1 temporary, recurses on the
   temporary, and compares it against zero to restore REG.  The other
   builds a MEM of REG's mode and a SET between that MEM and REG.  DATA's
   offset is rounded up to the mode's alignment first and advanced
   afterwards, and oacc_bcast_align tracks the largest alignment seen.  */
1867 nvptx_gen_shared_bcast (rtx reg
, propagate_mask pm
, unsigned rep
,
1868 broadcast_data_t
*data
, bool vector
)
1871 machine_mode mode
= GET_MODE (reg
);
1877 rtx tmp
= gen_reg_rtx (SImode
);
1881 emit_insn (gen_sel_truesi (tmp
, reg
, GEN_INT (1), const0_rtx
));
1882 emit_insn (nvptx_gen_shared_bcast (tmp
, pm
, rep
, data
, vector
));
1884 emit_insn (gen_rtx_SET (reg
, gen_rtx_NE (BImode
, tmp
, const0_rtx
)));
1892 rtx addr
= data
->ptr
;
/* Alignment in bytes for MODE.  */
1896 unsigned align
= GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
;
1898 oacc_bcast_align
= MAX (oacc_bcast_align
, align
);
1899 data
->offset
= ROUND_UP (data
->offset
, align
);
1901 gcc_assert (data
->base
!= NULL
);
1903 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (data
->offset
));
1906 addr
= gen_rtx_MEM (mode
, addr
);
1908 res
= gen_rtx_SET (addr
, reg
);
1909 else if (pm
== PM_write
)
1910 res
= gen_rtx_SET (reg
, addr
);
1916 /* We're using a ptr, increment it. */
1920 emit_insn (gen_adddi3 (data
->ptr
, data
->ptr
,
1921 GEN_INT (GET_MODE_SIZE (GET_MODE (reg
)))));
/* Reserve REP copies of REG's mode size in the buffer.  */
1927 data
->offset
+= rep
* GET_MODE_SIZE (GET_MODE (reg
));
1934 /* Returns true if X is a valid address for use in a memory reference. */
/* The mode and the final bool parameter are unnamed, so only X is
   inspected.  The one visible test accepts the form 'register plus
   constant integer'.
   NOTE(review): the other cases are not visible in this excerpt --
   confirm against the full file.  */
1937 nvptx_legitimate_address_p (machine_mode
, rtx x
, bool)
1939 enum rtx_code code
= GET_CODE (x
);
1947 if (REG_P (XEXP (x
, 0)) && CONST_INT_P (XEXP (x
, 1)))
1961 /* Machinery to output constant initializers. When beginning an
1962 initializer, we decide on a fragment size (which is visible in ptx
1963 in the type used), and then all initializer data is buffered until
1964 a fragment is filled and ready to be written out. */
1968 unsigned HOST_WIDE_INT mask
; /* Mask for storing fragment. */
1969 unsigned HOST_WIDE_INT val
; /* Current fragment value. */
1970 unsigned HOST_WIDE_INT remaining
; /* Remaining bytes to be written
1972 unsigned size
; /* Fragment size to accumulate. */
1973 unsigned offset
; /* Offset within current fragment. */
1974 bool started
; /* Whether we've output any initializer. */
1977 /* The current fragment is full, write it out. SYM may provide a
1978 symbolic reference we should output, in which case the fragment
1979 value is the addend. */
/* Write one initializer element to asm_out_file.  The element is preceded
   by " = { " the first time and by ", " afterwards.  The fragment state is
   updated (started set, offset cleared, remaining decremented) before the
   value captured in VAL is printed.  For a symbol SYM the address is
   printed, wrapped in "generic(" ... ")" as controlled by the FUNCTION
   flag, with " + " before any addend.  */
1982 output_init_frag (rtx sym
)
1984 fprintf (asm_out_file
, init_frag
.started
? ", " : " = { ");
/* Take a copy: the fragment state is reset before the value is printed.  */
1985 unsigned HOST_WIDE_INT val
= init_frag
.val
;
1987 init_frag
.started
= true;
1989 init_frag
.offset
= 0;
1990 init_frag
.remaining
--;
/* FUNCTION is true when SYM's decl is a FUNCTION_DECL.  */
1994 bool function
= (SYMBOL_REF_DECL (sym
)
1995 && (TREE_CODE (SYMBOL_REF_DECL (sym
)) == FUNCTION_DECL
));
1997 fprintf (asm_out_file
, "generic(");
1998 output_address (VOIDmode
, sym
);
2000 fprintf (asm_out_file
, ")");
2002 fprintf (asm_out_file
, " + ");
2006 fprintf (asm_out_file
, HOST_WIDE_INT_PRINT_DEC
, val
);
2009 /* Add value VAL of size SIZE to the data we're emitting, and keep
2010 writing out chunks as they fill up. */
/* Append the low SIZE bytes of VAL to the pending fragment.  The loop
   takes as many bytes as still fit in the current fragment (PART), ORs
   them in at the current byte offset, and calls output_init_frag each
   time a fragment becomes full; bytes already consumed are shifted out of
   VAL at the start of the next iteration.  */
2013 nvptx_assemble_value (unsigned HOST_WIDE_INT val
, unsigned size
)
/* Keep only SIZE bytes.  Shifting 2 by one bit less than the width gives
   the same mask as shifting 1 by the full width, without a shift count
   equal to the type width when SIZE covers the whole word.  */
2015 val
&= ((unsigned HOST_WIDE_INT
)2 << (size
* BITS_PER_UNIT
- 1)) - 1;
2017 for (unsigned part
= 0; size
; size
-= part
)
/* Drop the bytes consumed by the previous iteration (none on the first).  */
2019 val
>>= part
* BITS_PER_UNIT
;
2020 part
= init_frag
.size
- init_frag
.offset
;
2021 part
= MIN (part
, size
);
2023 unsigned HOST_WIDE_INT partial
2024 = val
<< (init_frag
.offset
* BITS_PER_UNIT
);
/* The mask discards any bits that do not fit in this fragment.  */
2025 init_frag
.val
|= partial
& init_frag
.mask
;
2026 init_frag
.offset
+= part
;
2028 if (init_frag
.offset
== init_frag
.size
)
2029 output_init_frag (NULL
);
2033 /* Target hook for assembling integer object X of size SIZE. */
/* Dispatch on the rtx code of X.  In the visible paths, a constant
   integer is handed to nvptx_assemble_value; a symbol, optionally found
   inside a PLUS whose second operand supplies the addend VAL, must
   exactly fill one fragment and is emitted through output_init_frag after
   being passed to nvptx_maybe_record_fnsym.  A symbol that would start in
   the middle of a fragment is reported with sorry ().  ALIGNED_P is
   unused.  */
2036 nvptx_assemble_integer (rtx x
, unsigned int size
, int ARG_UNUSED (aligned_p
))
2038 HOST_WIDE_INT val
= 0;
2040 switch (GET_CODE (x
))
2043 /* Let the generic machinery figure it out, usually for a
2048 nvptx_assemble_value (INTVAL (x
), size
);
2053 gcc_assert (GET_CODE (x
) == PLUS
);
2054 val
= INTVAL (XEXP (x
, 1));
2056 gcc_assert (GET_CODE (x
) == SYMBOL_REF
);
2060 gcc_assert (size
== init_frag
.size
);
2061 if (init_frag
.offset
)
2062 sorry ("cannot emit unaligned pointers in ptx assembly");
2064 nvptx_maybe_record_fnsym (x
);
2065 init_frag
.val
= val
;
2066 output_init_frag (x
);
2073 /* Output SIZE zero bytes. We ignore the FILE argument since the
2074 functions we're calling to perform the output just use
/* Emit SIZE zero bytes into the initializer.  A partly filled fragment is
   padded first.  Whole zero fragments are written only when the skip does
   not reach the end of the initializer (SIZE is smaller than the space
   still to be written); any remaining bytes are then passed to
   nvptx_assemble_value.  The FILE parameter is unnamed and unused.  */
2078 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size
)
2080 /* Finish the current fragment, if it's started. */
2081 if (init_frag
.offset
)
2083 unsigned part
= init_frag
.size
- init_frag
.offset
;
2084 part
= MIN (part
, (unsigned)size
);
2086 nvptx_assemble_value (0, part
);
2089 /* If this skip doesn't terminate the initializer, write as many
2090 remaining pieces as possible directly. */
2091 if (size
< init_frag
.remaining
* init_frag
.size
)
2093 while (size
>= init_frag
.size
)
2095 size
-= init_frag
.size
;
2096 output_init_frag (NULL_RTX
);
2099 nvptx_assemble_value (0, size
);
2103 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2104 ignore the FILE arg. */
/* Feed the SIZE bytes of STR, one at a time, to nvptx_assemble_value as
   one-byte values.  The FILE parameter is unnamed and unused.  */
2107 nvptx_output_ascii (FILE *, const char *str
, unsigned HOST_WIDE_INT size
)
2109 for (unsigned HOST_WIDE_INT i
= 0; i
< size
; i
++)
2110 nvptx_assemble_value (str
[i
], 1);
2113 /* Return true if TYPE is a record type where the last field is an array without
/* Only RECORD_TYPEs are considered.  The fields of TYPE are walked to
   find the last one; its type must be an ARRAY_TYPE, and the result is
   true when that array has no domain or its domain has no maximum
   value.  */
2117 flexible_array_member_type_p (const_tree type
)
2119 if (TREE_CODE (type
) != RECORD_TYPE
)
2122 const_tree last_field
= NULL_TREE
;
2123 for (const_tree f
= TYPE_FIELDS (type
); f
; f
= TREE_CHAIN (f
))
2129 const_tree last_field_type
= TREE_TYPE (last_field
);
2130 if (TREE_CODE (last_field_type
) != ARRAY_TYPE
)
2133 return (! TYPE_DOMAIN (last_field_type
)
2134 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type
)));
2137 /* Emit a PTX variable decl and prepare for emission of its
2138 initializer. NAME is the symbol name and SECTION the PTX data
2139 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2140 The caller has already emitted any indentation and linkage
2141 specifier. It is responsible for any initializer, terminating ;
2142 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2143 this is the opposite way round that PTX wants them! */
/* Print to FILE "SECTION .align A .uN NAME" followed by an array suffix,
   and reset init_frag for the initializer that follows.  The element
   type is found by stripping array types and then one level of vector or
   complex type.  N is the element size in bits, where the element size is
   limited to the size of Pmode (BLKmode types) or DImode and reduced to a
   power of two.  UNDEFINED defaults to false; when true it is combined
   with a test for a trailing flexible array member.  */
2146 nvptx_assemble_decl_begin (FILE *file
, const char *name
, const char *section
,
2147 const_tree type
, HOST_WIDE_INT size
, unsigned align
,
2148 bool undefined
= false)
/* ATYPE: TYPE is an array with no domain.  */
2150 bool atype
= (TREE_CODE (type
) == ARRAY_TYPE
)
2151 && (TYPE_DOMAIN (type
) == NULL_TREE
);
2153 if (undefined
&& flexible_array_member_type_p (type
))
2159 while (TREE_CODE (type
) == ARRAY_TYPE
)
2160 type
= TREE_TYPE (type
);
2162 if (TREE_CODE (type
) == VECTOR_TYPE
2163 || TREE_CODE (type
) == COMPLEX_TYPE
)
2164 /* Neither vector nor complex types can contain the other. */
2165 type
= TREE_TYPE (type
);
2167 unsigned elt_size
= int_size_in_bytes (type
);
2169 /* Largest mode we're prepared to accept. For BLKmode types we
2170 don't know if it'll contain pointer constants, so have to choose
2171 pointer size, otherwise we can choose DImode. */
2172 machine_mode elt_mode
= TYPE_MODE (type
) == BLKmode
? Pmode
: DImode
;
/* OR in the mode size, then keep only the lowest set bit: the result is
   the largest power of two dividing the type size, capped at the mode
   size.  */
2174 elt_size
|= GET_MODE_SIZE (elt_mode
);
2175 elt_size
&= -elt_size
; /* Extract LSB set. */
2177 init_frag
.size
= elt_size
;
2178 /* Avoid undefined shift behavior by using '2'. */
2179 init_frag
.mask
= ((unsigned HOST_WIDE_INT
)2
2180 << (elt_size
* BITS_PER_UNIT
- 1)) - 1;
2182 init_frag
.offset
= 0;
2183 init_frag
.started
= false;
2184 /* Size might not be a multiple of elt size, if there's an
2185 initialized trailing struct array with smaller type than
2187 init_frag
.remaining
= (size
+ elt_size
- 1) / elt_size
;
2189 fprintf (file
, "%s .align %d .u%d ",
2190 section
, align
/ BITS_PER_UNIT
,
2191 elt_size
* BITS_PER_UNIT
);
2192 assemble_name (file
, name
);
2195 /* We make everything an array, to simplify any initialization
2197 fprintf (file
, "[" HOST_WIDE_INT_PRINT_DEC
"]", init_frag
.remaining
);
2199 fprintf (file
, "[]");
2202 /* Called when the initializer for a decl has been completely output through
2203 combinations of the three functions above. */
/* Finish the declaration on asm_out_file: pad a partly filled final
   fragment with zero bytes, then print " };" if any initializer element
   was written and a bare ";" otherwise.  */
2206 nvptx_assemble_decl_end (void)
2208 if (init_frag
.offset
)
2209 /* This can happen with a packed struct with trailing array member. */
2210 nvptx_assemble_value (0, init_frag
.size
- init_frag
.offset
);
2211 fprintf (asm_out_file
, init_frag
.started
? " };\n" : ";\n");
2214 /* Output an uninitialized common or file-scope variable. */
/* Write the variable marker for NAME, print a tab followed by ".weak "
   when DECL is public, then emit the declaration with no initializer by
   calling nvptx_assemble_decl_begin immediately followed by
   nvptx_assemble_decl_end.  */
2217 nvptx_output_aligned_decl (FILE *file
, const char *name
,
2218 const_tree decl
, HOST_WIDE_INT size
, unsigned align
)
2220 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2222 /* If this is public, it is common. The nearest thing we have to
2224 fprintf (file
, "\t%s", TREE_PUBLIC (decl
) ? ".weak " : "");
2226 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2227 TREE_TYPE (decl
), size
, align
);
2228 nvptx_assemble_decl_end ();
2231 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2232 writing a constant variable EXP with NAME and SIZE and its
2233 initializer to FILE. */
/* Write a non-public variable marker for NAME, a tab, and the start of a
   ".const" declaration for EXP's type with size OBJ_SIZE.  The
   initializer is emitted by later calls.  */
2236 nvptx_asm_declare_constant_name (FILE *file
, const char *name
,
2237 const_tree exp
, HOST_WIDE_INT obj_size
)
2239 write_var_marker (file
, true, false, name
);
2241 fprintf (file
, "\t");
2243 tree type
= TREE_TYPE (exp
);
2244 nvptx_assemble_decl_begin (file
, name
, ".const", type
, obj_size
,
2248 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2249 a variable DECL with NAME to FILE. */
/* Write the variable marker for NAME, then a linkage prefix (nothing for
   a non-public DECL, ".weak " for a weak one, ".visible " otherwise), and
   start the declaration using DECL's section, type, size in bytes and
   alignment.  */
2252 nvptx_declare_object_name (FILE *file
, const char *name
, const_tree decl
)
2254 write_var_marker (file
, true, TREE_PUBLIC (decl
), name
);
2256 fprintf (file
, "\t%s", (!TREE_PUBLIC (decl
) ? ""
2257 : DECL_WEAK (decl
) ? ".weak " : ".visible "));
2259 tree type
= TREE_TYPE (decl
);
2260 HOST_WIDE_INT obj_size
= tree_to_shwi (DECL_SIZE_UNIT (decl
));
2261 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2262 type
, obj_size
, DECL_ALIGN (decl
));
2265 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
/* Both parameters are unnamed, so neither the stream nor the label name
   is used.  */
2268 nvptx_globalize_label (FILE *, const char *)
2272 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2273 declaration only for variable DECL with NAME to FILE. */
/* Emit a "\t.extern " declaration for DECL, using its section, type and
   alignment, with a size of 0 when DECL has no DECL_SIZE_UNIT.  The
   'undefined' argument of nvptx_assemble_decl_begin is true here.
   Constant-pool decls are tested for first, and a weak DECL is reported
   with error_at.  */
2276 nvptx_assemble_undefined_decl (FILE *file
, const char *name
, const_tree decl
)
2278 /* The middle end can place constant pool decls into the varpool as
2279 undefined. Until that is fixed, catch the problem here. */
2280 if (DECL_IN_CONSTANT_POOL (decl
))
2283 /* We support weak definitions, and hence have the right
2284 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2285 if (DECL_WEAK (decl
))
2286 error_at (DECL_SOURCE_LOCATION (decl
),
2287 "PTX does not support weak declarations"
2288 " (only weak definitions)");
2289 write_var_marker (file
, false, TREE_PUBLIC (decl
), name
);
2291 fprintf (file
, "\t.extern ");
2292 tree size
= DECL_SIZE_UNIT (decl
);
2293 nvptx_assemble_decl_begin (file
, name
, section_for_decl (decl
),
2294 TREE_TYPE (decl
), size
? tree_to_shwi (size
) : 0,
2295 DECL_ALIGN (decl
), true);
2296 nvptx_assemble_decl_end ();
2299 /* Output a pattern for a move instruction. */
/* Return the output template for moving SRC to DST.  The visible choices
   are: cvta for a symbol outside the generic data area; a plain mov when
   the inner modes match; mov or mov.b for constants; for equal-sized
   inner modes either a pair of mov.u64 (128-bit V2DImode/TImode
   conversions) or mov.b; and cvt as the last template.  Symbols in the
   generic area are passed to nvptx_maybe_record_fnsym.  */
2302 nvptx_output_mov_insn (rtx dst
, rtx src
)
2304 machine_mode dst_mode
= GET_MODE (dst
);
2305 machine_mode dst_inner
= (GET_CODE (dst
) == SUBREG
2306 ? GET_MODE (XEXP (dst
, 0)) : dst_mode
);
/* Note that a SRC that is not a SUBREG is given DST's mode here, not its
   own.  */
2307 machine_mode src_inner
= (GET_CODE (src
) == SUBREG
2308 ? GET_MODE (XEXP (src
, 0)) : dst_mode
);
/* Look through a CONST wrapper to reach the symbol.  */
2311 if (GET_CODE (sym
) == CONST
)
2312 sym
= XEXP (XEXP (sym
, 0), 0);
2313 if (SYMBOL_REF_P (sym
))
2315 if (SYMBOL_DATA_AREA (sym
) != DATA_AREA_GENERIC
)
2316 return "%.\tcvta%D1%t0\t%0, %1;";
2317 nvptx_maybe_record_fnsym (sym
);
2320 if (src_inner
== dst_inner
)
2321 return "%.\tmov%t0\t%0, %1;";
2323 if (CONSTANT_P (src
))
2324 return (GET_MODE_CLASS (dst_inner
) == MODE_INT
2325 && GET_MODE_CLASS (src_inner
) != MODE_FLOAT
2326 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2328 if (GET_MODE_SIZE (dst_inner
) == GET_MODE_SIZE (src_inner
))
2330 if (GET_MODE_BITSIZE (dst_mode
) == 128
2331 && GET_MODE_BITSIZE (GET_MODE (src
)) == 128)
2333 /* mov.b128 is not supported. */
2334 if (dst_inner
== V2DImode
&& src_inner
== TImode
)
2335 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2336 else if (dst_inner
== TImode
&& src_inner
== V2DImode
)
2337 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2341 return "%.\tmov.b%T0\t%0, %1;";
2344 return "%.\tcvt%t0%t1\t%0, %1;";
2347 static void nvptx_print_operand (FILE *, rtx
, int);
2349 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2350 involves writing .param declarations and in/out copies into them. For
2351 indirect calls, also write the .callprototype. */
2354 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
2358 bool needs_tgt
= register_operand (callee
, Pmode
);
2359 rtx pat
= PATTERN (insn
);
2360 if (GET_CODE (pat
) == COND_EXEC
)
2361 pat
= COND_EXEC_CODE (pat
);
2362 int arg_end
= XVECLEN (pat
, 0);
2363 tree decl
= NULL_TREE
;
2365 fprintf (asm_out_file
, "\t{\n");
2367 fprintf (asm_out_file
, "\t\t.param%s %s_in;\n",
2368 nvptx_ptx_type_from_mode (GET_MODE (result
), false),
2369 reg_names
[NVPTX_RETURN_REGNUM
]);
2371 /* Ensure we have a ptx declaration in the output if necessary. */
2372 if (GET_CODE (callee
) == SYMBOL_REF
)
2374 decl
= SYMBOL_REF_DECL (callee
);
2376 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
2377 nvptx_record_libfunc (callee
, result
, pat
);
2378 else if (DECL_EXTERNAL (decl
))
2379 nvptx_record_fndecl (decl
);
2384 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
2386 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
2387 std::stringstream s
;
2388 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
2389 fputs (s
.str().c_str(), asm_out_file
);
2392 for (int argno
= 1; argno
< arg_end
; argno
++)
2394 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
2395 machine_mode mode
= GET_MODE (t
);
2396 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
2398 /* Mode splitting has already been done. */
2399 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d;\n"
2400 "\t\tst.param%s [%%out_arg%d], ",
2401 ptx_type
, argno
, ptx_type
, argno
);
2402 output_reg (asm_out_file
, REGNO (t
), VOIDmode
);
2403 fprintf (asm_out_file
, ";\n");
2406 /* The '.' stands for the call's predicate, if any. */
2407 nvptx_print_operand (asm_out_file
, NULL_RTX
, '.');
2408 fprintf (asm_out_file
, "\t\tcall ");
2409 if (result
!= NULL_RTX
)
2410 fprintf (asm_out_file
, "(%s_in), ", reg_names
[NVPTX_RETURN_REGNUM
]);
2414 const char *name
= get_fnname_from_decl (decl
);
2415 name
= nvptx_name_replacement (name
);
2416 assemble_name (asm_out_file
, name
);
2419 output_address (VOIDmode
, callee
);
2421 const char *open
= "(";
2422 for (int argno
= 1; argno
< arg_end
; argno
++)
2424 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
2427 if (decl
&& DECL_STATIC_CHAIN (decl
))
2429 fprintf (asm_out_file
, ", %s%s", open
, reg_names
[STATIC_CHAIN_REGNUM
]);
2433 fprintf (asm_out_file
, ")");
2437 fprintf (asm_out_file
, ", ");
2438 assemble_name (asm_out_file
, buf
);
2440 fprintf (asm_out_file
, ";\n");
2442 if (find_reg_note (insn
, REG_NORETURN
, NULL
))
2444 /* No return functions confuse the PTX JIT, as it doesn't realize
2445 the flow control barrier they imply. It can seg fault if it
2446 encounters what looks like an unexitable loop. Emit a trailing
2447 trap and exit, which it does grok. */
2448 fprintf (asm_out_file
, "\t\ttrap; // (noreturn)\n");
2449 fprintf (asm_out_file
, "\t\texit; // (noreturn)\n");
2454 static char rval
[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2457 /* We must escape the '%' that starts RETURN_REGNUM. */
2458 sprintf (rval
, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2459 reg_names
[NVPTX_RETURN_REGNUM
]);
2466 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.
   C is the candidate punctuation character.  The result is true exactly
   when C is '.' or '#'; every other character is rejected.  */
2469 nvptx_print_operand_punct_valid_p (unsigned char c
)
2471 return c
== '.' || c
== '#';
2474 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2477 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
2480 if (GET_CODE (x
) == CONST
)
2482 switch (GET_CODE (x
))
2486 output_address (VOIDmode
, XEXP (x
, 0));
2487 fprintf (file
, "+");
2488 output_address (VOIDmode
, off
);
2493 output_addr_const (file
, x
);
2497 gcc_assert (GET_CODE (x
) != MEM
);
2498 nvptx_print_operand (file
, x
, 0);
2503 /* Write assembly language output for the address ADDR to FILE.
   MODE is passed through unchanged.  All of the work is delegated to
   nvptx_print_address_operand.  */
2506 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
/* Note the argument order of the helper: file, address, mode.  */
2508 nvptx_print_address_operand (file
, addr
, mode
);
2511 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2514 . -- print the predicate for the instruction or an empty string for an
2516 # -- print a rounding mode for the instruction
2518 A -- print a data area for a MEM
2519 c -- print an opcode suffix for a comparison operator, including a type code
2520 D -- print a data area for a MEM operand
2521 S -- print a shuffle kind specified by CONST_INT
2522 t -- print a type opcode suffix, promoting QImode to 32 bits
2523 T -- print a type size in bits
2524 u -- print a type opcode suffix without promotions. */
2527 nvptx_print_operand (FILE *file
, rtx x
, int code
)
2531 x
= current_insn_predicate
;
2535 if (GET_CODE (x
) == EQ
)
2537 output_reg (file
, REGNO (XEXP (x
, 0)), VOIDmode
);
2541 else if (code
== '#')
2543 fputs (".rn", file
);
2547 enum rtx_code x_code
= GET_CODE (x
);
2548 machine_mode mode
= GET_MODE (x
);
2557 if (GET_CODE (x
) == CONST
)
2559 if (GET_CODE (x
) == PLUS
)
2562 if (GET_CODE (x
) == SYMBOL_REF
)
2563 fputs (section_for_sym (x
), file
);
2568 if (x_code
== SUBREG
)
2570 machine_mode inner_mode
= GET_MODE (SUBREG_REG (x
));
2571 if (VECTOR_MODE_P (inner_mode
)
2572 && (GET_MODE_SIZE (mode
)
2573 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2574 mode
= GET_MODE_INNER (inner_mode
);
2575 else if (split_mode_p (inner_mode
))
2576 mode
= maybe_split_mode (inner_mode
);
2580 fprintf (file
, "%s", nvptx_ptx_type_from_mode (mode
, code
== 't'));
2586 rtx inner_x
= SUBREG_REG (x
);
2587 machine_mode inner_mode
= GET_MODE (inner_x
);
2588 machine_mode split
= maybe_split_mode (inner_mode
);
2590 output_reg (file
, REGNO (inner_x
), split
,
2592 ? GET_MODE_SIZE (inner_mode
) / 2
2599 nvptx_shuffle_kind kind
= (nvptx_shuffle_kind
) UINTVAL (x
);
2600 /* Same order as nvptx_shuffle_kind. */
2601 static const char *const kinds
[] =
2602 {".up", ".down", ".bfly", ".idx"};
2603 fputs (kinds
[kind
], file
);
2608 fprintf (file
, "%d", GET_MODE_BITSIZE (mode
));
2612 fprintf (file
, "@");
2616 fprintf (file
, "@!");
2620 mode
= GET_MODE (XEXP (x
, 0));
2624 fputs (".eq", file
);
2627 if (FLOAT_MODE_P (mode
))
2628 fputs (".neu", file
);
2630 fputs (".ne", file
);
2634 fputs (".le", file
);
2638 fputs (".ge", file
);
2642 fputs (".lt", file
);
2646 fputs (".gt", file
);
2649 fputs (".ne", file
);
2652 fputs (".equ", file
);
2655 fputs (".leu", file
);
2658 fputs (".geu", file
);
2661 fputs (".ltu", file
);
2664 fputs (".gtu", file
);
2667 fputs (".nan", file
);
2670 fputs (".num", file
);
2675 if (FLOAT_MODE_P (mode
)
2676 || x_code
== EQ
|| x_code
== NE
2677 || x_code
== GEU
|| x_code
== GTU
2678 || x_code
== LEU
|| x_code
== LTU
)
2679 fputs (nvptx_ptx_type_from_mode (mode
, true), file
);
2681 fprintf (file
, ".s%d", GET_MODE_BITSIZE (mode
));
2689 rtx inner_x
= SUBREG_REG (x
);
2690 machine_mode inner_mode
= GET_MODE (inner_x
);
2691 machine_mode split
= maybe_split_mode (inner_mode
);
2693 if (VECTOR_MODE_P (inner_mode
)
2694 && (GET_MODE_SIZE (mode
)
2695 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2697 output_reg (file
, REGNO (inner_x
), VOIDmode
);
2698 fprintf (file
, ".%s", SUBREG_BYTE (x
) == 0 ? "x" : "y");
2700 else if (split_mode_p (inner_mode
)
2701 && (GET_MODE_SIZE (inner_mode
) == GET_MODE_SIZE (mode
)))
2702 output_reg (file
, REGNO (inner_x
), split
);
2704 output_reg (file
, REGNO (inner_x
), split
, SUBREG_BYTE (x
));
2709 output_reg (file
, REGNO (x
), maybe_split_mode (mode
));
2714 nvptx_print_address_operand (file
, XEXP (x
, 0), mode
);
2719 output_addr_const (file
, x
);
2725 /* We could use output_addr_const, but that can print things like
2726 "x-8", which breaks ptxas. Need to ensure it is output as
"x+-8". */
2728 nvptx_print_address_operand (file
, x
, VOIDmode
);
2733 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), mode
);
2734 vals
[0] &= 0xffffffff;
2735 vals
[1] &= 0xffffffff;
2737 fprintf (file
, "0f%08lx", vals
[0]);
2739 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
2744 unsigned n
= CONST_VECTOR_NUNITS (x
);
2745 fprintf (file
, "{ ");
2746 for (unsigned i
= 0; i
< n
; ++i
)
2749 fprintf (file
, ", ");
2751 rtx elem
= CONST_VECTOR_ELT (x
, i
);
2752 output_addr_const (file
, elem
);
2754 fprintf (file
, " }");
2759 output_addr_const (file
, x
);
2764 /* Record replacement regs used to deal with subreg operands. */
2767 rtx replacement
[MAX_RECOG_OPERANDS
];
2773 /* Allocate or reuse a replacement in R and return the rtx.
   R tracks a pool of registers of mode R->mode: N_ALLOCATED counts the
   entries created so far and N_IN_USE counts how many of them have
   already been handed out.  */
2776 get_replacement (struct reg_replace
*r
)
/* Every entry created so far has been handed out, so create one more
   register of R's mode and append it to the pool.  */
2778 if (r
->n_allocated
== r
->n_in_use
)
2779 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
/* Return the first entry not yet handed out and count it as in use.  */
2780 return r
->replacement
[r
->n_in_use
++];
2783 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2784 the presence of subregs would break the rules for most instructions.
2785 Replace them with a suitable new register of the right size, plus
2786 conversion copyin/copyout instructions. */
2789 nvptx_reorg_subreg (void)
2791 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
2792 rtx_insn
*insn
, *next
;
2794 qiregs
.n_allocated
= 0;
2795 hiregs
.n_allocated
= 0;
2796 siregs
.n_allocated
= 0;
2797 diregs
.n_allocated
= 0;
2798 qiregs
.mode
= QImode
;
2799 hiregs
.mode
= HImode
;
2800 siregs
.mode
= SImode
;
2801 diregs
.mode
= DImode
;
2803 for (insn
= get_insns (); insn
; insn
= next
)
2805 next
= NEXT_INSN (insn
);
2806 if (!NONDEBUG_INSN_P (insn
)
2807 || asm_noperands (PATTERN (insn
)) >= 0
2808 || GET_CODE (PATTERN (insn
)) == USE
2809 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
2812 qiregs
.n_in_use
= 0;
2813 hiregs
.n_in_use
= 0;
2814 siregs
.n_in_use
= 0;
2815 diregs
.n_in_use
= 0;
2816 extract_insn (insn
);
2817 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
2819 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
2821 rtx op
= recog_data
.operand
[i
];
2822 if (GET_CODE (op
) != SUBREG
)
2825 rtx inner
= SUBREG_REG (op
);
2827 machine_mode outer_mode
= GET_MODE (op
);
2828 machine_mode inner_mode
= GET_MODE (inner
);
2831 && (GET_MODE_PRECISION (inner_mode
)
2832 >= GET_MODE_PRECISION (outer_mode
)))
2834 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
2835 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
2836 : outer_mode
== HImode
? &hiregs
2837 : outer_mode
== SImode
? &siregs
2839 rtx new_reg
= get_replacement (r
);
2841 if (recog_data
.operand_type
[i
] != OP_OUT
)
2844 if (GET_MODE_PRECISION (inner_mode
)
2845 < GET_MODE_PRECISION (outer_mode
))
2850 rtx pat
= gen_rtx_SET (new_reg
,
2851 gen_rtx_fmt_e (code
, outer_mode
, inner
));
2852 emit_insn_before (pat
, insn
);
2855 if (recog_data
.operand_type
[i
] != OP_IN
)
2858 if (GET_MODE_PRECISION (inner_mode
)
2859 < GET_MODE_PRECISION (outer_mode
))
2864 rtx pat
= gen_rtx_SET (inner
,
2865 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
2866 emit_insn_after (pat
, insn
);
2868 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
2873 /* Return a SImode "master lane index" register for uniform-simt, allocating on
demand. */
/* Accessor for the rtx cached in cfun->machine->unisimt_master.  */
2877 nvptx_get_unisimt_master ()
/* Bind a reference to the per-function slot, so that the assignment in
   the return expression below is stored back into cfun->machine.  */
2879 rtx
&master
= cfun
->machine
->unisimt_master
;
/* Reuse the cached rtx when there is one; otherwise create a new SImode
   register, store it in the slot and return it.  */
2880 return master
? master
: master
= gen_reg_rtx (SImode
);
2883 /* Return a BImode "predicate" register for uniform-simt, similar to above.
   The rtx is cached in cfun->machine->unisimt_predicate.  */
2886 nvptx_get_unisimt_predicate ()
/* Bind a reference to the per-function slot, so that the assignment in
   the return expression below is stored back into cfun->machine.  */
2888 rtx
&pred
= cfun
->machine
->unisimt_predicate
;
/* Reuse the cached rtx when there is one; otherwise create a new BImode
   register, store it in the slot and return it.  */
2889 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
2892 /* Return true if given call insn references one of the functions provided by
2893 the CUDA runtime: malloc, free, vprintf.
   The decision is made purely on the callee's symbol name.  */
2896 nvptx_call_insn_is_syscall_p (rtx_insn
*insn
)
/* The insn pattern is expected to be a PARALLEL; only its first element
   is examined.  */
2898 rtx pat
= PATTERN (insn
);
2899 gcc_checking_assert (GET_CODE (pat
) == PARALLEL
);
2900 pat
= XVECEXP (pat
, 0, 0);
/* When the first element is a SET, look at the SET's source instead.  */
2901 if (GET_CODE (pat
) == SET
)
2902 pat
= SET_SRC (pat
);
/* At this point PAT must be a CALL whose first operand is a MEM.  */
2903 gcc_checking_assert (GET_CODE (pat
) == CALL
2904 && GET_CODE (XEXP (pat
, 0)) == MEM
);
/* ADDR is the address inside that MEM, i.e. the callee.  */
2905 rtx addr
= XEXP (XEXP (pat
, 0), 0);
/* NOTE(review): the statement guarded by this test is not visible in this
   excerpt; presumably a callee that is not a SYMBOL_REF is rejected here
   -- confirm.  */
2906 if (GET_CODE (addr
) != SYMBOL_REF
)
2908 const char *name
= XSTR (addr
, 0);
2909 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2910 references with forced assembler name refer to PTX syscalls. For vprintf,
2911 accept both normal and forced-assembler-name references. */
2912 return (!strcmp (name
, "vprintf") || !strcmp (name
, "*vprintf")
2913 || !strcmp (name
, "*malloc")
2914 || !strcmp (name
, "*free"));
2917 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2918 propagate its value from lane MASTER to current lane.
   Anything that is not a SET with a register destination is left
   untouched.  */
2921 nvptx_unisimt_handle_set (rtx set
, rtx_insn
*insn
, rtx master
)
/* REG is assigned inside the condition (its declaration is on a line not
   visible in this excerpt).  The shuffle uses REG as both its first and
   second argument, is of kind SHUFFLE_IDX with MASTER as the lane
   operand, and is emitted immediately after INSN.  */
2924 if (GET_CODE (set
) == SET
&& REG_P (reg
= SET_DEST (set
)))
2925 emit_insn_after (nvptx_gen_shuffle (reg
, reg
, master
, SHUFFLE_IDX
), insn
);
2928 /* Adjust code for uniform-simt code generation variant by making atomics and
2929 "syscalls" conditionally executed, and inserting shuffle-based propagation
2930 for registers being set.
   Walks every insn of the current function once.  */
2933 nvptx_reorg_uniform_simt ()
2935 rtx_insn
*insn
, *next
;
/* NEXT is captured before INSN is processed, so insns emitted after INSN
   by the body are not themselves visited.  */
2937 for (insn
= get_insns (); insn
; insn
= next
)
2939 next
= NEXT_INSN (insn
);
/* The test is true for insns that are neither a call recognized by
   nvptx_call_insn_is_syscall_p nor a non-jump insn with a PARALLEL
   pattern and a nonzero "atomic" attribute.
   NOTE(review): the statement it controls is not visible in this excerpt;
   presumably such insns are skipped -- confirm.  */
2940 if (!(CALL_P (insn
) && nvptx_call_insn_is_syscall_p (insn
))
2941 && !(NONJUMP_INSN_P (insn
)
2942 && GET_CODE (PATTERN (insn
)) == PARALLEL
2943 && get_attr_atomic (insn
)))
2945 rtx pat
= PATTERN (insn
);
2946 rtx master
= nvptx_get_unisimt_master ();
/* Give each element of the PARALLEL a chance to have its register result
   propagated from the master lane.  */
2947 for (int i
= 0; i
< XVECLEN (pat
, 0); i
++)
2948 nvptx_unisimt_handle_set (XVECEXP (pat
, 0, i
), insn
, master
);
/* Wrap the original pattern in a COND_EXEC whose condition is
   "predicate register != 0", then install it as INSN's pattern.  */
2949 rtx pred
= nvptx_get_unisimt_predicate ();
2950 pred
= gen_rtx_NE (BImode
, pred
, const0_rtx
);
2951 pat
= gen_rtx_COND_EXEC (VOIDmode
, pred
, pat
);
2952 validate_change (insn
, &PATTERN (insn
), pat
, false);
2956 /* Offloading function attributes. */
2958 struct offload_attrs
2966 /* Define entries for cfun->machine->axis_dim. */
2968 #define MACH_VECTOR_LENGTH 0
2969 #define MACH_MAX_WORKERS 1
2971 static void populate_offload_attrs (offload_attrs
*oa
);
/* Fill in cfun->machine->axis_dim from the offload attributes obtained
   via populate_offload_attrs, and record that this has been done.  OA and
   MAX_WORKERS are declared on lines not visible in this excerpt.  */
2974 init_axis_dim (void)
2979 populate_offload_attrs (&oa
);
/* A worker count of zero: derive the count as PTX_CTA_SIZE divided by
   the vector length.  */
2981 if (oa
.num_workers
== 0)
2982 max_workers
= PTX_CTA_SIZE
/ oa
.vector_length
;
/* NOTE(review): the `else' introducing this assignment is not visible in
   this excerpt; presumably it is the nonzero-num_workers arm -- confirm.  */
2984 max_workers
= oa
.num_workers
;
/* Store both values under their MACH_* indices and set the flag that
   the nvptx_mach_* accessors test.  */
2986 cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
] = oa
.vector_length
;
2987 cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
] = max_workers
;
2988 cfun
->machine
->axis_dim_init_p
= true;
/* Return the MACH_MAX_WORKERS entry of cfun->machine->axis_dim.  */
2991 static int ATTRIBUTE_UNUSED
2992 nvptx_mach_max_workers ()
/* NOTE(review): the statement guarded by this test is not visible in this
   excerpt; presumably it fills in axis_dim first (init_axis_dim is what
   sets axis_dim_init_p) -- confirm.  */
2994 if (!cfun
->machine
->axis_dim_init_p
)
2996 return cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
];
/* Return the MACH_VECTOR_LENGTH entry of cfun->machine->axis_dim.  */
2999 static int ATTRIBUTE_UNUSED
3000 nvptx_mach_vector_length ()
/* NOTE(review): the statement guarded by this test is not visible in this
   excerpt; presumably it fills in axis_dim first (init_axis_dim is what
   sets axis_dim_init_p) -- confirm.  */
3002 if (!cfun
->machine
->axis_dim_init_p
)
3004 return cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
];
3007 /* Loop structure of the function. The entire function is described as
a NULL loop. */
3012 /* Parent parallel. */
3015 /* Next sibling parallel. */
3018 /* First child parallel. */
3021 /* Partitioning mask of the parallel. */
3024 /* Partitioning used within inner parallels. */
3025 unsigned inner_mask
;
3027 /* Location of parallel forked and join. The forked is the first
3028 block in the parallel and the join is the first block after of
3030 basic_block forked_block
;
3031 basic_block join_block
;
3033 rtx_insn
*forked_insn
;
3034 rtx_insn
*join_insn
;
3036 rtx_insn
*fork_insn
;
3037 rtx_insn
*joining_insn
;
3039 /* Basic blocks in this parallel, but not in child parallels. The
3040 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3042 auto_vec
<basic_block
> blocks
;
3045 parallel (parallel
*parent
, unsigned mode
);
3049 /* Constructor links the new parallel into its parent's chain of
children. */
3052 parallel::parallel (parallel
*parent_
, unsigned mask_
)
3053 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
3055 forked_block
= join_block
= 0;
3056 forked_insn
= join_insn
= 0;
3057 fork_insn
= joining_insn
= 0;
3061 next
= parent
->inner
;
3062 parent
->inner
= this;
3066 parallel::~parallel ()
3072 /* Map of basic blocks to insns */
3073 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
3075 /* A tuple of an insn of interest and the BB in which it resides. */
3076 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
3077 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
3079 /* Split basic blocks such that each forked and join unspecs are at
3080 the start of their basic blocks. Thus afterwards each block will
3081 have a single partitioning mode. We also do the same for return
3082 insns, as they are executed by every thread. Return the
3083 partitioning mode of the function as a whole. Populate MAP with
3084 head and tail blocks. We also clear the BB visited flag, which is
3085 used when finding partitions. */
3088 nvptx_split_blocks (bb_insn_map_t
*map
)
3090 insn_bb_vec_t worklist
;
3094 /* Locate all the reorg instructions of interest. */
3095 FOR_ALL_BB_FN (block
, cfun
)
3097 bool seen_insn
= false;
3099 /* Clear visited flag, for use by parallel locator */
3100 block
->flags
&= ~BB_VISITED
;
3102 FOR_BB_INSNS (block
, insn
)
3106 switch (recog_memoized (insn
))
3111 case CODE_FOR_nvptx_forked
:
3112 case CODE_FOR_nvptx_join
:
3115 case CODE_FOR_return
:
3116 /* We also need to split just before return insns, as
3117 that insn needs executing by all threads, but the
3118 block it is in probably does not. */
3123 /* We've found an instruction that must be at the start of
3124 a block, but isn't. Add it to the worklist. */
3125 worklist
.safe_push (insn_bb_t (insn
, block
));
3127 /* It was already the first instruction. Just add it to
3129 map
->get_or_insert (block
) = insn
;
3134 /* Split blocks on the worklist. */
3137 basic_block remap
= 0;
3138 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
3140 if (remap
!= elt
->second
)
3142 block
= elt
->second
;
3146 /* Split block before insn. The insn is in the new block */
3147 edge e
= split_block (block
, PREV_INSN (elt
->first
));
3150 map
->get_or_insert (block
) = elt
->first
;
3154 /* Return true if MASK contains parallelism that requires shared
3155 memory to broadcast.
   That is the case when MASK includes the worker dimension, or when it
   includes the vector dimension and the function's vector length differs
   from PTX_WARP_SIZE.  */
3158 nvptx_needs_shared_bcast (unsigned mask
)
/* True when the GOMP_DIM_WORKER bit is set in MASK.  */
3160 bool worker
= mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
);
/* True when the GOMP_DIM_VECTOR bit is set and the vector length
   reported by nvptx_mach_vector_length is not PTX_WARP_SIZE.  */
3161 bool large_vector
= (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
3162 && nvptx_mach_vector_length () != PTX_WARP_SIZE
;
3164 return worker
|| large_vector
;
3167 /* BLOCK is a basic block containing a head or tail instruction.
3168 Locate the associated prehead or pretail instruction, which must be
3169 in the single predecessor block.
   EXPECTED is the insn code the located instruction must be recognized
   as; this is checked with an assertion.  */
3172 nvptx_discover_pre (basic_block block
, int expected
)
/* Insist on exactly one incoming edge and take its source block.  */
3174 gcc_assert (block
->preds
->length () == 1);
3175 basic_block pre_block
= (*block
->preds
)[0]->src
;
/* Step backwards from the end of PRE_BLOCK until something satisfying
   INSN_P is found.  The assertion in the loop body fires if the block
   head is reached while still looking.  PRE_INSN is declared on a line
   not visible in this excerpt.  */
3178 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
3179 pre_insn
= PREV_INSN (pre_insn
))
3180 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
/* NOTE(review): the return statement is not visible in this excerpt;
   presumably PRE_INSN is what is returned -- confirm.  */
3182 gcc_assert (recog_memoized (pre_insn
) == expected
);
3186 /* Dump this parallel and all its inner parallels.
   Output goes to dump_file.  DEPTH is passed as DEPTH + 1 when recursing
   into PAR->inner and unchanged when moving on to PAR->next.  */
3189 nvptx_dump_pars (parallel
*par
, unsigned depth
)
/* Header line.  The head and tail fields print the index of the forked
   and join blocks, or -1 when the corresponding block pointer is null.
   NOTE(review): the arguments for the first two conversions are on a
   line not visible in this excerpt.  */
3191 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
3193 par
->forked_block
? par
->forked_block
->index
: -1,
3194 par
->join_block
? par
->join_block
->index
: -1);
/* List the index of every block recorded directly in PAR->blocks.  */
3196 fprintf (dump_file
, " blocks:");
3199 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
3200 fprintf (dump_file
, " %d", block
->index
);
3201 fprintf (dump_file
, "\n");
/* NOTE(review): any guards on the two recursive calls below are not
   visible in this excerpt; presumably each is skipped when the pointer
   is null -- confirm.  */
3203 nvptx_dump_pars (par
->inner
, depth
+ 1);
3206 nvptx_dump_pars (par
->next
, depth
);
3209 /* If BLOCK contains a fork/join marker, process it to create or
3210 terminate a loop structure. Add this block to the current loop,
3211 and then walk successor blocks. */
3214 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
3216 if (block
->flags
& BB_VISITED
)
3218 block
->flags
|= BB_VISITED
;
3220 if (rtx_insn
**endp
= map
->get (block
))
3222 rtx_insn
*end
= *endp
;
3224 /* This is a block head or tail, or return instruction. */
3225 switch (recog_memoized (end
))
3227 case CODE_FOR_return
:
3228 /* Return instructions are in their own block, and we
3229 don't need to do anything more. */
3232 case CODE_FOR_nvptx_forked
:
3233 /* Loop head, create a new inner loop and add it into
3234 our parent's child list. */
3236 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3239 par
= new parallel (par
, mask
);
3240 par
->forked_block
= block
;
3241 par
->forked_insn
= end
;
3242 if (nvptx_needs_shared_bcast (mask
))
3244 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
3248 case CODE_FOR_nvptx_join
:
3249 /* A loop tail. Finish the current loop and return to
3252 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3254 gcc_assert (par
->mask
== mask
);
3255 gcc_assert (par
->join_block
== NULL
);
3256 par
->join_block
= block
;
3257 par
->join_insn
= end
;
3258 if (nvptx_needs_shared_bcast (mask
))
3260 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
3271 /* Add this block onto the current loop's list of blocks. */
3272 par
->blocks
.safe_push (block
);
3274 /* This must be the entry block. Create a NULL parallel. */
3275 par
= new parallel (0, 0);
3277 /* Walk successor blocks. */
3281 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3282 nvptx_find_par (map
, par
, e
->dest
);
3287 /* DFS walk the CFG looking for fork & join markers. Construct
3288 loop structures as we go. MAP is a mapping of basic blocks
3289 to head & tail markers, discovered when splitting blocks. This
3290 speeds up the discovery. We rely on the BB visited flag having
3291 been cleared when splitting blocks.
   The walk itself is performed by nvptx_find_par, started at the
   function's entry block with a null current parallel.  */
3294 nvptx_discover_pars (bb_insn_map_t
*map
)
3298 /* Mark exit blocks as visited. */
3299 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
3300 block
->flags
|= BB_VISITED
;
3302 /* And entry block as not. */
3303 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
3304 block
->flags
&= ~BB_VISITED
;
/* BLOCK is now the entry block; the second argument 0 means there is no
   enclosing parallel yet.  */
3306 parallel
*par
= nvptx_find_par (map
, 0, block
);
/* Dump the discovered structure, starting at depth 0.
   NOTE(review): the guard on this dump and the function's return
   statement are not visible in this excerpt; presumably the dump only
   happens when dump_file is set and PAR is returned -- confirm.  */
3310 fprintf (dump_file
, "\nLoops\n");
3311 nvptx_dump_pars (par
, 0);
3312 fprintf (dump_file
, "\n");
3318 /* Analyse a group of BBs within a partitioned region and create N
3319 Single-Entry-Single-Exit regions. Some of those regions will be
3320 trivial ones consisting of a single BB. The blocks of a
3321 partitioned region might form a set of disjoint graphs -- because
3322 the region encloses a differently partitioned sub region.
3324 We use the linear time algorithm described in 'Finding Regions Fast:
3325 Single Entry Single Exit and control Regions in Linear Time'
3326 Johnson, Pearson & Pingali. That algorithm deals with complete
3327 CFGs, where a back edge is inserted from END to START, and thus the
3328 problem becomes one of finding equivalent loops.
3330 In this case we have a partial CFG. We complete it by redirecting
3331 any incoming edge to the graph to be from an arbitrary external BB,
3332 and similarly redirecting any outgoing edge to be to that BB.
3333 Thus we end up with a closed graph.
3335 The algorithm works by building a spanning tree of an undirected
3336 graph and keeping track of back edges from nodes further from the
3337 root in the tree to nodes nearer to the root in the tree. In the
3338 description below, the root is up and the tree grows downwards.
3340 We avoid having to deal with degenerate back-edges to the same
3341 block, by splitting each BB into 3 -- one for input edges, one for
3342 the node itself and one for the output edges. Such back edges are
3343 referred to as 'Brackets'. Cycle equivalent nodes will have the
3344 same set of brackets.
3346 Determining bracket equivalency is done by maintaining a list of
3347 brackets in such a manner that the list length and final bracket
3348 uniquely identify the set.
3350 We use coloring to mark all BBs with cycle equivalency with the
3351 same color. This is the output of the 'Finding Regions Fast'
3352 algorithm. Notice it doesn't actually find the set of nodes within
3353 a particular region, just unordered sets of nodes that are the
3354 entries and exits of SESE regions.
3356 After determining cycle equivalency, we need to find the minimal
3357 set of SESE regions. Do this with a DFS coloring walk of the
3358 complete graph. We're either 'looking' or 'coloring'. When
3359 looking, and we're in the subgraph, we start coloring the color of
3360 the current node, and remember that node as the start of the
3361 current color's SESE region. Every time we go to a new node, we
3362 decrement the count of nodes with that color. If it reaches zero,
3363 we remember that node as the end of the current color's SESE region
3364 and return to 'looking'. Otherwise we color the node the current
3367 This way we end up with coloring the inside of non-trivial SESE
3368 regions with the color of that region. */
3370 /* A pair of BBs. We use this to represent SESE regions. */
3371 typedef std::pair
<basic_block
, basic_block
> bb_pair_t
;
3372 typedef auto_vec
<bb_pair_t
> bb_pair_vec_t
;
3374 /* A node in the undirected CFG. The discriminator SECOND indicates just
3375 above or just below the BB indicated by FIRST. */
3376 typedef std::pair
<basic_block
, int> pseudo_node_t
;
3378 /* A bracket indicates an edge towards the root of the spanning tree of the
3379 undirected graph. Each bracket has a color, determined
3380 from the current set of brackets. */
3383 pseudo_node_t back
; /* Back target */
3385 /* Current color and size of set. */
3389 bracket (pseudo_node_t back_
)
3390 : back (back_
), color (~0u), size (~0u)
3394 unsigned get_color (auto_vec
<unsigned> &color_counts
, unsigned length
)
3399 color
= color_counts
.length ();
3400 color_counts
.quick_push (0);
3402 color_counts
[color
]++;
3407 typedef auto_vec
<bracket
> bracket_vec_t
;
3409 /* Basic block info for finding SESE regions. */
3413 int node
; /* Node number in spanning tree. */
3414 int parent
; /* Parent node number. */
3416 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3417 edges arrive at pseudo-node Ai and the outgoing edges leave at
3418 pseudo-node Ao. We have to remember which way we arrived at a
3419 particular node when generating the spanning tree. dir > 0 means
3420 we arrived at Ai, dir < 0 means we arrived at Ao. */
3423 /* Lowest numbered pseudo-node reached via a backedge from this
3424 node, or any descendant. */
3427 int color
; /* Cycle-equivalence color */
3429 /* Stack of brackets for this node. */
3430 bracket_vec_t brackets
;
3432 bb_sese (unsigned node_
, unsigned p
, int dir_
)
3433 :node (node_
), parent (p
), dir (dir_
)
3438 /* Push a bracket ending at BACK. */
3439 void push (const pseudo_node_t
&back
)
3442 fprintf (dump_file
, "Pushing backedge %d:%+d\n",
3443 back
.first
? back
.first
->index
: 0, back
.second
);
3444 brackets
.safe_push (bracket (back
));
3447 void append (bb_sese
*child
);
3448 void remove (const pseudo_node_t
&);
3450 /* Set node's color. */
3451 void set_color (auto_vec
<unsigned> &color_counts
)
3453 color
= brackets
.last ().get_color (color_counts
, brackets
.length ());
3457 bb_sese::~bb_sese ()
3461 /* Destructively append CHILD's brackets.
   Nothing is done when CHILD has no brackets.  */
3464 bb_sese::append (bb_sese
*child
)
/* LEN is CHILD's bracket count; the body only runs when it is nonzero.  */
3466 if (int len
= child
->brackets
.length ())
/* Trace each of CHILD's brackets.
   NOTE(review): the guard around this tracing loop and the last fprintf
   argument are on lines not visible in this excerpt; presumably it only
   runs when dump_file is set -- confirm.  */
3472 for (ix
= 0; ix
< len
; ix
++)
3474 const pseudo_node_t
&pseudo
= child
->brackets
[ix
].back
;
3475 fprintf (dump_file
, "Appending (%d)'s backedge %d:%+d\n",
3476 child
->node
, pseudo
.first
? pseudo
.first
->index
: 0,
/* When this node has no brackets of its own, take over CHILD's vector
   wholesale by swapping instead of copying element by element.  */
3480 if (!brackets
.length ())
3481 std::swap (brackets
, child
->brackets
);
/* Otherwise reserve room for LEN more entries and copy CHILD's brackets
   onto the end in order.
   NOTE(review): the `else' joining this to the test above is not visible
   in this excerpt -- confirm.  */
3484 brackets
.reserve (len
);
3485 for (ix
= 0; ix
< len
; ix
++)
3486 brackets
.quick_push (child
->brackets
[ix
]);
3491 /* Remove brackets that terminate at PSEUDO.
   A bracket terminates at PSEUDO when its BACK member compares equal to
   it.  */
3494 bb_sese::remove (const pseudo_node_t
&pseudo
)
/* REMOVED is used below as the distance by which surviving entries are
   shifted towards the front of the vector.  */
3496 unsigned removed
= 0;
3497 int len
= brackets
.length ();
3499 for (int ix
= 0; ix
< len
; ix
++)
3501 if (brackets
[ix
].back
== pseudo
)
/* NOTE(review): the lines that guard this trace output and that update
   REMOVED are not visible in this excerpt; presumably REMOVED is
   incremented for each match -- confirm.  */
3504 fprintf (dump_file
, "Removing backedge %d:%+d\n",
3505 pseudo
.first
? pseudo
.first
->index
: 0, pseudo
.second
);
/* Slide the entry at IX down by REMOVED places.
   NOTE(review): the condition selecting this branch and any final
   shrinking of the vector are not visible in this excerpt -- confirm.  */
3509 brackets
[ix
-removed
] = brackets
[ix
];
3515 /* Accessors for BB's aux pointer. */
3516 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3517 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3519 /* DFS walk creating SESE data structures. Only cover nodes with
3520 BB_VISITED set. Append discovered blocks to LIST. We number in
3521 increments of 3 so that the above and below pseudo nodes can be
3522 implicitly numbered too. */
3525 nvptx_sese_number (int n
, int p
, int dir
, basic_block b
,
3526 auto_vec
<basic_block
> *list
)
3528 if (BB_GET_SESE (b
))
3532 fprintf (dump_file
, "Block %d(%d), parent (%d), orientation %+d\n",
3533 b
->index
, n
, p
, dir
);
3535 BB_SET_SESE (b
, new bb_sese (n
, p
, dir
));
3539 list
->quick_push (b
);
3541 /* First walk the nodes on the 'other side' of this node, then walk
3542 the nodes on the same side. */
3543 for (unsigned ix
= 2; ix
; ix
--)
3545 vec
<edge
, va_gc
> *edges
= dir
> 0 ? b
->succs
: b
->preds
;
3546 size_t offset
= (dir
> 0 ? offsetof (edge_def
, dest
)
3547 : offsetof (edge_def
, src
));
3551 FOR_EACH_EDGE (e
, ei
, edges
)
3553 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3555 if (target
->flags
& BB_VISITED
)
3556 n
= nvptx_sese_number (n
, p
, dir
, target
, list
);
3563 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3564 EDGES are the outgoing edges and OFFSET is the offset to the src
3565 or dst block on the edges. */
3568 nvptx_sese_pseudo (basic_block me
, bb_sese
*sese
, int depth
, int dir
,
3569 vec
<edge
, va_gc
> *edges
, size_t offset
)
3573 int hi_back
= depth
;
3574 pseudo_node_t
node_back (0, depth
);
3575 int hi_child
= depth
;
3576 pseudo_node_t
node_child (0, depth
);
3577 basic_block child
= NULL
;
3578 unsigned num_children
= 0;
3579 int usd
= -dir
* sese
->dir
;
3582 fprintf (dump_file
, "\nProcessing %d(%d) %+d\n",
3583 me
->index
, sese
->node
, dir
);
3587 /* This is the above pseudo-child. It has the BB itself as an
3588 additional child node. */
3589 node_child
= sese
->high
;
3590 hi_child
= node_child
.second
;
3591 if (node_child
.first
)
3592 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3596 /* Examine each edge.
3597 - if it is a child (a) append its bracket list and (b) record
3598 whether it is the child with the highest reaching bracket.
3599 - if it is an edge to ancestor, record whether it's the highest
3600 reaching backlink. */
3601 FOR_EACH_EDGE (e
, ei
, edges
)
3603 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3605 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3607 if (t_sese
->parent
== sese
->node
&& !(t_sese
->dir
+ usd
))
3609 /* Child node. Append its bracket list. */
3611 sese
->append (t_sese
);
3613 /* Compare its hi value. */
3614 int t_hi
= t_sese
->high
.second
;
3616 if (basic_block child_hi_block
= t_sese
->high
.first
)
3617 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3619 if (hi_child
> t_hi
)
3622 node_child
= t_sese
->high
;
3626 else if (t_sese
->node
< sese
->node
+ dir
3627 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3629 /* Non-parental ancestor node -- a backlink. */
3630 int d
= usd
* t_sese
->dir
;
3631 int back
= t_sese
->node
+ d
;
3636 node_back
= pseudo_node_t (target
, d
);
3641 { /* Fallen off graph, backlink to entry node. */
3643 node_back
= pseudo_node_t (0, 0);
3647 /* Remove any brackets that terminate at this pseudo node. */
3648 sese
->remove (pseudo_node_t (me
, dir
));
3650 /* Now push any backlinks from this pseudo node. */
3651 FOR_EACH_EDGE (e
, ei
, edges
)
3653 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3654 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3656 if (t_sese
->node
< sese
->node
+ dir
3657 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3658 /* Non-parental ancestor node - backedge from me. */
3659 sese
->push (pseudo_node_t (target
, usd
* t_sese
->dir
));
3663 /* back edge to entry node */
3664 sese
->push (pseudo_node_t (0, 0));
3668 /* If this node leads directly or indirectly to a no-return region of
3669 the graph, then fake a backedge to entry node. */
3670 if (!sese
->brackets
.length () || !edges
|| !edges
->length ())
3673 node_back
= pseudo_node_t (0, 0);
3674 sese
->push (node_back
);
3677 /* Record the highest reaching backedge from us or a descendant. */
3678 sese
->high
= hi_back
< hi_child
? node_back
: node_child
;
3680 if (num_children
> 1)
3682 /* There is more than one child -- this is a Y shaped piece of
3683 spanning tree. We have to insert a fake backedge from this
3684 node to the highest ancestor reached by not-the-highest
3685 reaching child. Note that there may be multiple children
3686 with backedges to the same highest node. That's ok and we
3687 insert the edge to that highest node. */
3689 if (dir
< 0 && child
)
3691 node_child
= sese
->high
;
3692 hi_child
= node_child
.second
;
3693 if (node_child
.first
)
3694 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3697 FOR_EACH_EDGE (e
, ei
, edges
)
3699 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3701 if (target
== child
)
3702 /* Ignore the highest child. */
3705 bb_sese
*t_sese
= BB_GET_SESE (target
);
3708 if (t_sese
->parent
!= sese
->node
)
3712 /* Compare its hi value. */
3713 int t_hi
= t_sese
->high
.second
;
3715 if (basic_block child_hi_block
= t_sese
->high
.first
)
3716 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3718 if (hi_child
> t_hi
)
3721 node_child
= t_sese
->high
;
3725 sese
->push (node_child
);
3730 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3731 proceed to successors. Set SESE entry and exit nodes of
3735 nvptx_sese_color (auto_vec
<unsigned> &color_counts
, bb_pair_vec_t
&regions
,
3736 basic_block block
, int coloring
)
3738 bb_sese
*sese
= BB_GET_SESE (block
);
3740 if (block
->flags
& BB_VISITED
)
3742 /* If we've already encountered this block, either we must not
3743 be coloring, or it must have been colored the current color. */
3744 gcc_assert (coloring
< 0 || (sese
&& coloring
== sese
->color
));
3748 block
->flags
|= BB_VISITED
;
3754 /* Start coloring a region. */
3755 regions
[sese
->color
].first
= block
;
3756 coloring
= sese
->color
;
3759 if (!--color_counts
[sese
->color
] && sese
->color
== coloring
)
3761 /* Found final block of SESE region. */
3762 regions
[sese
->color
].second
= block
;
3766 /* Color the node, so we can assert on revisiting the node
3767 that the graph is indeed SESE. */
3768 sese
->color
= coloring
;
3771 /* Fallen off the subgraph, we cannot be coloring. */
3772 gcc_assert (coloring
< 0);
3774 /* Walk each successor block. */
3775 if (block
->succs
&& block
->succs
->length ())
3780 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3781 nvptx_sese_color (color_counts
, regions
, e
->dest
, coloring
);
3784 gcc_assert (coloring
< 0);
3787 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3788 end up with NULL entries in it. */
3791 nvptx_find_sese (auto_vec
<basic_block
> &blocks
, bb_pair_vec_t
&regions
)
3796 /* First clear each BB of the whole function. */
3797 FOR_ALL_BB_FN (block
, cfun
)
3799 block
->flags
&= ~BB_VISITED
;
3800 BB_SET_SESE (block
, 0);
3803 /* Mark blocks in the function that are in this graph. */
3804 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3805 block
->flags
|= BB_VISITED
;
3807 /* Counts of nodes assigned to each color. There cannot be more
3808 colors than blocks (and hopefully there will be fewer). */
3809 auto_vec
<unsigned> color_counts
;
3810 color_counts
.reserve (blocks
.length ());
3812 /* Worklist of nodes in the spanning tree. Again, there cannot be
3813 more nodes in the tree than blocks (there will be fewer if the
3814 CFG of blocks is disjoint). */
3815 auto_vec
<basic_block
> spanlist
;
3816 spanlist
.reserve (blocks
.length ());
3818 /* Make sure every block has its cycle class determined. */
3819 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3821 if (BB_GET_SESE (block
))
3822 /* We already met this block in an earlier graph solve. */
3826 fprintf (dump_file
, "Searching graph starting at %d\n", block
->index
);
3828 /* Number the nodes reachable from block in initial DFS order. */
3829 int depth
= nvptx_sese_number (2, 0, +1, block
, &spanlist
);
3831 /* Now walk in reverse DFS order to find cycle equivalents. */
3832 while (spanlist
.length ())
3834 block
= spanlist
.pop ();
3835 bb_sese
*sese
= BB_GET_SESE (block
);
3837 /* Do the pseudo node below. */
3838 nvptx_sese_pseudo (block
, sese
, depth
, +1,
3839 sese
->dir
> 0 ? block
->succs
: block
->preds
,
3840 (sese
->dir
> 0 ? offsetof (edge_def
, dest
)
3841 : offsetof (edge_def
, src
)));
3842 sese
->set_color (color_counts
);
3843 /* Do the pseudo node above. */
3844 nvptx_sese_pseudo (block
, sese
, depth
, -1,
3845 sese
->dir
< 0 ? block
->succs
: block
->preds
,
3846 (sese
->dir
< 0 ? offsetof (edge_def
, dest
)
3847 : offsetof (edge_def
, src
)));
3850 fprintf (dump_file
, "\n");
3856 const char *comma
= "";
3858 fprintf (dump_file
, "Found %d cycle equivalents\n",
3859 color_counts
.length ());
3860 for (ix
= 0; color_counts
.iterate (ix
, &count
); ix
++)
3862 fprintf (dump_file
, "%s%d[%d]={", comma
, ix
, count
);
3865 for (unsigned jx
= 0; blocks
.iterate (jx
, &block
); jx
++)
3866 if (BB_GET_SESE (block
)->color
== ix
)
3868 block
->flags
|= BB_VISITED
;
3869 fprintf (dump_file
, "%s%d", comma
, block
->index
);
3872 fprintf (dump_file
, "}");
3875 fprintf (dump_file
, "\n");
3878 /* Now we've colored every block in the subgraph. We now need to
3879 determine the minimal set of SESE regions that cover that
3880 subgraph. Do this with a DFS walk of the complete function.
3881 During the walk we're either 'looking' or 'coloring'. When we
3882 reach the last node of a particular color, we stop coloring and
3883 return to looking. */
3885 /* There cannot be more SESE regions than colors. */
3886 regions
.reserve (color_counts
.length ());
3887 for (ix
= color_counts
.length (); ix
--;)
3888 regions
.quick_push (bb_pair_t (0, 0));
3890 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3891 block
->flags
&= ~BB_VISITED
;
3893 nvptx_sese_color (color_counts
, regions
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), -1);
3897 const char *comma
= "";
3898 int len
= regions
.length ();
3900 fprintf (dump_file
, "SESE regions:");
3901 for (ix
= 0; ix
!= len
; ix
++)
3903 basic_block from
= regions
[ix
].first
;
3904 basic_block to
= regions
[ix
].second
;
3908 fprintf (dump_file
, "%s %d{%d", comma
, ix
, from
->index
);
3910 fprintf (dump_file
, "->%d", to
->index
);
3912 int color
= BB_GET_SESE (from
)->color
;
3914 /* Print the blocks within the region (excluding ends). */
3915 FOR_EACH_BB_FN (block
, cfun
)
3917 bb_sese
*sese
= BB_GET_SESE (block
);
3919 if (sese
&& sese
->color
== color
3920 && block
!= from
&& block
!= to
)
3921 fprintf (dump_file
, ".%d", block
->index
);
3923 fprintf (dump_file
, "}");
3927 fprintf (dump_file
, "\n\n");
3930 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3931 delete BB_GET_SESE (block
);
3937 /* Propagate live state at the start of a partitioned region. IS_CALL
3938 indicates whether the propagation is for a (partitioned) call
3939 instruction. BLOCK provides the live register information, and
3940 might not contain INSN. Propagation is inserted just after INSN. RW
3941 indicates whether we are reading and/or writing state. This
3942 separation is needed for worker-level propagation where we
3943 essentially do a spill & fill. FN is the underlying worker
3944 function to generate the propagation instructions for a single
3945 register. DATA is user data.
3947 Returns true if we didn't emit any instructions.
3949 We propagate the live register set for non-calls and the entire
3950 frame for calls and non-calls. We could do better by (a)
3951 propagating just the live set that is used within the partitioned
3952 regions and (b) only propagating stack entries that are used. The
3953 latter might be quite hard to determine. */
3955 typedef rtx (*propagator_fn
) (rtx
, propagate_mask
, unsigned, void *, bool);
3958 nvptx_propagate (bool is_call
, basic_block block
, rtx_insn
*insn
,
3959 propagate_mask rw
, propagator_fn fn
, void *data
, bool vector
)
3961 bitmap live
= DF_LIVE_IN (block
);
3962 bitmap_iterator iterator
;
3966 /* Copy the frame array. */
3967 HOST_WIDE_INT fs
= get_frame_size ();
3970 rtx tmp
= gen_reg_rtx (DImode
);
3972 rtx ptr
= gen_reg_rtx (Pmode
);
3973 rtx pred
= NULL_RTX
;
3974 rtx_code_label
*label
= NULL
;
3977 /* The frame size might not be DImode compatible, but the frame
3978 array's declaration will be. So it's ok to round up here. */
3979 fs
= (fs
+ GET_MODE_SIZE (DImode
) - 1) / GET_MODE_SIZE (DImode
);
3980 /* Detect single iteration loop. */
3985 emit_insn (gen_rtx_SET (ptr
, frame_pointer_rtx
));
3988 idx
= gen_reg_rtx (SImode
);
3989 pred
= gen_reg_rtx (BImode
);
3990 label
= gen_label_rtx ();
3992 emit_insn (gen_rtx_SET (idx
, GEN_INT (fs
)));
3993 /* Allow worker function to initialize anything needed. */
3994 rtx init
= fn (tmp
, PM_loop_begin
, fs
, data
, vector
);
3998 LABEL_NUSES (label
)++;
3999 emit_insn (gen_addsi3 (idx
, idx
, GEN_INT (-1)));
4002 emit_insn (gen_rtx_SET (tmp
, gen_rtx_MEM (DImode
, ptr
)));
4003 emit_insn (fn (tmp
, rw
, fs
, data
, vector
));
4005 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode
, ptr
), tmp
));
4008 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, idx
, const0_rtx
)));
4009 emit_insn (gen_adddi3 (ptr
, ptr
, GEN_INT (GET_MODE_SIZE (DImode
))));
4010 emit_insn (gen_br_true_uni (pred
, label
));
4011 rtx fini
= fn (tmp
, PM_loop_end
, fs
, data
, vector
);
4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx
), idx
));
4016 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp
), tmp
));
4017 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr
), ptr
));
4018 rtx cpy
= get_insns ();
4020 insn
= emit_insn_after (cpy
, insn
);
4024 /* Copy live registers. */
4025 EXECUTE_IF_SET_IN_BITMAP (live
, 0, ix
, iterator
)
4027 rtx reg
= regno_reg_rtx
[ix
];
4029 if (REGNO (reg
) >= FIRST_PSEUDO_REGISTER
)
4031 rtx bcast
= fn (reg
, rw
, 0, data
, vector
);
4033 insn
= emit_insn_after (bcast
, insn
);
4040 /* Worker for nvptx_warp_propagate. */
4043 warp_prop_gen (rtx reg
, propagate_mask pm
,
4044 unsigned ARG_UNUSED (count
), void *ARG_UNUSED (data
),
4045 bool ARG_UNUSED (vector
))
4047 if (!(pm
& PM_read_write
))
4050 return nvptx_gen_warp_bcast (reg
);
4053 /* Propagate state that is live at start of BLOCK across the vectors
4054 of a single warp. Propagation is inserted just after INSN.
4055 IS_CALL and return as for nvptx_propagate. */
4058 nvptx_warp_propagate (bool is_call
, basic_block block
, rtx_insn
*insn
)
4060 return nvptx_propagate (is_call
, block
, insn
, PM_read_write
,
4061 warp_prop_gen
, 0, false);
4064 /* Worker for nvptx_shared_propagate. */
4067 shared_prop_gen (rtx reg
, propagate_mask pm
, unsigned rep
, void *data_
,
4070 broadcast_data_t
*data
= (broadcast_data_t
*)data_
;
4072 if (pm
& PM_loop_begin
)
4074 /* Starting a loop, initialize pointer. */
4075 unsigned align
= GET_MODE_ALIGNMENT (GET_MODE (reg
)) / BITS_PER_UNIT
;
4077 oacc_bcast_align
= MAX (oacc_bcast_align
, align
);
4078 data
->offset
= ROUND_UP (data
->offset
, align
);
4080 data
->ptr
= gen_reg_rtx (Pmode
);
4082 return gen_adddi3 (data
->ptr
, data
->base
, GEN_INT (data
->offset
));
4084 else if (pm
& PM_loop_end
)
4086 rtx clobber
= gen_rtx_CLOBBER (GET_MODE (data
->ptr
), data
->ptr
);
4087 data
->ptr
= NULL_RTX
;
4091 return nvptx_gen_shared_bcast (reg
, pm
, rep
, data
, vector
);
4094 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4095 indicates if this is just before partitioned mode (do spill), or
4096 just after it starts (do fill). Sequence is inserted just after
4097 INSN. IS_CALL and return as for nvptx_propagate. */
4100 nvptx_shared_propagate (bool pre_p
, bool is_call
, basic_block block
,
4101 rtx_insn
*insn
, bool vector
)
4103 broadcast_data_t data
;
4105 data
.base
= gen_reg_rtx (Pmode
);
4107 data
.ptr
= NULL_RTX
;
4109 bool empty
= nvptx_propagate (is_call
, block
, insn
,
4110 pre_p
? PM_read
: PM_write
, shared_prop_gen
,
4112 gcc_assert (empty
== !data
.offset
);
4115 rtx bcast_sym
= oacc_bcast_sym
;
4117 /* Stuff was emitted, initialize the base pointer now. */
4118 if (vector
&& nvptx_mach_max_workers () > 1)
4120 if (!cfun
->machine
->bcast_partition
)
4122 /* It would be nice to place this register in
4123 DATA_AREA_SHARED. */
4124 cfun
->machine
->bcast_partition
= gen_reg_rtx (DImode
);
4126 if (!cfun
->machine
->sync_bar
)
4127 cfun
->machine
->sync_bar
= gen_reg_rtx (SImode
);
4129 bcast_sym
= cfun
->machine
->bcast_partition
;
4132 rtx init
= gen_rtx_SET (data
.base
, bcast_sym
);
4133 emit_insn_after (init
, insn
);
4135 unsigned int psize
= ROUND_UP (data
.offset
, oacc_bcast_align
);
4136 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
4137 ? nvptx_mach_max_workers () + 1
4140 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
4141 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
4146 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4147 which is an integer or a register. THREADS is the number of threads
4148 controlled by the barrier. */
4151 nvptx_cta_sync (rtx lock
, int threads
)
4153 return gen_nvptx_barsync (lock
, GEN_INT (threads
));
4156 #if WORKAROUND_PTXJIT_BUG
4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4161 bb_first_real_insn (basic_block bb
)
4165 /* Find first real insn of BB. */
4166 FOR_BB_INSNS (bb
, insn
)
4174 /* Return true if INSN needs neutering. */
4177 needs_neutering_p (rtx_insn
*insn
)
4182 switch (recog_memoized (insn
))
4184 case CODE_FOR_nvptx_fork
:
4185 case CODE_FOR_nvptx_forked
:
4186 case CODE_FOR_nvptx_joining
:
4187 case CODE_FOR_nvptx_join
:
4188 case CODE_FOR_nvptx_barsync
:
4195 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4198 verify_neutering_jumps (basic_block from
,
4199 rtx_insn
*vector_jump
, rtx_insn
*worker_jump
,
4200 rtx_insn
*vector_label
, rtx_insn
*worker_label
)
4202 basic_block bb
= from
;
4203 rtx_insn
*insn
= BB_HEAD (bb
);
4204 bool seen_worker_jump
= false;
4205 bool seen_vector_jump
= false;
4206 bool seen_worker_label
= false;
4207 bool seen_vector_label
= false;
4208 bool worker_neutered
= false;
4209 bool vector_neutered
= false;
4212 if (insn
== worker_jump
)
4214 seen_worker_jump
= true;
4215 worker_neutered
= true;
4216 gcc_assert (!vector_neutered
);
4218 else if (insn
== vector_jump
)
4220 seen_vector_jump
= true;
4221 vector_neutered
= true;
4223 else if (insn
== worker_label
)
4225 seen_worker_label
= true;
4226 gcc_assert (worker_neutered
);
4227 worker_neutered
= false;
4229 else if (insn
== vector_label
)
4231 seen_vector_label
= true;
4232 gcc_assert (vector_neutered
);
4233 vector_neutered
= false;
4235 else if (INSN_P (insn
))
4236 switch (recog_memoized (insn
))
4238 case CODE_FOR_nvptx_barsync
:
4239 gcc_assert (!vector_neutered
&& !worker_neutered
);
4245 if (insn
!= BB_END (bb
))
4246 insn
= NEXT_INSN (insn
);
4247 else if (JUMP_P (insn
) && single_succ_p (bb
)
4248 && !seen_vector_jump
&& !seen_worker_jump
)
4250 bb
= single_succ (bb
);
4251 insn
= BB_HEAD (bb
);
4257 gcc_assert (!(vector_jump
&& !seen_vector_jump
));
4258 gcc_assert (!(worker_jump
&& !seen_worker_jump
));
4260 if (seen_vector_label
|| seen_worker_label
)
4262 gcc_assert (!(vector_label
&& !seen_vector_label
));
4263 gcc_assert (!(worker_label
&& !seen_worker_label
));
4271 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4274 verify_neutering_labels (basic_block to
, rtx_insn
*vector_label
,
4275 rtx_insn
*worker_label
)
4277 basic_block bb
= to
;
4278 rtx_insn
*insn
= BB_END (bb
);
4279 bool seen_worker_label
= false;
4280 bool seen_vector_label
= false;
4283 if (insn
== worker_label
)
4285 seen_worker_label
= true;
4286 gcc_assert (!seen_vector_label
);
4288 else if (insn
== vector_label
)
4289 seen_vector_label
= true;
4290 else if (INSN_P (insn
))
4291 switch (recog_memoized (insn
))
4293 case CODE_FOR_nvptx_barsync
:
4294 gcc_assert (!seen_vector_label
&& !seen_worker_label
);
4298 if (insn
!= BB_HEAD (bb
))
4299 insn
= PREV_INSN (insn
);
4304 gcc_assert (!(vector_label
&& !seen_vector_label
));
4305 gcc_assert (!(worker_label
&& !seen_worker_label
));
4308 /* Single neutering according to MASK. FROM is the incoming block and
4309 TO is the outgoing block. These may be the same block. Insert at
4312 if (tid.<axis>) goto end.
4314 and insert before ending branch of TO (if there is such an insn):
4317 <possibly-broadcast-cond>
4320 We currently only use different FROM and TO when skipping an entire
4321 loop. We could do more if we detected superblocks. */
4324 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
4326 rtx_insn
*head
= BB_HEAD (from
);
4327 rtx_insn
*tail
= BB_END (to
);
4328 unsigned skip_mask
= mask
;
4332 /* Find first insn of from block. */
4333 while (head
!= BB_END (from
) && !needs_neutering_p (head
))
4334 head
= NEXT_INSN (head
);
4339 if (!(JUMP_P (head
) && single_succ_p (from
)))
4342 basic_block jump_target
= single_succ (from
);
4343 if (!single_pred_p (jump_target
))
4347 head
= BB_HEAD (from
);
4350 /* Find last insn of to block */
4351 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
4352 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
4353 tail
= PREV_INSN (tail
);
4355 /* Detect if tail is a branch. */
4356 rtx tail_branch
= NULL_RTX
;
4357 rtx cond_branch
= NULL_RTX
;
4358 if (tail
&& INSN_P (tail
))
4360 tail_branch
= PATTERN (tail
);
4361 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4362 tail_branch
= NULL_RTX
;
4365 cond_branch
= SET_SRC (tail_branch
);
4366 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4367 cond_branch
= NULL_RTX
;
4373 /* If this is empty, do nothing. */
4374 if (!head
|| !needs_neutering_p (head
))
4379 /* If we're only doing vector single, there's no need to
4380 emit skip code because we'll not insert anything. */
4381 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4384 else if (tail_branch
)
4385 /* Block with only unconditional branch. Nothing to do. */
4389 /* Insert the vector test inside the worker test. */
4391 rtx_insn
*before
= tail
;
4392 rtx_insn
*neuter_start
= NULL
;
4393 rtx_insn
*worker_label
= NULL
, *vector_label
= NULL
;
4394 rtx_insn
*worker_jump
= NULL
, *vector_jump
= NULL
;
4395 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4396 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4398 rtx_code_label
*label
= gen_label_rtx ();
4399 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4400 rtx_insn
**mode_jump
4401 = mode
== GOMP_DIM_VECTOR
? &vector_jump
: &worker_jump
;
4402 rtx_insn
**mode_label
4403 = mode
== GOMP_DIM_VECTOR
? &vector_label
: &worker_label
;
4407 pred
= gen_reg_rtx (BImode
);
4408 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4412 if (mode
== GOMP_DIM_VECTOR
)
4413 br
= gen_br_true (pred
, label
);
4415 br
= gen_br_true_uni (pred
, label
);
4417 neuter_start
= emit_insn_after (br
, neuter_start
);
4419 neuter_start
= emit_insn_before (br
, head
);
4420 *mode_jump
= neuter_start
;
4422 LABEL_NUSES (label
)++;
4423 rtx_insn
*label_insn
;
4426 label_insn
= emit_label_before (label
, before
);
4427 before
= label_insn
;
4431 label_insn
= emit_label_after (label
, tail
);
4432 if ((mode
== GOMP_DIM_VECTOR
|| mode
== GOMP_DIM_WORKER
)
4433 && CALL_P (tail
) && find_reg_note (tail
, REG_NORETURN
, NULL
))
4434 emit_insn_after (gen_exit (), label_insn
);
4437 *mode_label
= label_insn
;
4440 /* Now deal with propagating the branch condition. */
4443 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4445 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
4446 && nvptx_mach_vector_length () == PTX_WARP_SIZE
)
4448 /* Vector mode only, do a shuffle. */
4449 #if WORKAROUND_PTXJIT_BUG
4450 /* The branch condition %rcond is propagated like this:
4455 setp.ne.u32 %rnotvzero,%x,0;
4458 @%rnotvzero bra Lskip;
4459 setp.<op>.<type> %rcond,op1,op2;
4461 selp.u32 %rcondu32,1,0,%rcond;
4462 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4463 setp.ne.u32 %rcond,%rcondu32,0;
4465 There seems to be a bug in the ptx JIT compiler (observed at driver
4466 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4467 unless %rcond is initialized to something before 'bra Lskip'. The
4468 bug is not observed with ptxas from cuda 8.0.61.
4470 It is true that the code is non-trivial: at Lskip, %rcond is
4471 uninitialized in threads 1-31, and after the selp the same holds
4472 for %rcondu32. But shfl propagates the defined value in thread 0
4473 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4474 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4476 There is nothing in the PTX spec to suggest that this is wrong, or
4477 to explain why the extra initialization is needed. So, we classify
4478 it as a JIT bug, and the extra initialization as workaround:
4483 setp.ne.u32 %rnotvzero,%x,0;
4486 +.reg .pred %rcond2;
4487 +setp.eq.u32 %rcond2, 1, 0;
4489 @%rnotvzero bra Lskip;
4490 setp.<op>.<type> %rcond,op1,op2;
4491 +mov.pred %rcond2, %rcond;
4493 +mov.pred %rcond, %rcond2;
4494 selp.u32 %rcondu32,1,0,%rcond;
4495 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4496 setp.ne.u32 %rcond,%rcondu32,0;
4498 rtx_insn
*label
= PREV_INSN (tail
);
4499 gcc_assert (label
&& LABEL_P (label
));
4500 rtx tmp
= gen_reg_rtx (BImode
);
4501 emit_insn_before (gen_movbi (tmp
, const0_rtx
),
4502 bb_first_real_insn (from
));
4503 emit_insn_before (gen_rtx_SET (tmp
, pvar
), label
);
4504 emit_insn_before (gen_rtx_SET (pvar
, tmp
), tail
);
4506 emit_insn_before (nvptx_gen_warp_bcast (pvar
), tail
);
4510 /* Includes worker mode, do spill & fill. By construction
4511 we should never have worker mode only. */
4512 broadcast_data_t data
;
4513 unsigned size
= GET_MODE_SIZE (SImode
);
4514 bool vector
= (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
) != 0;
4515 bool worker
= (GOMP_DIM_MASK (GOMP_DIM_WORKER
) == mask
) != 0;
4516 rtx barrier
= GEN_INT (0);
4519 data
.base
= oacc_bcast_sym
;
4522 bool use_partitioning_p
= (vector
&& !worker
4523 && nvptx_mach_max_workers () > 1
4524 && cfun
->machine
->bcast_partition
);
4525 if (use_partitioning_p
)
4527 data
.base
= cfun
->machine
->bcast_partition
;
4528 barrier
= cfun
->machine
->sync_bar
;
4529 threads
= nvptx_mach_vector_length ();
4531 gcc_assert (data
.base
!= NULL
);
4532 gcc_assert (barrier
);
4534 unsigned int psize
= ROUND_UP (size
, oacc_bcast_align
);
4535 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
4536 ? nvptx_mach_max_workers () + 1
4539 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
4540 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
4543 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_read
, 0, &data
,
4547 /* Barrier so other workers can see the write. */
4548 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4550 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_write
, 0, &data
,
4553 /* This barrier is needed to avoid worker zero clobbering
4554 the broadcast buffer before all the other workers have
4555 had a chance to read this instance of it. */
4556 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4559 extract_insn (tail
);
4560 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
4562 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
4565 bool seen_label
= verify_neutering_jumps (from
, vector_jump
, worker_jump
,
4566 vector_label
, worker_label
);
4568 verify_neutering_labels (to
, vector_label
, worker_label
);
4571 /* PAR is a parallel that is being skipped in its entirety according to
4572 MASK. Treat this as skipping a superblock starting at forked
4573 and ending at joining. */
4576 nvptx_skip_par (unsigned mask
, parallel
*par
)
4578 basic_block tail
= par
->join_block
;
4579 gcc_assert (tail
->preds
->length () == 1);
4581 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
4582 gcc_assert (pre_tail
->succs
->length () == 1);
4584 nvptx_single (mask
, par
->forked_block
, pre_tail
);
4587 /* If PAR has a single inner parallel and PAR itself only contains
4588 empty entry and exit blocks, swallow the inner PAR. */
4591 nvptx_optimize_inner (parallel
*par
)
4593 parallel
*inner
= par
->inner
;
4595 /* We mustn't be the outer dummy par. */
4599 /* We must have a single inner par. */
4600 if (!inner
|| inner
->next
)
4603 /* We must only contain 2 blocks ourselves -- the head and tail of
4605 if (par
->blocks
.length () != 2)
4608 /* We must be disjoint partitioning. As we only have vector and
4609 worker partitioning, this is sufficient to guarantee the pars
4610 have adjacent partitioning. */
4611 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
4612 /* This indicates malformed code generation. */
4615 /* The outer forked insn should be immediately followed by the inner
4617 rtx_insn
*forked
= par
->forked_insn
;
4618 rtx_insn
*fork
= BB_END (par
->forked_block
);
4620 if (NEXT_INSN (forked
) != fork
)
4622 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
4624 /* The outer joining insn must immediately follow the inner join
4626 rtx_insn
*joining
= par
->joining_insn
;
4627 rtx_insn
*join
= inner
->join_insn
;
4628 if (NEXT_INSN (join
) != joining
)
4631 /* Preconditions met. Swallow the inner par. */
4633 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4634 inner
->mask
, inner
->forked_block
->index
,
4635 inner
->join_block
->index
,
4636 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
4638 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
4640 par
->blocks
.reserve (inner
->blocks
.length ());
4641 while (inner
->blocks
.length ())
4642 par
->blocks
.quick_push (inner
->blocks
.pop ());
4644 par
->inner
= inner
->inner
;
4645 inner
->inner
= NULL
;
4650 /* Process the parallel PAR and all its contained
4651 parallels. We do everything but the neutering. Return mask of
4652 partitioned modes used within this parallel. */
4655 nvptx_process_pars (parallel
*par
)
4658 nvptx_optimize_inner (par
);
4660 unsigned inner_mask
= par
->mask
;
4662 /* Do the inner parallels first. */
4665 par
->inner_mask
= nvptx_process_pars (par
->inner
);
4666 inner_mask
|= par
->inner_mask
;
4669 bool is_call
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
)) != 0;
4670 bool worker
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
));
4671 bool large_vector
= ((par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4672 && nvptx_mach_vector_length () > PTX_WARP_SIZE
);
4674 if (worker
|| large_vector
)
4676 nvptx_shared_propagate (false, is_call
, par
->forked_block
,
4677 par
->forked_insn
, !worker
);
4679 = nvptx_shared_propagate (true, is_call
, par
->forked_block
,
4680 par
->fork_insn
, !worker
);
4682 = !is_call
&& (NEXT_INSN (par
->forked_insn
)
4683 && NEXT_INSN (par
->forked_insn
) == par
->joining_insn
);
4684 rtx barrier
= GEN_INT (0);
4687 if (!worker
&& cfun
->machine
->sync_bar
)
4689 barrier
= cfun
->machine
->sync_bar
;
4690 threads
= nvptx_mach_vector_length ();
4693 if (no_prop_p
&& empty_loop_p
)
4695 else if (no_prop_p
&& is_call
)
4699 /* Insert begin and end synchronizations. */
4700 emit_insn_before (nvptx_cta_sync (barrier
, threads
),
4702 emit_insn_before (nvptx_cta_sync (barrier
, threads
), par
->join_insn
);
4705 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4706 nvptx_warp_propagate (is_call
, par
->forked_block
, par
->forked_insn
);
4708 /* Now do siblings. */
4710 inner_mask
|= nvptx_process_pars (par
->next
);
4714 /* Neuter the parallel described by PAR. We recurse in depth-first
4715 order. MODES are the partitioning of the execution and OUTER is
4716 the partitioning of the parallels we are contained in. */
4719 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
4721 unsigned me
= (par
->mask
4722 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
4723 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4724 unsigned skip_mask
= 0, neuter_mask
= 0;
4727 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
4729 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4731 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
4732 {} /* Mode is partitioned: no neutering. */
4733 else if (!(modes
& GOMP_DIM_MASK (mode
)))
4734 {} /* Mode is not used: nothing to do. */
4735 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
4736 || !par
->forked_insn
)
4737 /* Partitioned in inner parallels, or we're not partitioned
4738 at all: neuter individual blocks. */
4739 neuter_mask
|= GOMP_DIM_MASK (mode
);
4740 else if (!par
->parent
|| !par
->parent
->forked_insn
4741 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
4742 /* Parent isn't a parallel or contains this paralleling: skip
4743 parallel at this level. */
4744 skip_mask
|= GOMP_DIM_MASK (mode
);
4746 {} /* Parent will skip this parallel itself. */
4755 /* Neuter whole SESE regions. */
4756 bb_pair_vec_t regions
;
4758 nvptx_find_sese (par
->blocks
, regions
);
4759 len
= regions
.length ();
4760 for (ix
= 0; ix
!= len
; ix
++)
4762 basic_block from
= regions
[ix
].first
;
4763 basic_block to
= regions
[ix
].second
;
4766 nvptx_single (neuter_mask
, from
, to
);
4773 /* Neuter each BB individually. */
4774 len
= par
->blocks
.length ();
4775 for (ix
= 0; ix
!= len
; ix
++)
4777 basic_block block
= par
->blocks
[ix
];
4779 nvptx_single (neuter_mask
, block
, block
);
4785 nvptx_skip_par (skip_mask
, par
);
4788 nvptx_neuter_pars (par
->next
, modes
, outer
);
4792 populate_offload_attrs (offload_attrs
*oa
)
4794 tree attr
= oacc_get_fn_attrib (current_function_decl
);
4795 tree dims
= TREE_VALUE (attr
);
4800 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
4802 tree t
= TREE_VALUE (dims
);
4803 int size
= (t
== NULL_TREE
) ? -1 : TREE_INT_CST_LOW (t
);
4804 tree allowed
= TREE_PURPOSE (dims
);
4806 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
4807 oa
->mask
|= GOMP_DIM_MASK (ix
);
4812 oa
->num_gangs
= size
;
4815 case GOMP_DIM_WORKER
:
4816 oa
->num_workers
= size
;
4819 case GOMP_DIM_VECTOR
:
4820 oa
->vector_length
= size
;
4826 #if WORKAROUND_PTXJIT_BUG_2
4827 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4828 is needed in the nvptx target because the branches generated for
4829 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4832 nvptx_pc_set (const rtx_insn
*insn
, bool strict
= true)
4835 if ((strict
&& !JUMP_P (insn
))
4836 || (!strict
&& !INSN_P (insn
)))
4838 pat
= PATTERN (insn
);
4840 /* The set is allowed to appear either as the insn pattern or
4841 the first set in a PARALLEL. */
4842 if (GET_CODE (pat
) == PARALLEL
)
4843 pat
= XVECEXP (pat
, 0, 0);
4844 if (GET_CODE (pat
) == SET
&& GET_CODE (SET_DEST (pat
)) == PC
)
4850 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4853 nvptx_condjump_label (const rtx_insn
*insn
, bool strict
= true)
4855 rtx x
= nvptx_pc_set (insn
, strict
);
4860 if (GET_CODE (x
) == LABEL_REF
)
4862 if (GET_CODE (x
) != IF_THEN_ELSE
)
4864 if (XEXP (x
, 2) == pc_rtx
&& GET_CODE (XEXP (x
, 1)) == LABEL_REF
)
4866 if (XEXP (x
, 1) == pc_rtx
&& GET_CODE (XEXP (x
, 2)) == LABEL_REF
)
4871 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4872 insn in between the branch and the label. This works around a JIT bug
4873 observed at driver version 384.111, at -O0 for sm_50. */
4876 prevent_branch_around_nothing (void)
4878 rtx_insn
*seen_label
= NULL
;
4879 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4881 if (INSN_P (insn
) && condjump_p (insn
))
4883 seen_label
= label_ref_label (nvptx_condjump_label (insn
, false));
4887 if (seen_label
== NULL
)
4890 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4894 switch (recog_memoized (insn
))
4896 case CODE_FOR_nvptx_fork
:
4897 case CODE_FOR_nvptx_forked
:
4898 case CODE_FOR_nvptx_joining
:
4899 case CODE_FOR_nvptx_join
:
4906 if (LABEL_P (insn
) && insn
== seen_label
)
4907 emit_insn_before (gen_fake_nop (), insn
);
4914 #ifdef WORKAROUND_PTXJIT_BUG_3
4915 /* Insert two membar.cta insns in between two subsequent bar.sync insns. This
4916 works around a hang observed at driver version 390.48 for sm_50. */
4919 workaround_barsyncs (void)
4921 bool seen_barsync
= false;
4922 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4924 if (INSN_P (insn
) && recog_memoized (insn
) == CODE_FOR_nvptx_barsync
)
4928 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4929 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4932 seen_barsync
= true;
4939 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4941 else if (INSN_P (insn
))
4942 switch (recog_memoized (insn
))
4944 case CODE_FOR_nvptx_fork
:
4945 case CODE_FOR_nvptx_forked
:
4946 case CODE_FOR_nvptx_joining
:
4947 case CODE_FOR_nvptx_join
:
4953 seen_barsync
= false;
4958 /* PTX-specific reorganization
4959 - Split blocks at fork and join instructions
4960 - Compute live registers
4961 - Mark now-unused registers, so function begin doesn't declare them
4963 - Insert state propagation when entering partitioned mode
4964 - Insert neutering instructions when in single mode
4965 - Replace subregs with suitable sequences. */
4971 /* We are freeing block_for_insn in the toplev to keep compatibility
4972 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4973 compute_bb_for_insn ();
4975 thread_prologue_and_epilogue_insns ();
4977 /* Split blocks and record interesting unspecs. */
4978 bb_insn_map_t bb_insn_map
;
4980 nvptx_split_blocks (&bb_insn_map
);
4982 /* Compute live regs */
4983 df_clear_flags (DF_LR_RUN_DCE
);
4984 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
4985 df_live_add_problem ();
4986 df_live_set_all_dirty ();
4988 regstat_init_n_sets_and_refs ();
4991 df_dump (dump_file
);
4993 /* Mark unused regs as unused. */
4994 int max_regs
= max_reg_num ();
4995 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
4996 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
4997 regno_reg_rtx
[i
] = const0_rtx
;
4999 /* Determine launch dimensions of the function. If it is not an
5000 offloaded function (i.e. this is a regular compiler), the
5001 function has no neutering. */
5002 tree attr
= oacc_get_fn_attrib (current_function_decl
);
5005 /* If we determined this mask before RTL expansion, we could
5006 elide emission of some levels of forks and joins. */
5009 populate_offload_attrs (&oa
);
5011 /* If there is worker neutering, there must be vector
5012 neutering. Otherwise the hardware will fail. */
5013 gcc_assert (!(oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
5014 || (oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
5016 /* Discover & process partitioned regions. */
5017 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
5018 nvptx_process_pars (pars
);
5019 nvptx_neuter_pars (pars
, oa
.mask
, 0);
5023 /* Replace subregs. */
5024 nvptx_reorg_subreg ();
5026 if (TARGET_UNIFORM_SIMT
)
5027 nvptx_reorg_uniform_simt ();
5029 #if WORKAROUND_PTXJIT_BUG_2
5030 prevent_branch_around_nothing ();
5033 #ifdef WORKAROUND_PTXJIT_BUG_3
5034 workaround_barsyncs ();
5037 regstat_free_n_sets_and_refs ();
5039 df_finish_pass (true);
5042 /* Handle a "kernel" attribute; arguments as in
5043 struct attribute_spec.handler. */
5046 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5047 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5051 if (TREE_CODE (decl
) != FUNCTION_DECL
)
5053 error ("%qE attribute only applies to functions", name
);
5054 *no_add_attrs
= true;
5056 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
5058 error ("%qE attribute requires a void return type", name
);
5059 *no_add_attrs
= true;
5065 /* Handle a "shared" attribute; arguments as in
5066 struct attribute_spec.handler. */
5069 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5070 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5074 if (TREE_CODE (decl
) != VAR_DECL
)
5076 error ("%qE attribute only applies to variables", name
);
5077 *no_add_attrs
= true;
5079 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
5081 error ("%qE attribute not allowed with auto storage class", name
);
5082 *no_add_attrs
= true;
5088 /* Table of valid machine attributes. */
5089 static const struct attribute_spec nvptx_attribute_table
[] =
5091 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5092 affects_type_identity, handler, exclude } */
5093 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
5095 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
5097 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
5100 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5102 static HOST_WIDE_INT
5103 nvptx_vector_alignment (const_tree type
)
5105 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
5107 return MIN (align
, BIGGEST_ALIGNMENT
);
5110 /* Indicate that INSN cannot be duplicated. */
5113 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
5115 switch (recog_memoized (insn
))
5117 case CODE_FOR_nvptx_shufflesi
:
5118 case CODE_FOR_nvptx_shufflesf
:
5119 case CODE_FOR_nvptx_barsync
:
5120 case CODE_FOR_nvptx_fork
:
5121 case CODE_FOR_nvptx_forked
:
5122 case CODE_FOR_nvptx_joining
:
5123 case CODE_FOR_nvptx_join
:
5130 /* Section anchors do not work. Initialization for flag_section_anchor
5131 probes the existence of the anchoring target hooks and prevents
5132 anchoring if they don't exist. However, we may be being used with
5133 a host-side compiler that does support anchoring, and hence see
5134 the anchor flag set (as it's not recalculated). So provide an
5135 implementation denying anchoring. */
5138 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
5143 /* Record a symbol for mkoffload to enter into the mapping table. */
5146 nvptx_record_offload_symbol (tree decl
)
5148 switch (TREE_CODE (decl
))
5151 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
5152 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5157 tree attr
= oacc_get_fn_attrib (decl
);
5158 /* OpenMP offloading does not set this attribute. */
5159 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
5161 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
5162 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5164 for (; dims
; dims
= TREE_CHAIN (dims
))
5166 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
5168 gcc_assert (!TREE_PURPOSE (dims
));
5169 fprintf (asm_out_file
, ", %#x", size
);
5172 fprintf (asm_out_file
, "\n");
5181 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5182 at the start of a file. */
5185 nvptx_file_start (void)
5187 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
5188 fputs ("\t.version\t3.1\n", asm_out_file
);
5190 fputs ("\t.target\tsm_35\n", asm_out_file
);
5192 fputs ("\t.target\tsm_30\n", asm_out_file
);
5193 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
5194 fputs ("// END PREAMBLE\n", asm_out_file
);
5197 /* Emit a declaration for a worker and vector-level buffer in .shared
5201 write_shared_buffer (FILE *file
, rtx sym
, unsigned align
, unsigned size
)
5203 const char *name
= XSTR (sym
, 0);
5205 write_var_marker (file
, true, false, name
);
5206 fprintf (file
, ".shared .align %d .u8 %s[%d];\n",
5210 /* Write out the function declarations we've collected and declare storage
5211 for the broadcast buffer. */
5214 nvptx_file_end (void)
5216 hash_table
<tree_hasher
>::iterator iter
;
5218 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
5219 nvptx_record_fndecl (decl
);
5220 fputs (func_decls
.str().c_str(), asm_out_file
);
5222 if (oacc_bcast_size
)
5223 write_shared_buffer (asm_out_file
, oacc_bcast_sym
,
5224 oacc_bcast_align
, oacc_bcast_size
);
5226 if (worker_red_size
)
5227 write_shared_buffer (asm_out_file
, worker_red_sym
,
5228 worker_red_align
, worker_red_size
);
5230 if (vector_red_size
)
5231 write_shared_buffer (asm_out_file
, vector_red_sym
,
5232 vector_red_align
, vector_red_size
);
5234 if (need_softstack_decl
)
5236 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
5237 /* 32 is the maximum number of warps in a block. Even though it's an
5238 external declaration, emit the array size explicitly; otherwise, it
5239 may fail at PTX JIT time if the definition is later in link order. */
5240 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
5243 if (need_unisimt_decl
)
5245 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
5246 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
5250 /* Expander for the shuffle builtins. */
5253 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
5258 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5259 NULL_RTX
, mode
, EXPAND_NORMAL
);
5261 src
= copy_to_mode_reg (mode
, src
);
5263 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5264 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5265 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5266 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5268 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
5269 idx
= copy_to_mode_reg (SImode
, idx
);
5271 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
5272 (nvptx_shuffle_kind
) INTVAL (op
));
5280 nvptx_output_red_partition (rtx dst
, rtx offset
)
5282 const char *zero_offset
= "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5283 const char *with_offset
= "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5285 if (offset
== const0_rtx
)
5286 fprintf (asm_out_file
, zero_offset
, REGNO (dst
),
5287 REGNO (cfun
->machine
->red_partition
));
5289 fprintf (asm_out_file
, with_offset
, REGNO (dst
),
5290 REGNO (cfun
->machine
->red_partition
), UINTVAL (offset
));
5295 /* Shared-memory reduction address expander. */
5298 nvptx_expand_shared_addr (tree exp
, rtx target
,
5299 machine_mode
ARG_UNUSED (mode
), int ignore
,
5305 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
5306 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
5307 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
5308 rtx addr
= worker_red_sym
;
5314 populate_offload_attrs (&oa
);
5316 unsigned int psize
= ROUND_UP (size
+ offset
, align
);
5317 unsigned int pnum
= nvptx_mach_max_workers ();
5318 vector_red_partition
= MAX (vector_red_partition
, psize
);
5319 vector_red_size
= MAX (vector_red_size
, psize
* pnum
);
5320 vector_red_align
= MAX (vector_red_align
, align
);
5322 if (cfun
->machine
->red_partition
== NULL
)
5323 cfun
->machine
->red_partition
= gen_reg_rtx (Pmode
);
5325 addr
= gen_reg_rtx (Pmode
);
5326 emit_insn (gen_nvptx_red_partition (addr
, GEN_INT (offset
)));
5330 worker_red_align
= MAX (worker_red_align
, align
);
5331 worker_red_size
= MAX (worker_red_size
, size
+ offset
);
5335 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
5336 addr
= gen_rtx_CONST (Pmode
, addr
);
5340 emit_move_insn (target
, addr
);
5344 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5345 not require taking the address of any object, other than the memory
5346 cell being operated on. */
5349 nvptx_expand_cmp_swap (tree exp
, rtx target
,
5350 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
5352 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
5355 target
= gen_reg_rtx (mode
);
5357 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5358 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
5359 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5360 NULL_RTX
, mode
, EXPAND_NORMAL
);
5361 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5362 NULL_RTX
, mode
, EXPAND_NORMAL
);
5365 mem
= gen_rtx_MEM (mode
, mem
);
5367 cmp
= copy_to_mode_reg (mode
, cmp
);
5369 src
= copy_to_mode_reg (mode
, src
);
5372 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5374 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5382 /* Codes for all the NVPTX builtins. */
5385 NVPTX_BUILTIN_SHUFFLE
,
5386 NVPTX_BUILTIN_SHUFFLELL
,
5387 NVPTX_BUILTIN_WORKER_ADDR
,
5388 NVPTX_BUILTIN_VECTOR_ADDR
,
5389 NVPTX_BUILTIN_CMP_SWAP
,
5390 NVPTX_BUILTIN_CMP_SWAPLL
,
5394 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
5396 /* Return the NVPTX builtin for CODE. */
5399 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
5401 if (code
>= NVPTX_BUILTIN_MAX
)
5402 return error_mark_node
;
5404 return nvptx_builtin_decls
[code
];
5407 /* Set up all builtin functions for this target. */
5410 nvptx_init_builtins (void)
5412 #define DEF(ID, NAME, T) \
5413 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5414 = add_builtin_function ("__builtin_nvptx_" NAME, \
5415 build_function_type_list T, \
5416 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5418 #define UINT unsigned_type_node
5419 #define LLUINT long_long_unsigned_type_node
5420 #define PTRVOID ptr_type_node
5422 DEF (SHUFFLE
, "shuffle", (UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
5423 DEF (SHUFFLELL
, "shufflell", (LLUINT
, LLUINT
, UINT
, UINT
, NULL_TREE
));
5424 DEF (WORKER_ADDR
, "worker_addr",
5425 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
5426 DEF (VECTOR_ADDR
, "vector_addr",
5427 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
5428 DEF (CMP_SWAP
, "cmp_swap", (UINT
, PTRVOID
, UINT
, UINT
, NULL_TREE
));
5429 DEF (CMP_SWAPLL
, "cmp_swapll", (LLUINT
, PTRVOID
, LLUINT
, LLUINT
, NULL_TREE
));
5438 /* Expand an expression EXP that calls a built-in function,
5439 with result going to TARGET if that's convenient
5440 (and in mode MODE if that's convenient).
5441 SUBTARGET may be used as the target for computing one of EXP's operands.
5442 IGNORE is nonzero if the value is to be ignored. */
5445 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
5446 machine_mode mode
, int ignore
)
5448 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
5449 switch (DECL_MD_FUNCTION_CODE (fndecl
))
5451 case NVPTX_BUILTIN_SHUFFLE
:
5452 case NVPTX_BUILTIN_SHUFFLELL
:
5453 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
5455 case NVPTX_BUILTIN_WORKER_ADDR
:
5456 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, false);
5458 case NVPTX_BUILTIN_VECTOR_ADDR
:
5459 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, true);
5461 case NVPTX_BUILTIN_CMP_SWAP
:
5462 case NVPTX_BUILTIN_CMP_SWAPLL
:
5463 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
5465 default: gcc_unreachable ();
5469 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5474 return PTX_WARP_SIZE
;
5478 nvptx_welformed_vector_length_p (int l
)
5481 return l
% PTX_WARP_SIZE
== 0;
5485 nvptx_apply_dim_limits (int dims
[])
5487 /* Check that the vector_length is not too large. */
5488 if (dims
[GOMP_DIM_VECTOR
] > PTX_MAX_VECTOR_LENGTH
)
5489 dims
[GOMP_DIM_VECTOR
] = PTX_MAX_VECTOR_LENGTH
;
5491 /* Check that the number of workers is not too large. */
5492 if (dims
[GOMP_DIM_WORKER
] > PTX_WORKER_LENGTH
)
5493 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
5495 /* Ensure that num_worker * vector_length <= cta size. */
5496 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
5497 && dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] > PTX_CTA_SIZE
)
5498 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5500 /* If we need a per-worker barrier ... . */
5501 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
5502 && dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
5503 /* Don't use more barriers than available. */
5504 dims
[GOMP_DIM_WORKER
] = MIN (dims
[GOMP_DIM_WORKER
],
5505 PTX_NUM_PER_WORKER_BARRIERS
);
5508 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5511 has_vector_partitionable_routine_calls_p (tree fndecl
)
5517 FOR_EACH_BB_FN (bb
, DECL_STRUCT_FUNCTION (fndecl
))
5518 for (gimple_stmt_iterator i
= gsi_start_bb (bb
); !gsi_end_p (i
);
5519 gsi_next_nondebug (&i
))
5521 gimple
*stmt
= gsi_stmt (i
);
5522 if (gimple_code (stmt
) != GIMPLE_CALL
)
5525 tree callee
= gimple_call_fndecl (stmt
);
5529 tree attrs
= oacc_get_fn_attrib (callee
);
5530 if (attrs
== NULL_TREE
)
5533 int partition_level
= oacc_fn_attrib_level (attrs
);
5534 bool seq_routine_p
= partition_level
== GOMP_DIM_MAX
;
5542 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5543 DIMS has changed. */
5546 nvptx_goacc_validate_dims_1 (tree decl
, int dims
[], int fn_level
, unsigned used
)
5548 bool oacc_default_dims_p
= false;
5549 bool oacc_min_dims_p
= false;
5550 bool offload_region_p
= false;
5551 bool routine_p
= false;
5552 bool routine_seq_p
= false;
5553 int default_vector_length
= -1;
5555 if (decl
== NULL_TREE
)
5558 oacc_default_dims_p
= true;
5559 else if (fn_level
== -2)
5560 oacc_min_dims_p
= true;
5564 else if (fn_level
== -1)
5565 offload_region_p
= true;
5566 else if (0 <= fn_level
&& fn_level
<= GOMP_DIM_MAX
)
5569 routine_seq_p
= fn_level
== GOMP_DIM_MAX
;
5574 if (oacc_min_dims_p
)
5576 gcc_assert (dims
[GOMP_DIM_VECTOR
] == 1);
5577 gcc_assert (dims
[GOMP_DIM_WORKER
] == 1);
5578 gcc_assert (dims
[GOMP_DIM_GANG
] == 1);
5580 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5587 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5592 if (oacc_default_dims_p
)
/* -1 : not set
5595 0 : set at runtime, e.g. -fopenacc-dims=-
5596 >= 1: set at compile time, e.g. -fopenacc-dims=1. */
5597 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5598 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5599 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5601 /* But -fopenacc-dims=- is not yet supported on trunk. */
5602 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5603 gcc_assert (dims
[GOMP_DIM_WORKER
] != 0);
5604 gcc_assert (dims
[GOMP_DIM_GANG
] != 0);
5607 if (offload_region_p
)
/* -1 : not set
5610 0 : set using variable, e.g. num_gangs (n)
5611 >= 1: set using constant, e.g. num_gangs (1). */
5612 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5613 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5614 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5617 if (offload_region_p
)
5618 default_vector_length
= oacc_get_default_dim (GOMP_DIM_VECTOR
);
5620 /* oacc_default_dims_p. */
5621 default_vector_length
= PTX_DEFAULT_VECTOR_LENGTH
;
5623 int old_dims
[GOMP_DIM_MAX
];
5625 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5626 old_dims
[i
] = dims
[i
];
5628 const char *vector_reason
= NULL
;
5629 if (offload_region_p
&& has_vector_partitionable_routine_calls_p (decl
))
5631 default_vector_length
= PTX_WARP_SIZE
;
5633 if (dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
5635 vector_reason
= G_("using vector_length (%d) due to call to"
5636 " vector-partitionable routine, ignoring %d");
5637 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5641 if (dims
[GOMP_DIM_VECTOR
] == 0)
5643 vector_reason
= G_("using vector_length (%d), ignoring runtime setting");
5644 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5647 if (dims
[GOMP_DIM_VECTOR
] > 0
5648 && !nvptx_welformed_vector_length_p (dims
[GOMP_DIM_VECTOR
]))
5649 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5651 nvptx_apply_dim_limits (dims
);
5653 if (dims
[GOMP_DIM_VECTOR
] != old_dims
[GOMP_DIM_VECTOR
])
5654 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5655 vector_reason
!= NULL
5657 : G_("using vector_length (%d), ignoring %d"),
5658 dims
[GOMP_DIM_VECTOR
], old_dims
[GOMP_DIM_VECTOR
]);
5660 if (dims
[GOMP_DIM_WORKER
] != old_dims
[GOMP_DIM_WORKER
])
5661 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5662 G_("using num_workers (%d), ignoring %d"),
5663 dims
[GOMP_DIM_WORKER
], old_dims
[GOMP_DIM_WORKER
]);
5665 if (oacc_default_dims_p
)
5667 if (dims
[GOMP_DIM_VECTOR
] < 0)
5668 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5669 if (dims
[GOMP_DIM_WORKER
] < 0)
5670 dims
[GOMP_DIM_WORKER
] = PTX_DEFAULT_RUNTIME_DIM
;
5671 if (dims
[GOMP_DIM_GANG
] < 0)
5672 dims
[GOMP_DIM_GANG
] = PTX_DEFAULT_RUNTIME_DIM
;
5673 nvptx_apply_dim_limits (dims
);
5676 if (offload_region_p
)
5678 for (i
= 0; i
< GOMP_DIM_MAX
; i
++)
5683 if ((used
& GOMP_DIM_MASK (i
)) == 0)
5684 /* Function oacc_validate_dims will apply the minimal dimension. */
5687 dims
[i
] = (i
== GOMP_DIM_VECTOR
5688 ? default_vector_length
5689 : oacc_get_default_dim (i
));
5692 nvptx_apply_dim_limits (dims
);
5696 /* Validate compute dimensions of an OpenACC offload or routine, fill
5697 in non-unity defaults. FN_LEVEL indicates the level at which a
5698 routine might spawn a loop. It is negative for non-routines. If
5699 DECL is null, we are validating the default dimensions. */
5702 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
, unsigned used
)
5704 int old_dims
[GOMP_DIM_MAX
];
5707 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5708 old_dims
[i
] = dims
[i
];
5710 nvptx_goacc_validate_dims_1 (decl
, dims
, fn_level
, used
);
5712 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5713 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0)
5714 gcc_assert (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] <= PTX_CTA_SIZE
);
5716 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5717 if (old_dims
[i
] != dims
[i
])
5723 /* Return maximum dimension size, or zero for unbounded. */
5726 nvptx_dim_limit (int axis
)
5730 case GOMP_DIM_VECTOR
:
5731 return PTX_MAX_VECTOR_LENGTH
;
5739 /* Determine whether fork & joins are needed. */
5742 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
5743 bool ARG_UNUSED (is_fork
))
5745 tree arg
= gimple_call_arg (call
, 2);
5746 unsigned axis
= TREE_INT_CST_LOW (arg
);
5748 /* We only care about worker and vector partitioning. */
5749 if (axis
< GOMP_DIM_WORKER
)
5752 /* If the size is 1, there's no partitioning. */
5753 if (dims
[axis
] == 1)
5759 /* Generate a PTX builtin function call that returns the address in
5760 the worker reduction buffer at OFFSET. TYPE is the type of the
5761 data at that location. */
5764 nvptx_get_shared_red_addr (tree type
, tree offset
, bool vector
)
5766 enum nvptx_builtins addr_dim
= NVPTX_BUILTIN_WORKER_ADDR
;
5768 addr_dim
= NVPTX_BUILTIN_VECTOR_ADDR
;
5769 machine_mode mode
= TYPE_MODE (type
);
5770 tree fndecl
= nvptx_builtin_decl (addr_dim
, true);
5771 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
5772 tree align
= build_int_cst (unsigned_type_node
,
5773 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
5774 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
5776 return fold_convert (build_pointer_type (type
), call
);
5779 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5780 will cast the variable if necessary. */
5783 nvptx_generate_vector_shuffle (location_t loc
,
5784 tree dest_var
, tree var
, unsigned shift
,
5787 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
5788 tree_code code
= NOP_EXPR
;
5789 tree arg_type
= unsigned_type_node
;
5790 tree var_type
= TREE_TYPE (var
);
5791 tree dest_type
= var_type
;
5793 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
5794 var_type
= TREE_TYPE (var_type
);
5796 if (TREE_CODE (var_type
) == REAL_TYPE
)
5797 code
= VIEW_CONVERT_EXPR
;
5799 if (TYPE_SIZE (var_type
)
5800 == TYPE_SIZE (long_long_unsigned_type_node
))
5802 fn
= NVPTX_BUILTIN_SHUFFLELL
;
5803 arg_type
= long_long_unsigned_type_node
;
5806 tree call
= nvptx_builtin_decl (fn
, true);
5807 tree bits
= build_int_cst (unsigned_type_node
, shift
);
5808 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
5811 if (var_type
!= dest_type
)
5813 /* Do real and imaginary parts separately. */
5814 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
5815 real
= fold_build1 (code
, arg_type
, real
);
5816 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
5817 real
= fold_build1 (code
, var_type
, real
);
5819 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
5820 imag
= fold_build1 (code
, arg_type
, imag
);
5821 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
5822 imag
= fold_build1 (code
, var_type
, imag
);
5824 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
5828 expr
= fold_build1 (code
, arg_type
, var
);
5829 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
5830 expr
= fold_build1 (code
, dest_type
, expr
);
5833 gimplify_assign (dest_var
, expr
, seq
);
5836 /* Lazily generate the global lock var decl and return its address. */
5839 nvptx_global_lock_addr ()
5841 tree v
= global_lock_var
;
5845 tree name
= get_identifier ("__reduction_lock");
5846 tree type
= build_qualified_type (unsigned_type_node
,
5847 TYPE_QUAL_VOLATILE
);
5848 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
5849 global_lock_var
= v
;
5850 DECL_ARTIFICIAL (v
) = 1;
5851 DECL_EXTERNAL (v
) = 1;
5852 TREE_STATIC (v
) = 1;
5853 TREE_PUBLIC (v
) = 1;
5855 mark_addressable (v
);
5856 mark_decl_referenced (v
);
5859 return build_fold_addr_expr (v
);
5862 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5863 GSI. We use a lockless scheme for nearly all cases, which looks like:
5865 actual = initval(OP);
do {
guess = actual;
5868 write = guess OP myval;
5869 actual = cmp&swap (ptr, guess, write)
5870 } while (actual bit-different-to guess);
return write;
5873 This relies on a cmp&swap instruction, which is available for 32-
5874 and 64-bit types. Larger types must use a locking scheme. */
5877 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5878 tree ptr
, tree var
, tree_code op
)
5880 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
5881 tree_code code
= NOP_EXPR
;
5882 tree arg_type
= unsigned_type_node
;
5883 tree var_type
= TREE_TYPE (var
);
5885 if (TREE_CODE (var_type
) == COMPLEX_TYPE
5886 || TREE_CODE (var_type
) == REAL_TYPE
)
5887 code
= VIEW_CONVERT_EXPR
;
5889 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
5891 arg_type
= long_long_unsigned_type_node
;
5892 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
5895 tree swap_fn
= nvptx_builtin_decl (fn
, true);
5897 gimple_seq init_seq
= NULL
;
5898 tree init_var
= make_ssa_name (arg_type
);
5899 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
5900 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
5901 gimplify_assign (init_var
, init_expr
, &init_seq
);
5902 gimple
*init_end
= gimple_seq_last (init_seq
);
5904 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
5906 /* Split the block just after the init stmts. */
5907 basic_block pre_bb
= gsi_bb (*gsi
);
5908 edge pre_edge
= split_block (pre_bb
, init_end
);
5909 basic_block loop_bb
= pre_edge
->dest
;
5910 pre_bb
= pre_edge
->src
;
5911 /* Reset the iterator. */
5912 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5914 tree expect_var
= make_ssa_name (arg_type
);
5915 tree actual_var
= make_ssa_name (arg_type
);
5916 tree write_var
= make_ssa_name (arg_type
);
5918 /* Build and insert the reduction calculation. */
5919 gimple_seq red_seq
= NULL
;
5920 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
5921 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
5922 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
5923 gimplify_assign (write_var
, write_expr
, &red_seq
);
5925 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5927 /* Build & insert the cmp&swap sequence. */
5928 gimple_seq latch_seq
= NULL
;
5929 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
5930 ptr
, expect_var
, write_var
);
5931 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
5933 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
5934 NULL_TREE
, NULL_TREE
);
5935 gimple_seq_add_stmt (&latch_seq
, cond
);
5937 gimple
*latch_end
= gimple_seq_last (latch_seq
);
5938 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
5940 /* Split the block just after the latch stmts. */
5941 edge post_edge
= split_block (loop_bb
, latch_end
);
5942 basic_block post_bb
= post_edge
->dest
;
5943 loop_bb
= post_edge
->src
;
5944 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5946 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5947 post_edge
->probability
= profile_probability::even ();
5948 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
5949 loop_edge
->probability
= profile_probability::even ();
5950 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
5951 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
5953 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
5954 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
5955 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
5957 loop
*loop
= alloc_loop ();
5958 loop
->header
= loop_bb
;
5959 loop
->latch
= loop_bb
;
5960 add_loop (loop
, loop_bb
->loop_father
);
5962 return fold_build1 (code
, var_type
, write_var
);
5965 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5966 GSI. This is necessary for types larger than 64 bits, where there
5967 is no cmp&swap instruction to implement a lockless scheme. We use
5968 a lock variable in global memory.
5970 while (cmp&swap (&lock_var, 0, 1))
continue;
T accum = *ptr;
5973 accum = accum OP var;
*ptr = accum;
5975 cmp&swap (&lock_var, 1, 0);
5978 A lock in global memory is necessary to force execution engine
5979 descheduling and avoid resource starvation that can occur if the
5980 lock is in .shared memory. */
5983 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5984 tree ptr
, tree var
, tree_code op
)
5986 tree var_type
= TREE_TYPE (var
);
5987 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
5988 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
5989 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
5991 /* Split the block just before the gsi. Insert a gimple nop so that split_block has a statement to split after. */
5993 gimple
*nop
= gimple_build_nop ();
5994 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
5995 basic_block entry_bb
= gsi_bb (*gsi
);
5996 edge entry_edge
= split_block (entry_bb
, nop
);
5997 basic_block lock_bb
= entry_edge
->dest
;
5998 /* Reset the iterator. */
5999 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6001 /* Build and insert the locking sequence. */
6002 gimple_seq lock_seq
= NULL
;
6003 tree lock_var
= make_ssa_name (unsigned_type_node
);
6004 tree lock_expr
= nvptx_global_lock_addr ();
6005 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
6006 uns_unlocked
, uns_locked
);
6007 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
6008 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
6009 NULL_TREE
, NULL_TREE
);
6010 gimple_seq_add_stmt (&lock_seq
, cond
);
6011 gimple
*lock_end
= gimple_seq_last (lock_seq
);
6012 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
6014 /* Split the block just after the lock sequence. */
6015 edge locked_edge
= split_block (lock_bb
, lock_end
);
6016 basic_block update_bb
= locked_edge
->dest
;
6017 lock_bb
= locked_edge
->src
;
6018 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6020 /* Create the lock loop ... */
6021 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
6022 locked_edge
->probability
= profile_probability::even ();
6023 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
6024 loop_edge
->probability
= profile_probability::even ();
6025 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
6026 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
6028 /* ... and the loop structure. */
6029 loop
*lock_loop
= alloc_loop ();
6030 lock_loop
->header
= lock_bb
;
6031 lock_loop
->latch
= lock_bb
;
6032 lock_loop
->nb_iterations_estimate
= 1;
6033 lock_loop
->any_estimate
= true;
6034 add_loop (lock_loop
, entry_bb
->loop_father
);
6036 /* Build and insert the reduction calculation. */
6037 gimple_seq red_seq
= NULL
;
6038 tree acc_in
= make_ssa_name (var_type
);
6039 tree ref_in
= build_simple_mem_ref (ptr
);
6040 TREE_THIS_VOLATILE (ref_in
) = 1;
6041 gimplify_assign (acc_in
, ref_in
, &red_seq
);
6043 tree acc_out
= make_ssa_name (var_type
);
6044 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
6045 gimplify_assign (acc_out
, update_expr
, &red_seq
);
6047 tree ref_out
= build_simple_mem_ref (ptr
);
6048 TREE_THIS_VOLATILE (ref_out
) = 1;
6049 gimplify_assign (ref_out
, acc_out
, &red_seq
);
6051 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
6053 /* Build & insert the unlock sequence. */
6054 gimple_seq unlock_seq
= NULL
;
6055 tree unlock_expr
= nvptx_global_lock_addr ();
6056 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
6057 uns_locked
, uns_unlocked
);
6058 gimplify_and_add (unlock_expr
, &unlock_seq
);
6059 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
6064 /* Emit a sequence to update a reduction accumlator at *PTR with the
6065 value held in VAR using operator OP. Return the updated value.
6067 TODO: optimize for atomic ops and indepedent complex ops. */
6070 nvptx_reduction_update (location_t loc
, gimple_stmt_iterator
*gsi
,
6071 tree ptr
, tree var
, tree_code op
)
6073 tree type
= TREE_TYPE (var
);
6074 tree size
= TYPE_SIZE (type
);
6076 if (size
== TYPE_SIZE (unsigned_type_node
)
6077 || size
== TYPE_SIZE (long_long_unsigned_type_node
))
6078 return nvptx_lockless_update (loc
, gsi
, ptr
, var
, op
);
6080 return nvptx_lockfull_update (loc
, gsi
, ptr
, var
, op
);
6083 /* NVPTX implementation of GOACC_REDUCTION_SETUP.

   CALL is the reduction call being expanded and OA the offload
   attributes (OA->vector_length is consulted).  The arguments read from
   CALL are: 1 = reference to the receiver object (REF_TO_RES),
   2 = the incoming value (VAR), 3 = the partitioning level (compared
   against the GOMP_DIM_* constants) and 5 = the offset handed to
   nvptx_get_shared_red_addr.  CALL is finally replaced by the statement
   sequence built here. */
6086 nvptx_goacc_reduction_setup (gcall
*call
, offload_attrs
*oa
)
6088 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6089 tree lhs
= gimple_call_lhs (call
);
6090 tree var
= gimple_call_arg (call
, 2);
6091 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6092 gimple_seq seq
= NULL
;
6094 push_gimplify_context (true);
6096 if (level
!= GOMP_DIM_GANG
)
6098 /* Copy the receiver object. */
6099 tree ref_to_res
= gimple_call_arg (call
, 1);
/* When REF_TO_RES is not the zero constant, VAR becomes a memory
   reference through it. */
6101 if (!integer_zerop (ref_to_res
))
6102 var
= build_simple_mem_ref (ref_to_res
);
/* Worker level, or vector level with a vector length above
   PTX_WARP_SIZE: the value is also written to the reduction buffer. */
6105 if (level
== GOMP_DIM_WORKER
6106 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6108 /* Store incoming value to worker reduction buffer. */
6109 tree offset
= gimple_call_arg (call
, 5);
/* NOTE(review): the local variable named call declared next has the
   same name as the gcall parameter and hides it for the rest of its
   scope (presumably a nested block -- confirm). */
6110 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6111 level
== GOMP_DIM_VECTOR
);
6112 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6114 gimplify_assign (ptr
, call
, &seq
);
6115 tree ref
= build_simple_mem_ref (ptr
);
/* The store into the buffer is marked volatile. */
6116 TREE_THIS_VOLATILE (ref
) = 1;
6117 gimplify_assign (ref
, var
, &seq
);
/* The result of the call is VAR (possibly redirected to the receiver
   object above). */
6121 gimplify_assign (lhs
, var
, &seq
);
6123 pop_gimplify_context (NULL
);
/* Replace the original call statement with the generated sequence. */
6124 gsi_replace_with_seq (&gsi
, seq
, true);
6127 /* NVPTX implementation of GOACC_REDUCTION_INIT.

   CALL is the reduction call being expanded and OA the offload
   attributes (OA->vector_length is consulted).  The arguments read from
   CALL are: 1 = reference to the receiver object, 2 = the incoming
   value (VAR), 3 = the partitioning level and 4 = the reduction
   operator as an enum tree_code (RCODE).  CALL is finally replaced by
   the statement sequence built here. */
6130 nvptx_goacc_reduction_init (gcall
*call
, offload_attrs
*oa
)
6132 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6133 tree lhs
= gimple_call_lhs (call
);
6134 tree var
= gimple_call_arg (call
, 2);
6135 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6136 enum tree_code rcode
6137 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
/* INIT is obtained from omp_reduction_init_op for RCODE. */
6138 tree init
= omp_reduction_init_op (gimple_location (call
), rcode
,
6140 gimple_seq seq
= NULL
;
6142 push_gimplify_context (true);
/* Vector level with a vector length of exactly PTX_WARP_SIZE: build a
   conditional so that INIT is only used where the position is
   non-zero. */
6144 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
6146 /* Initialize vector-non-zeroes to INIT_VAL (OP). */
/* TID receives the result of an IFN_GOACC_DIM_POS internal call; the
   condition built below tests TID != 0. */
6147 tree tid
= make_ssa_name (integer_type_node
);
6148 tree dim_vector
= gimple_call_arg (call
, 3);
6149 gimple
*tid_call
= gimple_build_call_internal (IFN_GOACC_DIM_POS
, 1,
6151 gimple
*cond_stmt
= gimple_build_cond (NE_EXPR
, tid
, integer_zero_node
,
6152 NULL_TREE
, NULL_TREE
);
6154 gimple_call_set_lhs (tid_call
, tid
);
6155 gimple_seq_add_stmt (&seq
, tid_call
);
6156 gimple_seq_add_stmt (&seq
, cond_stmt
);
6158 /* Split the block just after the call. */
6159 edge init_edge
= split_block (gsi_bb (gsi
), call
);
6160 basic_block init_bb
= init_edge
->dest
;
6161 basic_block call_bb
= init_edge
->src
;
6163 /* Fixup flags from call_bb to init_bb. */
/* The XOR toggles both flag bits; assuming split_block left a plain
   fallthru edge (confirm), the edge becomes the true arm of
   COND_STMT. */
6164 init_edge
->flags
^= EDGE_FALLTHRU
| EDGE_TRUE_VALUE
;
6165 init_edge
->probability
= profile_probability::even ();
6167 /* Set the initialization stmts. */
6168 gimple_seq init_seq
= NULL
;
6169 tree init_var
= make_ssa_name (TREE_TYPE (var
));
6170 gimplify_assign (init_var
, init
, &init_seq
);
/* From here GSI points into INIT_BB rather than at CALL. */
6171 gsi
= gsi_start_bb (init_bb
);
6172 gsi_insert_seq_before (&gsi
, init_seq
, GSI_SAME_STMT
);
6174 /* Split block just after the init stmt. */
6176 edge inited_edge
= split_block (gsi_bb (gsi
), gsi_stmt (gsi
));
6177 basic_block dst_bb
= inited_edge
->dest
;
6179 /* Create false edge from call_bb to dst_bb. */
6180 edge nop_edge
= make_edge (call_bb
, dst_bb
, EDGE_FALSE_VALUE
);
6181 nop_edge
->probability
= profile_probability::even ();
6183 /* Create phi node in dst block. */
/* LHS is INIT_VAR when DST_BB is reached through INIT_BB, and the
   incoming VAR when reached directly from CALL_BB. */
6184 gphi
*phi
= create_phi_node (lhs
, dst_bb
);
6185 add_phi_arg (phi
, init_var
, inited_edge
, gimple_location (call
));
6186 add_phi_arg (phi
, var
, nop_edge
, gimple_location (call
));
6188 /* Reset dominator of dst bb. */
6189 set_immediate_dominator (CDI_DOMINATORS
, dst_bb
, call_bb
);
6191 /* Reset the gsi. */
6192 gsi
= gsi_for_stmt (call
);
6196 if (level
== GOMP_DIM_GANG
)
6198 /* If there's no receiver object, propagate the incoming VAR. */
6199 tree ref_to_res
= gimple_call_arg (call
, 1);
6200 if (integer_zerop (ref_to_res
))
/* Only emit the assignment when the call has a result. */
6204 if (lhs
!= NULL_TREE
)
6205 gimplify_assign (lhs
, init
, &seq
);
6208 pop_gimplify_context (NULL
);
/* Replace the original call statement with the generated sequence. */
6209 gsi_replace_with_seq (&gsi
, seq
, true);
6212 /* NVPTX implementation of GOACC_REDUCTION_FINI.

   CALL is the reduction call being expanded and OA the offload
   attributes (OA->vector_length is consulted).  The arguments read from
   CALL are: 1 = reference to the receiver object (REF_TO_RES),
   2 = the value to combine (VAR), 3 = the partitioning level,
   4 = the reduction operator as an enum tree_code and 5 = the offset
   handed to nvptx_get_shared_red_addr.  CALL is finally replaced by the
   statement sequence built here. */
6215 nvptx_goacc_reduction_fini (gcall
*call
, offload_attrs
*oa
)
6217 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6218 tree lhs
= gimple_call_lhs (call
);
6219 tree ref_to_res
= gimple_call_arg (call
, 1);
6220 tree var
= gimple_call_arg (call
, 2);
6221 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6223 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
6224 gimple_seq seq
= NULL
;
6225 tree r
= NULL_TREE
;;
6227 push_gimplify_context (true);
/* Vector level with a vector length of exactly PTX_WARP_SIZE: the loop
   below runs with SHFL = PTX_WARP_SIZE / 2 and halves it until it
   reaches zero.  Each iteration calls nvptx_generate_vector_shuffle
   with OTHER_VAR, VAR and SHFL, then assigns VAR op OTHER_VAR to a
   fresh SSA name R. */
6229 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
6231 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6232 but that requires a method of emitting a unified jump at the
6234 for (int shfl
= PTX_WARP_SIZE
/ 2; shfl
> 0; shfl
= shfl
>> 1)
6236 tree other_var
= make_ssa_name (TREE_TYPE (var
));
6237 nvptx_generate_vector_shuffle (gimple_location (call
),
6238 other_var
, var
, shfl
, &seq
);
6240 r
= make_ssa_name (TREE_TYPE (var
));
6241 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
6242 var
, other_var
), &seq
);
6248 tree accum
= NULL_TREE
;
6250 if (level
== GOMP_DIM_WORKER
|| level
== GOMP_DIM_VECTOR
)
6252 /* Get reduction buffer address. */
6253 tree offset
= gimple_call_arg (call
, 5);
/* NOTE(review): the local variable named call declared next has the
   same name as the gcall parameter and hides it for the rest of its
   scope (presumably a nested block -- confirm). */
6254 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6255 level
== GOMP_DIM_VECTOR
);
6256 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6258 gimplify_assign (ptr
, call
, &seq
);
6261 else if (integer_zerop (ref_to_res
))
6268 /* UPDATE the accumulator. */
/* The statements built so far are inserted before CALL first, because
   nvptx_reduction_update is handed the same iterator. */
6269 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
6271 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
/* R is the value handed back to the caller of the reduction. */
6277 gimplify_assign (lhs
, r
, &seq
);
6278 pop_gimplify_context (NULL
);
/* Replace the original call statement with the generated sequence. */
6280 gsi_replace_with_seq (&gsi
, seq
, true);
6283 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.

   CALL is the reduction call being expanded and OA the offload
   attributes (OA->vector_length is consulted).  The arguments read from
   CALL are: 1 = reference to the receiver object (REF_TO_RES),
   2 = the value (VAR), 3 = the partitioning level and 5 = the offset
   handed to nvptx_get_shared_red_addr.  CALL is finally replaced by the
   statement sequence built here. */
6286 nvptx_goacc_reduction_teardown (gcall
*call
, offload_attrs
*oa
)
6288 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6289 tree lhs
= gimple_call_lhs (call
);
6290 tree var
= gimple_call_arg (call
, 2);
6291 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6292 gimple_seq seq
= NULL
;
6294 push_gimplify_context (true);
/* Same level test as in nvptx_goacc_reduction_setup: worker level, or
   vector level with a vector length above PTX_WARP_SIZE. */
6295 if (level
== GOMP_DIM_WORKER
6296 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6298 /* Read the worker reduction buffer. */
6299 tree offset
= gimple_call_arg (call
, 5);
/* NOTE(review): the local variable named call declared next has the
   same name as the gcall parameter and hides it for the rest of its
   scope (presumably a nested block -- confirm). */
6300 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6301 level
== GOMP_DIM_VECTOR
);
6302 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6304 gimplify_assign (ptr
, call
, &seq
);
/* From here VAR is a volatile memory reference through PTR instead of
   argument 2 of CALL. */
6305 var
= build_simple_mem_ref (ptr
);
6306 TREE_THIS_VOLATILE (var
) = 1;
6309 if (level
!= GOMP_DIM_GANG
)
6311 /* Write to the receiver object. */
6312 tree ref_to_res
= gimple_call_arg (call
, 1);
/* Skip the store when REF_TO_RES is the zero constant. */
6314 if (!integer_zerop (ref_to_res
))
6315 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
/* The result of the call is VAR. */
6319 gimplify_assign (lhs
, var
, &seq
);
6321 pop_gimplify_context (NULL
);
/* Replace the original call statement with the generated sequence. */
6323 gsi_replace_with_seq (&gsi
, seq
, true);
6326 /* NVPTX reduction expander.

   Installed as TARGET_GOACC_REDUCTION in the hook table later in this
   file.  Argument 0 of CALL holds the reduction phase as an integer
   constant; it selects one of the four phase expanders below, each of
   which receives CALL together with the offload attributes filled in by
   populate_offload_attrs. */
6329 nvptx_goacc_reduction (gcall
*call
)
6331 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
6334 populate_offload_attrs (&oa
);
6338 case IFN_GOACC_REDUCTION_SETUP
:
6339 nvptx_goacc_reduction_setup (call
, &oa
);
6342 case IFN_GOACC_REDUCTION_INIT
:
6343 nvptx_goacc_reduction_init (call
, &oa
);
6346 case IFN_GOACC_REDUCTION_FINI
:
6347 nvptx_goacc_reduction_fini (call
, &oa
);
6350 case IFN_GOACC_REDUCTION_TEARDOWN
:
6351 nvptx_goacc_reduction_teardown (call
, &oa
);
/* Installed as TARGET_CANNOT_FORCE_CONST_MEM in the hook table later in
   this file.  Both MODE and X are marked ATTRIBUTE_UNUSED. */
6360 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
,
6361 rtx x ATTRIBUTE_UNUSED
)
/* Installed as TARGET_VECTOR_MODE_SUPPORTED_P in the hook table later in
   this file.  The result is true exactly when MODE is V2SImode or
   V2DImode. */
6367 nvptx_vector_mode_supported (machine_mode mode
)
6369 return (mode
== V2SImode
6370 || mode
== V2DImode
);
6373 /* Return the preferred mode for vectorizing scalar MODE.

   Installed as TARGET_VECTORIZE_PREFERRED_SIMD_MODE in the hook table
   later in this file.  The return statement below delegates to
   default_preferred_simd_mode (MODE). */
6376 nvptx_preferred_simd_mode (scalar_mode mode
)
6386 return default_preferred_simd_mode (mode
);
/* Compute a data alignment for TYPE given BASIC_ALIGN.  For an
   INTEGER_TYPE whose size in bytes equals GET_MODE_SIZE (TImode), the
   result is the bit size of the mode maybe_split_mode returns for
   TImode. */
6391 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
6393 if (TREE_CODE (type
) == INTEGER_TYPE
)
/* tree_to_uhwi expects TYPE_SIZE_UNIT to be a constant that fits an
   unsigned HOST_WIDE_INT -- presumably guaranteed for integer types;
   confirm. */
6395 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
6396 if (size
== GET_MODE_SIZE (TImode
))
6397 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
6403 /* Implement TARGET_MODES_TIEABLE_P.  Both machine_mode parameters
   are unnamed, so the result cannot depend on them. */
6406 nvptx_modes_tieable_p (machine_mode
, machine_mode
)
6411 /* Implement TARGET_HARD_REGNO_NREGS.  The register number and mode
   parameters are unnamed, so the result cannot depend on them. */
6414 nvptx_hard_regno_nregs (unsigned int, machine_mode
)
6419 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  All three parameters are
   unnamed, so the result cannot depend on them. */
6422 nvptx_can_change_mode_class (machine_mode
, machine_mode
, reg_class_t
)
/* The function declaration most recently recorded by
   nvptx_set_current_function. */
6427 static GTY(()) tree nvptx_previous_fndecl
;
/* Installed as TARGET_SET_CURRENT_FUNCTION in the hook table later in
   this file.  When FNDECL is non-null and differs from the previously
   recorded declaration, record it and reset vector_red_partition and
   oacc_bcast_partition to zero. */
6430 nvptx_set_current_function (tree fndecl
)
/* Null FNDECL, or the same function as last time. */
6432 if (!fndecl
|| fndecl
== nvptx_previous_fndecl
)
6435 nvptx_previous_fndecl
= fndecl
;
6436 vector_red_partition
= 0;
6437 oacc_bcast_partition
= 0;
/* Target hook overrides.  Each TARGET_* macro below is redefined to the
   nvptx implementation (or to a generic hook_* helper or a constant)
   before TARGET_INITIALIZER is expanded into targetm at the end of this
   file. */
6440 #undef TARGET_OPTION_OVERRIDE
6441 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6443 #undef TARGET_ATTRIBUTE_TABLE
6444 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6447 #define TARGET_LRA_P hook_bool_void_false
6449 #undef TARGET_LEGITIMATE_ADDRESS_P
6450 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6452 #undef TARGET_PROMOTE_FUNCTION_MODE
6453 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6455 #undef TARGET_FUNCTION_ARG
6456 #define TARGET_FUNCTION_ARG nvptx_function_arg
6457 #undef TARGET_FUNCTION_INCOMING_ARG
6458 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6459 #undef TARGET_FUNCTION_ARG_ADVANCE
6460 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6461 #undef TARGET_FUNCTION_ARG_BOUNDARY
6462 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6463 #undef TARGET_PASS_BY_REFERENCE
6464 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6465 #undef TARGET_FUNCTION_VALUE_REGNO_P
6466 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6467 #undef TARGET_FUNCTION_VALUE
6468 #define TARGET_FUNCTION_VALUE nvptx_function_value
6469 #undef TARGET_LIBCALL_VALUE
6470 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6471 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6472 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6473 #undef TARGET_GET_DRAP_RTX
6474 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6475 #undef TARGET_SPLIT_COMPLEX_ARG
6476 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6477 #undef TARGET_RETURN_IN_MEMORY
6478 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6479 #undef TARGET_OMIT_STRUCT_RETURN_REG
6480 #define TARGET_OMIT_STRUCT_RETURN_REG true
6481 #undef TARGET_STRICT_ARGUMENT_NAMING
6482 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6483 #undef TARGET_CALL_ARGS
6484 #define TARGET_CALL_ARGS nvptx_call_args
6485 #undef TARGET_END_CALL_ARGS
6486 #define TARGET_END_CALL_ARGS nvptx_end_call_args
/* TARGET_ASM_* and operand-printing hooks. */
6488 #undef TARGET_ASM_FILE_START
6489 #define TARGET_ASM_FILE_START nvptx_file_start
6490 #undef TARGET_ASM_FILE_END
6491 #define TARGET_ASM_FILE_END nvptx_file_end
6492 #undef TARGET_ASM_GLOBALIZE_LABEL
6493 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6494 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6495 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6496 #undef TARGET_PRINT_OPERAND
6497 #define TARGET_PRINT_OPERAND nvptx_print_operand
6498 #undef TARGET_PRINT_OPERAND_ADDRESS
6499 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6500 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6501 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6502 #undef TARGET_ASM_INTEGER
6503 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6504 #undef TARGET_ASM_DECL_END
6505 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6506 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6507 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6508 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6509 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6510 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6511 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6513 #undef TARGET_MACHINE_DEPENDENT_REORG
6514 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6515 #undef TARGET_NO_REGISTER_ALLOCATION
6516 #define TARGET_NO_REGISTER_ALLOCATION true
6518 #undef TARGET_ENCODE_SECTION_INFO
6519 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6520 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6521 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6523 #undef TARGET_VECTOR_ALIGNMENT
6524 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6526 #undef TARGET_CANNOT_COPY_INSN_P
6527 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6529 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6530 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6532 #undef TARGET_INIT_BUILTINS
6533 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6534 #undef TARGET_EXPAND_BUILTIN
6535 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6536 #undef TARGET_BUILTIN_DECL
6537 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6539 #undef TARGET_SIMT_VF
6540 #define TARGET_SIMT_VF nvptx_simt_vf
/* TARGET_GOACC_* hooks. */
6542 #undef TARGET_GOACC_VALIDATE_DIMS
6543 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6545 #undef TARGET_GOACC_DIM_LIMIT
6546 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6548 #undef TARGET_GOACC_FORK_JOIN
6549 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6551 #undef TARGET_GOACC_REDUCTION
6552 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6554 #undef TARGET_CANNOT_FORCE_CONST_MEM
6555 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6557 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6558 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6560 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6561 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6562 nvptx_preferred_simd_mode
6564 #undef TARGET_MODES_TIEABLE_P
6565 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6567 #undef TARGET_HARD_REGNO_NREGS
6568 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6570 #undef TARGET_CAN_CHANGE_MODE_CLASS
6571 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6573 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6574 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6576 #undef TARGET_SET_CURRENT_FUNCTION
6577 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
/* The target hook structure for this backend, built by
   TARGET_INITIALIZER from the TARGET_* macro definitions above. */
6579 struct gcc_target targetm
= TARGET_INITIALIZER
;
6581 #include "gt-nvptx.h"