1 /* Target code for NVPTX.
2 Copyright (C) 2014-2019 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
26 #include "coretypes.h"
40 #include "diagnostic.h"
42 #include "insn-flags.h"
44 #include "insn-attr.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
58 #include "stor-layout.h"
60 #include "omp-general.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
73 #include "tree-phinodes.h"
75 #include "fold-const.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
85 /* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
86 block, which has had a maximum number of threads of 1024 since CUDA version
88 #define PTX_CTA_SIZE 1024
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
103 /* The various PTX memory areas an object might reside in. */
115 /* We record the data area in the target symbol flags. */
116 #define SYMBOL_DATA_AREA(SYM) \
117 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
119 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
122 /* Record the function decls we've written, and the libfuncs and function
123 decls corresponding to them. */
124 static std::stringstream func_decls
;
126 struct declared_libfunc_hasher
: ggc_cache_ptr_hash
<rtx_def
>
128 static hashval_t
hash (rtx x
) { return htab_hash_pointer (x
); }
129 static bool equal (rtx a
, rtx b
) { return a
== b
; }
133 hash_table
<declared_libfunc_hasher
> *declared_libfuncs_htab
;
135 struct tree_hasher
: ggc_cache_ptr_hash
<tree_node
>
137 static hashval_t
hash (tree t
) { return htab_hash_pointer (t
); }
138 static bool equal (tree a
, tree b
) { return a
== b
; }
141 static GTY((cache
)) hash_table
<tree_hasher
> *declared_fndecls_htab
;
142 static GTY((cache
)) hash_table
<tree_hasher
> *needed_fndecls_htab
;
144 /* Buffer needed to broadcast across workers and vectors. This is
145 used for both worker-neutering and worker broadcasting, and
146 vector-neutering and broadcasting when vector_length > 32. It is
147 shared by all functions emitted. The buffer is placed in shared
148 memory. It'd be nice if PTX supported common blocks, because then
149 this could be shared across TUs (taking the largest size). */
150 static unsigned oacc_bcast_size
;
151 static unsigned oacc_bcast_partition
;
152 static unsigned oacc_bcast_align
;
153 static GTY(()) rtx oacc_bcast_sym
;
155 /* Buffer needed for worker reductions. This has to be distinct from
156 the worker broadcast array, as both may be live concurrently. */
157 static unsigned worker_red_size
;
158 static unsigned worker_red_align
;
159 static GTY(()) rtx worker_red_sym
;
161 /* Buffer needed for vector reductions, when vector_length >
162 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
163 array, as both may be live concurrently. */
164 static unsigned vector_red_size
;
165 static unsigned vector_red_align
;
166 static unsigned vector_red_partition
;
167 static GTY(()) rtx vector_red_sym
;
169 /* Global lock variable, needed for 128bit worker & gang reductions. */
170 static GTY(()) tree global_lock_var
;
172 /* True if any function references __nvptx_stacks. */
173 static bool need_softstack_decl
;
175 /* True if any function references __nvptx_uni. */
176 static bool need_unisimt_decl
;
178 static int nvptx_mach_max_workers ();
180 /* Allocate a new, cleared machine_function structure. */
182 static struct machine_function
*
183 nvptx_init_machine_status (void)
185 struct machine_function
*p
= ggc_cleared_alloc
<machine_function
> ();
186 p
->return_mode
= VOIDmode
;
190 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191 and -fopenacc is also enabled. */
194 diagnose_openacc_conflict (bool optval
, const char *optname
)
196 if (flag_openacc
&& optval
)
197 error ("option %s is not supported together with %<-fopenacc%>", optname
);
200 /* Implement TARGET_OPTION_OVERRIDE. */
203 nvptx_option_override (void)
205 init_machine_status
= nvptx_init_machine_status
;
207 /* Set toplevel_reorder, unless explicitly disabled. We need
208 reordering so that we emit necessary assembler decls of
209 undeclared variables. */
210 if (!global_options_set
.x_flag_toplevel_reorder
)
211 flag_toplevel_reorder
= 1;
213 debug_nonbind_markers_p
= 0;
215 /* Set flag_no_common, unless explicitly disabled. We fake common
216 using .weak, and that's not entirely accurate, so avoid it
218 if (!global_options_set
.x_flag_no_common
)
221 /* The patch area requires nops, which we don't have. */
222 if (function_entry_patch_area_size
> 0)
223 sorry ("not generating patch area, nops not supported");
225 /* Assumes that it will see only hard registers. */
226 flag_var_tracking
= 0;
228 if (nvptx_optimize
< 0)
229 nvptx_optimize
= optimize
> 0;
231 declared_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
232 needed_fndecls_htab
= hash_table
<tree_hasher
>::create_ggc (17);
233 declared_libfuncs_htab
234 = hash_table
<declared_libfunc_hasher
>::create_ggc (17);
236 oacc_bcast_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__oacc_bcast");
237 SET_SYMBOL_DATA_AREA (oacc_bcast_sym
, DATA_AREA_SHARED
);
238 oacc_bcast_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
239 oacc_bcast_partition
= 0;
241 worker_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__worker_red");
242 SET_SYMBOL_DATA_AREA (worker_red_sym
, DATA_AREA_SHARED
);
243 worker_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
245 vector_red_sym
= gen_rtx_SYMBOL_REF (Pmode
, "__vector_red");
246 SET_SYMBOL_DATA_AREA (vector_red_sym
, DATA_AREA_SHARED
);
247 vector_red_align
= GET_MODE_ALIGNMENT (SImode
) / BITS_PER_UNIT
;
248 vector_red_partition
= 0;
250 diagnose_openacc_conflict (TARGET_GOMP
, "-mgomp");
251 diagnose_openacc_conflict (TARGET_SOFT_STACK
, "-msoft-stack");
252 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT
, "-muniform-simt");
255 target_flags
|= MASK_SOFT_STACK
| MASK_UNIFORM_SIMT
;
258 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
259 deal with ptx idiosyncrasies. */
262 nvptx_ptx_type_from_mode (machine_mode mode
, bool promote
)
297 /* Encode the PTX data area that DECL (which might not actually be a
298 _DECL) should reside in. */
301 nvptx_encode_section_info (tree decl
, rtx rtl
, int first
)
303 default_encode_section_info (decl
, rtl
, first
);
304 if (first
&& MEM_P (rtl
))
306 nvptx_data_area area
= DATA_AREA_GENERIC
;
308 if (TREE_CONSTANT (decl
))
309 area
= DATA_AREA_CONST
;
310 else if (TREE_CODE (decl
) == VAR_DECL
)
312 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl
)))
314 area
= DATA_AREA_SHARED
;
315 if (DECL_INITIAL (decl
))
316 error ("static initialization of variable %q+D in %<.shared%>"
317 " memory is not supported", decl
);
320 area
= TREE_READONLY (decl
) ? DATA_AREA_CONST
: DATA_AREA_GLOBAL
;
323 SET_SYMBOL_DATA_AREA (XEXP (rtl
, 0), area
);
327 /* Return the PTX name of the data area in which SYM should be
328 placed. The symbol must have already been processed by
329 nvptx_encode_seciton_info, or equivalent. */
332 section_for_sym (rtx sym
)
334 nvptx_data_area area
= SYMBOL_DATA_AREA (sym
);
335 /* Same order as nvptx_data_area enum. */
336 static char const *const areas
[] =
337 {"", ".global", ".shared", ".local", ".const", ".param"};
342 /* Similarly for a decl. */
345 section_for_decl (const_tree decl
)
347 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree
, decl
)), 0));
/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  Any other NAME is returned unchanged.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}
371 /* If MODE should be treated as two registers of an inner mode, return
372 that inner mode. Otherwise return VOIDmode. */
375 maybe_split_mode (machine_mode mode
)
377 if (COMPLEX_MODE_P (mode
))
378 return GET_MODE_INNER (mode
);
386 /* Return true if mode should be treated as two registers. */
389 split_mode_p (machine_mode mode
)
391 return maybe_split_mode (mode
) != VOIDmode
;
394 /* Output a register, subreg, or register pair (with optional
395 enclosing braces). */
398 output_reg (FILE *file
, unsigned regno
, machine_mode inner_mode
,
399 int subreg_offset
= -1)
401 if (inner_mode
== VOIDmode
)
403 if (HARD_REGISTER_NUM_P (regno
))
404 fprintf (file
, "%s", reg_names
[regno
]);
406 fprintf (file
, "%%r%d", regno
);
408 else if (subreg_offset
>= 0)
410 output_reg (file
, regno
, VOIDmode
);
411 fprintf (file
, "$%d", subreg_offset
);
415 if (subreg_offset
== -1)
417 output_reg (file
, regno
, inner_mode
, GET_MODE_SIZE (inner_mode
));
419 output_reg (file
, regno
, inner_mode
, 0);
420 if (subreg_offset
== -1)
425 /* Emit forking instructions for MASK. */
428 nvptx_emit_forking (unsigned mask
, bool is_call
)
430 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
431 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
434 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
436 /* Emit fork at all levels. This helps form SESE regions, as
437 it creates a block with a single successor before entering a
438 partitooned region. That is a good candidate for the end of
440 emit_insn (gen_nvptx_fork (op
));
441 emit_insn (gen_nvptx_forked (op
));
445 /* Emit joining instructions for MASK. */
448 nvptx_emit_joining (unsigned mask
, bool is_call
)
450 mask
&= (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
));
454 rtx op
= GEN_INT (mask
| (is_call
<< GOMP_DIM_MAX
));
456 /* Emit joining for all non-call pars to ensure there's a single
457 predecessor for the block the join insn ends up in. This is
458 needed for skipping entire loops. */
459 emit_insn (gen_nvptx_joining (op
));
460 emit_insn (gen_nvptx_join (op
));
465 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
466 returned in memory. Integer and floating types supported by the
467 machine are passed in registers, everything else is passed in
468 memory. Complex types are split. */
471 pass_in_memory (machine_mode mode
, const_tree type
, bool for_return
)
475 if (AGGREGATE_TYPE_P (type
))
477 if (TREE_CODE (type
) == VECTOR_TYPE
)
481 if (!for_return
&& COMPLEX_MODE_P (mode
))
482 /* Complex types are passed as two underlying args. */
483 mode
= GET_MODE_INNER (mode
);
485 if (GET_MODE_CLASS (mode
) != MODE_INT
486 && GET_MODE_CLASS (mode
) != MODE_FLOAT
)
489 if (GET_MODE_SIZE (mode
) > UNITS_PER_WORD
)
495 /* A non-memory argument of mode MODE is being passed, determine the mode it
496 should be promoted to. This is also used for determining return
500 promote_arg (machine_mode mode
, bool prototyped
)
502 if (!prototyped
&& mode
== SFmode
)
503 /* K&R float promotion for unprototyped functions. */
505 else if (GET_MODE_SIZE (mode
) < GET_MODE_SIZE (SImode
))
511 /* A non-memory return type of MODE is being returned. Determine the
512 mode it should be promoted to. */
515 promote_return (machine_mode mode
)
517 return promote_arg (mode
, true);
520 /* Implement TARGET_FUNCTION_ARG. */
523 nvptx_function_arg (cumulative_args_t
ARG_UNUSED (cum_v
), machine_mode mode
,
524 const_tree
, bool named
)
526 if (mode
== VOIDmode
|| !named
)
529 return gen_reg_rtx (mode
);
532 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
535 nvptx_function_incoming_arg (cumulative_args_t cum_v
, machine_mode mode
,
536 const_tree
, bool named
)
538 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
540 if (mode
== VOIDmode
|| !named
)
543 /* No need to deal with split modes here, the only case that can
544 happen is complex modes and those are dealt with by
545 TARGET_SPLIT_COMPLEX_ARG. */
546 return gen_rtx_UNSPEC (mode
,
547 gen_rtvec (1, GEN_INT (cum
->count
)),
551 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
554 nvptx_function_arg_advance (cumulative_args_t cum_v
,
555 machine_mode
ARG_UNUSED (mode
),
556 const_tree
ARG_UNUSED (type
),
557 bool ARG_UNUSED (named
))
559 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
564 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
566 For nvptx This is only used for varadic args. The type has already
567 been promoted and/or converted to invisible reference. */
570 nvptx_function_arg_boundary (machine_mode mode
, const_tree
ARG_UNUSED (type
))
572 return GET_MODE_ALIGNMENT (mode
);
575 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
577 For nvptx, we know how to handle functions declared as stdarg: by
578 passing an extra pointer to the unnamed arguments. However, the
579 Fortran frontend can produce a different situation, where a
580 function pointer is declared with no arguments, but the actual
581 function and calls to it take more arguments. In that case, we
582 want to ensure the call matches the definition of the function. */
585 nvptx_strict_argument_naming (cumulative_args_t cum_v
)
587 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
589 return cum
->fntype
== NULL_TREE
|| stdarg_p (cum
->fntype
);
592 /* Implement TARGET_LIBCALL_VALUE. */
595 nvptx_libcall_value (machine_mode mode
, const_rtx
)
597 if (!cfun
|| !cfun
->machine
->doing_call
)
598 /* Pretend to return in a hard reg for early uses before pseudos can be
600 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
602 return gen_reg_rtx (mode
);
605 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
606 where function FUNC returns or receives a value of data type TYPE. */
609 nvptx_function_value (const_tree type
, const_tree
ARG_UNUSED (func
),
612 machine_mode mode
= promote_return (TYPE_MODE (type
));
617 cfun
->machine
->return_mode
= mode
;
618 return gen_rtx_REG (mode
, NVPTX_RETURN_REGNUM
);
621 return nvptx_libcall_value (mode
, NULL_RTX
);
624 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
627 nvptx_function_value_regno_p (const unsigned int regno
)
629 return regno
== NVPTX_RETURN_REGNUM
;
632 /* Types with a mode other than those supported by the machine are passed by
633 reference in memory. */
636 nvptx_pass_by_reference (cumulative_args_t
ARG_UNUSED (cum
),
637 machine_mode mode
, const_tree type
,
638 bool ARG_UNUSED (named
))
640 return pass_in_memory (mode
, type
, false);
643 /* Implement TARGET_RETURN_IN_MEMORY. */
646 nvptx_return_in_memory (const_tree type
, const_tree
)
648 return pass_in_memory (TYPE_MODE (type
), type
, true);
651 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
654 nvptx_promote_function_mode (const_tree type
, machine_mode mode
,
655 int *ARG_UNUSED (punsignedp
),
656 const_tree funtype
, int for_return
)
658 return promote_arg (mode
, for_return
|| !type
|| TYPE_ARG_TYPES (funtype
));
661 /* Helper for write_arg. Emit a single PTX argument of MODE, either
662 in a prototype, or as copy in a function prologue. ARGNO is the
663 index of this argument in the PTX function. FOR_REG is negative,
664 if we're emitting the PTX prototype. It is zero if we're copying
665 to an argument register and it is greater than zero if we're
666 copying to a specific hard register. */
669 write_arg_mode (std::stringstream
&s
, int for_reg
, int argno
,
672 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
676 /* Writing PTX prototype. */
677 s
<< (argno
? ", " : " (");
678 s
<< ".param" << ptx_type
<< " %in_ar" << argno
;
682 s
<< "\t.reg" << ptx_type
<< " ";
684 s
<< reg_names
[for_reg
];
690 s
<< "\tld.param" << ptx_type
<< " ";
692 s
<< reg_names
[for_reg
];
695 s
<< ", [%in_ar" << argno
<< "];\n";
701 /* Process function parameter TYPE to emit one or more PTX
702 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
703 is true, if this is a prototyped function, rather than an old-style
704 C declaration. Returns the next argument number to use.
706 The promotion behavior here must match the regular GCC function
707 parameter marshalling machinery. */
710 write_arg_type (std::stringstream
&s
, int for_reg
, int argno
,
711 tree type
, bool prototyped
)
713 machine_mode mode
= TYPE_MODE (type
);
715 if (mode
== VOIDmode
)
718 if (pass_in_memory (mode
, type
, false))
722 bool split
= TREE_CODE (type
) == COMPLEX_TYPE
;
726 /* Complex types are sent as two separate args. */
727 type
= TREE_TYPE (type
);
728 mode
= TYPE_MODE (type
);
732 mode
= promote_arg (mode
, prototyped
);
734 argno
= write_arg_mode (s
, for_reg
, argno
, mode
);
737 return write_arg_mode (s
, for_reg
, argno
, mode
);
740 /* Emit a PTX return as a prototype or function prologue declaration
744 write_return_mode (std::stringstream
&s
, bool for_proto
, machine_mode mode
)
746 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
747 const char *pfx
= "\t.reg";
748 const char *sfx
= ";\n";
751 pfx
= "(.param", sfx
= "_out) ";
753 s
<< pfx
<< ptx_type
<< " " << reg_names
[NVPTX_RETURN_REGNUM
] << sfx
;
756 /* Process a function return TYPE to emit a PTX return as a prototype
757 or function prologue declaration. Returns true if return is via an
758 additional pointer parameter. The promotion behavior here must
759 match the regular GCC function return marshalling. */
762 write_return_type (std::stringstream
&s
, bool for_proto
, tree type
)
764 machine_mode mode
= TYPE_MODE (type
);
766 if (mode
== VOIDmode
)
769 bool return_in_mem
= pass_in_memory (mode
, type
, true);
774 return return_in_mem
;
776 /* Named return values can cause us to return a pointer as well
777 as expect an argument for the return location. This is
778 optimization-level specific, so no caller can make use of
779 this data, but more importantly for us, we must ensure it
780 doesn't change the PTX prototype. */
781 mode
= (machine_mode
) cfun
->machine
->return_mode
;
783 if (mode
== VOIDmode
)
784 return return_in_mem
;
786 /* Clear return_mode to inhibit copy of retval to non-existent
788 cfun
->machine
->return_mode
= VOIDmode
;
791 mode
= promote_return (mode
);
793 write_return_mode (s
, for_proto
, mode
);
795 return return_in_mem
;
798 /* Look for attributes in ATTRS that would indicate we must write a function
799 as a .entry kernel rather than a .func. Return true if one is found. */
802 write_as_kernel (tree attrs
)
804 return (lookup_attribute ("kernel", attrs
) != NULL_TREE
805 || (lookup_attribute ("omp target entrypoint", attrs
) != NULL_TREE
806 && lookup_attribute ("oacc function", attrs
) != NULL_TREE
));
807 /* For OpenMP target regions, the corresponding kernel entry is emitted from
808 write_omp_entry as a separate function. */
811 /* Emit a linker marker for a function decl or defn. */
814 write_fn_marker (std::stringstream
&s
, bool is_defn
, bool globalize
,
820 s
<< " FUNCTION " << (is_defn
? "DEF: " : "DECL: ");
824 /* Emit a linker marker for a variable decl or defn. */
827 write_var_marker (FILE *file
, bool is_defn
, bool globalize
, const char *name
)
829 fprintf (file
, "\n// BEGIN%s VAR %s: ",
830 globalize
? " GLOBAL" : "",
831 is_defn
? "DEF" : "DECL");
832 assemble_name_raw (file
, name
);
836 /* Write a .func or .kernel declaration or definition along with
837 a helper comment for use by ld. S is the stream to write to, DECL
838 the decl for the function with name NAME. For definitions, emit
839 a declaration too. */
842 write_fn_proto (std::stringstream
&s
, bool is_defn
,
843 const char *name
, const_tree decl
)
846 /* Emit a declaration. The PTX assembler gets upset without it. */
847 name
= write_fn_proto (s
, false, name
, decl
);
850 /* Avoid repeating the name replacement. */
851 name
= nvptx_name_replacement (name
);
856 write_fn_marker (s
, is_defn
, TREE_PUBLIC (decl
), name
);
858 /* PTX declaration. */
859 if (DECL_EXTERNAL (decl
))
861 else if (TREE_PUBLIC (decl
))
862 s
<< (DECL_WEAK (decl
) ? ".weak " : ".visible ");
863 s
<< (write_as_kernel (DECL_ATTRIBUTES (decl
)) ? ".entry " : ".func ");
865 tree fntype
= TREE_TYPE (decl
);
866 tree result_type
= TREE_TYPE (fntype
);
868 /* atomic_compare_exchange_$n builtins have an exceptional calling
870 int not_atomic_weak_arg
= -1;
871 if (DECL_BUILT_IN_CLASS (decl
) == BUILT_IN_NORMAL
)
872 switch (DECL_FUNCTION_CODE (decl
))
874 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1
:
875 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2
:
876 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4
:
877 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8
:
878 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16
:
879 /* These atomics skip the 'weak' parm in an actual library
880 call. We must skip it in the prototype too. */
881 not_atomic_weak_arg
= 3;
888 /* Declare the result. */
889 bool return_in_mem
= write_return_type (s
, true, result_type
);
895 /* Emit argument list. */
897 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
900 NULL in TYPE_ARG_TYPES, for old-style functions
901 NULL in DECL_ARGUMENTS, for builtin functions without another
903 So we have to pick the best one we have. */
904 tree args
= TYPE_ARG_TYPES (fntype
);
905 bool prototyped
= true;
908 args
= DECL_ARGUMENTS (decl
);
912 for (; args
; args
= TREE_CHAIN (args
), not_atomic_weak_arg
--)
914 tree type
= prototyped
? TREE_VALUE (args
) : TREE_TYPE (args
);
916 if (not_atomic_weak_arg
)
917 argno
= write_arg_type (s
, -1, argno
, type
, prototyped
);
919 gcc_assert (type
== boolean_type_node
);
922 if (stdarg_p (fntype
))
923 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
925 if (DECL_STATIC_CHAIN (decl
))
926 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
928 if (!argno
&& strcmp (name
, "main") == 0)
930 argno
= write_arg_type (s
, -1, argno
, integer_type_node
, true);
931 argno
= write_arg_type (s
, -1, argno
, ptr_type_node
, true);
937 s
<< (is_defn
? "\n" : ";\n");
942 /* Construct a function declaration from a call insn. This can be
943 necessary for two reasons - either we have an indirect call which
944 requires a .callprototype declaration, or we have a libcall
945 generated by emit_library_call for which no decl exists. */
948 write_fn_proto_from_insn (std::stringstream
&s
, const char *name
,
953 s
<< "\t.callprototype ";
958 name
= nvptx_name_replacement (name
);
959 write_fn_marker (s
, false, true, name
);
960 s
<< "\t.extern .func ";
963 if (result
!= NULL_RTX
)
964 write_return_mode (s
, true, GET_MODE (result
));
968 int arg_end
= XVECLEN (pat
, 0);
969 for (int i
= 1; i
< arg_end
; i
++)
971 /* We don't have to deal with mode splitting & promotion here,
972 as that was already done when generating the call
974 machine_mode mode
= GET_MODE (XEXP (XVECEXP (pat
, 0, i
), 0));
976 write_arg_mode (s
, -1, i
- 1, mode
);
983 /* DECL is an external FUNCTION_DECL, make sure its in the fndecl hash
984 table and write a ptx prototype. These are emitted at end of
988 nvptx_record_fndecl (tree decl
)
990 tree
*slot
= declared_fndecls_htab
->find_slot (decl
, INSERT
);
994 const char *name
= get_fnname_from_decl (decl
);
995 write_fn_proto (func_decls
, false, name
, decl
);
999 /* Record a libcall or unprototyped external function. CALLEE is the
1000 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1001 declaration for it. */
1004 nvptx_record_libfunc (rtx callee
, rtx retval
, rtx pat
)
1006 rtx
*slot
= declared_libfuncs_htab
->find_slot (callee
, INSERT
);
1011 const char *name
= XSTR (callee
, 0);
1012 write_fn_proto_from_insn (func_decls
, name
, retval
, pat
);
1016 /* DECL is an external FUNCTION_DECL, that we're referencing. If it
1017 is prototyped, record it now. Otherwise record it as needed at end
1018 of compilation, when we might have more information about it. */
1021 nvptx_record_needed_fndecl (tree decl
)
1023 if (TYPE_ARG_TYPES (TREE_TYPE (decl
)) == NULL_TREE
)
1025 tree
*slot
= needed_fndecls_htab
->find_slot (decl
, INSERT
);
1030 nvptx_record_fndecl (decl
);
1033 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1037 nvptx_maybe_record_fnsym (rtx sym
)
1039 tree decl
= SYMBOL_REF_DECL (sym
);
1041 if (decl
&& TREE_CODE (decl
) == FUNCTION_DECL
&& DECL_EXTERNAL (decl
))
1042 nvptx_record_needed_fndecl (decl
);
1045 /* Emit a local array to hold some part of a conventional stack frame
1046 and initialize REGNO to point to it. If the size is zero, it'll
1047 never be valid to dereference, so we can simply initialize to
1051 init_frame (FILE *file
, int regno
, unsigned align
, unsigned size
)
1054 fprintf (file
, "\t.local .align %d .b8 %s_ar[%u];\n",
1055 align
, reg_names
[regno
], size
);
1056 fprintf (file
, "\t.reg.u%d %s;\n",
1057 POINTER_SIZE
, reg_names
[regno
]);
1058 fprintf (file
, (size
? "\tcvta.local.u%d %s, %s_ar;\n"
1059 : "\tmov.u%d %s, 0;\n"),
1060 POINTER_SIZE
, reg_names
[regno
], reg_names
[regno
]);
1063 /* Emit soft stack frame setup sequence. */
1066 init_softstack_frame (FILE *file
, unsigned alignment
, HOST_WIDE_INT size
)
1068 /* Maintain 64-bit stack alignment. */
1069 unsigned keep_align
= BIGGEST_ALIGNMENT
/ BITS_PER_UNIT
;
1070 size
= ROUND_UP (size
, keep_align
);
1071 int bits
= POINTER_SIZE
;
1072 const char *reg_stack
= reg_names
[STACK_POINTER_REGNUM
];
1073 const char *reg_frame
= reg_names
[FRAME_POINTER_REGNUM
];
1074 const char *reg_sspslot
= reg_names
[SOFTSTACK_SLOT_REGNUM
];
1075 const char *reg_sspprev
= reg_names
[SOFTSTACK_PREV_REGNUM
];
1076 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_stack
);
1077 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_frame
);
1078 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspslot
);
1079 fprintf (file
, "\t.reg.u%d %s;\n", bits
, reg_sspprev
);
1080 fprintf (file
, "\t{\n");
1081 fprintf (file
, "\t\t.reg.u32 %%fstmp0;\n");
1082 fprintf (file
, "\t\t.reg.u%d %%fstmp1;\n", bits
);
1083 fprintf (file
, "\t\t.reg.u%d %%fstmp2;\n", bits
);
1084 fprintf (file
, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1085 fprintf (file
, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1086 bits
== 64 ? ".wide" : ".lo", bits
/ 8);
1087 fprintf (file
, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits
);
1089 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1090 fprintf (file
, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits
, reg_sspslot
);
1092 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1093 fprintf (file
, "\t\tld.shared.u%d %s, [%s];\n",
1094 bits
, reg_sspprev
, reg_sspslot
);
1096 /* Initialize %frame = %sspprev - size. */
1097 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1098 bits
, reg_frame
, reg_sspprev
, size
);
1100 /* Apply alignment, if larger than 64. */
1101 if (alignment
> keep_align
)
1102 fprintf (file
, "\t\tand.b%d %s, %s, %d;\n",
1103 bits
, reg_frame
, reg_frame
, -alignment
);
1105 size
= crtl
->outgoing_args_size
;
1106 gcc_assert (size
% keep_align
== 0);
1108 /* Initialize %stack. */
1109 fprintf (file
, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC
";\n",
1110 bits
, reg_stack
, reg_frame
, size
);
1113 fprintf (file
, "\t\tst.shared.u%d [%s], %s;\n",
1114 bits
, reg_sspslot
, reg_stack
);
1115 fprintf (file
, "\t}\n");
1116 cfun
->machine
->has_softstack
= true;
1117 need_softstack_decl
= true;
1120 /* Emit code to initialize the REGNO predicate register to indicate
1121 whether we are not lane zero on the NAME axis. */
1124 nvptx_init_axis_predicate (FILE *file
, int regno
, const char *name
)
1126 fprintf (file
, "\t{\n");
1127 fprintf (file
, "\t\t.reg.u32\t%%%s;\n", name
);
1128 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1130 fprintf (file
, "\t\t.reg.u64\t%%t_red;\n");
1131 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1133 fprintf (file
, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name
, name
);
1134 fprintf (file
, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno
, name
);
1135 if (strcmp (name
, "x") == 0 && cfun
->machine
->red_partition
)
1137 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1138 fprintf (file
, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1139 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1140 "// vector reduction buffer\n",
1141 REGNO (cfun
->machine
->red_partition
),
1142 vector_red_partition
);
1144 /* Verify vector_red_size. */
1145 gcc_assert (vector_red_partition
* nvptx_mach_max_workers ()
1146 <= vector_red_size
);
1147 fprintf (file
, "\t}\n");
1150 /* Emit code to initialize OpenACC worker broadcast and synchronization
1154 nvptx_init_oacc_workers (FILE *file
)
1156 fprintf (file
, "\t{\n");
1157 fprintf (file
, "\t\t.reg.u32\t%%tidy;\n");
1158 if (cfun
->machine
->bcast_partition
)
1160 fprintf (file
, "\t\t.reg.u64\t%%t_bcast;\n");
1161 fprintf (file
, "\t\t.reg.u64\t%%y64;\n");
1163 fprintf (file
, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1164 if (cfun
->machine
->bcast_partition
)
1166 fprintf (file
, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1167 fprintf (file
, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1168 fprintf (file
, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1169 fprintf (file
, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1170 "// vector broadcast offset\n",
1171 REGNO (cfun
->machine
->bcast_partition
),
1172 oacc_bcast_partition
);
1174 /* Verify oacc_bcast_size. */
1175 gcc_assert (oacc_bcast_partition
* (nvptx_mach_max_workers () + 1)
1176 <= oacc_bcast_size
);
1177 if (cfun
->machine
->sync_bar
)
1178 fprintf (file
, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1179 "// vector synchronization barrier\n",
1180 REGNO (cfun
->machine
->sync_bar
));
1181 fprintf (file
, "\t}\n");
/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  FILE is the assembly output
   stream.  Allocates a pseudo to hold this thread's slot address in the
   shared __nvptx_uni array, and, when a predicate register was requested,
   emits PTX computing the warp-master lane and the "am I the master"
   predicate.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  /* Pseudo holding the address of __nvptx_uni[tid.y].  */
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  /* Byte offset of this warp's 32-bit slot: tid.y * 4.  Use mul.wide when
     the pointer is 64-bit so the product is widened in one step.  */
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  /* Ask the file epilogue to emit the declaration of __nvptx_uni.  */
  need_unisimt_decl = true;
}
1217 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1219 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1220 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1222 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1223 __nvptx_uni[tid.y] = 0;
1224 gomp_nvptx_main (ORIG, arg);
1226 ORIG itself should not be emitted as a PTX .entry function. */
1229 write_omp_entry (FILE *file
, const char *name
, const char *orig
)
1231 static bool gomp_nvptx_main_declared
;
1232 if (!gomp_nvptx_main_declared
)
1234 gomp_nvptx_main_declared
= true;
1235 write_fn_marker (func_decls
, false, true, "gomp_nvptx_main");
1236 func_decls
<< ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1237 << " %in_ar1, .param.u" << POINTER_SIZE
<< " %in_ar2);\n";
1239 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1240 #define NTID_Y "%ntid.y"
1241 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1242 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1245 .reg.u" PS " %R<4>;\n\
1246 mov.u32 %r0, %tid.y;\n\
1247 mov.u32 %r1, " NTID_Y ";\n\
1248 mov.u32 %r2, %ctaid.x;\n\
1249 cvt.u" PS ".u32 %R1, %r0;\n\
1250 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1251 mov.u" PS " %R0, __nvptx_stacks;\n\
1252 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1253 ld.param.u" PS " %R2, [%stack];\n\
1254 ld.param.u" PS " %R3, [%sz];\n\
1255 add.u" PS " %R2, %R2, %R3;\n\
1256 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1257 st.shared.u" PS " [%R0], %R2;\n\
1258 mov.u" PS " %R0, __nvptx_uni;\n\
1259 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1261 st.shared.u32 [%R0], %r0;\n\
1262 mov.u" PS " %R0, \0;\n\
1263 ld.param.u" PS " %R1, [%arg];\n\
1265 .param.u" PS " %P<2>;\n\
1266 st.param.u" PS " [%P0], %R0;\n\
1267 st.param.u" PS " [%P1], %R1;\n\
1268 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1272 static const char entry64
[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1273 static const char entry32
[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1274 #undef ENTRY_TEMPLATE
1276 const char *entry_1
= TARGET_ABI64
? entry64
: entry32
;
1277 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1278 const char *entry_2
= entry_1
+ strlen (entry64
) + 1;
1279 fprintf (file
, ".visible .entry %s%s%s%s", name
, entry_1
, orig
, entry_2
);
1280 need_softstack_decl
= need_unisimt_decl
= true;
/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

static void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  /* An OpenMP target region entry point gets a wrapper kernel; the real
     body is emitted under "NAME$impl" and called from the wrapper.  */
  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    /* Aggregate returns arrive via a hidden pointer argument.  */
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      /* Unprototyped function: fall back to the DECL's argument list.  */
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      /* Non-leaf or alloca-calling functions need the full per-lane
	 soft-stack; mark the size "unknown" so the default applies.  */
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving non-uniform execution region.  Writes the new
   lane mask into this warp's __nvptx_uni slot and recomputes the master
   lane / predicate registers when the function uses them.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  /* A leaf without the predicate never reads the state; nothing to do.  */
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  /* -1 = all lanes active mask, 0 = uniform execution.  */
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      /* Entering: every lane is its own master.  Leaving: lane 0 is.  */
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}
/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  /* Leaf functions with no SIMT stack never switch; nothing to emit.  */
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      /* Point PTR at the top (end) of this lane's slice of the local
	 simtstack array and subtract the allocation size.  */
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      /* Re-align downwards when the size was not statically rounded or the
	 requested alignment exceeds the DImode default.  */
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  /* Save the current soft-stack pointer just below PTR, then make
	     the word below that the new stack pointer.  */
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  /* Restore the saved soft-stack pointer.  */
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}
/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  Switches the
   uniform-simt state first, then the soft-stack pointer.  */

void
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
}
/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.
   SIZE/ALIGN are unused on exit, hence NULL_RTX.  */

void
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
}
/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  A no-op for functions that do not
   use the soft stack, and for leaves (which never publish their pointer).  */

void
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
}
/* Output a return instruction.  Also copy the return value to its outgoing
   location (the %value_out parameter slot) when the function returns a
   value.  Returns the assembler template for the ret instruction itself.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    /* Store the return value into the caller-visible out-parameter.  */
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
/* Terminate a function by writing a closing brace to FILE.  */

void
nvptx_function_end (FILE *file)
{
  fputs ("}\n", file);
}
/* Decide whether we can make a sibling call to a function.  For ptx, we
   can't.  */

static bool
nvptx_function_ok_for_sibcall (tree, tree)
{
  return false;
}
/* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any,
   except under the soft-stack model when dynamic realignment is needed,
   where the argument pointer serves.  */

static rtx
nvptx_get_drap_rtx (void)
{
  if (TARGET_SOFT_STACK && stack_realign_drap)
    return arg_pointer_rtx;
  return NULL_RTX;
}
/* Implement the TARGET_CALL_ARGS hook.  Record information about one
   argument to the next call.  ARG is the argument rtx (pc_rtx marks the
   end-of-args sentinel); FNTYPE is the callee's type, if known.  */

static void
nvptx_call_args (rtx arg, tree fntype)
{
  /* First argument of a new call: reset per-call state.  */
  if (!cfun->machine->doing_call)
    {
      cfun->machine->doing_call = true;
      cfun->machine->is_varadic = false;
      cfun->machine->num_args = 0;

      if (fntype && stdarg_p (fntype))
	{
	  cfun->machine->is_varadic = true;
	  cfun->machine->has_varadic = true;
	  /* Account for the extra varargs pointer argument.  */
	  cfun->machine->num_args++;
	}
    }

  if (REG_P (arg) && arg != pc_rtx)
    {
      cfun->machine->num_args++;
      /* Accumulate the argument registers (in reverse order) for
	 nvptx_expand_call to consume.  */
      cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
						  cfun->machine->call_args);
    }
}
/* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
   information we recorded.  */

static void
nvptx_end_call_args (void)
{
  cfun->machine->doing_call = false;
  free_EXPR_LIST_list (&cfun->machine->call_args);
}
1620 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1621 track of whether calls involving static chains or varargs were seen
1622 in the current function.
1623 For libcalls, maintain a hash table of decls we have seen, and
1624 record a function decl for later when encountering a new one. */
1627 nvptx_expand_call (rtx retval
, rtx address
)
1629 rtx callee
= XEXP (address
, 0);
1630 rtx varargs
= NULL_RTX
;
1631 unsigned parallel
= 0;
1633 if (!call_insn_operand (callee
, Pmode
))
1635 callee
= force_reg (Pmode
, callee
);
1636 address
= change_address (address
, QImode
, callee
);
1639 if (GET_CODE (callee
) == SYMBOL_REF
)
1641 tree decl
= SYMBOL_REF_DECL (callee
);
1642 if (decl
!= NULL_TREE
)
1644 if (DECL_STATIC_CHAIN (decl
))
1645 cfun
->machine
->has_chain
= true;
1647 tree attr
= oacc_get_fn_attrib (decl
);
1650 tree dims
= TREE_VALUE (attr
);
1652 parallel
= GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1;
1653 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
1655 if (TREE_PURPOSE (dims
)
1656 && !integer_zerop (TREE_PURPOSE (dims
)))
1658 /* Not on this axis. */
1659 parallel
^= GOMP_DIM_MASK (ix
);
1660 dims
= TREE_CHAIN (dims
);
1666 unsigned nargs
= cfun
->machine
->num_args
;
1667 if (cfun
->machine
->is_varadic
)
1669 varargs
= gen_reg_rtx (Pmode
);
1670 emit_move_insn (varargs
, stack_pointer_rtx
);
1673 rtvec vec
= rtvec_alloc (nargs
+ 1);
1674 rtx pat
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1677 rtx call
= gen_rtx_CALL (VOIDmode
, address
, const0_rtx
);
1678 rtx tmp_retval
= retval
;
1681 if (!nvptx_register_operand (retval
, GET_MODE (retval
)))
1682 tmp_retval
= gen_reg_rtx (GET_MODE (retval
));
1683 call
= gen_rtx_SET (tmp_retval
, call
);
1685 XVECEXP (pat
, 0, vec_pos
++) = call
;
1687 /* Construct the call insn, including a USE for each argument pseudo
1688 register. These will be used when printing the insn. */
1689 for (rtx arg
= cfun
->machine
->call_args
; arg
; arg
= XEXP (arg
, 1))
1690 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, XEXP (arg
, 0));
1693 XVECEXP (pat
, 0, vec_pos
++) = gen_rtx_USE (VOIDmode
, varargs
);
1695 gcc_assert (vec_pos
= XVECLEN (pat
, 0));
1697 nvptx_emit_forking (parallel
, true);
1698 emit_call_insn (pat
);
1699 nvptx_emit_joining (parallel
, true);
1701 if (tmp_retval
!= retval
)
1702 emit_move_insn (retval
, tmp_retval
);
/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  PTX conditional branches test a BImode predicate, so evaluate the
   comparison into a fresh predicate register and return 'pred != 0'.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}
/* Expand the oacc fork & join primitive into ptx-required unspecs.
   MODE is the GOMP_DIM_* axis being forked.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}
/* Counterpart of nvptx_expand_oacc_fork: emit the joining unspec for
   the GOMP_DIM_* axis MODE.  */

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}
/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  SRC may be DImode or DFmode; DST0 receives the low half,
   DST1 the high half.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  DST may be DImode or DFmode; SRC0 supplies the low half,
   SRC1 the high half.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  DST receives SRC shuffled by
   lane index IDX according to KIND.  Modes wider or narrower than the
   native 32-bit shuffle are handled by splitting or widening.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_SImode:
      res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
      break;
    case E_SFmode:
      res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
      break;
    case E_DImode:
    case E_DFmode:
      {
	/* 64-bit value: shuffle the two 32-bit halves separately.  */
	rtx tmp0 = gen_reg_rtx (SImode);
	rtx tmp1 = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_BImode:
      {
	/* Predicate: materialize as 0/1 in SImode, shuffle, compare.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;
    case E_QImode:
    case E_HImode:
      {
	/* Sub-word: zero-extend to SImode, shuffle, truncate back.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
						    tmp)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      gcc_unreachable ();
    }
  return res;
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp, by shuffling from lane 0.  */

static rtx
nvptx_gen_warp_bcast (rtx reg)
{
  return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
/* Structure used when generating a worker-level spill or fill.  */

struct broadcast_data_t
{
  rtx base;  /* Register holding base addr of buffer.  */
  rtx ptr;  /* Iteration var, if needed.  */
  unsigned offset; /* Offset into worker buffer.  */
};

/* Direction of the spill/fill and looping setup/teardown indicator.  */

enum propagate_mask
  {
    PM_read = 1 << 0,		/* Spill: read register into buffer.  */
    PM_write = 1 << 1,		/* Fill: write buffer into register.  */
    PM_loop_begin = 1 << 2,	/* Emit loop setup.  */
    PM_loop_end = 1 << 3,	/* Emit loop teardown.  */

    PM_read_write = PM_read | PM_write
  };
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   DATA tracks the buffer base/iterator/offset; VECTOR selects the
   vector-level (rather than worker-level) buffer partition.  */

static rtx
nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
			broadcast_data_t *data, bool vector)
{
  rtx res;
  machine_mode mode = GET_MODE (reg);

  switch (mode)
    {
    case E_BImode:
      {
	/* Predicates go through an SImode 0/1 temporary.  */
	rtx tmp = gen_reg_rtx (SImode);

	start_sequence ();
	if (pm & PM_read)
	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
	if (pm & PM_write)
	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
	res = get_insns ();
	end_sequence ();
      }
      break;

    default:
      {
	rtx addr = data->ptr;

	if (!addr)
	  {
	    /* No iterator: address the buffer directly at the running
	       offset, suitably aligned for MODE.  */
	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;

	    oacc_bcast_align = MAX (oacc_bcast_align, align);
	    data->offset = ROUND_UP (data->offset, align);
	    addr = data->base;
	    gcc_assert (data->base != NULL);
	    if (data->offset)
	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
	  }

	addr = gen_rtx_MEM (mode, addr);
	if (pm == PM_read)
	  res = gen_rtx_SET (addr, reg);	/* Spill to buffer.  */
	else if (pm == PM_write)
	  res = gen_rtx_SET (reg, addr);	/* Fill from buffer.  */
	else
	  gcc_unreachable ();

	if (data->ptr)
	  {
	    /* We're using a ptr, increment it.  */
	    start_sequence ();

	    emit_insn (res);
	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
	    res = get_insns ();
	    end_sequence ();
	  }
	else
	  rep = 1;
	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
      }
      break;
    }
  return res;
}
/* Returns true if X is a valid address for use in a memory reference.
   PTX accepts a register, register + constant offset, or a symbolic
   address.  */

static bool
nvptx_legitimate_address_p (machine_mode, rtx x, bool)
{
  enum rtx_code code = GET_CODE (x);

  switch (code)
    {
    case REG:
      return true;

    case PLUS:
      if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
	return true;
      return false;

    case CONST:
    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      return false;
    }
}
/* Machinery to output constant initializers.  When beginning an
   initializer, we decide on a fragment size (which is visible in ptx
   in the type used), and then all initializer data is buffered until
   a fragment is filled and ready to be written out.  */

static struct
{
  unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
  unsigned HOST_WIDE_INT val; /* Current fragment value.  */
  unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
				       out.  */
  unsigned size;  /* Fragment size to accumulate.  */
  unsigned offset;  /* Offset within current fragment.  */
  bool started;   /* Whether we've output any initializer.  */
} init_frag;
/* The current fragment is full, write it out.  SYM may provide a
   symbolic reference we should output, in which case the fragment
   value is the addend.  */

static void
output_init_frag (rtx sym)
{
  /* First fragment opens the initializer list; later ones are
     comma-separated.  */
  fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
  unsigned HOST_WIDE_INT val = init_frag.val;

  init_frag.started = true;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.remaining--;

  if (sym)
    {
      /* Function symbols are emitted bare; data symbols are wrapped in
	 generic() to convert to the generic address space.  */
      bool function = (SYMBOL_REF_DECL (sym)
		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
      if (!function)
	fprintf (asm_out_file, "generic(");
      output_address (VOIDmode, sym);
      if (!function)
	fprintf (asm_out_file, ")");
      if (val)
	fprintf (asm_out_file, " + ");
    }

  if (!sym || val)
    fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
}
/* Add value VAL of size SIZE to the data we're emitting, and keep
   writing out chunks as they fill up.  */

static void
nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
{
  /* Mask VAL down to SIZE bytes; the '2 << (n-1)' form avoids an
     undefined full-width shift when SIZE covers the whole word.  */
  val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;

  for (unsigned part = 0; size; size -= part)
    {
      /* Drop the bytes consumed by the previous iteration.  */
      val >>= part * BITS_PER_UNIT;
      part = init_frag.size - init_frag.offset;
      part = MIN (part, size);

      /* Merge this piece into the pending fragment at its offset.  */
      unsigned HOST_WIDE_INT partial
	= val << (init_frag.offset * BITS_PER_UNIT);
      init_frag.val |= partial & init_frag.mask;
      init_frag.offset += part;

      if (init_frag.offset == init_frag.size)
	output_init_frag (NULL);
    }
}
/* Target hook for assembling integer object X of size SIZE.  Returns
   true when the object was handled here, false to defer to the generic
   machinery.  */

static bool
nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
{
  HOST_WIDE_INT val = 0;

  switch (GET_CODE (x))
    {
    default:
      /* Let the generic machinery figure it out, usually for a
	 CONST_WIDE_INT.  */
      return false;

    case CONST_INT:
      nvptx_assemble_value (INTVAL (x), size);
      break;

    case CONST:
      /* Symbol plus offset: peel off the addend, then treat as a
	 SYMBOL_REF with VAL as the addend.  */
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == PLUS);
      val = INTVAL (XEXP (x, 1));
      x = XEXP (x, 0);
      gcc_assert (GET_CODE (x) == SYMBOL_REF);
      /* FALLTHROUGH */

    case SYMBOL_REF:
      /* Pointers must exactly fill a fragment, aligned on a fragment
	 boundary.  */
      gcc_assert (size == init_frag.size);
      if (init_frag.offset)
	sorry ("cannot emit unaligned pointers in ptx assembly");

      nvptx_maybe_record_fnsym (x);
      init_frag.val = val;
      output_init_frag (x);
      break;
    }

  return true;
}
/* Output SIZE zero bytes.  We ignore the FILE argument since the
   functions we're calling to perform the output just use
   asm_out_file.  */

static void
nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
{
  /* Finish the current fragment, if it's started.  */
  if (init_frag.offset)
    {
      unsigned part = init_frag.size - init_frag.offset;
      part = MIN (part, (unsigned)size);
      size -= part;
      nvptx_assemble_value (0, part);
    }

  /* If this skip doesn't terminate the initializer, write as many
     remaining pieces as possible directly.  (A terminating skip can be
     left implicit: the remaining fragments default to zero.)  */
  if (size < init_frag.remaining * init_frag.size)
    {
      while (size >= init_frag.size)
	{
	  size -= init_frag.size;
	  output_init_frag (NULL_RTX);
	}
      if (size)
	nvptx_assemble_value (0, size);
    }
}
/* Output a string STR with length SIZE.  As in nvptx_output_skip we
   ignore the FILE arg.  Each byte is fed through the fragment
   accumulator individually.  */

static void
nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
{
  for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
    nvptx_assemble_value (str[i], 1);
}
/* Return true if TYPE is a record type where the last field is an array
   without given dimension (a C99 flexible array member, or the GNU
   zero-length / unspecified-bound variants).  */

static bool
flexible_array_member_type_p (const_tree type)
{
  if (TREE_CODE (type) != RECORD_TYPE)
    return false;

  /* Walk to the last field of the record.  */
  const_tree last_field = NULL_TREE;
  for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
    last_field = f;

  if (!last_field)
    return false;

  const_tree last_field_type = TREE_TYPE (last_field);
  if (TREE_CODE (last_field_type) != ARRAY_TYPE)
    return false;

  /* Flexible when the array has no domain, or a domain without an
     upper bound.  */
  return (! TYPE_DOMAIN (last_field_type)
	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
}
/* Emit a PTX variable decl and prepare for emission of its
   initializer.  NAME is the symbol name and SECTION the PTX data
   area.  The type is TYPE, object size SIZE and alignment is ALIGN.
   The caller has already emitted any indentation and linkage
   specifier.  It is responsible for any initializer, terminating ;
   and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
   this is the opposite way round that PTX wants them!  */

static void
nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
			   const_tree type, HOST_WIDE_INT size, unsigned align,
			   bool undefined = false)
{
  bool atype = (TREE_CODE (type) == ARRAY_TYPE)
    && (TYPE_DOMAIN (type) == NULL_TREE);

  if (undefined && flexible_array_member_type_p (type))
    {
      /* An extern decl of a struct with a flexible member has unknown
	 size; emit it as an unbounded array.  */
      size = 0;
      atype = true;
    }

  while (TREE_CODE (type) == ARRAY_TYPE)
    type = TREE_TYPE (type);

  if (TREE_CODE (type) == VECTOR_TYPE
      || TREE_CODE (type) == COMPLEX_TYPE)
    /* Neither vector nor complex types can contain the other.  */
    type = TREE_TYPE (type);

  unsigned elt_size = int_size_in_bytes (type);

  /* Largest mode we're prepared to accept.  For BLKmode types we
     don't know if it'll contain pointer constants, so have to choose
     pointer size, otherwise we can choose DImode.  */
  machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;

  /* Clamp the fragment size to at most the element size and at most
     ELT_MODE's size, as a power of two.  */
  elt_size |= GET_MODE_SIZE (elt_mode);
  elt_size &= -elt_size; /* Extract LSB set.  */

  init_frag.size = elt_size;
  /* Avoid undefined shift behavior by using '2'.  */
  init_frag.mask = ((unsigned HOST_WIDE_INT)2
		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
  init_frag.val = 0;
  init_frag.offset = 0;
  init_frag.started = false;
  /* Size might not be a multiple of elt size, if there's an
     initialized trailing struct array with smaller type than
     elt_size.  */
  init_frag.remaining = (size + elt_size - 1) / elt_size;

  fprintf (file, "%s .align %d .u%d ",
	   section, align / BITS_PER_UNIT,
	   elt_size * BITS_PER_UNIT);
  assemble_name (file, name);

  if (size)
    /* We make everything an array, to simplify any initialization
       emission.  */
    fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
  else if (atype)
    fprintf (file, "[]");
}
/* Called when the initializer for a decl has been completely output through
   combinations of the three functions above.  Flushes any partial fragment
   and closes the initializer list (or just terminates the decl when no
   initializer was started).  */

static void
nvptx_assemble_decl_end (void)
{
  if (init_frag.offset)
    /* This can happen with a packed struct with trailing array member.  */
    nvptx_assemble_value (0, init_frag.size - init_frag.offset);
  fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
}
/* Output an uninitialized common or file-scope variable.  NAME/DECL
   identify it, SIZE is in bytes and ALIGN in bits.  */

void
nvptx_output_aligned_decl (FILE *file, const char *name,
			   const_tree decl, HOST_WIDE_INT size, unsigned align)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* If this is public, it is common.  The nearest thing we have to
     common is weak.  */
  fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");

  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size, align);
  nvptx_assemble_decl_end ();
}
/* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
   writing a constant variable EXP with NAME and SIZE and its
   initializer to FILE.  The initializer itself is emitted later via the
   assemble_integer/skip/ascii hooks.  */

static void
nvptx_asm_declare_constant_name (FILE *file, const char *name,
				 const_tree exp, HOST_WIDE_INT obj_size)
{
  write_var_marker (file, true, false, name);

  fprintf (file, "\t");

  tree type = TREE_TYPE (exp);
  /* Constants live in the .const data area.  */
  nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
			     TYPE_ALIGN (type));
}
/* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
   a variable DECL with NAME to FILE.  */

static void
nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
{
  write_var_marker (file, true, TREE_PUBLIC (decl), name);

  /* Linkage: static decls get no specifier; public ones are .weak or
     .visible depending on DECL_WEAK.  */
  fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
			  : DECL_WEAK (decl) ? ".weak " : ".visible "));

  tree type = TREE_TYPE (decl);
  HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     type, obj_size, DECL_ALIGN (decl));
}
/* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  Visibility is
   expressed on the declaration itself in PTX.  */

static void
nvptx_globalize_label (FILE *, const char *)
{
}
/* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
   declaration only for variable DECL with NAME to FILE.  */

static void
nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
{
  /* The middle end can place constant pool decls into the varpool as
     undefined.  Until that is fixed, catch the problem here.  */
  if (DECL_IN_CONSTANT_POOL (decl))
    return;

  /* We support weak definitions, and hence have the right
     ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
  if (DECL_WEAK (decl))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "PTX does not support weak declarations"
	      " (only weak definitions)");
  write_var_marker (file, false, TREE_PUBLIC (decl), name);

  fprintf (file, "\t.extern ");
  tree size = DECL_SIZE_UNIT (decl);
  nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
			     DECL_ALIGN (decl), true);
  nvptx_assemble_decl_end ();
}
/* Output a pattern for a move instruction.  Returns the assembler
   template for moving SRC into DST, choosing between mov, cvta
   (address-space conversion), bit-cast mov.b, and cvt depending on the
   operand modes.  */

const char *
nvptx_output_mov_insn (rtx dst, rtx src)
{
  machine_mode dst_mode = GET_MODE (dst);
  machine_mode dst_inner = (GET_CODE (dst) == SUBREG
			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
  /* For a non-SUBREG source of a move its mode matches the destination,
     so default SRC_INNER to DST_MODE.  */
  machine_mode src_inner = (GET_CODE (src) == SUBREG
			    ? GET_MODE (XEXP (src, 0)) : dst_mode);

  rtx sym = src;
  if (GET_CODE (sym) == CONST)
    sym = XEXP (XEXP (sym, 0), 0);
  if (SYMBOL_REF_P (sym))
    {
      /* Symbols in a specific data area need converting to the generic
	 address space.  */
      if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
	return "%.\tcvta%D1%t0\t%0, %1;";
      nvptx_maybe_record_fnsym (sym);
    }

  if (src_inner == dst_inner)
    return "%.\tmov%t0\t%0, %1;";

  if (CONSTANT_P (src))
    return (GET_MODE_CLASS (dst_inner) == MODE_INT
	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");

  if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
    {
      if (GET_MODE_BITSIZE (dst_mode) == 128
	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
	{
	  /* mov.b128 is not supported.  */
	  if (dst_inner == V2DImode && src_inner == TImode)
	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
	  else if (dst_inner == TImode && src_inner == V2DImode)
	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";

	  gcc_unreachable ();
	}
      /* Same size, different mode: bit-cast move.  */
      return "%.\tmov.b%T0\t%0, %1;";
    }

  /* Different sizes: a genuine conversion.  */
  return "%.\tcvt%t0%t1\t%0, %1;";
}
2353 static void nvptx_print_operand (FILE *, rtx
, int);
2355 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2356 involves writing .param declarations and in/out copies into them. For
2357 indirect calls, also write the .callprototype. */
2360 nvptx_output_call_insn (rtx_insn
*insn
, rtx result
, rtx callee
)
2364 bool needs_tgt
= register_operand (callee
, Pmode
);
2365 rtx pat
= PATTERN (insn
);
2366 if (GET_CODE (pat
) == COND_EXEC
)
2367 pat
= COND_EXEC_CODE (pat
);
2368 int arg_end
= XVECLEN (pat
, 0);
2369 tree decl
= NULL_TREE
;
2371 fprintf (asm_out_file
, "\t{\n");
2373 fprintf (asm_out_file
, "\t\t.param%s %s_in;\n",
2374 nvptx_ptx_type_from_mode (GET_MODE (result
), false),
2375 reg_names
[NVPTX_RETURN_REGNUM
]);
2377 /* Ensure we have a ptx declaration in the output if necessary. */
2378 if (GET_CODE (callee
) == SYMBOL_REF
)
2380 decl
= SYMBOL_REF_DECL (callee
);
2382 || (DECL_EXTERNAL (decl
) && !TYPE_ARG_TYPES (TREE_TYPE (decl
))))
2383 nvptx_record_libfunc (callee
, result
, pat
);
2384 else if (DECL_EXTERNAL (decl
))
2385 nvptx_record_fndecl (decl
);
2390 ASM_GENERATE_INTERNAL_LABEL (buf
, "LCT", labelno
);
2392 ASM_OUTPUT_LABEL (asm_out_file
, buf
);
2393 std::stringstream s
;
2394 write_fn_proto_from_insn (s
, NULL
, result
, pat
);
2395 fputs (s
.str().c_str(), asm_out_file
);
2398 for (int argno
= 1; argno
< arg_end
; argno
++)
2400 rtx t
= XEXP (XVECEXP (pat
, 0, argno
), 0);
2401 machine_mode mode
= GET_MODE (t
);
2402 const char *ptx_type
= nvptx_ptx_type_from_mode (mode
, false);
2404 /* Mode splitting has already been done. */
2405 fprintf (asm_out_file
, "\t\t.param%s %%out_arg%d;\n"
2406 "\t\tst.param%s [%%out_arg%d], ",
2407 ptx_type
, argno
, ptx_type
, argno
);
2408 output_reg (asm_out_file
, REGNO (t
), VOIDmode
);
2409 fprintf (asm_out_file
, ";\n");
2412 /* The '.' stands for the call's predicate, if any. */
2413 nvptx_print_operand (asm_out_file
, NULL_RTX
, '.');
2414 fprintf (asm_out_file
, "\t\tcall ");
2415 if (result
!= NULL_RTX
)
2416 fprintf (asm_out_file
, "(%s_in), ", reg_names
[NVPTX_RETURN_REGNUM
]);
2420 const char *name
= get_fnname_from_decl (decl
);
2421 name
= nvptx_name_replacement (name
);
2422 assemble_name (asm_out_file
, name
);
2425 output_address (VOIDmode
, callee
);
2427 const char *open
= "(";
2428 for (int argno
= 1; argno
< arg_end
; argno
++)
2430 fprintf (asm_out_file
, ", %s%%out_arg%d", open
, argno
);
2433 if (decl
&& DECL_STATIC_CHAIN (decl
))
2435 fprintf (asm_out_file
, ", %s%s", open
, reg_names
[STATIC_CHAIN_REGNUM
]);
2439 fprintf (asm_out_file
, ")");
2443 fprintf (asm_out_file
, ", ");
2444 assemble_name (asm_out_file
, buf
);
2446 fprintf (asm_out_file
, ";\n");
2448 if (find_reg_note (insn
, REG_NORETURN
, NULL
))
2450 /* No return functions confuse the PTX JIT, as it doesn't realize
2451 the flow control barrier they imply. It can seg fault if it
2452 encounters what looks like an unexitable loop. Emit a trailing
2453 trap and exit, which it does grok. */
2454 fprintf (asm_out_file
, "\t\ttrap; // (noreturn)\n");
2455 fprintf (asm_out_file
, "\t\texit; // (noreturn)\n");
2460 static char rval
[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2463 /* We must escape the '%' that starts RETURN_REGNUM. */
2464 sprintf (rval
, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2465 reg_names
[NVPTX_RETURN_REGNUM
]);
/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  The two punctuation
   codes understood by nvptx_print_operand are '.' (the insn predicate)
   and '#' (a rounding mode).  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  switch (c)
    {
    case '.':
    case '#':
      return true;
    default:
      return false;
    }
}
2480 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2483 nvptx_print_address_operand (FILE *file
, rtx x
, machine_mode
)
2486 if (GET_CODE (x
) == CONST
)
2488 switch (GET_CODE (x
))
2492 output_address (VOIDmode
, XEXP (x
, 0));
2493 fprintf (file
, "+");
2494 output_address (VOIDmode
, off
);
2499 output_addr_const (file
, x
);
2503 gcc_assert (GET_CODE (x
) != MEM
);
2504 nvptx_print_operand (file
, x
, 0);
2509 /* Write assembly language output for the address ADDR to FILE. */
2512 nvptx_print_operand_address (FILE *file
, machine_mode mode
, rtx addr
)
2514 nvptx_print_address_operand (file
, addr
, mode
);
2517 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2520 . -- print the predicate for the instruction or an emptry string for an
2522 # -- print a rounding mode for the instruction
2524 A -- print a data area for a MEM
2525 c -- print an opcode suffix for a comparison operator, including a type code
2526 D -- print a data area for a MEM operand
2527 S -- print a shuffle kind specified by CONST_INT
2528 t -- print a type opcode suffix, promoting QImode to 32 bits
2529 T -- print a type size in bits
2530 u -- print a type opcode suffix without promotions. */
2533 nvptx_print_operand (FILE *file
, rtx x
, int code
)
2537 x
= current_insn_predicate
;
2541 if (GET_CODE (x
) == EQ
)
2543 output_reg (file
, REGNO (XEXP (x
, 0)), VOIDmode
);
2547 else if (code
== '#')
2549 fputs (".rn", file
);
2553 enum rtx_code x_code
= GET_CODE (x
);
2554 machine_mode mode
= GET_MODE (x
);
2563 if (GET_CODE (x
) == CONST
)
2565 if (GET_CODE (x
) == PLUS
)
2568 if (GET_CODE (x
) == SYMBOL_REF
)
2569 fputs (section_for_sym (x
), file
);
2574 if (x_code
== SUBREG
)
2576 machine_mode inner_mode
= GET_MODE (SUBREG_REG (x
));
2577 if (VECTOR_MODE_P (inner_mode
)
2578 && (GET_MODE_SIZE (mode
)
2579 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2580 mode
= GET_MODE_INNER (inner_mode
);
2581 else if (split_mode_p (inner_mode
))
2582 mode
= maybe_split_mode (inner_mode
);
2586 fprintf (file
, "%s", nvptx_ptx_type_from_mode (mode
, code
== 't'));
2592 rtx inner_x
= SUBREG_REG (x
);
2593 machine_mode inner_mode
= GET_MODE (inner_x
);
2594 machine_mode split
= maybe_split_mode (inner_mode
);
2596 output_reg (file
, REGNO (inner_x
), split
,
2598 ? GET_MODE_SIZE (inner_mode
) / 2
2605 nvptx_shuffle_kind kind
= (nvptx_shuffle_kind
) UINTVAL (x
);
2606 /* Same order as nvptx_shuffle_kind. */
2607 static const char *const kinds
[] =
2608 {".up", ".down", ".bfly", ".idx"};
2609 fputs (kinds
[kind
], file
);
2614 fprintf (file
, "%d", GET_MODE_BITSIZE (mode
));
2618 fprintf (file
, "@");
2622 fprintf (file
, "@!");
2626 mode
= GET_MODE (XEXP (x
, 0));
2630 fputs (".eq", file
);
2633 if (FLOAT_MODE_P (mode
))
2634 fputs (".neu", file
);
2636 fputs (".ne", file
);
2640 fputs (".le", file
);
2644 fputs (".ge", file
);
2648 fputs (".lt", file
);
2652 fputs (".gt", file
);
2655 fputs (".ne", file
);
2658 fputs (".equ", file
);
2661 fputs (".leu", file
);
2664 fputs (".geu", file
);
2667 fputs (".ltu", file
);
2670 fputs (".gtu", file
);
2673 fputs (".nan", file
);
2676 fputs (".num", file
);
2681 if (FLOAT_MODE_P (mode
)
2682 || x_code
== EQ
|| x_code
== NE
2683 || x_code
== GEU
|| x_code
== GTU
2684 || x_code
== LEU
|| x_code
== LTU
)
2685 fputs (nvptx_ptx_type_from_mode (mode
, true), file
);
2687 fprintf (file
, ".s%d", GET_MODE_BITSIZE (mode
));
2695 rtx inner_x
= SUBREG_REG (x
);
2696 machine_mode inner_mode
= GET_MODE (inner_x
);
2697 machine_mode split
= maybe_split_mode (inner_mode
);
2699 if (VECTOR_MODE_P (inner_mode
)
2700 && (GET_MODE_SIZE (mode
)
2701 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode
))))
2703 output_reg (file
, REGNO (inner_x
), VOIDmode
);
2704 fprintf (file
, ".%s", SUBREG_BYTE (x
) == 0 ? "x" : "y");
2706 else if (split_mode_p (inner_mode
)
2707 && (GET_MODE_SIZE (inner_mode
) == GET_MODE_SIZE (mode
)))
2708 output_reg (file
, REGNO (inner_x
), split
);
2710 output_reg (file
, REGNO (inner_x
), split
, SUBREG_BYTE (x
));
2715 output_reg (file
, REGNO (x
), maybe_split_mode (mode
));
2720 nvptx_print_address_operand (file
, XEXP (x
, 0), mode
);
2725 output_addr_const (file
, x
);
2731 /* We could use output_addr_const, but that can print things like
2732 "x-8", which breaks ptxas. Need to ensure it is output as
2734 nvptx_print_address_operand (file
, x
, VOIDmode
);
2739 real_to_target (vals
, CONST_DOUBLE_REAL_VALUE (x
), mode
);
2740 vals
[0] &= 0xffffffff;
2741 vals
[1] &= 0xffffffff;
2743 fprintf (file
, "0f%08lx", vals
[0]);
2745 fprintf (file
, "0d%08lx%08lx", vals
[1], vals
[0]);
2750 unsigned n
= CONST_VECTOR_NUNITS (x
);
2751 fprintf (file
, "{ ");
2752 for (unsigned i
= 0; i
< n
; ++i
)
2755 fprintf (file
, ", ");
2757 rtx elem
= CONST_VECTOR_ELT (x
, i
);
2758 output_addr_const (file
, elem
);
2760 fprintf (file
, " }");
2765 output_addr_const (file
, x
);
2770 /* Record replacement regs used to deal with subreg operands. */
2773 rtx replacement
[MAX_RECOG_OPERANDS
];
2779 /* Allocate or reuse a replacement in R and return the rtx. */
2782 get_replacement (struct reg_replace
*r
)
2784 if (r
->n_allocated
== r
->n_in_use
)
2785 r
->replacement
[r
->n_allocated
++] = gen_reg_rtx (r
->mode
);
2786 return r
->replacement
[r
->n_in_use
++];
2789 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2790 the presence of subregs would break the rules for most instructions.
2791 Replace them with a suitable new register of the right size, plus
2792 conversion copyin/copyout instructions. */
2795 nvptx_reorg_subreg (void)
2797 struct reg_replace qiregs
, hiregs
, siregs
, diregs
;
2798 rtx_insn
*insn
, *next
;
2800 qiregs
.n_allocated
= 0;
2801 hiregs
.n_allocated
= 0;
2802 siregs
.n_allocated
= 0;
2803 diregs
.n_allocated
= 0;
2804 qiregs
.mode
= QImode
;
2805 hiregs
.mode
= HImode
;
2806 siregs
.mode
= SImode
;
2807 diregs
.mode
= DImode
;
2809 for (insn
= get_insns (); insn
; insn
= next
)
2811 next
= NEXT_INSN (insn
);
2812 if (!NONDEBUG_INSN_P (insn
)
2813 || asm_noperands (PATTERN (insn
)) >= 0
2814 || GET_CODE (PATTERN (insn
)) == USE
2815 || GET_CODE (PATTERN (insn
)) == CLOBBER
)
2818 qiregs
.n_in_use
= 0;
2819 hiregs
.n_in_use
= 0;
2820 siregs
.n_in_use
= 0;
2821 diregs
.n_in_use
= 0;
2822 extract_insn (insn
);
2823 enum attr_subregs_ok s_ok
= get_attr_subregs_ok (insn
);
2825 for (int i
= 0; i
< recog_data
.n_operands
; i
++)
2827 rtx op
= recog_data
.operand
[i
];
2828 if (GET_CODE (op
) != SUBREG
)
2831 rtx inner
= SUBREG_REG (op
);
2833 machine_mode outer_mode
= GET_MODE (op
);
2834 machine_mode inner_mode
= GET_MODE (inner
);
2837 && (GET_MODE_PRECISION (inner_mode
)
2838 >= GET_MODE_PRECISION (outer_mode
)))
2840 gcc_assert (SCALAR_INT_MODE_P (outer_mode
));
2841 struct reg_replace
*r
= (outer_mode
== QImode
? &qiregs
2842 : outer_mode
== HImode
? &hiregs
2843 : outer_mode
== SImode
? &siregs
2845 rtx new_reg
= get_replacement (r
);
2847 if (recog_data
.operand_type
[i
] != OP_OUT
)
2850 if (GET_MODE_PRECISION (inner_mode
)
2851 < GET_MODE_PRECISION (outer_mode
))
2856 rtx pat
= gen_rtx_SET (new_reg
,
2857 gen_rtx_fmt_e (code
, outer_mode
, inner
));
2858 emit_insn_before (pat
, insn
);
2861 if (recog_data
.operand_type
[i
] != OP_IN
)
2864 if (GET_MODE_PRECISION (inner_mode
)
2865 < GET_MODE_PRECISION (outer_mode
))
2870 rtx pat
= gen_rtx_SET (inner
,
2871 gen_rtx_fmt_e (code
, inner_mode
, new_reg
));
2872 emit_insn_after (pat
, insn
);
2874 validate_change (insn
, recog_data
.operand_loc
[i
], new_reg
, false);
2879 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2883 nvptx_get_unisimt_master ()
2885 rtx
&master
= cfun
->machine
->unisimt_master
;
2886 return master
? master
: master
= gen_reg_rtx (SImode
);
2889 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2892 nvptx_get_unisimt_predicate ()
2894 rtx
&pred
= cfun
->machine
->unisimt_predicate
;
2895 return pred
? pred
: pred
= gen_reg_rtx (BImode
);
2898 /* Return true if given call insn references one of the functions provided by
2899 the CUDA runtime: malloc, free, vprintf. */
2902 nvptx_call_insn_is_syscall_p (rtx_insn
*insn
)
2904 rtx pat
= PATTERN (insn
);
2905 gcc_checking_assert (GET_CODE (pat
) == PARALLEL
);
2906 pat
= XVECEXP (pat
, 0, 0);
2907 if (GET_CODE (pat
) == SET
)
2908 pat
= SET_SRC (pat
);
2909 gcc_checking_assert (GET_CODE (pat
) == CALL
2910 && GET_CODE (XEXP (pat
, 0)) == MEM
);
2911 rtx addr
= XEXP (XEXP (pat
, 0), 0);
2912 if (GET_CODE (addr
) != SYMBOL_REF
)
2914 const char *name
= XSTR (addr
, 0);
2915 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the
2916 references with forced assembler name refer to PTX syscalls. For vprintf,
2917 accept both normal and forced-assembler-name references. */
2918 return (!strcmp (name
, "vprintf") || !strcmp (name
, "*vprintf")
2919 || !strcmp (name
, "*malloc")
2920 || !strcmp (name
, "*free"));
2923 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2924 propagate its value from lane MASTER to current lane. */
2927 nvptx_unisimt_handle_set (rtx set
, rtx_insn
*insn
, rtx master
)
2930 if (GET_CODE (set
) == SET
&& REG_P (reg
= SET_DEST (set
)))
2931 emit_insn_after (nvptx_gen_shuffle (reg
, reg
, master
, SHUFFLE_IDX
), insn
);
2934 /* Adjust code for uniform-simt code generation variant by making atomics and
2935 "syscalls" conditionally executed, and inserting shuffle-based propagation
2936 for registers being set. */
2939 nvptx_reorg_uniform_simt ()
2941 rtx_insn
*insn
, *next
;
2943 for (insn
= get_insns (); insn
; insn
= next
)
2945 next
= NEXT_INSN (insn
);
2946 if (!(CALL_P (insn
) && nvptx_call_insn_is_syscall_p (insn
))
2947 && !(NONJUMP_INSN_P (insn
)
2948 && GET_CODE (PATTERN (insn
)) == PARALLEL
2949 && get_attr_atomic (insn
)))
2951 rtx pat
= PATTERN (insn
);
2952 rtx master
= nvptx_get_unisimt_master ();
2953 for (int i
= 0; i
< XVECLEN (pat
, 0); i
++)
2954 nvptx_unisimt_handle_set (XVECEXP (pat
, 0, i
), insn
, master
);
2955 rtx pred
= nvptx_get_unisimt_predicate ();
2956 pred
= gen_rtx_NE (BImode
, pred
, const0_rtx
);
2957 pat
= gen_rtx_COND_EXEC (VOIDmode
, pred
, pat
);
2958 validate_change (insn
, &PATTERN (insn
), pat
, false);
2962 /* Offloading function attributes. */
2964 struct offload_attrs
2972 /* Define entries for cfun->machine->axis_dim. */
2974 #define MACH_VECTOR_LENGTH 0
2975 #define MACH_MAX_WORKERS 1
2977 static void populate_offload_attrs (offload_attrs
*oa
);
2980 init_axis_dim (void)
2985 populate_offload_attrs (&oa
);
2987 if (oa
.num_workers
== 0)
2988 max_workers
= PTX_CTA_SIZE
/ oa
.vector_length
;
2990 max_workers
= oa
.num_workers
;
2992 cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
] = oa
.vector_length
;
2993 cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
] = max_workers
;
2994 cfun
->machine
->axis_dim_init_p
= true;
2997 static int ATTRIBUTE_UNUSED
2998 nvptx_mach_max_workers ()
3000 if (!cfun
->machine
->axis_dim_init_p
)
3002 return cfun
->machine
->axis_dim
[MACH_MAX_WORKERS
];
3005 static int ATTRIBUTE_UNUSED
3006 nvptx_mach_vector_length ()
3008 if (!cfun
->machine
->axis_dim_init_p
)
3010 return cfun
->machine
->axis_dim
[MACH_VECTOR_LENGTH
];
3013 /* Loop structure of the function. The entire function is described as
3018 /* Parent parallel. */
3021 /* Next sibling parallel. */
3024 /* First child parallel. */
3027 /* Partitioning mask of the parallel. */
3030 /* Partitioning used within inner parallels. */
3031 unsigned inner_mask
;
3033 /* Location of parallel forked and join. The forked is the first
3034 block in the parallel and the join is the first block after of
3036 basic_block forked_block
;
3037 basic_block join_block
;
3039 rtx_insn
*forked_insn
;
3040 rtx_insn
*join_insn
;
3042 rtx_insn
*fork_insn
;
3043 rtx_insn
*joining_insn
;
3045 /* Basic blocks in this parallel, but not in child parallels. The
3046 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3048 auto_vec
<basic_block
> blocks
;
3051 parallel (parallel
*parent
, unsigned mode
);
3055 /* Constructor links the new parallel into it's parent's chain of
3058 parallel::parallel (parallel
*parent_
, unsigned mask_
)
3059 :parent (parent_
), next (0), inner (0), mask (mask_
), inner_mask (0)
3061 forked_block
= join_block
= 0;
3062 forked_insn
= join_insn
= 0;
3063 fork_insn
= joining_insn
= 0;
3067 next
= parent
->inner
;
3068 parent
->inner
= this;
3072 parallel::~parallel ()
3078 /* Map of basic blocks to insns */
3079 typedef hash_map
<basic_block
, rtx_insn
*> bb_insn_map_t
;
3081 /* A tuple of an insn of interest and the BB in which it resides. */
3082 typedef std::pair
<rtx_insn
*, basic_block
> insn_bb_t
;
3083 typedef auto_vec
<insn_bb_t
> insn_bb_vec_t
;
3085 /* Split basic blocks such that each forked and join unspecs are at
3086 the start of their basic blocks. Thus afterwards each block will
3087 have a single partitioning mode. We also do the same for return
3088 insns, as they are executed by every thread. Return the
3089 partitioning mode of the function as a whole. Populate MAP with
3090 head and tail blocks. We also clear the BB visited flag, which is
3091 used when finding partitions. */
3094 nvptx_split_blocks (bb_insn_map_t
*map
)
3096 insn_bb_vec_t worklist
;
3100 /* Locate all the reorg instructions of interest. */
3101 FOR_ALL_BB_FN (block
, cfun
)
3103 bool seen_insn
= false;
3105 /* Clear visited flag, for use by parallel locator */
3106 block
->flags
&= ~BB_VISITED
;
3108 FOR_BB_INSNS (block
, insn
)
3112 switch (recog_memoized (insn
))
3117 case CODE_FOR_nvptx_forked
:
3118 case CODE_FOR_nvptx_join
:
3121 case CODE_FOR_return
:
3122 /* We also need to split just before return insns, as
3123 that insn needs executing by all threads, but the
3124 block it is in probably does not. */
3129 /* We've found an instruction that must be at the start of
3130 a block, but isn't. Add it to the worklist. */
3131 worklist
.safe_push (insn_bb_t (insn
, block
));
3133 /* It was already the first instruction. Just add it to
3135 map
->get_or_insert (block
) = insn
;
3140 /* Split blocks on the worklist. */
3143 basic_block remap
= 0;
3144 for (ix
= 0; worklist
.iterate (ix
, &elt
); ix
++)
3146 if (remap
!= elt
->second
)
3148 block
= elt
->second
;
3152 /* Split block before insn. The insn is in the new block */
3153 edge e
= split_block (block
, PREV_INSN (elt
->first
));
3156 map
->get_or_insert (block
) = elt
->first
;
3160 /* Return true if MASK contains parallelism that requires shared
3161 memory to broadcast. */
3164 nvptx_needs_shared_bcast (unsigned mask
)
3166 bool worker
= mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
);
3167 bool large_vector
= (mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
3168 && nvptx_mach_vector_length () != PTX_WARP_SIZE
;
3170 return worker
|| large_vector
;
3173 /* BLOCK is a basic block containing a head or tail instruction.
3174 Locate the associated prehead or pretail instruction, which must be
3175 in the single predecessor block. */
3178 nvptx_discover_pre (basic_block block
, int expected
)
3180 gcc_assert (block
->preds
->length () == 1);
3181 basic_block pre_block
= (*block
->preds
)[0]->src
;
3184 for (pre_insn
= BB_END (pre_block
); !INSN_P (pre_insn
);
3185 pre_insn
= PREV_INSN (pre_insn
))
3186 gcc_assert (pre_insn
!= BB_HEAD (pre_block
));
3188 gcc_assert (recog_memoized (pre_insn
) == expected
);
3192 /* Dump this parallel and all its inner parallels. */
3195 nvptx_dump_pars (parallel
*par
, unsigned depth
)
3197 fprintf (dump_file
, "%u: mask %d head=%d, tail=%d\n",
3199 par
->forked_block
? par
->forked_block
->index
: -1,
3200 par
->join_block
? par
->join_block
->index
: -1);
3202 fprintf (dump_file
, " blocks:");
3205 for (unsigned ix
= 0; par
->blocks
.iterate (ix
, &block
); ix
++)
3206 fprintf (dump_file
, " %d", block
->index
);
3207 fprintf (dump_file
, "\n");
3209 nvptx_dump_pars (par
->inner
, depth
+ 1);
3212 nvptx_dump_pars (par
->next
, depth
);
3215 /* If BLOCK contains a fork/join marker, process it to create or
3216 terminate a loop structure. Add this block to the current loop,
3217 and then walk successor blocks. */
3220 nvptx_find_par (bb_insn_map_t
*map
, parallel
*par
, basic_block block
)
3222 if (block
->flags
& BB_VISITED
)
3224 block
->flags
|= BB_VISITED
;
3226 if (rtx_insn
**endp
= map
->get (block
))
3228 rtx_insn
*end
= *endp
;
3230 /* This is a block head or tail, or return instruction. */
3231 switch (recog_memoized (end
))
3233 case CODE_FOR_return
:
3234 /* Return instructions are in their own block, and we
3235 don't need to do anything more. */
3238 case CODE_FOR_nvptx_forked
:
3239 /* Loop head, create a new inner loop and add it into
3240 our parent's child list. */
3242 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3245 par
= new parallel (par
, mask
);
3246 par
->forked_block
= block
;
3247 par
->forked_insn
= end
;
3248 if (nvptx_needs_shared_bcast (mask
))
3250 = nvptx_discover_pre (block
, CODE_FOR_nvptx_fork
);
3254 case CODE_FOR_nvptx_join
:
3255 /* A loop tail. Finish the current loop and return to
3258 unsigned mask
= UINTVAL (XVECEXP (PATTERN (end
), 0, 0));
3260 gcc_assert (par
->mask
== mask
);
3261 gcc_assert (par
->join_block
== NULL
);
3262 par
->join_block
= block
;
3263 par
->join_insn
= end
;
3264 if (nvptx_needs_shared_bcast (mask
))
3266 = nvptx_discover_pre (block
, CODE_FOR_nvptx_joining
);
3277 /* Add this block onto the current loop's list of blocks. */
3278 par
->blocks
.safe_push (block
);
3280 /* This must be the entry block. Create a NULL parallel. */
3281 par
= new parallel (0, 0);
3283 /* Walk successor blocks. */
3287 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3288 nvptx_find_par (map
, par
, e
->dest
);
3293 /* DFS walk the CFG looking for fork & join markers. Construct
3294 loop structures as we go. MAP is a mapping of basic blocks
3295 to head & tail markers, discovered when splitting blocks. This
3296 speeds up the discovery. We rely on the BB visited flag having
3297 been cleared when splitting blocks. */
3300 nvptx_discover_pars (bb_insn_map_t
*map
)
3304 /* Mark exit blocks as visited. */
3305 block
= EXIT_BLOCK_PTR_FOR_FN (cfun
);
3306 block
->flags
|= BB_VISITED
;
3308 /* And entry block as not. */
3309 block
= ENTRY_BLOCK_PTR_FOR_FN (cfun
);
3310 block
->flags
&= ~BB_VISITED
;
3312 parallel
*par
= nvptx_find_par (map
, 0, block
);
3316 fprintf (dump_file
, "\nLoops\n");
3317 nvptx_dump_pars (par
, 0);
3318 fprintf (dump_file
, "\n");
3324 /* Analyse a group of BBs within a partitioned region and create N
3325 Single-Entry-Single-Exit regions. Some of those regions will be
3326 trivial ones consisting of a single BB. The blocks of a
3327 partitioned region might form a set of disjoint graphs -- because
3328 the region encloses a differently partitoned sub region.
3330 We use the linear time algorithm described in 'Finding Regions Fast:
3331 Single Entry Single Exit and control Regions in Linear Time'
3332 Johnson, Pearson & Pingali. That algorithm deals with complete
3333 CFGs, where a back edge is inserted from END to START, and thus the
3334 problem becomes one of finding equivalent loops.
3336 In this case we have a partial CFG. We complete it by redirecting
3337 any incoming edge to the graph to be from an arbitrary external BB,
3338 and similarly redirecting any outgoing edge to be to that BB.
3339 Thus we end up with a closed graph.
3341 The algorithm works by building a spanning tree of an undirected
3342 graph and keeping track of back edges from nodes further from the
3343 root in the tree to nodes nearer to the root in the tree. In the
3344 description below, the root is up and the tree grows downwards.
3346 We avoid having to deal with degenerate back-edges to the same
3347 block, by splitting each BB into 3 -- one for input edges, one for
3348 the node itself and one for the output edges. Such back edges are
3349 referred to as 'Brackets'. Cycle equivalent nodes will have the
3350 same set of brackets.
3352 Determining bracket equivalency is done by maintaining a list of
3353 brackets in such a manner that the list length and final bracket
3354 uniquely identify the set.
3356 We use coloring to mark all BBs with cycle equivalency with the
3357 same color. This is the output of the 'Finding Regions Fast'
3358 algorithm. Notice it doesn't actually find the set of nodes within
3359 a particular region, just unorderd sets of nodes that are the
3360 entries and exits of SESE regions.
3362 After determining cycle equivalency, we need to find the minimal
3363 set of SESE regions. Do this with a DFS coloring walk of the
3364 complete graph. We're either 'looking' or 'coloring'. When
3365 looking, and we're in the subgraph, we start coloring the color of
3366 the current node, and remember that node as the start of the
3367 current color's SESE region. Every time we go to a new node, we
3368 decrement the count of nodes with thet color. If it reaches zero,
3369 we remember that node as the end of the current color's SESE region
3370 and return to 'looking'. Otherwise we color the node the current
3373 This way we end up with coloring the inside of non-trivial SESE
3374 regions with the color of that region. */
3376 /* A pair of BBs. We use this to represent SESE regions. */
3377 typedef std::pair
<basic_block
, basic_block
> bb_pair_t
;
3378 typedef auto_vec
<bb_pair_t
> bb_pair_vec_t
;
3380 /* A node in the undirected CFG. The discriminator SECOND indicates just
3381 above or just below the BB idicated by FIRST. */
3382 typedef std::pair
<basic_block
, int> pseudo_node_t
;
3384 /* A bracket indicates an edge towards the root of the spanning tree of the
3385 undirected graph. Each bracket has a color, determined
3386 from the currrent set of brackets. */
3389 pseudo_node_t back
; /* Back target */
3391 /* Current color and size of set. */
3395 bracket (pseudo_node_t back_
)
3396 : back (back_
), color (~0u), size (~0u)
3400 unsigned get_color (auto_vec
<unsigned> &color_counts
, unsigned length
)
3405 color
= color_counts
.length ();
3406 color_counts
.quick_push (0);
3408 color_counts
[color
]++;
3413 typedef auto_vec
<bracket
> bracket_vec_t
;
3415 /* Basic block info for finding SESE regions. */
3419 int node
; /* Node number in spanning tree. */
3420 int parent
; /* Parent node number. */
3422 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3423 edges arrive at pseudo-node Ai and the outgoing edges leave at
3424 pseudo-node Ao. We have to remember which way we arrived at a
3425 particular node when generating the spanning tree. dir > 0 means
3426 we arrived at Ai, dir < 0 means we arrived at Ao. */
3429 /* Lowest numbered pseudo-node reached via a backedge from thsis
3430 node, or any descendant. */
3433 int color
; /* Cycle-equivalence color */
3435 /* Stack of brackets for this node. */
3436 bracket_vec_t brackets
;
3438 bb_sese (unsigned node_
, unsigned p
, int dir_
)
3439 :node (node_
), parent (p
), dir (dir_
)
3444 /* Push a bracket ending at BACK. */
3445 void push (const pseudo_node_t
&back
)
3448 fprintf (dump_file
, "Pushing backedge %d:%+d\n",
3449 back
.first
? back
.first
->index
: 0, back
.second
);
3450 brackets
.safe_push (bracket (back
));
3453 void append (bb_sese
*child
);
3454 void remove (const pseudo_node_t
&);
3456 /* Set node's color. */
3457 void set_color (auto_vec
<unsigned> &color_counts
)
3459 color
= brackets
.last ().get_color (color_counts
, brackets
.length ());
3463 bb_sese::~bb_sese ()
3467 /* Destructively append CHILD's brackets. */
3470 bb_sese::append (bb_sese
*child
)
3472 if (int len
= child
->brackets
.length ())
3478 for (ix
= 0; ix
< len
; ix
++)
3480 const pseudo_node_t
&pseudo
= child
->brackets
[ix
].back
;
3481 fprintf (dump_file
, "Appending (%d)'s backedge %d:%+d\n",
3482 child
->node
, pseudo
.first
? pseudo
.first
->index
: 0,
3486 if (!brackets
.length ())
3487 std::swap (brackets
, child
->brackets
);
3490 brackets
.reserve (len
);
3491 for (ix
= 0; ix
< len
; ix
++)
3492 brackets
.quick_push (child
->brackets
[ix
]);
3497 /* Remove brackets that terminate at PSEUDO. */
3500 bb_sese::remove (const pseudo_node_t
&pseudo
)
3502 unsigned removed
= 0;
3503 int len
= brackets
.length ();
3505 for (int ix
= 0; ix
< len
; ix
++)
3507 if (brackets
[ix
].back
== pseudo
)
3510 fprintf (dump_file
, "Removing backedge %d:%+d\n",
3511 pseudo
.first
? pseudo
.first
->index
: 0, pseudo
.second
);
3515 brackets
[ix
-removed
] = brackets
[ix
];
3521 /* Accessors for BB's aux pointer. */
3522 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3523 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3525 /* DFS walk creating SESE data structures. Only cover nodes with
3526 BB_VISITED set. Append discovered blocks to LIST. We number in
3527 increments of 3 so that the above and below pseudo nodes can be
3528 implicitly numbered too. */
3531 nvptx_sese_number (int n
, int p
, int dir
, basic_block b
,
3532 auto_vec
<basic_block
> *list
)
3534 if (BB_GET_SESE (b
))
3538 fprintf (dump_file
, "Block %d(%d), parent (%d), orientation %+d\n",
3539 b
->index
, n
, p
, dir
);
3541 BB_SET_SESE (b
, new bb_sese (n
, p
, dir
));
3545 list
->quick_push (b
);
3547 /* First walk the nodes on the 'other side' of this node, then walk
3548 the nodes on the same side. */
3549 for (unsigned ix
= 2; ix
; ix
--)
3551 vec
<edge
, va_gc
> *edges
= dir
> 0 ? b
->succs
: b
->preds
;
3552 size_t offset
= (dir
> 0 ? offsetof (edge_def
, dest
)
3553 : offsetof (edge_def
, src
));
3557 FOR_EACH_EDGE (e
, ei
, edges
)
3559 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3561 if (target
->flags
& BB_VISITED
)
3562 n
= nvptx_sese_number (n
, p
, dir
, target
, list
);
3569 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3570 EDGES are the outgoing edges and OFFSET is the offset to the src
3571 or dst block on the edges. */
3574 nvptx_sese_pseudo (basic_block me
, bb_sese
*sese
, int depth
, int dir
,
3575 vec
<edge
, va_gc
> *edges
, size_t offset
)
3579 int hi_back
= depth
;
3580 pseudo_node_t
node_back (0, depth
);
3581 int hi_child
= depth
;
3582 pseudo_node_t
node_child (0, depth
);
3583 basic_block child
= NULL
;
3584 unsigned num_children
= 0;
3585 int usd
= -dir
* sese
->dir
;
3588 fprintf (dump_file
, "\nProcessing %d(%d) %+d\n",
3589 me
->index
, sese
->node
, dir
);
3593 /* This is the above pseudo-child. It has the BB itself as an
3594 additional child node. */
3595 node_child
= sese
->high
;
3596 hi_child
= node_child
.second
;
3597 if (node_child
.first
)
3598 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3602 /* Examine each edge.
3603 - if it is a child (a) append its bracket list and (b) record
3604 whether it is the child with the highest reaching bracket.
3605 - if it is an edge to ancestor, record whether it's the highest
3606 reaching backlink. */
3607 FOR_EACH_EDGE (e
, ei
, edges
)
3609 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3611 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3613 if (t_sese
->parent
== sese
->node
&& !(t_sese
->dir
+ usd
))
3615 /* Child node. Append its bracket list. */
3617 sese
->append (t_sese
);
3619 /* Compare it's hi value. */
3620 int t_hi
= t_sese
->high
.second
;
3622 if (basic_block child_hi_block
= t_sese
->high
.first
)
3623 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3625 if (hi_child
> t_hi
)
3628 node_child
= t_sese
->high
;
3632 else if (t_sese
->node
< sese
->node
+ dir
3633 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3635 /* Non-parental ancestor node -- a backlink. */
3636 int d
= usd
* t_sese
->dir
;
3637 int back
= t_sese
->node
+ d
;
3642 node_back
= pseudo_node_t (target
, d
);
3647 { /* Fallen off graph, backlink to entry node. */
3649 node_back
= pseudo_node_t (0, 0);
3653 /* Remove any brackets that terminate at this pseudo node. */
3654 sese
->remove (pseudo_node_t (me
, dir
));
3656 /* Now push any backlinks from this pseudo node. */
3657 FOR_EACH_EDGE (e
, ei
, edges
)
3659 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3660 if (bb_sese
*t_sese
= BB_GET_SESE (target
))
3662 if (t_sese
->node
< sese
->node
+ dir
3663 && !(dir
< 0 && sese
->parent
== t_sese
->node
))
3664 /* Non-parental ancestor node - backedge from me. */
3665 sese
->push (pseudo_node_t (target
, usd
* t_sese
->dir
));
3669 /* back edge to entry node */
3670 sese
->push (pseudo_node_t (0, 0));
3674 /* If this node leads directly or indirectly to a no-return region of
3675 the graph, then fake a backedge to entry node. */
3676 if (!sese
->brackets
.length () || !edges
|| !edges
->length ())
3679 node_back
= pseudo_node_t (0, 0);
3680 sese
->push (node_back
);
3683 /* Record the highest reaching backedge from us or a descendant. */
3684 sese
->high
= hi_back
< hi_child
? node_back
: node_child
;
3686 if (num_children
> 1)
3688 /* There is more than one child -- this is a Y shaped piece of
3689 spanning tree. We have to insert a fake backedge from this
3690 node to the highest ancestor reached by not-the-highest
3691 reaching child. Note that there may be multiple children
3692 with backedges to the same highest node. That's ok and we
3693 insert the edge to that highest node. */
3695 if (dir
< 0 && child
)
3697 node_child
= sese
->high
;
3698 hi_child
= node_child
.second
;
3699 if (node_child
.first
)
3700 hi_child
+= BB_GET_SESE (node_child
.first
)->node
;
3703 FOR_EACH_EDGE (e
, ei
, edges
)
3705 basic_block target
= *(basic_block
*)((char *)e
+ offset
);
3707 if (target
== child
)
3708 /* Ignore the highest child. */
3711 bb_sese
*t_sese
= BB_GET_SESE (target
);
3714 if (t_sese
->parent
!= sese
->node
)
3718 /* Compare its hi value. */
3719 int t_hi
= t_sese
->high
.second
;
3721 if (basic_block child_hi_block
= t_sese
->high
.first
)
3722 t_hi
+= BB_GET_SESE (child_hi_block
)->node
;
3724 if (hi_child
> t_hi
)
3727 node_child
= t_sese
->high
;
3731 sese
->push (node_child
);
3736 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3737 proceed to successors. Set SESE entry and exit nodes of
3741 nvptx_sese_color (auto_vec
<unsigned> &color_counts
, bb_pair_vec_t
®ions
,
3742 basic_block block
, int coloring
)
3744 bb_sese
*sese
= BB_GET_SESE (block
);
3746 if (block
->flags
& BB_VISITED
)
3748 /* If we've already encountered this block, either we must not
3749 be coloring, or it must have been colored the current color. */
3750 gcc_assert (coloring
< 0 || (sese
&& coloring
== sese
->color
));
3754 block
->flags
|= BB_VISITED
;
3760 /* Start coloring a region. */
3761 regions
[sese
->color
].first
= block
;
3762 coloring
= sese
->color
;
3765 if (!--color_counts
[sese
->color
] && sese
->color
== coloring
)
3767 /* Found final block of SESE region. */
3768 regions
[sese
->color
].second
= block
;
3772 /* Color the node, so we can assert on revisiting the node
3773 that the graph is indeed SESE. */
3774 sese
->color
= coloring
;
3777 /* Fallen off the subgraph, we cannot be coloring. */
3778 gcc_assert (coloring
< 0);
3780 /* Walk each successor block. */
3781 if (block
->succs
&& block
->succs
->length ())
3786 FOR_EACH_EDGE (e
, ei
, block
->succs
)
3787 nvptx_sese_color (color_counts
, regions
, e
->dest
, coloring
);
3790 gcc_assert (coloring
< 0);
3793 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3794 end up with NULL entries in it. */
3797 nvptx_find_sese (auto_vec
<basic_block
> &blocks
, bb_pair_vec_t
®ions
)
3802 /* First clear each BB of the whole function. */
3803 FOR_ALL_BB_FN (block
, cfun
)
3805 block
->flags
&= ~BB_VISITED
;
3806 BB_SET_SESE (block
, 0);
3809 /* Mark blocks in the function that are in this graph. */
3810 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3811 block
->flags
|= BB_VISITED
;
3813 /* Counts of nodes assigned to each color. There cannot be more
3814 colors than blocks (and hopefully there will be fewer). */
3815 auto_vec
<unsigned> color_counts
;
3816 color_counts
.reserve (blocks
.length ());
3818 /* Worklist of nodes in the spanning tree. Again, there cannot be
3819 more nodes in the tree than blocks (there will be fewer if the
3820 CFG of blocks is disjoint). */
3821 auto_vec
<basic_block
> spanlist
;
3822 spanlist
.reserve (blocks
.length ());
3824 /* Make sure every block has its cycle class determined. */
3825 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3827 if (BB_GET_SESE (block
))
3828 /* We already met this block in an earlier graph solve. */
3832 fprintf (dump_file
, "Searching graph starting at %d\n", block
->index
);
3834 /* Number the nodes reachable from block initial DFS order. */
3835 int depth
= nvptx_sese_number (2, 0, +1, block
, &spanlist
);
3837 /* Now walk in reverse DFS order to find cycle equivalents. */
3838 while (spanlist
.length ())
3840 block
= spanlist
.pop ();
3841 bb_sese
*sese
= BB_GET_SESE (block
);
3843 /* Do the pseudo node below. */
3844 nvptx_sese_pseudo (block
, sese
, depth
, +1,
3845 sese
->dir
> 0 ? block
->succs
: block
->preds
,
3846 (sese
->dir
> 0 ? offsetof (edge_def
, dest
)
3847 : offsetof (edge_def
, src
)));
3848 sese
->set_color (color_counts
);
3849 /* Do the pseudo node above. */
3850 nvptx_sese_pseudo (block
, sese
, depth
, -1,
3851 sese
->dir
< 0 ? block
->succs
: block
->preds
,
3852 (sese
->dir
< 0 ? offsetof (edge_def
, dest
)
3853 : offsetof (edge_def
, src
)));
3856 fprintf (dump_file
, "\n");
3862 const char *comma
= "";
3864 fprintf (dump_file
, "Found %d cycle equivalents\n",
3865 color_counts
.length ());
3866 for (ix
= 0; color_counts
.iterate (ix
, &count
); ix
++)
3868 fprintf (dump_file
, "%s%d[%d]={", comma
, ix
, count
);
3871 for (unsigned jx
= 0; blocks
.iterate (jx
, &block
); jx
++)
3872 if (BB_GET_SESE (block
)->color
== ix
)
3874 block
->flags
|= BB_VISITED
;
3875 fprintf (dump_file
, "%s%d", comma
, block
->index
);
3878 fprintf (dump_file
, "}");
3881 fprintf (dump_file
, "\n");
3884 /* Now we've colored every block in the subgraph. We now need to
3885 determine the minimal set of SESE regions that cover that
3886 subgraph. Do this with a DFS walk of the complete function.
3887 During the walk we're either 'looking' or 'coloring'. When we
3888 reach the last node of a particular color, we stop coloring and
3889 return to looking. */
3891 /* There cannot be more SESE regions than colors. */
3892 regions
.reserve (color_counts
.length ());
3893 for (ix
= color_counts
.length (); ix
--;)
3894 regions
.quick_push (bb_pair_t (0, 0));
3896 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3897 block
->flags
&= ~BB_VISITED
;
3899 nvptx_sese_color (color_counts
, regions
, ENTRY_BLOCK_PTR_FOR_FN (cfun
), -1);
3903 const char *comma
= "";
3904 int len
= regions
.length ();
3906 fprintf (dump_file
, "SESE regions:");
3907 for (ix
= 0; ix
!= len
; ix
++)
3909 basic_block from
= regions
[ix
].first
;
3910 basic_block to
= regions
[ix
].second
;
3914 fprintf (dump_file
, "%s %d{%d", comma
, ix
, from
->index
);
3916 fprintf (dump_file
, "->%d", to
->index
);
3918 int color
= BB_GET_SESE (from
)->color
;
3920 /* Print the blocks within the region (excluding ends). */
3921 FOR_EACH_BB_FN (block
, cfun
)
3923 bb_sese
*sese
= BB_GET_SESE (block
);
3925 if (sese
&& sese
->color
== color
3926 && block
!= from
&& block
!= to
)
3927 fprintf (dump_file
, ".%d", block
->index
);
3929 fprintf (dump_file
, "}");
3933 fprintf (dump_file
, "\n\n");
3936 for (ix
= 0; blocks
.iterate (ix
, &block
); ix
++)
3937 delete BB_GET_SESE (block
);
3943 /* Propagate live state at the start of a partitioned region. IS_CALL
3944 indicates whether the propagation is for a (partitioned) call
3945 instruction. BLOCK provides the live register information, and
3946 might not contain INSN. Propagation is inserted just after INSN. RW
3947 indicates whether we are reading and/or writing state. This
3948 separation is needed for worker-level proppagation where we
3949 essentially do a spill & fill. FN is the underlying worker
3950 function to generate the propagation instructions for single
3951 register. DATA is user data.
3953 Returns true if we didn't emit any instructions.
3955 We propagate the live register set for non-calls and the entire
3956 frame for calls and non-calls. We could do better by (a)
3957 propagating just the live set that is used within the partitioned
3958 regions and (b) only propagating stack entries that are used. The
3959 latter might be quite hard to determine. */
3961 typedef rtx (*propagator_fn
) (rtx
, propagate_mask
, unsigned, void *, bool);
3964 nvptx_propagate (bool is_call
, basic_block block
, rtx_insn
*insn
,
3965 propagate_mask rw
, propagator_fn fn
, void *data
, bool vector
)
3967 bitmap live
= DF_LIVE_IN (block
);
3968 bitmap_iterator iterator
;
3972 /* Copy the frame array. */
3973 HOST_WIDE_INT fs
= get_frame_size ();
3976 rtx tmp
= gen_reg_rtx (DImode
);
3978 rtx ptr
= gen_reg_rtx (Pmode
);
3979 rtx pred
= NULL_RTX
;
3980 rtx_code_label
*label
= NULL
;
3983 /* The frame size might not be DImode compatible, but the frame
3984 array's declaration will be. So it's ok to round up here. */
3985 fs
= (fs
+ GET_MODE_SIZE (DImode
) - 1) / GET_MODE_SIZE (DImode
);
3986 /* Detect single iteration loop. */
3991 emit_insn (gen_rtx_SET (ptr
, frame_pointer_rtx
));
3994 idx
= gen_reg_rtx (SImode
);
3995 pred
= gen_reg_rtx (BImode
);
3996 label
= gen_label_rtx ();
3998 emit_insn (gen_rtx_SET (idx
, GEN_INT (fs
)));
3999 /* Allow worker function to initialize anything needed. */
4000 rtx init
= fn (tmp
, PM_loop_begin
, fs
, data
, vector
);
4004 LABEL_NUSES (label
)++;
4005 emit_insn (gen_addsi3 (idx
, idx
, GEN_INT (-1)));
4008 emit_insn (gen_rtx_SET (tmp
, gen_rtx_MEM (DImode
, ptr
)));
4009 emit_insn (fn (tmp
, rw
, fs
, data
, vector
));
4011 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode
, ptr
), tmp
));
4014 emit_insn (gen_rtx_SET (pred
, gen_rtx_NE (BImode
, idx
, const0_rtx
)));
4015 emit_insn (gen_adddi3 (ptr
, ptr
, GEN_INT (GET_MODE_SIZE (DImode
))));
4016 emit_insn (gen_br_true_uni (pred
, label
));
4017 rtx fini
= fn (tmp
, PM_loop_end
, fs
, data
, vector
);
4020 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx
), idx
));
4022 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp
), tmp
));
4023 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr
), ptr
));
4024 rtx cpy
= get_insns ();
4026 insn
= emit_insn_after (cpy
, insn
);
4030 /* Copy live registers. */
4031 EXECUTE_IF_SET_IN_BITMAP (live
, 0, ix
, iterator
)
4033 rtx reg
= regno_reg_rtx
[ix
];
4035 if (REGNO (reg
) >= FIRST_PSEUDO_REGISTER
)
4037 rtx bcast
= fn (reg
, rw
, 0, data
, vector
);
4039 insn
= emit_insn_after (bcast
, insn
);
4046 /* Worker for nvptx_warp_propagate. */
4049 warp_prop_gen (rtx reg
, propagate_mask pm
,
4050 unsigned ARG_UNUSED (count
), void *ARG_UNUSED (data
),
4051 bool ARG_UNUSED (vector
))
4053 if (!(pm
& PM_read_write
))
4056 return nvptx_gen_warp_bcast (reg
);
4059 /* Propagate state that is live at start of BLOCK across the vectors
4060 of a single warp. Propagation is inserted just after INSN.
4061 IS_CALL and return as for nvptx_propagate. */
4064 nvptx_warp_propagate (bool is_call
, basic_block block
, rtx_insn
*insn
)
4066 return nvptx_propagate (is_call
, block
, insn
, PM_read_write
,
4067 warp_prop_gen
, 0, false);
4070 /* Worker for nvptx_shared_propagate. */
4073 shared_prop_gen (rtx reg
, propagate_mask pm
, unsigned rep
, void *data_
,
4076 broadcast_data_t
*data
= (broadcast_data_t
*)data_
;
4078 if (pm
& PM_loop_begin
)
4080 /* Starting a loop, initialize pointer. */
4081 unsigned align
= GET_MODE_ALIGNMENT (GET_MODE (reg
)) / BITS_PER_UNIT
;
4083 oacc_bcast_align
= MAX (oacc_bcast_align
, align
);
4084 data
->offset
= ROUND_UP (data
->offset
, align
);
4086 data
->ptr
= gen_reg_rtx (Pmode
);
4088 return gen_adddi3 (data
->ptr
, data
->base
, GEN_INT (data
->offset
));
4090 else if (pm
& PM_loop_end
)
4092 rtx clobber
= gen_rtx_CLOBBER (GET_MODE (data
->ptr
), data
->ptr
);
4093 data
->ptr
= NULL_RTX
;
4097 return nvptx_gen_shared_bcast (reg
, pm
, rep
, data
, vector
);
4100 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4101 indicates if this is just before partitioned mode (do spill), or
4102 just after it starts (do fill). Sequence is inserted just after
4103 INSN. IS_CALL and return as for nvptx_propagate. */
4106 nvptx_shared_propagate (bool pre_p
, bool is_call
, basic_block block
,
4107 rtx_insn
*insn
, bool vector
)
4109 broadcast_data_t data
;
4111 data
.base
= gen_reg_rtx (Pmode
);
4113 data
.ptr
= NULL_RTX
;
4115 bool empty
= nvptx_propagate (is_call
, block
, insn
,
4116 pre_p
? PM_read
: PM_write
, shared_prop_gen
,
4118 gcc_assert (empty
== !data
.offset
);
4121 rtx bcast_sym
= oacc_bcast_sym
;
4123 /* Stuff was emitted, initialize the base pointer now. */
4124 if (vector
&& nvptx_mach_max_workers () > 1)
4126 if (!cfun
->machine
->bcast_partition
)
4128 /* It would be nice to place this register in
4129 DATA_AREA_SHARED. */
4130 cfun
->machine
->bcast_partition
= gen_reg_rtx (DImode
);
4132 if (!cfun
->machine
->sync_bar
)
4133 cfun
->machine
->sync_bar
= gen_reg_rtx (SImode
);
4135 bcast_sym
= cfun
->machine
->bcast_partition
;
4138 rtx init
= gen_rtx_SET (data
.base
, bcast_sym
);
4139 emit_insn_after (init
, insn
);
4141 unsigned int psize
= ROUND_UP (data
.offset
, oacc_bcast_align
);
4142 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
4143 ? nvptx_mach_max_workers () + 1
4146 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
4147 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
4152 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4153 which is an integer or a register. THREADS is the number of threads
4154 controlled by the barrier. */
4157 nvptx_cta_sync (rtx lock
, int threads
)
4159 return gen_nvptx_barsync (lock
, GEN_INT (threads
));
4162 #if WORKAROUND_PTXJIT_BUG
4163 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4167 bb_first_real_insn (basic_block bb
)
4171 /* Find first insn of from block. */
4172 FOR_BB_INSNS (bb
, insn
)
4180 /* Return true if INSN needs neutering. */
4183 needs_neutering_p (rtx_insn
*insn
)
4188 switch (recog_memoized (insn
))
4190 case CODE_FOR_nvptx_fork
:
4191 case CODE_FOR_nvptx_forked
:
4192 case CODE_FOR_nvptx_joining
:
4193 case CODE_FOR_nvptx_join
:
4194 case CODE_FOR_nvptx_barsync
:
4201 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4204 verify_neutering_jumps (basic_block from
,
4205 rtx_insn
*vector_jump
, rtx_insn
*worker_jump
,
4206 rtx_insn
*vector_label
, rtx_insn
*worker_label
)
4208 basic_block bb
= from
;
4209 rtx_insn
*insn
= BB_HEAD (bb
);
4210 bool seen_worker_jump
= false;
4211 bool seen_vector_jump
= false;
4212 bool seen_worker_label
= false;
4213 bool seen_vector_label
= false;
4214 bool worker_neutered
= false;
4215 bool vector_neutered
= false;
4218 if (insn
== worker_jump
)
4220 seen_worker_jump
= true;
4221 worker_neutered
= true;
4222 gcc_assert (!vector_neutered
);
4224 else if (insn
== vector_jump
)
4226 seen_vector_jump
= true;
4227 vector_neutered
= true;
4229 else if (insn
== worker_label
)
4231 seen_worker_label
= true;
4232 gcc_assert (worker_neutered
);
4233 worker_neutered
= false;
4235 else if (insn
== vector_label
)
4237 seen_vector_label
= true;
4238 gcc_assert (vector_neutered
);
4239 vector_neutered
= false;
4241 else if (INSN_P (insn
))
4242 switch (recog_memoized (insn
))
4244 case CODE_FOR_nvptx_barsync
:
4245 gcc_assert (!vector_neutered
&& !worker_neutered
);
4251 if (insn
!= BB_END (bb
))
4252 insn
= NEXT_INSN (insn
);
4253 else if (JUMP_P (insn
) && single_succ_p (bb
)
4254 && !seen_vector_jump
&& !seen_worker_jump
)
4256 bb
= single_succ (bb
);
4257 insn
= BB_HEAD (bb
);
4263 gcc_assert (!(vector_jump
&& !seen_vector_jump
));
4264 gcc_assert (!(worker_jump
&& !seen_worker_jump
));
4266 if (seen_vector_label
|| seen_worker_label
)
4268 gcc_assert (!(vector_label
&& !seen_vector_label
));
4269 gcc_assert (!(worker_label
&& !seen_worker_label
));
4277 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4280 verify_neutering_labels (basic_block to
, rtx_insn
*vector_label
,
4281 rtx_insn
*worker_label
)
4283 basic_block bb
= to
;
4284 rtx_insn
*insn
= BB_END (bb
);
4285 bool seen_worker_label
= false;
4286 bool seen_vector_label
= false;
4289 if (insn
== worker_label
)
4291 seen_worker_label
= true;
4292 gcc_assert (!seen_vector_label
);
4294 else if (insn
== vector_label
)
4295 seen_vector_label
= true;
4296 else if (INSN_P (insn
))
4297 switch (recog_memoized (insn
))
4299 case CODE_FOR_nvptx_barsync
:
4300 gcc_assert (!seen_vector_label
&& !seen_worker_label
);
4304 if (insn
!= BB_HEAD (bb
))
4305 insn
= PREV_INSN (insn
);
4310 gcc_assert (!(vector_label
&& !seen_vector_label
));
4311 gcc_assert (!(worker_label
&& !seen_worker_label
));
4314 /* Single neutering according to MASK. FROM is the incoming block and
4315 TO is the outgoing block. These may be the same block. Insert at
4318 if (tid.<axis>) goto end.
4320 and insert before ending branch of TO (if there is such an insn):
4323 <possibly-broadcast-cond>
4326 We currently only use differnt FROM and TO when skipping an entire
4327 loop. We could do more if we detected superblocks. */
4330 nvptx_single (unsigned mask
, basic_block from
, basic_block to
)
4332 rtx_insn
*head
= BB_HEAD (from
);
4333 rtx_insn
*tail
= BB_END (to
);
4334 unsigned skip_mask
= mask
;
4338 /* Find first insn of from block. */
4339 while (head
!= BB_END (from
) && !needs_neutering_p (head
))
4340 head
= NEXT_INSN (head
);
4345 if (!(JUMP_P (head
) && single_succ_p (from
)))
4348 basic_block jump_target
= single_succ (from
);
4349 if (!single_pred_p (jump_target
))
4353 head
= BB_HEAD (from
);
4356 /* Find last insn of to block */
4357 rtx_insn
*limit
= from
== to
? head
: BB_HEAD (to
);
4358 while (tail
!= limit
&& !INSN_P (tail
) && !LABEL_P (tail
))
4359 tail
= PREV_INSN (tail
);
4361 /* Detect if tail is a branch. */
4362 rtx tail_branch
= NULL_RTX
;
4363 rtx cond_branch
= NULL_RTX
;
4364 if (tail
&& INSN_P (tail
))
4366 tail_branch
= PATTERN (tail
);
4367 if (GET_CODE (tail_branch
) != SET
|| SET_DEST (tail_branch
) != pc_rtx
)
4368 tail_branch
= NULL_RTX
;
4371 cond_branch
= SET_SRC (tail_branch
);
4372 if (GET_CODE (cond_branch
) != IF_THEN_ELSE
)
4373 cond_branch
= NULL_RTX
;
4379 /* If this is empty, do nothing. */
4380 if (!head
|| !needs_neutering_p (head
))
4385 /* If we're only doing vector single, there's no need to
4386 emit skip code because we'll not insert anything. */
4387 if (!(mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)))
4390 else if (tail_branch
)
4391 /* Block with only unconditional branch. Nothing to do. */
4395 /* Insert the vector test inside the worker test. */
4397 rtx_insn
*before
= tail
;
4398 rtx_insn
*neuter_start
= NULL
;
4399 rtx_insn
*worker_label
= NULL
, *vector_label
= NULL
;
4400 rtx_insn
*worker_jump
= NULL
, *vector_jump
= NULL
;
4401 for (mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4402 if (GOMP_DIM_MASK (mode
) & skip_mask
)
4404 rtx_code_label
*label
= gen_label_rtx ();
4405 rtx pred
= cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
];
4406 rtx_insn
**mode_jump
4407 = mode
== GOMP_DIM_VECTOR
? &vector_jump
: &worker_jump
;
4408 rtx_insn
**mode_label
4409 = mode
== GOMP_DIM_VECTOR
? &vector_label
: &worker_label
;
4413 pred
= gen_reg_rtx (BImode
);
4414 cfun
->machine
->axis_predicate
[mode
- GOMP_DIM_WORKER
] = pred
;
4418 if (mode
== GOMP_DIM_VECTOR
)
4419 br
= gen_br_true (pred
, label
);
4421 br
= gen_br_true_uni (pred
, label
);
4423 neuter_start
= emit_insn_after (br
, neuter_start
);
4425 neuter_start
= emit_insn_before (br
, head
);
4426 *mode_jump
= neuter_start
;
4428 LABEL_NUSES (label
)++;
4429 rtx_insn
*label_insn
;
4432 label_insn
= emit_label_before (label
, before
);
4433 before
= label_insn
;
4437 label_insn
= emit_label_after (label
, tail
);
4438 if ((mode
== GOMP_DIM_VECTOR
|| mode
== GOMP_DIM_WORKER
)
4439 && CALL_P (tail
) && find_reg_note (tail
, REG_NORETURN
, NULL
))
4440 emit_insn_after (gen_exit (), label_insn
);
4443 *mode_label
= label_insn
;
4446 /* Now deal with propagating the branch condition. */
4449 rtx pvar
= XEXP (XEXP (cond_branch
, 0), 0);
4451 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
4452 && nvptx_mach_vector_length () == PTX_WARP_SIZE
)
4454 /* Vector mode only, do a shuffle. */
4455 #if WORKAROUND_PTXJIT_BUG
4456 /* The branch condition %rcond is propagated like this:
4461 setp.ne.u32 %rnotvzero,%x,0;
4464 @%rnotvzero bra Lskip;
4465 setp.<op>.<type> %rcond,op1,op2;
4467 selp.u32 %rcondu32,1,0,%rcond;
4468 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4469 setp.ne.u32 %rcond,%rcondu32,0;
4471 There seems to be a bug in the ptx JIT compiler (observed at driver
4472 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4473 unless %rcond is initialized to something before 'bra Lskip'. The
4474 bug is not observed with ptxas from cuda 8.0.61.
4476 It is true that the code is non-trivial: at Lskip, %rcond is
4477 uninitialized in threads 1-31, and after the selp the same holds
4478 for %rcondu32. But shfl propagates the defined value in thread 0
4479 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4480 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4482 There is nothing in the PTX spec to suggest that this is wrong, or
4483 to explain why the extra initialization is needed. So, we classify
4484 it as a JIT bug, and the extra initialization as workaround:
4489 setp.ne.u32 %rnotvzero,%x,0;
4492 +.reg .pred %rcond2;
4493 +setp.eq.u32 %rcond2, 1, 0;
4495 @%rnotvzero bra Lskip;
4496 setp.<op>.<type> %rcond,op1,op2;
4497 +mov.pred %rcond2, %rcond;
4499 +mov.pred %rcond, %rcond2;
4500 selp.u32 %rcondu32,1,0,%rcond;
4501 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4502 setp.ne.u32 %rcond,%rcondu32,0;
4504 rtx_insn
*label
= PREV_INSN (tail
);
4505 gcc_assert (label
&& LABEL_P (label
));
4506 rtx tmp
= gen_reg_rtx (BImode
);
4507 emit_insn_before (gen_movbi (tmp
, const0_rtx
),
4508 bb_first_real_insn (from
));
4509 emit_insn_before (gen_rtx_SET (tmp
, pvar
), label
);
4510 emit_insn_before (gen_rtx_SET (pvar
, tmp
), tail
);
4512 emit_insn_before (nvptx_gen_warp_bcast (pvar
), tail
);
4516 /* Includes worker mode, do spill & fill. By construction
4517 we should never have worker mode only. */
4518 broadcast_data_t data
;
4519 unsigned size
= GET_MODE_SIZE (SImode
);
4520 bool vector
= (GOMP_DIM_MASK (GOMP_DIM_VECTOR
) == mask
) != 0;
4521 bool worker
= (GOMP_DIM_MASK (GOMP_DIM_WORKER
) == mask
) != 0;
4522 rtx barrier
= GEN_INT (0);
4525 data
.base
= oacc_bcast_sym
;
4528 bool use_partitioning_p
= (vector
&& !worker
4529 && nvptx_mach_max_workers () > 1
4530 && cfun
->machine
->bcast_partition
);
4531 if (use_partitioning_p
)
4533 data
.base
= cfun
->machine
->bcast_partition
;
4534 barrier
= cfun
->machine
->sync_bar
;
4535 threads
= nvptx_mach_vector_length ();
4537 gcc_assert (data
.base
!= NULL
);
4538 gcc_assert (barrier
);
4540 unsigned int psize
= ROUND_UP (size
, oacc_bcast_align
);
4541 unsigned int pnum
= (nvptx_mach_vector_length () > PTX_WARP_SIZE
4542 ? nvptx_mach_max_workers () + 1
4545 oacc_bcast_partition
= MAX (oacc_bcast_partition
, psize
);
4546 oacc_bcast_size
= MAX (oacc_bcast_size
, psize
* pnum
);
4549 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_read
, 0, &data
,
4553 /* Barrier so other workers can see the write. */
4554 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4556 emit_insn_before (nvptx_gen_shared_bcast (pvar
, PM_write
, 0, &data
,
4559 /* This barrier is needed to avoid worker zero clobbering
4560 the broadcast buffer before all the other workers have
4561 had a chance to read this instance of it. */
4562 emit_insn_before (nvptx_cta_sync (barrier
, threads
), tail
);
4565 extract_insn (tail
);
4566 rtx unsp
= gen_rtx_UNSPEC (BImode
, gen_rtvec (1, pvar
),
4568 validate_change (tail
, recog_data
.operand_loc
[0], unsp
, false);
4571 bool seen_label
= verify_neutering_jumps (from
, vector_jump
, worker_jump
,
4572 vector_label
, worker_label
);
4574 verify_neutering_labels (to
, vector_label
, worker_label
);
4577 /* PAR is a parallel that is being skipped in its entirety according to
4578 MASK. Treat this as skipping a superblock starting at forked
4579 and ending at joining. */
4582 nvptx_skip_par (unsigned mask
, parallel
*par
)
4584 basic_block tail
= par
->join_block
;
4585 gcc_assert (tail
->preds
->length () == 1);
4587 basic_block pre_tail
= (*tail
->preds
)[0]->src
;
4588 gcc_assert (pre_tail
->succs
->length () == 1);
4590 nvptx_single (mask
, par
->forked_block
, pre_tail
);
4593 /* If PAR has a single inner parallel and PAR itself only contains
4594 empty entry and exit blocks, swallow the inner PAR. */
4597 nvptx_optimize_inner (parallel
*par
)
4599 parallel
*inner
= par
->inner
;
4601 /* We mustn't be the outer dummy par. */
4605 /* We must have a single inner par. */
4606 if (!inner
|| inner
->next
)
4609 /* We must only contain 2 blocks ourselves -- the head and tail of
4611 if (par
->blocks
.length () != 2)
4614 /* We must be disjoint partitioning. As we only have vector and
4615 worker partitioning, this is sufficient to guarantee the pars
4616 have adjacent partitioning. */
4617 if ((par
->mask
& inner
->mask
) & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1))
4618 /* This indicates malformed code generation. */
4621 /* The outer forked insn should be immediately followed by the inner
4623 rtx_insn
*forked
= par
->forked_insn
;
4624 rtx_insn
*fork
= BB_END (par
->forked_block
);
4626 if (NEXT_INSN (forked
) != fork
)
4628 gcc_checking_assert (recog_memoized (fork
) == CODE_FOR_nvptx_fork
);
4630 /* The outer joining insn must immediately follow the inner join
4632 rtx_insn
*joining
= par
->joining_insn
;
4633 rtx_insn
*join
= inner
->join_insn
;
4634 if (NEXT_INSN (join
) != joining
)
4637 /* Preconditions met. Swallow the inner par. */
4639 fprintf (dump_file
, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4640 inner
->mask
, inner
->forked_block
->index
,
4641 inner
->join_block
->index
,
4642 par
->mask
, par
->forked_block
->index
, par
->join_block
->index
);
4644 par
->mask
|= inner
->mask
& (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1);
4646 par
->blocks
.reserve (inner
->blocks
.length ());
4647 while (inner
->blocks
.length ())
4648 par
->blocks
.quick_push (inner
->blocks
.pop ());
4650 par
->inner
= inner
->inner
;
4651 inner
->inner
= NULL
;
4656 /* Process the parallel PAR and all its contained
4657 parallels. We do everything but the neutering. Return mask of
4658 partitioned modes used within this parallel. */
4661 nvptx_process_pars (parallel
*par
)
4664 nvptx_optimize_inner (par
);
4666 unsigned inner_mask
= par
->mask
;
4668 /* Do the inner parallels first. */
4671 par
->inner_mask
= nvptx_process_pars (par
->inner
);
4672 inner_mask
|= par
->inner_mask
;
4675 bool is_call
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_MAX
)) != 0;
4676 bool worker
= (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
));
4677 bool large_vector
= ((par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4678 && nvptx_mach_vector_length () > PTX_WARP_SIZE
);
4680 if (worker
|| large_vector
)
4682 nvptx_shared_propagate (false, is_call
, par
->forked_block
,
4683 par
->forked_insn
, !worker
);
4685 = nvptx_shared_propagate (true, is_call
, par
->forked_block
,
4686 par
->fork_insn
, !worker
);
4688 = !is_call
&& (NEXT_INSN (par
->forked_insn
)
4689 && NEXT_INSN (par
->forked_insn
) == par
->joining_insn
);
4690 rtx barrier
= GEN_INT (0);
4693 if (!worker
&& cfun
->machine
->sync_bar
)
4695 barrier
= cfun
->machine
->sync_bar
;
4696 threads
= nvptx_mach_vector_length ();
4699 if (no_prop_p
&& empty_loop_p
)
4701 else if (no_prop_p
&& is_call
)
4705 /* Insert begin and end synchronizations. */
4706 emit_insn_before (nvptx_cta_sync (barrier
, threads
),
4708 emit_insn_before (nvptx_cta_sync (barrier
, threads
), par
->join_insn
);
4711 else if (par
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
4712 nvptx_warp_propagate (is_call
, par
->forked_block
, par
->forked_insn
);
4714 /* Now do siblings. */
4716 inner_mask
|= nvptx_process_pars (par
->next
);
4720 /* Neuter the parallel described by PAR. We recurse in depth-first
4721 order. MODES are the partitioning of the execution and OUTER is
4722 the partitioning of the parallels we are contained in. */
4725 nvptx_neuter_pars (parallel
*par
, unsigned modes
, unsigned outer
)
4727 unsigned me
= (par
->mask
4728 & (GOMP_DIM_MASK (GOMP_DIM_WORKER
)
4729 | GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
4730 unsigned skip_mask
= 0, neuter_mask
= 0;
4733 nvptx_neuter_pars (par
->inner
, modes
, outer
| me
);
4735 for (unsigned mode
= GOMP_DIM_WORKER
; mode
<= GOMP_DIM_VECTOR
; mode
++)
4737 if ((outer
| me
) & GOMP_DIM_MASK (mode
))
4738 {} /* Mode is partitioned: no neutering. */
4739 else if (!(modes
& GOMP_DIM_MASK (mode
)))
4740 {} /* Mode is not used: nothing to do. */
4741 else if (par
->inner_mask
& GOMP_DIM_MASK (mode
)
4742 || !par
->forked_insn
)
4743 /* Partitioned in inner parallels, or we're not a partitioned
4744 at all: neuter individual blocks. */
4745 neuter_mask
|= GOMP_DIM_MASK (mode
);
4746 else if (!par
->parent
|| !par
->parent
->forked_insn
4747 || par
->parent
->inner_mask
& GOMP_DIM_MASK (mode
))
4748 /* Parent isn't a parallel or contains this paralleling: skip
4749 parallel at this level. */
4750 skip_mask
|= GOMP_DIM_MASK (mode
);
4752 {} /* Parent will skip this parallel itself. */
4761 /* Neuter whole SESE regions. */
4762 bb_pair_vec_t regions
;
4764 nvptx_find_sese (par
->blocks
, regions
);
4765 len
= regions
.length ();
4766 for (ix
= 0; ix
!= len
; ix
++)
4768 basic_block from
= regions
[ix
].first
;
4769 basic_block to
= regions
[ix
].second
;
4772 nvptx_single (neuter_mask
, from
, to
);
4779 /* Neuter each BB individually. */
4780 len
= par
->blocks
.length ();
4781 for (ix
= 0; ix
!= len
; ix
++)
4783 basic_block block
= par
->blocks
[ix
];
4785 nvptx_single (neuter_mask
, block
, block
);
4791 nvptx_skip_par (skip_mask
, par
);
4794 nvptx_neuter_pars (par
->next
, modes
, outer
);
4798 populate_offload_attrs (offload_attrs
*oa
)
4800 tree attr
= oacc_get_fn_attrib (current_function_decl
);
4801 tree dims
= TREE_VALUE (attr
);
4806 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, dims
= TREE_CHAIN (dims
))
4808 tree t
= TREE_VALUE (dims
);
4809 int size
= (t
== NULL_TREE
) ? -1 : TREE_INT_CST_LOW (t
);
4810 tree allowed
= TREE_PURPOSE (dims
);
4812 if (size
!= 1 && !(allowed
&& integer_zerop (allowed
)))
4813 oa
->mask
|= GOMP_DIM_MASK (ix
);
4818 oa
->num_gangs
= size
;
4821 case GOMP_DIM_WORKER
:
4822 oa
->num_workers
= size
;
4825 case GOMP_DIM_VECTOR
:
4826 oa
->vector_length
= size
;
4832 #if WORKAROUND_PTXJIT_BUG_2
4833 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4834 is needed in the nvptx target because the branches generated for
4835 parititioning are NONJUMP_INSN_P, not JUMP_P. */
4838 nvptx_pc_set (const rtx_insn
*insn
, bool strict
= true)
4841 if ((strict
&& !JUMP_P (insn
))
4842 || (!strict
&& !INSN_P (insn
)))
4844 pat
= PATTERN (insn
);
4846 /* The set is allowed to appear either as the insn pattern or
4847 the first set in a PARALLEL. */
4848 if (GET_CODE (pat
) == PARALLEL
)
4849 pat
= XVECEXP (pat
, 0, 0);
4850 if (GET_CODE (pat
) == SET
&& GET_CODE (SET_DEST (pat
)) == PC
)
4856 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4859 nvptx_condjump_label (const rtx_insn
*insn
, bool strict
= true)
4861 rtx x
= nvptx_pc_set (insn
, strict
);
4866 if (GET_CODE (x
) == LABEL_REF
)
4868 if (GET_CODE (x
) != IF_THEN_ELSE
)
4870 if (XEXP (x
, 2) == pc_rtx
&& GET_CODE (XEXP (x
, 1)) == LABEL_REF
)
4872 if (XEXP (x
, 1) == pc_rtx
&& GET_CODE (XEXP (x
, 2)) == LABEL_REF
)
4877 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4878 insn inbetween the branch and the label. This works around a JIT bug
4879 observed at driver version 384.111, at -O0 for sm_50. */
4882 prevent_branch_around_nothing (void)
4884 rtx_insn
*seen_label
= NULL
;
4885 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4887 if (INSN_P (insn
) && condjump_p (insn
))
4889 seen_label
= label_ref_label (nvptx_condjump_label (insn
, false));
4893 if (seen_label
== NULL
)
4896 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4900 switch (recog_memoized (insn
))
4902 case CODE_FOR_nvptx_fork
:
4903 case CODE_FOR_nvptx_forked
:
4904 case CODE_FOR_nvptx_joining
:
4905 case CODE_FOR_nvptx_join
:
4912 if (LABEL_P (insn
) && insn
== seen_label
)
4913 emit_insn_before (gen_fake_nop (), insn
);
4920 #ifdef WORKAROUND_PTXJIT_BUG_3
4921 /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This
4922 works around a hang observed at driver version 390.48 for sm_50. */
4925 workaround_barsyncs (void)
4927 bool seen_barsync
= false;
4928 for (rtx_insn
*insn
= get_insns (); insn
; insn
= NEXT_INSN (insn
))
4930 if (INSN_P (insn
) && recog_memoized (insn
) == CODE_FOR_nvptx_barsync
)
4934 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4935 emit_insn_before (gen_nvptx_membar_cta (), insn
);
4938 seen_barsync
= true;
4945 if (NOTE_P (insn
) || DEBUG_INSN_P (insn
))
4947 else if (INSN_P (insn
))
4948 switch (recog_memoized (insn
))
4950 case CODE_FOR_nvptx_fork
:
4951 case CODE_FOR_nvptx_forked
:
4952 case CODE_FOR_nvptx_joining
:
4953 case CODE_FOR_nvptx_join
:
4959 seen_barsync
= false;
4964 /* PTX-specific reorganization
4965 - Split blocks at fork and join instructions
4966 - Compute live registers
4967 - Mark now-unused registers, so function begin doesn't declare
4969 - Insert state propagation when entering partitioned mode
4970 - Insert neutering instructions when in single mode
4971 - Replace subregs with suitable sequences.
4977 /* We are freeing block_for_insn in the toplev to keep compatibility
4978 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4979 compute_bb_for_insn ();
4981 thread_prologue_and_epilogue_insns ();
4983 /* Split blocks and record interesting unspecs. */
4984 bb_insn_map_t bb_insn_map
;
4986 nvptx_split_blocks (&bb_insn_map
);
4988 /* Compute live regs */
4989 df_clear_flags (DF_LR_RUN_DCE
);
4990 df_set_flags (DF_NO_INSN_RESCAN
| DF_NO_HARD_REGS
);
4991 df_live_add_problem ();
4992 df_live_set_all_dirty ();
4994 regstat_init_n_sets_and_refs ();
4997 df_dump (dump_file
);
4999 /* Mark unused regs as unused. */
5000 int max_regs
= max_reg_num ();
5001 for (int i
= LAST_VIRTUAL_REGISTER
+ 1; i
< max_regs
; i
++)
5002 if (REG_N_SETS (i
) == 0 && REG_N_REFS (i
) == 0)
5003 regno_reg_rtx
[i
] = const0_rtx
;
5005 /* Determine launch dimensions of the function. If it is not an
5006 offloaded function (i.e. this is a regular compiler), the
5007 function has no neutering. */
5008 tree attr
= oacc_get_fn_attrib (current_function_decl
);
5011 /* If we determined this mask before RTL expansion, we could
5012 elide emission of some levels of forks and joins. */
5015 populate_offload_attrs (&oa
);
5017 /* If there is worker neutering, there must be vector
5018 neutering. Otherwise the hardware will fail. */
5019 gcc_assert (!(oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
5020 || (oa
.mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
)));
5022 /* Discover & process partitioned regions. */
5023 parallel
*pars
= nvptx_discover_pars (&bb_insn_map
);
5024 nvptx_process_pars (pars
);
5025 nvptx_neuter_pars (pars
, oa
.mask
, 0);
5029 /* Replace subregs. */
5030 nvptx_reorg_subreg ();
5032 if (TARGET_UNIFORM_SIMT
)
5033 nvptx_reorg_uniform_simt ();
5035 #if WORKAROUND_PTXJIT_BUG_2
5036 prevent_branch_around_nothing ();
5039 #ifdef WORKAROUND_PTXJIT_BUG_3
5040 workaround_barsyncs ();
5043 regstat_free_n_sets_and_refs ();
5045 df_finish_pass (true);
5048 /* Handle a "kernel" attribute; arguments as in
5049 struct attribute_spec.handler. */
5052 nvptx_handle_kernel_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5053 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5057 if (TREE_CODE (decl
) != FUNCTION_DECL
)
5059 error ("%qE attribute only applies to functions", name
);
5060 *no_add_attrs
= true;
5062 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl
))))
5064 error ("%qE attribute requires a void return type", name
);
5065 *no_add_attrs
= true;
5071 /* Handle a "shared" attribute; arguments as in
5072 struct attribute_spec.handler. */
5075 nvptx_handle_shared_attribute (tree
*node
, tree name
, tree
ARG_UNUSED (args
),
5076 int ARG_UNUSED (flags
), bool *no_add_attrs
)
5080 if (TREE_CODE (decl
) != VAR_DECL
)
5082 error ("%qE attribute only applies to variables", name
);
5083 *no_add_attrs
= true;
5085 else if (!(TREE_PUBLIC (decl
) || TREE_STATIC (decl
)))
5087 error ("%qE attribute not allowed with auto storage class", name
);
5088 *no_add_attrs
= true;
5094 /* Table of valid machine attributes. */
5095 static const struct attribute_spec nvptx_attribute_table
[] =
5097 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5098 affects_type_identity, handler, exclude } */
5099 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute
,
5101 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute
,
5103 { NULL
, 0, 0, false, false, false, false, NULL
, NULL
}
5106 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5108 static HOST_WIDE_INT
5109 nvptx_vector_alignment (const_tree type
)
5111 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
5113 return MIN (align
, BIGGEST_ALIGNMENT
);
5116 /* Indicate that INSN cannot be duplicated. */
5119 nvptx_cannot_copy_insn_p (rtx_insn
*insn
)
5121 switch (recog_memoized (insn
))
5123 case CODE_FOR_nvptx_shufflesi
:
5124 case CODE_FOR_nvptx_shufflesf
:
5125 case CODE_FOR_nvptx_barsync
:
5126 case CODE_FOR_nvptx_fork
:
5127 case CODE_FOR_nvptx_forked
:
5128 case CODE_FOR_nvptx_joining
:
5129 case CODE_FOR_nvptx_join
:
5136 /* Section anchors do not work. Initialization for flag_section_anchor
5137 probes the existence of the anchoring target hooks and prevents
5138 anchoring if they don't exist. However, we may be being used with
5139 a host-side compiler that does support anchoring, and hence see
5140 the anchor flag set (as it's not recalculated). So provide an
5141 implementation denying anchoring. */
5144 nvptx_use_anchors_for_symbol_p (const_rtx
ARG_UNUSED (a
))
5149 /* Record a symbol for mkoffload to enter into the mapping table. */
5152 nvptx_record_offload_symbol (tree decl
)
5154 switch (TREE_CODE (decl
))
5157 fprintf (asm_out_file
, "//:VAR_MAP \"%s\"\n",
5158 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5163 tree attr
= oacc_get_fn_attrib (decl
);
5164 /* OpenMP offloading does not set this attribute. */
5165 tree dims
= attr
? TREE_VALUE (attr
) : NULL_TREE
;
5167 fprintf (asm_out_file
, "//:FUNC_MAP \"%s\"",
5168 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl
)));
5170 for (; dims
; dims
= TREE_CHAIN (dims
))
5172 int size
= TREE_INT_CST_LOW (TREE_VALUE (dims
));
5174 gcc_assert (!TREE_PURPOSE (dims
));
5175 fprintf (asm_out_file
, ", %#x", size
);
5178 fprintf (asm_out_file
, "\n");
5187 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5188 at the start of a file. */
5191 nvptx_file_start (void)
5193 fputs ("// BEGIN PREAMBLE\n", asm_out_file
);
5194 fputs ("\t.version\t3.1\n", asm_out_file
);
5196 fputs ("\t.target\tsm_35\n", asm_out_file
);
5198 fputs ("\t.target\tsm_30\n", asm_out_file
);
5199 fprintf (asm_out_file
, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode
));
5200 fputs ("// END PREAMBLE\n", asm_out_file
);
5203 /* Emit a declaration for a worker and vector-level buffer in .shared
5207 write_shared_buffer (FILE *file
, rtx sym
, unsigned align
, unsigned size
)
5209 const char *name
= XSTR (sym
, 0);
5211 write_var_marker (file
, true, false, name
);
5212 fprintf (file
, ".shared .align %d .u8 %s[%d];\n",
5216 /* Write out the function declarations we've collected and declare storage
5217 for the broadcast buffer. */
5220 nvptx_file_end (void)
5222 hash_table
<tree_hasher
>::iterator iter
;
5224 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab
, decl
, tree
, iter
)
5225 nvptx_record_fndecl (decl
);
5226 fputs (func_decls
.str().c_str(), asm_out_file
);
5228 if (oacc_bcast_size
)
5229 write_shared_buffer (asm_out_file
, oacc_bcast_sym
,
5230 oacc_bcast_align
, oacc_bcast_size
);
5232 if (worker_red_size
)
5233 write_shared_buffer (asm_out_file
, worker_red_sym
,
5234 worker_red_align
, worker_red_size
);
5236 if (vector_red_size
)
5237 write_shared_buffer (asm_out_file
, vector_red_sym
,
5238 vector_red_align
, vector_red_size
);
5240 if (need_softstack_decl
)
5242 write_var_marker (asm_out_file
, false, true, "__nvptx_stacks");
5243 /* 32 is the maximum number of warps in a block. Even though it's an
5244 external declaration, emit the array size explicitly; otherwise, it
5245 may fail at PTX JIT time if the definition is later in link order. */
5246 fprintf (asm_out_file
, ".extern .shared .u%d __nvptx_stacks[32];\n",
5249 if (need_unisimt_decl
)
5251 write_var_marker (asm_out_file
, false, true, "__nvptx_uni");
5252 fprintf (asm_out_file
, ".extern .shared .u32 __nvptx_uni[32];\n");
5256 /* Expander for the shuffle builtins. */
5259 nvptx_expand_shuffle (tree exp
, rtx target
, machine_mode mode
, int ignore
)
5264 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5265 NULL_RTX
, mode
, EXPAND_NORMAL
);
5267 src
= copy_to_mode_reg (mode
, src
);
5269 rtx idx
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5270 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5271 rtx op
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5272 NULL_RTX
, SImode
, EXPAND_NORMAL
);
5274 if (!REG_P (idx
) && GET_CODE (idx
) != CONST_INT
)
5275 idx
= copy_to_mode_reg (SImode
, idx
);
5277 rtx pat
= nvptx_gen_shuffle (target
, src
, idx
,
5278 (nvptx_shuffle_kind
) INTVAL (op
));
5286 nvptx_output_red_partition (rtx dst
, rtx offset
)
5288 const char *zero_offset
= "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5289 const char *with_offset
= "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5291 if (offset
== const0_rtx
)
5292 fprintf (asm_out_file
, zero_offset
, REGNO (dst
),
5293 REGNO (cfun
->machine
->red_partition
));
5295 fprintf (asm_out_file
, with_offset
, REGNO (dst
),
5296 REGNO (cfun
->machine
->red_partition
), UINTVAL (offset
));
5301 /* Shared-memory reduction address expander. */
5304 nvptx_expand_shared_addr (tree exp
, rtx target
,
5305 machine_mode
ARG_UNUSED (mode
), int ignore
,
5311 unsigned align
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 2));
5312 unsigned offset
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 0));
5313 unsigned size
= TREE_INT_CST_LOW (CALL_EXPR_ARG (exp
, 1));
5314 rtx addr
= worker_red_sym
;
5320 populate_offload_attrs (&oa
);
5322 unsigned int psize
= ROUND_UP (size
+ offset
, align
);
5323 unsigned int pnum
= nvptx_mach_max_workers ();
5324 vector_red_partition
= MAX (vector_red_partition
, psize
);
5325 vector_red_size
= MAX (vector_red_size
, psize
* pnum
);
5326 vector_red_align
= MAX (vector_red_align
, align
);
5328 if (cfun
->machine
->red_partition
== NULL
)
5329 cfun
->machine
->red_partition
= gen_reg_rtx (Pmode
);
5331 addr
= gen_reg_rtx (Pmode
);
5332 emit_insn (gen_nvptx_red_partition (addr
, GEN_INT (offset
)));
5336 worker_red_align
= MAX (worker_red_align
, align
);
5337 worker_red_size
= MAX (worker_red_size
, size
+ offset
);
5341 addr
= gen_rtx_PLUS (Pmode
, addr
, GEN_INT (offset
));
5342 addr
= gen_rtx_CONST (Pmode
, addr
);
5346 emit_move_insn (target
, addr
);
5350 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5351 not require taking the address of any object, other than the memory
5352 cell being operated on. */
5355 nvptx_expand_cmp_swap (tree exp
, rtx target
,
5356 machine_mode
ARG_UNUSED (m
), int ARG_UNUSED (ignore
))
5358 machine_mode mode
= TYPE_MODE (TREE_TYPE (exp
));
5361 target
= gen_reg_rtx (mode
);
5363 rtx mem
= expand_expr (CALL_EXPR_ARG (exp
, 0),
5364 NULL_RTX
, Pmode
, EXPAND_NORMAL
);
5365 rtx cmp
= expand_expr (CALL_EXPR_ARG (exp
, 1),
5366 NULL_RTX
, mode
, EXPAND_NORMAL
);
5367 rtx src
= expand_expr (CALL_EXPR_ARG (exp
, 2),
5368 NULL_RTX
, mode
, EXPAND_NORMAL
);
5371 mem
= gen_rtx_MEM (mode
, mem
);
5373 cmp
= copy_to_mode_reg (mode
, cmp
);
5375 src
= copy_to_mode_reg (mode
, src
);
5378 pat
= gen_atomic_compare_and_swapsi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5380 pat
= gen_atomic_compare_and_swapdi_1 (target
, mem
, cmp
, src
, const0_rtx
);
5388 /* Codes for all the NVPTX builtins. */
5391 NVPTX_BUILTIN_SHUFFLE
,
5392 NVPTX_BUILTIN_SHUFFLELL
,
5393 NVPTX_BUILTIN_WORKER_ADDR
,
5394 NVPTX_BUILTIN_VECTOR_ADDR
,
5395 NVPTX_BUILTIN_CMP_SWAP
,
5396 NVPTX_BUILTIN_CMP_SWAPLL
,
5400 static GTY(()) tree nvptx_builtin_decls
[NVPTX_BUILTIN_MAX
];
5402 /* Return the NVPTX builtin for CODE. */
5405 nvptx_builtin_decl (unsigned code
, bool ARG_UNUSED (initialize_p
))
5407 if (code
>= NVPTX_BUILTIN_MAX
)
5408 return error_mark_node
;
5410 return nvptx_builtin_decls
[code
];
5413 /* Set up all builtin functions for this target. */
5416 nvptx_init_builtins (void)
5418 #define DEF(ID, NAME, T) \
5419 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5420 = add_builtin_function ("__builtin_nvptx_" NAME, \
5421 build_function_type_list T, \
5422 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5424 #define UINT unsigned_type_node
5425 #define LLUINT long_long_unsigned_type_node
5426 #define PTRVOID ptr_type_node
5428 DEF (SHUFFLE
, "shuffle", (UINT
, UINT
, UINT
, UINT
, NULL_TREE
));
5429 DEF (SHUFFLELL
, "shufflell", (LLUINT
, LLUINT
, UINT
, UINT
, NULL_TREE
));
5430 DEF (WORKER_ADDR
, "worker_addr",
5431 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
5432 DEF (VECTOR_ADDR
, "vector_addr",
5433 (PTRVOID
, ST
, UINT
, UINT
, NULL_TREE
));
5434 DEF (CMP_SWAP
, "cmp_swap", (UINT
, PTRVOID
, UINT
, UINT
, NULL_TREE
));
5435 DEF (CMP_SWAPLL
, "cmp_swapll", (LLUINT
, PTRVOID
, LLUINT
, LLUINT
, NULL_TREE
));
5444 /* Expand an expression EXP that calls a built-in function,
5445 with result going to TARGET if that's convenient
5446 (and in mode MODE if that's convenient).
5447 SUBTARGET may be used as the target for computing one of EXP's operands.
5448 IGNORE is nonzero if the value is to be ignored. */
5451 nvptx_expand_builtin (tree exp
, rtx target
, rtx
ARG_UNUSED (subtarget
),
5452 machine_mode mode
, int ignore
)
5454 tree fndecl
= TREE_OPERAND (CALL_EXPR_FN (exp
), 0);
5455 switch (DECL_FUNCTION_CODE (fndecl
))
5457 case NVPTX_BUILTIN_SHUFFLE
:
5458 case NVPTX_BUILTIN_SHUFFLELL
:
5459 return nvptx_expand_shuffle (exp
, target
, mode
, ignore
);
5461 case NVPTX_BUILTIN_WORKER_ADDR
:
5462 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, false);
5464 case NVPTX_BUILTIN_VECTOR_ADDR
:
5465 return nvptx_expand_shared_addr (exp
, target
, mode
, ignore
, true);
5467 case NVPTX_BUILTIN_CMP_SWAP
:
5468 case NVPTX_BUILTIN_CMP_SWAPLL
:
5469 return nvptx_expand_cmp_swap (exp
, target
, mode
, ignore
);
5471 default: gcc_unreachable ();
5475 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5480 return PTX_WARP_SIZE
;
5484 nvptx_welformed_vector_length_p (int l
)
5487 return l
% PTX_WARP_SIZE
== 0;
5491 nvptx_apply_dim_limits (int dims
[])
5493 /* Check that the vector_length is not too large. */
5494 if (dims
[GOMP_DIM_VECTOR
] > PTX_MAX_VECTOR_LENGTH
)
5495 dims
[GOMP_DIM_VECTOR
] = PTX_MAX_VECTOR_LENGTH
;
5497 /* Check that the number of workers is not too large. */
5498 if (dims
[GOMP_DIM_WORKER
] > PTX_WORKER_LENGTH
)
5499 dims
[GOMP_DIM_WORKER
] = PTX_WORKER_LENGTH
;
5501 /* Ensure that num_worker * vector_length <= cta size. */
5502 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
5503 && dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] > PTX_CTA_SIZE
)
5504 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5506 /* If we need a per-worker barrier ... . */
5507 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0
5508 && dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
5509 /* Don't use more barriers than available. */
5510 dims
[GOMP_DIM_WORKER
] = MIN (dims
[GOMP_DIM_WORKER
],
5511 PTX_NUM_PER_WORKER_BARRIERS
);
5514 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5517 has_vector_partitionable_routine_calls_p (tree fndecl
)
5523 FOR_EACH_BB_FN (bb
, DECL_STRUCT_FUNCTION (fndecl
))
5524 for (gimple_stmt_iterator i
= gsi_start_bb (bb
); !gsi_end_p (i
);
5525 gsi_next_nondebug (&i
))
5527 gimple
*stmt
= gsi_stmt (i
);
5528 if (gimple_code (stmt
) != GIMPLE_CALL
)
5531 tree callee
= gimple_call_fndecl (stmt
);
5535 tree attrs
= oacc_get_fn_attrib (callee
);
5536 if (attrs
== NULL_TREE
)
5539 int partition_level
= oacc_fn_attrib_level (attrs
);
5540 bool seq_routine_p
= partition_level
== GOMP_DIM_MAX
;
5548 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5549 DIMS has changed. */
5552 nvptx_goacc_validate_dims_1 (tree decl
, int dims
[], int fn_level
, unsigned used
)
5554 bool oacc_default_dims_p
= false;
5555 bool oacc_min_dims_p
= false;
5556 bool offload_region_p
= false;
5557 bool routine_p
= false;
5558 bool routine_seq_p
= false;
5559 int default_vector_length
= -1;
5561 if (decl
== NULL_TREE
)
5564 oacc_default_dims_p
= true;
5565 else if (fn_level
== -2)
5566 oacc_min_dims_p
= true;
5570 else if (fn_level
== -1)
5571 offload_region_p
= true;
5572 else if (0 <= fn_level
&& fn_level
<= GOMP_DIM_MAX
)
5575 routine_seq_p
= fn_level
== GOMP_DIM_MAX
;
5580 if (oacc_min_dims_p
)
5582 gcc_assert (dims
[GOMP_DIM_VECTOR
] == 1);
5583 gcc_assert (dims
[GOMP_DIM_WORKER
] == 1);
5584 gcc_assert (dims
[GOMP_DIM_GANG
] == 1);
5586 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5593 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5598 if (oacc_default_dims_p
)
5601 0 : set at runtime, f.i. -fopenacc-dims=-
5602 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5603 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5604 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5605 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5607 /* But -fopenacc-dims=- is not yet supported on trunk. */
5608 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5609 gcc_assert (dims
[GOMP_DIM_WORKER
] != 0);
5610 gcc_assert (dims
[GOMP_DIM_GANG
] != 0);
5613 if (offload_region_p
)
5616 0 : set using variable, f.i. num_gangs (n)
5617 >= 1: set using constant, f.i. num_gangs (1). */
5618 gcc_assert (dims
[GOMP_DIM_VECTOR
] >= -1);
5619 gcc_assert (dims
[GOMP_DIM_WORKER
] >= -1);
5620 gcc_assert (dims
[GOMP_DIM_GANG
] >= -1);
5623 if (offload_region_p
)
5624 default_vector_length
= oacc_get_default_dim (GOMP_DIM_VECTOR
);
5626 /* oacc_default_dims_p. */
5627 default_vector_length
= PTX_DEFAULT_VECTOR_LENGTH
;
5629 int old_dims
[GOMP_DIM_MAX
];
5631 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5632 old_dims
[i
] = dims
[i
];
5634 const char *vector_reason
= NULL
;
5635 if (offload_region_p
&& has_vector_partitionable_routine_calls_p (decl
))
5637 default_vector_length
= PTX_WARP_SIZE
;
5639 if (dims
[GOMP_DIM_VECTOR
] > PTX_WARP_SIZE
)
5641 vector_reason
= G_("using vector_length (%d) due to call to"
5642 " vector-partitionable routine, ignoring %d");
5643 dims
[GOMP_DIM_VECTOR
] = PTX_WARP_SIZE
;
5647 if (dims
[GOMP_DIM_VECTOR
] == 0)
5649 vector_reason
= G_("using vector_length (%d), ignoring runtime setting");
5650 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5653 if (dims
[GOMP_DIM_VECTOR
] > 0
5654 && !nvptx_welformed_vector_length_p (dims
[GOMP_DIM_VECTOR
]))
5655 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5657 nvptx_apply_dim_limits (dims
);
5659 if (dims
[GOMP_DIM_VECTOR
] != old_dims
[GOMP_DIM_VECTOR
])
5660 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5661 vector_reason
!= NULL
5663 : G_("using vector_length (%d), ignoring %d"),
5664 dims
[GOMP_DIM_VECTOR
], old_dims
[GOMP_DIM_VECTOR
]);
5666 if (dims
[GOMP_DIM_WORKER
] != old_dims
[GOMP_DIM_WORKER
])
5667 warning_at (decl
? DECL_SOURCE_LOCATION (decl
) : UNKNOWN_LOCATION
, 0,
5668 G_("using num_workers (%d), ignoring %d"),
5669 dims
[GOMP_DIM_WORKER
], old_dims
[GOMP_DIM_WORKER
]);
5671 if (oacc_default_dims_p
)
5673 if (dims
[GOMP_DIM_VECTOR
] < 0)
5674 dims
[GOMP_DIM_VECTOR
] = default_vector_length
;
5675 if (dims
[GOMP_DIM_WORKER
] < 0)
5676 dims
[GOMP_DIM_WORKER
] = PTX_DEFAULT_RUNTIME_DIM
;
5677 if (dims
[GOMP_DIM_GANG
] < 0)
5678 dims
[GOMP_DIM_GANG
] = PTX_DEFAULT_RUNTIME_DIM
;
5679 nvptx_apply_dim_limits (dims
);
5682 if (offload_region_p
)
5684 for (i
= 0; i
< GOMP_DIM_MAX
; i
++)
5689 if ((used
& GOMP_DIM_MASK (i
)) == 0)
5690 /* Function oacc_validate_dims will apply the minimal dimension. */
5693 dims
[i
] = (i
== GOMP_DIM_VECTOR
5694 ? default_vector_length
5695 : oacc_get_default_dim (i
));
5698 nvptx_apply_dim_limits (dims
);
5702 /* Validate compute dimensions of an OpenACC offload or routine, fill
5703 in non-unity defaults. FN_LEVEL indicates the level at which a
5704 routine might spawn a loop. It is negative for non-routines. If
5705 DECL is null, we are validating the default dimensions. */
5708 nvptx_goacc_validate_dims (tree decl
, int dims
[], int fn_level
, unsigned used
)
5710 int old_dims
[GOMP_DIM_MAX
];
5713 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5714 old_dims
[i
] = dims
[i
];
5716 nvptx_goacc_validate_dims_1 (decl
, dims
, fn_level
, used
);
5718 gcc_assert (dims
[GOMP_DIM_VECTOR
] != 0);
5719 if (dims
[GOMP_DIM_WORKER
] > 0 && dims
[GOMP_DIM_VECTOR
] > 0)
5720 gcc_assert (dims
[GOMP_DIM_WORKER
] * dims
[GOMP_DIM_VECTOR
] <= PTX_CTA_SIZE
);
5722 for (i
= 0; i
< GOMP_DIM_MAX
; ++i
)
5723 if (old_dims
[i
] != dims
[i
])
5729 /* Return maximum dimension size, or zero for unbounded. */
5732 nvptx_dim_limit (int axis
)
5736 case GOMP_DIM_VECTOR
:
5737 return PTX_MAX_VECTOR_LENGTH
;
5745 /* Determine whether fork & joins are needed. */
5748 nvptx_goacc_fork_join (gcall
*call
, const int dims
[],
5749 bool ARG_UNUSED (is_fork
))
5751 tree arg
= gimple_call_arg (call
, 2);
5752 unsigned axis
= TREE_INT_CST_LOW (arg
);
5754 /* We only care about worker and vector partitioning. */
5755 if (axis
< GOMP_DIM_WORKER
)
5758 /* If the size is 1, there's no partitioning. */
5759 if (dims
[axis
] == 1)
5765 /* Generate a PTX builtin function call that returns the address in
5766 the worker reduction buffer at OFFSET. TYPE is the type of the
5767 data at that location. */
5770 nvptx_get_shared_red_addr (tree type
, tree offset
, bool vector
)
5772 enum nvptx_builtins addr_dim
= NVPTX_BUILTIN_WORKER_ADDR
;
5774 addr_dim
= NVPTX_BUILTIN_VECTOR_ADDR
;
5775 machine_mode mode
= TYPE_MODE (type
);
5776 tree fndecl
= nvptx_builtin_decl (addr_dim
, true);
5777 tree size
= build_int_cst (unsigned_type_node
, GET_MODE_SIZE (mode
));
5778 tree align
= build_int_cst (unsigned_type_node
,
5779 GET_MODE_ALIGNMENT (mode
) / BITS_PER_UNIT
);
5780 tree call
= build_call_expr (fndecl
, 3, offset
, size
, align
);
5782 return fold_convert (build_pointer_type (type
), call
);
5785 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function
5786 will cast the variable if necessary. */
5789 nvptx_generate_vector_shuffle (location_t loc
,
5790 tree dest_var
, tree var
, unsigned shift
,
5793 unsigned fn
= NVPTX_BUILTIN_SHUFFLE
;
5794 tree_code code
= NOP_EXPR
;
5795 tree arg_type
= unsigned_type_node
;
5796 tree var_type
= TREE_TYPE (var
);
5797 tree dest_type
= var_type
;
5799 if (TREE_CODE (var_type
) == COMPLEX_TYPE
)
5800 var_type
= TREE_TYPE (var_type
);
5802 if (TREE_CODE (var_type
) == REAL_TYPE
)
5803 code
= VIEW_CONVERT_EXPR
;
5805 if (TYPE_SIZE (var_type
)
5806 == TYPE_SIZE (long_long_unsigned_type_node
))
5808 fn
= NVPTX_BUILTIN_SHUFFLELL
;
5809 arg_type
= long_long_unsigned_type_node
;
5812 tree call
= nvptx_builtin_decl (fn
, true);
5813 tree bits
= build_int_cst (unsigned_type_node
, shift
);
5814 tree kind
= build_int_cst (unsigned_type_node
, SHUFFLE_DOWN
);
5817 if (var_type
!= dest_type
)
5819 /* Do real and imaginary parts separately. */
5820 tree real
= fold_build1 (REALPART_EXPR
, var_type
, var
);
5821 real
= fold_build1 (code
, arg_type
, real
);
5822 real
= build_call_expr_loc (loc
, call
, 3, real
, bits
, kind
);
5823 real
= fold_build1 (code
, var_type
, real
);
5825 tree imag
= fold_build1 (IMAGPART_EXPR
, var_type
, var
);
5826 imag
= fold_build1 (code
, arg_type
, imag
);
5827 imag
= build_call_expr_loc (loc
, call
, 3, imag
, bits
, kind
);
5828 imag
= fold_build1 (code
, var_type
, imag
);
5830 expr
= fold_build2 (COMPLEX_EXPR
, dest_type
, real
, imag
);
5834 expr
= fold_build1 (code
, arg_type
, var
);
5835 expr
= build_call_expr_loc (loc
, call
, 3, expr
, bits
, kind
);
5836 expr
= fold_build1 (code
, dest_type
, expr
);
5839 gimplify_assign (dest_var
, expr
, seq
);
5842 /* Lazily generate the global lock var decl and return its address. */
5845 nvptx_global_lock_addr ()
5847 tree v
= global_lock_var
;
5851 tree name
= get_identifier ("__reduction_lock");
5852 tree type
= build_qualified_type (unsigned_type_node
,
5853 TYPE_QUAL_VOLATILE
);
5854 v
= build_decl (BUILTINS_LOCATION
, VAR_DECL
, name
, type
);
5855 global_lock_var
= v
;
5856 DECL_ARTIFICIAL (v
) = 1;
5857 DECL_EXTERNAL (v
) = 1;
5858 TREE_STATIC (v
) = 1;
5859 TREE_PUBLIC (v
) = 1;
5861 mark_addressable (v
);
5862 mark_decl_referenced (v
);
5865 return build_fold_addr_expr (v
);
5868 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5869 GSI. We use a lockless scheme for nearly all case, which looks
5871 actual = initval(OP);
5874 write = guess OP myval;
5875 actual = cmp&swap (ptr, guess, write)
5876 } while (actual bit-different-to guess);
5879 This relies on a cmp&swap instruction, which is available for 32-
5880 and 64-bit types. Larger types must use a locking scheme. */
5883 nvptx_lockless_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5884 tree ptr
, tree var
, tree_code op
)
5886 unsigned fn
= NVPTX_BUILTIN_CMP_SWAP
;
5887 tree_code code
= NOP_EXPR
;
5888 tree arg_type
= unsigned_type_node
;
5889 tree var_type
= TREE_TYPE (var
);
5891 if (TREE_CODE (var_type
) == COMPLEX_TYPE
5892 || TREE_CODE (var_type
) == REAL_TYPE
)
5893 code
= VIEW_CONVERT_EXPR
;
5895 if (TYPE_SIZE (var_type
) == TYPE_SIZE (long_long_unsigned_type_node
))
5897 arg_type
= long_long_unsigned_type_node
;
5898 fn
= NVPTX_BUILTIN_CMP_SWAPLL
;
5901 tree swap_fn
= nvptx_builtin_decl (fn
, true);
5903 gimple_seq init_seq
= NULL
;
5904 tree init_var
= make_ssa_name (arg_type
);
5905 tree init_expr
= omp_reduction_init_op (loc
, op
, var_type
);
5906 init_expr
= fold_build1 (code
, arg_type
, init_expr
);
5907 gimplify_assign (init_var
, init_expr
, &init_seq
);
5908 gimple
*init_end
= gimple_seq_last (init_seq
);
5910 gsi_insert_seq_before (gsi
, init_seq
, GSI_SAME_STMT
);
5912 /* Split the block just after the init stmts. */
5913 basic_block pre_bb
= gsi_bb (*gsi
);
5914 edge pre_edge
= split_block (pre_bb
, init_end
);
5915 basic_block loop_bb
= pre_edge
->dest
;
5916 pre_bb
= pre_edge
->src
;
5917 /* Reset the iterator. */
5918 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5920 tree expect_var
= make_ssa_name (arg_type
);
5921 tree actual_var
= make_ssa_name (arg_type
);
5922 tree write_var
= make_ssa_name (arg_type
);
5924 /* Build and insert the reduction calculation. */
5925 gimple_seq red_seq
= NULL
;
5926 tree write_expr
= fold_build1 (code
, var_type
, expect_var
);
5927 write_expr
= fold_build2 (op
, var_type
, write_expr
, var
);
5928 write_expr
= fold_build1 (code
, arg_type
, write_expr
);
5929 gimplify_assign (write_var
, write_expr
, &red_seq
);
5931 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
5933 /* Build & insert the cmp&swap sequence. */
5934 gimple_seq latch_seq
= NULL
;
5935 tree swap_expr
= build_call_expr_loc (loc
, swap_fn
, 3,
5936 ptr
, expect_var
, write_var
);
5937 gimplify_assign (actual_var
, swap_expr
, &latch_seq
);
5939 gcond
*cond
= gimple_build_cond (EQ_EXPR
, actual_var
, expect_var
,
5940 NULL_TREE
, NULL_TREE
);
5941 gimple_seq_add_stmt (&latch_seq
, cond
);
5943 gimple
*latch_end
= gimple_seq_last (latch_seq
);
5944 gsi_insert_seq_before (gsi
, latch_seq
, GSI_SAME_STMT
);
5946 /* Split the block just after the latch stmts. */
5947 edge post_edge
= split_block (loop_bb
, latch_end
);
5948 basic_block post_bb
= post_edge
->dest
;
5949 loop_bb
= post_edge
->src
;
5950 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
5952 post_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
5953 post_edge
->probability
= profile_probability::even ();
5954 edge loop_edge
= make_edge (loop_bb
, loop_bb
, EDGE_FALSE_VALUE
);
5955 loop_edge
->probability
= profile_probability::even ();
5956 set_immediate_dominator (CDI_DOMINATORS
, loop_bb
, pre_bb
);
5957 set_immediate_dominator (CDI_DOMINATORS
, post_bb
, loop_bb
);
5959 gphi
*phi
= create_phi_node (expect_var
, loop_bb
);
5960 add_phi_arg (phi
, init_var
, pre_edge
, loc
);
5961 add_phi_arg (phi
, actual_var
, loop_edge
, loc
);
5963 loop
*loop
= alloc_loop ();
5964 loop
->header
= loop_bb
;
5965 loop
->latch
= loop_bb
;
5966 add_loop (loop
, loop_bb
->loop_father
);
5968 return fold_build1 (code
, var_type
, write_var
);
5971 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5972 GSI. This is necessary for types larger than 64 bits, where there
5973 is no cmp&swap instruction to implement a lockless scheme. We use
5974 a lock variable in global memory.
5976 while (cmp&swap (&lock_var, 0, 1))
5979 accum = accum OP var;
5981 cmp&swap (&lock_var, 1, 0);
5984 A lock in global memory is necessary to force execution engine
5985 descheduling and avoid resource starvation that can occur if the
5986 lock is in .shared memory. */
5989 nvptx_lockfull_update (location_t loc
, gimple_stmt_iterator
*gsi
,
5990 tree ptr
, tree var
, tree_code op
)
5992 tree var_type
= TREE_TYPE (var
);
5993 tree swap_fn
= nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP
, true);
5994 tree uns_unlocked
= build_int_cst (unsigned_type_node
, 0);
5995 tree uns_locked
= build_int_cst (unsigned_type_node
, 1);
5997 /* Split the block just before the gsi. Insert a gimple nop to make
5999 gimple
*nop
= gimple_build_nop ();
6000 gsi_insert_before (gsi
, nop
, GSI_SAME_STMT
);
6001 basic_block entry_bb
= gsi_bb (*gsi
);
6002 edge entry_edge
= split_block (entry_bb
, nop
);
6003 basic_block lock_bb
= entry_edge
->dest
;
6004 /* Reset the iterator. */
6005 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6007 /* Build and insert the locking sequence. */
6008 gimple_seq lock_seq
= NULL
;
6009 tree lock_var
= make_ssa_name (unsigned_type_node
);
6010 tree lock_expr
= nvptx_global_lock_addr ();
6011 lock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, lock_expr
,
6012 uns_unlocked
, uns_locked
);
6013 gimplify_assign (lock_var
, lock_expr
, &lock_seq
);
6014 gcond
*cond
= gimple_build_cond (EQ_EXPR
, lock_var
, uns_unlocked
,
6015 NULL_TREE
, NULL_TREE
);
6016 gimple_seq_add_stmt (&lock_seq
, cond
);
6017 gimple
*lock_end
= gimple_seq_last (lock_seq
);
6018 gsi_insert_seq_before (gsi
, lock_seq
, GSI_SAME_STMT
);
6020 /* Split the block just after the lock sequence. */
6021 edge locked_edge
= split_block (lock_bb
, lock_end
);
6022 basic_block update_bb
= locked_edge
->dest
;
6023 lock_bb
= locked_edge
->src
;
6024 *gsi
= gsi_for_stmt (gsi_stmt (*gsi
));
6026 /* Create the lock loop ... */
6027 locked_edge
->flags
^= EDGE_TRUE_VALUE
| EDGE_FALLTHRU
;
6028 locked_edge
->probability
= profile_probability::even ();
6029 edge loop_edge
= make_edge (lock_bb
, lock_bb
, EDGE_FALSE_VALUE
);
6030 loop_edge
->probability
= profile_probability::even ();
6031 set_immediate_dominator (CDI_DOMINATORS
, lock_bb
, entry_bb
);
6032 set_immediate_dominator (CDI_DOMINATORS
, update_bb
, lock_bb
);
6034 /* ... and the loop structure. */
6035 loop
*lock_loop
= alloc_loop ();
6036 lock_loop
->header
= lock_bb
;
6037 lock_loop
->latch
= lock_bb
;
6038 lock_loop
->nb_iterations_estimate
= 1;
6039 lock_loop
->any_estimate
= true;
6040 add_loop (lock_loop
, entry_bb
->loop_father
);
6042 /* Build and insert the reduction calculation. */
6043 gimple_seq red_seq
= NULL
;
6044 tree acc_in
= make_ssa_name (var_type
);
6045 tree ref_in
= build_simple_mem_ref (ptr
);
6046 TREE_THIS_VOLATILE (ref_in
) = 1;
6047 gimplify_assign (acc_in
, ref_in
, &red_seq
);
6049 tree acc_out
= make_ssa_name (var_type
);
6050 tree update_expr
= fold_build2 (op
, var_type
, ref_in
, var
);
6051 gimplify_assign (acc_out
, update_expr
, &red_seq
);
6053 tree ref_out
= build_simple_mem_ref (ptr
);
6054 TREE_THIS_VOLATILE (ref_out
) = 1;
6055 gimplify_assign (ref_out
, acc_out
, &red_seq
);
6057 gsi_insert_seq_before (gsi
, red_seq
, GSI_SAME_STMT
);
6059 /* Build & insert the unlock sequence. */
6060 gimple_seq unlock_seq
= NULL
;
6061 tree unlock_expr
= nvptx_global_lock_addr ();
6062 unlock_expr
= build_call_expr_loc (loc
, swap_fn
, 3, unlock_expr
,
6063 uns_locked
, uns_unlocked
);
6064 gimplify_and_add (unlock_expr
, &unlock_seq
);
6065 gsi_insert_seq_before (gsi
, unlock_seq
, GSI_SAME_STMT
);
/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  /* 32-bit and 64-bit quantities take the lockless (compare-and-swap
     based) path; any other size falls back to the global spin lock.  */
  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}
6089 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6092 nvptx_goacc_reduction_setup (gcall
*call
, offload_attrs
*oa
)
6094 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6095 tree lhs
= gimple_call_lhs (call
);
6096 tree var
= gimple_call_arg (call
, 2);
6097 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6098 gimple_seq seq
= NULL
;
6100 push_gimplify_context (true);
6102 if (level
!= GOMP_DIM_GANG
)
6104 /* Copy the receiver object. */
6105 tree ref_to_res
= gimple_call_arg (call
, 1);
6107 if (!integer_zerop (ref_to_res
))
6108 var
= build_simple_mem_ref (ref_to_res
);
6111 if (level
== GOMP_DIM_WORKER
6112 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6114 /* Store incoming value to worker reduction buffer. */
6115 tree offset
= gimple_call_arg (call
, 5);
6116 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6117 level
== GOMP_DIM_VECTOR
);
6118 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6120 gimplify_assign (ptr
, call
, &seq
);
6121 tree ref
= build_simple_mem_ref (ptr
);
6122 TREE_THIS_VOLATILE (ref
) = 1;
6123 gimplify_assign (ref
, var
, &seq
);
6127 gimplify_assign (lhs
, var
, &seq
);
6129 pop_gimplify_context (NULL
);
6130 gsi_replace_with_seq (&gsi
, seq
, true);
/* NVPTX implementation of GOACC_REDUCTION_INIT.  Emit code initializing
   the private reduction accumulator for CALL with the neutral value of
   the reduction operator (arg 4).  For a warp-sized vector reduction
   only lane 0 keeps the incoming value; the other lanes start from the
   operator's init value, which requires splitting the CFG around the
   call and merging via a phi.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);	/* Incoming reduction value.  */
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  /* Neutral element of RCODE, e.g. 0 for PLUS_EXPR.  */
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      if (lhs != NULL_TREE)
	gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}
6218 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6221 nvptx_goacc_reduction_fini (gcall
*call
, offload_attrs
*oa
)
6223 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6224 tree lhs
= gimple_call_lhs (call
);
6225 tree ref_to_res
= gimple_call_arg (call
, 1);
6226 tree var
= gimple_call_arg (call
, 2);
6227 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6229 = (enum tree_code
)TREE_INT_CST_LOW (gimple_call_arg (call
, 4));
6230 gimple_seq seq
= NULL
;
6231 tree r
= NULL_TREE
;;
6233 push_gimplify_context (true);
6235 if (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
== PTX_WARP_SIZE
)
6237 /* Emit binary shuffle tree. TODO. Emit this as an actual loop,
6238 but that requires a method of emitting a unified jump at the
6240 for (int shfl
= PTX_WARP_SIZE
/ 2; shfl
> 0; shfl
= shfl
>> 1)
6242 tree other_var
= make_ssa_name (TREE_TYPE (var
));
6243 nvptx_generate_vector_shuffle (gimple_location (call
),
6244 other_var
, var
, shfl
, &seq
);
6246 r
= make_ssa_name (TREE_TYPE (var
));
6247 gimplify_assign (r
, fold_build2 (op
, TREE_TYPE (var
),
6248 var
, other_var
), &seq
);
6254 tree accum
= NULL_TREE
;
6256 if (level
== GOMP_DIM_WORKER
|| level
== GOMP_DIM_VECTOR
)
6258 /* Get reduction buffer address. */
6259 tree offset
= gimple_call_arg (call
, 5);
6260 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6261 level
== GOMP_DIM_VECTOR
);
6262 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6264 gimplify_assign (ptr
, call
, &seq
);
6267 else if (integer_zerop (ref_to_res
))
6274 /* UPDATE the accumulator. */
6275 gsi_insert_seq_before (&gsi
, seq
, GSI_SAME_STMT
);
6277 r
= nvptx_reduction_update (gimple_location (call
), &gsi
,
6283 gimplify_assign (lhs
, r
, &seq
);
6284 pop_gimplify_context (NULL
);
6286 gsi_replace_with_seq (&gsi
, seq
, true);
6289 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6292 nvptx_goacc_reduction_teardown (gcall
*call
, offload_attrs
*oa
)
6294 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
6295 tree lhs
= gimple_call_lhs (call
);
6296 tree var
= gimple_call_arg (call
, 2);
6297 int level
= TREE_INT_CST_LOW (gimple_call_arg (call
, 3));
6298 gimple_seq seq
= NULL
;
6300 push_gimplify_context (true);
6301 if (level
== GOMP_DIM_WORKER
6302 || (level
== GOMP_DIM_VECTOR
&& oa
->vector_length
> PTX_WARP_SIZE
))
6304 /* Read the worker reduction buffer. */
6305 tree offset
= gimple_call_arg (call
, 5);
6306 tree call
= nvptx_get_shared_red_addr (TREE_TYPE (var
), offset
,
6307 level
== GOMP_DIM_VECTOR
);
6308 tree ptr
= make_ssa_name (TREE_TYPE (call
));
6310 gimplify_assign (ptr
, call
, &seq
);
6311 var
= build_simple_mem_ref (ptr
);
6312 TREE_THIS_VOLATILE (var
) = 1;
6315 if (level
!= GOMP_DIM_GANG
)
6317 /* Write to the receiver object. */
6318 tree ref_to_res
= gimple_call_arg (call
, 1);
6320 if (!integer_zerop (ref_to_res
))
6321 gimplify_assign (build_simple_mem_ref (ref_to_res
), var
, &seq
);
6325 gimplify_assign (lhs
, var
, &seq
);
6327 pop_gimplify_context (NULL
);
6329 gsi_replace_with_seq (&gsi
, seq
, true);
/* NVPTX reduction expander.  Implements TARGET_GOACC_REDUCTION:
   dispatch on the GOACC_REDUCTION variant encoded in argument 0 of
   CALL to the matching helper above.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  offload_attrs oa;

  populate_offload_attrs (&oa);

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call, &oa);
      break;

    default:
      gcc_unreachable ();
    }
}
/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Never force constants into
   memory on this target.  */

static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
			      rtx x ATTRIBUTE_UNUSED)
{
  return true;
}
6373 nvptx_vector_mode_supported (machine_mode mode
)
6375 return (mode
== V2SImode
6376 || mode
== V2DImode
);
/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}
6397 nvptx_data_alignment (const_tree type
, unsigned int basic_align
)
6399 if (TREE_CODE (type
) == INTEGER_TYPE
)
6401 unsigned HOST_WIDE_INT size
= tree_to_uhwi (TYPE_SIZE_UNIT (type
));
6402 if (size
== GET_MODE_SIZE (TImode
))
6403 return GET_MODE_BITSIZE (maybe_split_mode (TImode
));
/* Implement TARGET_MODES_TIEABLE_P.  No two modes are tieable on this
   target.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}
/* Implement TARGET_HARD_REGNO_NREGS.  Every value occupies a single
   (virtual) register.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}
/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  Mode changes on registers
   are never allowed.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}
/* Last function compiled; used to avoid redundant per-function resets.
   GC-rooted via GTY so the tree stays live across collections.  */

static GTY(()) tree nvptx_previous_fndecl;

/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function
   vector-reduction and broadcast partition bookkeeping when switching
   to a new function.  */

static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  nvptx_previous_fndecl = fndecl;
  vector_red_partition = 0;
  oacc_bcast_partition = 0;
}
/* Target hook vector.  Each pair undefines the default and installs the
   nvptx implementation; see gcc/target.def for hook documentation.  */

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

/* Argument passing and function values.  */
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

/* Assembly output.  */
#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

/* Builtins.  */
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

/* OpenACC offloading.  */
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

/* The hook vector consumed by the rest of the compiler.  */
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"