1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
4 Copyright (C) 2005-2021 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
29 #include "tree-pass.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
52 #include "stringpool.h"
58 /* Describe the OpenACC looping structure of a function. The entire
59 function is held in a 'NULL' loop. */
63 oacc_loop
*parent
; /* Containing loop. */
65 oacc_loop
*child
; /* First inner loop. */
67 oacc_loop
*sibling
; /* Next loop within same parent. */
69 location_t loc
; /* Location of the loop start. */
71 gcall
*marker
; /* Initial head marker. */
73 gcall
*heads
[GOMP_DIM_MAX
]; /* Head marker functions. */
74 gcall
*tails
[GOMP_DIM_MAX
]; /* Tail marker functions. */
76 tree routine
; /* Pseudo-loop enclosing a routine. */
78 unsigned mask
; /* Partitioning mask. */
79 unsigned e_mask
; /* Partitioning of element loops (when tiling). */
80 unsigned inner
; /* Partitioning of inner loops. */
81 unsigned flags
; /* Partitioning flags. */
82 vec
<gcall
*> ifns
; /* Contained loop abstraction functions. */
83 tree chunk_size
; /* Chunk size. */
84 gcall
*head_end
; /* Final marker of head sequence. */
87 /* Holds offload tables with decls. */
88 vec
<tree
, va_gc
> *offload_funcs
, *offload_vars
;
90 /* Return level at which oacc routine may spawn a partitioned loop, or
91 -1 if it is not a routine (i.e. is an offload fn). */
94 oacc_fn_attrib_level (tree attr
)
96 tree pos
= TREE_VALUE (attr
);
98 if (!TREE_PURPOSE (pos
))
102 for (ix
= 0; ix
!= GOMP_DIM_MAX
;
103 ix
++, pos
= TREE_CHAIN (pos
))
104 if (!integer_zerop (TREE_PURPOSE (pos
)))
110 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
111 adds their addresses and sizes to constructor-vector V_CTOR. */
114 add_decls_addresses_to_decl_constructor (vec
<tree
, va_gc
> *v_decls
,
115 vec
<constructor_elt
, va_gc
> *v_ctor
)
117 unsigned len
= vec_safe_length (v_decls
);
118 for (unsigned i
= 0; i
< len
; i
++)
120 tree it
= (*v_decls
)[i
];
121 bool is_var
= VAR_P (it
);
124 #ifdef ACCEL_COMPILER
125 && DECL_HAS_VALUE_EXPR_P (it
)
127 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it
));
129 /* See also omp_finish_file and output_offload_tables in lto-cgraph.c. */
130 if (!in_lto_p
&& !symtab_node::get (it
))
133 tree size
= NULL_TREE
;
135 size
= fold_convert (const_ptr_type_node
, DECL_SIZE_UNIT (it
));
139 addr
= build_fold_addr_expr (it
);
142 #ifdef ACCEL_COMPILER
143 /* For "omp declare target link" vars add address of the pointer to
144 the target table, instead of address of the var. */
145 tree value_expr
= DECL_VALUE_EXPR (it
);
146 tree link_ptr_decl
= TREE_OPERAND (value_expr
, 0);
147 varpool_node::finalize_decl (link_ptr_decl
);
148 addr
= build_fold_addr_expr (link_ptr_decl
);
150 addr
= build_fold_addr_expr (it
);
153 /* Most significant bit of the size marks "omp declare target link"
154 vars in host and target tables. */
155 unsigned HOST_WIDE_INT isize
= tree_to_uhwi (size
);
156 isize
|= 1ULL << (int_size_in_bytes (const_ptr_type_node
)
157 * BITS_PER_UNIT
- 1);
158 size
= wide_int_to_tree (const_ptr_type_node
, isize
);
161 CONSTRUCTOR_APPEND_ELT (v_ctor
, NULL_TREE
, addr
);
163 CONSTRUCTOR_APPEND_ELT (v_ctor
, NULL_TREE
, size
);
167 /* Return true if DECL is a function for which its references should be
171 omp_declare_target_fn_p (tree decl
)
173 return (TREE_CODE (decl
) == FUNCTION_DECL
174 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl
))
175 && !lookup_attribute ("omp declare target host",
176 DECL_ATTRIBUTES (decl
))
178 || oacc_get_fn_attrib (decl
) == NULL_TREE
));
181 /* Return true if DECL Is a variable for which its initializer references
182 should be analyzed. */
185 omp_declare_target_var_p (tree decl
)
188 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl
))
189 && !lookup_attribute ("omp declare target link",
190 DECL_ATTRIBUTES (decl
)));
193 /* Helper function for omp_discover_implicit_declare_target, called through
194 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
195 declare target to. */
198 omp_discover_declare_target_tgt_fn_r (tree
*tp
, int *walk_subtrees
, void *data
)
200 if (TREE_CODE (*tp
) == CALL_EXPR
201 && CALL_EXPR_FN (*tp
)
202 && TREE_CODE (CALL_EXPR_FN (*tp
)) == ADDR_EXPR
203 && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp
), 0)) == FUNCTION_DECL
204 && lookup_attribute ("omp declare variant base",
205 DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp
),
208 tree fn
= TREE_OPERAND (CALL_EXPR_FN (*tp
), 0);
209 for (tree attr
= DECL_ATTRIBUTES (fn
); attr
; attr
= TREE_CHAIN (attr
))
211 attr
= lookup_attribute ("omp declare variant base", attr
);
212 if (attr
== NULL_TREE
)
214 tree purpose
= TREE_PURPOSE (TREE_VALUE (attr
));
215 if (TREE_CODE (purpose
) == FUNCTION_DECL
)
216 omp_discover_declare_target_tgt_fn_r (&purpose
, walk_subtrees
, data
);
219 else if (TREE_CODE (*tp
) == FUNCTION_DECL
)
222 tree id
= get_identifier ("omp declare target");
223 symtab_node
*node
= symtab_node::get (*tp
);
226 while (node
->alias_target
227 && TREE_CODE (node
->alias_target
) == FUNCTION_DECL
)
229 if (!omp_declare_target_fn_p (node
->decl
)
230 && !lookup_attribute ("omp declare target host",
231 DECL_ATTRIBUTES (node
->decl
)))
233 node
->offloadable
= 1;
234 DECL_ATTRIBUTES (node
->decl
)
235 = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (node
->decl
));
237 node
= symtab_node::get (node
->alias_target
);
239 symtab_node
*new_node
= node
->ultimate_alias_target ();
240 decl
= new_node
->decl
;
241 while (node
!= new_node
)
243 if (!omp_declare_target_fn_p (node
->decl
)
244 && !lookup_attribute ("omp declare target host",
245 DECL_ATTRIBUTES (node
->decl
)))
247 node
->offloadable
= 1;
248 DECL_ATTRIBUTES (node
->decl
)
249 = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (node
->decl
));
251 gcc_assert (node
->alias
&& node
->analyzed
);
252 node
= node
->get_alias_target ();
254 node
->offloadable
= 1;
255 if (ENABLE_OFFLOADING
)
256 g
->have_offload
= true;
258 if (omp_declare_target_fn_p (decl
)
259 || lookup_attribute ("omp declare target host",
260 DECL_ATTRIBUTES (decl
)))
263 if (!DECL_EXTERNAL (decl
) && DECL_SAVED_TREE (decl
))
264 ((vec
<tree
> *) data
)->safe_push (decl
);
265 DECL_ATTRIBUTES (decl
) = tree_cons (id
, NULL_TREE
,
266 DECL_ATTRIBUTES (decl
));
268 else if (TYPE_P (*tp
))
270 /* else if (TREE_CODE (*tp) == OMP_TARGET)
272 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
273 if (OMP_DEVICE_ANCESTOR (dev))
279 /* Similarly, but ignore references outside of OMP_TARGET regions. */
282 omp_discover_declare_target_fn_r (tree
*tp
, int *walk_subtrees
, void *data
)
284 if (TREE_CODE (*tp
) == OMP_TARGET
)
286 /* And not OMP_DEVICE_ANCESTOR. */
287 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp
),
288 omp_discover_declare_target_tgt_fn_r
,
292 else if (TYPE_P (*tp
))
297 /* Helper function for omp_discover_implicit_declare_target, called through
298 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
299 declare target to. */
302 omp_discover_declare_target_var_r (tree
*tp
, int *walk_subtrees
, void *data
)
304 if (TREE_CODE (*tp
) == FUNCTION_DECL
)
305 return omp_discover_declare_target_tgt_fn_r (tp
, walk_subtrees
, data
);
307 && is_global_var (*tp
)
308 && !omp_declare_target_var_p (*tp
))
310 tree id
= get_identifier ("omp declare target");
311 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp
)))
313 error_at (DECL_SOURCE_LOCATION (*tp
),
314 "%qD specified both in declare target %<link%> and "
315 "implicitly in %<to%> clauses", *tp
);
316 DECL_ATTRIBUTES (*tp
)
317 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp
));
319 if (TREE_STATIC (*tp
) && lang_hooks
.decls
.omp_get_decl_init (*tp
))
320 ((vec
<tree
> *) data
)->safe_push (*tp
);
321 DECL_ATTRIBUTES (*tp
) = tree_cons (id
, NULL_TREE
, DECL_ATTRIBUTES (*tp
));
322 symtab_node
*node
= symtab_node::get (*tp
);
323 if (node
!= NULL
&& !node
->offloadable
)
325 node
->offloadable
= 1;
326 if (ENABLE_OFFLOADING
)
328 g
->have_offload
= true;
329 if (is_a
<varpool_node
*> (node
))
330 vec_safe_push (offload_vars
, node
->decl
);
334 else if (TYPE_P (*tp
))
339 /* Perform the OpenMP implicit declare target to discovery. */
342 omp_discover_implicit_declare_target (void)
346 auto_vec
<tree
> worklist
;
348 FOR_EACH_DEFINED_FUNCTION (node
)
349 if (DECL_SAVED_TREE (node
->decl
))
351 struct cgraph_node
*cgn
;
352 if (omp_declare_target_fn_p (node
->decl
))
353 worklist
.safe_push (node
->decl
);
354 else if (DECL_STRUCT_FUNCTION (node
->decl
)
355 && DECL_STRUCT_FUNCTION (node
->decl
)->has_omp_target
)
356 worklist
.safe_push (node
->decl
);
357 for (cgn
= first_nested_function (node
);
358 cgn
; cgn
= next_nested_function (cgn
))
359 if (omp_declare_target_fn_p (cgn
->decl
))
360 worklist
.safe_push (cgn
->decl
);
361 else if (DECL_STRUCT_FUNCTION (cgn
->decl
)
362 && DECL_STRUCT_FUNCTION (cgn
->decl
)->has_omp_target
)
363 worklist
.safe_push (cgn
->decl
);
365 FOR_EACH_VARIABLE (vnode
)
366 if (lang_hooks
.decls
.omp_get_decl_init (vnode
->decl
)
367 && omp_declare_target_var_p (vnode
->decl
))
368 worklist
.safe_push (vnode
->decl
);
369 while (!worklist
.is_empty ())
371 tree decl
= worklist
.pop ();
373 walk_tree_without_duplicates (lang_hooks
.decls
.omp_get_decl_init (decl
),
374 omp_discover_declare_target_var_r
,
376 else if (omp_declare_target_fn_p (decl
))
377 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl
),
378 omp_discover_declare_target_tgt_fn_r
,
381 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl
),
382 omp_discover_declare_target_fn_r
,
386 lang_hooks
.decls
.omp_finish_decl_inits ();
390 /* Create new symbols containing (address, size) pairs for global variables,
391 marked with "omp declare target" attribute, as well as addresses for the
392 functions, which are outlined offloading regions. */
394 omp_finish_file (void)
396 unsigned num_funcs
= vec_safe_length (offload_funcs
);
397 unsigned num_vars
= vec_safe_length (offload_vars
);
399 if (num_funcs
== 0 && num_vars
== 0)
402 if (targetm_common
.have_named_sections
)
404 vec
<constructor_elt
, va_gc
> *v_f
, *v_v
;
405 vec_alloc (v_f
, num_funcs
);
406 vec_alloc (v_v
, num_vars
* 2);
408 add_decls_addresses_to_decl_constructor (offload_funcs
, v_f
);
409 add_decls_addresses_to_decl_constructor (offload_vars
, v_v
);
411 tree vars_decl_type
= build_array_type_nelts (pointer_sized_int_node
,
412 vec_safe_length (v_v
));
413 tree funcs_decl_type
= build_array_type_nelts (pointer_sized_int_node
,
415 SET_TYPE_ALIGN (vars_decl_type
, TYPE_ALIGN (pointer_sized_int_node
));
416 SET_TYPE_ALIGN (funcs_decl_type
, TYPE_ALIGN (pointer_sized_int_node
));
417 tree ctor_v
= build_constructor (vars_decl_type
, v_v
);
418 tree ctor_f
= build_constructor (funcs_decl_type
, v_f
);
419 TREE_CONSTANT (ctor_v
) = TREE_CONSTANT (ctor_f
) = 1;
420 TREE_STATIC (ctor_v
) = TREE_STATIC (ctor_f
) = 1;
421 tree funcs_decl
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
422 get_identifier (".offload_func_table"),
424 tree vars_decl
= build_decl (UNKNOWN_LOCATION
, VAR_DECL
,
425 get_identifier (".offload_var_table"),
427 TREE_STATIC (funcs_decl
) = TREE_STATIC (vars_decl
) = 1;
428 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
429 otherwise a joint table in a binary will contain padding between
430 tables from multiple object files. */
431 DECL_USER_ALIGN (funcs_decl
) = DECL_USER_ALIGN (vars_decl
) = 1;
432 SET_DECL_ALIGN (funcs_decl
, TYPE_ALIGN (funcs_decl_type
));
433 SET_DECL_ALIGN (vars_decl
, TYPE_ALIGN (vars_decl_type
));
434 DECL_INITIAL (funcs_decl
) = ctor_f
;
435 DECL_INITIAL (vars_decl
) = ctor_v
;
436 set_decl_section_name (funcs_decl
, OFFLOAD_FUNC_TABLE_SECTION_NAME
);
437 set_decl_section_name (vars_decl
, OFFLOAD_VAR_TABLE_SECTION_NAME
);
439 varpool_node::finalize_decl (vars_decl
);
440 varpool_node::finalize_decl (funcs_decl
);
444 for (unsigned i
= 0; i
< num_funcs
; i
++)
446 tree it
= (*offload_funcs
)[i
];
447 /* See also add_decls_addresses_to_decl_constructor
448 and output_offload_tables in lto-cgraph.c. */
449 if (!in_lto_p
&& !symtab_node::get (it
))
451 targetm
.record_offload_symbol (it
);
453 for (unsigned i
= 0; i
< num_vars
; i
++)
455 tree it
= (*offload_vars
)[i
];
456 if (!in_lto_p
&& !symtab_node::get (it
))
458 #ifdef ACCEL_COMPILER
459 if (DECL_HAS_VALUE_EXPR_P (it
)
460 && lookup_attribute ("omp declare target link",
461 DECL_ATTRIBUTES (it
)))
463 tree value_expr
= DECL_VALUE_EXPR (it
);
464 tree link_ptr_decl
= TREE_OPERAND (value_expr
, 0);
465 targetm
.record_offload_symbol (link_ptr_decl
);
466 varpool_node::finalize_decl (link_ptr_decl
);
470 targetm
.record_offload_symbol (it
);
475 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
476 axis DIM. Return a tmp var holding the result. */
479 oacc_dim_call (bool pos
, int dim
, gimple_seq
*seq
)
481 tree arg
= build_int_cst (unsigned_type_node
, dim
);
482 tree size
= create_tmp_var (integer_type_node
);
483 enum internal_fn fn
= pos
? IFN_GOACC_DIM_POS
: IFN_GOACC_DIM_SIZE
;
484 gimple
*call
= gimple_build_call_internal (fn
, 1, arg
);
486 gimple_call_set_lhs (call
, size
);
487 gimple_seq_add_stmt (seq
, call
);
492 /* Find the number of threads (POS = false), or thread number (POS =
493 true) for an OpenACC region partitioned as MASK. Setup code
494 required for the calculation is added to SEQ. */
497 oacc_thread_numbers (bool pos
, int mask
, gimple_seq
*seq
)
499 tree res
= pos
? NULL_TREE
: build_int_cst (unsigned_type_node
, 1);
502 /* Start at gang level, and examine relevant dimension indices. */
503 for (ix
= GOMP_DIM_GANG
; ix
!= GOMP_DIM_MAX
; ix
++)
504 if (GOMP_DIM_MASK (ix
) & mask
)
508 /* We had an outer index, so scale that by the size of
510 tree n
= oacc_dim_call (false, ix
, seq
);
511 res
= fold_build2 (MULT_EXPR
, integer_type_node
, res
, n
);
515 /* Determine index in this dimension. */
516 tree id
= oacc_dim_call (true, ix
, seq
);
518 res
= fold_build2 (PLUS_EXPR
, integer_type_node
, res
, id
);
524 if (res
== NULL_TREE
)
525 res
= integer_zero_node
;
530 /* Transform IFN_GOACC_LOOP calls to actual code. See
531 expand_oacc_for for where these are generated. At the vector
532 level, we stride loops, such that each member of a warp will
533 operate on adjacent iterations. At the worker and gang level,
534 each gang/warp executes a set of contiguous iterations. Chunking
535 can override this such that each iteration engine executes a
536 contiguous chunk, and then moves on to stride to the next chunk. */
539 oacc_xform_loop (gcall
*call
)
541 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
542 enum ifn_goacc_loop_kind code
543 = (enum ifn_goacc_loop_kind
) TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
544 tree dir
= gimple_call_arg (call
, 1);
545 tree range
= gimple_call_arg (call
, 2);
546 tree step
= gimple_call_arg (call
, 3);
547 tree chunk_size
= NULL_TREE
;
548 unsigned mask
= (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call
, 5));
549 tree lhs
= gimple_call_lhs (call
);
550 tree type
= NULL_TREE
;
551 tree diff_type
= TREE_TYPE (range
);
553 gimple_seq seq
= NULL
;
554 bool chunking
= false, striding
= true;
555 unsigned outer_mask
= mask
& (~mask
+ 1); // Outermost partitioning
556 unsigned inner_mask
= mask
& ~outer_mask
; // Inner partitioning (if any)
558 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
561 gsi_replace_with_seq (&gsi
, seq
, true);
565 type
= TREE_TYPE (lhs
);
567 #ifdef ACCEL_COMPILER
568 chunk_size
= gimple_call_arg (call
, 4);
569 if (integer_minus_onep (chunk_size
) /* Force static allocation. */
570 || integer_zerop (chunk_size
)) /* Default (also static). */
572 /* If we're at the gang level, we want each to execute a
573 contiguous run of iterations. Otherwise we want each element
575 striding
= !(outer_mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
));
580 /* Chunk of size 1 is striding. */
581 striding
= integer_onep (chunk_size
);
582 chunking
= !striding
;
586 /* striding=true, chunking=true
588 striding=true, chunking=false
590 striding=false,chunking=true
591 -> chunks=ceil (range/(chunksize*threads*step))
592 striding=false,chunking=false
593 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
594 push_gimplify_context (true);
598 default: gcc_unreachable ();
600 case IFN_GOACC_LOOP_CHUNKS
:
602 r
= build_int_cst (type
, 1);
606 = (range - dir) / (chunks * step * num_threads) + dir */
607 tree per
= oacc_thread_numbers (false, mask
, &seq
);
608 per
= fold_convert (type
, per
);
609 chunk_size
= fold_convert (type
, chunk_size
);
610 per
= fold_build2 (MULT_EXPR
, type
, per
, chunk_size
);
611 per
= fold_build2 (MULT_EXPR
, type
, per
, step
);
612 r
= build2 (MINUS_EXPR
, type
, range
, dir
);
613 r
= build2 (PLUS_EXPR
, type
, r
, per
);
614 r
= build2 (TRUNC_DIV_EXPR
, type
, r
, per
);
618 case IFN_GOACC_LOOP_STEP
:
620 /* If striding, step by the entire compute volume, otherwise
621 step by the inner volume. */
622 unsigned volume
= striding
? mask
: inner_mask
;
624 r
= oacc_thread_numbers (false, volume
, &seq
);
625 r
= build2 (MULT_EXPR
, type
, fold_convert (type
, r
), step
);
629 case IFN_GOACC_LOOP_OFFSET
:
630 /* Enable vectorization on non-SIMT targets. */
632 && outer_mask
== GOMP_DIM_MASK (GOMP_DIM_VECTOR
)
633 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
635 && (flag_tree_loop_vectorize
636 || !global_options_set
.x_flag_tree_loop_vectorize
))
638 basic_block bb
= gsi_bb (gsi
);
639 class loop
*parent
= bb
->loop_father
;
640 class loop
*body
= parent
->inner
;
642 parent
->force_vectorize
= true;
643 parent
->safelen
= INT_MAX
;
645 /* "Chunking loops" may have inner loops. */
648 body
->force_vectorize
= true;
649 body
->safelen
= INT_MAX
;
652 cfun
->has_force_vectorize_loops
= true;
656 r
= oacc_thread_numbers (true, mask
, &seq
);
657 r
= fold_convert (diff_type
, r
);
661 tree inner_size
= oacc_thread_numbers (false, inner_mask
, &seq
);
662 tree outer_size
= oacc_thread_numbers (false, outer_mask
, &seq
);
663 tree volume
= fold_build2 (MULT_EXPR
, TREE_TYPE (inner_size
),
664 inner_size
, outer_size
);
666 volume
= fold_convert (diff_type
, volume
);
668 chunk_size
= fold_convert (diff_type
, chunk_size
);
671 tree per
= fold_build2 (MULT_EXPR
, diff_type
, volume
, step
);
673 chunk_size
= build2 (MINUS_EXPR
, diff_type
, range
, dir
);
674 chunk_size
= build2 (PLUS_EXPR
, diff_type
, chunk_size
, per
);
675 chunk_size
= build2 (TRUNC_DIV_EXPR
, diff_type
, chunk_size
, per
);
678 tree span
= build2 (MULT_EXPR
, diff_type
, chunk_size
,
679 fold_convert (diff_type
, inner_size
));
680 r
= oacc_thread_numbers (true, outer_mask
, &seq
);
681 r
= fold_convert (diff_type
, r
);
682 r
= build2 (MULT_EXPR
, diff_type
, r
, span
);
684 tree inner
= oacc_thread_numbers (true, inner_mask
, &seq
);
685 inner
= fold_convert (diff_type
, inner
);
686 r
= fold_build2 (PLUS_EXPR
, diff_type
, r
, inner
);
690 tree chunk
= fold_convert (diff_type
, gimple_call_arg (call
, 6));
692 = fold_build2 (MULT_EXPR
, diff_type
, volume
, chunk_size
);
693 per
= build2 (MULT_EXPR
, diff_type
, per
, chunk
);
695 r
= build2 (PLUS_EXPR
, diff_type
, r
, per
);
698 r
= fold_build2 (MULT_EXPR
, diff_type
, r
, step
);
699 if (type
!= diff_type
)
700 r
= fold_convert (type
, r
);
703 case IFN_GOACC_LOOP_BOUND
:
708 tree inner_size
= oacc_thread_numbers (false, inner_mask
, &seq
);
709 tree outer_size
= oacc_thread_numbers (false, outer_mask
, &seq
);
710 tree volume
= fold_build2 (MULT_EXPR
, TREE_TYPE (inner_size
),
711 inner_size
, outer_size
);
713 volume
= fold_convert (diff_type
, volume
);
715 chunk_size
= fold_convert (diff_type
, chunk_size
);
718 tree per
= fold_build2 (MULT_EXPR
, diff_type
, volume
, step
);
720 chunk_size
= build2 (MINUS_EXPR
, diff_type
, range
, dir
);
721 chunk_size
= build2 (PLUS_EXPR
, diff_type
, chunk_size
, per
);
722 chunk_size
= build2 (TRUNC_DIV_EXPR
, diff_type
, chunk_size
, per
);
725 tree span
= build2 (MULT_EXPR
, diff_type
, chunk_size
,
726 fold_convert (diff_type
, inner_size
));
728 r
= fold_build2 (MULT_EXPR
, diff_type
, span
, step
);
730 tree offset
= gimple_call_arg (call
, 6);
731 r
= build2 (PLUS_EXPR
, diff_type
, r
,
732 fold_convert (diff_type
, offset
));
733 r
= build2 (integer_onep (dir
) ? MIN_EXPR
: MAX_EXPR
,
734 diff_type
, r
, range
);
736 if (diff_type
!= type
)
737 r
= fold_convert (type
, r
);
741 gimplify_assign (lhs
, r
, &seq
);
743 pop_gimplify_context (NULL
);
745 gsi_replace_with_seq (&gsi
, seq
, true);
748 /* Transform a GOACC_TILE call. Determines the element loop span for
749 the specified loop of the nest. This is 1 if we're not tiling.
751 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
754 oacc_xform_tile (gcall
*call
)
756 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
757 unsigned collapse
= tree_to_uhwi (gimple_call_arg (call
, 0));
758 /* Inner loops have higher loop_nos. */
759 unsigned loop_no
= tree_to_uhwi (gimple_call_arg (call
, 1));
760 tree tile_size
= gimple_call_arg (call
, 2);
761 unsigned e_mask
= tree_to_uhwi (gimple_call_arg (call
, 4));
762 tree lhs
= gimple_call_lhs (call
);
763 tree type
= TREE_TYPE (lhs
);
764 gimple_seq seq
= NULL
;
765 tree span
= build_int_cst (type
, 1);
768 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR
)
769 | GOMP_DIM_MASK (GOMP_DIM_WORKER
))));
770 push_gimplify_context (!seen_error ());
772 #ifndef ACCEL_COMPILER
773 /* Partitioning disabled on host compilers. */
777 /* Not paritioning. */
778 span
= integer_one_node
;
779 else if (!integer_zerop (tile_size
))
780 /* User explicitly specified size. */
784 /* Pick a size based on the paritioning of the element loop and
785 the number of loop nests. */
786 tree first_size
= NULL_TREE
;
787 tree second_size
= NULL_TREE
;
789 if (e_mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
))
790 first_size
= oacc_dim_call (false, GOMP_DIM_VECTOR
, &seq
);
791 if (e_mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
))
792 second_size
= oacc_dim_call (false, GOMP_DIM_WORKER
, &seq
);
796 first_size
= second_size
;
797 second_size
= NULL_TREE
;
800 if (loop_no
+ 1 == collapse
)
803 if (!loop_no
&& second_size
)
804 span
= fold_build2 (MULT_EXPR
, TREE_TYPE (span
),
807 else if (loop_no
+ 2 == collapse
)
813 /* There's no obvious element size for this loop. Options
814 are 1, first_size or some non-unity constant (32 is my
815 favourite). We should gather some statistics. */
819 span
= fold_convert (type
, span
);
820 gimplify_assign (lhs
, span
, &seq
);
822 pop_gimplify_context (NULL
);
824 gsi_replace_with_seq (&gsi
, seq
, true);
827 /* Default partitioned and minimum partitioned dimensions. */
829 static int oacc_default_dims
[GOMP_DIM_MAX
];
830 static int oacc_min_dims
[GOMP_DIM_MAX
];
833 oacc_get_default_dim (int dim
)
835 gcc_assert (0 <= dim
&& dim
< GOMP_DIM_MAX
);
836 return oacc_default_dims
[dim
];
840 oacc_get_min_dim (int dim
)
842 gcc_assert (0 <= dim
&& dim
< GOMP_DIM_MAX
);
843 return oacc_min_dims
[dim
];
846 /* Parse the default dimension parameter. This is a set of
847 :-separated optional compute dimensions. Each specified dimension
848 is a positive integer. When device type support is added, it is
849 planned to be a comma separated list of such compute dimensions,
850 with all but the first prefixed by the colon-terminated device
854 oacc_parse_default_dims (const char *dims
)
858 for (ix
= GOMP_DIM_MAX
; ix
--;)
860 oacc_default_dims
[ix
] = -1;
861 oacc_min_dims
[ix
] = 1;
864 #ifndef ACCEL_COMPILER
865 /* Cannot be overridden on the host. */
870 const char *pos
= dims
;
872 for (ix
= 0; *pos
&& ix
!= GOMP_DIM_MAX
; ix
++)
887 val
= strtol (pos
, CONST_CAST (char **, &eptr
), 10);
888 if (errno
|| val
<= 0 || (int) val
!= val
)
891 oacc_default_dims
[ix
] = (int) val
;
897 error_at (UNKNOWN_LOCATION
,
898 "%<-fopenacc-dim%> operand is malformed at %qs", pos
);
902 /* Allow the backend to validate the dimensions. */
903 targetm
.goacc
.validate_dims (NULL_TREE
, oacc_default_dims
, -1, 0);
904 targetm
.goacc
.validate_dims (NULL_TREE
, oacc_min_dims
, -2, 0);
907 /* Validate and update the dimensions for offloaded FN. ATTRS is the
908 raw attribute. DIMS is an array of dimensions, which is filled in.
909 LEVEL is the partitioning level of a routine, or -1 for an offload
910 region itself. USED is the mask of partitioned execution in the
914 oacc_validate_dims (tree fn
, tree attrs
, int *dims
, int level
, unsigned used
)
916 tree purpose
[GOMP_DIM_MAX
];
918 tree pos
= TREE_VALUE (attrs
);
920 /* Make sure the attribute creator attached the dimension
924 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
926 purpose
[ix
] = TREE_PURPOSE (pos
);
927 tree val
= TREE_VALUE (pos
);
928 dims
[ix
] = val
? TREE_INT_CST_LOW (val
) : -1;
929 pos
= TREE_CHAIN (pos
);
933 #ifdef ACCEL_COMPILER
937 && warn_openacc_parallelism
938 && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn
)))
940 static char const *const axes
[] =
941 /* Must be kept in sync with GOMP_DIM enumeration. */
942 { "gang", "worker", "vector" };
943 for (ix
= level
>= 0 ? level
: 0; ix
!= GOMP_DIM_MAX
; ix
++)
945 ; /* Defaulting axis. */
946 else if ((used
& GOMP_DIM_MASK (ix
)) && dims
[ix
] == 1)
947 /* There is partitioned execution, but the user requested a
948 dimension size of 1. They're probably confused. */
949 warning_at (DECL_SOURCE_LOCATION (fn
), OPT_Wopenacc_parallelism
,
950 "region contains %s partitioned code but"
951 " is not %s partitioned", axes
[ix
], axes
[ix
]);
952 else if (!(used
& GOMP_DIM_MASK (ix
)) && dims
[ix
] != 1)
953 /* The dimension is explicitly partitioned to non-unity, but
954 no use is made within the region. */
955 warning_at (DECL_SOURCE_LOCATION (fn
), OPT_Wopenacc_parallelism
,
956 "region is %s partitioned but"
957 " does not contain %s partitioned code",
961 bool changed
= targetm
.goacc
.validate_dims (fn
, dims
, level
, used
);
963 /* Default anything left to 1 or a partitioned default. */
964 for (ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
967 /* The OpenACC spec says 'If the [num_gangs] clause is not
968 specified, an implementation-defined default will be used;
969 the default may depend on the code within the construct.'
970 (2.5.6). Thus an implementation is free to choose
971 non-unity default for a parallel region that doesn't have
972 any gang-partitioned loops. However, it appears that there
973 is a sufficient body of user code that expects non-gang
974 partitioned regions to not execute in gang-redundant mode.
975 So we (a) don't warn about the non-portability and (b) pick
976 the minimum permissible dimension size when there is no
977 partitioned execution. Otherwise we pick the global
978 default for the dimension, which the user can control. The
979 same wording and logic applies to num_workers and
980 vector_length, however the worker- or vector- single
981 execution doesn't have the same impact as gang-redundant
982 execution. (If the minimum gang-level partioning is not 1,
983 the target is probably too confusing.) */
984 dims
[ix
] = (used
& GOMP_DIM_MASK (ix
)
985 ? oacc_default_dims
[ix
] : oacc_min_dims
[ix
]);
991 /* Replace the attribute with new values. */
993 for (ix
= GOMP_DIM_MAX
; ix
--;)
994 pos
= tree_cons (purpose
[ix
],
995 build_int_cst (integer_type_node
, dims
[ix
]), pos
);
996 oacc_replace_fn_attrib (fn
, pos
);
1000 /* Create an empty OpenACC loop structure at LOC. */
1003 new_oacc_loop_raw (oacc_loop
*parent
, location_t loc
)
1005 oacc_loop
*loop
= XCNEW (oacc_loop
);
1007 loop
->parent
= parent
;
1011 loop
->sibling
= parent
->child
;
1012 parent
->child
= loop
;
1019 /* Create an outermost, dummy OpenACC loop for offloaded function
1023 new_oacc_loop_outer (tree decl
)
1025 return new_oacc_loop_raw (NULL
, DECL_SOURCE_LOCATION (decl
));
1028 /* Start a new OpenACC loop structure beginning at head marker HEAD.
1029 Link into PARENT loop. Return the new loop. */
1032 new_oacc_loop (oacc_loop
*parent
, gcall
*marker
)
1034 oacc_loop
*loop
= new_oacc_loop_raw (parent
, gimple_location (marker
));
1036 loop
->marker
= marker
;
1038 /* TODO: This is where device_type flattening would occur for the loop
1041 loop
->flags
= TREE_INT_CST_LOW (gimple_call_arg (marker
, 3));
1043 tree chunk_size
= integer_zero_node
;
1044 if (loop
->flags
& OLF_GANG_STATIC
)
1045 chunk_size
= gimple_call_arg (marker
, 4);
1046 loop
->chunk_size
= chunk_size
;
1051 /* Create a dummy loop encompassing a call to a openACC routine.
1052 Extract the routine's partitioning requirements. */
1055 new_oacc_loop_routine (oacc_loop
*parent
, gcall
*call
, tree decl
, tree attrs
)
1057 oacc_loop
*loop
= new_oacc_loop_raw (parent
, gimple_location (call
));
1058 int level
= oacc_fn_attrib_level (attrs
);
1060 gcc_assert (level
>= 0);
1062 loop
->marker
= call
;
1063 loop
->routine
= decl
;
1064 loop
->mask
= ((GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1)
1065 ^ (GOMP_DIM_MASK (level
) - 1));
1068 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1069 Return the parent loop. */
1072 finish_oacc_loop (oacc_loop
*loop
)
1074 /* If the loop has been collapsed, don't partition it. */
1075 if (loop
->ifns
.is_empty ())
1076 loop
->mask
= loop
->flags
= 0;
1077 return loop
->parent
;
1080 /* Free all OpenACC loop structures within LOOP (inclusive). */
1083 free_oacc_loop (oacc_loop
*loop
)
1086 free_oacc_loop (loop
->sibling
);
1088 free_oacc_loop (loop
->child
);
1090 loop
->ifns
.release ();
1094 /* Dump out the OpenACC loop head or tail beginning at FROM. */
1097 dump_oacc_loop_part (FILE *file
, gcall
*from
, int depth
,
1098 const char *title
, int level
)
1100 enum ifn_unique_kind kind
1101 = (enum ifn_unique_kind
) TREE_INT_CST_LOW (gimple_call_arg (from
, 0));
1103 fprintf (file
, "%*s%s-%d:\n", depth
* 2, "", title
, level
);
1104 for (gimple_stmt_iterator gsi
= gsi_for_stmt (from
);;)
1106 gimple
*stmt
= gsi_stmt (gsi
);
1108 if (gimple_call_internal_p (stmt
, IFN_UNIQUE
))
1110 enum ifn_unique_kind k
1111 = ((enum ifn_unique_kind
) TREE_INT_CST_LOW
1112 (gimple_call_arg (stmt
, 0)));
1114 if (k
== kind
&& stmt
!= from
)
1117 print_gimple_stmt (file
, stmt
, depth
* 2 + 2);
1120 while (gsi_end_p (gsi
))
1121 gsi
= gsi_start_bb (single_succ (gsi_bb (gsi
)));
1125 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1128 dump_oacc_loop (FILE *file
, oacc_loop
*loop
, int depth
)
1132 fprintf (file
, "%*sLoop %x(%x) %s:%u\n", depth
* 2, "",
1133 loop
->flags
, loop
->mask
,
1134 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
));
1137 print_gimple_stmt (file
, loop
->marker
, depth
* 2);
1140 fprintf (file
, "%*sRoutine %s:%u:%s\n",
1141 depth
* 2, "", DECL_SOURCE_FILE (loop
->routine
),
1142 DECL_SOURCE_LINE (loop
->routine
),
1143 IDENTIFIER_POINTER (DECL_NAME (loop
->routine
)));
1145 for (ix
= GOMP_DIM_GANG
; ix
!= GOMP_DIM_MAX
; ix
++)
1146 if (loop
->heads
[ix
])
1147 dump_oacc_loop_part (file
, loop
->heads
[ix
], depth
, "Head", ix
);
1148 for (ix
= GOMP_DIM_MAX
; ix
--;)
1149 if (loop
->tails
[ix
])
1150 dump_oacc_loop_part (file
, loop
->tails
[ix
], depth
, "Tail", ix
);
1153 dump_oacc_loop (file
, loop
->child
, depth
+ 1);
1155 dump_oacc_loop (file
, loop
->sibling
, depth
);
1158 void debug_oacc_loop (oacc_loop
*);
1160 /* Dump loops to stderr. */
1163 debug_oacc_loop (oacc_loop
*loop
)
1165 dump_oacc_loop (stderr
, loop
, 0);
1168 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1172 inform_oacc_loop (const oacc_loop
*loop
)
1175 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
) ? " gang" : "";
1177 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
) ? " worker" : "";
1179 = loop
->mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
) ? " vector" : "";
1180 const char *seq
= loop
->mask
== 0 ? " seq" : "";
1181 const dump_user_location_t loc
1182 = dump_user_location_t::from_location_t (loop
->loc
);
1183 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
, loc
,
1184 "assigned OpenACC%s%s%s%s loop parallelism\n", gang
, worker
,
1188 inform_oacc_loop (loop
->child
);
1190 inform_oacc_loop (loop
->sibling
);
1193 /* DFS walk of basic blocks BB onwards, creating OpenACC loop
1194 structures as we go. By construction these loops are properly
1198 oacc_loop_discover_walk (oacc_loop
*loop
, basic_block bb
)
1203 if (bb
->flags
& BB_VISITED
)
1207 bb
->flags
|= BB_VISITED
;
1209 /* Scan for loop markers. */
1210 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);
1213 gimple
*stmt
= gsi_stmt (gsi
);
1215 if (!is_gimple_call (stmt
))
1218 gcall
*call
= as_a
<gcall
*> (stmt
);
1220 /* If this is a routine, make a dummy loop for it. */
1221 if (tree decl
= gimple_call_fndecl (call
))
1222 if (tree attrs
= oacc_get_fn_attrib (decl
))
1224 gcc_assert (!marker
);
1225 new_oacc_loop_routine (loop
, call
, decl
, attrs
);
1228 if (!gimple_call_internal_p (call
))
1231 switch (gimple_call_internal_fn (call
))
1236 case IFN_GOACC_LOOP
:
1237 case IFN_GOACC_TILE
:
1238 /* Record the abstraction function, so we can manipulate it
1240 loop
->ifns
.safe_push (call
);
1244 enum ifn_unique_kind kind
1245 = (enum ifn_unique_kind
) (TREE_INT_CST_LOW
1246 (gimple_call_arg (call
, 0)));
1247 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
1248 || kind
== IFN_UNIQUE_OACC_TAIL_MARK
)
1250 if (gimple_call_num_args (call
) == 2)
1252 gcc_assert (marker
&& !remaining
);
1254 if (kind
== IFN_UNIQUE_OACC_TAIL_MARK
)
1255 loop
= finish_oacc_loop (loop
);
1257 loop
->head_end
= call
;
1261 int count
= TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
1265 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
)
1266 loop
= new_oacc_loop (loop
, call
);
1269 gcc_assert (count
== remaining
);
1273 if (kind
== IFN_UNIQUE_OACC_HEAD_MARK
)
1274 loop
->heads
[marker
] = call
;
1276 loop
->tails
[remaining
] = call
;
1283 if (remaining
|| marker
)
1285 bb
= single_succ (bb
);
1286 gcc_assert (single_pred_p (bb
) && !(bb
->flags
& BB_VISITED
));
1290 /* Walk successor blocks. */
1294 FOR_EACH_EDGE (e
, ei
, bb
->succs
)
1295 oacc_loop_discover_walk (loop
, e
->dest
);
1298 /* LOOP is the first sibling. Reverse the order in place and return
1299 the new first sibling. Recurse to child loops. */
1302 oacc_loop_sibling_nreverse (oacc_loop
*loop
)
1304 oacc_loop
*last
= NULL
;
1308 loop
->child
= oacc_loop_sibling_nreverse (loop
->child
);
1310 oacc_loop
*next
= loop
->sibling
;
1311 loop
->sibling
= last
;
1320 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1321 the current function. */
1324 oacc_loop_discovery ()
1326 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1327 in the following. */
1330 oacc_loop
*top
= new_oacc_loop_outer (current_function_decl
);
1331 oacc_loop_discover_walk (top
, ENTRY_BLOCK_PTR_FOR_FN (cfun
));
1333 /* The siblings were constructed in reverse order, reverse them so
1334 that diagnostics come out in an unsurprising order. */
1335 top
= oacc_loop_sibling_nreverse (top
);
1340 /* Transform the abstract internal function markers starting at FROM
1341 to be for partitioning level LEVEL. Stop when we meet another HEAD
1345 oacc_loop_xform_head_tail (gcall
*from
, int level
)
1347 enum ifn_unique_kind kind
1348 = (enum ifn_unique_kind
) TREE_INT_CST_LOW (gimple_call_arg (from
, 0));
1349 tree replacement
= build_int_cst (unsigned_type_node
, level
);
1351 for (gimple_stmt_iterator gsi
= gsi_for_stmt (from
);;)
1353 gimple
*stmt
= gsi_stmt (gsi
);
1355 if (gimple_call_internal_p (stmt
, IFN_UNIQUE
))
1357 enum ifn_unique_kind k
1358 = ((enum ifn_unique_kind
)
1359 TREE_INT_CST_LOW (gimple_call_arg (stmt
, 0)));
1361 if (k
== IFN_UNIQUE_OACC_FORK
1362 || k
== IFN_UNIQUE_OACC_JOIN
1363 || k
== IFN_UNIQUE_OACC_PRIVATE
)
1364 *gimple_call_arg_ptr (stmt
, 2) = replacement
;
1365 else if (k
== kind
&& stmt
!= from
)
1368 else if (gimple_call_internal_p (stmt
, IFN_GOACC_REDUCTION
))
1369 *gimple_call_arg_ptr (stmt
, 3) = replacement
;
1373 while (gsi_end_p (gsi
))
1374 gsi
= gsi_start_bb (single_succ (gsi_bb (gsi
)));
1378 /* Process the discovered OpenACC loops, setting the correct
1379 partitioning level etc. */
1382 oacc_loop_process (oacc_loop
*loop
)
1385 oacc_loop_process (loop
->child
);
1387 if (loop
->mask
&& !loop
->routine
)
1390 tree mask_arg
= build_int_cst (unsigned_type_node
, loop
->mask
);
1391 tree e_mask_arg
= build_int_cst (unsigned_type_node
, loop
->e_mask
);
1392 tree chunk_arg
= loop
->chunk_size
;
1395 for (ix
= 0; loop
->ifns
.iterate (ix
, &call
); ix
++)
1397 switch (gimple_call_internal_fn (call
))
1399 case IFN_GOACC_LOOP
:
1401 bool is_e
= gimple_call_arg (call
, 5) == integer_minus_one_node
;
1402 gimple_call_set_arg (call
, 5, is_e
? e_mask_arg
: mask_arg
);
1404 gimple_call_set_arg (call
, 4, chunk_arg
);
1408 case IFN_GOACC_TILE
:
1409 gimple_call_set_arg (call
, 3, mask_arg
);
1410 gimple_call_set_arg (call
, 4, e_mask_arg
);
1419 unsigned dim
= GOMP_DIM_GANG
;
1420 unsigned mask
= loop
->mask
| loop
->e_mask
;
1421 for (ix
= 0; ix
!= GOMP_DIM_MAX
&& mask
; ix
++)
1423 while (!(GOMP_DIM_MASK (dim
) & mask
))
1426 oacc_loop_xform_head_tail (loop
->heads
[ix
], dim
);
1427 oacc_loop_xform_head_tail (loop
->tails
[ix
], dim
);
1429 mask
^= GOMP_DIM_MASK (dim
);
1434 oacc_loop_process (loop
->sibling
);
1437 /* Walk the OpenACC loop heirarchy checking and assigning the
1438 programmer-specified partitionings. OUTER_MASK is the partitioning
1439 this loop is contained within. Return mask of partitioning
1440 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1444 oacc_loop_fixed_partitions (oacc_loop
*loop
, unsigned outer_mask
)
1446 unsigned this_mask
= loop
->mask
;
1447 unsigned mask_all
= 0;
1450 #ifdef ACCEL_COMPILER
1451 /* When device_type is supported, we want the device compiler to be
1452 noisy, if the loop parameters are device_type-specific. */
1458 bool auto_par
= (loop
->flags
& OLF_AUTO
) != 0;
1459 bool seq_par
= (loop
->flags
& OLF_SEQ
) != 0;
1460 bool tiling
= (loop
->flags
& OLF_TILE
) != 0;
1462 this_mask
= ((loop
->flags
>> OLF_DIM_BASE
)
1463 & (GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1));
1465 /* Apply auto partitioning if this is a non-partitioned regular
1466 loop, or (no more than) single axis tiled loop. */
1468 = !seq_par
&& this_mask
== (tiling
? this_mask
& -this_mask
: 0);
1470 if ((this_mask
!= 0) + auto_par
+ seq_par
> 1)
1473 error_at (loop
->loc
,
1475 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1476 : G_("%<auto%> conflicts with other OpenACC loop "
1479 loop
->flags
&= ~OLF_AUTO
;
1483 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX
) - 1) << OLF_DIM_BASE
);
1488 if (maybe_auto
&& (loop
->flags
& OLF_INDEPENDENT
))
1490 loop
->flags
|= OLF_AUTO
;
1491 mask_all
|= GOMP_DIM_MASK (GOMP_DIM_MAX
);
1495 if (this_mask
& outer_mask
)
1497 const oacc_loop
*outer
;
1498 for (outer
= loop
->parent
; outer
; outer
= outer
->parent
)
1499 if ((outer
->mask
| outer
->e_mask
) & this_mask
)
1506 error_at (loop
->loc
,
1508 ? G_("routine call uses same OpenACC parallelism"
1509 " as containing loop")
1510 : G_("inner loop uses same OpenACC parallelism"
1511 " as containing loop"));
1512 inform (outer
->loc
, "containing loop here");
1515 error_at (loop
->loc
,
1517 ? G_("routine call uses OpenACC parallelism disallowed"
1518 " by containing routine")
1519 : G_("loop uses OpenACC parallelism disallowed"
1520 " by containing routine"));
1523 inform (DECL_SOURCE_LOCATION (loop
->routine
),
1524 "routine %qD declared here", loop
->routine
);
1526 this_mask
&= ~outer_mask
;
1530 unsigned outermost
= least_bit_hwi (this_mask
);
1532 if (outermost
&& outermost
<= outer_mask
)
1536 error_at (loop
->loc
,
1537 "incorrectly nested OpenACC loop parallelism");
1539 const oacc_loop
*outer
;
1540 for (outer
= loop
->parent
;
1541 outer
->flags
&& outer
->flags
< outermost
;
1542 outer
= outer
->parent
)
1544 inform (outer
->loc
, "containing loop here");
1547 this_mask
&= ~outermost
;
1551 mask_all
|= this_mask
;
1553 if (loop
->flags
& OLF_TILE
)
1555 /* When tiling, vector goes to the element loop, and failing
1556 that we put worker there. The std doesn't contemplate
1557 specifying all three. We choose to put worker and vector on
1558 the element loops in that case. */
1559 unsigned this_e_mask
= this_mask
& GOMP_DIM_MASK (GOMP_DIM_VECTOR
);
1560 if (!this_e_mask
|| this_mask
& GOMP_DIM_MASK (GOMP_DIM_GANG
))
1561 this_e_mask
|= this_mask
& GOMP_DIM_MASK (GOMP_DIM_WORKER
);
1563 loop
->e_mask
= this_e_mask
;
1564 this_mask
^= this_e_mask
;
1567 loop
->mask
= this_mask
;
1570 fprintf (dump_file
, "Loop %s:%d user specified %d & %d\n",
1571 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
),
1572 loop
->mask
, loop
->e_mask
);
1576 unsigned tmp_mask
= outer_mask
| this_mask
| loop
->e_mask
;
1577 loop
->inner
= oacc_loop_fixed_partitions (loop
->child
, tmp_mask
);
1578 mask_all
|= loop
->inner
;
1582 mask_all
|= oacc_loop_fixed_partitions (loop
->sibling
, outer_mask
);
1587 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
1588 OUTER_MASK is the partitioning this loop is contained within.
1589 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1590 Return the cumulative partitioning used by this loop, siblings and
1594 oacc_loop_auto_partitions (oacc_loop
*loop
, unsigned outer_mask
,
1597 bool assign
= (loop
->flags
& OLF_AUTO
) && (loop
->flags
& OLF_INDEPENDENT
);
1599 bool tiling
= loop
->flags
& OLF_TILE
;
1601 #ifdef ACCEL_COMPILER
1602 /* When device_type is supported, we want the device compiler to be
1603 noisy, if the loop parameters are device_type-specific. */
1607 if (assign
&& (!outer_assign
|| loop
->inner
))
1609 /* Allocate outermost and non-innermost loops at the outermost
1610 non-innermost available level. */
1611 unsigned this_mask
= GOMP_DIM_MASK (GOMP_DIM_GANG
);
1613 /* Find the first outermost available partition. */
1614 while (this_mask
<= outer_mask
)
1617 /* Grab two axes if tiling, and we've not assigned anything */
1618 if (tiling
&& !(loop
->mask
| loop
->e_mask
))
1619 this_mask
|= this_mask
<< 1;
1621 /* Prohibit the innermost partitioning at the moment. */
1622 this_mask
&= GOMP_DIM_MASK (GOMP_DIM_MAX
- 1) - 1;
1624 /* Don't use any dimension explicitly claimed by an inner loop. */
1625 this_mask
&= ~loop
->inner
;
1627 if (tiling
&& !loop
->e_mask
)
1629 /* If we got two axes, allocate the inner one to the element
1631 loop
->e_mask
= this_mask
& (this_mask
<< 1);
1632 this_mask
^= loop
->e_mask
;
1635 loop
->mask
|= this_mask
;
1640 unsigned tmp_mask
= outer_mask
| loop
->mask
| loop
->e_mask
;
1641 loop
->inner
= oacc_loop_auto_partitions (loop
->child
, tmp_mask
,
1642 outer_assign
| assign
);
1645 if (assign
&& (!loop
->mask
|| (tiling
&& !loop
->e_mask
) || !outer_assign
))
1647 /* Allocate the loop at the innermost available level. Note
1648 that we do this even if we already assigned this loop the
1649 outermost available level above. That way we'll partition
1650 this along 2 axes, if they are available. */
1651 unsigned this_mask
= 0;
1653 /* Determine the outermost partitioning used within this loop. */
1654 this_mask
= loop
->inner
| GOMP_DIM_MASK (GOMP_DIM_MAX
);
1655 this_mask
= least_bit_hwi (this_mask
);
1657 /* Pick the partitioning just inside that one. */
1660 /* And avoid picking one use by an outer loop. */
1661 this_mask
&= ~outer_mask
;
1663 /* If tiling and we failed completely above, grab the next one
1664 too. Making sure it doesn't hit an outer loop. */
1667 this_mask
&= ~(loop
->e_mask
| loop
->mask
);
1668 unsigned tile_mask
= ((this_mask
>> 1)
1669 & ~(outer_mask
| loop
->e_mask
| loop
->mask
));
1671 if (tile_mask
|| loop
->mask
)
1673 loop
->e_mask
|= this_mask
;
1674 this_mask
= tile_mask
;
1676 if (!loop
->e_mask
&& noisy
)
1677 warning_at (loop
->loc
, 0,
1678 "insufficient partitioning available"
1679 " to parallelize element loop");
1682 loop
->mask
|= this_mask
;
1683 if (!loop
->mask
&& noisy
)
1684 warning_at (loop
->loc
, 0,
1686 ? G_("insufficient partitioning available"
1687 " to parallelize tile loop")
1688 : G_("insufficient partitioning available"
1689 " to parallelize loop"));
1692 if (assign
&& dump_file
)
1693 fprintf (dump_file
, "Auto loop %s:%d assigned %d & %d\n",
1694 LOCATION_FILE (loop
->loc
), LOCATION_LINE (loop
->loc
),
1695 loop
->mask
, loop
->e_mask
);
1697 unsigned inner_mask
= 0;
1700 inner_mask
|= oacc_loop_auto_partitions (loop
->sibling
,
1701 outer_mask
, outer_assign
);
1703 inner_mask
|= loop
->inner
| loop
->mask
| loop
->e_mask
;
1708 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1709 axes. Return mask of partitioning. */
1712 oacc_loop_partition (oacc_loop
*loop
, unsigned outer_mask
)
1714 unsigned mask_all
= oacc_loop_fixed_partitions (loop
, outer_mask
);
1716 if (mask_all
& GOMP_DIM_MASK (GOMP_DIM_MAX
))
1718 mask_all
^= GOMP_DIM_MASK (GOMP_DIM_MAX
);
1719 mask_all
|= oacc_loop_auto_partitions (loop
, outer_mask
, false);
1724 /* Default fork/join early expander. Delete the function calls if
1725 there is no RTL expander. */
1728 default_goacc_fork_join (gcall
*ARG_UNUSED (call
),
1729 const int *ARG_UNUSED (dims
), bool is_fork
)
1732 return targetm
.have_oacc_fork ();
1734 return targetm
.have_oacc_join ();
1737 /* Default goacc.reduction early expander.
1739 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1740 If RES_PTR is not integer-zerop:
1741 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1742 TEARDOWN - emit '*RES_PTR = VAR'
1747 default_goacc_reduction (gcall
*call
)
1749 unsigned code
= (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call
, 0));
1750 gimple_stmt_iterator gsi
= gsi_for_stmt (call
);
1751 tree lhs
= gimple_call_lhs (call
);
1752 tree var
= gimple_call_arg (call
, 2);
1753 gimple_seq seq
= NULL
;
1755 if (code
== IFN_GOACC_REDUCTION_SETUP
1756 || code
== IFN_GOACC_REDUCTION_TEARDOWN
)
1758 /* Setup and Teardown need to copy from/to the receiver object,
1760 tree ref_to_res
= gimple_call_arg (call
, 1);
1762 if (!integer_zerop (ref_to_res
))
1764 tree dst
= build_simple_mem_ref (ref_to_res
);
1767 if (code
== IFN_GOACC_REDUCTION_SETUP
)
1773 gimple_seq_add_stmt (&seq
, gimple_build_assign (dst
, src
));
1777 /* Copy VAR to LHS, if there is an LHS. */
1779 gimple_seq_add_stmt (&seq
, gimple_build_assign (lhs
, var
));
1781 gsi_replace_with_seq (&gsi
, seq
, true);
1784 struct var_decl_rewrite_info
1787 hash_map
<tree
, tree
> *adjusted_vars
;
1788 bool avoid_pointer_conversion
;
1792 /* Helper function for execute_oacc_device_lower. Rewrite VAR_DECLs (by
1793 themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
1794 the var_decl_rewrite_info pointed to via DATA. Used as part of coercing
1795 gang-private variables in OpenACC offload regions to reside in GPU shared
1799 oacc_rewrite_var_decl (tree
*tp
, int *walk_subtrees
, void *data
)
1801 walk_stmt_info
*wi
= (walk_stmt_info
*) data
;
1802 var_decl_rewrite_info
*info
= (var_decl_rewrite_info
*) wi
->info
;
1804 if (TREE_CODE (*tp
) == ADDR_EXPR
)
1806 tree arg
= TREE_OPERAND (*tp
, 0);
1807 tree
*new_arg
= info
->adjusted_vars
->get (arg
);
1811 if (info
->avoid_pointer_conversion
)
1813 *tp
= build_fold_addr_expr (*new_arg
);
1814 info
->modified
= true;
1819 gimple_stmt_iterator gsi
= gsi_for_stmt (info
->stmt
);
1820 tree repl
= build_fold_addr_expr (*new_arg
);
1822 = gimple_build_assign (make_ssa_name (TREE_TYPE (repl
)), repl
);
1823 tree conv
= convert_to_pointer (TREE_TYPE (*tp
),
1824 gimple_assign_lhs (stmt1
));
1826 = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp
)), conv
);
1827 gsi_insert_before (&gsi
, stmt1
, GSI_SAME_STMT
);
1828 gsi_insert_before (&gsi
, stmt2
, GSI_SAME_STMT
);
1829 *tp
= gimple_assign_lhs (stmt2
);
1830 info
->modified
= true;
1835 else if (TREE_CODE (*tp
) == COMPONENT_REF
|| TREE_CODE (*tp
) == ARRAY_REF
)
1837 tree
*base
= &TREE_OPERAND (*tp
, 0);
1839 while (TREE_CODE (*base
) == COMPONENT_REF
1840 || TREE_CODE (*base
) == ARRAY_REF
)
1841 base
= &TREE_OPERAND (*base
, 0);
1843 if (TREE_CODE (*base
) != VAR_DECL
)
1846 tree
*new_decl
= info
->adjusted_vars
->get (*base
);
1850 int base_quals
= TYPE_QUALS (TREE_TYPE (*new_decl
));
1851 tree field
= TREE_OPERAND (*tp
, 1);
1853 /* Adjust the type of the field. */
1854 int field_quals
= TYPE_QUALS (TREE_TYPE (field
));
1855 if (TREE_CODE (field
) == FIELD_DECL
&& field_quals
!= base_quals
)
1857 tree
*field_type
= &TREE_TYPE (field
);
1858 while (TREE_CODE (*field_type
) == ARRAY_TYPE
)
1859 field_type
= &TREE_TYPE (*field_type
);
1860 field_quals
|= base_quals
;
1861 *field_type
= build_qualified_type (*field_type
, field_quals
);
1864 /* Adjust the type of the component ref itself. */
1865 tree comp_type
= TREE_TYPE (*tp
);
1866 int comp_quals
= TYPE_QUALS (comp_type
);
1867 if (TREE_CODE (*tp
) == COMPONENT_REF
&& comp_quals
!= base_quals
)
1869 comp_quals
|= base_quals
;
1871 = build_qualified_type (comp_type
, comp_quals
);
1875 info
->modified
= true;
1877 else if (TREE_CODE (*tp
) == VAR_DECL
)
1879 tree
*new_decl
= info
->adjusted_vars
->get (*tp
);
1883 info
->modified
= true;
1890 /* Return TRUE if CALL is a call to a builtin atomic/sync operation. */
1893 is_sync_builtin_call (gcall
*call
)
1895 tree callee
= gimple_call_fndecl (call
);
1897 if (callee
!= NULL_TREE
1898 && gimple_call_builtin_p (call
, BUILT_IN_NORMAL
))
1899 switch (DECL_FUNCTION_CODE (callee
))
1901 #undef DEF_SYNC_BUILTIN
1902 #define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
1903 #include "sync-builtins.def"
1904 #undef DEF_SYNC_BUILTIN
1914 /* Main entry point for oacc transformations which run on the device
1915 compiler after LTO, so we know what the target device is at this
1916 point (including the host fallback). */
1919 execute_oacc_loop_designation ()
1921 tree attrs
= oacc_get_fn_attrib (current_function_decl
);
1924 /* Not an offloaded function. */
1927 /* Parse the default dim argument exactly once. */
1928 if ((const void *)flag_openacc_dims
!= &flag_openacc_dims
)
1930 oacc_parse_default_dims (flag_openacc_dims
);
1931 flag_openacc_dims
= (char *)&flag_openacc_dims
;
1934 bool is_oacc_parallel
1935 = (lookup_attribute ("oacc parallel",
1936 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1937 bool is_oacc_kernels
1938 = (lookup_attribute ("oacc kernels",
1939 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1941 = (lookup_attribute ("oacc serial",
1942 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1943 bool is_oacc_parallel_kernels_parallelized
1944 = (lookup_attribute ("oacc parallel_kernels_parallelized",
1945 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1946 bool is_oacc_parallel_kernels_gang_single
1947 = (lookup_attribute ("oacc parallel_kernels_gang_single",
1948 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1949 int fn_level
= oacc_fn_attrib_level (attrs
);
1950 bool is_oacc_routine
= (fn_level
>= 0);
1951 gcc_checking_assert (is_oacc_parallel
1954 + is_oacc_parallel_kernels_parallelized
1955 + is_oacc_parallel_kernels_gang_single
1959 bool is_oacc_kernels_parallelized
1960 = (lookup_attribute ("oacc kernels parallelized",
1961 DECL_ATTRIBUTES (current_function_decl
)) != NULL
);
1962 if (is_oacc_kernels_parallelized
)
1963 gcc_checking_assert (is_oacc_kernels
);
1967 if (is_oacc_parallel
)
1968 fprintf (dump_file
, "Function is OpenACC parallel offload\n");
1969 else if (is_oacc_kernels
)
1970 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1971 (is_oacc_kernels_parallelized
1972 ? "parallelized" : "unparallelized"));
1973 else if (is_oacc_serial
)
1974 fprintf (dump_file
, "Function is OpenACC serial offload\n");
1975 else if (is_oacc_parallel_kernels_parallelized
)
1976 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1977 "parallel_kernels_parallelized");
1978 else if (is_oacc_parallel_kernels_gang_single
)
1979 fprintf (dump_file
, "Function is %s OpenACC kernels offload\n",
1980 "parallel_kernels_gang_single");
1981 else if (is_oacc_routine
)
1982 fprintf (dump_file
, "Function is OpenACC routine level %d\n",
1988 /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
1989 it's a convenient place, so... */
1990 if (is_oacc_routine
)
1992 tree attr
= lookup_attribute ("omp declare target",
1993 DECL_ATTRIBUTES (current_function_decl
));
1994 gcc_checking_assert (attr
);
1995 tree clauses
= TREE_VALUE (attr
);
1996 gcc_checking_assert (clauses
);
1998 /* Should this OpenACC routine be discarded? */
1999 bool discard
= false;
2001 tree clause_nohost
= omp_find_clause (clauses
, OMP_CLAUSE_NOHOST
);
2004 "OpenACC routine '%s' %s '%s' clause.\n",
2005 lang_hooks
.decl_printable_name (current_function_decl
, 2),
2006 clause_nohost
? "has" : "doesn't have",
2007 omp_clause_code_name
[OMP_CLAUSE_NOHOST
]);
2008 /* Host compiler, 'nohost' clause? */
2009 #ifndef ACCEL_COMPILER
2016 "OpenACC routine '%s' %sdiscarded.\n",
2017 lang_hooks
.decl_printable_name (current_function_decl
, 2),
2018 discard
? "" : "not ");
2021 TREE_ASM_WRITTEN (current_function_decl
) = 1;
2022 return TODO_discard_function
;
2026 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
2027 kernels, so remove the parallelism dimensions function attributes
2028 potentially set earlier on. */
2029 if (is_oacc_kernels
&& !is_oacc_kernels_parallelized
)
2031 oacc_set_fn_attrib (current_function_decl
, NULL
, NULL
);
2032 attrs
= oacc_get_fn_attrib (current_function_decl
);
2035 /* Discover, partition and process the loops. */
2036 oacc_loop
*loops
= oacc_loop_discovery ();
2038 unsigned outer_mask
= 0;
2039 if (is_oacc_routine
)
2040 outer_mask
= GOMP_DIM_MASK (fn_level
) - 1;
2041 unsigned used_mask
= oacc_loop_partition (loops
, outer_mask
);
2042 /* OpenACC kernels constructs are special: they currently don't use the
2043 generic oacc_loop infrastructure and attribute/dimension processing. */
2044 if (is_oacc_kernels
&& is_oacc_kernels_parallelized
)
2046 /* Parallelized OpenACC kernels constructs use gang parallelism. See
2047 also tree-parloops.c:create_parallel_loop. */
2048 used_mask
|= GOMP_DIM_MASK (GOMP_DIM_GANG
);
2051 int dims
[GOMP_DIM_MAX
];
2052 oacc_validate_dims (current_function_decl
, attrs
, dims
, fn_level
, used_mask
);
2056 const char *comma
= "Compute dimensions [";
2057 for (int ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++, comma
= ", ")
2058 fprintf (dump_file
, "%s%d", comma
, dims
[ix
]);
2059 fprintf (dump_file
, "]\n");
2062 /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
2063 a single gang only. */
2064 if (is_oacc_parallel_kernels_gang_single
)
2065 gcc_checking_assert (dims
[GOMP_DIM_GANG
] == 1);
2067 oacc_loop_process (loops
);
2070 fprintf (dump_file
, "OpenACC loops\n");
2071 dump_oacc_loop (dump_file
, loops
, 0);
2072 fprintf (dump_file
, "\n");
2074 if (dump_enabled_p ())
2076 oacc_loop
*l
= loops
;
2077 /* OpenACC kernels constructs are special: they currently don't use the
2078 generic oacc_loop infrastructure. */
2079 if (is_oacc_kernels
)
2081 /* Create a fake oacc_loop for diagnostic purposes. */
2082 l
= new_oacc_loop_raw (NULL
,
2083 DECL_SOURCE_LOCATION (current_function_decl
));
2084 l
->mask
= used_mask
;
2088 /* Skip the outermost, dummy OpenACC loop */
2092 inform_oacc_loop (l
);
2093 if (is_oacc_kernels
)
2097 free_oacc_loop (loops
);
2103 execute_oacc_device_lower ()
2105 tree attrs
= oacc_get_fn_attrib (current_function_decl
);
2108 /* Not an offloaded function. */
2111 int dims
[GOMP_DIM_MAX
];
2112 for (unsigned i
= 0; i
< GOMP_DIM_MAX
; i
++)
2113 dims
[i
] = oacc_get_fn_dim_size (current_function_decl
, i
);
2115 hash_map
<tree
, tree
> adjusted_vars
;
2117 /* Now lower internal loop functions to target-specific code
2120 FOR_ALL_BB_FN (bb
, cfun
)
2121 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
);)
2123 gimple
*stmt
= gsi_stmt (gsi
);
2124 if (!is_gimple_call (stmt
))
2130 gcall
*call
= as_a
<gcall
*> (stmt
);
2131 if (!gimple_call_internal_p (call
))
2137 /* Rewind to allow rescan. */
2139 bool rescan
= false, remove
= false;
2140 enum internal_fn ifn_code
= gimple_call_internal_fn (call
);
2146 case IFN_GOACC_TILE
:
2147 oacc_xform_tile (call
);
2151 case IFN_GOACC_LOOP
:
2152 oacc_xform_loop (call
);
2156 case IFN_GOACC_REDUCTION
:
2157 /* Mark the function for SSA renaming. */
2158 mark_virtual_operands_for_renaming (cfun
);
2160 /* If the level is -1, this ended up being an unused
2161 axis. Handle as a default. */
2162 if (integer_minus_onep (gimple_call_arg (call
, 3)))
2163 default_goacc_reduction (call
);
2165 targetm
.goacc
.reduction (call
);
2171 enum ifn_unique_kind kind
2172 = ((enum ifn_unique_kind
)
2173 TREE_INT_CST_LOW (gimple_call_arg (call
, 0)));
2180 case IFN_UNIQUE_OACC_FORK
:
2181 case IFN_UNIQUE_OACC_JOIN
:
2182 if (integer_minus_onep (gimple_call_arg (call
, 2)))
2184 else if (!targetm
.goacc
.fork_join
2185 (call
, dims
, kind
== IFN_UNIQUE_OACC_FORK
))
2189 case IFN_UNIQUE_OACC_HEAD_MARK
:
2190 case IFN_UNIQUE_OACC_TAIL_MARK
:
2194 case IFN_UNIQUE_OACC_PRIVATE
:
2196 dump_flags_t l_dump_flags
2197 = get_openacc_privatization_dump_flags ();
2199 location_t loc
= gimple_location (stmt
);
2200 if (LOCATION_LOCUS (loc
) == UNKNOWN_LOCATION
)
2201 loc
= DECL_SOURCE_LOCATION (current_function_decl
);
2202 const dump_user_location_t d_u_loc
2203 = dump_user_location_t::from_location_t (loc
);
2206 = TREE_INT_CST_LOW (gimple_call_arg (call
, 2));
2207 gcc_checking_assert (level
== -1
2209 && level
< GOMP_DIM_MAX
));
2210 for (unsigned i
= 3;
2211 i
< gimple_call_num_args (call
);
2214 static char const *const axes
[] =
2215 /* Must be kept in sync with GOMP_DIM enumeration. */
2216 { "gang", "worker", "vector" };
2218 tree arg
= gimple_call_arg (call
, i
);
2219 gcc_checking_assert (TREE_CODE (arg
) == ADDR_EXPR
);
2220 tree decl
= TREE_OPERAND (arg
, 0);
2221 if (dump_enabled_p ())
2222 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2224 # pragma GCC diagnostic push
2225 # pragma GCC diagnostic ignored "-Wformat"
2227 dump_printf_loc (l_dump_flags
, d_u_loc
,
2228 "variable %<%T%> ought to be"
2229 " adjusted for OpenACC"
2230 " privatization level: %qs\n",
2233 ? "UNKNOWN" : axes
[level
]));
2235 # pragma GCC diagnostic pop
2240 else if (!targetm
.goacc
.adjust_private_decl
)
2242 else if (level
== GOMP_DIM_VECTOR
)
2244 /* That's the default behavior. */
2249 tree oldtype
= TREE_TYPE (decl
);
2251 = targetm
.goacc
.adjust_private_decl (loc
, decl
,
2253 adjusted
= (TREE_TYPE (newdecl
) != oldtype
2254 || newdecl
!= decl
);
2256 adjusted_vars
.put (decl
, newdecl
);
2259 && dump_enabled_p ())
2260 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
2262 # pragma GCC diagnostic push
2263 # pragma GCC diagnostic ignored "-Wformat"
2265 dump_printf_loc (l_dump_flags
, d_u_loc
,
2266 "variable %<%T%> adjusted for"
2267 " OpenACC privatization level:"
2271 # pragma GCC diagnostic pop
2282 if (gsi_end_p (gsi
))
2283 /* We rewound past the beginning of the BB. */
2284 gsi
= gsi_start_bb (bb
);
2286 /* Undo the rewind. */
2291 if (gimple_vdef (call
))
2292 replace_uses_by (gimple_vdef (call
), gimple_vuse (call
));
2293 if (gimple_call_lhs (call
))
2295 /* Propagate the data dependency var. */
2296 gimple
*ass
= gimple_build_assign (gimple_call_lhs (call
),
2297 gimple_call_arg (call
, 1));
2298 gsi_replace (&gsi
, ass
, false);
2301 gsi_remove (&gsi
, true);
2304 /* If not rescanning, advance over the call. */
2308 /* Regarding the OpenACC privatization level, we're currently only looking at
2309 making the gang-private level work. Regarding that, we have the following
2312 - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
2313 particular, change 'TREE_TYPE', etc.) and there is no
2314 'targetm.goacc.expand_var_decl'.
2316 - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
2317 marker and then 'targetm.goacc.expand_var_decl' does the work.
2319 Eventually (in particular, for worker-private level?), both
2320 'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
2321 may need to do things, but that's currently not meant to be addressed, and
2322 thus not fully worked out and implemented, and thus untested. Hence,
2323 'assert' what currently is implemented/tested, only. */
2325 if (targetm
.goacc
.expand_var_decl
)
2326 gcc_assert (adjusted_vars
.is_empty ());
2328 /* Make adjustments to gang-private local variables if required by the
2329 target, e.g. forcing them into a particular address space. Afterwards,
2330 ADDR_EXPR nodes which have adjusted variables as their argument need to
2331 be modified in one of two ways:
2333 1. They can be recreated, making a pointer to the variable in the new
2336 2. The address of the variable in the new address space can be taken,
2337 converted to the default (original) address space, and the result of
2338 that conversion subsituted in place of the original ADDR_EXPR node.
2340 Which of these is done depends on the gimple statement being processed.
2341 At present atomic operations and inline asms use (1), and everything else
2342 uses (2). At least on AMD GCN, there are atomic operations that work
2343 directly in the LDS address space.
2345 COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
2346 the new decl, adjusting types of appropriate tree nodes as necessary. */
2348 if (targetm
.goacc
.adjust_private_decl
2349 && !adjusted_vars
.is_empty ())
2351 FOR_ALL_BB_FN (bb
, cfun
)
2352 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb
);
2356 gimple
*stmt
= gsi_stmt (gsi
);
2358 var_decl_rewrite_info info
;
2360 info
.avoid_pointer_conversion
2361 = (is_gimple_call (stmt
)
2362 && is_sync_builtin_call (as_a
<gcall
*> (stmt
)))
2363 || gimple_code (stmt
) == GIMPLE_ASM
;
2365 info
.modified
= false;
2366 info
.adjusted_vars
= &adjusted_vars
;
2368 memset (&wi
, 0, sizeof (wi
));
2371 walk_gimple_op (stmt
, oacc_rewrite_var_decl
, &wi
);
2381 /* Default launch dimension validator. Force everything to 1. A
2382 backend that wants to provide larger dimensions must override this
2386 default_goacc_validate_dims (tree
ARG_UNUSED (decl
), int *dims
,
2387 int ARG_UNUSED (fn_level
),
2388 unsigned ARG_UNUSED (used
))
2390 bool changed
= false;
2392 for (unsigned ix
= 0; ix
!= GOMP_DIM_MAX
; ix
++)
2404 /* Default dimension bound is unknown on accelerator and 1 on host. */
2407 default_goacc_dim_limit (int ARG_UNUSED (axis
))
2409 #ifdef ACCEL_COMPILER
2418 const pass_data pass_data_oacc_loop_designation
=
2420 GIMPLE_PASS
, /* type */
2421 "oaccloops", /* name */
2422 OPTGROUP_OMP
, /* optinfo_flags */
2423 TV_NONE
, /* tv_id */
2424 PROP_cfg
, /* properties_required */
2425 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2426 0, /* properties_destroyed */
2427 0, /* todo_flags_start */
2428 TODO_update_ssa
| TODO_cleanup_cfg
, /* todo_flags_finish */
2431 class pass_oacc_loop_designation
: public gimple_opt_pass
2434 pass_oacc_loop_designation (gcc::context
*ctxt
)
2435 : gimple_opt_pass (pass_data_oacc_loop_designation
, ctxt
)
2438 /* opt_pass methods: */
2439 virtual bool gate (function
*) { return flag_openacc
; };
2441 virtual unsigned int execute (function
*)
2443 return execute_oacc_loop_designation ();
2446 }; // class pass_oacc_loop_designation
2448 const pass_data pass_data_oacc_device_lower
=
2450 GIMPLE_PASS
, /* type */
2451 "oaccdevlow", /* name */
2452 OPTGROUP_OMP
, /* optinfo_flags */
2453 TV_NONE
, /* tv_id */
2454 PROP_cfg
, /* properties_required */
2455 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
2456 0, /* properties_destroyed */
2457 0, /* todo_flags_start */
2458 TODO_update_ssa
| TODO_cleanup_cfg
, /* todo_flags_finish */
2461 class pass_oacc_device_lower
: public gimple_opt_pass
2464 pass_oacc_device_lower (gcc::context
*ctxt
)
2465 : gimple_opt_pass (pass_data_oacc_device_lower
, ctxt
)
2468 /* opt_pass methods: */
2469 virtual bool gate (function
*) { return flag_openacc
; };
2471 virtual unsigned int execute (function
*)
2473 return execute_oacc_device_lower ();
2476 }; // class pass_oacc_device_lower
2481 make_pass_oacc_loop_designation (gcc::context
*ctxt
)
2483 return new pass_oacc_loop_designation (ctxt
);
2487 make_pass_oacc_device_lower (gcc::context
*ctxt
)
2489 return new pass_oacc_device_lower (ctxt
);
2493 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
2494 GOMP_SIMT_ENTER call identifying the privatized variables, which are
2495 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
2496 Set *REGIMPLIFY to true, except if no privatized variables were seen. */
/* NOTE(review): extraction dropped several structural lines from this
   function (a 'continue' after the null-argument check, the declarations
   used by single_imm_use, the 'else' before the final assertion, and the
   left-hand side of the rebuilt ENTER_ALLOC call); the surviving
   fragments below are preserved verbatim.  */
2499 ompdevlow_adjust_simt_enter (gimple_stmt_iterator
*gsi
, bool *regimplify
)
/* The ALLOC call's LHS is the SIMT private-record pointer; its first
   argument is the SSA name produced by the matching GOMP_SIMT_ENTER.  */
2501 gimple
*alloc_stmt
= gsi_stmt (*gsi
);
2502 tree simtrec
= gimple_call_lhs (alloc_stmt
);
2503 tree simduid
= gimple_call_arg (alloc_stmt
, 0);
2504 gimple
*enter_stmt
= SSA_NAME_DEF_STMT (simduid
);
2505 gcc_assert (gimple_call_internal_p (enter_stmt
, IFN_GOMP_SIMT_ENTER
));
/* Build an artificial RECORD_TYPE that will hold one field per
   privatized variable; SIMTREC becomes a pointer to it.  */
2506 tree rectype
= lang_hooks
.types
.make_type (RECORD_TYPE
);
2507 TYPE_ARTIFICIAL (rectype
) = TYPE_NAMELESS (rectype
) = 1;
2508 TREE_ADDRESSABLE (rectype
) = 1;
2509 TREE_TYPE (simtrec
) = build_pointer_type (rectype
);
/* Arguments 1..N of GOMP_SIMT_ENTER are the addresses of the privatized
   variables (or null placeholders, which are skipped -- the statement
   doing so was dropped by extraction, original line 2514).  */
2510 for (unsigned i
= 1; i
< gimple_call_num_args (enter_stmt
); i
++)
2512 tree
*argp
= gimple_call_arg_ptr (enter_stmt
, i
);
2513 if (*argp
== null_pointer_node
)
2515 gcc_assert (TREE_CODE (*argp
) == ADDR_EXPR
2516 && VAR_P (TREE_OPERAND (*argp
, 0)));
2517 tree var
= TREE_OPERAND (*argp
, 0);
/* Mirror the variable's name, type, alignment and volatility onto a
   fresh field of the record.  */
2519 tree field
= build_decl (DECL_SOURCE_LOCATION (var
), FIELD_DECL
,
2520 DECL_NAME (var
), TREE_TYPE (var
));
2521 SET_DECL_ALIGN (field
, DECL_ALIGN (var
));
2522 DECL_USER_ALIGN (field
) = DECL_USER_ALIGN (var
);
2523 TREE_THIS_VOLATILE (field
) = TREE_THIS_VOLATILE (var
);
2525 insert_field_into_struct (rectype
, field
);
/* Redirect all uses of VAR through (*SIMTREC).FIELD via DECL_VALUE_EXPR.  */
2527 tree t
= build_simple_mem_ref (simtrec
);
2528 t
= build3 (COMPONENT_REF
, TREE_TYPE (var
), t
, field
, NULL
);
2529 TREE_THIS_VOLATILE (t
) = TREE_THIS_VOLATILE (var
);
2530 SET_DECL_VALUE_EXPR (var
, t
);
2531 DECL_HAS_VALUE_EXPR_P (var
) = 1;
/* Finalize the record layout, then rebuild the ENTER_ALLOC call so it
   receives the record's size and alignment as arguments.  */
2534 layout_type (rectype
);
2535 tree size
= TYPE_SIZE_UNIT (rectype
);
2536 tree align
= build_int_cst (TREE_TYPE (size
), TYPE_ALIGN_UNIT (rectype
));
/* NOTE(review): the assignment's LHS ('alloc_stmt =', original lines
   2537-2538) was dropped by extraction.  */
2539 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC
, 2, size
, align
);
2540 gimple_call_set_lhs (alloc_stmt
, simtrec
);
2541 gsi_replace (gsi
, alloc_stmt
, false);
/* Degrade the original GOMP_SIMT_ENTER into a plain copy of its first
   argument, keeping SIMDUID's definition intact.  */
2542 gimple_stmt_iterator enter_gsi
= gsi_for_stmt (enter_stmt
);
2543 enter_stmt
= gimple_build_assign (simduid
, gimple_call_arg (enter_stmt
, 0));
2544 gsi_replace (&enter_gsi
, enter_stmt
, false);
/* If SIMTREC has exactly one remaining use it must be the matching
   GOMP_SIMT_EXIT; clobber the record there so its lifetime ends at the
   exit.  NOTE(review): the declarations of 'use' and 'exit_stmt'
   (original 2545-2547) and the 'else' introducing the zero-uses
   assertion (original 2556) were dropped by extraction.  */
2548 if (single_imm_use (simtrec
, &use
, &exit_stmt
))
2550 gcc_assert (gimple_call_internal_p (exit_stmt
, IFN_GOMP_SIMT_EXIT
));
2551 gimple_stmt_iterator exit_gsi
= gsi_for_stmt (exit_stmt
);
2552 tree clobber
= build_clobber (rectype
);
2553 exit_stmt
= gimple_build_assign (build_simple_mem_ref (simtrec
), clobber
);
2554 gsi_insert_before (&exit_gsi
, exit_stmt
, GSI_SAME_STMT
);
2557 gcc_checking_assert (has_zero_uses (simtrec
));
2560 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
/* Matches variables carrying a DECL_VALUE_EXPR and the "omp simt private"
   attribute; used by execute_omp_device_lower's final reverse walk to
   find statements needing regimplification.
   NOTE(review): extraction dropped the return type, the lines binding T
   to *TP and the head of the condition (original 2564-2567), and the
   match/return body (original 2570+).  */
2563 find_simtpriv_var_op (tree
*tp
, int *walk_subtrees
, void *)
2568 && DECL_HAS_VALUE_EXPR_P (t
)
2569 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t
)))
2577 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2578 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2579 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2580 internal functions on non-SIMT targets, and likewise some SIMD internal
2581 functions on SIMT targets. */
/* NOTE(review): extraction dropped many structural lines from this
   function: the 'basic_block bb;' declaration, continue/break statements,
   the simtreg_enter_exit label and its handling, the guards before the
   final reverse walk and the has_force_vectorize_loops reset, closing
   braces, and the final return.  The surviving fragments below are
   preserved verbatim.  */
2584 execute_omp_device_lower ()
/* SIMT vectorization factor: targetm.simt.vf () where provided, else 1.  */
2586 int vf
= targetm
.simt
.vf
? targetm
.simt
.vf () : 1;
2587 bool regimplify
= false;
2589 gimple_stmt_iterator gsi
;
/* Whether this function contains calls to declare-variant alternates
   that still need resolving to their selected variants.  */
2590 bool calls_declare_variant_alt
2591 = cgraph_node::get (cfun
->decl
)->calls_declare_variant_alt
;
/* First walk: resolve declare-variant calls and fold GOMP internal fns.  */
2592 FOR_EACH_BB_FN (bb
, cfun
)
2593 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2595 gimple
*stmt
= gsi_stmt (gsi
);
2596 if (!is_gimple_call (stmt
))
2598 if (!gimple_call_internal_p (stmt
))
/* Non-internal call: possibly redirect it to the resolved variant.  */
2600 if (calls_declare_variant_alt
)
2601 if (tree fndecl
= gimple_call_fndecl (stmt
))
2603 tree new_fndecl
= omp_resolve_declare_variant (fndecl
);
2604 if (new_fndecl
!= fndecl
)
2606 gimple_call_set_fndecl (stmt
, new_fndecl
);
/* Internal call: compute the replacement RHS; NULL_TREE keeps the call.  */
2612 tree lhs
= gimple_call_lhs (stmt
), rhs
= NULL_TREE
;
2613 tree type
= lhs
? TREE_TYPE (lhs
) : integer_type_node
;
2614 switch (gimple_call_internal_fn (stmt
))
2616 case IFN_GOMP_USE_SIMT
:
2617 rhs
= vf
== 1 ? integer_zero_node
: integer_one_node
;
2619 case IFN_GOMP_SIMT_ENTER
:
2620 rhs
= vf
== 1 ? gimple_call_arg (stmt
, 0) : NULL_TREE
;
2621 goto simtreg_enter_exit
;
2622 case IFN_GOMP_SIMT_ENTER_ALLOC
:
/* NOTE(review): ', ®implify' below is mojibake for ', &regimplify'
   (the '&reg' byte sequence was decoded as an HTML entity); restore
   the original bytes when repairing this file.  */
2624 ompdevlow_adjust_simt_enter (&gsi
, ®implify
);
2625 rhs
= vf
== 1 ? null_pointer_node
: NULL_TREE
;
2626 goto simtreg_enter_exit
;
2627 case IFN_GOMP_SIMT_EXIT
:
2631 unlink_stmt_vdef (stmt
);
/* SIMT lane queries fold to zero on non-SIMT (vf == 1) targets.  */
2633 case IFN_GOMP_SIMT_LANE
:
2634 case IFN_GOMP_SIMT_LAST_LANE
:
2635 rhs
= vf
== 1 ? build_zero_cst (type
) : NULL_TREE
;
/* The vectorization factor itself always folds to a constant.  */
2637 case IFN_GOMP_SIMT_VF
:
2638 rhs
= build_int_cst (type
, vf
);
2640 case IFN_GOMP_SIMT_ORDERED_PRED
:
2641 rhs
= vf
== 1 ? integer_zero_node
: NULL_TREE
;
2643 unlink_stmt_vdef (stmt
);
2645 case IFN_GOMP_SIMT_VOTE_ANY
:
2646 case IFN_GOMP_SIMT_XCHG_BFLY
:
2647 case IFN_GOMP_SIMT_XCHG_IDX
:
2648 rhs
= vf
== 1 ? gimple_call_arg (stmt
, 0) : NULL_TREE
;
/* Conversely, SIMD placeholders fold on SIMT (vf != 1) targets.  */
2650 case IFN_GOMP_SIMD_LANE
:
2651 case IFN_GOMP_SIMD_LAST_LANE
:
2652 rhs
= vf
!= 1 ? build_zero_cst (type
) : NULL_TREE
;
2654 case IFN_GOMP_SIMD_VF
:
2655 rhs
= vf
!= 1 ? build_one_cst (type
) : NULL_TREE
;
/* Replace the folded call by a plain assignment (or a no-op when there
   is no LHS to preserve).  */
2662 stmt
= lhs
? gimple_build_assign (lhs
, rhs
) : gimple_build_nop ();
2663 gsi_replace (&gsi
, stmt
, false);
/* Second, reverse walk: statements mentioning SIMT-privatized variables
   either lose their now-redundant clobbers or are regimplified so their
   DECL_VALUE_EXPRs get expanded.  NOTE(review): the 'if (regimplify)'
   guard before this walk (original 2665) was dropped by extraction.  */
2666 FOR_EACH_BB_REVERSE_FN (bb
, cfun
)
2667 for (gsi
= gsi_last_bb (bb
); !gsi_end_p (gsi
); gsi_prev (&gsi
))
2668 if (walk_gimple_stmt (&gsi
, NULL
, find_simtpriv_var_op
, NULL
))
2670 if (gimple_clobber_p (gsi_stmt (gsi
)))
2671 gsi_remove (&gsi
, true);
2673 gimple_regimplify_operands (gsi_stmt (gsi
), &gsi
);
/* NOTE(review): the guard before this reset (original 2675, presumably
   'if (vf != 1)') was dropped by extraction.  */
2676 cfun
->has_force_vectorize_loops
= false;
2682 const pass_data pass_data_omp_device_lower
=
2684 GIMPLE_PASS
, /* type */
2685 "ompdevlow", /* name */
2686 OPTGROUP_OMP
, /* optinfo_flags */
2687 TV_NONE
, /* tv_id */
2688 PROP_cfg
, /* properties_required */
2689 PROP_gimple_lomp_dev
, /* properties_provided */
2690 0, /* properties_destroyed */
2691 0, /* todo_flags_start */
2692 TODO_update_ssa
, /* todo_flags_finish */
/* Pass wrapper around execute_omp_device_lower.  */
2695 class pass_omp_device_lower
: public gimple_opt_pass
2698 pass_omp_device_lower (gcc::context
*ctxt
)
2699 : gimple_opt_pass (pass_data_omp_device_lower
, ctxt
)
2702 /* opt_pass methods: */
/* Run when the function lacks PROP_gimple_lomp_dev, or when it contains
   declare-variant alternate calls that need resolving.
   NOTE(review): extraction dropped original line 2706, the opening of
   the second disjunct (the '&&' term below needs an enclosing
   parenthesized condition that was lost); restore from upstream.  */
2703 virtual bool gate (function
*fun
)
2705 return (!(fun
->curr_properties
& PROP_gimple_lomp_dev
)
2707 && cgraph_node::get (fun
->decl
)->calls_declare_variant_alt
));
2709 virtual unsigned int execute (function
*)
2711 return execute_omp_device_lower ();
2714 }; // class pass_omp_device_lower
2719 make_pass_omp_device_lower (gcc::context
*ctxt
)
2721 return new pass_omp_device_lower (ctxt
);
2724 /* "omp declare target link" handling pass. */
2728 const pass_data pass_data_omp_target_link
=
2730 GIMPLE_PASS
, /* type */
2731 "omptargetlink", /* name */
2732 OPTGROUP_OMP
, /* optinfo_flags */
2733 TV_NONE
, /* tv_id */
2734 PROP_ssa
, /* properties_required */
2735 0, /* properties_provided */
2736 0, /* properties_destroyed */
2737 0, /* todo_flags_start */
2738 TODO_update_ssa
, /* todo_flags_finish */
/* Pass wrapper: gates on offloaded functions (accelerator compiler
   only) and defers to pass_omp_target_link::execute, defined below.  */
2741 class pass_omp_target_link
: public gimple_opt_pass
2744 pass_omp_target_link (gcc::context
*ctxt
)
2745 : gimple_opt_pass (pass_data_omp_target_link
, ctxt
)
2748 /* opt_pass methods: */
2749 virtual bool gate (function
*fun
)
2751 #ifdef ACCEL_COMPILER
2752 return offloading_function_p (fun
->decl
);
/* NOTE(review): extraction dropped the #else branch (the
   non-accelerator return value), the closing #endif and braces
   (original 2753-2757), and the class's closing '};'.  */
2759 virtual unsigned execute (function
*);
2762 /* Callback for walk_gimple_stmt used to scan for link var operands. */
/* Matches global variables carrying a DECL_VALUE_EXPR and the
   "omp declare target link" attribute; used by the omptargetlink pass
   below to decide which statements need regimplification.
   NOTE(review): extraction dropped the return type, the lines binding T
   to *TP and the head of the condition (original 2766-2769), and the
   match/return body (original 2773+).  */
2765 find_link_var_op (tree
*tp
, int *walk_subtrees
, void *)
2770 && DECL_HAS_VALUE_EXPR_P (t
)
2771 && is_global_var (t
)
2772 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t
)))
2782 pass_omp_target_link::execute (function
*fun
)
2785 FOR_EACH_BB_FN (bb
, fun
)
2787 gimple_stmt_iterator gsi
;
2788 for (gsi
= gsi_start_bb (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
2790 if (gimple_call_builtin_p (gsi_stmt (gsi
), BUILT_IN_GOMP_TARGET
))
2792 /* Nullify the second argument of __builtin_GOMP_target_ext. */
2793 gimple_call_set_arg (gsi_stmt (gsi
), 1, null_pointer_node
);
2794 update_stmt (gsi_stmt (gsi
));
2796 if (walk_gimple_stmt (&gsi
, NULL
, find_link_var_op
, NULL
))
2797 gimple_regimplify_operands (gsi_stmt (gsi
), &gsi
);
2807 make_pass_omp_target_link (gcc::context
*ctxt
)
2809 return new pass_omp_target_link (ctxt
);