/* gcc/omp-offload.c -- excerpt; see commit "Correct a function
   pre/postcondition [PR102403]".  */
1 /* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2021 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56 #include "convert.h"
57
/* Describe the OpenACC looping structure of a function.  The entire
   function is held in a 'NULL' loop.  Loops form a tree via the
   parent/child/sibling links below.  */

struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  NULL for the function-level loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Marker calls delimiting the partitioned head/tail sequences, one
     slot per GOMP_DIM axis.  */
  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions. */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions. */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask (bitmask of GOMP_DIM axes).  */
  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
86
/* Holds offload tables with decls: FUNCTION_DECLs of offloaded code in
   OFFLOAD_FUNCS, offloadable VAR_DECLs in OFFLOAD_VARS.  Consumed by
   omp_finish_file below.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
89
90 /* Return level at which oacc routine may spawn a partitioned loop, or
91 -1 if it is not a routine (i.e. is an offload fn). */
92
93 int
94 oacc_fn_attrib_level (tree attr)
95 {
96 tree pos = TREE_VALUE (attr);
97
98 if (!TREE_PURPOSE (pos))
99 return -1;
100
101 int ix = 0;
102 for (ix = 0; ix != GOMP_DIM_MAX;
103 ix++, pos = TREE_CHAIN (pos))
104 if (!integer_zerop (TREE_PURPOSE (pos)))
105 break;
106
107 return ix;
108 }
109
/* Helper function for omp_finish_file routine.  Takes decls from V_DECLS and
   adds their addresses and sizes to constructor-vector V_CTOR.  Functions
   contribute one element (address); variables contribute two (address,
   size).  */

static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* On the accelerator compiler, link vars additionally must have a
	 DECL_VALUE_EXPR (set up when the link pointer was created).  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* See also omp_finish_file and output_offload_tables in lto-cgraph.c.
	 Skip decls that were never materialized as symbols (unless in LTO,
	 where the symtab check is not applicable).  */
      if (!in_lto_p && !symtab_node::get (it))
	continue;

      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
166
167 /* Return true if DECL is a function for which its references should be
168 analyzed. */
169
170 static bool
171 omp_declare_target_fn_p (tree decl)
172 {
173 return (TREE_CODE (decl) == FUNCTION_DECL
174 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
175 && !lookup_attribute ("omp declare target host",
176 DECL_ATTRIBUTES (decl))
177 && (!flag_openacc
178 || oacc_get_fn_attrib (decl) == NULL_TREE));
179 }
180
181 /* Return true if DECL Is a variable for which its initializer references
182 should be analyzed. */
183
184 static bool
185 omp_declare_target_var_p (tree decl)
186 {
187 return (VAR_P (decl)
188 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
189 && !lookup_attribute ("omp declare target link",
190 DECL_ATTRIBUTES (decl)));
191 }
192
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to.  DATA is a vec<tree> worklist; newly marked decls
   with bodies are pushed onto it for transitive discovery.  */

static tree
omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
{
  /* Calls to 'omp declare variant base' functions: recurse into each
     variant decl recorded in the attribute chain.  */
  if (TREE_CODE (*tp) == CALL_EXPR
      && CALL_EXPR_FN (*tp)
      && TREE_CODE (CALL_EXPR_FN (*tp)) == ADDR_EXPR
      && TREE_CODE (TREE_OPERAND (CALL_EXPR_FN (*tp), 0)) == FUNCTION_DECL
      && lookup_attribute ("omp declare variant base",
			   DECL_ATTRIBUTES (TREE_OPERAND (CALL_EXPR_FN (*tp),
							  0))))
    {
      tree fn = TREE_OPERAND (CALL_EXPR_FN (*tp), 0);
      for (tree attr = DECL_ATTRIBUTES (fn); attr; attr = TREE_CHAIN (attr))
	{
	  attr = lookup_attribute ("omp declare variant base", attr);
	  if (attr == NULL_TREE)
	    break;
	  tree purpose = TREE_PURPOSE (TREE_VALUE (attr));
	  if (TREE_CODE (purpose) == FUNCTION_DECL)
	    omp_discover_declare_target_tgt_fn_r (&purpose, walk_subtrees, data);
	}
    }
  else if (TREE_CODE (*tp) == FUNCTION_DECL)
    {
      tree decl = *tp;
      tree id = get_identifier ("omp declare target");
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL)
	{
	  /* Walk any chain of alias targets, marking each alias as
	     offloadable and declare-target on the way.  */
	  while (node->alias_target
		 && TREE_CODE (node->alias_target) == FUNCTION_DECL)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      node = symtab_node::get (node->alias_target);
	    }
	  /* Likewise mark every node between here and the ultimate alias
	     target.  */
	  symtab_node *new_node = node->ultimate_alias_target ();
	  decl = new_node->decl;
	  while (node != new_node)
	    {
	      if (!omp_declare_target_fn_p (node->decl)
		  && !lookup_attribute ("omp declare target host",
					DECL_ATTRIBUTES (node->decl)))
		{
		  node->offloadable = 1;
		  DECL_ATTRIBUTES (node->decl)
		    = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (node->decl));
		}
	      gcc_assert (node->alias && node->analyzed);
	      node = node->get_alias_target ();
	    }
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    g->have_offload = true;
	}
      /* Already marked (explicitly or by an earlier visit): nothing to do.  */
      if (omp_declare_target_fn_p (decl)
	  || lookup_attribute ("omp declare target host",
			       DECL_ATTRIBUTES (decl)))
	return NULL_TREE;

      /* Queue the decl for transitive discovery if it has a body.  */
      if (!DECL_EXTERNAL (decl) && DECL_SAVED_TREE (decl))
	((vec<tree> *) data)->safe_push (decl);
      DECL_ATTRIBUTES (decl) = tree_cons (id, NULL_TREE,
					  DECL_ATTRIBUTES (decl));
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  /* else if (TREE_CODE (*tp) == OMP_TARGET)
       {
	 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
	   if (OMP_DEVICE_ANCESTOR (dev))
	     *walk_subtrees = 0;
       } */
  return NULL_TREE;
}
278
279 /* Similarly, but ignore references outside of OMP_TARGET regions. */
280
281 static tree
282 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
283 {
284 if (TREE_CODE (*tp) == OMP_TARGET)
285 {
286 /* And not OMP_DEVICE_ANCESTOR. */
287 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
288 omp_discover_declare_target_tgt_fn_r,
289 data);
290 *walk_subtrees = 0;
291 }
292 else if (TYPE_P (*tp))
293 *walk_subtrees = 0;
294 return NULL_TREE;
295 }
296
/* Helper function for omp_discover_implicit_declare_target, called through
   walk_tree.  Mark referenced FUNCTION_DECLs implicitly as
   declare target to; likewise global VAR_DECLs reachable from declare
   target variable initializers.  DATA is the vec<tree> worklist.  */

static tree
omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
{
  if (TREE_CODE (*tp) == FUNCTION_DECL)
    return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
  else if (VAR_P (*tp)
	   && is_global_var (*tp)
	   && !omp_declare_target_var_p (*tp))
    {
      tree id = get_identifier ("omp declare target");
      /* A variable cannot be both 'link' and implicitly 'to'; diagnose
	 and drop the link attribute.  */
      if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
	{
	  error_at (DECL_SOURCE_LOCATION (*tp),
		    "%qD specified both in declare target %<link%> and "
		    "implicitly in %<to%> clauses", *tp);
	  DECL_ATTRIBUTES (*tp)
	    = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
	}
      /* Queue its initializer for transitive discovery.  */
      if (TREE_STATIC (*tp) && lang_hooks.decls.omp_get_decl_init (*tp))
	((vec<tree> *) data)->safe_push (*tp);
      DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
      symtab_node *node = symtab_node::get (*tp);
      if (node != NULL && !node->offloadable)
	{
	  node->offloadable = 1;
	  if (ENABLE_OFFLOADING)
	    {
	      g->have_offload = true;
	      if (is_a <varpool_node *> (node))
		vec_safe_push (offload_vars, node->decl);
	    }
	}
    }
  else if (TYPE_P (*tp))
    *walk_subtrees = 0;
  return NULL_TREE;
}
338
/* Perform the OpenMP implicit declare target to discovery:
   transitively mark everything reachable from declare target functions
   and from 'omp target' regions as declare target.  */

void
omp_discover_implicit_declare_target (void)
{
  cgraph_node *node;
  varpool_node *vnode;
  auto_vec<tree> worklist;

  /* Seed the worklist with defined functions (including nested ones)
     that are declare target or contain an 'omp target' region.  */
  FOR_EACH_DEFINED_FUNCTION (node)
    if (DECL_SAVED_TREE (node->decl))
      {
	struct cgraph_node *cgn;
	if (omp_declare_target_fn_p (node->decl))
	  worklist.safe_push (node->decl);
	else if (DECL_STRUCT_FUNCTION (node->decl)
		 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
	  worklist.safe_push (node->decl);
	for (cgn = first_nested_function (node);
	     cgn; cgn = next_nested_function (cgn))
	  if (omp_declare_target_fn_p (cgn->decl))
	    worklist.safe_push (cgn->decl);
	  else if (DECL_STRUCT_FUNCTION (cgn->decl)
		   && DECL_STRUCT_FUNCTION (cgn->decl)->has_omp_target)
	    worklist.safe_push (cgn->decl);
      }
  /* Also seed declare target variables that have initializers.  */
  FOR_EACH_VARIABLE (vnode)
    if (lang_hooks.decls.omp_get_decl_init (vnode->decl)
	&& omp_declare_target_var_p (vnode->decl))
      worklist.safe_push (vnode->decl);
  /* Drain the worklist; the _r callbacks push newly discovered decls
     back onto it, so this performs a transitive closure.  */
  while (!worklist.is_empty ())
    {
      tree decl = worklist.pop ();
      if (VAR_P (decl))
	walk_tree_without_duplicates (lang_hooks.decls.omp_get_decl_init (decl),
				      omp_discover_declare_target_var_r,
				      &worklist);
      else if (omp_declare_target_fn_p (decl))
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_tgt_fn_r,
				      &worklist);
      else
	/* Function not itself declare target: only references inside its
	   'omp target' regions count.  */
	walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
				      omp_discover_declare_target_fn_r,
				      &worklist);
    }

  lang_hooks.decls.omp_finish_decl_inits ();
}
388

/* Create new symbols containing (address, size) pairs for global variables,
   marked with "omp declare target" attribute, as well as addresses for the
   functions, which are outlined offloading regions.  With named-section
   support the tables go into dedicated sections; otherwise each symbol is
   recorded individually via the target hook.  */
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      /* Each variable contributes two elements: address and size.  */
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      /* Use the actual constructor length for vars, since decls without
	 a symtab node may have been skipped above.  */
      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    vec_safe_length (v_v));
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  /* See also add_decls_addresses_to_decl_constructor
	     and output_offload_tables in lto-cgraph.c.  */
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  if (!in_lto_p && !symtab_node::get (it))
	    continue;
#ifdef ACCEL_COMPILER
	  /* For link vars, record the link pointer rather than the var
	     itself (mirrors add_decls_addresses_to_decl_constructor).  */
	  if (DECL_HAS_VALUE_EXPR_P (it)
	      && lookup_attribute ("omp declare target link",
				   DECL_ATTRIBUTES (it)))
	    {
	      tree value_expr = DECL_VALUE_EXPR (it);
	      tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	      targetm.record_offload_symbol (link_ptr_decl);
	      varpool_node::finalize_decl (link_ptr_decl);
	    }
	  else
#endif
	    targetm.record_offload_symbol (it);
	}
    }
}
474
475 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
476 axis DIM. Return a tmp var holding the result. */
477
478 static tree
479 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
480 {
481 tree arg = build_int_cst (unsigned_type_node, dim);
482 tree size = create_tmp_var (integer_type_node);
483 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
484 gimple *call = gimple_build_call_internal (fn, 1, arg);
485
486 gimple_call_set_lhs (call, size);
487 gimple_seq_add_stmt (seq, call);
488
489 return size;
490 }
491
/* Find the number of threads (POS = false), or thread number (POS =
   true) for an OpenACC region partitioned as MASK.  Setup code
   required for the calculation is added to SEQ.  The result folds the
   selected GOMP_DIM axes together, outermost first.  */

static tree
oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
{
  /* For sizes, start from a multiplicative identity; for positions the
     first contributing axis supplies the initial value.  */
  tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
  unsigned ix;

  /* Start at gang level, and examine relevant dimension indices.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (GOMP_DIM_MASK (ix) & mask)
      {
	if (res)
	  {
	    /* We had an outer index, so scale that by the size of
	       this dimension.  */
	    tree n = oacc_dim_call (false, ix, seq);
	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
	  }
	if (pos)
	  {
	    /* Determine index in this dimension.  */
	    tree id = oacc_dim_call (true, ix, seq);
	    if (res)
	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
	    else
	      res = id;
	  }
      }

  /* Empty MASK with POS: the position is trivially zero.  */
  if (res == NULL_TREE)
    res = integer_zero_node;

  return res;
}
529
/* Transform IFN_GOACC_LOOP calls to actual code.  See
   expand_oacc_for for where these are generated.  At the vector
   level, we stride loops, such that each member of a warp will
   operate on adjacent iterations.  At the worker and gang level,
   each gang/warp executes a set of contiguous iterations.  Chunking
   can override this such that each iteration engine executes a
   contiguous chunk, and then moves on to stride to the next chunk.  */

static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);		/* +1 or -1 loop direction.  */
  tree range = gimple_call_arg (call, 2);	/* Iteration-space extent.  */
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = NULL_TREE;
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

  /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
  if (!lhs)
    {
      gsi_replace_with_seq (&gsi, seq, true);
      return;
    }

  type = TREE_TYPE (lhs);

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      /* Enable vectorization on non-SIMT targets.  */
      if (!targetm.simt.vf
	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
	     the loop.  */
	  && (flag_tree_loop_vectorize
	      || !global_options_set.x_flag_tree_loop_vectorize))
	{
	  basic_block bb = gsi_bb (gsi);
	  class loop *parent = bb->loop_father;
	  class loop *body = parent->inner;

	  parent->force_vectorize = true;
	  parent->safelen = INT_MAX;

	  /* "Chunking loops" may have inner loops.  */
	  if (parent->inner)
	    {
	      body->force_vectorize = true;
	      body->safelen = INT_MAX;
	    }

	  cfun->has_force_vectorize_loops = true;
	}
      if (striding)
	{
	  /* Offset is simply this thread's linear position.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  /* Offset = outer_pos * span + inner_pos (+ chunk offset when
	     chunking), scaled by step at the end.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (volume * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by whole chunks already consumed (arg 6 is the
		 chunk number).  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* Bound = min/max (offset + span * step, range), depending on
	     loop direction.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* chunk_size = ceil (range / (volume * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
747
/* Transform a GOACC_TILE call.  Determines the element loop span for
   the specified loop of the nest.  This is 1 if we're not tiling.

   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */

static void
oacc_xform_tile (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
  /* Inner loops have higher loop_nos.  */
  unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
  tree tile_size = gimple_call_arg (call, 2);
  unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  gimple_seq seq = NULL;
  tree span = build_int_cst (type, 1);

  /* Element loops may only be vector and/or worker partitioned.  */
  gcc_assert (!(e_mask
		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
  push_gimplify_context (!seen_error ());

#ifndef ACCEL_COMPILER
  /* Partitioning disabled on host compilers.  */
  e_mask = 0;
#endif
  if (!e_mask)
    /* Not partitioning.  */
    span = integer_one_node;
  else if (!integer_zerop (tile_size))
    /* User explicitly specified size.  */
    span = tile_size;
  else
    {
      /* Pick a size based on the partitioning of the element loop and
	 the number of loop nests.  */
      tree first_size = NULL_TREE;
      tree second_size = NULL_TREE;

      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);

      /* Only worker partitioning: treat it as the primary size.  */
      if (!first_size)
	{
	  first_size = second_size;
	  second_size = NULL_TREE;
	}

      if (loop_no + 1 == collapse)
	{
	  /* Innermost loop of the nest.  */
	  span = first_size;
	  if (!loop_no && second_size)
	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
				span, second_size);
	}
      else if (loop_no + 2 == collapse)
	span = second_size;
      else
	span = NULL_TREE;

      if (!span)
	/* There's no obvious element size for this loop.  Options
	   are 1, first_size or some non-unity constant (32 is my
	   favourite).  We should gather some statistics.  */
	span = first_size;
    }

  span = fold_convert (type, span);
  gimplify_assign (lhs, span, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
826
/* Default partitioned and minimum partitioned dimensions, indexed by
   GOMP_DIM axis.  Filled in by oacc_parse_default_dims.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
831
832 int
833 oacc_get_default_dim (int dim)
834 {
835 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
836 return oacc_default_dims[dim];
837 }
838
839 int
840 oacc_get_min_dim (int dim)
841 {
842 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
843 return oacc_min_dims[dim];
844 }
845
/* Parse the default dimension parameter.  This is a set of
   :-separated optional compute dimensions.  Each specified dimension
   is a positive integer.  When device type support is added, it is
   planned to be a comma separated list of such compute dimensions,
   with all but the first prefixed by the colon-terminated device
   type.  */

static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Initialize: defaults unset (-1), minimums 1.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Axes after the first must be preceded by ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An immediately following ':' means this axis is omitted.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse errors, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      /* Trailing junk after the last axis is also malformed.  */
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "%<-fopenacc-dim%> operand is malformed at %qs", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
}
906
/* Validate and update the dimensions for offloaded FN.  ATTRS is the
   raw attribute.  DIMS is an array of dimensions, which is filled in.
   LEVEL is the partitioning level of a routine, or -1 for an offload
   region itself.  USED is the mask of partitioned execution in the
   function.  */

static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Extract the per-axis (purpose, value) pairs; -1 marks an axis with
     no explicit size.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  /* Parallelism warnings are emitted on the host compiler only, to
     avoid duplicates from the accel compiler.  */
  bool check = true;
#ifdef ACCEL_COMPILER
  check = false;
#endif
  if (check
      && warn_openacc_parallelism
      && !lookup_attribute ("oacc kernels", DECL_ATTRIBUTES (fn)))
    {
      static char const *const axes[] =
      /* Must be kept in sync with GOMP_DIM enumeration.  */
	{ "gang", "worker", "vector" };
      for (ix = level >= 0 ? level : 0; ix != GOMP_DIM_MAX; ix++)
	if (dims[ix] < 0)
	  ; /* Defaulting axis.  */
	else if ((used & GOMP_DIM_MASK (ix)) && dims[ix] == 1)
	  /* There is partitioned execution, but the user requested a
	     dimension size of 1.  They're probably confused.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region contains %s partitioned code but"
		      " is not %s partitioned", axes[ix], axes[ix]);
	else if (!(used & GOMP_DIM_MASK (ix)) && dims[ix] != 1)
	  /* The dimension is explicitly partitioned to non-unity, but
	     no use is made within the region.  */
	  warning_at (DECL_SOURCE_LOCATION (fn), OPT_Wopenacc_parallelism,
		      "region is %s partitioned but"
		      " does not contain %s partitioned code",
		      axes[ix], axes[ix]);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level, used);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	pos = tree_cons (purpose[ix],
			 build_int_cst (integer_type_node, dims[ix]), pos);
      oacc_replace_fn_attrib (fn, pos);
    }
}
999
1000 /* Create an empty OpenACC loop structure at LOC. */
1001
1002 static oacc_loop *
1003 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
1004 {
1005 oacc_loop *loop = XCNEW (oacc_loop);
1006
1007 loop->parent = parent;
1008
1009 if (parent)
1010 {
1011 loop->sibling = parent->child;
1012 parent->child = loop;
1013 }
1014
1015 loop->loc = loc;
1016 return loop;
1017 }
1018
1019 /* Create an outermost, dummy OpenACC loop for offloaded function
1020 DECL. */
1021
1022 static oacc_loop *
1023 new_oacc_loop_outer (tree decl)
1024 {
1025 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
1026 }
1027
1028 /* Start a new OpenACC loop structure beginning at head marker HEAD.
1029 Link into PARENT loop. Return the new loop. */
1030
1031 static oacc_loop *
1032 new_oacc_loop (oacc_loop *parent, gcall *marker)
1033 {
1034 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
1035
1036 loop->marker = marker;
1037
1038 /* TODO: This is where device_type flattening would occur for the loop
1039 flags. */
1040
1041 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
1042
1043 tree chunk_size = integer_zero_node;
1044 if (loop->flags & OLF_GANG_STATIC)
1045 chunk_size = gimple_call_arg (marker, 4);
1046 loop->chunk_size = chunk_size;
1047
1048 return loop;
1049 }
1050
1051 /* Create a dummy loop encompassing a call to a openACC routine.
1052 Extract the routine's partitioning requirements. */
1053
1054 static void
1055 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
1056 {
1057 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
1058 int level = oacc_fn_attrib_level (attrs);
1059
1060 gcc_assert (level >= 0);
1061
1062 loop->marker = call;
1063 loop->routine = decl;
1064 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
1065 ^ (GOMP_DIM_MASK (level) - 1));
1066 }
1067
1068 /* Finish off the current OpenACC loop ending at tail marker TAIL.
1069 Return the parent loop. */
1070
1071 static oacc_loop *
1072 finish_oacc_loop (oacc_loop *loop)
1073 {
1074 /* If the loop has been collapsed, don't partition it. */
1075 if (loop->ifns.is_empty ())
1076 loop->mask = loop->flags = 0;
1077 return loop->parent;
1078 }
1079
1080 /* Free all OpenACC loop structures within LOOP (inclusive). */
1081
1082 static void
1083 free_oacc_loop (oacc_loop *loop)
1084 {
1085 if (loop->sibling)
1086 free_oacc_loop (loop->sibling);
1087 if (loop->child)
1088 free_oacc_loop (loop->child);
1089
1090 loop->ifns.release ();
1091 free (loop);
1092 }
1093
/* Dump out the OpenACC loop head or tail sequence beginning at marker
   FROM, under heading TITLE-LEVEL, indented DEPTH levels.  */

static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  /* FROM is an IFN_UNIQUE head or tail marker; remember its kind so we
     can recognize where the sequence ends.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next marker of the same kind -- but not at FROM
	     itself, which started the walk.  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2);

      /* Advance; when we fall off the end of a block, continue into
	 its single successor.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1124
1125 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1126
1127 static void
1128 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1129 {
1130 int ix;
1131
1132 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1133 loop->flags, loop->mask,
1134 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1135
1136 if (loop->marker)
1137 print_gimple_stmt (file, loop->marker, depth * 2);
1138
1139 if (loop->routine)
1140 fprintf (file, "%*sRoutine %s:%u:%s\n",
1141 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1142 DECL_SOURCE_LINE (loop->routine),
1143 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1144
1145 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1146 if (loop->heads[ix])
1147 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1148 for (ix = GOMP_DIM_MAX; ix--;)
1149 if (loop->tails[ix])
1150 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1151
1152 if (loop->child)
1153 dump_oacc_loop (file, loop->child, depth + 1);
1154 if (loop->sibling)
1155 dump_oacc_loop (file, loop->sibling, depth);
1156 }
1157
void debug_oacc_loop (oacc_loop *);

/* Dump the loop structure rooted at LOOP to stderr; intended for
   interactive use from the debugger.  */

DEBUG_FUNCTION void
debug_oacc_loop (oacc_loop *loop)
{
  dump_oacc_loop (stderr, loop, 0);
}
1167
1168 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1169 siblings. */
1170
1171 static void
1172 inform_oacc_loop (const oacc_loop *loop)
1173 {
1174 const char *gang
1175 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1176 const char *worker
1177 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1178 const char *vector
1179 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1180 const char *seq = loop->mask == 0 ? " seq" : "";
1181 const dump_user_location_t loc
1182 = dump_user_location_t::from_location_t (loop->loc);
1183 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1184 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1185 vector, seq);
1186
1187 if (loop->child)
1188 inform_oacc_loop (loop->child);
1189 if (loop->sibling)
1190 inform_oacc_loop (loop->sibling);
1191 }
1192
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
   structures as we go.  By construction these loops are properly
   nested.  */

static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts head/tail marker calls already seen in the current
     marker sequence; REMAINING counts how many are still expected.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    /* Routine calls may not appear inside a marker sequence.  */
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	case IFN_GOACC_TILE:
	  /* Record the abstraction function, so we can manipulate it
	     later.  */
	  loop->ifns.safe_push (call);
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      if (gimple_call_num_args (call) == 2)
		{
		  /* A two-argument marker terminates the head or tail
		     sequence.  */
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the total number of markers in
		     this sequence.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      /* First marker of a sequence: a HEAD_MARK opens a
			 new (child) loop.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads fill outermost-first, tails
			 innermost-first.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  if (remaining || marker)
    {
      /* An unterminated marker sequence continues into the
	 fall-through successor block; keep scanning there.  */
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
1297
1298 /* LOOP is the first sibling. Reverse the order in place and return
1299 the new first sibling. Recurse to child loops. */
1300
1301 static oacc_loop *
1302 oacc_loop_sibling_nreverse (oacc_loop *loop)
1303 {
1304 oacc_loop *last = NULL;
1305 do
1306 {
1307 if (loop->child)
1308 loop->child = oacc_loop_sibling_nreverse (loop->child);
1309
1310 oacc_loop *next = loop->sibling;
1311 loop->sibling = last;
1312 last = loop;
1313 loop = next;
1314 }
1315 while (loop);
1316
1317 return last;
1318 }
1319
1320 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1321 the current function. */
1322
1323 static oacc_loop *
1324 oacc_loop_discovery ()
1325 {
1326 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1327 in the following. */
1328 clear_bb_flags ();
1329
1330 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1331 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1332
1333 /* The siblings were constructed in reverse order, reverse them so
1334 that diagnostics come out in an unsurprising order. */
1335 top = oacc_loop_sibling_nreverse (top);
1336
1337 return top;
1338 }
1339
/* Transform the abstract internal function markers starting at FROM
   to be for partitioning level LEVEL.  Stop when we meet another HEAD
   or TAIL marker.  */

static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  /* Kind of the marker we started at; a later marker of the same kind
     terminates the sequence.  */
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Fork/join/private markers carry the level in argument 2.  */
	  if (k == IFN_UNIQUE_OACC_FORK
	      || k == IFN_UNIQUE_OACC_JOIN
	      || k == IFN_UNIQUE_OACC_PRIVATE)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry the level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;
      update_stmt (stmt);

      /* Advance; when we fall off the end of a block, continue into
	 its single successor.  */
      gsi_next (&gsi);
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
1377
/* Process the discovered OpenACC loops, setting the correct
   partitioning level etc.  */

static void
oacc_loop_process (oacc_loop *loop)
{
  /* Process children first, so inner loops are finalized before their
     enclosing loop.  */
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines carry a mask but no recorded IFNs to rewrite; skip.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
      tree chunk_arg = loop->chunk_size;
      gcall *call;

      /* Rewrite the recorded abstraction functions with the assigned
	 masks (and the chunk size for non-element IFN_GOACC_LOOPs).  */
      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
	{
	  switch (gimple_call_internal_fn (call))
	    {
	    case IFN_GOACC_LOOP:
	      {
		/* Argument 5 set to -1 marks the element loop of a
		   tiled nest; it takes the element mask instead.  */
		bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
		gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
		if (!is_e)
		  gimple_call_set_arg (call, 4, chunk_arg);
	      }
	      break;

	    case IFN_GOACC_TILE:
	      gimple_call_set_arg (call, 3, mask_arg);
	      gimple_call_set_arg (call, 4, e_mask_arg);
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  update_stmt (call);
	}

      /* Retarget each head/tail marker pair at its assigned dimension:
	 the IX'th pair gets the IX'th set bit of the combined mask.  */
      unsigned dim = GOMP_DIM_GANG;
      unsigned mask = loop->mask | loop->e_mask;
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
1436
/* Walk the OpenACC loop hierarchy checking and assigning the
   programmer-specified partitionings.  OUTER_MASK is the partitioning
   this loop is contained within.  Return mask of partitioning
   encountered.  If any auto loops are discovered, set GOMP_DIM_MAX
   bit.  */

static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;
      bool tiling = (loop->flags & OLF_TILE) != 0;

      /* The explicitly requested axes live in the flag bits above
	 OLF_DIM_BASE.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Apply auto partitioning if this is a non-partitioned regular
	 loop, or (no more than) single axis tiled loop.  */
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

      /* 'seq', 'auto' and explicit axes are mutually exclusive.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? G_("%<seq%> overrides other OpenACC loop specifiers")
		      : G_("%<auto%> conflicts with other OpenACC loop "
			   "specifiers"));
	  maybe_auto = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}

      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
	{
	  /* Mark the loop for auto partitioning; the GOMP_DIM_MAX bit
	     tells the caller that auto loops exist.  */
	  loop->flags |= OLF_AUTO;
	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
	}
    }

  if (this_mask & outer_mask)
    {
      /* This loop requests an axis already in use.  Find which
	 enclosing loop (if any) claimed it, for the diagnostic.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if ((outer->mask | outer->e_mask) & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			loop->routine
			? G_("routine call uses same OpenACC parallelism"
			     " as containing loop")
			: G_("inner loop uses same OpenACC parallelism"
			     " as containing loop"));
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    /* No enclosing loop claimed it, so the restriction comes
	       from the containing routine's partitioning level.  */
	    error_at (loop->loc,
		      loop->routine
		      ? G_("routine call uses OpenACC parallelism disallowed"
			   " by containing routine")
		      : G_("loop uses OpenACC parallelism disallowed"
			   " by containing routine"));

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* The outermost axis this loop uses must be strictly inside
	 whatever the enclosing loops already use.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  mask_all |= this_mask;

  if (loop->flags & OLF_TILE)
    {
      /* When tiling, vector goes to the element loop, and failing
	 that we put worker there.  The std doesn't contemplate
	 specifying all three.  We choose to put worker and vector on
	 the element loops in that case.  */
      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);

      loop->e_mask = this_e_mask;
      this_mask ^= this_e_mask;
    }

  loop->mask = this_mask;

  if (dump_file)
    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  if (loop->child)
    {
      /* Children may not reuse any axis we (or our enclosers) use.  */
      unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
      loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
1586
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
			   bool outer_assign)
{
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;
  bool tiling = loop->flags & OLF_TILE;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && (!outer_assign || loop->inner))
    {
      /* Allocate outermost and non-innermost loops at the outermost
	 non-innermost available level.  */
      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);

      /* Find the first outermost available partition.  */
      while (this_mask <= outer_mask)
	this_mask <<= 1;

      /* Grab two axes if tiling, and we've not assigned anything  */
      if (tiling && !(loop->mask | loop->e_mask))
	this_mask |= this_mask << 1;

      /* Prohibit the innermost partitioning at the moment.  */
      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;

      /* Don't use any dimension explicitly claimed by an inner loop.  */
      this_mask &= ~loop->inner;

      if (tiling && !loop->e_mask)
	{
	  /* If we got two axes, allocate the inner one to the element
	     loop.  */
	  loop->e_mask = this_mask & (this_mask << 1);
	  this_mask ^= loop->e_mask;
	}

      loop->mask |= this_mask;
    }

  if (loop->child)
    {
      /* Recurse with our chosen axes added to the forbidden set.  */
      unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
      loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
					       outer_assign | assign);
    }

  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
    {
      /* Allocate the loop at the innermost available level.  Note
	 that we do this even if we already assigned this loop the
	 outermost available level above.  That way we'll partition
	 this along 2 axes, if they are available.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one use by an outer loop.  */
      this_mask &= ~outer_mask;

      /* If tiling and we failed completely above, grab the next one
	 too.  Making sure it doesn't hit an outer loop.  */
      if (tiling)
	{
	  this_mask &= ~(loop->e_mask | loop->mask);
	  unsigned tile_mask = ((this_mask >> 1)
				& ~(outer_mask | loop->e_mask | loop->mask));

	  if (tile_mask || loop->mask)
	    {
	      loop->e_mask |= this_mask;
	      this_mask = tile_mask;
	    }
	  if (!loop->e_mask && noisy)
	    warning_at (loop->loc, 0,
			"insufficient partitioning available"
			" to parallelize element loop");
	}

      loop->mask |= this_mask;
      if (!loop->mask && noisy)
	warning_at (loop->loc, 0,
		    tiling
		    ? G_("insufficient partitioning available"
			 " to parallelize tile loop")
		    : G_("insufficient partitioning available"
			 " to parallelize loop"));
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask, loop->e_mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
					     outer_mask, outer_assign);

  inner_mask |= loop->inner | loop->mask | loop->e_mask;

  return inner_mask;
}
1707
1708 /* Walk the OpenACC loop heirarchy to check and assign partitioning
1709 axes. Return mask of partitioning. */
1710
1711 static unsigned
1712 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1713 {
1714 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1715
1716 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1717 {
1718 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1719 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1720 }
1721 return mask_all;
1722 }
1723
1724 /* Default fork/join early expander. Delete the function calls if
1725 there is no RTL expander. */
1726
1727 bool
1728 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1729 const int *ARG_UNUSED (dims), bool is_fork)
1730 {
1731 if (is_fork)
1732 return targetm.have_oacc_fork ();
1733 else
1734 return targetm.have_oacc_join ();
1735 }
1736
/* Default goacc.reduction early expander.

   LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
   If RES_PTR is not integer-zerop:
       SETUP - emit 'LHS = *RES_PTR', LHS = NULL
       TEARDOWN - emit '*RES_PTR = VAR'
   If LHS is not NULL
       emit 'LHS = VAR'   */

void
default_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  gimple_seq seq = NULL;

  if (code == IFN_GOACC_REDUCTION_SETUP
      || code == IFN_GOACC_REDUCTION_TEARDOWN)
    {
      /* Setup and Teardown need to copy from/to the receiver object,
	 if there is one.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	{
	  tree dst = build_simple_mem_ref (ref_to_res);
	  tree src = var;

	  if (code == IFN_GOACC_REDUCTION_SETUP)
	    {
	      /* SETUP copies the other way round: from the receiver
		 object into the call's LHS, which is then already
		 satisfied -- clear LHS so the copy below is skipped.  */
	      src = dst;
	      dst = lhs;
	      lhs = NULL;
	    }
	  gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
	}
    }

  /* Copy VAR to LHS, if there is an LHS.  */
  if (lhs)
    gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));

  /* Replace the IFN call with the (possibly empty) sequence built
     above.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
1783
/* State passed (via walk_stmt_info.info) to oacc_rewrite_var_decl.  */

struct var_decl_rewrite_info
{
  /* Statement currently being walked (insertion point for any new
     statements).  */
  gimple *stmt;
  /* Map from original VAR_DECL to its replacement.  */
  hash_map<tree, tree> *adjusted_vars;
  /* If true, rewrite ADDR_EXPRs directly instead of going through an
     SSA pointer conversion.  */
  bool avoid_pointer_conversion;
  /* Set whenever any rewrite was performed.  */
  bool modified;
};
1791
/* Helper function for execute_oacc_device_lower.  Rewrite VAR_DECLs (by
   themselves or wrapped in various other nodes) according to ADJUSTED_VARS in
   the var_decl_rewrite_info pointed to via DATA.  Used as part of coercing
   gang-private variables in OpenACC offload regions to reside in GPU shared
   memory.  */

static tree
oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *) data;
  var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info;

  if (TREE_CODE (*tp) == ADDR_EXPR)
    {
      tree arg = TREE_OPERAND (*tp, 0);
      tree *new_arg = info->adjusted_vars->get (arg);

      if (new_arg)
	{
	  if (info->avoid_pointer_conversion)
	    {
	      /* Take the address of the replacement directly.  */
	      *tp = build_fold_addr_expr (*new_arg);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	  else
	    {
	      /* Materialize the replacement's address in an SSA name,
		 convert it to the original pointer type in a second
		 statement, and use the converted value here.  */
	      gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt);
	      tree repl = build_fold_addr_expr (*new_arg);
	      gimple *stmt1
		= gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl);
	      tree conv = convert_to_pointer (TREE_TYPE (*tp),
					      gimple_assign_lhs (stmt1));
	      gimple *stmt2
		= gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv);
	      gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT);
	      gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT);
	      *tp = gimple_assign_lhs (stmt2);
	      info->modified = true;
	      *walk_subtrees = 0;
	    }
	}
    }
  else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF)
    {
      /* Dig down to the base VAR_DECL beneath a chain of
	 component/array references.  */
      tree *base = &TREE_OPERAND (*tp, 0);

      while (TREE_CODE (*base) == COMPONENT_REF
	     || TREE_CODE (*base) == ARRAY_REF)
	base = &TREE_OPERAND (*base, 0);

      if (TREE_CODE (*base) != VAR_DECL)
	return NULL;

      tree *new_decl = info->adjusted_vars->get (*base);
      if (!new_decl)
	return NULL;

      int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl));
      tree field = TREE_OPERAND (*tp, 1);

      /* Adjust the type of the field.  */
      int field_quals = TYPE_QUALS (TREE_TYPE (field));
      if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals)
	{
	  tree *field_type = &TREE_TYPE (field);
	  /* Qualify the element type beneath any array layers.  */
	  while (TREE_CODE (*field_type) == ARRAY_TYPE)
	    field_type = &TREE_TYPE (*field_type);
	  field_quals |= base_quals;
	  *field_type = build_qualified_type (*field_type, field_quals);
	}

      /* Adjust the type of the component ref itself.  */
      tree comp_type = TREE_TYPE (*tp);
      int comp_quals = TYPE_QUALS (comp_type);
      if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals)
	{
	  comp_quals |= base_quals;
	  TREE_TYPE (*tp)
	    = build_qualified_type (comp_type, comp_quals);
	}

      /* Substitute the replacement variable as the base.  */
      *base = *new_decl;
      info->modified = true;
    }
  else if (TREE_CODE (*tp) == VAR_DECL)
    {
      /* A bare variable reference: substitute directly.  */
      tree *new_decl = info->adjusted_vars->get (*tp);
      if (new_decl)
	{
	  *tp = *new_decl;
	  info->modified = true;
	}
    }

  return NULL_TREE;
}
1889
/* Return TRUE if CALL is a call to a builtin atomic/sync operation.  */

static bool
is_sync_builtin_call (gcall *call)
{
  tree callee = gimple_call_fndecl (call);

  if (callee != NULL_TREE
      && gimple_call_builtin_p (call, BUILT_IN_NORMAL))
    switch (DECL_FUNCTION_CODE (callee))
      {
	/* Expand sync-builtins.def into one case label per sync/atomic
	   builtin; matching any of them answers "yes".  */
#undef DEF_SYNC_BUILTIN
#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM:
#include "sync-builtins.def"
#undef DEF_SYNC_BUILTIN
	return true;

      default:
	;
      }

  return false;
}
1913
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  */

static unsigned int
execute_oacc_loop_designation ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      /* Overwrite the option with a self-referencing sentinel so we
	 don't parse it again.  */
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Classify the offloaded function from its attributes; exactly one
     classification must apply.  */
  bool is_oacc_parallel
    = (lookup_attribute ("oacc parallel",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_kernels
    = (lookup_attribute ("oacc kernels",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_serial
    = (lookup_attribute ("oacc serial",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_parallelized
    = (lookup_attribute ("oacc parallel_kernels_parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  bool is_oacc_parallel_kernels_gang_single
    = (lookup_attribute ("oacc parallel_kernels_gang_single",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  int fn_level = oacc_fn_attrib_level (attrs);
  bool is_oacc_routine = (fn_level >= 0);
  gcc_checking_assert (is_oacc_parallel
		       + is_oacc_kernels
		       + is_oacc_serial
		       + is_oacc_parallel_kernels_parallelized
		       + is_oacc_parallel_kernels_gang_single
		       + is_oacc_routine
		       == 1);

  bool is_oacc_kernels_parallelized
    = (lookup_attribute ("oacc kernels parallelized",
			 DECL_ATTRIBUTES (current_function_decl)) != NULL);
  if (is_oacc_kernels_parallelized)
    gcc_checking_assert (is_oacc_kernels);

  if (dump_file)
    {
      if (is_oacc_parallel)
	fprintf (dump_file, "Function is OpenACC parallel offload\n");
      else if (is_oacc_kernels)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 (is_oacc_kernels_parallelized
		  ? "parallelized" : "unparallelized"));
      else if (is_oacc_serial)
	fprintf (dump_file, "Function is OpenACC serial offload\n");
      else if (is_oacc_parallel_kernels_parallelized)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_parallelized");
      else if (is_oacc_parallel_kernels_gang_single)
	fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
		 "parallel_kernels_gang_single");
      else if (is_oacc_routine)
	fprintf (dump_file, "Function is OpenACC routine level %d\n",
		 fn_level);
      else
	gcc_unreachable ();
    }

  /* This doesn't belong into 'pass_oacc_loop_designation' conceptually, but
     it's a convenient place, so...  */
  if (is_oacc_routine)
    {
      tree attr = lookup_attribute ("omp declare target",
				    DECL_ATTRIBUTES (current_function_decl));
      gcc_checking_assert (attr);
      tree clauses = TREE_VALUE (attr);
      gcc_checking_assert (clauses);

      /* Should this OpenACC routine be discarded?  */
      bool discard = false;

      tree clause_nohost = omp_find_clause (clauses, OMP_CLAUSE_NOHOST);
      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %s '%s' clause.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 clause_nohost ? "has" : "doesn't have",
		 omp_clause_code_name[OMP_CLAUSE_NOHOST]);
      /* Host compiler, 'nohost' clause?  */
#ifndef ACCEL_COMPILER
      if (clause_nohost)
	discard = true;
#endif

      if (dump_file)
	fprintf (dump_file,
		 "OpenACC routine '%s' %sdiscarded.\n",
		 lang_hooks.decl_printable_name (current_function_decl, 2),
		 discard ? "" : "not ");
      if (discard)
	{
	  /* Pretend the function has already been output so nothing
	     else tries to emit it.  */
	  TREE_ASM_WRITTEN (current_function_decl) = 1;
	  return TODO_discard_function;
	}
    }

  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
     kernels, so remove the parallelism dimensions function attributes
     potentially set earlier on.  */
  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
    {
      oacc_set_fn_attrib (current_function_decl, NULL, NULL);
      attrs = oacc_get_fn_attrib (current_function_decl);
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();

  unsigned outer_mask = 0;
  if (is_oacc_routine)
    outer_mask = GOMP_DIM_MASK (fn_level) - 1;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  /* OpenACC kernels constructs are special: they currently don't use the
     generic oacc_loop infrastructure and attribute/dimension processing.  */
  if (is_oacc_kernels && is_oacc_kernels_parallelized)
    {
      /* Parallelized OpenACC kernels constructs use gang parallelism.  See
	 also tree-parloops.c:create_parallel_loop.  */
      used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
    }

  int dims[GOMP_DIM_MAX];
  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  /* Verify that for OpenACC 'kernels' decomposed "gang-single" parts we launch
     a single gang only.  */
  if (is_oacc_parallel_kernels_gang_single)
    gcc_checking_assert (dims[GOMP_DIM_GANG] == 1);

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }
  if (dump_enabled_p ())
    {
      oacc_loop *l = loops;
      /* OpenACC kernels constructs are special: they currently don't use the
	 generic oacc_loop infrastructure.  */
      if (is_oacc_kernels)
	{
	  /* Create a fake oacc_loop for diagnostic purposes.  */
	  l = new_oacc_loop_raw (NULL,
				 DECL_SOURCE_LOCATION (current_function_decl));
	  l->mask = used_mask;
	}
      else
	{
	  /* Skip the outermost, dummy OpenACC loop  */
	  l = l->child;
	}
      if (l)
	inform_oacc_loop (l);
      if (is_oacc_kernels)
	free_oacc_loop (l);
    }

  free_oacc_loop (loops);

  return 0;
}
2101
/* Lower OpenACC device-specific internal functions in the current function:
   rewrite IFN_GOACC_TILE, IFN_GOACC_LOOP, IFN_GOACC_REDUCTION and the
   IFN_UNIQUE markers into target-specific code sequences, and rewrite
   references to gang-private variables that the target chose to relocate.
   Returns 0 always; does nothing for functions without an OpenACC function
   attribute.  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Snapshot the per-axis launch dimensions recorded on this function.  */
  int dims[GOMP_DIM_MAX];
  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);

  /* Maps each privatized variable decl to the replacement decl produced by
     targetm.goacc.adjust_private_decl, for the rewrite loop further down.  */
  hash_map<tree, tree> adjusted_vars;

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  The transformations below may replace the
	   call with several statements; stepping back one statement lets the
	   loop revisit whatever was emitted in its place.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_TILE:
	    oacc_xform_tile (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      /* Argument 0 of an IFN_UNIQUE call selects its sub-kind.  */
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  break;

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* A -1 axis (argument 2) means an unused partitioning
		     level; otherwise let the target decide whether the
		     fork/join marker expands to anything.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  /* Pure bookkeeping markers; always dropped here.  */
		  remove = true;
		  break;

		case IFN_UNIQUE_OACC_PRIVATE:
		  {
		    dump_flags_t l_dump_flags
		      = get_openacc_privatization_dump_flags ();

		    /* Fall back to the function's location when the call
		       itself has none, so diagnostics stay anchored.  */
		    location_t loc = gimple_location (stmt);
		    if (LOCATION_LOCUS (loc) == UNKNOWN_LOCATION)
		      loc = DECL_SOURCE_LOCATION (current_function_decl);
		    const dump_user_location_t d_u_loc
		      = dump_user_location_t::from_location_t (loc);

		    /* Argument 2 is the parallelism level; -1 means unknown.  */
		    HOST_WIDE_INT level
		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
		    gcc_checking_assert (level == -1
					 || (level >= 0
					     && level < GOMP_DIM_MAX));
		    /* Arguments 3.. are the addresses of the variables being
		       privatized at this level.  */
		    for (unsigned i = 3;
			 i < gimple_call_num_args (call);
			 i++)
		      {
			static char const *const axes[] =
			/* Must be kept in sync with GOMP_DIM enumeration.  */
			  { "gang", "worker", "vector" };

			tree arg = gimple_call_arg (call, i);
			gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR);
			tree decl = TREE_OPERAND (arg, 0);
			if (dump_enabled_p ())
			/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> ought to be"
					   " adjusted for OpenACC"
					   " privatization level: %qs\n",
					   decl,
					   (level == -1
					    ? "UNKNOWN" : axes[level]));
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
			/* Decide whether the target actually relocated the
			   variable; record the mapping when it did.  */
			bool adjusted;
			if (level == -1)
			  adjusted = false;
			else if (!targetm.goacc.adjust_private_decl)
			  adjusted = false;
			else if (level == GOMP_DIM_VECTOR)
			  {
			    /* That's the default behavior.  */
			    adjusted = true;
			  }
			else
			  {
			    tree oldtype = TREE_TYPE (decl);
			    tree newdecl
			      = targetm.goacc.adjust_private_decl (loc, decl,
								   level);
			    adjusted = (TREE_TYPE (newdecl) != oldtype
					|| newdecl != decl);
			    if (adjusted)
			      adjusted_vars.put (decl, newdecl);
			  }
			if (adjusted
			    && dump_enabled_p ())
			/* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
			  dump_printf_loc (l_dump_flags, d_u_loc,
					   "variable %<%T%> adjusted for"
					   " OpenACC privatization level:"
					   " %qs\n",
					   decl, axes[level]);
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
		      }
		    remove = true;
		  }
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    /* Detach the call from the virtual def/use chain before it
	       disappears.  */
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  /* Regarding the OpenACC privatization level, we're currently only looking at
     making the gang-private level work.  Regarding that, we have the following
     configurations:

       - GCN offloading: 'targetm.goacc.adjust_private_decl' does the work (in
	 particular, change 'TREE_TYPE', etc.) and there is no
	 'targetm.goacc.expand_var_decl'.

       - nvptx offloading: 'targetm.goacc.adjust_private_decl' only sets a
	 marker and then 'targetm.goacc.expand_var_decl' does the work.

     Eventually (in particular, for worker-private level?), both
     'targetm.goacc.adjust_private_decl' and 'targetm.goacc.expand_var_decl'
     may need to do things, but that's currently not meant to be addressed, and
     thus not fully worked out and implemented, and thus untested.  Hence,
     'assert' what currently is implemented/tested, only.  */

  if (targetm.goacc.expand_var_decl)
    gcc_assert (adjusted_vars.is_empty ());

  /* Make adjustments to gang-private local variables if required by the
     target, e.g. forcing them into a particular address space.  Afterwards,
     ADDR_EXPR nodes which have adjusted variables as their argument need to
     be modified in one of two ways:

       1. They can be recreated, making a pointer to the variable in the new
	  address space, or

       2. The address of the variable in the new address space can be taken,
	  converted to the default (original) address space, and the result of
	  that conversion substituted in place of the original ADDR_EXPR node.

     Which of these is done depends on the gimple statement being processed.
     At present atomic operations and inline asms use (1), and everything else
     uses (2).  At least on AMD GCN, there are atomic operations that work
     directly in the LDS address space.

     COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use
     the new decl, adjusting types of appropriate tree nodes as necessary.  */

  if (targetm.goacc.adjust_private_decl
      && !adjusted_vars.is_empty ())
    {
      FOR_ALL_BB_FN (bb, cfun)
	for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	     !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gimple *stmt = gsi_stmt (gsi);
	    walk_stmt_info wi;
	    var_decl_rewrite_info info;

	    info.avoid_pointer_conversion
	      = (is_gimple_call (stmt)
		 && is_sync_builtin_call (as_a <gcall *> (stmt)))
		|| gimple_code (stmt) == GIMPLE_ASM;
	    info.stmt = stmt;
	    info.modified = false;
	    info.adjusted_vars = &adjusted_vars;

	    memset (&wi, 0, sizeof (wi));
	    wi.info = &info;

	    walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi);

	    if (info.modified)
	      update_stmt (stmt);
	  }
    }

  return 0;
}
2380
2381 /* Default launch dimension validator. Force everything to 1. A
2382 backend that wants to provide larger dimensions must override this
2383 hook. */
2384
2385 bool
2386 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
2387 int ARG_UNUSED (fn_level),
2388 unsigned ARG_UNUSED (used))
2389 {
2390 bool changed = false;
2391
2392 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
2393 {
2394 if (dims[ix] != 1)
2395 {
2396 dims[ix] = 1;
2397 changed = true;
2398 }
2399 }
2400
2401 return changed;
2402 }
2403
/* Default dimension bound is unknown on accelerator and 1 on host.  */

int
default_goacc_dim_limit (int axis)
{
  /* The requested axis does not influence the default bound.  */
  (void) axis;
#ifdef ACCEL_COMPILER
  /* Offload compiler: the limit is not known at compile time.  */
  return 0;
#else
  /* Host fallback: a single instance per axis.  */
  return 1;
#endif
}
2415
namespace {

/* Pass descriptor for the OpenACC loop-designation pass below.  */

const pass_data pass_data_oacc_loop_designation =
{
  GIMPLE_PASS, /* type */
  "oaccloops", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

/* Pass wrapper that runs execute_oacc_loop_designation whenever OpenACC
   is enabled.  */

class pass_oacc_loop_designation : public gimple_opt_pass
{
public:
  pass_oacc_loop_designation (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
  {
    return execute_oacc_loop_designation ();
  }

}; // class pass_oacc_loop_designation

/* Pass descriptor for the OpenACC device-lowering pass below.  */

const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

/* Pass wrapper that runs execute_oacc_device_lower whenever OpenACC is
   enabled.  */

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
  {
    return execute_oacc_device_lower ();
  }

}; // class pass_oacc_device_lower

} // anon namespace
2479
2480 gimple_opt_pass *
2481 make_pass_oacc_loop_designation (gcc::context *ctxt)
2482 {
2483 return new pass_oacc_loop_designation (ctxt);
2484 }
2485
2486 gimple_opt_pass *
2487 make_pass_oacc_device_lower (gcc::context *ctxt)
2488 {
2489 return new pass_oacc_device_lower (ctxt);
2490 }
2491
2492 \f
/* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
   GOMP_SIMT_ENTER call identifying the privatized variables, which are
   turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
   Set *REGIMPLIFY to true, except if no privatized variables were seen.  */

static void
ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
{
  gimple *alloc_stmt = gsi_stmt (*gsi);
  tree simtrec = gimple_call_lhs (alloc_stmt);
  /* Argument 0 of the ALLOC call is the SSA name defined by the matching
     GOMP_SIMT_ENTER call; follow it to find that call.  */
  tree simduid = gimple_call_arg (alloc_stmt, 0);
  gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
  gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
  /* Build an artificial record type to hold all privatized variables.  */
  tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
  TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
  TREE_ADDRESSABLE (rectype) = 1;
  TREE_TYPE (simtrec) = build_pointer_type (rectype);
  /* Arguments 1.. of the ENTER call are the addresses of the privatized
     variables; turn each into a field of RECTYPE and redirect the variable
     through a DECL_VALUE_EXPR into that field.  */
  for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
    {
      tree *argp = gimple_call_arg_ptr (enter_stmt, i);
      if (*argp == null_pointer_node)
	continue;
      gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
		  && VAR_P (TREE_OPERAND (*argp, 0)));
      tree var = TREE_OPERAND (*argp, 0);

      /* The field mirrors the variable's name, type, alignment and
	 volatility.  */
      tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
			       DECL_NAME (var), TREE_TYPE (var));
      SET_DECL_ALIGN (field, DECL_ALIGN (var));
      DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
      TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);

      insert_field_into_struct (rectype, field);

      /* VAR now stands for SIMTREC->FIELD.  */
      tree t = build_simple_mem_ref (simtrec);
      t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL);
      TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
      SET_DECL_VALUE_EXPR (var, t);
      DECL_HAS_VALUE_EXPR_P (var) = 1;
      *regimplify = true;
    }
  layout_type (rectype);
  tree size = TYPE_SIZE_UNIT (rectype);
  tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));

  /* Replace the ALLOC call with one that requests the now-known size and
     alignment of the record.  */
  alloc_stmt
    = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
  gimple_call_set_lhs (alloc_stmt, simtrec);
  gsi_replace (gsi, alloc_stmt, false);
  /* The ENTER call is no longer needed; keep SIMDUID defined by turning it
     into a plain copy of the ENTER call's first argument.  */
  gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
  enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
  gsi_replace (&enter_gsi, enter_stmt, false);

  /* Clobber the record before the matching GOMP_SIMT_EXIT, if any, so its
     lifetime ends there.  */
  use_operand_p use;
  gimple *exit_stmt;
  if (single_imm_use (simtrec, &use, &exit_stmt))
    {
      gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
      gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
      tree clobber = build_clobber (rectype);
      exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
      gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
    }
  else
    gcc_checking_assert (has_zero_uses (simtrec));
}
2559
2560 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2561
2562 static tree
2563 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2564 {
2565 tree t = *tp;
2566
2567 if (VAR_P (t)
2568 && DECL_HAS_VALUE_EXPR_P (t)
2569 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2570 {
2571 *walk_subtrees = 0;
2572 return t;
2573 }
2574 return NULL_TREE;
2575 }
2576
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 means "not a SIMT target"; all SIMT placeholders then fold to
     trivial values.  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  bool regimplify = false;
  basic_block bb;
  gimple_stmt_iterator gsi;
  bool calls_declare_variant_alt
    = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  continue;
	if (!gimple_call_internal_p (stmt))
	  {
	    /* Non-internal calls: possibly redirect 'declare variant'
	       calls to the resolved variant.  */
	    if (calls_declare_variant_alt)
	      if (tree fndecl = gimple_call_fndecl (stmt))
		{
		  tree new_fndecl = omp_resolve_declare_variant (fndecl);
		  if (new_fndecl != fndecl)
		    {
		      gimple_call_set_fndecl (stmt, new_fndecl);
		      update_stmt (stmt);
		    }
		}
	    continue;
	  }
	/* RHS == NULL_TREE at the end of the switch means "keep the call";
	   otherwise the call is replaced by LHS = RHS (or a nop).  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_ENTER:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_ENTER_ALLOC:
	    /* On SIMT targets, materialize the privatization record
	       first; see ompdevlow_adjust_simt_enter.  */
	    if (vf != 1)
	      ompdevlow_adjust_simt_enter (&gsi, &regimplify);
	    rhs = vf == 1 ? null_pointer_node : NULL_TREE;
	    goto simtreg_enter_exit;
	  case IFN_GOMP_SIMT_EXIT:
	  simtreg_enter_exit:
	    /* Shared tail for the three cases above: on SIMT targets keep
	       the call as-is; on non-SIMT targets drop its virtual def.  */
	    if (vf != 1)
	      continue;
	    unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders fold the other way round: only on SIMT
	       targets (vf != 1).  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* Re-gimplify statements touching SIMT-privatized variables whose
     DECL_VALUE_EXPR was installed above; walk backwards so removals do not
     disturb the iteration.  */
  if (regimplify)
    FOR_EACH_BB_REVERSE_FN (bb, cfun)
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
	if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
	  {
	    if (gimple_clobber_p (gsi_stmt (gsi)))
	      gsi_remove (&gsi, true);
	    else
	      gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	  }
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
2679
namespace {

/* Pass descriptor for the OpenMP device-lowering pass below.  */

const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

/* Pass wrapper that runs execute_omp_device_lower.  */

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run when the function lacks PROP_gimple_lomp_dev, or when OpenMP
     'declare variant' calls remain to be resolved.  */
  virtual bool gate (function *fun)
  {
    return (!(fun->curr_properties & PROP_gimple_lomp_dev)
	    || (flag_openmp
		&& cgraph_node::get (fun->decl)->calls_declare_variant_alt));
  }
  virtual unsigned int execute (function *)
  {
    return execute_omp_device_lower ();
  }

}; // class pass_omp_device_lower

} // anon namespace
2717
2718 gimple_opt_pass *
2719 make_pass_omp_device_lower (gcc::context *ctxt)
2720 {
2721 return new pass_omp_device_lower (ctxt);
2722 }
2723
/* "omp declare target link" handling pass.  */

namespace {

/* Pass descriptor for the 'omp declare target link' pass below.  */

const pass_data pass_data_omp_target_link =
{
  GIMPLE_PASS, /* type */
  "omptargetlink", /* name */
  OPTGROUP_OMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

/* Pass that, on the accelerator compiler only, fixes up references to
   'omp declare target link' variables in offloaded functions.  */

class pass_omp_target_link : public gimple_opt_pass
{
public:
  pass_omp_target_link (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_target_link, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *fun)
  {
#ifdef ACCEL_COMPILER
    return offloading_function_p (fun->decl);
#else
    (void) fun;
    return false;
#endif
  }

  virtual unsigned execute (function *);
};

/* Callback for walk_gimple_stmt used to scan for link var operands.  */

static tree
find_link_var_op (tree *tp, int *walk_subtrees, void *)
{
  tree t = *tp;

  /* Look for global variables with a value-expr and the
     "omp declare target link" attribute.  */
  if (VAR_P (t)
      && DECL_HAS_VALUE_EXPR_P (t)
      && is_global_var (t)
      && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
    {
      *walk_subtrees = 0;
      return t;
    }

  return NULL_TREE;
}

/* Main pass body: scan every statement of FUN, nullifying the device
   argument of GOMP_target calls and re-gimplifying statements that
   reference 'declare target link' variables.  */

unsigned
pass_omp_target_link::execute (function *fun)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  if (gimple_call_builtin_p (gsi_stmt (gsi), BUILT_IN_GOMP_TARGET))
	    {
	      /* Nullify the second argument of __builtin_GOMP_target_ext.  */
	      gimple_call_set_arg (gsi_stmt (gsi), 1, null_pointer_node);
	      update_stmt (gsi_stmt (gsi));
	    }
	  if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
	    gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
	}
    }

  return 0;
}

} // anon namespace
2805
2806 gimple_opt_pass *
2807 make_pass_omp_target_link (gcc::context *ctxt)
2808 {
2809 return new pass_omp_target_link (ctxt);
2810 }