1 /* Bits of OpenMP and OpenACC handling that are specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
4 Copyright (C) 2005-2020 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "pretty-print.h"
33 #include "diagnostic-core.h"
34 #include "fold-const.h"
35 #include "internal-fn.h"
36 #include "langhooks.h"
37 #include "gimplify.h"
38 #include "gimple-iterator.h"
39 #include "gimplify-me.h"
40 #include "gimple-walk.h"
41 #include "tree-cfg.h"
42 #include "tree-into-ssa.h"
43 #include "tree-nested.h"
44 #include "stor-layout.h"
45 #include "common/common-target.h"
46 #include "omp-general.h"
47 #include "omp-offload.h"
48 #include "lto-section-names.h"
49 #include "gomp-constants.h"
50 #include "gimple-pretty-print.h"
51 #include "intl.h"
52 #include "stringpool.h"
53 #include "attribs.h"
54 #include "cfgloop.h"
55 #include "context.h"
56
57 /* Describe the OpenACC looping structure of a function. The entire
58 function is held in a 'NULL' loop. */
59
60 struct oacc_loop
61 {
62 oacc_loop *parent; /* Containing loop. */
63
64 oacc_loop *child; /* First inner loop. */
65
66 oacc_loop *sibling; /* Next loop within same parent. */
67
68 location_t loc; /* Location of the loop start. */
69
70 gcall *marker; /* Initial head marker. */
71
72 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
73 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
74
75 tree routine; /* Pseudo-loop enclosing a routine. */
76
77 unsigned mask; /* Partitioning mask. */
78 unsigned e_mask; /* Partitioning of element loops (when tiling). */
79 unsigned inner; /* Partitioning of inner loops. */
80 unsigned flags; /* Partitioning flags. */
81 vec<gcall *> ifns; /* Contained loop abstraction functions. */
82 tree chunk_size; /* Chunk size. */
83 gcall *head_end; /* Final marker of head sequence. */
84 };
85
86 /* Holds offload tables with decls. */
87 vec<tree, va_gc> *offload_funcs, *offload_vars;
88
89 /* Return the level at which an OpenACC routine may spawn a partitioned
90 loop, or -1 if it is not a routine (i.e. is an offload fn). */
91
92 int
93 oacc_fn_attrib_level (tree attr)
94 {
95 tree pos = TREE_VALUE (attr);
96
97 if (!TREE_PURPOSE (pos))
98 return -1;
99
100 int ix = 0;
101 for (ix = 0; ix != GOMP_DIM_MAX;
102 ix++, pos = TREE_CHAIN (pos))
103 if (!integer_zerop (TREE_PURPOSE (pos)))
104 break;
105
106 return ix;
107 }
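
/* Illustrative example (the usual GOMP_DIM_* ordering gang = 0,
   worker = 1, vector = 2 is assumed): for a routine declared with
   '#pragma acc routine vector', the gang and worker entries of the
   attribute's dimension list carry a zero TREE_PURPOSE, so the loop
   above stops at index 2 and the level is 2.  A 'seq' routine has no
   non-zero entry and yields GOMP_DIM_MAX; an offload region itself
   has no TREE_PURPOSE at all and yields -1.  */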
108
109 /* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
110 adds their addresses and sizes to constructor-vector V_CTOR. */
111
112 static void
113 add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
114 vec<constructor_elt, va_gc> *v_ctor)
115 {
116 unsigned len = vec_safe_length (v_decls);
117 for (unsigned i = 0; i < len; i++)
118 {
119 tree it = (*v_decls)[i];
120 bool is_var = VAR_P (it);
121 bool is_link_var
122 = is_var
123 #ifdef ACCEL_COMPILER
124 && DECL_HAS_VALUE_EXPR_P (it)
125 #endif
126 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
127
128 tree size = NULL_TREE;
129 if (is_var)
130 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
131
132 tree addr;
133 if (!is_link_var)
134 addr = build_fold_addr_expr (it);
135 else
136 {
137 #ifdef ACCEL_COMPILER
138 /* For "omp declare target link" vars add address of the pointer to
139 the target table, instead of address of the var. */
140 tree value_expr = DECL_VALUE_EXPR (it);
141 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
142 varpool_node::finalize_decl (link_ptr_decl);
143 addr = build_fold_addr_expr (link_ptr_decl);
144 #else
145 addr = build_fold_addr_expr (it);
146 #endif
147
148 /* Most significant bit of the size marks "omp declare target link"
149 vars in host and target tables. */
150 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
151 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
152 * BITS_PER_UNIT - 1);
153 size = wide_int_to_tree (const_ptr_type_node, isize);
154 }
155
156 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
157 if (is_var)
158 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
159 }
160 }
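
/* A sketch of the resulting table layout (illustrative): the function
   table is a flat vector of addresses, while the variable table
   interleaves (address, size) pairs:

     func table: { &fn0, &fn1, ... }
     var table:  { &var0, size0, &var1, size1, ... }

   For an "omp declare target link" variable on a 64-bit target, the
   recorded address is that of the link pointer (on the accel side) and
   the recorded size has its most significant bit set, i.e.
   size |= 1ULL << 63, which is how the runtime distinguishes link
   variables from ordinary ones.  */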
161
162 /* Return true if DECL is a function whose references should be
163 analyzed. */
164
165 static bool
166 omp_declare_target_fn_p (tree decl)
167 {
168 return (TREE_CODE (decl) == FUNCTION_DECL
169 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
170 && !lookup_attribute ("omp declare target host",
171 DECL_ATTRIBUTES (decl))
172 && (!flag_openacc
173 || oacc_get_fn_attrib (decl) == NULL_TREE));
174 }
175
176 /* Return true if DECL is a variable whose initializer references
177 should be analyzed. */
178
179 static bool
180 omp_declare_target_var_p (tree decl)
181 {
182 return (VAR_P (decl)
183 && lookup_attribute ("omp declare target", DECL_ATTRIBUTES (decl))
184 && !lookup_attribute ("omp declare target link",
185 DECL_ATTRIBUTES (decl)));
186 }
187
188 /* Helper function for omp_discover_implicit_declare_target, called through
189 walk_tree. Mark referenced FUNCTION_DECLs implicitly as
190 declare target to. */
191
192 static tree
193 omp_discover_declare_target_tgt_fn_r (tree *tp, int *walk_subtrees, void *data)
194 {
195 if (TREE_CODE (*tp) == FUNCTION_DECL
196 && !omp_declare_target_fn_p (*tp)
197 && !lookup_attribute ("omp declare target host", DECL_ATTRIBUTES (*tp)))
198 {
199 tree id = get_identifier ("omp declare target");
200 if (!DECL_EXTERNAL (*tp) && DECL_SAVED_TREE (*tp))
201 ((vec<tree> *) data)->safe_push (*tp);
202 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
203 symtab_node *node = symtab_node::get (*tp);
204 if (node != NULL)
205 {
206 node->offloadable = 1;
207 if (ENABLE_OFFLOADING)
208 g->have_offload = true;
209 }
210 }
211 else if (TYPE_P (*tp))
212 *walk_subtrees = 0;
213 /* else if (TREE_CODE (*tp) == OMP_TARGET)
214 {
215 if (tree dev = omp_find_clause (OMP_TARGET_CLAUSES (*tp)))
216 if (OMP_DEVICE_ANCESTOR (dev))
217 *walk_subtrees = 0;
218 } */
219 return NULL_TREE;
220 }
221
222 /* Similarly, but ignore references outside of OMP_TARGET regions. */
223
224 static tree
225 omp_discover_declare_target_fn_r (tree *tp, int *walk_subtrees, void *data)
226 {
227 if (TREE_CODE (*tp) == OMP_TARGET)
228 {
229 /* And not OMP_DEVICE_ANCESTOR. */
230 walk_tree_without_duplicates (&OMP_TARGET_BODY (*tp),
231 omp_discover_declare_target_tgt_fn_r,
232 data);
233 *walk_subtrees = 0;
234 }
235 else if (TYPE_P (*tp))
236 *walk_subtrees = 0;
237 return NULL_TREE;
238 }
239
240 /* Helper function for omp_discover_implicit_declare_target, called through
241 walk_tree. Mark referenced FUNCTION_DECLs and global VAR_DECLs
242 implicitly as declare target to. */
243
244 static tree
245 omp_discover_declare_target_var_r (tree *tp, int *walk_subtrees, void *data)
246 {
247 if (TREE_CODE (*tp) == FUNCTION_DECL)
248 return omp_discover_declare_target_tgt_fn_r (tp, walk_subtrees, data);
249 else if (VAR_P (*tp)
250 && is_global_var (*tp)
251 && !omp_declare_target_var_p (*tp))
252 {
253 tree id = get_identifier ("omp declare target");
254 if (lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp)))
255 {
256 error_at (DECL_SOURCE_LOCATION (*tp),
257 "%qD specified both in declare target %<link%> and "
258 "implicitly in %<to%> clauses", *tp);
259 DECL_ATTRIBUTES (*tp)
260 = remove_attribute ("omp declare target link", DECL_ATTRIBUTES (*tp));
261 }
262 if (TREE_STATIC (*tp) && DECL_INITIAL (*tp))
263 ((vec<tree> *) data)->safe_push (*tp);
264 DECL_ATTRIBUTES (*tp) = tree_cons (id, NULL_TREE, DECL_ATTRIBUTES (*tp));
265 symtab_node *node = symtab_node::get (*tp);
266 if (node != NULL && !node->offloadable)
267 {
268 node->offloadable = 1;
269 if (ENABLE_OFFLOADING)
270 {
271 g->have_offload = true;
272 if (is_a <varpool_node *> (node))
273 vec_safe_push (offload_vars, node->decl);
274 }
275 }
276 }
277 else if (TYPE_P (*tp))
278 *walk_subtrees = 0;
279 return NULL_TREE;
280 }
281
282 /* Perform the OpenMP implicit 'declare target to' discovery. */
283
284 void
285 omp_discover_implicit_declare_target (void)
286 {
287 cgraph_node *node;
288 varpool_node *vnode;
289 auto_vec<tree> worklist;
290
291 FOR_EACH_DEFINED_FUNCTION (node)
292 if (DECL_SAVED_TREE (node->decl))
293 {
294 if (omp_declare_target_fn_p (node->decl))
295 worklist.safe_push (node->decl);
296 else if (DECL_STRUCT_FUNCTION (node->decl)
297 && DECL_STRUCT_FUNCTION (node->decl)->has_omp_target)
298 worklist.safe_push (node->decl);
299 }
300 FOR_EACH_STATIC_INITIALIZER (vnode)
301 if (omp_declare_target_var_p (vnode->decl))
302 worklist.safe_push (vnode->decl);
303 while (!worklist.is_empty ())
304 {
305 tree decl = worklist.pop ();
306 if (VAR_P (decl))
307 walk_tree_without_duplicates (&DECL_INITIAL (decl),
308 omp_discover_declare_target_var_r,
309 &worklist);
310 else if (omp_declare_target_fn_p (decl))
311 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
312 omp_discover_declare_target_tgt_fn_r,
313 &worklist);
314 else
315 walk_tree_without_duplicates (&DECL_SAVED_TREE (decl),
316 omp_discover_declare_target_fn_r,
317 &worklist);
318 }
319 }
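
/* Illustrative user code for this discovery (hypothetical example):

     int f (int x) { return x + 1; }      // no explicit directive
     #pragma omp declare target
     int g (int x) { return f (x); }      // f is reachable from g
     #pragma omp end declare target

   Since g refers to f, f is implicitly marked "omp declare target" and
   pushed onto the worklist so that its own body is scanned in turn.
   For a function that is not itself declare target but contains
   'omp target' regions, only the statements inside those regions are
   scanned.  */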
320
321
322 /* Create new symbols containing (address, size) pairs for global variables
323 marked with the "omp declare target" attribute, as well as addresses for
324 functions that are outlined offloading regions. */
325 void
326 omp_finish_file (void)
327 {
328 unsigned num_funcs = vec_safe_length (offload_funcs);
329 unsigned num_vars = vec_safe_length (offload_vars);
330
331 if (num_funcs == 0 && num_vars == 0)
332 return;
333
334 if (targetm_common.have_named_sections)
335 {
336 vec<constructor_elt, va_gc> *v_f, *v_v;
337 vec_alloc (v_f, num_funcs);
338 vec_alloc (v_v, num_vars * 2);
339
340 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
341 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
342
343 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
344 num_vars * 2);
345 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
346 num_funcs);
347 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
348 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
349 tree ctor_v = build_constructor (vars_decl_type, v_v);
350 tree ctor_f = build_constructor (funcs_decl_type, v_f);
351 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
352 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
353 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
354 get_identifier (".offload_func_table"),
355 funcs_decl_type);
356 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
357 get_identifier (".offload_var_table"),
358 vars_decl_type);
359 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
360 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
361 otherwise a joint table in a binary will contain padding between
362 tables from multiple object files. */
363 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
364 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
365 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
366 DECL_INITIAL (funcs_decl) = ctor_f;
367 DECL_INITIAL (vars_decl) = ctor_v;
368 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
369 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
370
371 varpool_node::finalize_decl (vars_decl);
372 varpool_node::finalize_decl (funcs_decl);
373 }
374 else
375 {
376 for (unsigned i = 0; i < num_funcs; i++)
377 {
378 tree it = (*offload_funcs)[i];
379 targetm.record_offload_symbol (it);
380 }
381 for (unsigned i = 0; i < num_vars; i++)
382 {
383 tree it = (*offload_vars)[i];
384 #ifdef ACCEL_COMPILER
385 if (DECL_HAS_VALUE_EXPR_P (it)
386 && lookup_attribute ("omp declare target link",
387 DECL_ATTRIBUTES (it)))
388 {
389 tree value_expr = DECL_VALUE_EXPR (it);
390 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
391 targetm.record_offload_symbol (link_ptr_decl);
392 varpool_node::finalize_decl (link_ptr_decl);
393 }
394 else
395 #endif
396 targetm.record_offload_symbol (it);
397 }
398 }
399 }
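
/* Sketch of the emitted symbols (illustrative; the outlined-function
   name is a hypothetical example):

     .offload_func_table = { (uintptr_t) &foo._omp_fn.0, ... }
     .offload_var_table  = { (uintptr_t) &gvar, sizeof (gvar), ... }

   both placed in their dedicated sections (OFFLOAD_FUNC_TABLE_SECTION_NAME
   and OFFLOAD_VAR_TABLE_SECTION_NAME) so that the linker concatenates
   the per-object fragments into one joint table per binary.  */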
400
401 /* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
402 axis DIM. Return a tmp var holding the result. */
403
404 static tree
405 oacc_dim_call (bool pos, int dim, gimple_seq *seq)
406 {
407 tree arg = build_int_cst (unsigned_type_node, dim);
408 tree size = create_tmp_var (integer_type_node);
409 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
410 gimple *call = gimple_build_call_internal (fn, 1, arg);
411
412 gimple_call_set_lhs (call, size);
413 gimple_seq_add_stmt (seq, call);
414
415 return size;
416 }
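
/* The emitted GIMPLE is a single internal-function call, e.g.
   (illustrative dump syntax, assuming worker == 1):

     _1 = .GOACC_DIM_POS (1);    <-- this thread's worker index
     _2 = .GOACC_DIM_SIZE (1);   <-- extent of the worker axis  */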
417
418 /* Find the number of threads (POS = false), or thread number (POS =
419 true) for an OpenACC region partitioned as MASK. Setup code
420 required for the calculation is added to SEQ. */
421
422 static tree
423 oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
424 {
425 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
426 unsigned ix;
427
428 /* Start at gang level, and examine relevant dimension indices. */
429 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
430 if (GOMP_DIM_MASK (ix) & mask)
431 {
432 if (res)
433 {
434 /* We had an outer index, so scale that by the size of
435 this dimension. */
436 tree n = oacc_dim_call (false, ix, seq);
437 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
438 }
439 if (pos)
440 {
441 /* Determine index in this dimension. */
442 tree id = oacc_dim_call (true, ix, seq);
443 if (res)
444 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
445 else
446 res = id;
447 }
448 }
449
450 if (res == NULL_TREE)
451 res = integer_zero_node;
452
453 return res;
454 }
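
/* Worked example (illustrative): for MASK covering gang and worker with
   dimension sizes G and W, the sequence built above computes

     size (POS = false): G * W
     pos  (POS = true):  gang_pos * W + worker_pos

   i.e. a row-major linearization of the active axes, with the outermost
   (gang) axis varying slowest.  */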
455
456 /* Transform IFN_GOACC_LOOP calls to actual code. See
457 expand_oacc_for for where these are generated. At the vector
458 level, we stride loops, such that each member of a warp will
459 operate on adjacent iterations. At the worker and gang level,
460 each gang/warp executes a set of contiguous iterations. Chunking
461 can override this such that each iteration engine executes a
462 contiguous chunk, and then moves on to stride to the next chunk. */
463
464 static void
465 oacc_xform_loop (gcall *call)
466 {
467 gimple_stmt_iterator gsi = gsi_for_stmt (call);
468 enum ifn_goacc_loop_kind code
469 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
470 tree dir = gimple_call_arg (call, 1);
471 tree range = gimple_call_arg (call, 2);
472 tree step = gimple_call_arg (call, 3);
473 tree chunk_size = NULL_TREE;
474 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
475 tree lhs = gimple_call_lhs (call);
476 tree type = NULL_TREE;
477 tree diff_type = TREE_TYPE (range);
478 tree r = NULL_TREE;
479 gimple_seq seq = NULL;
480 bool chunking = false, striding = true;
481 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
482 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
483
484 /* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
485 if (!lhs)
486 {
487 gsi_replace_with_seq (&gsi, seq, true);
488 return;
489 }
490
491 type = TREE_TYPE (lhs);
492
493 #ifdef ACCEL_COMPILER
494 chunk_size = gimple_call_arg (call, 4);
495 if (integer_minus_onep (chunk_size) /* Force static allocation. */
496 || integer_zerop (chunk_size)) /* Default (also static). */
497 {
498 /* If we're at the gang level, we want each to execute a
499 contiguous run of iterations. Otherwise we want each element
500 to stride. */
501 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
502 chunking = false;
503 }
504 else
505 {
506 /* Chunk of size 1 is striding. */
507 striding = integer_onep (chunk_size);
508 chunking = !striding;
509 }
510 #endif
511
512 /* striding=true, chunking=true
513 -> invalid.
514 striding=true, chunking=false
515 -> chunks=1
516 striding=false,chunking=true
517 -> chunks=ceil (range/(chunksize*threads*step))
518 striding=false,chunking=false
519 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
520 push_gimplify_context (true);
521
522 switch (code)
523 {
524 default: gcc_unreachable ();
525
526 case IFN_GOACC_LOOP_CHUNKS:
527 if (!chunking)
528 r = build_int_cst (type, 1);
529 else
530 {
531 /* chunk_max = (range - dir + per) / per,
532 where per = chunk_size * step * num_threads. */
533 tree per = oacc_thread_numbers (false, mask, &seq);
534 per = fold_convert (type, per);
535 chunk_size = fold_convert (type, chunk_size);
536 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
537 per = fold_build2 (MULT_EXPR, type, per, step);
538 r = build2 (MINUS_EXPR, type, range, dir);
539 r = build2 (PLUS_EXPR, type, r, per);
540 r = build2 (TRUNC_DIV_EXPR, type, r, per);
541 }
542 break;
543
544 case IFN_GOACC_LOOP_STEP:
545 {
546 /* If striding, step by the entire compute volume, otherwise
547 step by the inner volume. */
548 unsigned volume = striding ? mask : inner_mask;
549
550 r = oacc_thread_numbers (false, volume, &seq);
551 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
552 }
553 break;
554
555 case IFN_GOACC_LOOP_OFFSET:
556 /* Enable vectorization on non-SIMT targets. */
557 if (!targetm.simt.vf
558 && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
559 /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
560 the loop. */
561 && (flag_tree_loop_vectorize
562 || !global_options_set.x_flag_tree_loop_vectorize))
563 {
564 basic_block bb = gsi_bb (gsi);
565 class loop *parent = bb->loop_father;
566 class loop *body = parent->inner;
567
568 parent->force_vectorize = true;
569 parent->safelen = INT_MAX;
570
571 /* "Chunking loops" may have inner loops. */
572 if (parent->inner)
573 {
574 body->force_vectorize = true;
575 body->safelen = INT_MAX;
576 }
577
578 cfun->has_force_vectorize_loops = true;
579 }
580 if (striding)
581 {
582 r = oacc_thread_numbers (true, mask, &seq);
583 r = fold_convert (diff_type, r);
584 }
585 else
586 {
587 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
588 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
589 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
590 inner_size, outer_size);
591
592 volume = fold_convert (diff_type, volume);
593 if (chunking)
594 chunk_size = fold_convert (diff_type, chunk_size);
595 else
596 {
597 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
598
599 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
600 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
601 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
602 }
603
604 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
605 fold_convert (diff_type, inner_size));
606 r = oacc_thread_numbers (true, outer_mask, &seq);
607 r = fold_convert (diff_type, r);
608 r = build2 (MULT_EXPR, diff_type, r, span);
609
610 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
611 inner = fold_convert (diff_type, inner);
612 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
613
614 if (chunking)
615 {
616 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
617 tree per
618 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
619 per = build2 (MULT_EXPR, diff_type, per, chunk);
620
621 r = build2 (PLUS_EXPR, diff_type, r, per);
622 }
623 }
624 r = fold_build2 (MULT_EXPR, diff_type, r, step);
625 if (type != diff_type)
626 r = fold_convert (type, r);
627 break;
628
629 case IFN_GOACC_LOOP_BOUND:
630 if (striding)
631 r = range;
632 else
633 {
634 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
635 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
636 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
637 inner_size, outer_size);
638
639 volume = fold_convert (diff_type, volume);
640 if (chunking)
641 chunk_size = fold_convert (diff_type, chunk_size);
642 else
643 {
644 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
645
646 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
647 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
648 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
649 }
650
651 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
652 fold_convert (diff_type, inner_size));
653
654 r = fold_build2 (MULT_EXPR, diff_type, span, step);
655
656 tree offset = gimple_call_arg (call, 6);
657 r = build2 (PLUS_EXPR, diff_type, r,
658 fold_convert (diff_type, offset));
659 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
660 diff_type, r, range);
661 }
662 if (diff_type != type)
663 r = fold_convert (type, r);
664 break;
665 }
666
667 gimplify_assign (lhs, r, &seq);
668
669 pop_gimplify_context (NULL);
670
671 gsi_replace_with_seq (&gsi, seq, true);
672 }
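
/* Worked example (illustrative numbers): a gang-partitioned loop of 100
   iterations, step 1, direction +1, 8 gangs, default chunk size 0, on
   the device compiler.  Then striding and chunking are both false, and
   the OFFSET/BOUND expansions above compute

     chunk_size      = (100 - 1 + 8) / 8 = 13
     offset (gang G) = G * 13
     bound  (gang G) = min (offset + 13, 100)

   so each gang executes one contiguous run of at most 13 iterations.
   With striding, gang G would instead start at iteration G and advance
   by the full compute volume (8) each time around the loop.  */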
673
674 /* Transform a GOACC_TILE call. Determines the element loop span for
675 the specified loop of the nest. This is 1 if we're not tiling.
676
677 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
678
679 static void
680 oacc_xform_tile (gcall *call)
681 {
682 gimple_stmt_iterator gsi = gsi_for_stmt (call);
683 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
684 /* Inner loops have higher loop_nos. */
685 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
686 tree tile_size = gimple_call_arg (call, 2);
687 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
688 tree lhs = gimple_call_lhs (call);
689 tree type = TREE_TYPE (lhs);
690 gimple_seq seq = NULL;
691 tree span = build_int_cst (type, 1);
692
693 gcc_assert (!(e_mask
694 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
695 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
696 push_gimplify_context (!seen_error ());
697
698 #ifndef ACCEL_COMPILER
699 /* Partitioning disabled on host compilers. */
700 e_mask = 0;
701 #endif
702 if (!e_mask)
703 /* Not partitioning. */
704 span = integer_one_node;
705 else if (!integer_zerop (tile_size))
706 /* User explicitly specified size. */
707 span = tile_size;
708 else
709 {
710 /* Pick a size based on the partitioning of the element loop and
711 the number of loop nests. */
712 tree first_size = NULL_TREE;
713 tree second_size = NULL_TREE;
714
715 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
716 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
717 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
718 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
719
720 if (!first_size)
721 {
722 first_size = second_size;
723 second_size = NULL_TREE;
724 }
725
726 if (loop_no + 1 == collapse)
727 {
728 span = first_size;
729 if (!loop_no && second_size)
730 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
731 span, second_size);
732 }
733 else if (loop_no + 2 == collapse)
734 span = second_size;
735 else
736 span = NULL_TREE;
737
738 if (!span)
739 /* There's no obvious element size for this loop. Options
740 are 1, first_size or some non-unity constant (32 is my
741 favourite). We should gather some statistics. */
742 span = first_size;
743 }
744
745 span = fold_convert (type, span);
746 gimplify_assign (lhs, span, &seq);
747
748 pop_gimplify_context (NULL);
749
750 gsi_replace_with_seq (&gsi, seq, true);
751 }
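
/* Worked example (illustrative): 'tile' on a collapse == 2 loop nest
   whose element loops are worker- and vector-partitioned (num_workers 4,
   vector length 32), with no explicit tile sizes.  first_size is the
   vector size and second_size the worker size, so the spans chosen are

     loop_no 1 (innermost): span = first_size  = 32
     loop_no 0 (outer):     span = second_size = 4

   i.e. each tile covers 4 x 32 element iterations.  */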
752
753 /* Default partitioned and minimum partitioned dimensions. */
754
755 static int oacc_default_dims[GOMP_DIM_MAX];
756 static int oacc_min_dims[GOMP_DIM_MAX];
757
758 int
759 oacc_get_default_dim (int dim)
760 {
761 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
762 return oacc_default_dims[dim];
763 }
764
765 int
766 oacc_get_min_dim (int dim)
767 {
768 gcc_assert (0 <= dim && dim < GOMP_DIM_MAX);
769 return oacc_min_dims[dim];
770 }
771
772 /* Parse the default dimension parameter. This is a set of
773 :-separated optional compute dimensions. Each specified dimension
774 is a positive integer. When device type support is added, it is
775 planned to be a comma separated list of such compute dimensions,
776 with all but the first prefixed by the colon-terminated device
777 type. */
778
779 static void
780 oacc_parse_default_dims (const char *dims)
781 {
782 int ix;
783
784 for (ix = GOMP_DIM_MAX; ix--;)
785 {
786 oacc_default_dims[ix] = -1;
787 oacc_min_dims[ix] = 1;
788 }
789
790 #ifndef ACCEL_COMPILER
791 /* Cannot be overridden on the host. */
792 dims = NULL;
793 #endif
794 if (dims)
795 {
796 const char *pos = dims;
797
798 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
799 {
800 if (ix)
801 {
802 if (*pos != ':')
803 goto malformed;
804 pos++;
805 }
806
807 if (*pos != ':')
808 {
809 long val;
810 const char *eptr;
811
812 errno = 0;
813 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
814 if (errno || val <= 0 || (int) val != val)
815 goto malformed;
816 pos = eptr;
817 oacc_default_dims[ix] = (int) val;
818 }
819 }
820 if (*pos)
821 {
822 malformed:
823 error_at (UNKNOWN_LOCATION,
824 "%<-fopenacc-dim%> operand is malformed at %qs", pos);
825 }
826 }
827
828 /* Allow the backend to validate the dimensions. */
829 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1, 0);
830 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2, 0);
831 }
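
/* Illustrative examples of the accepted syntax (honored only in the
   device compiler; the host ignores the option):

     -fopenacc-dim=32:4:128   gang = 32, worker = 4, vector = 128
     -fopenacc-dim=32         gang = 32 only
     -fopenacc-dim=::128      vector = 128 only

   Every component given must be a positive integer; anything else
   reaches the 'malformed' diagnostic above.  */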
832
833 /* Validate and update the dimensions for offloaded FN. ATTRS is the
834 raw attribute. DIMS is an array of dimensions, which is filled in.
835 LEVEL is the partitioning level of a routine, or -1 for an offload
836 region itself. USED is the mask of partitioned execution in the
837 function. */
838
839 static void
840 oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
841 {
842 tree purpose[GOMP_DIM_MAX];
843 unsigned ix;
844 tree pos = TREE_VALUE (attrs);
845
846 /* Make sure the attribute creator attached the dimension
847 information. */
848 gcc_assert (pos);
849
850 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
851 {
852 purpose[ix] = TREE_PURPOSE (pos);
853 tree val = TREE_VALUE (pos);
854 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
855 pos = TREE_CHAIN (pos);
856 }
857
858 bool changed = targetm.goacc.validate_dims (fn, dims, level, used);
859
860 /* Default anything left to 1 or a partitioned default. */
861 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
862 if (dims[ix] < 0)
863 {
864 /* The OpenACC spec says 'If the [num_gangs] clause is not
865 specified, an implementation-defined default will be used;
866 the default may depend on the code within the construct.'
867 (2.5.6). Thus an implementation is free to choose
868 non-unity default for a parallel region that doesn't have
869 any gang-partitioned loops. However, it appears that there
870 is a sufficient body of user code that expects non-gang
871 partitioned regions to not execute in gang-redundant mode.
872 So we (a) don't warn about the non-portability and (b) pick
873 the minimum permissible dimension size when there is no
874 partitioned execution. Otherwise we pick the global
875 default for the dimension, which the user can control. The
876 same wording and logic applies to num_workers and
877 vector_length, however the worker- or vector- single
878 execution doesn't have the same impact as gang-redundant
879 execution. (If the minimum gang-level partitioning is not 1,
880 the target is probably too confusing.) */
881 dims[ix] = (used & GOMP_DIM_MASK (ix)
882 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
883 changed = true;
884 }
885
886 if (changed)
887 {
888 /* Replace the attribute with new values. */
889 pos = NULL_TREE;
890 for (ix = GOMP_DIM_MAX; ix--;)
891 pos = tree_cons (purpose[ix],
892 build_int_cst (integer_type_node, dims[ix]), pos);
893 oacc_replace_fn_attrib (fn, pos);
894 }
895 }
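
/* Illustrative defaulting: suppose the target hook leaves all axes at
   -1, '-fopenacc-dim=32:4:128' is in effect, and USED shows only gang
   and vector partitioning.  Then DIMS becomes { 32, 1, 128 }:
   partitioned axes receive the global default, the unpartitioned
   worker axis receives the minimum (1), and the function attribute is
   rewritten with the new values.  */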
896
897 /* Create an empty OpenACC loop structure at LOC. */
898
899 static oacc_loop *
900 new_oacc_loop_raw (oacc_loop *parent, location_t loc)
901 {
902 oacc_loop *loop = XCNEW (oacc_loop);
903
904 loop->parent = parent;
905
906 if (parent)
907 {
908 loop->sibling = parent->child;
909 parent->child = loop;
910 }
911
912 loop->loc = loc;
913 return loop;
914 }
915
916 /* Create an outermost, dummy OpenACC loop for offloaded function
917 DECL. */
918
919 static oacc_loop *
920 new_oacc_loop_outer (tree decl)
921 {
922 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
923 }
924
925 /* Start a new OpenACC loop structure beginning at head marker HEAD.
926 Link into PARENT loop. Return the new loop. */
927
928 static oacc_loop *
929 new_oacc_loop (oacc_loop *parent, gcall *marker)
930 {
931 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
932
933 loop->marker = marker;
934
935 /* TODO: This is where device_type flattening would occur for the loop
936 flags. */
937
938 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
939
940 tree chunk_size = integer_zero_node;
941 if (loop->flags & OLF_GANG_STATIC)
942 chunk_size = gimple_call_arg (marker, 4);
943 loop->chunk_size = chunk_size;
944
945 return loop;
946 }
947
948 /* Create a dummy loop encompassing a call to an OpenACC routine.
949 Extract the routine's partitioning requirements. */
950
951 static void
952 new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
953 {
954 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
955 int level = oacc_fn_attrib_level (attrs);
956
957 gcc_assert (level >= 0);
958
959 loop->marker = call;
960 loop->routine = decl;
961 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
962 ^ (GOMP_DIM_MASK (level) - 1));
963 }
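
/* Worked example of the mask computation (illustrative, with
   GOMP_DIM_MAX == 3): a worker routine has level == 1, so

     mask = (GOMP_DIM_MASK (3) - 1) ^ (GOMP_DIM_MASK (1) - 1)
          = 0b111 ^ 0b001 = 0b110

   marking the worker and vector axes as claimed by the routine and
   leaving only gang partitioning to enclosing loops.  */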
964
965 /* Finish off the current OpenACC loop LOOP, ending at a tail marker.
966 Return the parent loop. */
967
968 static oacc_loop *
969 finish_oacc_loop (oacc_loop *loop)
970 {
971 /* If the loop has been collapsed, don't partition it. */
972 if (loop->ifns.is_empty ())
973 loop->mask = loop->flags = 0;
974 return loop->parent;
975 }
976
977 /* Free all OpenACC loop structures within LOOP (inclusive). */
978
979 static void
980 free_oacc_loop (oacc_loop *loop)
981 {
982 if (loop->sibling)
983 free_oacc_loop (loop->sibling);
984 if (loop->child)
985 free_oacc_loop (loop->child);
986
987 loop->ifns.release ();
988 free (loop);
989 }
990
991 /* Dump out the OpenACC loop head or tail beginning at FROM. */
992
993 static void
994 dump_oacc_loop_part (FILE *file, gcall *from, int depth,
995 const char *title, int level)
996 {
997 enum ifn_unique_kind kind
998 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
999
1000 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
1001 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1002 {
1003 gimple *stmt = gsi_stmt (gsi);
1004
1005 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1006 {
1007 enum ifn_unique_kind k
1008 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
1009 (gimple_call_arg (stmt, 0)));
1010
1011 if (k == kind && stmt != from)
1012 break;
1013 }
1014 print_gimple_stmt (file, stmt, depth * 2 + 2);
1015
1016 gsi_next (&gsi);
1017 while (gsi_end_p (gsi))
1018 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1019 }
1020 }
1021
1022 /* Dump OpenACC loop LOOP, its children, and its siblings. */
1023
1024 static void
1025 dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
1026 {
1027 int ix;
1028
1029 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
1030 loop->flags, loop->mask,
1031 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
1032
1033 if (loop->marker)
1034 print_gimple_stmt (file, loop->marker, depth * 2);
1035
1036 if (loop->routine)
1037 fprintf (file, "%*sRoutine %s:%u:%s\n",
1038 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
1039 DECL_SOURCE_LINE (loop->routine),
1040 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
1041
1042 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
1043 if (loop->heads[ix])
1044 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
1045 for (ix = GOMP_DIM_MAX; ix--;)
1046 if (loop->tails[ix])
1047 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
1048
1049 if (loop->child)
1050 dump_oacc_loop (file, loop->child, depth + 1);
1051 if (loop->sibling)
1052 dump_oacc_loop (file, loop->sibling, depth);
1053 }
1054
1055 void debug_oacc_loop (oacc_loop *);
1056
1057 /* Dump loops to stderr. */
1058
1059 DEBUG_FUNCTION void
1060 debug_oacc_loop (oacc_loop *loop)
1061 {
1062 dump_oacc_loop (stderr, loop, 0);
1063 }
1064
1065 /* Provide diagnostics on OpenACC loop LOOP, its children, and its
1066 siblings. */
1067
1068 static void
1069 inform_oacc_loop (const oacc_loop *loop)
1070 {
1071 const char *gang
1072 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_GANG) ? " gang" : "";
1073 const char *worker
1074 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER) ? " worker" : "";
1075 const char *vector
1076 = loop->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR) ? " vector" : "";
1077 const char *seq = loop->mask == 0 ? " seq" : "";
1078 const dump_user_location_t loc
1079 = dump_user_location_t::from_location_t (loop->loc);
1080 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
1081 "assigned OpenACC%s%s%s%s loop parallelism\n", gang, worker,
1082 vector, seq);
1083
1084 if (loop->child)
1085 inform_oacc_loop (loop->child);
1086 if (loop->sibling)
1087 inform_oacc_loop (loop->sibling);
1088 }
1089
1090 /* DFS walk of basic blocks from BB onwards, creating OpenACC loop
1091 structures as we go. By construction these loops are properly
1092 nested. */
1093
1094 static void
1095 oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
1096 {
1097 int marker = 0;
1098 int remaining = 0;
1099
1100 if (bb->flags & BB_VISITED)
1101 return;
1102
1103 follow:
1104 bb->flags |= BB_VISITED;
1105
1106 /* Scan for loop markers. */
1107 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1108 gsi_next (&gsi))
1109 {
1110 gimple *stmt = gsi_stmt (gsi);
1111
1112 if (!is_gimple_call (stmt))
1113 continue;
1114
1115 gcall *call = as_a <gcall *> (stmt);
1116
1117 /* If this is a routine, make a dummy loop for it. */
1118 if (tree decl = gimple_call_fndecl (call))
1119 if (tree attrs = oacc_get_fn_attrib (decl))
1120 {
1121 gcc_assert (!marker);
1122 new_oacc_loop_routine (loop, call, decl, attrs);
1123 }
1124
1125 if (!gimple_call_internal_p (call))
1126 continue;
1127
1128 switch (gimple_call_internal_fn (call))
1129 {
1130 default:
1131 break;
1132
1133 case IFN_GOACC_LOOP:
1134 case IFN_GOACC_TILE:
1135 /* Record the abstraction function, so we can manipulate it
1136 later. */
1137 loop->ifns.safe_push (call);
1138 break;
1139
1140 case IFN_UNIQUE:
1141 enum ifn_unique_kind kind
1142 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
1143 (gimple_call_arg (call, 0)));
1144 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
1145 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
1146 {
1147 if (gimple_call_num_args (call) == 2)
1148 {
1149 gcc_assert (marker && !remaining);
1150 marker = 0;
1151 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
1152 loop = finish_oacc_loop (loop);
1153 else
1154 loop->head_end = call;
1155 }
1156 else
1157 {
1158 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1159
1160 if (!marker)
1161 {
1162 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1163 loop = new_oacc_loop (loop, call);
1164 remaining = count;
1165 }
1166 gcc_assert (count == remaining);
1167 if (remaining)
1168 {
1169 remaining--;
1170 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
1171 loop->heads[marker] = call;
1172 else
1173 loop->tails[remaining] = call;
1174 }
1175 marker++;
1176 }
1177 }
1178 }
1179 }
1180 if (remaining || marker)
1181 {
1182 bb = single_succ (bb);
1183 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
1184 goto follow;
1185 }
1186
1187 /* Walk successor blocks. */
1188 edge e;
1189 edge_iterator ei;
1190
1191 FOR_EACH_EDGE (e, ei, bb->succs)
1192 oacc_loop_discover_walk (loop, e->dest);
1193 }
1194
1195 /* LOOP is the first sibling. Reverse the order in place and return
1196 the new first sibling. Recurse to child loops. */
1197
1198 static oacc_loop *
1199 oacc_loop_sibling_nreverse (oacc_loop *loop)
1200 {
1201 oacc_loop *last = NULL;
1202 do
1203 {
1204 if (loop->child)
1205 loop->child = oacc_loop_sibling_nreverse (loop->child);
1206
1207 oacc_loop *next = loop->sibling;
1208 loop->sibling = last;
1209 last = loop;
1210 loop = next;
1211 }
1212 while (loop);
1213
1214 return last;
1215 }
1216
1217 /* Discover the OpenACC loops marked up by HEAD and TAIL markers for
1218 the current function. */
1219
1220 static oacc_loop *
1221 oacc_loop_discovery ()
1222 {
1223 /* Clear basic block flags, in particular BB_VISITED which we're going to use
1224 in the following. */
1225 clear_bb_flags ();
1226
1227 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
1228 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
1229
1230 /* The siblings were constructed in reverse order, reverse them so
1231 that diagnostics come out in an unsurprising order. */
1232 top = oacc_loop_sibling_nreverse (top);
1233
1234 return top;
1235 }
1236
1237 /* Transform the abstract internal function markers starting at FROM
1238 to be for partitioning level LEVEL. Stop when we meet another HEAD
1239 or TAIL marker. */
1240
1241 static void
1242 oacc_loop_xform_head_tail (gcall *from, int level)
1243 {
1244 enum ifn_unique_kind kind
1245 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1246 tree replacement = build_int_cst (unsigned_type_node, level);
1247
1248 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1249 {
1250 gimple *stmt = gsi_stmt (gsi);
1251
1252 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1253 {
1254 enum ifn_unique_kind k
1255 = ((enum ifn_unique_kind)
1256 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1257
1258 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1259 *gimple_call_arg_ptr (stmt, 2) = replacement;
1260 else if (k == kind && stmt != from)
1261 break;
1262 }
1263 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1264 *gimple_call_arg_ptr (stmt, 3) = replacement;
1265
1266 gsi_next (&gsi);
1267 while (gsi_end_p (gsi))
1268 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1269 }
1270 }
1271
1272 /* Process the discovered OpenACC loops, setting the correct
1273 partitioning level etc. */
1274
1275 static void
1276 oacc_loop_process (oacc_loop *loop)
1277 {
1278 if (loop->child)
1279 oacc_loop_process (loop->child);
1280
1281 if (loop->mask && !loop->routine)
1282 {
1283 int ix;
1284 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1285 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
1286 tree chunk_arg = loop->chunk_size;
1287 gcall *call;
1288
1289 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1290 switch (gimple_call_internal_fn (call))
1291 {
1292 case IFN_GOACC_LOOP:
1293 {
1294 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1295 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1296 if (!is_e)
1297 gimple_call_set_arg (call, 4, chunk_arg);
1298 }
1299 break;
1300
1301 case IFN_GOACC_TILE:
1302 gimple_call_set_arg (call, 3, mask_arg);
1303 gimple_call_set_arg (call, 4, e_mask_arg);
1304 break;
1305
1306 default:
1307 gcc_unreachable ();
1308 }
1309
1310 unsigned dim = GOMP_DIM_GANG;
1311 unsigned mask = loop->mask | loop->e_mask;
1312 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1313 {
1314 while (!(GOMP_DIM_MASK (dim) & mask))
1315 dim++;
1316
1317 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1318 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1319
1320 mask ^= GOMP_DIM_MASK (dim);
1321 }
1322 }
1323
1324 if (loop->sibling)
1325 oacc_loop_process (loop->sibling);
1326 }
1327
1328 /* Walk the OpenACC loop hierarchy checking and assigning the
1329 programmer-specified partitionings. OUTER_MASK is the partitioning
1330 this loop is contained within. Return mask of partitioning
1331 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1332 bit. */
1333
1334 static unsigned
1335 oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1336 {
1337 unsigned this_mask = loop->mask;
1338 unsigned mask_all = 0;
1339 bool noisy = true;
1340
1341 #ifdef ACCEL_COMPILER
1342 /* When device_type is supported, we want the device compiler to be
1343 noisy, if the loop parameters are device_type-specific. */
1344 noisy = false;
1345 #endif
1346
1347 if (!loop->routine)
1348 {
1349 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1350 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1351 bool tiling = (loop->flags & OLF_TILE) != 0;
1352
1353 this_mask = ((loop->flags >> OLF_DIM_BASE)
1354 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1355
1356 /* Apply auto partitioning if this is a non-partitioned regular
1357 loop, or (no more than) single axis tiled loop. */
1358 bool maybe_auto
1359 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1360
1361 if ((this_mask != 0) + auto_par + seq_par > 1)
1362 {
1363 if (noisy)
1364 error_at (loop->loc,
1365 seq_par
1366 ? G_("%<seq%> overrides other OpenACC loop specifiers")
1367 : G_("%<auto%> conflicts with other OpenACC loop "
1368 "specifiers"));
1369 maybe_auto = false;
1370 loop->flags &= ~OLF_AUTO;
1371 if (seq_par)
1372 {
1373 loop->flags
1374 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1375 this_mask = 0;
1376 }
1377 }
1378
1379 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1380 {
1381 loop->flags |= OLF_AUTO;
1382 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1383 }
1384 }
1385
1386 if (this_mask & outer_mask)
1387 {
1388 const oacc_loop *outer;
1389 for (outer = loop->parent; outer; outer = outer->parent)
1390 if ((outer->mask | outer->e_mask) & this_mask)
1391 break;
1392
1393 if (noisy)
1394 {
1395 if (outer)
1396 {
1397 error_at (loop->loc,
1398 loop->routine
1399 ? G_("routine call uses same OpenACC parallelism"
1400 " as containing loop")
1401 : G_("inner loop uses same OpenACC parallelism"
1402 " as containing loop"));
1403 inform (outer->loc, "containing loop here");
1404 }
1405 else
1406 error_at (loop->loc,
1407 loop->routine
1408 ? G_("routine call uses OpenACC parallelism disallowed"
1409 " by containing routine")
1410 : G_("loop uses OpenACC parallelism disallowed"
1411 " by containing routine"));
1412
1413 if (loop->routine)
1414 inform (DECL_SOURCE_LOCATION (loop->routine),
1415 "routine %qD declared here", loop->routine);
1416 }
1417 this_mask &= ~outer_mask;
1418 }
1419 else
1420 {
1421 unsigned outermost = least_bit_hwi (this_mask);
1422
1423 if (outermost && outermost <= outer_mask)
1424 {
1425 if (noisy)
1426 {
1427 error_at (loop->loc,
1428 "incorrectly nested OpenACC loop parallelism");
1429
1430 const oacc_loop *outer;
1431 for (outer = loop->parent;
1432 outer->flags && outer->flags < outermost;
1433 outer = outer->parent)
1434 continue;
1435 inform (outer->loc, "containing loop here");
1436 }
1437
1438 this_mask &= ~outermost;
1439 }
1440 }
1441
1442 mask_all |= this_mask;
1443
1444 if (loop->flags & OLF_TILE)
1445 {
1446 /* When tiling, vector goes to the element loop, and failing
1447 that we put worker there. The standard doesn't contemplate
1448 specifying all three. We choose to put worker and vector on
1449 the element loops in that case. */
1450 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1451 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1452 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1453
1454 loop->e_mask = this_e_mask;
1455 this_mask ^= this_e_mask;
1456 }
1457
1458 loop->mask = this_mask;
1459
1460 if (dump_file)
1461 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1462 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1463 loop->mask, loop->e_mask);
1464
1465 if (loop->child)
1466 {
1467 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1468 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1469 mask_all |= loop->inner;
1470 }
1471
1472 if (loop->sibling)
1473 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1474
1475 return mask_all;
1476 }
1477
1478 /* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1479 OUTER_MASK is the partitioning this loop is contained within.
1480 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1481 Return the cumulative partitioning used by this loop, siblings and
1482 children. */
1483
1484 static unsigned
1485 oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1486 bool outer_assign)
1487 {
1488 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1489 bool noisy = true;
1490 bool tiling = loop->flags & OLF_TILE;
1491
1492 #ifdef ACCEL_COMPILER
1493 /* When device_type is supported, we want the device compiler to be
1494 noisy, if the loop parameters are device_type-specific. */
1495 noisy = false;
1496 #endif
1497
1498 if (assign && (!outer_assign || loop->inner))
1499 {
1500 /* Allocate outermost and non-innermost loops at the outermost
1501 non-innermost available level. */
1502 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1503
1504 /* Find the first outermost available partition. */
1505 while (this_mask <= outer_mask)
1506 this_mask <<= 1;
1507
1508 /* Grab two axes if tiling and we've not assigned anything. */
1509 if (tiling && !(loop->mask | loop->e_mask))
1510 this_mask |= this_mask << 1;
1511
1512 /* Prohibit the innermost partitioning at the moment. */
1513 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
1514
1515 /* Don't use any dimension explicitly claimed by an inner loop. */
1516 this_mask &= ~loop->inner;
1517
1518 if (tiling && !loop->e_mask)
1519 {
1520 /* If we got two axes, allocate the inner one to the element
1521 loop. */
1522 loop->e_mask = this_mask & (this_mask << 1);
1523 this_mask ^= loop->e_mask;
1524 }
1525
1526 loop->mask |= this_mask;
1527 }
1528
1529 if (loop->child)
1530 {
1531 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1532 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1533 outer_assign | assign);
1534 }
1535
1536 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
1537 {
1538 /* Allocate the loop at the innermost available level. Note
1539 that we do this even if we already assigned this loop the
1540 outermost available level above. That way we'll partition
1541 this along 2 axes, if they are available. */
1542 unsigned this_mask = 0;
1543
1544 /* Determine the outermost partitioning used within this loop. */
1545 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1546 this_mask = least_bit_hwi (this_mask);
1547
1548 /* Pick the partitioning just inside that one. */
1549 this_mask >>= 1;
1550
1551 /* And avoid picking one used by an outer loop. */
1552 this_mask &= ~outer_mask;
1553
1554 /* If tiling and we failed completely above, grab the next one
1555 too, making sure it doesn't hit an outer loop. */
1556 if (tiling)
1557 {
1558 this_mask &= ~(loop->e_mask | loop->mask);
1559 unsigned tile_mask = ((this_mask >> 1)
1560 & ~(outer_mask | loop->e_mask | loop->mask));
1561
1562 if (tile_mask || loop->mask)
1563 {
1564 loop->e_mask |= this_mask;
1565 this_mask = tile_mask;
1566 }
1567 if (!loop->e_mask && noisy)
1568 warning_at (loop->loc, 0,
1569 "insufficient partitioning available"
1570 " to parallelize element loop");
1571 }
1572
1573 loop->mask |= this_mask;
1574 if (!loop->mask && noisy)
1575 warning_at (loop->loc, 0,
1576 tiling
1577 ? G_("insufficient partitioning available"
1578 " to parallelize tile loop")
1579 : G_("insufficient partitioning available"
1580 " to parallelize loop"));
1581 }
1582
1583 if (assign && dump_file)
1584 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
1585 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1586 loop->mask, loop->e_mask);
1587
1588 unsigned inner_mask = 0;
1589
1590 if (loop->sibling)
1591 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1592 outer_mask, outer_assign);
1593
1594 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1595
1596 return inner_mask;
1597 }
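
/* Worked example (illustrative, GOMP_DIM_MAX == 3): two nested
   'loop auto independent' loops in an otherwise unpartitioned region.
   The outer loop is assigned on the way down: the first axis above
   OUTER_MASK (empty) that is not the innermost is gang.  The inner
   loop, being innermost, is assigned on the way back up: starting
   from GOMP_DIM_MASK (GOMP_DIM_MAX) and stepping one axis inward
   yields vector.  Worker remains unused unless a third auto loop
   nests between the two.  */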
1598
1599 /* Walk the OpenACC loop hierarchy to check and assign partitioning
1600 axes. Return mask of partitioning. */
1601
1602 static unsigned
1603 oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1604 {
1605 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1606
1607 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1608 {
1609 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
1610 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1611 }
1612 return mask_all;
1613 }
1614
1615 /* Default fork/join early expander. Delete the function calls if
1616 there is no RTL expander. */
1617
1618 bool
1619 default_goacc_fork_join (gcall *ARG_UNUSED (call),
1620 const int *ARG_UNUSED (dims), bool is_fork)
1621 {
1622 if (is_fork)
1623 return targetm.have_oacc_fork ();
1624 else
1625 return targetm.have_oacc_join ();
1626 }
1627
1628 /* Default goacc.reduction early expander.
1629
1630 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1631 If RES_PTR is not integer-zerop:
1632 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1633 TEARDOWN - emit '*RES_PTR = VAR'
1634 If LHS is not NULL
1635 emit 'LHS = VAR' */
1636
1637 void
1638 default_goacc_reduction (gcall *call)
1639 {
1640 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1641 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1642 tree lhs = gimple_call_lhs (call);
1643 tree var = gimple_call_arg (call, 2);
1644 gimple_seq seq = NULL;
1645
1646 if (code == IFN_GOACC_REDUCTION_SETUP
1647 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1648 {
1649 /* Setup and Teardown need to copy from/to the receiver object,
1650 if there is one. */
1651 tree ref_to_res = gimple_call_arg (call, 1);
1652
1653 if (!integer_zerop (ref_to_res))
1654 {
1655 tree dst = build_simple_mem_ref (ref_to_res);
1656 tree src = var;
1657
1658 if (code == IFN_GOACC_REDUCTION_SETUP)
1659 {
1660 src = dst;
1661 dst = lhs;
1662 lhs = NULL;
1663 }
1664 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1665 }
1666 }
1667
1668 /* Copy VAR to LHS, if there is an LHS. */
1669 if (lhs)
1670 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1671
1672 gsi_replace_with_seq (&gsi, seq, true);
1673 }
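
/* Illustrative lowering (GIMPLE-like pseudo code): given

     v_2 = .GOACC_REDUCTION (SETUP, res_ptr, v_1, level, op, offset);

   with a non-null receiver, the default expander above emits

     v_2 = *res_ptr;

   and the corresponding TEARDOWN becomes

     *res_ptr = v_1;

   For the other kinds, or for a null receiver, the call degenerates
   to a plain 'lhs = var' copy.  */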
1674
1675 /* Main entry point for oacc transformations which run on the device
1676 compiler after LTO, so we know what the target device is at this
1677 point (including the host fallback). */
1678
1679 static unsigned int
1680 execute_oacc_device_lower ()
1681 {
1682 tree attrs = oacc_get_fn_attrib (current_function_decl);
1683
1684 if (!attrs)
1685 /* Not an offloaded function. */
1686 return 0;
1687
1688 /* Parse the default dim argument exactly once. */
1689 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1690 {
1691 oacc_parse_default_dims (flag_openacc_dims);
1692 flag_openacc_dims = (char *)&flag_openacc_dims;
1693 }
1694
1695 bool is_oacc_kernels
1696 = (lookup_attribute ("oacc kernels",
1697 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1698 bool is_oacc_kernels_parallelized
1699 = (lookup_attribute ("oacc kernels parallelized",
1700 DECL_ATTRIBUTES (current_function_decl)) != NULL);
1701
1702 /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
1703 kernels, so remove the parallelism dimensions function attributes
1704 potentially set earlier on. */
1705 if (is_oacc_kernels && !is_oacc_kernels_parallelized)
1706 {
1707 oacc_set_fn_attrib (current_function_decl, NULL, NULL);
1708 attrs = oacc_get_fn_attrib (current_function_decl);
1709 }
1710
1711 /* Discover, partition and process the loops. */
1712 oacc_loop *loops = oacc_loop_discovery ();
1713 int fn_level = oacc_fn_attrib_level (attrs);
1714
1715 if (dump_file)
1716 {
1717 if (fn_level >= 0)
1718 fprintf (dump_file, "Function is OpenACC routine level %d\n",
1719 fn_level);
1720 else if (is_oacc_kernels)
1721 fprintf (dump_file, "Function is %s OpenACC kernels offload\n",
1722 (is_oacc_kernels_parallelized
1723 ? "parallelized" : "unparallelized"));
1724 else
1725 fprintf (dump_file, "Function is OpenACC parallel offload\n");
1726 }
1727
1728 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1729 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1730 /* OpenACC kernels constructs are special: they currently don't use the
1731 generic oacc_loop infrastructure and attribute/dimension processing. */
1732 if (is_oacc_kernels && is_oacc_kernels_parallelized)
1733 {
1734 /* Parallelized OpenACC kernels constructs use gang parallelism. See
1735 also tree-parloops.c:create_parallel_loop. */
1736 used_mask |= GOMP_DIM_MASK (GOMP_DIM_GANG);
1737 }
1738
1739 int dims[GOMP_DIM_MAX];
1740 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1741
1742 if (dump_file)
1743 {
1744 const char *comma = "Compute dimensions [";
1745 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1746 fprintf (dump_file, "%s%d", comma, dims[ix]);
1747 fprintf (dump_file, "]\n");
1748 }
1749
1750 oacc_loop_process (loops);
1751 if (dump_file)
1752 {
1753 fprintf (dump_file, "OpenACC loops\n");
1754 dump_oacc_loop (dump_file, loops, 0);
1755 fprintf (dump_file, "\n");
1756 }
1757 if (dump_enabled_p ())
1758 {
1759 oacc_loop *l = loops;
1760 /* OpenACC kernels constructs are special: they currently don't use the
1761 generic oacc_loop infrastructure. */
1762 if (is_oacc_kernels)
1763 {
1764 /* Create a fake oacc_loop for diagnostic purposes. */
1765 l = new_oacc_loop_raw (NULL,
1766 DECL_SOURCE_LOCATION (current_function_decl));
1767 l->mask = used_mask;
1768 }
1769 else
1770 {
1771 /* Skip the outermost, dummy OpenACC loop. */
1772 l = l->child;
1773 }
1774 if (l)
1775 inform_oacc_loop (l);
1776 if (is_oacc_kernels)
1777 free_oacc_loop (l);
1778 }
1779
1780 /* Offloaded targets may introduce new basic blocks, which require
1781 dominance information to update SSA. */
1782 calculate_dominance_info (CDI_DOMINATORS);
1783
1784 /* Now lower internal loop functions to target-specific code
1785 sequences. */
1786 basic_block bb;
1787 FOR_ALL_BB_FN (bb, cfun)
1788 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1789 {
1790 gimple *stmt = gsi_stmt (gsi);
1791 if (!is_gimple_call (stmt))
1792 {
1793 gsi_next (&gsi);
1794 continue;
1795 }
1796
1797 gcall *call = as_a <gcall *> (stmt);
1798 if (!gimple_call_internal_p (call))
1799 {
1800 gsi_next (&gsi);
1801 continue;
1802 }
1803
1804 /* Rewind to allow rescan. */
1805 gsi_prev (&gsi);
1806 bool rescan = false, remove = false;
1807 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1808
1809 switch (ifn_code)
1810 {
1811 default: break;
1812
1813 case IFN_GOACC_TILE:
1814 oacc_xform_tile (call);
1815 rescan = true;
1816 break;
1817
1818 case IFN_GOACC_LOOP:
1819 oacc_xform_loop (call);
1820 rescan = true;
1821 break;
1822
1823 case IFN_GOACC_REDUCTION:
1824 /* Mark the function for SSA renaming. */
1825 mark_virtual_operands_for_renaming (cfun);
1826
1827 /* If the level is -1, this ended up being an unused
1828 axis. Handle as a default. */
1829 if (integer_minus_onep (gimple_call_arg (call, 3)))
1830 default_goacc_reduction (call);
1831 else
1832 targetm.goacc.reduction (call);
1833 rescan = true;
1834 break;
1835
1836 case IFN_UNIQUE:
1837 {
1838 enum ifn_unique_kind kind
1839 = ((enum ifn_unique_kind)
1840 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1841
1842 switch (kind)
1843 {
1844 default:
1845 break;
1846
1847 case IFN_UNIQUE_OACC_FORK:
1848 case IFN_UNIQUE_OACC_JOIN:
1849 if (integer_minus_onep (gimple_call_arg (call, 2)))
1850 remove = true;
1851 else if (!targetm.goacc.fork_join
1852 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1853 remove = true;
1854 break;
1855
1856 case IFN_UNIQUE_OACC_HEAD_MARK:
1857 case IFN_UNIQUE_OACC_TAIL_MARK:
1858 remove = true;
1859 break;
1860 }
1861 break;
1862 }
1863 }
1864
1865 if (gsi_end_p (gsi))
1866 /* We rewound past the beginning of the BB. */
1867 gsi = gsi_start_bb (bb);
1868 else
1869 /* Undo the rewind. */
1870 gsi_next (&gsi);
1871
1872 if (remove)
1873 {
1874 if (gimple_vdef (call))
1875 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1876 if (gimple_call_lhs (call))
1877 {
1878 /* Propagate the data dependency var. */
1879 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1880 gimple_call_arg (call, 1));
1881 gsi_replace (&gsi, ass, false);
1882 }
1883 else
1884 gsi_remove (&gsi, true);
1885 }
1886 else if (!rescan)
1887 /* If not rescanning, advance over the call. */
1888 gsi_next (&gsi);
1889 }
1890
1891 free_oacc_loop (loops);
1892
1893 return 0;
1894 }
1895
1896 /* Default launch dimension validator. Force everything to 1. A
1897 backend that wants to provide larger dimensions must override this
1898 hook. */
1899
1900 bool
1901 default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1902 int ARG_UNUSED (fn_level),
1903 unsigned ARG_UNUSED (used))
1904 {
1905 bool changed = false;
1906
1907 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1908 {
1909 if (dims[ix] != 1)
1910 {
1911 dims[ix] = 1;
1912 changed = true;
1913 }
1914 }
1915
1916 return changed;
1917 }
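
/* Illustration only (not part of GCC): a backend with real hardware
   limits would override this hook along the lines below, clamping each
   axis to a backend-specific bound rather than forcing 1.  The function
   name and EXAMPLE_MAX_DIM are invented for this sketch.

     static bool
     example_goacc_validate_dims (tree decl, int *dims, int fn_level,
				  unsigned used)
     {
       bool changed = false;

       for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
	 if (dims[ix] > EXAMPLE_MAX_DIM)
	   {
	     dims[ix] = EXAMPLE_MAX_DIM;
	     changed = true;
	   }

       return changed;
     }

   The return value tells the caller whether the recorded launch
   dimensions need updating.  */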
1918
1919 /* Default dimension bound is unknown (0) on the accelerator and 1 on the host.  */
1920
1921 int
1922 default_goacc_dim_limit (int ARG_UNUSED (axis))
1923 {
1924 #ifdef ACCEL_COMPILER
1925 return 0;
1926 #else
1927 return 1;
1928 #endif
1929 }
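
/* Illustration only: a SIMT backend would typically bound the vector
   axis here by its fixed warp width (say, return 32 for GOMP_DIM_VECTOR)
   and return 0 for any axis it cannot bound; the actual numbers are
   backend-specific.  */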
1930
1931 namespace {
1932
1933 const pass_data pass_data_oacc_device_lower =
1934 {
1935 GIMPLE_PASS, /* type */
1936 "oaccdevlow", /* name */
1937 OPTGROUP_OMP, /* optinfo_flags */
1938 TV_NONE, /* tv_id */
1939 PROP_cfg, /* properties_required */
1940 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1941 0, /* properties_destroyed */
1942 0, /* todo_flags_start */
1943 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1944 };
1945
1946 class pass_oacc_device_lower : public gimple_opt_pass
1947 {
1948 public:
1949 pass_oacc_device_lower (gcc::context *ctxt)
1950 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1951 {}
1952
1953 /* opt_pass methods: */
1954 virtual bool gate (function *) { return flag_openacc; }
1955
1956 virtual unsigned int execute (function *)
1957 {
1958 return execute_oacc_device_lower ();
1959 }
1960
1961 }; // class pass_oacc_device_lower
1962
1963 } // anon namespace
1964
1965 gimple_opt_pass *
1966 make_pass_oacc_device_lower (gcc::context *ctxt)
1967 {
1968 return new pass_oacc_device_lower (ctxt);
1969 }
1970
1971 \f
1972 /* Rewrite GOMP_SIMT_ENTER_ALLOC call given by GSI and remove the preceding
1973 GOMP_SIMT_ENTER call identifying the privatized variables, which are
1974 turned to structure fields and receive a DECL_VALUE_EXPR accordingly.
1975 Set *REGIMPLIFY to true, unless no privatized variables were seen.  */
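
/* A sketch of the rewrite, with invented SSA names: the input pair

     simduid_1 = GOMP_SIMT_ENTER (simduid_0, &a, &b);
     simtrec_2 = GOMP_SIMT_ENTER_ALLOC (simduid_1);

   becomes

     simduid_1 = simduid_0;
     simtrec_2 = GOMP_SIMT_ENTER_ALLOC (size, align);

   where SIZE and ALIGN describe a new RECORD_TYPE holding fields for A
   and B, and A and B themselves are rewritten via DECL_VALUE_EXPR into
   accesses off SIMTREC_2.  */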
1976
1977 static void
1978 ompdevlow_adjust_simt_enter (gimple_stmt_iterator *gsi, bool *regimplify)
1979 {
1980 gimple *alloc_stmt = gsi_stmt (*gsi);
1981 tree simtrec = gimple_call_lhs (alloc_stmt);
1982 tree simduid = gimple_call_arg (alloc_stmt, 0);
1983 gimple *enter_stmt = SSA_NAME_DEF_STMT (simduid);
1984 gcc_assert (gimple_call_internal_p (enter_stmt, IFN_GOMP_SIMT_ENTER));
1985 tree rectype = lang_hooks.types.make_type (RECORD_TYPE);
1986 TYPE_ARTIFICIAL (rectype) = TYPE_NAMELESS (rectype) = 1;
1987 TREE_ADDRESSABLE (rectype) = 1;
1988 TREE_TYPE (simtrec) = build_pointer_type (rectype);
1989 for (unsigned i = 1; i < gimple_call_num_args (enter_stmt); i++)
1990 {
1991 tree *argp = gimple_call_arg_ptr (enter_stmt, i);
1992 if (*argp == null_pointer_node)
1993 continue;
1994 gcc_assert (TREE_CODE (*argp) == ADDR_EXPR
1995 && VAR_P (TREE_OPERAND (*argp, 0)));
1996 tree var = TREE_OPERAND (*argp, 0);
1997
1998 tree field = build_decl (DECL_SOURCE_LOCATION (var), FIELD_DECL,
1999 DECL_NAME (var), TREE_TYPE (var));
2000 SET_DECL_ALIGN (field, DECL_ALIGN (var));
2001 DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
2002 TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
2003
2004 insert_field_into_struct (rectype, field);
2005
2006 tree t = build_simple_mem_ref (simtrec);
2007 t = build3 (COMPONENT_REF, TREE_TYPE (var), t, field, NULL_TREE);
2008 TREE_THIS_VOLATILE (t) = TREE_THIS_VOLATILE (var);
2009 SET_DECL_VALUE_EXPR (var, t);
2010 DECL_HAS_VALUE_EXPR_P (var) = 1;
2011 *regimplify = true;
2012 }
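  /* With every privatized variable turned into a field, finalize the
     record layout so its size and alignment can be fed to the
     allocation call.  */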
2013 layout_type (rectype);
2014 tree size = TYPE_SIZE_UNIT (rectype);
2015 tree align = build_int_cst (TREE_TYPE (size), TYPE_ALIGN_UNIT (rectype));
2016
2017 alloc_stmt
2018 = gimple_build_call_internal (IFN_GOMP_SIMT_ENTER_ALLOC, 2, size, align);
2019 gimple_call_set_lhs (alloc_stmt, simtrec);
2020 gsi_replace (gsi, alloc_stmt, false);
2021 gimple_stmt_iterator enter_gsi = gsi_for_stmt (enter_stmt);
2022 enter_stmt = gimple_build_assign (simduid, gimple_call_arg (enter_stmt, 0));
2023 gsi_replace (&enter_gsi, enter_stmt, false);
2024
2025 use_operand_p use;
2026 gimple *exit_stmt;
2027 if (single_imm_use (simtrec, &use, &exit_stmt))
2028 {
2029 gcc_assert (gimple_call_internal_p (exit_stmt, IFN_GOMP_SIMT_EXIT));
2030 gimple_stmt_iterator exit_gsi = gsi_for_stmt (exit_stmt);
2031 tree clobber = build_clobber (rectype);
2032 exit_stmt = gimple_build_assign (build_simple_mem_ref (simtrec), clobber);
2033 gsi_insert_before (&exit_gsi, exit_stmt, GSI_SAME_STMT);
2034 }
2035 else
2036 gcc_checking_assert (has_zero_uses (simtrec));
2037 }
2038
2039 /* Callback for walk_gimple_stmt used to scan for SIMT-privatized variables. */
2040
2041 static tree
2042 find_simtpriv_var_op (tree *tp, int *walk_subtrees, void *)
2043 {
2044 tree t = *tp;
2045
2046 if (VAR_P (t)
2047 && DECL_HAS_VALUE_EXPR_P (t)
2048 && lookup_attribute ("omp simt private", DECL_ATTRIBUTES (t)))
2049 {
2050 *walk_subtrees = 0;
2051 return t;
2052 }
2053 return NULL_TREE;
2054 }
2055
2056 /* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
2057 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
2058 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
2059 internal functions on non-SIMT targets, and likewise some SIMD internal
2060 functions on SIMT targets. */
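
/* Illustration only, with invented SSA names: on the host, where VF is 1,
   a call "_1 = GOMP_SIMT_VF ();" folds to "_1 = 1;" and
   "_2 = GOMP_USE_SIMT ();" to "_2 = 0;", after which the SIMT code paths
   are dead; on a SIMT accelerator GOMP_SIMT_VF instead folds to the
   constant returned by targetm.simt.vf, while GOMP_SIMT_LANE is kept
   for RTL expansion.  */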
2061
2062 static unsigned int
2063 execute_omp_device_lower ()
2064 {
2065 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
2066 bool regimplify = false;
2067 basic_block bb;
2068 gimple_stmt_iterator gsi;
2069 bool calls_declare_variant_alt
2070 = cgraph_node::get (cfun->decl)->calls_declare_variant_alt;
2071 FOR_EACH_BB_FN (bb, cfun)
2072 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2073 {
2074 gimple *stmt = gsi_stmt (gsi);
2075 if (!is_gimple_call (stmt))
2076 continue;
2077 if (!gimple_call_internal_p (stmt))
2078 {
2079 if (calls_declare_variant_alt)
2080 if (tree fndecl = gimple_call_fndecl (stmt))
2081 {
2082 tree new_fndecl = omp_resolve_declare_variant (fndecl);
2083 if (new_fndecl != fndecl)
2084 {
2085 gimple_call_set_fndecl (stmt, new_fndecl);
2086 update_stmt (stmt);
2087 }
2088 }
2089 continue;
2090 }
2091 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
2092 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
2093 switch (gimple_call_internal_fn (stmt))
2094 {
2095 case IFN_GOMP_USE_SIMT:
2096 rhs = vf == 1 ? integer_zero_node : integer_one_node;
2097 break;
2098 case IFN_GOMP_SIMT_ENTER:
2099 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2100 goto simtreg_enter_exit;
2101 case IFN_GOMP_SIMT_ENTER_ALLOC:
2102 if (vf != 1)
2103 ompdevlow_adjust_simt_enter (&gsi, &regimplify);
2104 rhs = vf == 1 ? null_pointer_node : NULL_TREE;
2105 goto simtreg_enter_exit;
2106 case IFN_GOMP_SIMT_EXIT:
2107 simtreg_enter_exit:
2108 if (vf != 1)
2109 continue;
2110 unlink_stmt_vdef (stmt);
2111 break;
2112 case IFN_GOMP_SIMT_LANE:
2113 case IFN_GOMP_SIMT_LAST_LANE:
2114 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
2115 break;
2116 case IFN_GOMP_SIMT_VF:
2117 rhs = build_int_cst (type, vf);
2118 break;
2119 case IFN_GOMP_SIMT_ORDERED_PRED:
2120 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
2121 if (rhs || !lhs)
2122 unlink_stmt_vdef (stmt);
2123 break;
2124 case IFN_GOMP_SIMT_VOTE_ANY:
2125 case IFN_GOMP_SIMT_XCHG_BFLY:
2126 case IFN_GOMP_SIMT_XCHG_IDX:
2127 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
2128 break;
2129 case IFN_GOMP_SIMD_LANE:
2130 case IFN_GOMP_SIMD_LAST_LANE:
2131 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
2132 break;
2133 case IFN_GOMP_SIMD_VF:
2134 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
2135 break;
2136 default:
2137 continue;
2138 }
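	  /* A NULL RHS with a live LHS means the call must be kept for
	     RTL expansion; otherwise fold it into a plain assignment,
	     or into a no-op when there is no LHS.  */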
2139 if (lhs && !rhs)
2140 continue;
2141 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
2142 gsi_replace (&gsi, stmt, false);
2143 }
2144 if (regimplify)
2145 FOR_EACH_BB_REVERSE_FN (bb, cfun)
2146 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
2147 if (walk_gimple_stmt (&gsi, NULL, find_simtpriv_var_op, NULL))
2148 {
2149 if (gimple_clobber_p (gsi_stmt (gsi)))
2150 gsi_remove (&gsi, true);
2151 else
2152 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2153 }
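  /* On SIMT targets (VF != 1), simd loops execute across SIMT lanes
     instead of being vectorized, so record that no force-vectorize
     loops remain for the vectorizer.  */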
2154 if (vf != 1)
2155 cfun->has_force_vectorize_loops = false;
2156 return 0;
2157 }
2158
2159 namespace {
2160
2161 const pass_data pass_data_omp_device_lower =
2162 {
2163 GIMPLE_PASS, /* type */
2164 "ompdevlow", /* name */
2165 OPTGROUP_OMP, /* optinfo_flags */
2166 TV_NONE, /* tv_id */
2167 PROP_cfg, /* properties_required */
2168 PROP_gimple_lomp_dev, /* properties_provided */
2169 0, /* properties_destroyed */
2170 0, /* todo_flags_start */
2171 TODO_update_ssa, /* todo_flags_finish */
2172 };
2173
2174 class pass_omp_device_lower : public gimple_opt_pass
2175 {
2176 public:
2177 pass_omp_device_lower (gcc::context *ctxt)
2178 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
2179 {}
2180
2181 /* opt_pass methods: */
2182 virtual bool gate (function *fun)
2183 {
2184 return (!(fun->curr_properties & PROP_gimple_lomp_dev)
2185 || (flag_openmp
2186 && cgraph_node::get (fun->decl)->calls_declare_variant_alt));
2187 }
2188 virtual unsigned int execute (function *)
2189 {
2190 return execute_omp_device_lower ();
2191 }
2192
2193 }; // class pass_omp_device_lower
2194
2195 } // anon namespace
2196
2197 gimple_opt_pass *
2198 make_pass_omp_device_lower (gcc::context *ctxt)
2199 {
2200 return new pass_omp_device_lower (ctxt);
2201 }
2202
2203 /* "omp declare target link" handling pass. */
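
/* On the accelerator, a variable with the "omp declare target link"
   attribute is not accessed directly: it carries a DECL_VALUE_EXPR (set
   up elsewhere in the offload compilation pipeline) dereferencing an
   associated link pointer that the runtime fills in when the variable is
   mapped.  Illustration only, with an invented pointer name: a use
   "v = 1;" effectively becomes "*v_linkptr = 1;" in offloaded code.
   This pass regimplifies each statement mentioning such a variable so
   the value expression gets expanded.  */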
2204
2205 namespace {
2206
2207 const pass_data pass_data_omp_target_link =
2208 {
2209 GIMPLE_PASS, /* type */
2210 "omptargetlink", /* name */
2211 OPTGROUP_OMP, /* optinfo_flags */
2212 TV_NONE, /* tv_id */
2213 PROP_ssa, /* properties_required */
2214 0, /* properties_provided */
2215 0, /* properties_destroyed */
2216 0, /* todo_flags_start */
2217 TODO_update_ssa, /* todo_flags_finish */
2218 };
2219
2220 class pass_omp_target_link : public gimple_opt_pass
2221 {
2222 public:
2223 pass_omp_target_link (gcc::context *ctxt)
2224 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
2225 {}
2226
2227 /* opt_pass methods: */
2228 virtual bool gate (function *fun)
2229 {
2230 #ifdef ACCEL_COMPILER
2231 return offloading_function_p (fun->decl);
2232 #else
2233 (void) fun;
2234 return false;
2235 #endif
2236 }
2237
2238 virtual unsigned execute (function *);
2239 }; // class pass_omp_target_link
2240
2241 /* Callback for walk_gimple_stmt used to scan for link var operands. */
2242
2243 static tree
2244 find_link_var_op (tree *tp, int *walk_subtrees, void *)
2245 {
2246 tree t = *tp;
2247
2248 if (VAR_P (t)
2249 && DECL_HAS_VALUE_EXPR_P (t)
2250 && is_global_var (t)
2251 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
2252 {
2253 *walk_subtrees = 0;
2254 return t;
2255 }
2256
2257 return NULL_TREE;
2258 }
2259
2260 unsigned
2261 pass_omp_target_link::execute (function *fun)
2262 {
2263 basic_block bb;
2264 FOR_EACH_BB_FN (bb, fun)
2265 {
2266 gimple_stmt_iterator gsi;
2267 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2268 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
2269 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
2270 }
2271
2272 return 0;
2273 }
2274
2275 } // anon namespace
2276
2277 gimple_opt_pass *
2278 make_pass_omp_target_link (gcc::context *ctxt)
2279 {
2280 return new pass_omp_target_link (ctxt);
2281 }