1/* Bits of OpenMP and OpenACC handling that is specific to device offloading
2 and a lowering pass for OpenACC device directives.
3
 4 Copyright (C) 2005-2017 Free Software Foundation, Inc.
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "tree.h"
28#include "gimple.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "cgraph.h"
32#include "pretty-print.h"
33#include "diagnostic-core.h"
34#include "fold-const.h"
35#include "internal-fn.h"
36#include "gimplify.h"
37#include "gimple-iterator.h"
38#include "gimplify-me.h"
39#include "gimple-walk.h"
40#include "tree-cfg.h"
41#include "tree-into-ssa.h"
42#include "common/common-target.h"
43#include "omp-general.h"
44#include "omp-offload.h"
45#include "lto-section-names.h"
46#include "gomp-constants.h"
47#include "gimple-pretty-print.h"
48
49/* Describe the OpenACC looping structure of a function. The entire
50 function is held in a 'NULL' loop. */
51
52struct oacc_loop
53{
54 oacc_loop *parent; /* Containing loop. */
55
56 oacc_loop *child; /* First inner loop. */
57
58 oacc_loop *sibling; /* Next loop within same parent. */
59
60 location_t loc; /* Location of the loop start. */
61
62 gcall *marker; /* Initial head marker. */
63
64 gcall *heads[GOMP_DIM_MAX]; /* Head marker functions. */
65 gcall *tails[GOMP_DIM_MAX]; /* Tail marker functions. */
66
67 tree routine; /* Pseudo-loop enclosing a routine. */
68
69 unsigned mask; /* Partitioning mask. */
 70 unsigned e_mask; /* Partitioning of element loops (when tiling). */
71 unsigned inner; /* Partitioning of inner loops. */
72 unsigned flags; /* Partitioning flags. */
 73 vec<gcall *> ifns; /* Contained loop abstraction functions. */
74 tree chunk_size; /* Chunk size. */
75 gcall *head_end; /* Final marker of head sequence. */
76};
77
78/* Holds offload tables with decls. */
79vec<tree, va_gc> *offload_funcs, *offload_vars;
80
81/* Return level at which oacc routine may spawn a partitioned loop, or
82 -1 if it is not a routine (i.e. is an offload fn). */
83
84static int
85oacc_fn_attrib_level (tree attr)
86{
87 tree pos = TREE_VALUE (attr);
88
89 if (!TREE_PURPOSE (pos))
90 return -1;
91
92 int ix = 0;
93 for (ix = 0; ix != GOMP_DIM_MAX;
94 ix++, pos = TREE_CHAIN (pos))
95 if (!integer_zerop (TREE_PURPOSE (pos)))
96 break;
97
98 return ix;
99}
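/* For example (illustrative), a function declared with
   "#pragma acc routine vector" yields GOMP_DIM_VECTOR here: the routine
   may contain vector-partitioned loops, but no gang or worker ones.  */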
100
101/* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
102 adds their addresses and sizes to constructor-vector V_CTOR. */
103
104static void
105add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
106 vec<constructor_elt, va_gc> *v_ctor)
107{
108 unsigned len = vec_safe_length (v_decls);
109 for (unsigned i = 0; i < len; i++)
110 {
111 tree it = (*v_decls)[i];
112 bool is_var = VAR_P (it);
113 bool is_link_var
114 = is_var
115#ifdef ACCEL_COMPILER
116 && DECL_HAS_VALUE_EXPR_P (it)
117#endif
118 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));
119
120 tree size = NULL_TREE;
121 if (is_var)
122 size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));
123
124 tree addr;
125 if (!is_link_var)
126 addr = build_fold_addr_expr (it);
127 else
128 {
129#ifdef ACCEL_COMPILER
130 /* For "omp declare target link" vars add address of the pointer to
131 the target table, instead of address of the var. */
132 tree value_expr = DECL_VALUE_EXPR (it);
133 tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
134 varpool_node::finalize_decl (link_ptr_decl);
135 addr = build_fold_addr_expr (link_ptr_decl);
136#else
137 addr = build_fold_addr_expr (it);
138#endif
139
140 /* Most significant bit of the size marks "omp declare target link"
141 vars in host and target tables. */
142 unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
143 isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
144 * BITS_PER_UNIT - 1);
145 size = wide_int_to_tree (const_ptr_type_node, isize);
146 }
147
148 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
149 if (is_var)
150 CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
151 }
152}
153
154/* Create new symbols containing (address, size) pairs for global variables,
155 marked with "omp declare target" attribute, as well as addresses for the
 156 functions that are outlined offloading regions. */
157void
158omp_finish_file (void)
159{
160 unsigned num_funcs = vec_safe_length (offload_funcs);
161 unsigned num_vars = vec_safe_length (offload_vars);
162
163 if (num_funcs == 0 && num_vars == 0)
164 return;
165
166 if (targetm_common.have_named_sections)
167 {
168 vec<constructor_elt, va_gc> *v_f, *v_v;
169 vec_alloc (v_f, num_funcs);
170 vec_alloc (v_v, num_vars * 2);
171
172 add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
173 add_decls_addresses_to_decl_constructor (offload_vars, v_v);
174
175 tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
176 num_vars * 2);
177 tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
178 num_funcs);
179 SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
180 SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
181 tree ctor_v = build_constructor (vars_decl_type, v_v);
182 tree ctor_f = build_constructor (funcs_decl_type, v_f);
183 TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
184 TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
185 tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
186 get_identifier (".offload_func_table"),
187 funcs_decl_type);
188 tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
189 get_identifier (".offload_var_table"),
190 vars_decl_type);
191 TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
192 /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
193 otherwise a joint table in a binary will contain padding between
194 tables from multiple object files. */
195 DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
196 SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
197 SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
198 DECL_INITIAL (funcs_decl) = ctor_f;
199 DECL_INITIAL (vars_decl) = ctor_v;
200 set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
201 set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);
202
203 varpool_node::finalize_decl (vars_decl);
204 varpool_node::finalize_decl (funcs_decl);
205 }
206 else
207 {
208 for (unsigned i = 0; i < num_funcs; i++)
209 {
210 tree it = (*offload_funcs)[i];
211 targetm.record_offload_symbol (it);
212 }
213 for (unsigned i = 0; i < num_vars; i++)
214 {
215 tree it = (*offload_vars)[i];
216 targetm.record_offload_symbol (it);
217 }
218 }
219}
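/* Illustrative sketch of the tables emitted above (hypothetical decls):

     .offload_func_table = { &fn1, &fn2, ... };
     .offload_var_table  = { &var1, sizeof (var1), &var2, sizeof (var2), ... };

   Functions contribute only an address; variables contribute an
   (address, size) pair, with the most significant bit of the size set for
   "omp declare target link" variables.  */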
220
221/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
222 axis DIM. Return a tmp var holding the result. */
223
224static tree
225oacc_dim_call (bool pos, int dim, gimple_seq *seq)
226{
227 tree arg = build_int_cst (unsigned_type_node, dim);
228 tree size = create_tmp_var (integer_type_node);
229 enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
230 gimple *call = gimple_build_call_internal (fn, 1, arg);
231
232 gimple_call_set_lhs (call, size);
233 gimple_seq_add_stmt (seq, call);
234
235 return size;
236}
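/* For instance, oacc_dim_call (true, GOMP_DIM_WORKER, &seq) appends
   roughly "_tmp = .GOACC_DIM_POS (1)" to SEQ and returns the
   compiler-generated temporary (illustrative GIMPLE).  */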
237
238/* Find the number of threads (POS = false), or thread number (POS =
239 true) for an OpenACC region partitioned as MASK. Setup code
240 required for the calculation is added to SEQ. */
241
242static tree
243oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
244{
245 tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
246 unsigned ix;
247
248 /* Start at gang level, and examine relevant dimension indices. */
249 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
250 if (GOMP_DIM_MASK (ix) & mask)
251 {
252 if (res)
253 {
254 /* We had an outer index, so scale that by the size of
255 this dimension. */
 256 tree n = oacc_dim_call (false, ix, seq);
257 res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
258 }
259 if (pos)
260 {
261 /* Determine index in this dimension. */
 262 tree id = oacc_dim_call (true, ix, seq);
263 if (res)
264 res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
265 else
266 res = id;
267 }
268 }
269
270 if (res == NULL_TREE)
271 res = integer_zero_node;
272
273 return res;
274}
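/* For example (illustrative), for a MASK covering gang and vector
   partitioning the expression built is
     num_gangs * vector_length                 when POS is false, and
     gang_pos * vector_length + vector_pos     when POS is true.  */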
275
276/* Transform IFN_GOACC_LOOP calls to actual code. See
277 expand_oacc_for for where these are generated. At the vector
278 level, we stride loops, such that each member of a warp will
279 operate on adjacent iterations. At the worker and gang level,
280 each gang/warp executes a set of contiguous iterations. Chunking
281 can override this such that each iteration engine executes a
 282 contiguous chunk, and then moves on to stride to the next chunk. */
283
284static void
285oacc_xform_loop (gcall *call)
286{
287 gimple_stmt_iterator gsi = gsi_for_stmt (call);
288 enum ifn_goacc_loop_kind code
289 = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
290 tree dir = gimple_call_arg (call, 1);
291 tree range = gimple_call_arg (call, 2);
292 tree step = gimple_call_arg (call, 3);
293 tree chunk_size = NULL_TREE;
294 unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
295 tree lhs = gimple_call_lhs (call);
296 tree type = TREE_TYPE (lhs);
297 tree diff_type = TREE_TYPE (range);
298 tree r = NULL_TREE;
299 gimple_seq seq = NULL;
300 bool chunking = false, striding = true;
301 unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
302 unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
303
304#ifdef ACCEL_COMPILER
305 chunk_size = gimple_call_arg (call, 4);
306 if (integer_minus_onep (chunk_size) /* Force static allocation. */
307 || integer_zerop (chunk_size)) /* Default (also static). */
308 {
309 /* If we're at the gang level, we want each to execute a
310 contiguous run of iterations. Otherwise we want each element
311 to stride. */
312 striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
313 chunking = false;
314 }
315 else
316 {
317 /* Chunk of size 1 is striding. */
318 striding = integer_onep (chunk_size);
319 chunking = !striding;
320 }
321#endif
322
323 /* striding=true, chunking=true
324 -> invalid.
325 striding=true, chunking=false
326 -> chunks=1
327 striding=false,chunking=true
328 -> chunks=ceil (range/(chunksize*threads*step))
329 striding=false,chunking=false
330 -> chunk_size=ceil(range/(threads*step)),chunks=1 */
331 push_gimplify_context (true);
332
333 switch (code)
334 {
335 default: gcc_unreachable ();
336
337 case IFN_GOACC_LOOP_CHUNKS:
338 if (!chunking)
339 r = build_int_cst (type, 1);
340 else
341 {
342 /* chunk_max
343 = (range - dir) / (chunks * step * num_threads) + dir */
344 tree per = oacc_thread_numbers (false, mask, &seq);
345 per = fold_convert (type, per);
346 chunk_size = fold_convert (type, chunk_size);
347 per = fold_build2 (MULT_EXPR, type, per, chunk_size);
348 per = fold_build2 (MULT_EXPR, type, per, step);
349 r = build2 (MINUS_EXPR, type, range, dir);
350 r = build2 (PLUS_EXPR, type, r, per);
351 r = build2 (TRUNC_DIV_EXPR, type, r, per);
352 }
353 break;
354
355 case IFN_GOACC_LOOP_STEP:
356 {
357 /* If striding, step by the entire compute volume, otherwise
358 step by the inner volume. */
359 unsigned volume = striding ? mask : inner_mask;
360
361 r = oacc_thread_numbers (false, volume, &seq);
362 r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
363 }
364 break;
365
366 case IFN_GOACC_LOOP_OFFSET:
367 if (striding)
368 {
369 r = oacc_thread_numbers (true, mask, &seq);
370 r = fold_convert (diff_type, r);
371 }
372 else
373 {
374 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
375 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
376 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
377 inner_size, outer_size);
378
379 volume = fold_convert (diff_type, volume);
380 if (chunking)
381 chunk_size = fold_convert (diff_type, chunk_size);
382 else
383 {
384 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
385
386 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
387 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
388 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
389 }
390
391 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
392 fold_convert (diff_type, inner_size));
393 r = oacc_thread_numbers (true, outer_mask, &seq);
394 r = fold_convert (diff_type, r);
395 r = build2 (MULT_EXPR, diff_type, r, span);
396
397 tree inner = oacc_thread_numbers (true, inner_mask, &seq);
398 inner = fold_convert (diff_type, inner);
399 r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
400
401 if (chunking)
402 {
403 tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
404 tree per
405 = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
406 per = build2 (MULT_EXPR, diff_type, per, chunk);
407
408 r = build2 (PLUS_EXPR, diff_type, r, per);
409 }
410 }
411 r = fold_build2 (MULT_EXPR, diff_type, r, step);
412 if (type != diff_type)
413 r = fold_convert (type, r);
414 break;
415
416 case IFN_GOACC_LOOP_BOUND:
417 if (striding)
418 r = range;
419 else
420 {
421 tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
422 tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
423 tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
424 inner_size, outer_size);
425
426 volume = fold_convert (diff_type, volume);
427 if (chunking)
428 chunk_size = fold_convert (diff_type, chunk_size);
429 else
430 {
431 tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
432
433 chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
434 chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
435 chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
436 }
437
438 tree span = build2 (MULT_EXPR, diff_type, chunk_size,
439 fold_convert (diff_type, inner_size));
440
441 r = fold_build2 (MULT_EXPR, diff_type, span, step);
442
443 tree offset = gimple_call_arg (call, 6);
444 r = build2 (PLUS_EXPR, diff_type, r,
445 fold_convert (diff_type, offset));
446 r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
447 diff_type, r, range);
448 }
449 if (diff_type != type)
450 r = fold_convert (type, r);
451 break;
452 }
453
454 gimplify_assign (lhs, r, &seq);
455
456 pop_gimplify_context (NULL);
457
458 gsi_replace_with_seq (&gsi, seq, true);
459}
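/* Rough illustration of the above: for a vector-partitioned, striding
   loop on the accelerator the GOACC_LOOP calls fold to approximately
     CHUNKS = 1
     STEP   = vector_length * step
     OFFSET = vector_pos * step
     BOUND  = range
   so each vector lane strides across the iteration space.  */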
460
461/* Transform a GOACC_TILE call. Determines the element loop span for
462 the specified loop of the nest. This is 1 if we're not tiling.
463
464 GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element); */
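/* For example, with an explicit "tile (8, 8)" clause each of the two
   collapsed loops gets an element span of 8; when the incoming tile size
   is zero the span is instead derived from the element-loop partitioning
   computed below (illustrative).  */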
465
466static void
467oacc_xform_tile (gcall *call)
468{
469 gimple_stmt_iterator gsi = gsi_for_stmt (call);
470 unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
471 /* Inner loops have higher loop_nos. */
472 unsigned loop_no = tree_to_uhwi (gimple_call_arg (call, 1));
473 tree tile_size = gimple_call_arg (call, 2);
474 unsigned e_mask = tree_to_uhwi (gimple_call_arg (call, 4));
475 tree lhs = gimple_call_lhs (call);
476 tree type = TREE_TYPE (lhs);
477 gimple_seq seq = NULL;
478 tree span = build_int_cst (type, 1);
479
480 gcc_assert (!(e_mask
481 & ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
482 | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
483 push_gimplify_context (!seen_error ());
484
485#ifndef ACCEL_COMPILER
486 /* Partitioning disabled on host compilers. */
487 e_mask = 0;
488#endif
489 if (!e_mask)
 490 /* Not partitioning. */
491 span = integer_one_node;
492 else if (!integer_zerop (tile_size))
493 /* User explicitly specified size. */
494 span = tile_size;
495 else
496 {
 497 /* Pick a size based on the partitioning of the element loop and
498 the number of loop nests. */
499 tree first_size = NULL_TREE;
500 tree second_size = NULL_TREE;
501
502 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
503 first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
504 if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
505 second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
506
507 if (!first_size)
508 {
509 first_size = second_size;
510 second_size = NULL_TREE;
511 }
512
513 if (loop_no + 1 == collapse)
514 {
515 span = first_size;
516 if (!loop_no && second_size)
517 span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
518 span, second_size);
519 }
520 else if (loop_no + 2 == collapse)
521 span = second_size;
522 else
523 span = NULL_TREE;
524
525 if (!span)
526 /* There's no obvious element size for this loop. Options
527 are 1, first_size or some non-unity constant (32 is my
528 favourite). We should gather some statistics. */
529 span = first_size;
530 }
531
532 span = fold_convert (type, span);
533 gimplify_assign (lhs, span, &seq);
534
535 pop_gimplify_context (NULL);
536
537 gsi_replace_with_seq (&gsi, seq, true);
538}
539
540/* Default partitioned and minimum partitioned dimensions. */
541
542static int oacc_default_dims[GOMP_DIM_MAX];
543static int oacc_min_dims[GOMP_DIM_MAX];
544
545/* Parse the default dimension parameter. This is a set of
546 :-separated optional compute dimensions. Each specified dimension
547 is a positive integer. When device type support is added, it is
548 planned to be a comma separated list of such compute dimensions,
549 with all but the first prefixed by the colon-terminated device
550 type. */
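/* For example (illustrative values), "-fopenacc-dim=32:4:128" requests 32
   gangs, 4 workers and a vector length of 128, while "-fopenacc-dim=::64"
   overrides only the vector length.  */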
551
552static void
553oacc_parse_default_dims (const char *dims)
554{
555 int ix;
556
557 for (ix = GOMP_DIM_MAX; ix--;)
558 {
559 oacc_default_dims[ix] = -1;
560 oacc_min_dims[ix] = 1;
561 }
562
563#ifndef ACCEL_COMPILER
564 /* Cannot be overridden on the host. */
565 dims = NULL;
566#endif
567 if (dims)
568 {
569 const char *pos = dims;
570
571 for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
572 {
573 if (ix)
574 {
575 if (*pos != ':')
576 goto malformed;
577 pos++;
578 }
579
580 if (*pos != ':')
581 {
582 long val;
583 const char *eptr;
584
585 errno = 0;
586 val = strtol (pos, CONST_CAST (char **, &eptr), 10);
587 if (errno || val <= 0 || (int) val != val)
588 goto malformed;
589 pos = eptr;
590 oacc_default_dims[ix] = (int) val;
591 }
592 }
593 if (*pos)
594 {
595 malformed:
596 error_at (UNKNOWN_LOCATION,
597 "-fopenacc-dim operand is malformed at '%s'", pos);
598 }
599 }
600
601 /* Allow the backend to validate the dimensions. */
602 targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
603 targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
604}
605
606/* Validate and update the dimensions for offloaded FN. ATTRS is the
607 raw attribute. DIMS is an array of dimensions, which is filled in.
608 LEVEL is the partitioning level of a routine, or -1 for an offload
 609 region itself. USED is the mask of partitioned execution in the
610 function. */
611
612static void
613oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
614{
615 tree purpose[GOMP_DIM_MAX];
616 unsigned ix;
617 tree pos = TREE_VALUE (attrs);
618 bool is_kernel = oacc_fn_attrib_kernels_p (attrs);
619
620 /* Make sure the attribute creator attached the dimension
621 information. */
622 gcc_assert (pos);
623
624 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
625 {
626 purpose[ix] = TREE_PURPOSE (pos);
627 tree val = TREE_VALUE (pos);
628 dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
629 pos = TREE_CHAIN (pos);
630 }
631
632 bool changed = targetm.goacc.validate_dims (fn, dims, level);
633
634 /* Default anything left to 1 or a partitioned default. */
635 for (ix = 0; ix != GOMP_DIM_MAX; ix++)
636 if (dims[ix] < 0)
637 {
638 /* The OpenACC spec says 'If the [num_gangs] clause is not
639 specified, an implementation-defined default will be used;
640 the default may depend on the code within the construct.'
641 (2.5.6). Thus an implementation is free to choose
642 non-unity default for a parallel region that doesn't have
643 any gang-partitioned loops. However, it appears that there
644 is a sufficient body of user code that expects non-gang
645 partitioned regions to not execute in gang-redundant mode.
646 So we (a) don't warn about the non-portability and (b) pick
647 the minimum permissible dimension size when there is no
648 partitioned execution. Otherwise we pick the global
649 default for the dimension, which the user can control. The
650 same wording and logic applies to num_workers and
651 vector_length, however the worker- or vector- single
652 execution doesn't have the same impact as gang-redundant
 653 execution. (If the minimum gang-level partitioning is not 1,
654 the target is probably too confusing.) */
655 dims[ix] = (used & GOMP_DIM_MASK (ix)
656 ? oacc_default_dims[ix] : oacc_min_dims[ix]);
657 changed = true;
658 }
659
660 if (changed)
661 {
662 /* Replace the attribute with new values. */
663 pos = NULL_TREE;
664 for (ix = GOMP_DIM_MAX; ix--;)
665 {
666 pos = tree_cons (purpose[ix],
667 build_int_cst (integer_type_node, dims[ix]),
668 pos);
669 if (is_kernel)
670 TREE_PUBLIC (pos) = 1;
671 }
672 oacc_replace_fn_attrib (fn, pos);
673 }
674}
675
676/* Create an empty OpenACC loop structure at LOC. */
677
678static oacc_loop *
679new_oacc_loop_raw (oacc_loop *parent, location_t loc)
680{
681 oacc_loop *loop = XCNEW (oacc_loop);
682
683 loop->parent = parent;
684
685 if (parent)
686 {
687 loop->sibling = parent->child;
688 parent->child = loop;
689 }
690
691 loop->loc = loc;
692 return loop;
693}
694
695/* Create an outermost, dummy OpenACC loop for offloaded function
696 DECL. */
697
698static oacc_loop *
699new_oacc_loop_outer (tree decl)
700{
701 return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
702}
703
704/* Start a new OpenACC loop structure beginning at head marker HEAD.
705 Link into PARENT loop. Return the new loop. */
706
707static oacc_loop *
708new_oacc_loop (oacc_loop *parent, gcall *marker)
709{
710 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
711
712 loop->marker = marker;
713
714 /* TODO: This is where device_type flattening would occur for the loop
 715 flags. */
716
717 loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
718
719 tree chunk_size = integer_zero_node;
720 if (loop->flags & OLF_GANG_STATIC)
721 chunk_size = gimple_call_arg (marker, 4);
722 loop->chunk_size = chunk_size;
723
724 return loop;
725}
726
727/* Create a dummy loop encompassing a call to an OpenACC routine.
728 Extract the routine's partitioning requirements. */
729
730static void
731new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
732{
733 oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
734 int level = oacc_fn_attrib_level (attrs);
735
736 gcc_assert (level >= 0);
737
738 loop->marker = call;
739 loop->routine = decl;
740 loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
741 ^ (GOMP_DIM_MASK (level) - 1));
742}
743
744/* Finish off the current OpenACC loop ending at tail marker TAIL.
745 Return the parent loop. */
746
747static oacc_loop *
748finish_oacc_loop (oacc_loop *loop)
749{
750 /* If the loop has been collapsed, don't partition it. */
 751 if (loop->ifns.is_empty ())
752 loop->mask = loop->flags = 0;
753 return loop->parent;
754}
755
756/* Free all OpenACC loop structures within LOOP (inclusive). */
757
758static void
759free_oacc_loop (oacc_loop *loop)
760{
761 if (loop->sibling)
762 free_oacc_loop (loop->sibling);
763 if (loop->child)
764 free_oacc_loop (loop->child);
765
 766 loop->ifns.release ();
767 free (loop);
768}
769
770/* Dump out the OpenACC loop head or tail beginning at FROM. */
771
772static void
773dump_oacc_loop_part (FILE *file, gcall *from, int depth,
774 const char *title, int level)
775{
776 enum ifn_unique_kind kind
777 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
778
779 fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
780 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
781 {
782 gimple *stmt = gsi_stmt (gsi);
783
784 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
785 {
786 enum ifn_unique_kind k
787 = ((enum ifn_unique_kind) TREE_INT_CST_LOW
788 (gimple_call_arg (stmt, 0)));
789
790 if (k == kind && stmt != from)
791 break;
792 }
793 print_gimple_stmt (file, stmt, depth * 2 + 2, 0);
794
795 gsi_next (&gsi);
796 while (gsi_end_p (gsi))
797 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
798 }
799}
800
801/* Dump OpenACC loops LOOP, its siblings and its children. */
802
803static void
804dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
805{
806 int ix;
807
808 fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
809 loop->flags, loop->mask,
810 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
811
812 if (loop->marker)
813 print_gimple_stmt (file, loop->marker, depth * 2, 0);
814
815 if (loop->routine)
816 fprintf (file, "%*sRoutine %s:%u:%s\n",
817 depth * 2, "", DECL_SOURCE_FILE (loop->routine),
818 DECL_SOURCE_LINE (loop->routine),
819 IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
820
821 for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
822 if (loop->heads[ix])
823 dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
824 for (ix = GOMP_DIM_MAX; ix--;)
825 if (loop->tails[ix])
826 dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
827
828 if (loop->child)
829 dump_oacc_loop (file, loop->child, depth + 1);
830 if (loop->sibling)
831 dump_oacc_loop (file, loop->sibling, depth);
832}
833
834void debug_oacc_loop (oacc_loop *);
835
836/* Dump loops to stderr. */
837
838DEBUG_FUNCTION void
839debug_oacc_loop (oacc_loop *loop)
840{
841 dump_oacc_loop (stderr, loop, 0);
842}
843
844/* DFS walk of basic blocks BB onwards, creating OpenACC loop
845 structures as we go. By construction these loops are properly
846 nested. */
847
848static void
849oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
850{
851 int marker = 0;
852 int remaining = 0;
853
854 if (bb->flags & BB_VISITED)
855 return;
856
857 follow:
858 bb->flags |= BB_VISITED;
859
860 /* Scan for loop markers. */
861 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
862 gsi_next (&gsi))
863 {
864 gimple *stmt = gsi_stmt (gsi);
865
866 if (!is_gimple_call (stmt))
867 continue;
868
869 gcall *call = as_a <gcall *> (stmt);
870
871 /* If this is a routine, make a dummy loop for it. */
872 if (tree decl = gimple_call_fndecl (call))
873 if (tree attrs = oacc_get_fn_attrib (decl))
874 {
875 gcc_assert (!marker);
876 new_oacc_loop_routine (loop, call, decl, attrs);
877 }
878
879 if (!gimple_call_internal_p (call))
880 continue;
881
882 switch (gimple_call_internal_fn (call))
883 {
884 default:
885 break;
886
887 case IFN_GOACC_LOOP:
888 case IFN_GOACC_TILE:
889 /* Record the abstraction function, so we can manipulate it
890 later. */
891 loop->ifns.safe_push (call);
892 break;
893
894 case IFN_UNIQUE:
895 enum ifn_unique_kind kind
896 = (enum ifn_unique_kind) (TREE_INT_CST_LOW
897 (gimple_call_arg (call, 0)));
898 if (kind == IFN_UNIQUE_OACC_HEAD_MARK
899 || kind == IFN_UNIQUE_OACC_TAIL_MARK)
900 {
901 if (gimple_call_num_args (call) == 2)
902 {
903 gcc_assert (marker && !remaining);
904 marker = 0;
905 if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
906 loop = finish_oacc_loop (loop);
907 else
908 loop->head_end = call;
909 }
910 else
911 {
912 int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
913
914 if (!marker)
915 {
916 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
917 loop = new_oacc_loop (loop, call);
918 remaining = count;
919 }
920 gcc_assert (count == remaining);
921 if (remaining)
922 {
923 remaining--;
924 if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
925 loop->heads[marker] = call;
926 else
927 loop->tails[remaining] = call;
928 }
929 marker++;
930 }
931 }
932 }
933 }
934 if (remaining || marker)
935 {
936 bb = single_succ (bb);
937 gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
938 goto follow;
939 }
940
941 /* Walk successor blocks. */
942 edge e;
943 edge_iterator ei;
944
945 FOR_EACH_EDGE (e, ei, bb->succs)
946 oacc_loop_discover_walk (loop, e->dest);
947}
948
949/* LOOP is the first sibling. Reverse the order in place and return
950 the new first sibling. Recurse to child loops. */
951
952static oacc_loop *
953oacc_loop_sibling_nreverse (oacc_loop *loop)
954{
955 oacc_loop *last = NULL;
956 do
957 {
958 if (loop->child)
 959 loop->child = oacc_loop_sibling_nreverse (loop->child);
960
961 oacc_loop *next = loop->sibling;
962 loop->sibling = last;
963 last = loop;
964 loop = next;
965 }
966 while (loop);
967
968 return last;
969}
970
971/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
972 the current function. */
973
974static oacc_loop *
975oacc_loop_discovery ()
976{
977 /* Clear basic block flags, in particular BB_VISITED which we're going to use
978 in the following. */
979 clear_bb_flags ();
980
981 oacc_loop *top = new_oacc_loop_outer (current_function_decl);
982 oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
983
984 /* The siblings were constructed in reverse order, reverse them so
985 that diagnostics come out in an unsurprising order. */
986 top = oacc_loop_sibling_nreverse (top);
987
988 return top;
989}
990
991/* Transform the abstract internal function markers starting at FROM
992 to be for partitioning level LEVEL. Stop when we meet another HEAD
993 or TAIL marker. */
994
995static void
996oacc_loop_xform_head_tail (gcall *from, int level)
997{
998 enum ifn_unique_kind kind
999 = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
1000 tree replacement = build_int_cst (unsigned_type_node, level);
1001
1002 for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
1003 {
1004 gimple *stmt = gsi_stmt (gsi);
1005
1006 if (gimple_call_internal_p (stmt, IFN_UNIQUE))
1007 {
1008 enum ifn_unique_kind k
1009 = ((enum ifn_unique_kind)
1010 TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1011
1012 if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
1013 *gimple_call_arg_ptr (stmt, 2) = replacement;
1014 else if (k == kind && stmt != from)
1015 break;
1016 }
1017 else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
1018 *gimple_call_arg_ptr (stmt, 3) = replacement;
1019
1020 gsi_next (&gsi);
1021 while (gsi_end_p (gsi))
1022 gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
1023 }
1024}
1025
1026/* Process the discovered OpenACC loops, setting the correct
1027 partitioning level etc. */
1028
1029static void
1030oacc_loop_process (oacc_loop *loop)
1031{
1032 if (loop->child)
1033 oacc_loop_process (loop->child);
1034
1035 if (loop->mask && !loop->routine)
1036 {
1037 int ix;
1038 tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
1039 tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
 1040 tree chunk_arg = loop->chunk_size;
1041 gcall *call;
1042
1043 for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
1044 switch (gimple_call_internal_fn (call))
1045 {
1046 case IFN_GOACC_LOOP:
1047 {
1048 bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
1049 gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg);
1050 if (!is_e)
1051 gimple_call_set_arg (call, 4, chunk_arg);
1052 }
1053 break;
1054
1055 case IFN_GOACC_TILE:
1056 gimple_call_set_arg (call, 3, mask_arg);
1057 gimple_call_set_arg (call, 4, e_mask_arg);
1058 break;
 1059
1060 default:
1061 gcc_unreachable ();
1062 }
 1063
1064 unsigned dim = GOMP_DIM_GANG;
1065 unsigned mask = loop->mask | loop->e_mask;
1066 for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
1067 {
1068 while (!(GOMP_DIM_MASK (dim) & mask))
1069 dim++;
1070
1071 oacc_loop_xform_head_tail (loop->heads[ix], dim);
1072 oacc_loop_xform_head_tail (loop->tails[ix], dim);
1073
1074 mask ^= GOMP_DIM_MASK (dim);
1075 }
1076 }
1077
1078 if (loop->sibling)
1079 oacc_loop_process (loop->sibling);
1080}
1081
1082/* Walk the OpenACC loop hierarchy checking and assigning the
1083 programmer-specified partitionings. OUTER_MASK is the partitioning
1084 this loop is contained within. Return mask of partitioning
1085 encountered. If any auto loops are discovered, set GOMP_DIM_MAX
1086 bit. */
1087
1088static unsigned
1089oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
1090{
1091 unsigned this_mask = loop->mask;
1092 unsigned mask_all = 0;
1093 bool noisy = true;
1094
1095#ifdef ACCEL_COMPILER
1096 /* When device_type is supported, we want the device compiler to be
1097 noisy, if the loop parameters are device_type-specific. */
1098 noisy = false;
1099#endif
1100
1101 if (!loop->routine)
1102 {
1103 bool auto_par = (loop->flags & OLF_AUTO) != 0;
1104 bool seq_par = (loop->flags & OLF_SEQ) != 0;
1105 bool tiling = (loop->flags & OLF_TILE) != 0;
1106
1107 this_mask = ((loop->flags >> OLF_DIM_BASE)
1108 & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
1109
1110 /* Apply auto partitioning if this is a non-partitioned regular
 1111 loop, or a (no more than) single-axis tiled loop. */
1112 bool maybe_auto
1113 = !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);
1114
1115 if ((this_mask != 0) + auto_par + seq_par > 1)
1116 {
1117 if (noisy)
1118 error_at (loop->loc,
1119 seq_par
1120 ? "%<seq%> overrides other OpenACC loop specifiers"
1121 : "%<auto%> conflicts with other OpenACC loop "
1122 "specifiers");
 1123 maybe_auto = false;
1124 loop->flags &= ~OLF_AUTO;
1125 if (seq_par)
1126 {
1127 loop->flags
1128 &= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
1129 this_mask = 0;
1130 }
1131 }
1132
1133 if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
1134 {
1135 loop->flags |= OLF_AUTO;
1136 mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
1137 }
1138 }
1139
1140 if (this_mask & outer_mask)
1141 {
1142 const oacc_loop *outer;
1143 for (outer = loop->parent; outer; outer = outer->parent)
 1144 if ((outer->mask | outer->e_mask) & this_mask)
1145 break;
1146
1147 if (noisy)
1148 {
1149 if (outer)
1150 {
1151 error_at (loop->loc,
1152 "%s uses same OpenACC parallelism as containing loop",
1153 loop->routine ? "routine call" : "inner loop");
1154 inform (outer->loc, "containing loop here");
1155 }
1156 else
1157 error_at (loop->loc,
1158 "%s uses OpenACC parallelism disallowed by containing "
1159 "routine", loop->routine ? "routine call" : "loop");
1160
1161 if (loop->routine)
1162 inform (DECL_SOURCE_LOCATION (loop->routine),
1163 "routine %qD declared here", loop->routine);
1164 }
1165 this_mask &= ~outer_mask;
1166 }
1167 else
1168 {
1169 unsigned outermost = least_bit_hwi (this_mask);
1170
1171 if (outermost && outermost <= outer_mask)
1172 {
1173 if (noisy)
1174 {
1175 error_at (loop->loc,
1176 "incorrectly nested OpenACC loop parallelism");
1177
1178 const oacc_loop *outer;
1179 for (outer = loop->parent;
1180 outer->flags && outer->flags < outermost;
1181 outer = outer->parent)
1182 continue;
1183 inform (outer->loc, "containing loop here");
1184 }
1185
1186 this_mask &= ~outermost;
1187 }
1188 }
1189
1190 mask_all |= this_mask;
1191
1192 if (loop->flags & OLF_TILE)
1193 {
1194 /* When tiling, vector goes to the element loop, and failing
1195 that we put worker there. The std doesn't contemplate
1196 specifying all three. We choose to put worker and vector on
1197 the element loops in that case. */
1198 unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
1199 if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
1200 this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
1201
1202 loop->e_mask = this_e_mask;
1203 this_mask ^= this_e_mask;
1204 }
1205
1206 loop->mask = this_mask;
1207
1208 if (dump_file)
1209 fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
1210 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
1211 loop->mask, loop->e_mask);
1212
1213 if (loop->child)
1214 {
1215 unsigned tmp_mask = outer_mask | this_mask | loop->e_mask;
1216 loop->inner = oacc_loop_fixed_partitions (loop->child, tmp_mask);
1217 mask_all |= loop->inner;
1218 }
1219
1220 if (loop->sibling)
1221 mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);
1222
1223 return mask_all;
1224}
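/* Illustration of the fixed-partition walk above: a "gang" loop nested
   inside another gang-partitioned loop is diagnosed and has its gang bit
   dropped, whereas an unpartitioned "independent" loop only sets the
   GOMP_DIM_MAX bit in the returned mask so that the auto-partitioner
   below picks it up.  */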
1225
1226/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
1227 OUTER_MASK is the partitioning this loop is contained within.
 1228 OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
1229 Return the cumulative partitioning used by this loop, siblings and
1230 children. */
1231
1232static unsigned
1233oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
1234 bool outer_assign)
1235{
1236 bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
1237 bool noisy = true;
 1238 bool tiling = loop->flags & OLF_TILE;
1239
1240#ifdef ACCEL_COMPILER
1241 /* When device_type is supported, we want the device compiler to be
1242 noisy, if the loop parameters are device_type-specific. */
1243 noisy = false;
1244#endif
1245
 1246 if (assign && (!outer_assign || loop->inner))
 1247 {
1248 /* Allocate outermost and non-innermost loops at the outermost
1249 non-innermost available level. */
1250 unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
1251
1252 /* Find the first outermost available partition. */
1253 while (this_mask <= outer_mask)
1254 this_mask <<= 1;
1255
 1256 /* Grab two axes if tiling, and we've not assigned anything. */
1257 if (tiling && !(loop->mask | loop->e_mask))
1258 this_mask |= this_mask << 1;
1259
1260 /* Prohibit the innermost partitioning at the moment. */
1261 this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
 1262
1263 /* Don't use any dimension explicitly claimed by an inner loop. */
1264 this_mask &= ~loop->inner;
1265
1266 if (tiling && !loop->e_mask)
1267 {
1268 /* If we got two axes, allocate the inner one to the element
1269 loop. */
1270 loop->e_mask = this_mask & (this_mask << 1);
1271 this_mask ^= loop->e_mask;
1272 }
1273
1274 loop->mask |= this_mask;
629b3d75
MJ
1275 }
1276
1277 if (loop->child)
1278 {
1279 unsigned tmp_mask = outer_mask | loop->mask | loop->e_mask;
1280 loop->inner = oacc_loop_auto_partitions (loop->child, tmp_mask,
1281 outer_assign | assign);
1282 }
1283
 1284 if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
 1285 {
1286 /* Allocate the loop at the innermost available level. Note
1287 that we do this even if we already assigned this loop the
1288 outermost available level above. That way we'll partition
1289 this along 2 axes, if they are available. */
1290 unsigned this_mask = 0;
1291
 1292 /* Determine the outermost partitioning used within this loop. */
1293 this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
1294 this_mask = least_bit_hwi (this_mask);
1295
1296 /* Pick the partitioning just inside that one. */
1297 this_mask >>= 1;
1298
 1299 /* And avoid picking one used by an outer loop. */
1300 this_mask &= ~outer_mask;
1301
1302 /* If tiling and we failed completely above, grab the next one
 1303 too, making sure it doesn't hit an outer loop. */
1304 if (tiling)
1305 {
1306 this_mask &= ~(loop->e_mask | loop->mask);
1307 unsigned tile_mask = ((this_mask >> 1)
1308 & ~(outer_mask | loop->e_mask | loop->mask));
1309
1310 if (tile_mask || loop->mask)
1311 {
1312 loop->e_mask |= this_mask;
1313 this_mask = tile_mask;
1314 }
1315 if (!loop->e_mask && noisy)
1316 warning_at (loop->loc, 0,
1317 "insufficient partitioning available"
1318 " to parallelize element loop");
1319 }
 1320
1321 loop->mask |= this_mask;
1322 if (!loop->mask && noisy)
1323 warning_at (loop->loc, 0,
1324 "insufficient partitioning available"
1325 " to parallelize%s loop", tiling ? " tile" : "");
1326 }
1327
1328 if (assign && dump_file)
 1329 fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
 1330 LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
 1331 loop->mask, loop->e_mask);
1332
1333 unsigned inner_mask = 0;
1334
1335 if (loop->sibling)
1336 inner_mask |= oacc_loop_auto_partitions (loop->sibling,
1337 outer_mask, outer_assign);
 1338
 1339 inner_mask |= loop->inner | loop->mask | loop->e_mask;
1340
1341 return inner_mask;
1342}
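/* For instance (assuming all three dimensions remain available), a
   two-deep "auto"/"independent" loop nest typically leaves the above with
   the outer loop gang- (and worker-) partitioned and the inner loop
   vector-partitioned.  */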
1343
1344/* Walk the OpenACC loop hierarchy to check and assign partitioning
1345 axes. Return mask of partitioning. */
1346
1347static unsigned
1348oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
1349{
1350 unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
1351
1352 if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
1353 {
1354 mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
 1355 mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
1356 }
1357 return mask_all;
1358}
1359
1360/* Default fork/join early expander. Delete the function calls if
1361 there is no RTL expander. */
1362
1363bool
1364default_goacc_fork_join (gcall *ARG_UNUSED (call),
1365 const int *ARG_UNUSED (dims), bool is_fork)
1366{
1367 if (is_fork)
1368 return targetm.have_oacc_fork ();
1369 else
1370 return targetm.have_oacc_join ();
1371}
1372
1373/* Default goacc.reduction early expander.
1374
1375 LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
1376 If RES_PTR is not integer-zerop:
1377 SETUP - emit 'LHS = *RES_PTR', LHS = NULL
1378 TEARDOWN - emit '*RES_PTR = VAR'
1379 If LHS is not NULL
1380 emit 'LHS = VAR' */
1381
1382void
1383default_goacc_reduction (gcall *call)
1384{
1385 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1386 gimple_stmt_iterator gsi = gsi_for_stmt (call);
1387 tree lhs = gimple_call_lhs (call);
1388 tree var = gimple_call_arg (call, 2);
1389 gimple_seq seq = NULL;
1390
1391 if (code == IFN_GOACC_REDUCTION_SETUP
1392 || code == IFN_GOACC_REDUCTION_TEARDOWN)
1393 {
1394 /* Setup and Teardown need to copy from/to the receiver object,
1395 if there is one. */
1396 tree ref_to_res = gimple_call_arg (call, 1);
1397
1398 if (!integer_zerop (ref_to_res))
1399 {
1400 tree dst = build_simple_mem_ref (ref_to_res);
1401 tree src = var;
1402
1403 if (code == IFN_GOACC_REDUCTION_SETUP)
1404 {
1405 src = dst;
1406 dst = lhs;
1407 lhs = NULL;
1408 }
1409 gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
1410 }
1411 }
1412
1413 /* Copy VAR to LHS, if there is an LHS. */
1414 if (lhs)
1415 gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
1416
1417 gsi_replace_with_seq (&gsi, seq, true);
1418}
1419
1420/* Main entry point for oacc transformations which run on the device
1421 compiler after LTO, so we know what the target device is at this
1422 point (including the host fallback). */
1423
1424static unsigned int
1425execute_oacc_device_lower ()
1426{
1427 tree attrs = oacc_get_fn_attrib (current_function_decl);
1428
1429 if (!attrs)
1430 /* Not an offloaded function. */
1431 return 0;
1432
1433 /* Parse the default dim argument exactly once. */
1434 if ((const void *)flag_openacc_dims != &flag_openacc_dims)
1435 {
1436 oacc_parse_default_dims (flag_openacc_dims);
1437 flag_openacc_dims = (char *)&flag_openacc_dims;
1438 }
1439
1440 /* Discover, partition and process the loops. */
1441 oacc_loop *loops = oacc_loop_discovery ();
1442 int fn_level = oacc_fn_attrib_level (attrs);
1443
1444 if (dump_file)
1445 fprintf (dump_file, oacc_fn_attrib_kernels_p (attrs)
1446 ? "Function is kernels offload\n"
1447 : fn_level < 0 ? "Function is parallel offload\n"
1448 : "Function is routine level %d\n", fn_level);
1449
1450 unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
1451 unsigned used_mask = oacc_loop_partition (loops, outer_mask);
1452 int dims[GOMP_DIM_MAX];
1453
1454 oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);
1455
1456 if (dump_file)
1457 {
1458 const char *comma = "Compute dimensions [";
1459 for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
1460 fprintf (dump_file, "%s%d", comma, dims[ix]);
1461 fprintf (dump_file, "]\n");
1462 }
1463
1464 oacc_loop_process (loops);
1465 if (dump_file)
1466 {
1467 fprintf (dump_file, "OpenACC loops\n");
1468 dump_oacc_loop (dump_file, loops, 0);
1469 fprintf (dump_file, "\n");
1470 }
1471
1472 /* Offloaded targets may introduce new basic blocks, which require
1473 dominance information to update SSA. */
1474 calculate_dominance_info (CDI_DOMINATORS);
1475
1476 /* Now lower internal loop functions to target-specific code
1477 sequences. */
1478 basic_block bb;
1479 FOR_ALL_BB_FN (bb, cfun)
1480 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1481 {
1482 gimple *stmt = gsi_stmt (gsi);
1483 if (!is_gimple_call (stmt))
1484 {
1485 gsi_next (&gsi);
1486 continue;
1487 }
1488
1489 gcall *call = as_a <gcall *> (stmt);
1490 if (!gimple_call_internal_p (call))
1491 {
1492 gsi_next (&gsi);
1493 continue;
1494 }
1495
1496 /* Rewind to allow rescan. */
1497 gsi_prev (&gsi);
1498 bool rescan = false, remove = false;
1499 enum internal_fn ifn_code = gimple_call_internal_fn (call);
1500
1501 switch (ifn_code)
1502 {
1503 default: break;
1504
1505 case IFN_GOACC_TILE:
1506 oacc_xform_tile (call);
1507 rescan = true;
1508 break;
1509
1510 case IFN_GOACC_LOOP:
1511 oacc_xform_loop (call);
1512 rescan = true;
1513 break;
1514
1515 case IFN_GOACC_REDUCTION:
1516 /* Mark the function for SSA renaming. */
1517 mark_virtual_operands_for_renaming (cfun);
1518
1519 /* If the level is -1, this ended up being an unused
1520 axis. Handle as a default. */
1521 if (integer_minus_onep (gimple_call_arg (call, 3)))
1522 default_goacc_reduction (call);
1523 else
1524 targetm.goacc.reduction (call);
1525 rescan = true;
1526 break;
1527
1528 case IFN_UNIQUE:
1529 {
1530 enum ifn_unique_kind kind
1531 = ((enum ifn_unique_kind)
1532 TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1533
1534 switch (kind)
1535 {
1536 default:
 1537 break;
1538
1539 case IFN_UNIQUE_OACC_FORK:
1540 case IFN_UNIQUE_OACC_JOIN:
1541 if (integer_minus_onep (gimple_call_arg (call, 2)))
1542 remove = true;
1543 else if (!targetm.goacc.fork_join
1544 (call, dims, kind == IFN_UNIQUE_OACC_FORK))
1545 remove = true;
1546 break;
1547
1548 case IFN_UNIQUE_OACC_HEAD_MARK:
1549 case IFN_UNIQUE_OACC_TAIL_MARK:
1550 remove = true;
1551 break;
1552 }
1553 break;
1554 }
1555 }
1556
1557 if (gsi_end_p (gsi))
1558 /* We rewound past the beginning of the BB. */
1559 gsi = gsi_start_bb (bb);
1560 else
1561 /* Undo the rewind. */
1562 gsi_next (&gsi);
1563
1564 if (remove)
1565 {
1566 if (gimple_vdef (call))
1567 replace_uses_by (gimple_vdef (call), gimple_vuse (call));
1568 if (gimple_call_lhs (call))
1569 {
1570 /* Propagate the data dependency var. */
1571 gimple *ass = gimple_build_assign (gimple_call_lhs (call),
1572 gimple_call_arg (call, 1));
1573 gsi_replace (&gsi, ass, false);
1574 }
1575 else
1576 gsi_remove (&gsi, true);
1577 }
1578 else if (!rescan)
1579 /* If not rescanning, advance over the call. */
1580 gsi_next (&gsi);
1581 }
1582
1583 free_oacc_loop (loops);
1584
1585 return 0;
1586}
1587
1588/* Default launch dimension validator. Force everything to 1. A
1589 backend that wants to provide larger dimensions must override this
1590 hook. */
1591
1592bool
1593default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
1594 int ARG_UNUSED (fn_level))
1595{
1596 bool changed = false;
1597
1598 for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
1599 {
1600 if (dims[ix] != 1)
1601 {
1602 dims[ix] = 1;
1603 changed = true;
1604 }
1605 }
1606
1607 return changed;
1608}
1609
 1610/* Default dimension bound is unknown on accelerator and 1 on host. */
1611
1612int
1613default_goacc_dim_limit (int ARG_UNUSED (axis))
1614{
1615#ifdef ACCEL_COMPILER
1616 return 0;
1617#else
1618 return 1;
1619#endif
1620}
1621
1622namespace {
1623
1624const pass_data pass_data_oacc_device_lower =
1625{
1626 GIMPLE_PASS, /* type */
1627 "oaccdevlow", /* name */
 1628 OPTGROUP_OMP, /* optinfo_flags */
1629 TV_NONE, /* tv_id */
1630 PROP_cfg, /* properties_required */
1631 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */
1632 0, /* properties_destroyed */
1633 0, /* todo_flags_start */
1634 TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1635};
1636
1637class pass_oacc_device_lower : public gimple_opt_pass
1638{
1639public:
1640 pass_oacc_device_lower (gcc::context *ctxt)
1641 : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
1642 {}
1643
1644 /* opt_pass methods: */
1645 virtual bool gate (function *) { return flag_openacc; };
1646
1647 virtual unsigned int execute (function *)
1648 {
1649 return execute_oacc_device_lower ();
1650 }
1651
1652}; // class pass_oacc_device_lower
1653
1654} // anon namespace
1655
1656gimple_opt_pass *
1657make_pass_oacc_device_lower (gcc::context *ctxt)
1658{
1659 return new pass_oacc_device_lower (ctxt);
1660}
1661
1662/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
1663 VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
1664 LANE is kept to be expanded to RTL later on. Also cleanup all other SIMT
1665 internal functions on non-SIMT targets, and likewise some SIMD internal
1666 functions on SIMT targets. */
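/* For example, on a non-SIMT target (VF == 1) the folding below turns
     x = .GOMP_USE_SIMT ()  into  x = 0
     x = .GOMP_SIMT_VF ()   into  x = 1
     x = .GOMP_SIMT_LANE () into  x = 0
   (illustrative GIMPLE).  */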
1667
1668static unsigned int
1669execute_omp_device_lower ()
1670{
1671 int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
1672 basic_block bb;
1673 gimple_stmt_iterator gsi;
1674 FOR_EACH_BB_FN (bb, cfun)
1675 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1676 {
1677 gimple *stmt = gsi_stmt (gsi);
1678 if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
1679 continue;
1680 tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
1681 tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
1682 switch (gimple_call_internal_fn (stmt))
1683 {
1684 case IFN_GOMP_USE_SIMT:
1685 rhs = vf == 1 ? integer_zero_node : integer_one_node;
1686 break;
1687 case IFN_GOMP_SIMT_LANE:
1688 case IFN_GOMP_SIMT_LAST_LANE:
1689 rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
1690 break;
1691 case IFN_GOMP_SIMT_VF:
1692 rhs = build_int_cst (type, vf);
1693 break;
1694 case IFN_GOMP_SIMT_ORDERED_PRED:
1695 rhs = vf == 1 ? integer_zero_node : NULL_TREE;
1696 if (rhs || !lhs)
1697 unlink_stmt_vdef (stmt);
1698 break;
1699 case IFN_GOMP_SIMT_VOTE_ANY:
1700 case IFN_GOMP_SIMT_XCHG_BFLY:
1701 case IFN_GOMP_SIMT_XCHG_IDX:
1702 rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
1703 break;
1704 case IFN_GOMP_SIMD_LANE:
1705 case IFN_GOMP_SIMD_LAST_LANE:
1706 rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
1707 break;
1708 case IFN_GOMP_SIMD_VF:
1709 rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
1710 break;
1711 default:
1712 continue;
1713 }
1714 if (lhs && !rhs)
1715 continue;
1716 stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
1717 gsi_replace (&gsi, stmt, false);
1718 }
1719 if (vf != 1)
1720 cfun->has_force_vectorize_loops = false;
1721 return 0;
1722}
1723
1724namespace {
1725
1726const pass_data pass_data_omp_device_lower =
1727{
1728 GIMPLE_PASS, /* type */
1729 "ompdevlow", /* name */
 1730 OPTGROUP_OMP, /* optinfo_flags */
1731 TV_NONE, /* tv_id */
1732 PROP_cfg, /* properties_required */
1733 PROP_gimple_lomp_dev, /* properties_provided */
1734 0, /* properties_destroyed */
1735 0, /* todo_flags_start */
1736 TODO_update_ssa, /* todo_flags_finish */
1737};
1738
1739class pass_omp_device_lower : public gimple_opt_pass
1740{
1741public:
1742 pass_omp_device_lower (gcc::context *ctxt)
1743 : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
1744 {}
1745
1746 /* opt_pass methods: */
 1747 virtual bool gate (function *fun)
 1748 {
 1749 return !(fun->curr_properties & PROP_gimple_lomp_dev);
1750 }
1751 virtual unsigned int execute (function *)
1752 {
1753 return execute_omp_device_lower ();
1754 }
1755
 1756}; // class pass_omp_device_lower
1757
1758} // anon namespace
1759
1760gimple_opt_pass *
1761make_pass_omp_device_lower (gcc::context *ctxt)
1762{
1763 return new pass_omp_device_lower (ctxt);
1764}
1765
1766/* "omp declare target link" handling pass. */
1767
1768namespace {
1769
1770const pass_data pass_data_omp_target_link =
1771{
1772 GIMPLE_PASS, /* type */
1773 "omptargetlink", /* name */
 1774 OPTGROUP_OMP, /* optinfo_flags */
1775 TV_NONE, /* tv_id */
1776 PROP_ssa, /* properties_required */
1777 0, /* properties_provided */
1778 0, /* properties_destroyed */
1779 0, /* todo_flags_start */
1780 TODO_update_ssa, /* todo_flags_finish */
1781};
1782
1783class pass_omp_target_link : public gimple_opt_pass
1784{
1785public:
1786 pass_omp_target_link (gcc::context *ctxt)
1787 : gimple_opt_pass (pass_data_omp_target_link, ctxt)
1788 {}
1789
1790 /* opt_pass methods: */
1791 virtual bool gate (function *fun)
1792 {
1793#ifdef ACCEL_COMPILER
1794 tree attrs = DECL_ATTRIBUTES (fun->decl);
1795 return lookup_attribute ("omp declare target", attrs)
1796 || lookup_attribute ("omp target entrypoint", attrs);
1797#else
1798 (void) fun;
1799 return false;
1800#endif
1801 }
1802
1803 virtual unsigned execute (function *);
1804};
1805
1806/* Callback for walk_gimple_stmt used to scan for link var operands. */
1807
1808static tree
1809find_link_var_op (tree *tp, int *walk_subtrees, void *)
1810{
1811 tree t = *tp;
1812
1813 if (VAR_P (t)
1814 && DECL_HAS_VALUE_EXPR_P (t)
1815 && is_global_var (t)
1816 && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
1817 {
1818 *walk_subtrees = 0;
1819 return t;
1820 }
1821
1822 return NULL_TREE;
1823}
1824
1825unsigned
1826pass_omp_target_link::execute (function *fun)
1827{
1828 basic_block bb;
1829 FOR_EACH_BB_FN (bb, fun)
1830 {
1831 gimple_stmt_iterator gsi;
1832 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1833 if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
1834 gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
1835 }
1836
1837 return 0;
1838}
1839
1840} // anon namespace
1841
1842gimple_opt_pass *
1843make_pass_omp_target_link (gcc::context *ctxt)
1844{
1845 return new pass_omp_target_link (ctxt);
1846}