We want to permit full occupancy, so size accordingly. */
+/* Use this as a default, but allow it to grow if the user requests a large
+ amount of gang-private shared-memory space. */
+static int acc_lds_size = 0x600;
+
#define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */
-#define ACC_LDS_SIZE 32768 /* Half of the total should be fine. */
+#define ACC_LDS_SIZE acc_lds_size
#define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */
#define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
: flag_openmp ? OMP_LDS_SIZE \
: OTHER_LDS_SIZE)
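+
+/* Gang-private variables are assigned LDS offsets per compilation unit:
+   GANG_PRIVATE_HWM is the allocation high-water mark (the first 32 bytes of
+   LDS remain reserved, as before), and LDS_ALLOCS records the offset chosen
+   for each variable.  */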
+static int gang_private_hwm = 32;
+static hash_map<tree, int> lds_allocs;
+
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
f = ggc_cleared_alloc<machine_function> ();
- /* Set up LDS allocation for broadcasting for this function. */
- f->lds_allocated = 32;
- f->lds_allocs = hash_map<tree, int>::create_ggc (64);
-
- /* And LDS temporary decls for worker reductions. */
- vec_alloc (f->reduc_decls, 0);
-
if (TARGET_GCN3)
f->use_flat_addressing = true;
stack_size_opt = 1048576;
}
+ /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
+ worker broadcasts. */
+ if (gang_private_size_opt == -1)
+ gang_private_size_opt = 512;
+ else if (gang_private_size_opt < gang_private_hwm)
+ gang_private_size_opt = gang_private_hwm;
+ else if (gang_private_size_opt >= acc_lds_size - 1024)
+ {
+      /* We need some space for reductions and worker broadcasting.  If the
+	 user requests a large amount of gang-private LDS space, we might not
+	 have enough left for those.  Increase the LDS allocation in that
+	 case, although this may reduce the maximum occupancy on the
+	 hardware.  */
+ acc_lds_size = gang_private_size_opt + 1024;
+ if (acc_lds_size > 32768)
+ acc_lds_size = 32768;
+ }
+
/* The xnack option is a placeholder, for now. */
if (flag_xnack)
sorry ("XNACK support");
The low-part is the address of the topmost addressable byte, which is
size-1. The high-part is an offset and should be zero. */
emit_move_insn (gen_rtx_REG (SImode, M0_REG),
- gen_int_mode (LDS_SIZE-1, SImode));
+ gen_int_mode (LDS_SIZE, SImode));
emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
}
}
+/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */
+
+static void
+gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
+ unsigned HOST_WIDE_INT *hi,
+ int ARG_UNUSED (dims[GOMP_DIM_MAX]),
+ unsigned HOST_WIDE_INT
+ ARG_UNUSED (private_size[GOMP_DIM_MAX]),
+ unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
+{
+ *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
+ /* !!! We can maybe use dims[] to estimate the maximum number of work
+ groups/wavefronts/etc. we will launch, and therefore tune the maximum
+ amount of LDS we should use. For now, use a minimal amount to try to
+ maximise occupancy. */
+ *hi = acc_lds_size;
+ machine_function *machfun = cfun->machine;
+ machfun->reduction_base = gang_private_size_opt;
+ machfun->reduction_limit
+ = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
+}
+
/* }}} */
/* {{{ ASM Output. */
/* Helper function for gcn_asm_output_symbol_ref.
- FIXME: If we want to have propagation blocks allocated separately and
- statically like this, it would be better done via symbol refs and the
- assembler/linker. This is a temporary hack. */
+ FIXME: This function is used to lay out gang-private variables in LDS
+ on a per-CU basis.
+ There may be cases in which gang-private variables in different compilation
+ units could clobber each other. In that case we should be relying on the
+ linker to lay out gang-private LDS space, but that doesn't appear to be
+ possible at present. */
static void
gcn_print_lds_decl (FILE *f, tree var)
{
int *offset;
- machine_function *machfun = cfun->machine;
-
- if ((offset = machfun->lds_allocs->get (var)))
+ if ((offset = lds_allocs.get (var)))
fprintf (f, "%u", (unsigned) *offset);
else
{
if (size > align && size > 4 && align < 8)
align = 8;
- machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
- & ~(align - 1));
+ gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
- machfun->lds_allocs->put (var, machfun->lds_allocated);
- fprintf (f, "%u", machfun->lds_allocated);
- machfun->lds_allocated += size;
- if (machfun->lds_allocated > LDS_SIZE)
- error ("local data-share memory exhausted");
+ lds_allocs.put (var, gang_private_hwm);
+ fprintf (f, "%u", gang_private_hwm);
+ gang_private_hwm += size;
+ if (gang_private_hwm > gang_private_size_opt)
+ error ("gang-private data-share memory exhausted (increase with "
+ "%<-mgang-private-size=<number>%>)");
}
}
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
+#undef TARGET_GOACC_SHARED_MEM_LAYOUT
+#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef TARGET_HARD_REGNO_NREGS
#include "tree-cfg.h"
#include "omp-offload.h"
#include "attribs.h"
+#include "targhooks.h"
+#include "diagnostic-core.h"
/* Loop structure of the function. The entire function is described as
a NULL loop. */
static tree
build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
{
+ if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
+ sender_decl = build_simple_mem_ref (sender_decl);
tree field = *fields->get (var);
return oacc_build_component_ref (sender_decl, field);
}
worker_single_copy (basic_block from, basic_block to,
hash_set<tree> *def_escapes_block,
hash_set<tree> *worker_partitioned_uses,
- tree record_type, record_field_map_t *record_field_map)
+ tree record_type, record_field_map_t *record_field_map,
+ unsigned HOST_WIDE_INT placement,
+ bool isolate_broadcasts)
{
/* If we only have virtual defs, we'll have no record type, but we still want
to emit single_copy_start and (particularly) single_copy_end to act as
tree sender_decl
= targetm.goacc.create_worker_broadcast_record (record_type, true,
- ".oacc_worker_o");
+ ".oacc_worker_o",
+ placement);
tree receiver_decl
= targetm.goacc.create_worker_broadcast_record (record_type, false,
- ".oacc_worker_i");
+ ".oacc_worker_i",
+ placement);
gimple_stmt_iterator gsi = gsi_last_bb (to);
if (EDGE_COUNT (to->succs) > 1)
tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
- gimple *call = gimple_build_call (decl, 1,
- build_fold_addr_expr (sender_decl));
+ gimple *call
+ = gimple_build_call (decl, 1,
+ POINTER_TYPE_P (TREE_TYPE (sender_decl))
+ ? sender_decl : build_fold_addr_expr (sender_decl));
gimple_call_set_lhs (call, lhs);
gsi_insert_before (&start, call, GSI_NEW_STMT);
update_stmt (call);
+ /* The shared-memory range for this block overflowed. Add a barrier before
+ the GOACC_single_copy_start call. */
+ if (isolate_broadcasts)
+ {
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+ gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
+ }
+
tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
gimple *conv = gimple_build_assign (conv_tmp,
}
}
+ /* The shared-memory range for this block overflowed. Add a barrier at the
+ end. */
+ if (isolate_broadcasts)
+ {
+ gsi = gsi_start_bb (exit_block);
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+ gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
+ }
+
/* It's possible for the ET->DEST block (the work done by the active thread)
to finish with a control-flow insn, e.g. a UNIQUE function call. Split
the block and add SENDER_SEQ in the latter part to avoid having control
flow in the middle of a BB. */
decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
- call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
+ call = gimple_build_call (decl, 1,
+ POINTER_TYPE_P (TREE_TYPE (sender_decl))
+ ? sender_decl
+ : build_fold_addr_expr (sender_decl));
gimple_seq_add_stmt (&sender_seq, call);
gsi = gsi_last_bb (body);
gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
}
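+
+/* For each basic block that broadcasts, the offset of its broadcast buffer
+   within worker shared memory, paired with a flag that is true if a
+   dedicated (conflict-free) range was successfully allocated for it.  */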
+typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
+ blk_offset_map_t;
+
static void
neuter_worker_single (parallel_g *par, unsigned outer_mask,
bitmap worker_single, bitmap vector_single,
vec<propagation_set *> *prop_set,
hash_set<tree> *partitioned_var_uses,
- record_field_map_t *record_field_map)
+ record_field_map_t *record_field_map,
+ blk_offset_map_t *blk_offset_map)
{
unsigned mask = outer_mask | par->mask;
tree record_type = (tree) block->aux;
if (has_defs)
- worker_single_copy (block, block, &def_escapes_block,
- &worker_partitioned_uses, record_type,
- record_field_map);
+ {
+ std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
+ = blk_offset_map->get (block);
+ gcc_assert (!record_type || off_rngalloc);
+ unsigned HOST_WIDE_INT offset
+ = off_rngalloc ? off_rngalloc->first : 0;
+ bool range_allocated
+ = off_rngalloc ? off_rngalloc->second : true;
+ worker_single_copy (block, block, &def_escapes_block,
+ &worker_partitioned_uses, record_type,
+ record_field_map,
+ offset, !range_allocated);
+ }
else
worker_single_simple (block, block, &def_escapes_block);
}
if (par->inner)
neuter_worker_single (par->inner, mask, worker_single, vector_single,
- prop_set, partitioned_var_uses, record_field_map);
+ prop_set, partitioned_var_uses, record_field_map,
+ blk_offset_map);
if (par->next)
neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
- prop_set, partitioned_var_uses, record_field_map);
+ prop_set, partitioned_var_uses, record_field_map,
+ blk_offset_map);
+}
+
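+/* Walk the CFG from BB, setting in REACHABLE the bit of each reachable
+   broadcast block (a block with a record type in its AUX field).  The walk
+   does not continue past broadcast blocks.  */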
+static void
+dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
+{
+ if (bb->flags & BB_VISITED)
+ return;
+
+ bb->flags |= BB_VISITED;
+
+ if (bb->succs)
+ {
+ edge e;
+ edge_iterator ei;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ basic_block dest = e->dest;
+ if (dest->aux)
+ bitmap_set_bit (reachable, dest->index);
+ else
+ dfs_broadcast_reachable_1 (dest, reachable);
+ }
+ }
}
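+
+/* A basic block index paired with the broadcast record type used in that
+   block.  */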
+typedef std::pair<int, tree> idx_decl_pair_t;
+
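+/* A vector, indexed by basic block number, of splay trees recording the
+   shared-memory address ranges already allocated to that block's
+   broadcasts.  */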
+typedef auto_vec<splay_tree> used_range_vec_t;
+
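+/* qsort comparator: order broadcast blocks by the size of their broadcast
+   record type, largest first.  */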
static int
-execute_omp_oacc_neuter_broadcast ()
+sort_size_descending (const void *a, const void *b)
+{
+ const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
+ const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
+ unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
+ unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
+ return bsize - asize;
+}
+
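+/* A half-open address range [LO, HI) within worker shared memory.  The
+   default-constructed range [0, 0) is treated as invalid.  */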
+class addr_range
+{
+public:
+ addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
+ : lo (addr_lo), hi (addr_hi)
+ { }
+ addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
+ { }
+ addr_range () : lo (0), hi (0)
+ { }
+
+ bool invalid () { return lo == 0 && hi == 0; }
+
+ unsigned HOST_WIDE_INT lo;
+ unsigned HOST_WIDE_INT hi;
+};
+
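+/* Splay-tree ordering for address ranges: disjoint ranges are ordered by
+   address, and any two overlapping (or identical) ranges compare equal.  */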
+static int
+splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
+{
+ addr_range *ar = (addr_range *) a;
+ addr_range *br = (addr_range *) b;
+ if (ar->lo == br->lo && ar->hi == br->hi)
+ return 0;
+ if (ar->hi <= br->lo)
+ return -1;
+ else if (ar->lo >= br->hi)
+ return 1;
+ return 0;
+}
+
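+/* Free an address range used as a splay-tree key.  */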
+static void
+splay_tree_free_key (splay_tree_key k)
+{
+ addr_range *ar = (addr_range *) k;
+ delete ar;
+}
+
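+/* Search the splay tree S of allocated address ranges, in ascending address
+   order, for the first gap that can hold SIZE bytes at alignment ALIGN while
+   staying within BOUNDS.  Return the new range, or an invalid range if
+   nothing fits.  */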
+static addr_range
+first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
+ unsigned HOST_WIDE_INT align, addr_range *bounds)
+{
+ splay_tree_node min = splay_tree_min (s);
+ if (min)
+ {
+ splay_tree_node next;
+ while ((next = splay_tree_successor (s, min->key)))
+ {
+ unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
+ unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
+ unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
+ if (base + size <= hi)
+ return addr_range (base, base + size);
+ min = next;
+ }
+
+ unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
+ base = (base + align - 1) & ~(align - 1);
+ if (base + size <= bounds->hi)
+ return addr_range (base, base + size);
+ else
+ return addr_range ();
+ }
+ else
+ {
+ unsigned HOST_WIDE_INT lo = bounds->lo;
+ lo = (lo + align - 1) & ~(align - 1);
+ if (lo + size <= bounds->hi)
+ return addr_range (lo, lo + size);
+ else
+ return addr_range ();
+ }
+}
+
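+/* Splay-tree callback: add the address range at node N to the accumulator
+   tree passed in PTR, merging it with any overlapping range already
+   present.  */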
+static int
+merge_ranges_1 (splay_tree_node n, void *ptr)
+{
+ splay_tree accum = (splay_tree) ptr;
+ addr_range ar = *(addr_range *) n->key;
+
+ splay_tree_node old = splay_tree_lookup (accum, n->key);
+
+  /* We might have an overlap.  If so, replace the old range with a single
+     new range covering both.  */
+ if (old)
+ {
+ addr_range *old_ar = (addr_range *) old->key;
+ ar.lo = MIN (old_ar->lo, ar.lo);
+ ar.hi = MAX (old_ar->hi, ar.hi);
+ splay_tree_remove (accum, old->key);
+ }
+
+ addr_range *new_ar = new addr_range (ar);
+
+ splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
+
+ return 0;
+}
+
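+/* Merge all address ranges recorded in SP into ACCUM.  */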
+static void
+merge_ranges (splay_tree accum, splay_tree sp)
+{
+ splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
+}
+
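+/* Perform worker-single neutering and broadcasting for the current function,
+   laying out the broadcast buffers within the worker shared-memory region
+   [BOUNDS_LO, BOUNDS_HI).  */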
+static void
+oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
+ unsigned HOST_WIDE_INT bounds_hi)
{
bb_stmt_map_t bb_stmt_map;
auto_bitmap worker_single, vector_single;
}
}
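+  /* Compute, for each broadcast block, the set of broadcast blocks reachable
+     from it: two blocks may only share a shared-memory range if neither can
+     reach the other.  */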
+ sbitmap *reachable
+ = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+ last_basic_block_for_fn (cfun));
+
+ bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
+
+ auto_vec<std::pair<int, tree> > priority;
+
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ if (bb->aux)
+ {
+ tree record_type = (tree) bb->aux;
+
+ basic_block bb2;
+ FOR_ALL_BB_FN (bb2, cfun)
+ bb2->flags &= ~BB_VISITED;
+
+ priority.safe_push (std::make_pair (bb->index, record_type));
+ dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
+ }
+ }
+
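+  /* Also record the inverse relation: a block conflicts both with the blocks
+     it can reach and with the blocks that can reach it.  */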
+ sbitmap *inverted
+ = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+ last_basic_block_for_fn (cfun));
+
+ bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
+
+ for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+ {
+ sbitmap_iterator bi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
+ bitmap_set_bit (inverted[j], i);
+ }
+
+ for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+ bitmap_ior (reachable[i], reachable[i], inverted[i]);
+
+ sbitmap_vector_free (inverted);
+
+ used_range_vec_t used_ranges;
+
+ used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
+
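+  /* The chosen shared-memory offset for each broadcast block, and whether a
+     conflict-free range was found for it.  */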
+ blk_offset_map_t blk_offset_map;
+
+ addr_range worker_shm_bounds (bounds_lo, bounds_hi);
+
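+  /* Assign a shared-memory range to each broadcast block, largest record
+     first, using first-fit against the ranges already assigned to
+     conflicting blocks.  */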
+ priority.qsort (sort_size_descending);
+ for (unsigned int i = 0; i < priority.length (); i++)
+ {
+ idx_decl_pair_t p = priority[i];
+ int blkno = p.first;
+ tree record_type = p.second;
+ HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
+ HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
+
+ splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
+ splay_tree_free_key, NULL);
+
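+      /* Gather the ranges already in use by this block and by every block
+	 that conflicts with it.  */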
+ if (!used_ranges[blkno])
+ used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
+ splay_tree_free_key, NULL);
+ else
+ merge_ranges (conflicts, used_ranges[blkno]);
+
+ sbitmap_iterator bi;
+ unsigned int j;
+ EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
+ if (used_ranges[j])
+ merge_ranges (conflicts, used_ranges[j]);
+
+ addr_range ar
+ = first_fit_range (conflicts, size, align, &worker_shm_bounds);
+
+ splay_tree_delete (conflicts);
+
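+      /* No conflict-free range could be found.  Fall back to a base address
+	 within the region and record FALSE so that this block's broadcast is
+	 isolated with barriers (see worker_single_copy).  */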
+ if (ar.invalid ())
+ {
+ unsigned HOST_WIDE_INT base;
+ base = bounds_lo + random () % 512;
+ base = (base + align - 1) & ~(align - 1);
+ if (base + size > bounds_hi)
+ error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
+ std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
+ = std::make_pair (base, false);
+ blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+ }
+ else
+ {
+ splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
+ (splay_tree_key) &ar);
+ if (old)
+ {
+ fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
+ "already mapped in block %d\n", (int) ar.lo,
+ (int) ar.hi, (int) ((addr_range *) old->key)->lo,
+ (int) ((addr_range *) old->key)->hi, blkno);
+ abort ();
+ }
+
+ addr_range *arp = new addr_range (ar);
+ splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
+ (splay_tree_value) blkno);
+ std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
+ = std::make_pair (ar.lo, true);
+ blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+ }
+ }
+
+ sbitmap_vector_free (reachable);
+
neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
- &partitioned_var_uses, &record_field_map);
+ &partitioned_var_uses, &record_field_map,
+ &blk_offset_map);
for (auto it : record_field_map)
delete it.second;
fprintf (dump_file, "\n\nAfter neutering:\n\n");
dump_function_to_file (current_function_decl, dump_file, dump_flags);
}
+}
+
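+/* Pre-compute the shared-memory space needed for reductions and gang-private
+   variables, ask the target how much shared memory is available for
+   broadcasting, and then perform worker neutering/broadcasting unless the
+   function is known to use only a single worker.  */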
+static int
+execute_omp_oacc_neuter_broadcast ()
+{
+ unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
+ unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
+
+ for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+ {
+ reduction_size[i] = 0;
+ private_size[i] = 0;
+ }
+
+ /* Calculate shared memory size required for reduction variables and
+ gang-private memory for this offloaded function. */
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!is_gimple_call (stmt))
+ continue;
+ gcall *call = as_a <gcall *> (stmt);
+ if (!gimple_call_internal_p (call))
+ continue;
+ enum internal_fn ifn_code = gimple_call_internal_fn (call);
+ switch (ifn_code)
+ {
+ default: break;
+ case IFN_GOACC_REDUCTION:
+ if (integer_minus_onep (gimple_call_arg (call, 3)))
+ continue;
+ else
+ {
+ unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+ /* Only count reduction variables once: the choice to pick
+ the setup call is fairly arbitrary. */
+ if (code == IFN_GOACC_REDUCTION_SETUP)
+ {
+ int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+ tree var = gimple_call_arg (call, 2);
+ tree offset = gimple_call_arg (call, 5);
+ tree var_type = TREE_TYPE (var);
+ unsigned HOST_WIDE_INT limit
+ = (tree_to_uhwi (offset)
+ + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
+ reduction_size[level]
+ = MAX (reduction_size[level], limit);
+ }
+ }
+ break;
+ case IFN_UNIQUE:
+ {
+ enum ifn_unique_kind kind
+ = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+
+ if (kind == IFN_UNIQUE_OACC_PRIVATE)
+ {
+ HOST_WIDE_INT level
+ = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+ if (level == -1)
+ break;
+ for (unsigned i = 3;
+ i < gimple_call_num_args (call);
+ i++)
+ {
+ tree arg = gimple_call_arg (call, i);
+ gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+ tree decl = TREE_OPERAND (arg, 0);
+ unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
+ private_size[level] = ((private_size[level] + align - 1)
+ & ~(align - 1));
+ unsigned HOST_WIDE_INT decl_size
+ = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
+ private_size[level] += decl_size;
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ int dims[GOMP_DIM_MAX];
+ for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+ dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
+
+ /* Find bounds of shared-memory buffer space we can use. */
+ unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
+ if (targetm.goacc.shared_mem_layout)
+ targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
+ private_size, reduction_size);
+
+ /* Perform worker partitioning unless we know 'num_workers(1)'. */
+ if (dims[GOMP_DIM_WORKER] != 1)
+ oacc_do_neutering (bounds_lo, bounds_hi);
return 0;
}
if (!attr)
return false;
- /* Not relevant for 'num_workers(1)'. */
- int worker_dim
- = oacc_get_fn_dim_size (fun->decl, GOMP_DIM_WORKER);
- if (worker_dim == 1)
- return false;
-
return true;
}