From 1cd81a77f08ca49a2781f0583a6b689f2881d285 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Wed, 11 Oct 2017 08:07:18 -0700 Subject: [PATCH] [og9] OpenACC middle-end worker-partitioning support gcc/ * Makefile.in (OBJS): Add omp-sese.o. * omp-builtins.def (BUILT_IN_GOACC_BARRIER, BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END): New builtins. * omp-offload.c (omp-sese.h): Include header. (oacc_loop_xform_head_tail): Call update_stmt for modified builtin calls. (oacc_loop_process): Likewise. (default_goacc_create_propagation_record): New default implementation for TARGET_GOACC_CREATE_PROPAGATION_RECORD hook. (execute_oacc_loop_designation): New. Split out of oacc_device_lower. (execute_oacc_gimple_workers): New. Likewise. (execute_oacc_device_lower): Recreate dims array. (pass_data_oacc_loop_designation, pass_data_oacc_gimple_workers): New. (pass_oacc_loop_designation, pass_oacc_gimple_workers): New. (make_pass_oacc_loop_designation, make_pass_oacc_gimple_workers): New. * omp-offload.h (oacc_fn_attrib_level): Add prototype. * omp-sese.c: New file. * omp-sese.h: New file. * passes.def (pass_oacc_loop_designation, pass_oacc_gimple_workers): Add passes. * target.def (worker_partitioning, create_propagation_record): Add target hooks. * targhooks.h (default_goacc_create_propagation_record): Add prototype. * tree-pass.h (make_pass_oacc_loop_designation, make_pass_oacc_gimple_workers): Add prototypes. * doc/tm.texi.in (TARGET_GOACC_WORKER_PARTITIONING, TARGET_GOACC_CREATE_PROPAGATION_RECORD): Add documentation hooks. * doc/tm.texi: Regenerate. (cherry picked from openacc-gcc-9-branch commit 1de0113e1a6807da85e5c7b0f7d473234f78dd45) --- gcc/ChangeLog.omp | 32 + gcc/Makefile.in | 1 + gcc/doc/tm.texi | 10 + gcc/doc/tm.texi.in | 4 + gcc/omp-builtins.def | 8 + gcc/omp-offload.c | 159 +++- gcc/omp-offload.h | 1 + gcc/omp-sese.c | 2036 ++++++++++++++++++++++++++++++++++++++++++ gcc/omp-sese.h | 26 + gcc/passes.def | 2 + gcc/target.def | 13 + gcc/targhooks.h | 1 + gcc/tree-pass.h | 2 + 13 files changed, 2276 insertions(+), 19 deletions(-) create mode 100644 gcc/omp-sese.c create mode 100644 gcc/omp-sese.h diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index b1c627b394c4..a2b2dcfcf268 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,35 @@ +2019-09-05 Julian Brown + + * Makefile.in (OBJS): Add omp-sese.o. + * omp-builtins.def (BUILT_IN_GOACC_BARRIER, BUILT_IN_GOACC_SINGLE_START, + BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END): New + builtins. + * omp-offload.c (omp-sese.h): Include header. + (oacc_loop_xform_head_tail): Call update_stmt for modified builtin + calls. + (oacc_loop_process): Likewise. + (default_goacc_create_propagation_record): New default implementation + for TARGET_GOACC_CREATE_PROPAGATION_RECORD hook. + (execute_oacc_loop_designation): New. Split out of oacc_device_lower. + (execute_oacc_gimple_workers): New. Likewise. + (execute_oacc_device_lower): Recreate dims array. + (pass_data_oacc_loop_designation, pass_data_oacc_gimple_workers): New. + (pass_oacc_loop_designation, pass_oacc_gimple_workers): New. + (make_pass_oacc_loop_designation, make_pass_oacc_gimple_workers): New. + * omp-offload.h (oacc_fn_attrib_level): Add prototype. + * omp-sese.c: New file. + * omp-sese.h: New file. + * passes.def (pass_oacc_loop_designation, pass_oacc_gimple_workers): + Add passes. + * target.def (worker_partitioning, create_propagation_record): Add + target hooks. + * targhooks.h (default_goacc_create_propagation_record): Add prototype. + * tree-pass.h (make_pass_oacc_loop_designation, + make_pass_oacc_gimple_workers): Add prototypes. + * doc/tm.texi.in (TARGET_GOACC_WORKER_PARTITIONING, + TARGET_GOACC_CREATE_PROPAGATION_RECORD): Add documentation hooks. + * doc/tm.texi: Regenerate. + 2019-09-05 Julian Brown * omp-offload.c (convert.h): Include. diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 9e93eaaa1739..5159cd6a30ce 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1429,6 +1429,7 @@ OBJS = \ omp-expand.o \ omp-general.o \ omp-grid.o \ + omp-sese.o \ omp-low.o \ omp-oacc-kernels.o \ omp-simd-clone.o \ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index f3707c6abe3f..536a436b1c4d 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6166,6 +6166,16 @@ handle this VAR_DECL, and normal RTL expanding is resumed. Tweak variable declaration for a gang-private variable. @end deftypefn +@deftypevr {Target Hook} bool TARGET_GOACC_WORKER_PARTITIONING +Use gimple transformation for worker neutering/broadcasting. +@end deftypevr + +@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_PROPAGATION_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}) +Create a record used to propagate local-variable state from an active +worker to other workers. A possible implementation might adjust the type +of REC to place the new variable in shared GPU memory. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_GOACC_EXPLODE_ARGS (void) Define this hook to TRUE if arguments to offload regions should be exploded, i.e. passed as true arguments rather than in an argument array. diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index cebadf4a502d..c0b92f25da79 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4212,6 +4212,10 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_GOACC_ADJUST_GANGPRIVATE_DECL +@hook TARGET_GOACC_WORKER_PARTITIONING + +@hook TARGET_GOACC_CREATE_PROPAGATION_RECORD + @hook TARGET_GOACC_EXPLODE_ARGS @node Anchored Addresses diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def index 9961c2874943..a8f10e3389e1 100644 --- a/gcc/omp-builtins.def +++ b/gcc/omp-builtins.def @@ -73,6 +73,8 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_BARRIER, "GOMP_barrier", BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_BARRIER_CANCEL, "GOMP_barrier_cancel", BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN (BUILT_IN_GOACC_BARRIER, "GOACC_barrier", + BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKWAIT, "GOMP_taskwait", BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKWAIT_DEPEND, "GOMP_taskwait_depend", @@ -410,6 +412,12 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SINGLE_COPY_START, "GOMP_single_copy_start", BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SINGLE_COPY_END, "GOMP_single_copy_end", BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_START, "GOACC_single_start", + BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_COPY_START, "GOACC_single_copy_start", + BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST) +DEF_GOACC_BUILTIN (BUILT_IN_GOACC_SINGLE_COPY_END, "GOACC_single_copy_end", + BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_OFFLOAD_REGISTER, "GOMP_offload_register_ver", BT_FN_VOID_UINT_PTR_INT_PTR, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_OFFLOAD_UNREGISTER, diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index c94dc956d7eb..a6f64aac37e8 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -52,6 +52,7 @@ along with GCC; see the file COPYING3. If not see #include "stringpool.h" #include "attribs.h" #include "cfgloop.h" +#include "omp-sese.h" #include "convert.h" /* Describe the OpenACC looping structure of a function. The entire @@ -1117,6 +1118,8 @@ oacc_loop_xform_head_tail (gcall *from, int level) else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION)) *gimple_call_arg_ptr (stmt, 3) = replacement; + update_stmt (stmt); + gsi_next (&gsi); while (gsi_end_p (gsi)) gsi = gsi_start_bb (single_succ (gsi_bb (gsi))); @@ -1141,25 +1144,28 @@ oacc_loop_process (oacc_loop *loop) gcall *call; for (ix = 0; loop->ifns.iterate (ix, &call); ix++) - switch (gimple_call_internal_fn (call)) - { - case IFN_GOACC_LOOP: + { + switch (gimple_call_internal_fn (call)) { - bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node; - gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg); - if (!is_e) - gimple_call_set_arg (call, 4, chunk_arg); - } - break; + case IFN_GOACC_LOOP: + { + bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node; + gimple_call_set_arg (call, 5, is_e ? e_mask_arg : mask_arg); + if (!is_e) + gimple_call_set_arg (call, 4, chunk_arg); + } + break; - case IFN_GOACC_TILE: - gimple_call_set_arg (call, 3, mask_arg); - gimple_call_set_arg (call, 4, e_mask_arg); - break; + case IFN_GOACC_TILE: + gimple_call_set_arg (call, 3, mask_arg); + gimple_call_set_arg (call, 4, e_mask_arg); + break; - default: - gcc_unreachable (); - } + default: + gcc_unreachable (); + } + update_stmt (call); + } unsigned dim = GOMP_DIM_GANG; unsigned mask = loop->mask | loop->e_mask; @@ -1643,12 +1649,27 @@ is_sync_builtin_call (gcall *call) return false; } +/* Default implementation creates a temporary variable of type RECORD_TYPE if + SENDER is true, else a pointer to RECORD_TYPE if SENDER is false. */ + +tree +default_goacc_create_propagation_record (tree record_type, bool sender, + const char *name) +{ + tree type = record_type; + + if (!sender) + type = build_pointer_type (type); + + return create_tmp_var (type, name); +} + /* Main entry point for oacc transformations which run on the device compiler after LTO, so we know what the target device is at this point (including the host fallback). */ static unsigned int -execute_oacc_device_lower () +execute_oacc_loop_designation () { tree attr = oacc_get_fn_attrib (current_function_decl); if (!attr) @@ -1777,10 +1798,36 @@ execute_oacc_device_lower () free_oacc_loop (l); } + free_oacc_loop (loops); + /* Offloaded targets may introduce new basic blocks, which require dominance information to update SSA. */ calculate_dominance_info (CDI_DOMINATORS); + return 0; +} + +int +execute_oacc_gimple_workers (void) +{ + oacc_do_neutering (); + calculate_dominance_info (CDI_DOMINATORS); + return 0; +} + +static unsigned int +execute_oacc_device_lower () +{ + int dims[GOMP_DIM_MAX]; + tree attr = oacc_get_fn_attrib (current_function_decl); + + if (!attr) + /* Not an offloaded function. */ + return 0; + + for (unsigned i = 0; i < GOMP_DIM_MAX; i++) + dims[i] = oacc_get_fn_dim_size (current_function_decl, i); + /* Now lower internal loop functions to target-specific code sequences. */ basic_block bb; @@ -1948,8 +1995,6 @@ execute_oacc_device_lower () } } - free_oacc_loop (loops); - return 0; } @@ -1990,6 +2035,70 @@ default_goacc_dim_limit (int ARG_UNUSED (axis)) namespace { +const pass_data pass_data_oacc_loop_designation = +{ + GIMPLE_PASS, /* type */ + "oaccloops", /* name */ + OPTGROUP_OMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + 0 /* Possibly PROP_gimple_eomp. */, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa | TODO_cleanup_cfg + | TODO_rebuild_alias, /* todo_flags_finish */ +}; + +class pass_oacc_loop_designation : public gimple_opt_pass +{ +public: + pass_oacc_loop_designation (gcc::context *ctxt) + : gimple_opt_pass (pass_data_oacc_loop_designation, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) { return flag_openacc; }; + + virtual unsigned int execute (function *) + { + return execute_oacc_loop_designation (); + } + +}; // class pass_oacc_loop_designation + +const pass_data pass_data_oacc_gimple_workers = +{ + GIMPLE_PASS, /* type */ + "oaccworkers", /* name */ + OPTGROUP_OMP, /* optinfo_flags */ + TV_NONE, /* tv_id */ + PROP_cfg, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */ +}; + +class pass_oacc_gimple_workers : public gimple_opt_pass +{ +public: + pass_oacc_gimple_workers (gcc::context *ctxt) + : gimple_opt_pass (pass_data_oacc_gimple_workers, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return flag_openacc && targetm.goacc.worker_partitioning; + }; + + virtual unsigned int execute (function *) + { + return execute_oacc_gimple_workers (); + } + +}; // class pass_oacc_gimple_workers + const pass_data pass_data_oacc_device_lower = { GIMPLE_PASS, /* type */ @@ -2022,6 +2131,18 @@ public: } // anon namespace +gimple_opt_pass * +make_pass_oacc_loop_designation (gcc::context *ctxt) +{ + return new pass_oacc_loop_designation (ctxt); +} + +gimple_opt_pass * +make_pass_oacc_gimple_workers (gcc::context *ctxt) +{ + return new pass_oacc_gimple_workers (ctxt); +} + gimple_opt_pass * make_pass_oacc_device_lower (gcc::context *ctxt) { diff --git a/gcc/omp-offload.h b/gcc/omp-offload.h index 21c9236b74f0..b441854585f7 100644 --- a/gcc/omp-offload.h +++ b/gcc/omp-offload.h @@ -29,6 +29,7 @@ extern int oacc_fn_attrib_level (tree attr); extern GTY(()) vec *offload_funcs; extern GTY(()) vec *offload_vars; +extern int oacc_fn_attrib_level (tree attr); extern void omp_finish_file (void); #endif /* GCC_OMP_DEVICE_H */ diff --git a/gcc/omp-sese.c b/gcc/omp-sese.c new file mode 100644 index 000000000000..cb2389eb55c7 --- /dev/null +++ b/gcc/omp-sese.c @@ -0,0 +1,2036 @@ +/* Find single-entry, single-exit regions for OpenACC. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "gimple.h" +#include "tree-pass.h" +#include "ssa.h" +#include "cgraph.h" +#include "pretty-print.h" +#include "fold-const.h" +#include "gimplify.h" +#include "gimple-iterator.h" +#include "gimple-walk.h" +#include "tree-inline.h" +#include "langhooks.h" +#include "omp-general.h" +#include "omp-low.h" +#include "omp-grid.h" +#include "gimple-pretty-print.h" +#include "cfghooks.h" +#include "insn-config.h" +#include "recog.h" +#include "internal-fn.h" +#include "bitmap.h" +#include "tree-nested.h" +#include "stor-layout.h" +#include "tree-ssa-threadupdate.h" +#include "tree-into-ssa.h" +#include "splay-tree.h" +#include "target.h" +#include "cfgloop.h" +#include "tree-cfg.h" +#include "omp-offload.h" +#include "attribs.h" + +/* Loop structure of the function. The entire function is described as + a NULL loop. */ + +struct parallel +{ + /* Parent parallel. */ + parallel *parent; + + /* Next sibling parallel. */ + parallel *next; + + /* First child parallel. */ + parallel *inner; + + /* Partitioning mask of the parallel. */ + unsigned mask; + + /* Partitioning used within inner parallels. */ + unsigned inner_mask; + + /* Location of parallel forked and join. The forked is the first + block in the parallel and the join is the first block after of + the partition. */ + basic_block forked_block; + basic_block join_block; + + gimple *forked_stmt; + gimple *join_stmt; + + gimple *fork_stmt; + gimple *joining_stmt; + + /* Basic blocks in this parallel, but not in child parallels. The + FORKED and JOINING blocks are in the partition. The FORK and JOIN + blocks are not. */ + auto_vec blocks; + + tree record_type; + tree sender_decl; + tree receiver_decl; + +public: + parallel (parallel *parent, unsigned mode); + ~parallel (); +}; + +/* Constructor links the new parallel into it's parent's chain of + children. */ + +parallel::parallel (parallel *parent_, unsigned mask_) + :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) +{ + forked_block = join_block = 0; + forked_stmt = join_stmt = NULL; + fork_stmt = joining_stmt = NULL; + + record_type = NULL_TREE; + sender_decl = NULL_TREE; + receiver_decl = NULL_TREE; + + if (parent) + { + next = parent->inner; + parent->inner = this; + } +} + +parallel::~parallel () +{ + delete inner; + delete next; +} + +static bool +local_var_based_p (tree decl) +{ + switch (TREE_CODE (decl)) + { + case VAR_DECL: + return !is_global_var (decl); + + case COMPONENT_REF: + case BIT_FIELD_REF: + case ARRAY_REF: + return local_var_based_p (TREE_OPERAND (decl, 0)); + + default: + return false; + } +} + +/* Map of basic blocks to gimple stmts. */ +typedef hash_map bb_stmt_map_t; + +/* Calls to OpenACC routines are made by all workers/wavefronts/warps, since + the routine likely contains partitioned loops (else will do its own + neutering and variable propagation). Return TRUE if a function call CALL + should be made in (worker) single mode instead, rather than redundant + mode. */ + +static bool +omp_sese_active_worker_call (gcall *call) +{ +#define GOMP_DIM_SEQ GOMP_DIM_MAX + tree fndecl = gimple_call_fndecl (call); + + if (!fndecl) + return true; + + tree attrs = oacc_get_fn_attrib (fndecl); + + if (!attrs) + return true; + + int level = oacc_fn_attrib_level (attrs); + + /* Neither regular functions nor "seq" routines should be run by all threads + in worker-single mode. */ + return level == -1 || level == GOMP_DIM_SEQ; +#undef GOMP_DIM_SEQ +} + +/* Split basic blocks such that each forked and join unspecs are at + the start of their basic blocks. Thus afterwards each block will + have a single partitioning mode. We also do the same for return + insns, as they are executed by every thread. Return the + partitioning mode of the function as a whole. Populate MAP with + head and tail blocks. We also clear the BB visited flag, which is + used when finding partitions. */ + +static void +omp_sese_split_blocks (bb_stmt_map_t *map) +{ + auto_vec worklist; + basic_block block; + + /* Locate all the reorg instructions of interest. */ + FOR_ALL_BB_FN (block, cfun) + { + /* Clear visited flag, for use by parallel locator */ + block->flags &= ~BB_VISITED; + + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); + gcall *call = as_a (stmt); + + if (k == IFN_UNIQUE_OACC_JOIN) + worklist.safe_push (stmt); + else if (k == IFN_UNIQUE_OACC_FORK) + { + gcc_assert (gsi_one_before_end_p (gsi)); + basic_block forked_block = single_succ (block); + gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block); + + /* We push a NOP as a placeholder for the "forked" stmt. + This is then recognized in omp_sese_find_par. */ + gimple *nop = gimple_build_nop (); + gsi_insert_before (&gsi2, nop, GSI_SAME_STMT); + + worklist.safe_push (nop); + } + } + else if (gimple_code (stmt) == GIMPLE_RETURN + || gimple_code (stmt) == GIMPLE_COND + || gimple_code (stmt) == GIMPLE_SWITCH + || (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt) + && !omp_sese_active_worker_call (as_a (stmt)))) + worklist.safe_push (stmt); + else if (is_gimple_assign (stmt)) + { + tree lhs = gimple_assign_lhs (stmt); + + /* Force assignments to components/fields/elements of local + aggregates into fully-partitioned (redundant) mode. This + avoids having to broadcast the whole aggregate. The RHS of + the assignment will be propagated using the normal + mechanism. */ + + switch (TREE_CODE (lhs)) + { + case COMPONENT_REF: + case BIT_FIELD_REF: + case ARRAY_REF: + { + tree aggr = TREE_OPERAND (lhs, 0); + + if (local_var_based_p (aggr)) + worklist.safe_push (stmt); + } + break; + + default: + ; + } + } + } + } + + /* Split blocks on the worklist. */ + unsigned ix; + gimple *stmt; + + for (ix = 0; worklist.iterate (ix, &stmt); ix++) + { + basic_block block = gimple_bb (stmt); + + if (gimple_code (stmt) == GIMPLE_COND) + { + gcond *orig_cond = as_a (stmt); + tree_code code = gimple_expr_code (orig_cond); + tree pred = make_ssa_name (boolean_type_node); + gimple *asgn = gimple_build_assign (pred, code, + gimple_cond_lhs (orig_cond), + gimple_cond_rhs (orig_cond)); + gcond *new_cond + = gimple_build_cond (NE_EXPR, pred, boolean_false_node, + gimple_cond_true_label (orig_cond), + gimple_cond_false_label (orig_cond)); + + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_insert_before (&gsi, asgn, GSI_SAME_STMT); + gsi_replace (&gsi, new_cond, true); + + edge e = split_block (block, asgn); + block = e->dest; + map->get_or_insert (block) = new_cond; + } + else if ((gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt)) + || is_gimple_assign (stmt)) + { + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_prev (&gsi); + + edge call = split_block (block, gsi_stmt (gsi)); + + gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest)); + + edge call_to_ret = split_block (call->dest, call_stmt); + + map->get_or_insert (call_to_ret->src) = call_stmt; + } + else + { + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_prev (&gsi); + + if (gsi_end_p (gsi)) + map->get_or_insert (block) = stmt; + else + { + /* Split block before insn. The insn is in the new block. */ + edge e = split_block (block, gsi_stmt (gsi)); + + block = e->dest; + map->get_or_insert (block) = stmt; + } + } + } +} + +static const char * +mask_name (unsigned mask) +{ + switch (mask) + { + case 0: return "gang redundant"; + case 1: return "gang partitioned"; + case 2: return "worker partitioned"; + case 3: return "gang+worker partitioned"; + case 4: return "vector partitioned"; + case 5: return "gang+vector partitioned"; + case 6: return "worker+vector partitioned"; + case 7: return "fully partitioned"; + default: return ""; + } +} + +/* Dump this parallel and all its inner parallels. */ + +static void +omp_sese_dump_pars (parallel *par, unsigned depth) +{ + fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n", + depth, par->mask, mask_name (par->mask), + par->forked_block ? par->forked_block->index : -1, + par->join_block ? par->join_block->index : -1); + + fprintf (dump_file, " blocks:"); + + basic_block block; + for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++) + fprintf (dump_file, " %d", block->index); + fprintf (dump_file, "\n"); + if (par->inner) + omp_sese_dump_pars (par->inner, depth + 1); + + if (par->next) + omp_sese_dump_pars (par->next, depth); +} + +/* If BLOCK contains a fork/join marker, process it to create or + terminate a loop structure. Add this block to the current loop, + and then walk successor blocks. */ + +static parallel * +omp_sese_find_par (bb_stmt_map_t *map, parallel *par, basic_block block) +{ + if (block->flags & BB_VISITED) + return par; + block->flags |= BB_VISITED; + + if (gimple **stmtp = map->get (block)) + { + gimple *stmt = *stmtp; + + if (gimple_code (stmt) == GIMPLE_COND + || gimple_code (stmt) == GIMPLE_SWITCH + || gimple_code (stmt) == GIMPLE_RETURN + || (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt)) + || is_gimple_assign (stmt)) + { + /* A single block that is forced to be at the maximum partition + level. Make a singleton par for it. */ + par = new parallel (par, GOMP_DIM_MASK (GOMP_DIM_GANG) + | GOMP_DIM_MASK (GOMP_DIM_WORKER) + | GOMP_DIM_MASK (GOMP_DIM_VECTOR)); + par->forked_block = block; + par->forked_stmt = stmt; + par->blocks.safe_push (block); + par = par->parent; + goto walk_successors; + } + else if (gimple_nop_p (stmt)) + { + basic_block pred = single_pred (block); + gcc_assert (pred); + gimple_stmt_iterator gsi = gsi_last_bb (pred); + gimple *final_stmt = gsi_stmt (gsi); + + if (gimple_call_internal_p (final_stmt, IFN_UNIQUE)) + { + gcall *call = as_a (final_stmt); + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + + if (k == IFN_UNIQUE_OACC_FORK) + { + HOST_WIDE_INT dim + = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; + + par = new parallel (par, mask); + par->forked_block = block; + par->forked_stmt = final_stmt; + par->fork_stmt = stmt; + } + else + gcc_unreachable (); + } + else + gcc_unreachable (); + } + else if (gimple_call_internal_p (stmt, IFN_UNIQUE)) + { + gcall *call = as_a (stmt); + enum ifn_unique_kind k = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + if (k == IFN_UNIQUE_OACC_JOIN) + { + HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2)); + unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0; + + gcc_assert (par->mask == mask); + par->join_block = block; + par->join_stmt = stmt; + par = par->parent; + } + else + gcc_unreachable (); + } + else + gcc_unreachable (); + } + + if (par) + /* Add this block onto the current loop's list of blocks. */ + par->blocks.safe_push (block); + else + /* This must be the entry block. Create a NULL parallel. */ + par = new parallel (0, 0); + +walk_successors: + /* Walk successor blocks. */ + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, block->succs) + omp_sese_find_par (map, par, e->dest); + + return par; +} + +/* DFS walk the CFG looking for fork & join markers. Construct + loop structures as we go. MAP is a mapping of basic blocks + to head & tail markers, discovered when splitting blocks. This + speeds up the discovery. We rely on the BB visited flag having + been cleared when splitting blocks. */ + +static parallel * +omp_sese_discover_pars (bb_stmt_map_t *map) +{ + basic_block block; + + /* Mark exit blocks as visited. */ + block = EXIT_BLOCK_PTR_FOR_FN (cfun); + block->flags |= BB_VISITED; + + /* And entry block as not. */ + block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + + parallel *par = omp_sese_find_par (map, 0, block); + + if (dump_file) + { + fprintf (dump_file, "\nLoops\n"); + omp_sese_dump_pars (par, 0); + fprintf (dump_file, "\n"); + } + + return par; +} + +static void +populate_single_mode_bitmaps (parallel *par, bitmap worker_single, + bitmap vector_single, unsigned outer_mask, + int depth) +{ + unsigned mask = outer_mask | par->mask; + + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + { + if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + bitmap_set_bit (worker_single, block->index); + + if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0) + bitmap_set_bit (vector_single, block->index); + } + + if (par->inner) + populate_single_mode_bitmaps (par->inner, worker_single, vector_single, + mask, depth + 1); + if (par->next) + populate_single_mode_bitmaps (par->next, worker_single, vector_single, + outer_mask, depth); +} + +/* A map from SSA names or var decls to record fields. */ + +typedef hash_map field_map_t; + +/* For each propagation record type, this is a map from SSA names or var decls + to propagate, to the field in the record type that should be used for + transmission and reception. */ + +typedef hash_map record_field_map_t; + +static GTY(()) record_field_map_t *field_map; + +static void +install_var_field (tree var, tree record_type) +{ + field_map_t *fields = *field_map->get (record_type); + tree name; + char tmp[20]; + + if (TREE_CODE (var) == SSA_NAME) + name = SSA_NAME_IDENTIFIER (var); + else if (TREE_CODE (var) == VAR_DECL) + name = DECL_NAME (var); + else + gcc_unreachable (); + + gcc_assert (!fields->get (var)); + + if (!name) + { + sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var)); + name = get_identifier (tmp); + } + + tree type = TREE_TYPE (var); + + if (POINTER_TYPE_P (type) + && TYPE_RESTRICT (type)) + type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT); + + tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type); + + if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var)) + { + SET_DECL_ALIGN (field, DECL_ALIGN (var)); + DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var); + TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var); + } + else + SET_DECL_ALIGN (field, TYPE_ALIGN (type)); + + fields->put (var, field); + + insert_field_into_struct (record_type, field); +} + +/* Sets of SSA_NAMES or VAR_DECLs to propagate. */ +typedef hash_set propagation_set; + +static void +find_ssa_names_to_propagate (parallel *par, unsigned outer_mask, + bitmap worker_single, bitmap vector_single, + vec *prop_set) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_ssa_names_to_propagate (par->inner, mask, worker_single, + vector_single, prop_set); + if (par->next) + find_ssa_names_to_propagate (par->next, outer_mask, worker_single, + vector_single, prop_set); + + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + { + for (gphi_iterator psi = gsi_start_phis (block); + !gsi_end_p (psi); gsi_next (&psi)) + { + gphi *phi = psi.phi (); + use_operand_p use; + ssa_op_iter iter; + + FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE) + { + tree var = USE_FROM_PTR (use); + + if (TREE_CODE (var) != SSA_NAME) + continue; + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + basic_block def_bb = gimple_bb (def_stmt); + + if (bitmap_bit_p (worker_single, def_bb->index)) + { + if (!(*prop_set)[def_bb->index]) + (*prop_set)[def_bb->index] = new propagation_set; + + propagation_set *ws_prop = (*prop_set)[def_bb->index]; + + ws_prop->add (var); + } + } + } + + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + use_operand_p use; + ssa_op_iter iter; + gimple *stmt = gsi_stmt (gsi); + + FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE) + { + tree var = USE_FROM_PTR (use); + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + basic_block def_bb = gimple_bb (def_stmt); + + if (bitmap_bit_p (worker_single, def_bb->index)) + { + if (!(*prop_set)[def_bb->index]) + (*prop_set)[def_bb->index] = new propagation_set; + + propagation_set *ws_prop = (*prop_set)[def_bb->index]; + + ws_prop->add (var); + } + } + } + } + } +} + +/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a + statement. */ + +static tree +find_partitioned_var_uses_1 (tree *node, int *, void *data) +{ + walk_stmt_info *wi = (walk_stmt_info *) data; + hash_set *partitioned_var_uses = (hash_set *) wi->info; + + if (!wi->is_lhs && VAR_P (*node)) + partitioned_var_uses->add (*node); + + return NULL_TREE; +} + +static void +find_partitioned_var_uses (parallel *par, unsigned outer_mask, + hash_set *partitioned_var_uses) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_partitioned_var_uses (par->inner, mask, partitioned_var_uses); + if (par->next) + find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses); + + if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + walk_stmt_info wi; + memset (&wi, 0, sizeof (wi)); + wi.info = (void *) partitioned_var_uses; + walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi); + } + } +} + +static void +find_local_vars_to_propagate (parallel *par, unsigned outer_mask, + hash_set *partitioned_var_uses, + vec *prop_set) +{ + unsigned mask = outer_mask | par->mask; + + if (par->inner) + find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses, + prop_set); + if (par->next) + find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses, + prop_set); + + if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))) + { + basic_block block; + int ix; + + for (ix = 0; par->blocks.iterate (ix, &block); ix++) + { + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + tree var; + unsigned i; + + FOR_EACH_LOCAL_DECL (cfun, i, var) + { + if (!VAR_P (var) + || is_global_var (var) + || AGGREGATE_TYPE_P (TREE_TYPE (var)) + || !partitioned_var_uses->contains (var) + || lookup_attribute ("oacc gangprivate", + DECL_ATTRIBUTES (var))) + continue; + + if (stmt_may_clobber_ref_p (stmt, var)) + { + if (dump_file) + { + fprintf (dump_file, "bb %u: local variable may be " + "clobbered in %s mode: ", block->index, + mask_name (mask)); + print_generic_expr (dump_file, var, TDF_SLIM); + fprintf (dump_file, "\n"); + } + + if (!(*prop_set)[block->index]) + (*prop_set)[block->index] = new propagation_set; + + propagation_set *ws_prop + = (*prop_set)[block->index]; + + ws_prop->add (var); + } + } + } + } + } +} + +/* Transform basic blocks FROM, TO (which may be the same block) into: + if (GOACC_single_start ()) + BLOCK; + GOACC_barrier (); + \ | / + +----+ + | | (new) predicate block + +----+-- + \ | / \ | / |t \ + +----+ +----+ +----+ | + | | | | ===> | | | f (old) from block + +----+ +----+ +----+ | + | t/ \f | / + +----+/ + (split (split before | | skip block + at end) condition) +----+ + t/ \f +*/ + +static void +worker_single_simple (basic_block from, basic_block to, + hash_set *def_escapes_block) +{ + gimple *call, *cond; + tree lhs, decl; + basic_block skip_block; + + gimple_stmt_iterator gsi = gsi_last_bb (to); + if (EDGE_COUNT (to->succs) > 1) + { + gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND); + gsi_prev (&gsi); + } + edge e = split_block (to, gsi_stmt (gsi)); + skip_block = e->dest; + + gimple_stmt_iterator start = gsi_after_labels (from); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START); + lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); + call = gimple_build_call (decl, 0); + gimple_call_set_lhs (call, lhs); + gsi_insert_before (&start, call, GSI_NEW_STMT); + update_stmt (call); + + cond = gimple_build_cond (EQ_EXPR, lhs, + fold_convert_loc (UNKNOWN_LOCATION, + TREE_TYPE (lhs), + boolean_true_node), + NULL_TREE, NULL_TREE); + gsi_insert_after (&start, cond, GSI_NEW_STMT); + update_stmt (cond); + + edge et = split_block (from, cond); + et->flags &= ~EDGE_FALLTHRU; + et->flags |= EDGE_TRUE_VALUE; + /* Make the active worker the more probable path so we prefer fallthrough + (letting the idle workers jump around more). */ + et->probability = profile_probability::likely (); + + edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE); + ef->probability = et->probability.invert (); + + basic_block neutered = split_edge (ef); + gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered); + + for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + ssa_op_iter iter; + tree var; + + FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF) + { + if (def_escapes_block->contains (var)) + { + gphi *join_phi = create_phi_node (NULL_TREE, skip_block); + tree res = create_new_def_for (var, join_phi, + gimple_phi_result_ptr (join_phi)); + add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION); + + tree neutered_def = copy_ssa_name (var, NULL); + /* We really want "don't care" or some value representing + undefined here, but optimizers will probably get rid of the + zero-assignments anyway. */ + gassign *zero = gimple_build_assign (neutered_def, + build_zero_cst (TREE_TYPE (neutered_def))); + + gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING); + update_stmt (zero); + + add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered), + UNKNOWN_LOCATION); + update_stmt (join_phi); + } + } + } + + gsi = gsi_start_bb (skip_block); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + + gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT); + update_stmt (acc_bar); +} + +/* This is a copied and renamed omp-low.c:omp_build_component_ref. */ + +static tree +oacc_build_component_ref (tree obj, tree field) +{ + tree ret = build3 (COMPONENT_REF, TREE_TYPE (field), obj, field, NULL); + if (TREE_THIS_VOLATILE (field)) + TREE_THIS_VOLATILE (ret) |= 1; + if (TREE_READONLY (field)) + TREE_READONLY (ret) |= 1; + return ret; +} + +static tree +build_receiver_ref (tree record_type, tree var, tree receiver_decl) +{ + field_map_t *fields = *field_map->get (record_type); + tree x = build_simple_mem_ref (receiver_decl); + tree field = *fields->get (var); + TREE_THIS_NOTRAP (x) = 1; + x = oacc_build_component_ref (x, field); + return x; +} + +static tree +build_sender_ref (tree record_type, tree var, tree sender_decl) +{ + field_map_t *fields = *field_map->get (record_type); + tree field = *fields->get (var); + return oacc_build_component_ref (sender_decl, field); +} + +static int +sort_by_ssa_version_or_uid (const void *p1, const void *p2) +{ + const tree t1 = *(const tree *)p1; + const tree t2 = *(const tree *)p2; + + if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME) + return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2); + else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME) + return -1; + else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME) + return 1; + else + return DECL_UID (t1) - DECL_UID (t2); +} + +static int +sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2) +{ + const tree t1 = *(const tree *)p1; + const tree t2 = *(const tree *)p2; + unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1))); + unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2))); + if (s1 != s2) + return s2 - s1; + else + return sort_by_ssa_version_or_uid (p1, p2); +} + +static void +worker_single_copy (basic_block from, basic_block to, + hash_set *def_escapes_block, + hash_set *worker_partitioned_uses, + tree record_type) +{ + /* If we only have virtual defs, we'll have no record type, but we still want + to emit single_copy_start and (particularly) single_copy_end to act as + a vdef source on the neutered edge representing memory writes on the + non-neutered edge. */ + if (!record_type) + record_type = char_type_node; + + tree sender_decl + = targetm.goacc.create_propagation_record (record_type, true, + ".oacc_worker_o"); + tree receiver_decl + = targetm.goacc.create_propagation_record (record_type, false, + ".oacc_worker_i"); + + gimple_stmt_iterator gsi = gsi_last_bb (to); + if (EDGE_COUNT (to->succs) > 1) + gsi_prev (&gsi); + edge e = split_block (to, gsi_stmt (gsi)); + basic_block barrier_block = e->dest; + + gimple_stmt_iterator start = gsi_after_labels (from); + + tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START); + + tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); + + gimple *call = gimple_build_call (decl, 1, + build_fold_addr_expr (sender_decl)); + gimple_call_set_lhs (call, lhs); + gsi_insert_before (&start, call, GSI_NEW_STMT); + update_stmt (call); + + tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); + + gimple *conv = gimple_build_assign (conv_tmp, + fold_convert (TREE_TYPE (receiver_decl), + lhs)); + update_stmt (conv); + gsi_insert_after (&start, conv, GSI_NEW_STMT); + gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp); + gsi_insert_after (&start, asgn, GSI_NEW_STMT); + update_stmt (asgn); + + tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0); + + tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); + asgn = gimple_build_assign (recv_tmp, receiver_decl); + gsi_insert_after (&start, asgn, GSI_NEW_STMT); + update_stmt (asgn); + + gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE, + NULL_TREE); + update_stmt (cond); + + gsi_insert_after (&start, cond, GSI_NEW_STMT); + + edge et = split_block (from, cond); + et->flags &= ~EDGE_FALLTHRU; + et->flags |= EDGE_TRUE_VALUE; + /* Make the active worker the more probable path so we prefer fallthrough + (letting the idle workers jump around more). */ + et->probability = profile_probability::likely (); + + basic_block body = et->dest; + + edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE); + ef->probability = et->probability.invert (); + + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + + gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block); + gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT); + + cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE); + gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT); + + edge et2 = split_block (barrier_block, cond); + et2->flags &= ~EDGE_FALLTHRU; + et2->flags |= EDGE_TRUE_VALUE; + et2->probability = profile_probability::unlikely (); + + basic_block exit_block = et2->dest; + + basic_block copyout_block = split_edge (et2); + edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE); + ef2->probability = et2->probability.invert (); + + gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block); + + edge copyout_to_exit = single_succ_edge (copyout_block); + + gimple_seq sender_seq = NULL; + + /* Make sure we iterate over definitions in a stable order. */ + auto_vec escape_vec (def_escapes_block->elements ()); + for (hash_set::iterator it = def_escapes_block->begin (); + it != def_escapes_block->end (); ++it) + escape_vec.quick_push (*it); + escape_vec.qsort (sort_by_ssa_version_or_uid); + + for (unsigned i = 0; i < escape_vec.length (); i++) + { + tree var = escape_vec[i]; + + if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var)) + continue; + + tree barrier_def = 0; + + if (TREE_CODE (var) == SSA_NAME) + { + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + /* The barrier phi takes one result from the actual work of the + block we're neutering, and the other result is constant zero of + the same type. */ + + gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block); + barrier_def = create_new_def_for (var, barrier_phi, + gimple_phi_result_ptr (barrier_phi)); + + add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION); + add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef, + UNKNOWN_LOCATION); + + update_stmt (barrier_phi); + } + else + gcc_assert (TREE_CODE (var) == VAR_DECL); + + /* If we had no record type, we will have no fields map. */ + field_map_t **fields_p = field_map->get (record_type); + field_map_t *fields = fields_p ? *fields_p : NULL; + + if (worker_partitioned_uses->contains (var) + && fields + && fields->get (var)) + { + tree neutered_def = make_ssa_name (TREE_TYPE (var)); + + /* Receive definition from shared memory block. */ + + tree receiver_ref = build_receiver_ref (record_type, var, + receiver_decl); + gassign *recv = gimple_build_assign (neutered_def, + receiver_ref); + gsi_insert_after (©out_gsi, recv, GSI_CONTINUE_LINKING); + update_stmt (recv); + + if (TREE_CODE (var) == VAR_DECL) + { + /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy + to the final location now. */ + gassign *asgn = gimple_build_assign (var, neutered_def); + gsi_insert_after (©out_gsi, asgn, GSI_CONTINUE_LINKING); + update_stmt (asgn); + } + else + { + /* If it's an SSA name, create a new phi at the join node to + represent either the output from the active worker (the + barrier) or the inactive workers (the copyout block). */ + gphi *join_phi = create_phi_node (NULL_TREE, exit_block); + create_new_def_for (barrier_def, join_phi, + gimple_phi_result_ptr (join_phi)); + add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION); + add_phi_arg (join_phi, neutered_def, copyout_to_exit, + UNKNOWN_LOCATION); + update_stmt (join_phi); + } + + /* Send definition to shared memory block. */ + + tree sender_ref = build_sender_ref (record_type, var, sender_decl); + + if (TREE_CODE (var) == SSA_NAME) + { + gassign *send = gimple_build_assign (sender_ref, var); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + } + else if (TREE_CODE (var) == VAR_DECL) + { + tree tmp = make_ssa_name (TREE_TYPE (var)); + gassign *send = gimple_build_assign (tmp, var); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + send = gimple_build_assign (sender_ref, tmp); + gimple_seq_add_stmt (&sender_seq, send); + update_stmt (send); + } + else + gcc_unreachable (); + } + } + + /* It's possible for the ET->DEST block (the work done by the active thread) + to finish with a control-flow insn, e.g. a UNIQUE function call. Split + the block and add SENDER_SEQ in the latter part to avoid having control + flow in the middle of a BB. */ + + decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END); + call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl)); + gimple_seq_add_stmt (&sender_seq, call); + + gsi = gsi_last_bb (body); + gimple *last = gsi_stmt (gsi); + basic_block sender_block = split_block (body, last)->dest; + gsi = gsi_last_bb (sender_block); + gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING); +} + +static void +neuter_worker_single (parallel *par, unsigned outer_mask, bitmap worker_single, + bitmap vector_single, vec *prop_set, + hash_set *partitioned_var_uses) +{ + unsigned mask = outer_mask | par->mask; + + if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + { + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + { + bool has_defs = false; + hash_set def_escapes_block; + hash_set worker_partitioned_uses; + unsigned j; + tree var; + + FOR_EACH_SSA_NAME (j, var, cfun) + { + if (SSA_NAME_IS_VIRTUAL_OPERAND (var)) + { + has_defs = true; + continue; + } + + gimple *def_stmt = SSA_NAME_DEF_STMT (var); + + if (gimple_nop_p (def_stmt)) + continue; + + if (gimple_bb (def_stmt)->index != block->index) + continue; + + gimple *use_stmt; + imm_use_iterator use_iter; + bool uses_outside_block = false; + bool worker_partitioned_use = false; + + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var) + { + int blocknum = gimple_bb (use_stmt)->index; + + /* Don't propagate SSA names that are only used in the + current block, unless the usage is in a phi node: that + means the name left the block, then came back in at the + top. */ + if (blocknum != block->index + || gimple_code (use_stmt) == GIMPLE_PHI) + uses_outside_block = true; + if (!bitmap_bit_p (worker_single, blocknum)) + worker_partitioned_use = true; + } + + if (uses_outside_block) + def_escapes_block.add (var); + + if (worker_partitioned_use) + { + worker_partitioned_uses.add (var); + has_defs = true; + } + } + + propagation_set *ws_prop = (*prop_set)[block->index]; + + if (ws_prop) + { + for (propagation_set::iterator it = ws_prop->begin (); + it != ws_prop->end (); + ++it) + { + tree var = *it; + if (TREE_CODE (var) == VAR_DECL) + { + def_escapes_block.add (var); + if (partitioned_var_uses->contains (var)) + { + worker_partitioned_uses.add (var); + has_defs = true; + } + } + } + + delete ws_prop; + (*prop_set)[block->index] = 0; + } + + tree record_type = (tree) block->aux; + + if (has_defs) + worker_single_copy (block, block, &def_escapes_block, + &worker_partitioned_uses, record_type); + else + worker_single_simple (block, block, &def_escapes_block); + } + } + + if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0) + { + basic_block block; + + for (unsigned i = 0; par->blocks.iterate (i, &block); i++) + for (gimple_stmt_iterator gsi = gsi_start_bb (block); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_code (stmt) == GIMPLE_CALL + && !gimple_call_internal_p (stmt) + && !omp_sese_active_worker_call (as_a (stmt))) + { + /* If we have an OpenACC routine call in worker-single mode, + place barriers before and afterwards to prevent + clobbering re-used shared memory regions (as are used + for AMDGCN at present, for example). */ + tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gsi_insert_before (&gsi, gimple_build_call (decl, 0), + GSI_SAME_STMT); + gsi_insert_after (&gsi, gimple_build_call (decl, 0), + GSI_NEW_STMT); + } + } + } + + if (par->inner) + neuter_worker_single (par->inner, mask, worker_single, vector_single, + prop_set, partitioned_var_uses); + if (par->next) + neuter_worker_single (par->next, outer_mask, worker_single, vector_single, + prop_set, partitioned_var_uses); +} + + +void +oacc_do_neutering (void) +{ + bb_stmt_map_t bb_stmt_map; + auto_bitmap worker_single, vector_single; + + omp_sese_split_blocks (&bb_stmt_map); + + if (dump_file) + { + fprintf (dump_file, "\n\nAfter splitting:\n\n"); + dump_function_to_file (current_function_decl, dump_file, dump_flags); + } + + unsigned mask = 0; + + /* If this is a routine, calculate MASK as if the outer levels are already + partitioned. */ + tree attr = oacc_get_fn_attrib (current_function_decl); + if (attr) + { + tree dims = TREE_VALUE (attr); + unsigned ix; + for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) + { + tree allowed = TREE_PURPOSE (dims); + if (allowed && integer_zerop (allowed)) + mask |= GOMP_DIM_MASK (ix); + } + } + + parallel *par = omp_sese_discover_pars (&bb_stmt_map); + populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0); + + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + bb->aux = NULL; + + field_map = record_field_map_t::create_ggc (40); + + vec prop_set; + prop_set.create (last_basic_block_for_fn (cfun)); + + for (unsigned i = 0; i < last_basic_block_for_fn (cfun); i++) + prop_set.quick_push (0); + + find_ssa_names_to_propagate (par, mask, worker_single, vector_single, + &prop_set); + + hash_set partitioned_var_uses; + + find_partitioned_var_uses (par, mask, &partitioned_var_uses); + find_local_vars_to_propagate (par, mask, &partitioned_var_uses, &prop_set); + + FOR_ALL_BB_FN (bb, cfun) + { + propagation_set *ws_prop = prop_set[bb->index]; + if (ws_prop) + { + tree record_type = lang_hooks.types.make_type (RECORD_TYPE); + tree name = create_tmp_var_name (".oacc_ws_data_s"); + name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type); + DECL_ARTIFICIAL (name) = 1; + DECL_NAMELESS (name) = 1; + TYPE_NAME (record_type) = name; + TYPE_ARTIFICIAL (record_type) = 1; + + auto_vec field_vec (ws_prop->elements ()); + for (hash_set::iterator it = ws_prop->begin (); + it != ws_prop->end (); ++it) + field_vec.quick_push (*it); + + field_vec.qsort (sort_by_size_then_ssa_version_or_uid); + + field_map->put (record_type, field_map_t::create_ggc (17)); + + /* Insert var fields in reverse order, so the last inserted element + is the first in the structure. */ + for (int i = field_vec.length () - 1; i >= 0; i--) + install_var_field (field_vec[i], record_type); + + layout_type (record_type); + + bb->aux = (tree) record_type; + } + } + + neuter_worker_single (par, mask, worker_single, vector_single, &prop_set, + &partitioned_var_uses); + + prop_set.release (); + + /* This doesn't seem to make a difference. */ + loops_state_clear (LOOP_CLOSED_SSA); + + /* Neutering worker-single neutered blocks will invalidate dominance info. + It may be possible to incrementally update just the affected blocks, but + obliterate everything for now. */ + free_dominance_info (CDI_DOMINATORS); + free_dominance_info (CDI_POST_DOMINATORS); + + if (dump_file) + { + fprintf (dump_file, "\n\nAfter neutering:\n\n"); + dump_function_to_file (current_function_decl, dump_file, dump_flags); + } +} + +/* Analyse a group of BBs within a partitioned region and create N + Single-Entry-Single-Exit regions. Some of those regions will be + trivial ones consisting of a single BB. The blocks of a + partitioned region might form a set of disjoint graphs -- because + the region encloses a differently partitoned sub region. + + We use the linear time algorithm described in 'Finding Regions Fast: + Single Entry Single Exit and control Regions in Linear Time' + Johnson, Pearson & Pingali. That algorithm deals with complete + CFGs, where a back edge is inserted from END to START, and thus the + problem becomes one of finding equivalent loops. + + In this case we have a partial CFG. We complete it by redirecting + any incoming edge to the graph to be from an arbitrary external BB, + and similarly redirecting any outgoing edge to be to that BB. + Thus we end up with a closed graph. + + The algorithm works by building a spanning tree of an undirected + graph and keeping track of back edges from nodes further from the + root in the tree to nodes nearer to the root in the tree. In the + description below, the root is up and the tree grows downwards. + + We avoid having to deal with degenerate back-edges to the same + block, by splitting each BB into 3 -- one for input edges, one for + the node itself and one for the output edges. Such back edges are + referred to as 'Brackets'. Cycle equivalent nodes will have the + same set of brackets. + + Determining bracket equivalency is done by maintaining a list of + brackets in such a manner that the list length and final bracket + uniquely identify the set. + + We use coloring to mark all BBs with cycle equivalency with the + same color. This is the output of the 'Finding Regions Fast' + algorithm. Notice it doesn't actually find the set of nodes within + a particular region, just unorderd sets of nodes that are the + entries and exits of SESE regions. + + After determining cycle equivalency, we need to find the minimal + set of SESE regions. Do this with a DFS coloring walk of the + complete graph. We're either 'looking' or 'coloring'. When + looking, and we're in the subgraph, we start coloring the color of + the current node, and remember that node as the start of the + current color's SESE region. Every time we go to a new node, we + decrement the count of nodes with thet color. If it reaches zero, + we remember that node as the end of the current color's SESE region + and return to 'looking'. Otherwise we color the node the current + color. + + This way we end up with coloring the inside of non-trivial SESE + regions with the color of that region. */ + +/* A pair of BBs. We use this to represent SESE regions. */ +typedef std::pair bb_pair_t; +typedef auto_vec bb_pair_vec_t; + +/* A node in the undirected CFG. The discriminator SECOND indicates just + above or just below the BB idicated by FIRST. */ +typedef std::pair pseudo_node_t; + +/* A bracket indicates an edge towards the root of the spanning tree of the + undirected graph. Each bracket has a color, determined + from the currrent set of brackets. */ +struct bracket +{ + pseudo_node_t back; /* Back target */ + + /* Current color and size of set. */ + unsigned color; + unsigned size; + + bracket (pseudo_node_t back_) + : back (back_), color (~0u), size (~0u) + { + } + + unsigned get_color (auto_vec &color_counts, unsigned length) + { + if (length != size) + { + size = length; + color = color_counts.length (); + color_counts.quick_push (0); + } + color_counts[color]++; + return color; + } +}; + +typedef auto_vec bracket_vec_t; + +/* Basic block info for finding SESE regions. */ + +struct bb_sese +{ + int node; /* Node number in spanning tree. */ + int parent; /* Parent node number. */ + + /* The algorithm splits each node A into Ai, A', Ao. The incoming + edges arrive at pseudo-node Ai and the outgoing edges leave at + pseudo-node Ao. We have to remember which way we arrived at a + particular node when generating the spanning tree. dir > 0 means + we arrived at Ai, dir < 0 means we arrived at Ao. */ + int dir; + + /* Lowest numbered pseudo-node reached via a backedge from thsis + node, or any descendant. */ + pseudo_node_t high; + + int color; /* Cycle-equivalence color */ + + /* Stack of brackets for this node. */ + bracket_vec_t brackets; + + bb_sese (unsigned node_, unsigned p, int dir_) + :node (node_), parent (p), dir (dir_) + { + } + ~bb_sese (); + + /* Push a bracket ending at BACK. */ + void push (const pseudo_node_t &back) + { + if (dump_file) + fprintf (dump_file, "Pushing backedge %d:%+d\n", + back.first ? back.first->index : 0, back.second); + brackets.safe_push (bracket (back)); + } + + void append (bb_sese *child); + void remove (const pseudo_node_t &); + + /* Set node's color. */ + void set_color (auto_vec &color_counts) + { + color = brackets.last ().get_color (color_counts, brackets.length ()); + } +}; + +bb_sese::~bb_sese () +{ +} + +/* Destructively append CHILD's brackets. */ + +void +bb_sese::append (bb_sese *child) +{ + if (int len = child->brackets.length ()) + { + int ix; + + if (dump_file) + { + for (ix = 0; ix < len; ix++) + { + const pseudo_node_t &pseudo = child->brackets[ix].back; + fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n", + child->node, pseudo.first ? pseudo.first->index : 0, + pseudo.second); + } + } + if (!brackets.length ()) + std::swap (brackets, child->brackets); + else + { + brackets.reserve (len); + for (ix = 0; ix < len; ix++) + brackets.quick_push (child->brackets[ix]); + } + } +} + +/* Remove brackets that terminate at PSEUDO. */ + +void +bb_sese::remove (const pseudo_node_t &pseudo) +{ + unsigned removed = 0; + int len = brackets.length (); + + for (int ix = 0; ix < len; ix++) + { + if (brackets[ix].back == pseudo) + { + if (dump_file) + fprintf (dump_file, "Removing backedge %d:%+d\n", + pseudo.first ? pseudo.first->index : 0, pseudo.second); + removed++; + } + else if (removed) + brackets[ix-removed] = brackets[ix]; + } + while (removed--) + brackets.pop (); +} + +/* Accessors for BB's aux pointer. */ +#define BB_SET_SESE(B, S) ((B)->aux = (S)) +#define BB_GET_SESE(B) ((bb_sese *)(B)->aux) + +/* DFS walk creating SESE data structures. Only cover nodes with + BB_VISITED set. Append discovered blocks to LIST. We number in + increments of 3 so that the above and below pseudo nodes can be + implicitly numbered too. */ + +static int +omp_sese_number (int n, int p, int dir, basic_block b, + auto_vec *list) +{ + if (BB_GET_SESE (b)) + return n; + + if (dump_file) + fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n", + b->index, n, p, dir); + + BB_SET_SESE (b, new bb_sese (n, p, dir)); + p = n; + + n += 3; + list->quick_push (b); + + /* First walk the nodes on the 'other side' of this node, then walk + the nodes on the same side. */ + for (unsigned ix = 2; ix; ix--) + { + vec *edges = dir > 0 ? b->succs : b->preds; + size_t offset = (dir > 0 ? offsetof (edge_def, dest) + : offsetof (edge_def, src)); + edge e; + edge_iterator (ei); + + FOR_EACH_EDGE (e, ei, edges) + { + basic_block target = *(basic_block *)((char *)e + offset); + + if (target->flags & BB_VISITED) + n = omp_sese_number (n, p, dir, target, list); + } + dir = -dir; + } + return n; +} + +/* Process pseudo node above (DIR < 0) or below (DIR > 0) ME. + EDGES are the outgoing edges and OFFSET is the offset to the src + or dst block on the edges. */ + +static void +omp_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, + vec *edges, size_t offset) +{ + edge e; + edge_iterator (ei); + int hi_back = depth; + pseudo_node_t node_back (0, depth); + int hi_child = depth; + pseudo_node_t node_child (0, depth); + basic_block child = NULL; + unsigned num_children = 0; + int usd = -dir * sese->dir; + + if (dump_file) + fprintf (dump_file, "\nProcessing %d(%d) %+d\n", + me->index, sese->node, dir); + + if (dir < 0) + { + /* This is the above pseudo-child. It has the BB itself as an + additional child node. */ + node_child = sese->high; + hi_child = node_child.second; + if (node_child.first) + hi_child += BB_GET_SESE (node_child.first)->node; + num_children++; + } + + /* Examine each edge. + - if it is a child (a) append its bracket list and (b) record + whether it is the child with the highest reaching bracket. + - if it is an edge to ancestor, record whether it's the highest + reaching backlink. */ + FOR_EACH_EDGE (e, ei, edges) + { + basic_block target = *(basic_block *)((char *)e + offset); + + if (bb_sese *t_sese = BB_GET_SESE (target)) + { + if (t_sese->parent == sese->node && !(t_sese->dir + usd)) + { + /* Child node. Append its bracket list. */ + num_children++; + sese->append (t_sese); + + /* Compare it's hi value. */ + int t_hi = t_sese->high.second; + + if (basic_block child_hi_block = t_sese->high.first) + t_hi += BB_GET_SESE (child_hi_block)->node; + + if (hi_child > t_hi) + { + hi_child = t_hi; + node_child = t_sese->high; + child = target; + } + } + else if (t_sese->node < sese->node + dir + && !(dir < 0 && sese->parent == t_sese->node)) + { + /* Non-parental ancestor node -- a backlink. */ + int d = usd * t_sese->dir; + int back = t_sese->node + d; + + if (hi_back > back) + { + hi_back = back; + node_back = pseudo_node_t (target, d); + } + } + } + else + { /* Fallen off graph, backlink to entry node. */ + hi_back = 0; + node_back = pseudo_node_t (0, 0); + } + } + + /* Remove any brackets that terminate at this pseudo node. */ + sese->remove (pseudo_node_t (me, dir)); + + /* Now push any backlinks from this pseudo node. */ + FOR_EACH_EDGE (e, ei, edges) + { + basic_block target = *(basic_block *)((char *)e + offset); + if (bb_sese *t_sese = BB_GET_SESE (target)) + { + if (t_sese->node < sese->node + dir + && !(dir < 0 && sese->parent == t_sese->node)) + /* Non-parental ancestor node - backedge from me. */ + sese->push (pseudo_node_t (target, usd * t_sese->dir)); + } + else + { + /* back edge to entry node */ + sese->push (pseudo_node_t (0, 0)); + } + } + + /* If this node leads directly or indirectly to a no-return region of + the graph, then fake a backedge to entry node. */ + if (!sese->brackets.length () || !edges || !edges->length ()) + { + hi_back = 0; + node_back = pseudo_node_t (0, 0); + sese->push (node_back); + } + + /* Record the highest reaching backedge from us or a descendant. */ + sese->high = hi_back < hi_child ? node_back : node_child; + + if (num_children > 1) + { + /* There is more than one child -- this is a Y shaped piece of + spanning tree. We have to insert a fake backedge from this + node to the highest ancestor reached by not-the-highest + reaching child. Note that there may be multiple children + with backedges to the same highest node. That's ok and we + insert the edge to that highest node. */ + hi_child = depth; + if (dir < 0 && child) + { + node_child = sese->high; + hi_child = node_child.second; + if (node_child.first) + hi_child += BB_GET_SESE (node_child.first)->node; + } + + FOR_EACH_EDGE (e, ei, edges) + { + basic_block target = *(basic_block *)((char *)e + offset); + + if (target == child) + /* Ignore the highest child. */ + continue; + + bb_sese *t_sese = BB_GET_SESE (target); + if (!t_sese) + continue; + if (t_sese->parent != sese->node) + /* Not a child. */ + continue; + + /* Compare its hi value. */ + int t_hi = t_sese->high.second; + + if (basic_block child_hi_block = t_sese->high.first) + t_hi += BB_GET_SESE (child_hi_block)->node; + + if (hi_child > t_hi) + { + hi_child = t_hi; + node_child = t_sese->high; + } + } + + sese->push (node_child); + } +} + + +/* DFS walk of BB graph. Color node BLOCK according to COLORING then + proceed to successors. Set SESE entry and exit nodes of + REGIONS. */ + +static void +omp_sese_color (auto_vec &color_counts, bb_pair_vec_t ®ions, + basic_block block, int coloring) +{ + bb_sese *sese = BB_GET_SESE (block); + + if (block->flags & BB_VISITED) + { + /* If we've already encountered this block, either we must not + be coloring, or it must have been colored the current color. */ + gcc_assert (coloring < 0 || (sese && coloring == sese->color)); + return; + } + + block->flags |= BB_VISITED; + + if (sese) + { + if (coloring < 0) + { + /* Start coloring a region. */ + regions[sese->color].first = block; + coloring = sese->color; + } + + if (!--color_counts[sese->color] && sese->color == coloring) + { + /* Found final block of SESE region. */ + regions[sese->color].second = block; + coloring = -1; + } + else + /* Color the node, so we can assert on revisiting the node + that the graph is indeed SESE. */ + sese->color = coloring; + } + else + /* Fallen off the subgraph, we cannot be coloring. */ + gcc_assert (coloring < 0); + + /* Walk each successor block. */ + if (block->succs && block->succs->length ()) + { + edge e; + edge_iterator ei; + + FOR_EACH_EDGE (e, ei, block->succs) + omp_sese_color (color_counts, regions, e->dest, coloring); + } + else + gcc_assert (coloring < 0); +} + +/* Find minimal set of SESE regions covering BLOCKS. REGIONS might + end up with NULL entries in it. */ + +static void +omp_find_sese (auto_vec &blocks, bb_pair_vec_t ®ions) +{ + basic_block block; + int ix; + + /* First clear each BB of the whole function. */ + FOR_EACH_BB_FN (block, cfun) + { + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); + } + block = EXIT_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); + block = ENTRY_BLOCK_PTR_FOR_FN (cfun); + block->flags &= ~BB_VISITED; + BB_SET_SESE (block, 0); + + /* Mark blocks in the function that are in this graph. */ + for (ix = 0; blocks.iterate (ix, &block); ix++) + block->flags |= BB_VISITED; + + /* Counts of nodes assigned to each color. There cannot be more + colors than blocks (and hopefully there will be fewer). */ + auto_vec color_counts; + color_counts.reserve (blocks.length ()); + + /* Worklist of nodes in the spanning tree. Again, there cannot be + more nodes in the tree than blocks (there will be fewer if the + CFG of blocks is disjoint). */ + auto_vec spanlist; + spanlist.reserve (blocks.length ()); + + /* Make sure every block has its cycle class determined. */ + for (ix = 0; blocks.iterate (ix, &block); ix++) + { + if (BB_GET_SESE (block)) + /* We already met this block in an earlier graph solve. */ + continue; + + if (dump_file) + fprintf (dump_file, "Searching graph starting at %d\n", block->index); + + /* Number the nodes reachable from block initial DFS order. */ + int depth = omp_sese_number (2, 0, +1, block, &spanlist); + + /* Now walk in reverse DFS order to find cycle equivalents. */ + while (spanlist.length ()) + { + block = spanlist.pop (); + bb_sese *sese = BB_GET_SESE (block); + + /* Do the pseudo node below. */ + omp_sese_pseudo (block, sese, depth, +1, + sese->dir > 0 ? block->succs : block->preds, + (sese->dir > 0 ? offsetof (edge_def, dest) + : offsetof (edge_def, src))); + sese->set_color (color_counts); + /* Do the pseudo node above. */ + omp_sese_pseudo (block, sese, depth, -1, + sese->dir < 0 ? block->succs : block->preds, + (sese->dir < 0 ? offsetof (edge_def, dest) + : offsetof (edge_def, src))); + } + if (dump_file) + fprintf (dump_file, "\n"); + } + + if (dump_file) + { + unsigned count; + const char *comma = ""; + + fprintf (dump_file, "Found %d cycle equivalents\n", + color_counts.length ()); + for (ix = 0; color_counts.iterate (ix, &count); ix++) + { + fprintf (dump_file, "%s%d[%d]={", comma, ix, count); + + comma = ""; + for (unsigned jx = 0; blocks.iterate (jx, &block); jx++) + if (BB_GET_SESE (block)->color == ix) + { + block->flags |= BB_VISITED; + fprintf (dump_file, "%s%d", comma, block->index); + comma=","; + } + fprintf (dump_file, "}"); + comma = ", "; + } + fprintf (dump_file, "\n"); + } + + /* Now we've colored every block in the subgraph. We now need to + determine the minimal set of SESE regions that cover that + subgraph. Do this with a DFS walk of the complete function. + During the walk we're either 'looking' or 'coloring'. When we + reach the last node of a particular color, we stop coloring and + return to looking. */ + + /* There cannot be more SESE regions than colors. */ + regions.reserve (color_counts.length ()); + for (ix = color_counts.length (); ix--;) + regions.quick_push (bb_pair_t (0, 0)); + + for (ix = 0; blocks.iterate (ix, &block); ix++) + block->flags &= ~BB_VISITED; + + omp_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1); + + if (dump_file) + { + const char *comma = ""; + int len = regions.length (); + + fprintf (dump_file, "SESE regions:"); + for (ix = 0; ix != len; ix++) + { + basic_block from = regions[ix].first; + basic_block to = regions[ix].second; + + if (from) + { + fprintf (dump_file, "%s %d{%d", comma, ix, from->index); + if (to != from) + fprintf (dump_file, "->%d", to->index); + + int color = BB_GET_SESE (from)->color; + + /* Print the blocks within the region (excluding ends). */ + FOR_EACH_BB_FN (block, cfun) + { + bb_sese *sese = BB_GET_SESE (block); + + if (sese && sese->color == color + && block != from && block != to) + fprintf (dump_file, ".%d", block->index); + } + fprintf (dump_file, "}"); + } + comma = ","; + } + fprintf (dump_file, "\n\n"); + } + + for (ix = 0; blocks.iterate (ix, &block); ix++) + delete BB_GET_SESE (block); +} + +#undef BB_SET_SESE +#undef BB_GET_SESE diff --git a/gcc/omp-sese.h b/gcc/omp-sese.h new file mode 100644 index 000000000000..57290eb9d65c --- /dev/null +++ b/gcc/omp-sese.h @@ -0,0 +1,26 @@ +/* Find single-entry, single-exit regions for OpenACC. + + Copyright (C) 2005-2017 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_OMP_SESE_H +#define GCC_OMP_SESE_H + +extern void oacc_do_neutering (void); + +#endif diff --git a/gcc/passes.def b/gcc/passes.def index f4c4b96d96d8..e9c13bf7768d 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -178,6 +178,8 @@ along with GCC; see the file COPYING3. If not see INSERT_PASSES_AFTER (all_passes) NEXT_PASS (pass_fixup_cfg); NEXT_PASS (pass_lower_eh_dispatch); + NEXT_PASS (pass_oacc_loop_designation); + NEXT_PASS (pass_oacc_gimple_workers); NEXT_PASS (pass_oacc_device_lower); NEXT_PASS (pass_omp_device_lower); NEXT_PASS (pass_omp_target_link); diff --git a/gcc/target.def b/gcc/target.def index d82db232e404..c9c3f650e8aa 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1735,6 +1735,14 @@ DEFHOOK void, (tree var), NULL) +DEFHOOK +(create_propagation_record, +"Create a record used to propagate local-variable state from an active\n\ +worker to other workers. A possible implementation might adjust the type\n\ +of REC to place the new variable in shared GPU memory.", +tree, (tree rec, bool sender, const char *name), +default_goacc_create_propagation_record) + DEFHOOK (explode_args, "Define this hook to TRUE if arguments to offload regions should be\n\ @@ -1742,6 +1750,11 @@ exploded, i.e. passed as true arguments rather than in an argument array.", bool, (void), hook_bool_void_false) +DEFHOOKPOD +(worker_partitioning, +"Use gimple transformation for worker neutering/broadcasting.", +bool, false) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 59436278dcf7..29b0258f49f9 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -125,6 +125,7 @@ extern bool default_goacc_validate_dims (tree, int [], int, unsigned); extern int default_goacc_dim_limit (int); extern bool default_goacc_fork_join (gcall *, const int [], bool); extern void default_goacc_reduction (gcall *); +extern tree default_goacc_create_propagation_record (tree, bool, const char *); /* These are here, and not in hooks.[ch], because not all users of hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS. */ diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index f00c91994527..d1a1d3a7d4a6 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -417,6 +417,8 @@ extern gimple_opt_pass *make_pass_diagnose_omp_blocks (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt); extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_oacc_gimple_workers (gcc::context *ctxt); extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt); extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt); -- 2.47.2