diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index d6755cd71ee78e1f0bbfff63da920b0ed63d4eeb..159ae0e1647baf318e85366069f3b119a1872f38 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -2,7 +2,7 @@
    directives to separate functions, converts others into explicit calls to the
    runtime library (libgomp) and so forth
 
-Copyright (C) 2005-2017 Free Software Foundation, Inc.
+Copyright (C) 2005-2021 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -52,13 +52,13 @@ along with GCC; see the file COPYING3.  If not see
 #include "omp-general.h"
 #include "omp-offload.h"
 #include "tree-cfgcleanup.h"
+#include "alloc-pool.h"
 #include "symbol-summary.h"
-#include "cilk.h"
 #include "gomp-constants.h"
 #include "gimple-pretty-print.h"
-#include "hsa-common.h"
-#include "debug.h"
-
+#include "stringpool.h"
+#include "attribs.h"
+#include "tree-eh.h"
 
 /* OMP region information.  Every parallel and workshare
    directive is enclosed between two markers, the OMP_* directive
@@ -101,6 +101,9 @@ struct omp_region
   /* True if this is a combined parallel+workshare region.  */
   bool is_combined_parallel;
 
+  /* Copy of fd.lastprivate_conditional != 0.  */
+  bool has_lastprivate_conditional;
+
   /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
      a depend clause.  */
   gomp_ordered *ord_stmt;
@@ -174,6 +177,8 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
     return true;
 
   gcc_assert (gimple_code (ws_stmt) == GIMPLE_OMP_FOR);
+  if (gimple_omp_for_kind (ws_stmt) != GF_OMP_FOR_KIND_FOR)
+    return false;
 
   omp_extract_for_data (as_a <gomp_for *> (ws_stmt), &fd, NULL);
 
@@ -202,11 +207,11 @@ workshare_safe_to_combine_p (basic_block ws_entry_bb)
 static tree
 omp_adjust_chunk_size (tree chunk_size, bool simd_schedule)
 {
-  if (!simd_schedule)
+  if (!simd_schedule || integer_zerop (chunk_size))
     return chunk_size;
 
-  int vf = omp_max_vf ();
-  if (vf == 1)
+  poly_uint64 vf = omp_max_vf ();
+  if (known_eq (vf, 1U))
     return chunk_size;
 
   tree type = TREE_TYPE (chunk_size);
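
The remainder of this function (beyond the hunk) rounds the chunk size up to
a multiple of the vectorization factor.  A minimal standalone sketch of that
arithmetic, assuming the factor is a power of two (illustrative names, not
GCC code):

/* Round chunk_size up to a multiple of vf, as omp_adjust_chunk_size
   does for simd schedules.  For power-of-two vf, -vf == ~(vf - 1).  */
static unsigned long
adjust_chunk_size_sketch (unsigned long chunk_size, unsigned long vf)
{
  if (vf == 1 || chunk_size == 0)
    return chunk_size;
  return (chunk_size + vf - 1) & -vf;
}
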
@@ -310,6 +315,13 @@ determine_parallel_type (struct omp_region *region)
   ws_entry_bb = region->inner->entry;
   ws_exit_bb = region->inner->exit;
 
+  /* Give up for task reductions on the parallel; while it is implementable,
+     adding another big set of APIs or slowing down the normal paths is
+     not acceptable.  */
+  tree pclauses = gimple_omp_parallel_clauses (last_stmt (par_entry_bb));
+  if (omp_find_clause (pclauses, OMP_CLAUSE__REDUCTEMP_))
+    return;
+
   if (single_succ (par_entry_bb) == ws_entry_bb
       && single_succ (ws_exit_bb) == par_exit_bb
       && workshare_safe_to_combine_p (ws_entry_bb)
@@ -336,13 +348,18 @@ determine_parallel_type (struct omp_region *region)
          if (c == NULL
              || ((OMP_CLAUSE_SCHEDULE_KIND (c) & OMP_CLAUSE_SCHEDULE_MASK)
                  == OMP_CLAUSE_SCHEDULE_STATIC)
-             || omp_find_clause (clauses, OMP_CLAUSE_ORDERED))
-           {
-             region->is_combined_parallel = false;
-             region->inner->is_combined_parallel = false;
-             return;
-           }
+             || omp_find_clause (clauses, OMP_CLAUSE_ORDERED)
+             || omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_)
+             || ((c = omp_find_clause (clauses, OMP_CLAUSE__CONDTEMP_))
+                 && POINTER_TYPE_P (TREE_TYPE (OMP_CLAUSE_DECL (c)))))
+           return;
        }
+      else if (region->inner->type == GIMPLE_OMP_SECTIONS
+              && (omp_find_clause (gimple_omp_sections_clauses (ws_stmt),
+                                   OMP_CLAUSE__REDUCTEMP_)
+                  || omp_find_clause (gimple_omp_sections_clauses (ws_stmt),
+                                      OMP_CLAUSE__CONDTEMP_)))
+       return;
 
       region->is_combined_parallel = true;
       region->inner->is_combined_parallel = true;
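
What "combined" buys here: for, e.g., #pragma omp parallel for
schedule(dynamic, 4), the combined region is launched with a single libgomp
entry point instead of GOMP_parallel plus a separate loop-start call in the
child function.  A hand-written illustration (the prototype reflects my
reading of libgomp's API; treat it as a sketch):

extern void GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data,
                                        unsigned num_threads, long start,
                                        long end, long incr, long chunk_size,
                                        unsigned flags);

static void
combined_launch_sketch (void (*body) (void *), void *data)
{
  /* One call sets up both the team and the dynamic loop.  */
  GOMP_parallel_loop_dynamic (body, data, 0 /* num_threads: default */,
                              0, 100, 1 /* iteration space */,
                              4 /* chunk */, 0 /* flags */);
}
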
@@ -467,38 +484,60 @@ gimple_build_cond_empty (tree cond)
   return gimple_build_cond (pred_code, lhs, rhs, NULL_TREE, NULL_TREE);
 }
 
-/* Return true if a parallel REGION is within a declare target function or
-   within a target region and is not a part of a gridified target.  */
+/* Change DECL_CONTEXT of CHILD_FNDECL to that of the parent function.
+   Add CHILD_FNDECL to decl chain of the supercontext of the block
+   ENTRY_BLOCK - this is the block which originally contained the
+   code from which CHILD_FNDECL was created.
+
+   Together, these actions ensure that the debug info for the outlined
+   function will be emitted with the correct lexical scope.  */
 
-static bool
-parallel_needs_hsa_kernel_p (struct omp_region *region)
+static void
+adjust_context_and_scope (struct omp_region *region, tree entry_block,
+                         tree child_fndecl)
 {
-  bool indirect = false;
-  for (region = region->outer; region; region = region->outer)
-    {
-      if (region->type == GIMPLE_OMP_PARALLEL)
-       indirect = true;
-      else if (region->type == GIMPLE_OMP_TARGET)
-       {
-         gomp_target *tgt_stmt
-           = as_a <gomp_target *> (last_stmt (region->entry));
+  tree parent_fndecl = NULL_TREE;
+  gimple *entry_stmt;
+  /* OMP expansion expands inner regions before outer ones, so if we
+     e.g. have an explicit task region nested in a parallel region, then
+     when expanding the task region current_function_decl will be the
+     original source function, but we actually want to use as context
+     the child function of the parallel.  */
+  for (region = region->outer;
+       region && parent_fndecl == NULL_TREE; region = region->outer)
+    switch (region->type)
+      {
+      case GIMPLE_OMP_PARALLEL:
+      case GIMPLE_OMP_TASK:
+      case GIMPLE_OMP_TEAMS:
+       entry_stmt = last_stmt (region->entry);
+       parent_fndecl = gimple_omp_taskreg_child_fn (entry_stmt);
+       break;
+      case GIMPLE_OMP_TARGET:
+       entry_stmt = last_stmt (region->entry);
+       parent_fndecl
+         = gimple_omp_target_child_fn (as_a <gomp_target *> (entry_stmt));
+       break;
+      default:
+       break;
+      }
 
-         if (omp_find_clause (gimple_omp_target_clauses (tgt_stmt),
-                              OMP_CLAUSE__GRIDDIM_))
-           return indirect;
-         else
-           return true;
+  if (parent_fndecl == NULL_TREE)
+    parent_fndecl = current_function_decl;
+  DECL_CONTEXT (child_fndecl) = parent_fndecl;
+
+  if (entry_block != NULL_TREE && TREE_CODE (entry_block) == BLOCK)
+    {
+      tree b = BLOCK_SUPERCONTEXT (entry_block);
+      if (TREE_CODE (b) == BLOCK)
+        {
+         DECL_CHAIN (child_fndecl) = BLOCK_VARS (b);
+         BLOCK_VARS (b) = child_fndecl;
        }
     }
-
-  if (lookup_attribute ("omp declare target",
-                       DECL_ATTRIBUTES (current_function_decl)))
-    return true;
-
-  return false;
 }
 
-/* Build the function calls to GOMP_parallel_start etc to actually
+/* Build the function calls to GOMP_parallel etc to actually
    generate the parallel operation.  REGION is the parallel region
    being expanded.  BB is the block where to insert the code.  WS_ARGS
    will be set if this is a call to a combined parallel+workshare
@@ -523,7 +562,10 @@ expand_parallel_call (struct omp_region *region, basic_block bb,
   /* Determine what flavor of GOMP_parallel we will be
      emitting.  */
   start_ix = BUILT_IN_GOMP_PARALLEL;
-  if (is_combined_parallel (region))
+  tree rtmp = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_);
+  if (rtmp)
+    start_ix = BUILT_IN_GOMP_PARALLEL_REDUCTIONS;
+  else if (is_combined_parallel (region))
     {
       switch (region->inner->type)
        {
@@ -532,12 +574,24 @@ expand_parallel_call (struct omp_region *region, basic_block bb,
          switch (region->inner->sched_kind)
            {
            case OMP_CLAUSE_SCHEDULE_RUNTIME:
-             start_ix2 = 3;
+             /* For lastprivate(conditional:), our implementation
+                requires monotonic behavior.  */
+             if (region->inner->has_lastprivate_conditional != 0)
+               start_ix2 = 3;
+             else if ((region->inner->sched_modifiers
+                      & OMP_CLAUSE_SCHEDULE_NONMONOTONIC) != 0)
+               start_ix2 = 6;
+             else if ((region->inner->sched_modifiers
+                       & OMP_CLAUSE_SCHEDULE_MONOTONIC) == 0)
+               start_ix2 = 7;
+             else
+               start_ix2 = 3;
              break;
            case OMP_CLAUSE_SCHEDULE_DYNAMIC:
            case OMP_CLAUSE_SCHEDULE_GUIDED:
-             if (region->inner->sched_modifiers
-                 & OMP_CLAUSE_SCHEDULE_NONMONOTONIC)
+             if ((region->inner->sched_modifiers
+                  & OMP_CLAUSE_SCHEDULE_MONOTONIC) == 0
+                 && !region->inner->has_lastprivate_conditional)
                {
                  start_ix2 = 3 + region->inner->sched_kind;
                  break;
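
The start_ix2 values chosen above index the GOMP_parallel_loop_* builtins.
A condensed standalone restatement of the schedule(runtime) selection (the
3/6/7 indices assume the builtin ordering in omp-builtins.def; my reading,
not authoritative):

/* 3 = ..._loop_runtime, 6 = ..._loop_nonmonotonic_runtime,
   7 = ..._loop_maybe_nonmonotonic_runtime.  */
static int
runtime_sched_index (int nonmonotonic, int monotonic,
                     int has_lastprivate_conditional)
{
  if (has_lastprivate_conditional)
    return 3;  /* lastprivate(conditional:) requires monotonic behavior.  */
  if (nonmonotonic)
    return 6;  /* schedule(nonmonotonic: runtime)  */
  if (!monotonic)
    return 7;  /* no modifier: nonmonotonic allowed but not required.  */
  return 3;    /* schedule(monotonic: runtime)  */
}
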
@@ -658,7 +712,7 @@ expand_parallel_call (struct omp_region *region, basic_block bb,
                                      false, GSI_CONTINUE_LINKING);
     }
 
-  gsi = gsi_last_bb (bb);
+  gsi = gsi_last_nondebug_bb (bb);
   t = gimple_omp_parallel_data_arg (entry_stmt);
   if (t == NULL)
     t1 = null_pointer_node;
@@ -678,54 +732,15 @@ expand_parallel_call (struct omp_region *region, basic_block bb,
   t = build_call_expr_loc_vec (UNKNOWN_LOCATION,
                               builtin_decl_explicit (start_ix), args);
 
-  force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-                           false, GSI_CONTINUE_LINKING);
-
-  if (hsa_gen_requested_p ()
-      && parallel_needs_hsa_kernel_p (region))
+  if (rtmp)
     {
-      cgraph_node *child_cnode = cgraph_node::get (child_fndecl);
-      hsa_register_kernel (child_cnode);
+      tree type = TREE_TYPE (OMP_CLAUSE_DECL (rtmp));
+      t = build2 (MODIFY_EXPR, type, OMP_CLAUSE_DECL (rtmp),
+                 fold_convert (type,
+                               fold_convert (pointer_sized_int_node, t)));
     }
-}
-
-/* Insert a function call whose name is FUNC_NAME with the information from
-   ENTRY_STMT into the basic_block BB.  */
-
-static void
-expand_cilk_for_call (basic_block bb, gomp_parallel *entry_stmt,
-                     vec <tree, va_gc> *ws_args)
-{
-  tree t, t1, t2;
-  gimple_stmt_iterator gsi;
-  vec <tree, va_gc> *args;
-
-  gcc_assert (vec_safe_length (ws_args) == 2);
-  tree func_name = (*ws_args)[0];
-  tree grain = (*ws_args)[1];
-
-  tree clauses = gimple_omp_parallel_clauses (entry_stmt);
-  tree count = omp_find_clause (clauses, OMP_CLAUSE__CILK_FOR_COUNT_);
-  gcc_assert (count != NULL_TREE);
-  count = OMP_CLAUSE_OPERAND (count, 0);
-
-  gsi = gsi_last_bb (bb);
-  t = gimple_omp_parallel_data_arg (entry_stmt);
-  if (t == NULL)
-    t1 = null_pointer_node;
-  else
-    t1 = build_fold_addr_expr (t);
-  t2 = build_fold_addr_expr (gimple_omp_parallel_child_fn (entry_stmt));
-
-  vec_alloc (args, 4);
-  args->quick_push (t2);
-  args->quick_push (t1);
-  args->quick_push (count);
-  args->quick_push (grain);
-  t = build_call_expr_loc_vec (UNKNOWN_LOCATION, func_name, args);
-
-  force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, false,
-                           GSI_CONTINUE_LINKING);
+  force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                           false, GSI_CONTINUE_LINKING);
 }
 
 /* Build the function call to GOMP_task to actually
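
Written out as pseudo-source, the call emitted by expand_parallel_call
amounts to the following (hand-written illustration; the prototype mirrors
libgomp's public GOMP_parallel entry point, and the reductions variant is
inferred from the pointer_sized_int_node conversion above):

extern void GOMP_parallel (void (*fn) (void *), void *data,
                           unsigned num_threads, unsigned flags);

static void
parallel_launch_sketch (void (*child_fn) (void *), void *data,
                        unsigned num_threads, unsigned flags)
{
  /* Plain #pragma omp parallel: one blocking library call.  */
  GOMP_parallel (child_fn, data, num_threads, flags);
  /* With task reductions (a _reductemp_ clause), the call becomes
     GOMP_parallel_reductions and its return value is converted and
     stored back into the reduction temporary, per the MODIFY_EXPR
     built above.  */
}
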
@@ -747,6 +762,7 @@ expand_task_call (struct omp_region *region, basic_block bb,
   tree depend = omp_find_clause (clauses, OMP_CLAUSE_DEPEND);
   tree finalc = omp_find_clause (clauses, OMP_CLAUSE_FINAL);
   tree priority = omp_find_clause (clauses, OMP_CLAUSE_PRIORITY);
+  tree detach = omp_find_clause (clauses, OMP_CLAUSE_DETACH);
 
   unsigned int iflags
     = (untied ? GOMP_TASK_FLAG_UNTIED : 0)
@@ -775,13 +791,19 @@ expand_task_call (struct omp_region *region, basic_block bb,
       tree tclauses = gimple_omp_for_clauses (g);
       num_tasks = omp_find_clause (tclauses, OMP_CLAUSE_NUM_TASKS);
       if (num_tasks)
-       num_tasks = OMP_CLAUSE_NUM_TASKS_EXPR (num_tasks);
+       {
+         if (OMP_CLAUSE_NUM_TASKS_STRICT (num_tasks))
+           iflags |= GOMP_TASK_FLAG_STRICT;
+         num_tasks = OMP_CLAUSE_NUM_TASKS_EXPR (num_tasks);
+       }
       else
        {
          num_tasks = omp_find_clause (tclauses, OMP_CLAUSE_GRAINSIZE);
          if (num_tasks)
            {
              iflags |= GOMP_TASK_FLAG_GRAINSIZE;
+             if (OMP_CLAUSE_GRAINSIZE_STRICT (num_tasks))
+               iflags |= GOMP_TASK_FLAG_STRICT;
              num_tasks = OMP_CLAUSE_GRAINSIZE_EXPR (num_tasks);
            }
          else
@@ -793,9 +815,16 @@ expand_task_call (struct omp_region *region, basic_block bb,
       if (omp_find_clause (tclauses, OMP_CLAUSE_NOGROUP))
        iflags |= GOMP_TASK_FLAG_NOGROUP;
       ull = fd.iter_type == long_long_unsigned_type_node;
+      if (omp_find_clause (clauses, OMP_CLAUSE_REDUCTION))
+       iflags |= GOMP_TASK_FLAG_REDUCTION;
+    }
+  else
+    {
+      if (priority)
+       iflags |= GOMP_TASK_FLAG_PRIORITY;
+      if (detach)
+       iflags |= GOMP_TASK_FLAG_DETACH;
     }
-  else if (priority)
-    iflags |= GOMP_TASK_FLAG_PRIORITY;
 
   tree flags = build_int_cst (unsigned_type_node, iflags);
 
@@ -835,7 +864,12 @@ expand_task_call (struct omp_region *region, basic_block bb,
   else
     priority = integer_zero_node;
 
-  gsi = gsi_last_bb (bb);
+  gsi = gsi_last_nondebug_bb (bb);
+
+  detach = (detach
+           ? build_fold_addr_expr (OMP_CLAUSE_DECL (detach))
+           : null_pointer_node);
+
   tree t = gimple_omp_task_data_arg (entry_stmt);
   if (t == NULL)
     t2 = null_pointer_node;
@@ -858,10 +892,84 @@ expand_task_call (struct omp_region *region, basic_block bb,
                         num_tasks, priority, startvar, endvar, step);
   else
     t = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_TASK),
-                        9, t1, t2, t3,
+                        10, t1, t2, t3,
                         gimple_omp_task_arg_size (entry_stmt),
                         gimple_omp_task_arg_align (entry_stmt), cond, flags,
-                        depend, priority);
+                        depend, priority, detach);
+
+  force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                           false, GSI_CONTINUE_LINKING);
+}
+
+/* Build the function call to GOMP_taskwait_depend to actually
+   generate the taskwait operation.  BB is the block where to insert the
+   code.  */
+
+static void
+expand_taskwait_call (basic_block bb, gomp_task *entry_stmt)
+{
+  tree clauses = gimple_omp_task_clauses (entry_stmt);
+  tree depend = omp_find_clause (clauses, OMP_CLAUSE_DEPEND);
+  if (depend == NULL_TREE)
+    return;
+
+  depend = OMP_CLAUSE_DECL (depend);
+
+  gimple_stmt_iterator gsi = gsi_last_nondebug_bb (bb);
+  tree t
+    = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_TASKWAIT_DEPEND),
+                      1, depend);
+
+  force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                           false, GSI_CONTINUE_LINKING);
+}
+
+/* Build the function call to GOMP_teams_reg to actually
+   generate the host teams operation.  REGION is the teams region
+   being expanded.  BB is the block where to insert the code.  */
+
+static void
+expand_teams_call (basic_block bb, gomp_teams *entry_stmt)
+{
+  tree clauses = gimple_omp_teams_clauses (entry_stmt);
+  tree num_teams = omp_find_clause (clauses, OMP_CLAUSE_NUM_TEAMS);
+  if (num_teams == NULL_TREE)
+    num_teams = build_int_cst (unsigned_type_node, 0);
+  else
+    {
+      num_teams = OMP_CLAUSE_NUM_TEAMS_EXPR (num_teams);
+      num_teams = fold_convert (unsigned_type_node, num_teams);
+    }
+  tree thread_limit = omp_find_clause (clauses, OMP_CLAUSE_THREAD_LIMIT);
+  if (thread_limit == NULL_TREE)
+    thread_limit = build_int_cst (unsigned_type_node, 0);
+  else
+    {
+      thread_limit = OMP_CLAUSE_THREAD_LIMIT_EXPR (thread_limit);
+      thread_limit = fold_convert (unsigned_type_node, thread_limit);
+    }
+
+  gimple_stmt_iterator gsi = gsi_last_nondebug_bb (bb);
+  tree t = gimple_omp_teams_data_arg (entry_stmt), t1;
+  if (t == NULL)
+    t1 = null_pointer_node;
+  else
+    t1 = build_fold_addr_expr (t);
+  tree child_fndecl = gimple_omp_teams_child_fn (entry_stmt);
+  tree t2 = build_fold_addr_expr (child_fndecl);
+
+  vec<tree, va_gc> *args;
+  vec_alloc (args, 5);
+  args->quick_push (t2);
+  args->quick_push (t1);
+  args->quick_push (num_teams);
+  args->quick_push (thread_limit);
+  /* For future extensibility.  */
+  args->quick_push (build_zero_cst (unsigned_type_node));
+
+  t = build_call_expr_loc_vec (UNKNOWN_LOCATION,
+                              builtin_decl_explicit (BUILT_IN_GOMP_TEAMS_REG),
+                              args);
 
   force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
                            false, GSI_CONTINUE_LINKING);
@@ -912,15 +1020,15 @@ remove_exit_barrier (struct omp_region *region)
      statements that can appear in between are extremely limited -- no
      memory operations at all.  Here, we allow nothing at all, so the
      only thing we allow to precede this GIMPLE_OMP_RETURN is a label.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
-  gsi_prev (&gsi);
+  gsi_prev_nondebug (&gsi);
   if (!gsi_end_p (gsi) && gimple_code (gsi_stmt (gsi)) != GIMPLE_LABEL)
     return;
 
   FOR_EACH_EDGE (e, ei, exit_bb->preds)
     {
-      gsi = gsi_last_bb (e->src);
+      gsi = gsi_last_nondebug_bb (e->src);
       if (gsi_end_p (gsi))
        continue;
       stmt = gsi_stmt (gsi);
@@ -1113,6 +1221,17 @@ expand_omp_taskreg (struct omp_region *region)
   vec<tree, va_gc> *ws_args;
 
   entry_stmt = last_stmt (region->entry);
+  if (gimple_code (entry_stmt) == GIMPLE_OMP_TASK
+      && gimple_omp_task_taskwait_p (entry_stmt))
+    {
+      new_bb = region->entry;
+      gsi = gsi_last_nondebug_bb (region->entry);
+      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_TASK);
+      gsi_remove (&gsi, true);
+      expand_taskwait_call (new_bb, as_a <gomp_task *> (entry_stmt));
+      return;
+    }
+
   child_fn = gimple_omp_taskreg_child_fn (entry_stmt);
   child_cfun = DECL_STRUCT_FUNCTION (child_fn);
 
@@ -1122,18 +1241,7 @@ expand_omp_taskreg (struct omp_region *region)
   else
     exit_bb = region->exit;
 
-  bool is_cilk_for
-    = (flag_cilkplus
-       && gimple_code (entry_stmt) == GIMPLE_OMP_PARALLEL
-       && omp_find_clause (gimple_omp_parallel_clauses (entry_stmt),
-                          OMP_CLAUSE__CILK_FOR_COUNT_) != NULL_TREE);
-
-  if (is_cilk_for)
-    /* If it is a _Cilk_for statement, it is modelled *like* a parallel for,
-       and the inner statement contains the name of the built-in function
-       and grain.  */
-    ws_args = region->inner->ws_args;
-  else if (is_combined_parallel (region))
+  if (is_combined_parallel (region))
     ws_args = region->ws_args;
   else
     ws_args = NULL;
@@ -1147,9 +1255,10 @@ expand_omp_taskreg (struct omp_region *region)
 
       entry_succ_e = single_succ_edge (entry_bb);
 
-      gsi = gsi_last_bb (entry_bb);
+      gsi = gsi_last_nondebug_bb (entry_bb);
       gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_PARALLEL
-                 || gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_TASK);
+                 || gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_TASK
+                 || gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_TEAMS);
       gsi_remove (&gsi, true);
 
       new_bb = entry_bb;
@@ -1202,8 +1311,8 @@ expand_omp_taskreg (struct omp_region *region)
                     effectively doing a STRIP_NOPS.  */
 
                  if (TREE_CODE (arg) == ADDR_EXPR
-                     && TREE_OPERAND (arg, 0)
-                       == gimple_omp_taskreg_data_arg (entry_stmt))
+                     && (TREE_OPERAND (arg, 0)
+                         == gimple_omp_taskreg_data_arg (entry_stmt)))
                    {
                      parcopy_stmt = stmt;
                      break;
@@ -1260,15 +1369,16 @@ expand_omp_taskreg (struct omp_region *region)
 
       /* Split ENTRY_BB at GIMPLE_OMP_PARALLEL or GIMPLE_OMP_TASK,
         so that it can be moved to the child function.  */
-      gsi = gsi_last_bb (entry_bb);
+      gsi = gsi_last_nondebug_bb (entry_bb);
       stmt = gsi_stmt (gsi);
       gcc_assert (stmt && (gimple_code (stmt) == GIMPLE_OMP_PARALLEL
-                          || gimple_code (stmt) == GIMPLE_OMP_TASK));
+                          || gimple_code (stmt) == GIMPLE_OMP_TASK
+                          || gimple_code (stmt) == GIMPLE_OMP_TEAMS));
       e = split_block (entry_bb, stmt);
       gsi_remove (&gsi, true);
       entry_bb = e->dest;
       edge e2 = NULL;
-      if (gimple_code (entry_stmt) == GIMPLE_OMP_PARALLEL)
+      if (gimple_code (entry_stmt) != GIMPLE_OMP_TASK)
        single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
       else
        {
@@ -1276,7 +1386,7 @@ expand_omp_taskreg (struct omp_region *region)
          gcc_assert (e2->dest == region->exit);
          remove_edge (BRANCH_EDGE (entry_bb));
          set_immediate_dominator (CDI_DOMINATORS, e2->dest, e->src);
-         gsi = gsi_last_bb (region->exit);
+         gsi = gsi_last_nondebug_bb (region->exit);
          gcc_assert (!gsi_end_p (gsi)
                      && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
          gsi_remove (&gsi, true);
@@ -1285,7 +1395,7 @@ expand_omp_taskreg (struct omp_region *region)
       /* Convert GIMPLE_OMP_{RETURN,CONTINUE} into a RETURN_EXPR.  */
       if (exit_bb)
        {
-         gsi = gsi_last_bb (exit_bb);
+         gsi = gsi_last_nondebug_bb (exit_bb);
          gcc_assert (!gsi_end_p (gsi)
                      && (gimple_code (gsi_stmt (gsi))
                          == (e2 ? GIMPLE_OMP_CONTINUE : GIMPLE_OMP_RETURN)));
@@ -1306,11 +1416,6 @@ expand_omp_taskreg (struct omp_region *region)
       else
        block = gimple_block (entry_stmt);
 
-      /* Make sure to generate early debug for the function before
-         outlining anything.  */
-      if (! gimple_in_ssa_p (cfun))
-       (*debug_hooks->early_global_decl) (cfun->decl);
-
       new_bb = move_sese_region_to_fn (child_cfun, entry_bb, exit_bb, block);
       if (exit_bb)
        single_succ_edge (new_bb)->flags = EDGE_FALLTHRU;
@@ -1360,6 +1465,7 @@ expand_omp_taskreg (struct omp_region *region)
 
       if (optimize)
        optimize_omp_library_calls (entry_stmt);
+      update_max_bb_count ();
       cgraph_edge::rebuild_edges ();
 
       /* Some EH regions might become dead, see PR34608.  If
@@ -1390,13 +1496,13 @@ expand_omp_taskreg (struct omp_region *region)
        }
     }
 
-  /* Emit a library call to launch the children threads.  */
-  if (is_cilk_for)
-    expand_cilk_for_call (new_bb,
-                         as_a <gomp_parallel *> (entry_stmt), ws_args);
-  else if (gimple_code (entry_stmt) == GIMPLE_OMP_PARALLEL)
+  adjust_context_and_scope (region, gimple_block (entry_stmt), child_fn);
+
+  if (gimple_code (entry_stmt) == GIMPLE_OMP_PARALLEL)
     expand_parallel_call (region, new_bb,
                          as_a <gomp_parallel *> (entry_stmt), ws_args);
+  else if (gimple_code (entry_stmt) == GIMPLE_OMP_TEAMS)
+    expand_teams_call (new_bb, as_a <gomp_teams *> (entry_stmt));
   else
     expand_task_call (region, new_bb, as_a <gomp_task *> (entry_stmt));
   if (gimple_in_ssa_p (cfun))
@@ -1421,8 +1527,8 @@ struct oacc_collapse
 static tree
 expand_oacc_collapse_init (const struct omp_for_data *fd,
                           gimple_stmt_iterator *gsi,
-                          oacc_collapse *counts, tree bound_type,
-                          location_t loc)
+                          oacc_collapse *counts, tree diff_type,
+                          tree bound_type, location_t loc)
 {
   tree tiling = fd->tiling;
   tree total = build_int_cst (bound_type, 1);
@@ -1439,15 +1545,12 @@ expand_oacc_collapse_init (const struct omp_for_data *fd,
       const omp_for_data_loop *loop = &fd->loops[ix];
 
       tree iter_type = TREE_TYPE (loop->v);
-      tree diff_type = iter_type;
       tree plus_type = iter_type;
 
-      gcc_assert (loop->cond_code == fd->loop.cond_code);
+      gcc_assert (loop->cond_code == LT_EXPR || loop->cond_code == GT_EXPR);
 
       if (POINTER_TYPE_P (iter_type))
        plus_type = sizetype;
-      if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
-       diff_type = signed_type_for (diff_type);
 
       if (tiling)
        {
@@ -1535,7 +1638,8 @@ expand_oacc_collapse_init (const struct omp_for_data *fd,
 static void
 expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
                           gimple_stmt_iterator *gsi,
-                          const oacc_collapse *counts, tree ivar)
+                          const oacc_collapse *counts, tree ivar,
+                          tree diff_type)
 {
   tree ivar_type = TREE_TYPE (ivar);
 
@@ -1547,7 +1651,6 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
       const oacc_collapse *collapse = &counts[ix];
       tree v = inner ? loop->v : collapse->outer;
       tree iter_type = TREE_TYPE (v);
-      tree diff_type = TREE_TYPE (collapse->step);
       tree plus_type = iter_type;
       enum tree_code plus_code = PLUS_EXPR;
       tree expr;
@@ -1569,7 +1672,7 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
        }
 
       expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
-                         collapse->step);
+                         fold_convert (diff_type, collapse->step));
       expr = fold_build2 (plus_code, iter_type,
                          inner ? collapse->outer : collapse->base,
                          fold_convert (plus_type, expr));
@@ -1605,7 +1708,39 @@ expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
        count = 0;
    and set ZERO_ITER_BB to that bb.  If this isn't the outermost
    of the combined loop constructs, just initialize COUNTS array
-   from the _looptemp_ clauses.  */
+   from the _looptemp_ clauses.  For loop nests with non-rectangular
+   loops, do this only for the rectangular loops.  Then pick
+   the loops which reference outer vars in their bound expressions
+   and the loops which they refer to, and for this sub-nest compute
+   the number of iterations.  For triangular loops use Faulhaber's formula;
+   otherwise, as a fallback, compute by iterating the loops.
+   If e.g. the sub-nest is
+       for (I = N11; I COND1 N12; I += STEP1)
+       for (J = M21 * I + N21; J COND2 M22 * I + N22; J += STEP2)
+       for (K = M31 * J + N31; K COND3 M32 * J + N32; K += STEP3)
+   do:
+       COUNT = 0;
+       for (tmpi = N11; tmpi COND1 N12; tmpi += STEP1)
+       for (tmpj = M21 * tmpi + N21;
+            tmpj COND2 M22 * tmpi + N22; tmpj += STEP2)
+         {
+           int tmpk1 = M31 * tmpj + N31;
+           int tmpk2 = M32 * tmpj + N32;
+           if (tmpk1 COND3 tmpk2)
+             {
+               if (COND3 is <)
+                 adj = STEP3 - 1;
+               else
+                 adj = STEP3 + 1;
+               COUNT += (adj + tmpk2 - tmpk1) / STEP3;
+             }
+         }
+   and finally multiply the counts of the rectangular loops not
+   in the sub-nest by COUNT.  Also, as counts[fd->last_nonrect],
+   store the number of iterations of the loops from fd->first_nonrect
+   to fd->last_nonrect inclusive, i.e. the above COUNT multiplied
+   by the counts of rectangular loops not referenced in any non-rectangular
+   loops sandwiched in between those.  */
 
 /* NOTE: It *could* be better to moosh all of the BBs together,
    creating one larger BB with all the computation and the unexpected
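
A worked check of the counting scheme on a hypothetical triangular nest
(the numbers are mine, not from the patch): for
    for (i = 0; i < 4; i++)
      for (j = 0; j <= i; j++)
we get outer_niters = 4, first_inner_iterations = 1 (the inner count at the
first outer iteration) and factor = 1 (the growth of the inner count per
outer step), so COUNT = 4*1 + 1*(4*3/2) = 10, which brute force confirms:

#include <assert.h>

int
main (void)
{
  long long brute = 0;
  for (int i = 0; i < 4; i++)
    for (int j = 0; j <= i; j++)  /* inner bound depends on i  */
      brute++;
  long long niters = 4, first_inner = 1, factor = 1;
  /* Triangular closed form used above (a Faulhaber sum for n^1).  */
  assert (brute == niters * first_inner
                   + factor * (niters * (niters - 1) / 2));
  return 0;
}
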
@@ -1667,6 +1802,23 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
          else
            counts[0] = NULL_TREE;
        }
+      if (fd->non_rect
+         && fd->last_nonrect == fd->first_nonrect + 1
+         && !TYPE_UNSIGNED (TREE_TYPE (fd->loops[fd->last_nonrect].v)))
+       {
+         tree c[4];
+         for (i = 0; i < 4; i++)
+           {
+             innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
+                                       OMP_CLAUSE__LOOPTEMP_);
+             gcc_assert (innerc);
+             c[i] = OMP_CLAUSE_DECL (innerc);
+           }
+         counts[0] = c[0];
+         fd->first_inner_iterations = c[1];
+         fd->factor = c[2];
+         fd->adjn1 = c[3];
+       }
       return;
     }
 
@@ -1684,12 +1836,23 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
          break;
        }
     }
+  bool rect_count_seen = false;
   for (i = 0; i < (fd->ordered ? fd->ordered : fd->collapse); i++)
     {
       tree itype = TREE_TYPE (fd->loops[i].v);
 
       if (i >= fd->collapse && counts[i])
        continue;
+      if (fd->non_rect)
+       {
+         /* Skip loops that use outer iterators in their expressions
+            during this phase.  */
+         if (fd->loops[i].m1 || fd->loops[i].m2)
+           {
+             counts[i] = build_zero_cst (type);
+             continue;
+           }
+       }
       if ((SSA_VAR_P (fd->loop.n2) || i >= fd->collapse)
          && ((t = fold_binary (fd->loops[i].cond_code, boolean_type_node,
                                fold_convert (itype, fd->loops[i].n1),
@@ -1747,7 +1910,7 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
          if (l2_dom_bb == NULL)
            l2_dom_bb = entry_bb;
          entry_bb = e->dest;
-         *gsi = gsi_last_bb (entry_bb);
+         *gsi = gsi_last_nondebug_bb (entry_bb);
        }
 
       if (POINTER_TYPE_P (itype))
@@ -1785,13 +1948,455 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
        }
       if (SSA_VAR_P (fd->loop.n2) && i < fd->collapse)
        {
-         if (i == 0)
-           t = counts[0];
+         if (fd->non_rect && i >= fd->first_nonrect && i <= fd->last_nonrect)
+           continue;
+         if (!rect_count_seen)
+           {
+             t = counts[i];
+             rect_count_seen = true;
+           }
          else
            t = fold_build2 (MULT_EXPR, type, fd->loop.n2, counts[i]);
          expand_omp_build_assign (gsi, fd->loop.n2, t);
        }
     }
+  if (fd->non_rect && SSA_VAR_P (fd->loop.n2))
+    {
+      gcc_assert (fd->last_nonrect != -1);
+
+      counts[fd->last_nonrect] = create_tmp_reg (type, ".count");
+      expand_omp_build_assign (gsi, counts[fd->last_nonrect],
+                              build_zero_cst (type));
+      for (i = fd->first_nonrect + 1; i < fd->last_nonrect; i++)
+       if (fd->loops[i].m1
+           || fd->loops[i].m2
+           || fd->loops[i].non_rect_referenced)
+         break;
+      if (i == fd->last_nonrect
+         && fd->loops[i].outer == fd->last_nonrect - fd->first_nonrect
+         && !TYPE_UNSIGNED (TREE_TYPE (fd->loops[i].v)))
+       {
+         int o = fd->first_nonrect;
+         tree itype = TREE_TYPE (fd->loops[o].v);
+         tree n1o = create_tmp_reg (itype, ".n1o");
+         t = fold_convert (itype, unshare_expr (fd->loops[o].n1));
+         expand_omp_build_assign (gsi, n1o, t);
+         tree n2o = create_tmp_reg (itype, ".n2o");
+         t = fold_convert (itype, unshare_expr (fd->loops[o].n2));
+         expand_omp_build_assign (gsi, n2o, t);
+         if (fd->loops[i].m1 && fd->loops[i].m2)
+           t = fold_build2 (MINUS_EXPR, itype, unshare_expr (fd->loops[i].m2),
+                            unshare_expr (fd->loops[i].m1));
+         else if (fd->loops[i].m1)
+           t = fold_unary (NEGATE_EXPR, itype,
+                           unshare_expr (fd->loops[i].m1));
+         else
+           t = unshare_expr (fd->loops[i].m2);
+         tree m2minusm1
+           = force_gimple_operand_gsi (gsi, t, true, NULL_TREE,
+                                       true, GSI_SAME_STMT);
+
+         gimple_stmt_iterator gsi2 = *gsi;
+         gsi_prev (&gsi2);
+         e = split_block (entry_bb, gsi_stmt (gsi2));
+         e = split_block (e->dest, (gimple *) NULL);
+         basic_block bb1 = e->src;
+         entry_bb = e->dest;
+         *gsi = gsi_after_labels (entry_bb);
+
+         gsi2 = gsi_after_labels (bb1);
+         tree ostep = fold_convert (itype, fd->loops[o].step);
+         t = build_int_cst (itype, (fd->loops[o].cond_code
+                                    == LT_EXPR ? -1 : 1));
+         t = fold_build2 (PLUS_EXPR, itype, ostep, t);
+         t = fold_build2 (PLUS_EXPR, itype, t, n2o);
+         t = fold_build2 (MINUS_EXPR, itype, t, n1o);
+         if (TYPE_UNSIGNED (itype)
+             && fd->loops[o].cond_code == GT_EXPR)
+           t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                            fold_build1 (NEGATE_EXPR, itype, t),
+                            fold_build1 (NEGATE_EXPR, itype, ostep));
+         else
+           t = fold_build2 (TRUNC_DIV_EXPR, itype, t, ostep);
+         tree outer_niters
+           = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                       true, GSI_SAME_STMT);
+         t = fold_build2 (MINUS_EXPR, itype, outer_niters,
+                          build_one_cst (itype));
+         t = fold_build2 (MULT_EXPR, itype, t, ostep);
+         t = fold_build2 (PLUS_EXPR, itype, n1o, t);
+         tree last = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                               true, GSI_SAME_STMT);
+         tree n1, n2, n1e, n2e;
+         t = fold_convert (itype, unshare_expr (fd->loops[i].n1));
+         if (fd->loops[i].m1)
+           {
+             n1 = fold_convert (itype, unshare_expr (fd->loops[i].m1));
+             n1 = fold_build2 (MULT_EXPR, itype, n1o, n1);
+             n1 = fold_build2 (PLUS_EXPR, itype, n1, t);
+           }
+         else
+           n1 = t;
+         n1 = force_gimple_operand_gsi (&gsi2, n1, true, NULL_TREE,
+                                        true, GSI_SAME_STMT);
+         t = fold_convert (itype, unshare_expr (fd->loops[i].n2));
+         if (fd->loops[i].m2)
+           {
+             n2 = fold_convert (itype, unshare_expr (fd->loops[i].m2));
+             n2 = fold_build2 (MULT_EXPR, itype, n1o, n2);
+             n2 = fold_build2 (PLUS_EXPR, itype, n2, t);
+           }
+         else
+           n2 = t;
+         n2 = force_gimple_operand_gsi (&gsi2, n2, true, NULL_TREE,
+                                        true, GSI_SAME_STMT);
+         t = fold_convert (itype, unshare_expr (fd->loops[i].n1));
+         if (fd->loops[i].m1)
+           {
+             n1e = fold_convert (itype, unshare_expr (fd->loops[i].m1));
+             n1e = fold_build2 (MULT_EXPR, itype, last, n1e);
+             n1e = fold_build2 (PLUS_EXPR, itype, n1e, t);
+           }
+         else
+           n1e = t;
+         n1e = force_gimple_operand_gsi (&gsi2, n1e, true, NULL_TREE,
+                                         true, GSI_SAME_STMT);
+         t = fold_convert (itype, unshare_expr (fd->loops[i].n2));
+         if (fd->loops[i].m2)
+           {
+             n2e = fold_convert (itype, unshare_expr (fd->loops[i].m2));
+             n2e = fold_build2 (MULT_EXPR, itype, last, n2e);
+             n2e = fold_build2 (PLUS_EXPR, itype, n2e, t);
+           }
+         else
+           n2e = t;
+         n2e = force_gimple_operand_gsi (&gsi2, n2e, true, NULL_TREE,
+                                         true, GSI_SAME_STMT);
+         gcond *cond_stmt
+           = gimple_build_cond (fd->loops[i].cond_code, n1, n2,
+                                NULL_TREE, NULL_TREE);
+         gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+         e = split_block (bb1, cond_stmt);
+         e->flags = EDGE_TRUE_VALUE;
+         e->probability = profile_probability::likely ().guessed ();
+         basic_block bb2 = e->dest;
+         gsi2 = gsi_after_labels (bb2);
+
+         cond_stmt = gimple_build_cond (fd->loops[i].cond_code, n1e, n2e,
+                                        NULL_TREE, NULL_TREE);
+         gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+         e = split_block (bb2, cond_stmt);
+         e->flags = EDGE_TRUE_VALUE;
+         e->probability = profile_probability::likely ().guessed ();
+         gsi2 = gsi_after_labels (e->dest);
+
+         tree step = fold_convert (itype, fd->loops[i].step);
+         t = build_int_cst (itype, (fd->loops[i].cond_code
+                                    == LT_EXPR ? -1 : 1));
+         t = fold_build2 (PLUS_EXPR, itype, step, t);
+         t = fold_build2 (PLUS_EXPR, itype, t, n2);
+         t = fold_build2 (MINUS_EXPR, itype, t, n1);
+         if (TYPE_UNSIGNED (itype)
+             && fd->loops[i].cond_code == GT_EXPR)
+           t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                            fold_build1 (NEGATE_EXPR, itype, t),
+                            fold_build1 (NEGATE_EXPR, itype, step));
+         else
+           t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+         tree first_inner_iterations
+           = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                       true, GSI_SAME_STMT);
+         t = fold_build2 (MULT_EXPR, itype, m2minusm1, ostep);
+         if (TYPE_UNSIGNED (itype)
+             && fd->loops[i].cond_code == GT_EXPR)
+           t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                            fold_build1 (NEGATE_EXPR, itype, t),
+                            fold_build1 (NEGATE_EXPR, itype, step));
+         else
+           t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+         tree factor
+           = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                       true, GSI_SAME_STMT);
+         t = fold_build2 (MINUS_EXPR, itype, outer_niters,
+                          build_one_cst (itype));
+         t = fold_build2 (MULT_EXPR, itype, t, outer_niters);
+         t = fold_build2 (RSHIFT_EXPR, itype, t, integer_one_node);
+         t = fold_build2 (MULT_EXPR, itype, factor, t);
+         t = fold_build2 (PLUS_EXPR, itype,
+                          fold_build2 (MULT_EXPR, itype, outer_niters,
+                                       first_inner_iterations), t);
+         expand_omp_build_assign (&gsi2, counts[fd->last_nonrect],
+                                  fold_convert (type, t));
+
+         basic_block bb3 = create_empty_bb (bb1);
+         add_bb_to_loop (bb3, bb1->loop_father);
+
+         e = make_edge (bb1, bb3, EDGE_FALSE_VALUE);
+         e->probability = profile_probability::unlikely ().guessed ();
+
+         gsi2 = gsi_after_labels (bb3);
+         cond_stmt = gimple_build_cond (fd->loops[i].cond_code, n1e, n2e,
+                                        NULL_TREE, NULL_TREE);
+         gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+         e = split_block (bb3, cond_stmt);
+         e->flags = EDGE_TRUE_VALUE;
+         e->probability = profile_probability::likely ().guessed ();
+         basic_block bb4 = e->dest;
+
+         ne = make_edge (bb3, entry_bb, EDGE_FALSE_VALUE);
+         ne->probability = e->probability.invert ();
+
+         basic_block bb5 = create_empty_bb (bb2);
+         add_bb_to_loop (bb5, bb2->loop_father);
+
+         ne = make_edge (bb2, bb5, EDGE_FALSE_VALUE);
+         ne->probability = profile_probability::unlikely ().guessed ();
+
+         for (int j = 0; j < 2; j++)
+           {
+             gsi2 = gsi_after_labels (j ? bb5 : bb4);
+             t = fold_build2 (MINUS_EXPR, itype,
+                              unshare_expr (fd->loops[i].n1),
+                              unshare_expr (fd->loops[i].n2));
+             t = fold_build2 (TRUNC_DIV_EXPR, itype, t, m2minusm1);
+             tree tem
+               = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                           true, GSI_SAME_STMT);
+             t = fold_build2 (MINUS_EXPR, itype, tem, n1o);
+             t = fold_build2 (TRUNC_MOD_EXPR, itype, t, ostep);
+             t = fold_build2 (MINUS_EXPR, itype, tem, t);
+             tem = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                             true, GSI_SAME_STMT);
+             t = fold_convert (itype, unshare_expr (fd->loops[i].n1));
+             if (fd->loops[i].m1)
+               {
+                 n1 = fold_convert (itype, unshare_expr (fd->loops[i].m1));
+                 n1 = fold_build2 (MULT_EXPR, itype, tem, n1);
+                 n1 = fold_build2 (PLUS_EXPR, itype, n1, t);
+               }
+             else
+               n1 = t;
+             n1 = force_gimple_operand_gsi (&gsi2, n1, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             t = fold_convert (itype, unshare_expr (fd->loops[i].n2));
+             if (fd->loops[i].m2)
+               {
+                 n2 = fold_convert (itype, unshare_expr (fd->loops[i].m2));
+                 n2 = fold_build2 (MULT_EXPR, itype, tem, n2);
+                 n2 = fold_build2 (PLUS_EXPR, itype, n2, t);
+               }
+             else
+               n2 = t;
+             n2 = force_gimple_operand_gsi (&gsi2, n2, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             expand_omp_build_assign (&gsi2, j ? n2o : n1o, tem);
+
+             cond_stmt = gimple_build_cond (fd->loops[i].cond_code, n1, n2,
+                                            NULL_TREE, NULL_TREE);
+             gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+             e = split_block (gsi_bb (gsi2), cond_stmt);
+             e->flags = j ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE;
+             e->probability = profile_probability::unlikely ().guessed ();
+             ne = make_edge (e->src, bb1,
+                             j ? EDGE_FALSE_VALUE : EDGE_TRUE_VALUE);
+             ne->probability = e->probability.invert ();
+             gsi2 = gsi_after_labels (e->dest);
+
+             t = fold_build2 (PLUS_EXPR, itype, tem, ostep);
+             expand_omp_build_assign (&gsi2, j ? n2o : n1o, t);
+
+             make_edge (e->dest, bb1, EDGE_FALLTHRU);
+           }
+
+         set_immediate_dominator (CDI_DOMINATORS, bb3, bb1);
+         set_immediate_dominator (CDI_DOMINATORS, bb5, bb2);
+         set_immediate_dominator (CDI_DOMINATORS, entry_bb, bb1);
+
+         if (fd->first_nonrect + 1 == fd->last_nonrect)
+           {
+             fd->first_inner_iterations = first_inner_iterations;
+             fd->factor = factor;
+             fd->adjn1 = n1o;
+           }
+       }
+      else
+       {
+         /* Fallback implementation.  Evaluate the loops with m1/m2
+            non-NULL as well as their outer loops at runtime using temporaries
+            instead of the original iteration variables, and in the
+            body just bump the counter.  */
+         gimple_stmt_iterator gsi2 = *gsi;
+         gsi_prev (&gsi2);
+         e = split_block (entry_bb, gsi_stmt (gsi2));
+         e = split_block (e->dest, (gimple *) NULL);
+         basic_block cur_bb = e->src;
+         basic_block next_bb = e->dest;
+         entry_bb = e->dest;
+         *gsi = gsi_after_labels (entry_bb);
+
+         tree *vs = XALLOCAVEC (tree, fd->last_nonrect);
+         memset (vs, 0, fd->last_nonrect * sizeof (tree));
+
+         for (i = 0; i <= fd->last_nonrect; i++)
+           {
+             if (fd->loops[i].m1 == NULL_TREE
+                 && fd->loops[i].m2 == NULL_TREE
+                 && !fd->loops[i].non_rect_referenced)
+               continue;
+
+             tree itype = TREE_TYPE (fd->loops[i].v);
+
+             gsi2 = gsi_after_labels (cur_bb);
+             tree n1, n2;
+             t = fold_convert (itype, unshare_expr (fd->loops[i].n1));
+             if (fd->loops[i].m1)
+               {
+                 n1 = fold_convert (itype, unshare_expr (fd->loops[i].m1));
+                 n1 = fold_build2 (MULT_EXPR, itype,
+                                   vs[i - fd->loops[i].outer], n1);
+                 n1 = fold_build2 (PLUS_EXPR, itype, n1, t);
+               }
+             else
+               n1 = t;
+             n1 = force_gimple_operand_gsi (&gsi2, n1, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             if (i < fd->last_nonrect)
+               {
+                 vs[i] = create_tmp_reg (itype, ".it");
+                 expand_omp_build_assign (&gsi2, vs[i], n1);
+               }
+             t = fold_convert (itype, unshare_expr (fd->loops[i].n2));
+             if (fd->loops[i].m2)
+               {
+                 n2 = fold_convert (itype, unshare_expr (fd->loops[i].m2));
+                 n2 = fold_build2 (MULT_EXPR, itype,
+                                   vs[i - fd->loops[i].outer], n2);
+                 n2 = fold_build2 (PLUS_EXPR, itype, n2, t);
+               }
+             else
+               n2 = t;
+             n2 = force_gimple_operand_gsi (&gsi2, n2, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             if (i == fd->last_nonrect)
+               {
+                 gcond *cond_stmt
+                   = gimple_build_cond (fd->loops[i].cond_code, n1, n2,
+                                        NULL_TREE, NULL_TREE);
+                 gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+                 e = split_block (cur_bb, cond_stmt);
+                 e->flags = EDGE_TRUE_VALUE;
+                 ne = make_edge (cur_bb, next_bb, EDGE_FALSE_VALUE);
+                 e->probability = profile_probability::likely ().guessed ();
+                 ne->probability = e->probability.invert ();
+                 gsi2 = gsi_after_labels (e->dest);
+
+                 t = build_int_cst (itype, (fd->loops[i].cond_code == LT_EXPR
+                                            ? -1 : 1));
+                 t = fold_build2 (PLUS_EXPR, itype,
+                                  fold_convert (itype, fd->loops[i].step), t);
+                 t = fold_build2 (PLUS_EXPR, itype, t, n2);
+                 t = fold_build2 (MINUS_EXPR, itype, t, n1);
+                 tree step = fold_convert (itype, fd->loops[i].step);
+                 if (TYPE_UNSIGNED (itype)
+                     && fd->loops[i].cond_code == GT_EXPR)
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                                    fold_build1 (NEGATE_EXPR, itype, t),
+                                    fold_build1 (NEGATE_EXPR, itype, step));
+                 else
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+                 t = fold_convert (type, t);
+                 t = fold_build2 (PLUS_EXPR, type,
+                                  counts[fd->last_nonrect], t);
+                 t = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                               true, GSI_SAME_STMT);
+                 expand_omp_build_assign (&gsi2, counts[fd->last_nonrect], t);
+                 e = make_edge (e->dest, next_bb, EDGE_FALLTHRU);
+                 set_immediate_dominator (CDI_DOMINATORS, next_bb, cur_bb);
+                 break;
+               }
+             e = split_block (cur_bb, last_stmt (cur_bb));
+
+             basic_block new_cur_bb = create_empty_bb (cur_bb);
+             add_bb_to_loop (new_cur_bb, cur_bb->loop_father);
+
+             gsi2 = gsi_after_labels (e->dest);
+             tree step = fold_convert (itype,
+                                       unshare_expr (fd->loops[i].step));
+             t = fold_build2 (PLUS_EXPR, itype, vs[i], step);
+             t = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                           true, GSI_SAME_STMT);
+             expand_omp_build_assign (&gsi2, vs[i], t);
+
+             ne = split_block (e->dest, last_stmt (e->dest));
+             gsi2 = gsi_after_labels (ne->dest);
+
+             gcond *cond_stmt
+               = gimple_build_cond (fd->loops[i].cond_code, vs[i], n2,
+                                    NULL_TREE, NULL_TREE);
+             gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+             edge e3, e4;
+             if (next_bb == entry_bb)
+               {
+                 e3 = find_edge (ne->dest, next_bb);
+                 e3->flags = EDGE_FALSE_VALUE;
+               }
+             else
+               e3 = make_edge (ne->dest, next_bb, EDGE_FALSE_VALUE);
+             e4 = make_edge (ne->dest, new_cur_bb, EDGE_TRUE_VALUE);
+             e4->probability = profile_probability::likely ().guessed ();
+             e3->probability = e4->probability.invert ();
+             basic_block esrc = e->src;
+             make_edge (e->src, ne->dest, EDGE_FALLTHRU);
+             cur_bb = new_cur_bb;
+             basic_block latch_bb = next_bb;
+             next_bb = e->dest;
+             remove_edge (e);
+             set_immediate_dominator (CDI_DOMINATORS, ne->dest, esrc);
+             set_immediate_dominator (CDI_DOMINATORS, latch_bb, ne->dest);
+             set_immediate_dominator (CDI_DOMINATORS, cur_bb, ne->dest);
+           }
+       }
+      t = NULL_TREE;
+      for (i = fd->first_nonrect; i < fd->last_nonrect; i++)
+       if (!fd->loops[i].non_rect_referenced
+           && fd->loops[i].m1 == NULL_TREE
+           && fd->loops[i].m2 == NULL_TREE)
+         {
+           if (t == NULL_TREE)
+             t = counts[i];
+           else
+             t = fold_build2 (MULT_EXPR, type, t, counts[i]);
+         }
+      if (t)
+       {
+         t = fold_build2 (MULT_EXPR, type, counts[fd->last_nonrect], t);
+         expand_omp_build_assign (gsi, counts[fd->last_nonrect], t);
+       }
+      if (!rect_count_seen)
+       t = counts[fd->last_nonrect];
+      else
+       t = fold_build2 (MULT_EXPR, type, fd->loop.n2,
+                        counts[fd->last_nonrect]);
+      expand_omp_build_assign (gsi, fd->loop.n2, t);
+    }
+  else if (fd->non_rect)
+    {
+      tree t = fd->loop.n2;
+      gcc_assert (TREE_CODE (t) == INTEGER_CST);
+      int non_rect_referenced = 0, non_rect = 0;
+      for (i = 0; i < fd->collapse; i++)
+       {
+         if ((i < fd->first_nonrect || i > fd->last_nonrect)
+             && !integer_zerop (counts[i]))
+           t = fold_build2 (TRUNC_DIV_EXPR, type, t, counts[i]);
+         if (fd->loops[i].non_rect_referenced)
+           non_rect_referenced++;
+         if (fd->loops[i].m1 || fd->loops[i].m2)
+           non_rect++;
+       }
+      gcc_assert (non_rect == 1 && non_rect_referenced == 1);
+      counts[fd->last_nonrect] = t;
+    }
 }
 
 /* Helper function for expand_omp_{for_*,simd}.  Generate code like:
@@ -1804,11 +2409,43 @@ expand_omp_for_init_counts (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
    if this loop doesn't have an inner loop construct combined with it.
    If it does have an inner loop construct combined with it and the
    iteration count isn't known constant, store values from counts array
-   into its _looptemp_ temporaries instead.  */
+   into its _looptemp_ temporaries instead.
+   For non-rectangular loops (between fd->first_nonrect and fd->last_nonrect
+   inclusive), use the count of all those loops together, and either
+   find quadratic etc. equation roots, or as a fallback, do:
+       COUNT = 0;
+       for (tmpi = N11; tmpi COND1 N12; tmpi += STEP1)
+       for (tmpj = M21 * tmpi + N21;
+            tmpj COND2 M22 * tmpi + N22; tmpj += STEP2)
+         {
+           int tmpk1 = M31 * tmpj + N31;
+           int tmpk2 = M32 * tmpj + N32;
+           if (tmpk1 COND3 tmpk2)
+             {
+               if (COND3 is <)
+                 adj = STEP3 - 1;
+               else
+                 adj = STEP3 + 1;
+               int temp = (adj + tmpk2 - tmpk1) / STEP3;
+               if (COUNT + temp > T)
+                 {
+                   V1 = tmpi;
+                   V2 = tmpj;
+                   V3 = tmpk1 + (T - COUNT) * STEP3;
+                   goto done;
+                 }
+               else
+                 COUNT += temp;
+             }
+         }
+       done:;
+   but for optional innermost or outermost rectangular loops that aren't
+   referenced by other loop expressions keep doing the division/modulo.  */
 
 static void
 expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
-                         tree *counts, gimple *inner_stmt, tree startvar)
+                         tree *counts, tree *nonrect_bounds,
+                         gimple *inner_stmt, tree startvar)
 {
   int i;
   if (gimple_omp_for_combined_p (fd->for_stmt))
@@ -1826,7 +2463,12 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
         use it.  */
       tree innerc = omp_find_clause (clauses, OMP_CLAUSE__LOOPTEMP_);
       gcc_assert (innerc);
-      for (i = 0; i < fd->collapse; i++)
+      int count = 0;
+      if (fd->non_rect
+         && fd->last_nonrect == fd->first_nonrect + 1
+         && !TYPE_UNSIGNED (TREE_TYPE (fd->loops[fd->last_nonrect].v)))
+       count = 4;
+      for (i = 0; i < fd->collapse + count; i++)
        {
          innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
                                    OMP_CLAUSE__LOOPTEMP_);
@@ -1834,7 +2476,19 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
          if (i)
            {
              tree tem = OMP_CLAUSE_DECL (innerc);
-             tree t = fold_convert (TREE_TYPE (tem), counts[i]);
+             tree t;
+             if (i < fd->collapse)
+               t = counts[i];
+             else
+               switch (i - fd->collapse)
+                 {
+                 case 0: t = counts[0]; break;
+                 case 1: t = fd->first_inner_iterations; break;
+                 case 2: t = fd->factor; break;
+                 case 3: t = fd->adjn1; break;
+                 default: gcc_unreachable ();
+                 }
+             t = fold_convert (TREE_TYPE (tem), t);
              t = force_gimple_operand_gsi (gsi, t, false, NULL_TREE,
                                            false, GSI_CONTINUE_LINKING);
              gassign *stmt = gimple_build_assign (tem, t);
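
The "find quadratic etc. equation roots" path mentioned in the comment above
is implemented in the following hunk: given a logical iteration number T, it
finds the largest n with n*first_inner + factor*n*(n-1)/2 <= T by taking a
square root in double precision.  A sketch of that inversion as I read the
t3/discriminant temporaries below (illustrative names):

#include <math.h>

/* Positive root of (factor/2)*n^2 + (first_inner - factor/2)*n - T = 0:
   n = (sqrt (t3*t3 + 2*factor*T) - t3) / factor,
   with t3 = first_inner - factor/2.  */
static long long
outer_index_sketch (long long T, double first_inner, double factor)
{
  double t3 = first_inner - factor / 2;
  return (long long) ((sqrt (t3 * t3 + 2 * factor * (double) T) - t3)
                      / factor);
}
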
@@ -1855,33 +2509,487 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
       itype = vtype;
       if (POINTER_TYPE_P (vtype))
        itype = signed_type_for (vtype);
-      if (i != 0)
+      if (i != 0 && (i != fd->last_nonrect || fd->first_nonrect))
        t = fold_build2 (TRUNC_MOD_EXPR, type, tem, counts[i]);
       else
        t = tem;
-      t = fold_convert (itype, t);
-      t = fold_build2 (MULT_EXPR, itype, t,
-                      fold_convert (itype, fd->loops[i].step));
-      if (POINTER_TYPE_P (vtype))
-       t = fold_build_pointer_plus (fd->loops[i].n1, t);
-      else
-       t = fold_build2 (PLUS_EXPR, itype, fd->loops[i].n1, t);
-      t = force_gimple_operand_gsi (gsi, t,
-                                   DECL_P (fd->loops[i].v)
-                                   && TREE_ADDRESSABLE (fd->loops[i].v),
-                                   NULL_TREE, false,
-                                   GSI_CONTINUE_LINKING);
-      stmt = gimple_build_assign (fd->loops[i].v, t);
-      gsi_insert_after (gsi, stmt, GSI_CONTINUE_LINKING);
-      if (i != 0)
+      if (i == fd->last_nonrect)
        {
-         t = fold_build2 (TRUNC_DIV_EXPR, type, tem, counts[i]);
-         t = force_gimple_operand_gsi (gsi, t, false, NULL_TREE,
+         t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE,
+                                       false, GSI_CONTINUE_LINKING);
+         tree stopval = t;
+         tree idx = create_tmp_reg (type, ".count");
+         expand_omp_build_assign (gsi, idx,
+                                  build_zero_cst (type), true);
+         basic_block bb_triang = NULL, bb_triang_dom = NULL;
+         if (fd->first_nonrect + 1 == fd->last_nonrect
+             && (TREE_CODE (fd->loop.n2) == INTEGER_CST
+                 || fd->first_inner_iterations)
+             && (optab_handler (sqrt_optab, TYPE_MODE (double_type_node))
+                 != CODE_FOR_nothing)
+             && !integer_zerop (fd->loop.n2))
+           {
+             tree outer_n1 = fd->adjn1 ? fd->adjn1 : fd->loops[i - 1].n1;
+             tree itype = TREE_TYPE (fd->loops[i].v);
+             tree first_inner_iterations = fd->first_inner_iterations;
+             tree factor = fd->factor;
+             gcond *cond_stmt
+               = gimple_build_cond (NE_EXPR, factor,
+                                    build_zero_cst (TREE_TYPE (factor)),
+                                    NULL_TREE, NULL_TREE);
+             gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING);
+             edge e = split_block (gsi_bb (*gsi), cond_stmt);
+             basic_block bb0 = e->src;
+             e->flags = EDGE_TRUE_VALUE;
+             e->probability = profile_probability::likely ();
+             bb_triang_dom = bb0;
+             *gsi = gsi_after_labels (e->dest);
+             tree slltype = long_long_integer_type_node;
+             tree ulltype = long_long_unsigned_type_node;
+             tree stopvalull = fold_convert (ulltype, stopval);
+             stopvalull
+               = force_gimple_operand_gsi (gsi, stopvalull, true, NULL_TREE,
+                                           false, GSI_CONTINUE_LINKING);
+             first_inner_iterations
+               = fold_convert (slltype, first_inner_iterations);
+             first_inner_iterations
+               = force_gimple_operand_gsi (gsi, first_inner_iterations, true,
+                                           NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             factor = fold_convert (slltype, factor);
+             factor
+               = force_gimple_operand_gsi (gsi, factor, true, NULL_TREE,
+                                           false, GSI_CONTINUE_LINKING);
+             tree first_inner_iterationsd
+               = fold_build1 (FLOAT_EXPR, double_type_node,
+                              first_inner_iterations);
+             first_inner_iterationsd
+               = force_gimple_operand_gsi (gsi, first_inner_iterationsd, true,
+                                           NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             tree factord = fold_build1 (FLOAT_EXPR, double_type_node,
+                                         factor);
+             factord = force_gimple_operand_gsi (gsi, factord, true,
+                                                 NULL_TREE, false,
+                                                 GSI_CONTINUE_LINKING);
+             tree stopvald = fold_build1 (FLOAT_EXPR, double_type_node,
+                                          stopvalull);
+             stopvald = force_gimple_operand_gsi (gsi, stopvald, true,
+                                                  NULL_TREE, false,
+                                                  GSI_CONTINUE_LINKING);
+             /* Temporarily disable flag_rounding_math; the values are
+                integers divided by 2 (so at worst exact halves), and
+                worst case imprecision due to too large values ought to
+                be caught later by the checks for fallback.  */
+             int save_flag_rounding_math = flag_rounding_math;
+             flag_rounding_math = 0;
+             t = fold_build2 (RDIV_EXPR, double_type_node, factord,
+                              build_real (double_type_node, dconst2));
+             tree t3 = fold_build2 (MINUS_EXPR, double_type_node,
+                                    first_inner_iterationsd, t);
+             t3 = force_gimple_operand_gsi (gsi, t3, true, NULL_TREE, false,
+                                            GSI_CONTINUE_LINKING);
+             t = fold_build2 (MULT_EXPR, double_type_node, factord,
+                              build_real (double_type_node, dconst2));
+             t = fold_build2 (MULT_EXPR, double_type_node, t, stopvald);
+             t = fold_build2 (PLUS_EXPR, double_type_node, t,
+                              fold_build2 (MULT_EXPR, double_type_node,
+                                           t3, t3));
+             flag_rounding_math = save_flag_rounding_math;
+             t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             if (flag_exceptions
+                 && cfun->can_throw_non_call_exceptions
+                 && operation_could_trap_p (LT_EXPR, true, false, NULL_TREE))
+               {
+                 tree tem = fold_build2 (LT_EXPR, boolean_type_node, t,
+                                         build_zero_cst (double_type_node));
+                 tem = force_gimple_operand_gsi (gsi, tem, true, NULL_TREE,
+                                                 false, GSI_CONTINUE_LINKING);
+                 cond_stmt = gimple_build_cond (NE_EXPR, tem,
+                                                boolean_false_node,
+                                                NULL_TREE, NULL_TREE);
+               }
+             else
+               cond_stmt
+                 = gimple_build_cond (LT_EXPR, t,
+                                      build_zero_cst (double_type_node),
+                                      NULL_TREE, NULL_TREE);
+             gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING);
+             e = split_block (gsi_bb (*gsi), cond_stmt);
+             basic_block bb1 = e->src;
+             e->flags = EDGE_FALSE_VALUE;
+             e->probability = profile_probability::very_likely ();
+             *gsi = gsi_after_labels (e->dest);
+             gcall *call = gimple_build_call_internal (IFN_SQRT, 1, t);
+             tree sqrtr = create_tmp_var (double_type_node);
+             gimple_call_set_lhs (call, sqrtr);
+             gsi_insert_after (gsi, call, GSI_CONTINUE_LINKING);
+             t = fold_build2 (MINUS_EXPR, double_type_node, sqrtr, t3);
+             t = fold_build2 (RDIV_EXPR, double_type_node, t, factord);
+             t = fold_build1 (FIX_TRUNC_EXPR, ulltype, t);
+             tree c = create_tmp_var (ulltype);
+             tree d = create_tmp_var (ulltype);
+             expand_omp_build_assign (gsi, c, t, true);
+             t = fold_build2 (MINUS_EXPR, ulltype, c,
+                              build_one_cst (ulltype));
+             t = fold_build2 (MULT_EXPR, ulltype, c, t);
+             t = fold_build2 (RSHIFT_EXPR, ulltype, t, integer_one_node);
+             t = fold_build2 (MULT_EXPR, ulltype,
+                              fold_convert (ulltype, fd->factor), t);
+             tree t2
+               = fold_build2 (MULT_EXPR, ulltype, c,
+                              fold_convert (ulltype,
+                                            fd->first_inner_iterations));
+             t = fold_build2 (PLUS_EXPR, ulltype, t, t2);
+             expand_omp_build_assign (gsi, d, t, true);
+             t = fold_build2 (MULT_EXPR, ulltype,
+                              fold_convert (ulltype, fd->factor), c);
+             t = fold_build2 (PLUS_EXPR, ulltype,
+                              t, fold_convert (ulltype,
+                                               fd->first_inner_iterations));
+             t2 = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false,
+                                            GSI_CONTINUE_LINKING);
+             cond_stmt = gimple_build_cond (GE_EXPR, stopvalull, d,
+                                            NULL_TREE, NULL_TREE);
+             gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING);
+             e = split_block (gsi_bb (*gsi), cond_stmt);
+             basic_block bb2 = e->src;
+             e->flags = EDGE_TRUE_VALUE;
+             e->probability = profile_probability::very_likely ();
+             *gsi = gsi_after_labels (e->dest);
+             t = fold_build2 (PLUS_EXPR, ulltype, d, t2);
+             t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             cond_stmt = gimple_build_cond (GE_EXPR, stopvalull, t,
+                                            NULL_TREE, NULL_TREE);
+             gsi_insert_after (gsi, cond_stmt, GSI_CONTINUE_LINKING);
+             e = split_block (gsi_bb (*gsi), cond_stmt);
+             basic_block bb3 = e->src;
+             e->flags = EDGE_FALSE_VALUE;
+             e->probability = profile_probability::very_likely ();
+             *gsi = gsi_after_labels (e->dest);
+             t = fold_convert (itype, c);
+             t = fold_build2 (MULT_EXPR, itype, t, fd->loops[i - 1].step);
+             t = fold_build2 (PLUS_EXPR, itype, outer_n1, t);
+             t = force_gimple_operand_gsi (gsi, t, true, NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             expand_omp_build_assign (gsi, fd->loops[i - 1].v, t, true);
+             t2 = fold_build2 (MINUS_EXPR, ulltype, stopvalull, d);
+             t2 = fold_convert (itype, t2);
+             t2 = fold_build2 (MULT_EXPR, itype, t2, fd->loops[i].step);
+             t2 = fold_build2 (PLUS_EXPR, itype, t2, fd->loops[i].n1);
+             if (fd->loops[i].m1)
+               {
+                 t = fold_build2 (MULT_EXPR, itype, t, fd->loops[i].m1);
+                 t2 = fold_build2 (PLUS_EXPR, itype, t2, t);
+               }
+             expand_omp_build_assign (gsi, fd->loops[i].v, t2, true);
+             e = split_block (gsi_bb (*gsi), gsi_stmt (*gsi));
+             bb_triang = e->src;
+             *gsi = gsi_after_labels (e->dest);
+             remove_edge (e);
+             e = make_edge (bb1, gsi_bb (*gsi), EDGE_TRUE_VALUE);
+             e->probability = profile_probability::very_unlikely ();
+             e = make_edge (bb2, gsi_bb (*gsi), EDGE_FALSE_VALUE);
+             e->probability = profile_probability::very_unlikely ();
+             e = make_edge (bb3, gsi_bb (*gsi), EDGE_TRUE_VALUE);
+             e->probability = profile_probability::very_unlikely ();
+
+             basic_block bb4 = create_empty_bb (bb0);
+             add_bb_to_loop (bb4, bb0->loop_father);
+             e = make_edge (bb0, bb4, EDGE_FALSE_VALUE);
+             e->probability = profile_probability::unlikely ();
+             make_edge (bb4, gsi_bb (*gsi), EDGE_FALLTHRU);
+             set_immediate_dominator (CDI_DOMINATORS, bb4, bb0);
+             set_immediate_dominator (CDI_DOMINATORS, gsi_bb (*gsi), bb0);
+             gimple_stmt_iterator gsi2 = gsi_after_labels (bb4);
+             t2 = fold_build2 (TRUNC_DIV_EXPR, type,
+                               counts[i], counts[i - 1]);
+             t2 = force_gimple_operand_gsi (&gsi2, t2, true, NULL_TREE, false,
+                                            GSI_CONTINUE_LINKING);
+             t = fold_build2 (TRUNC_MOD_EXPR, type, stopval, t2);
+             t2 = fold_build2 (TRUNC_DIV_EXPR, type, stopval, t2);
+             t = fold_convert (itype, t);
+             t2 = fold_convert (itype, t2);
+             t = fold_build2 (MULT_EXPR, itype, t,
+                              fold_convert (itype, fd->loops[i].step));
+             t = fold_build2 (PLUS_EXPR, itype, fd->loops[i].n1, t);
+             t2 = fold_build2 (MULT_EXPR, itype, t2,
+                               fold_convert (itype, fd->loops[i - 1].step));
+             t2 = fold_build2 (PLUS_EXPR, itype, fd->loops[i - 1].n1, t2);
+             t2 = force_gimple_operand_gsi (&gsi2, t2, false, NULL_TREE,
+                                            false, GSI_CONTINUE_LINKING);
+             stmt = gimple_build_assign (fd->loops[i - 1].v, t2);
+             gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING);
+             if (fd->loops[i].m1)
+               {
+                 t2 = fold_build2 (MULT_EXPR, itype, fd->loops[i].m1,
+                                   fd->loops[i - 1].v);
+                 t = fold_build2 (PLUS_EXPR, itype, t, t2);
+               }
+             t = force_gimple_operand_gsi (&gsi2, t, false, NULL_TREE,
+                                           false, GSI_CONTINUE_LINKING);
+             stmt = gimple_build_assign (fd->loops[i].v, t);
+             gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING);
+           }
+         /* Fallback implementation.  Evaluate the loops between
+            fd->first_nonrect and fd->last_nonrect inclusive at
+            runtime using temporaries instead of the original iteration
+            variables; in the body just bump the counter and compare
+            with the desired value.  */
+         gimple_stmt_iterator gsi2 = *gsi;
+         basic_block entry_bb = gsi_bb (gsi2);
+         edge e = split_block (entry_bb, gsi_stmt (gsi2));
+         e = split_block (e->dest, (gimple *) NULL);
+         basic_block dom_bb = NULL;
+         basic_block cur_bb = e->src;
+         basic_block next_bb = e->dest;
+         entry_bb = e->dest;
+         *gsi = gsi_after_labels (entry_bb);
+
+         tree *vs = XALLOCAVEC (tree, fd->last_nonrect);
+         tree n1 = NULL_TREE, n2 = NULL_TREE;
+         memset (vs, 0, fd->last_nonrect * sizeof (tree));
+
+         for (int j = fd->first_nonrect; j <= fd->last_nonrect; j++)
+           {
+             tree itype = TREE_TYPE (fd->loops[j].v);
+             bool rect_p = (fd->loops[j].m1 == NULL_TREE
+                            && fd->loops[j].m2 == NULL_TREE
+                            && !fd->loops[j].non_rect_referenced);
+             gsi2 = gsi_after_labels (cur_bb);
+             t = fold_convert (itype, unshare_expr (fd->loops[j].n1));
+             if (fd->loops[j].m1)
+               {
+                 n1 = fold_convert (itype, unshare_expr (fd->loops[j].m1));
+                 n1 = fold_build2 (MULT_EXPR, itype,
+                                   vs[j - fd->loops[j].outer], n1);
+                 n1 = fold_build2 (PLUS_EXPR, itype, n1, t);
+               }
+             else if (rect_p)
+               n1 = build_zero_cst (type);
+             else
+               n1 = t;
+             n1 = force_gimple_operand_gsi (&gsi2, n1, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             if (j < fd->last_nonrect)
+               {
+                 vs[j] = create_tmp_reg (rect_p ? type : itype, ".it");
+                 expand_omp_build_assign (&gsi2, vs[j], n1);
+               }
+             t = fold_convert (itype, unshare_expr (fd->loops[j].n2));
+             if (fd->loops[j].m2)
+               {
+                 n2 = fold_convert (itype, unshare_expr (fd->loops[j].m2));
+                 n2 = fold_build2 (MULT_EXPR, itype,
+                                   vs[j - fd->loops[j].outer], n2);
+                 n2 = fold_build2 (PLUS_EXPR, itype, n2, t);
+               }
+             else if (rect_p)
+               n2 = counts[j];
+             else
+               n2 = t;
+             n2 = force_gimple_operand_gsi (&gsi2, n2, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             if (j == fd->last_nonrect)
+               {
+                 gcond *cond_stmt
+                   = gimple_build_cond (fd->loops[j].cond_code, n1, n2,
+                                        NULL_TREE, NULL_TREE);
+                 gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+                 e = split_block (cur_bb, cond_stmt);
+                 e->flags = EDGE_TRUE_VALUE;
+                 edge ne = make_edge (cur_bb, next_bb, EDGE_FALSE_VALUE);
+                 e->probability = profile_probability::likely ().guessed ();
+                 ne->probability = e->probability.invert ();
+                 gsi2 = gsi_after_labels (e->dest);
+
+                 t = build_int_cst (itype, (fd->loops[j].cond_code == LT_EXPR
+                                            ? -1 : 1));
+                 t = fold_build2 (PLUS_EXPR, itype,
+                                  fold_convert (itype, fd->loops[j].step), t);
+                 t = fold_build2 (PLUS_EXPR, itype, t, n2);
+                 t = fold_build2 (MINUS_EXPR, itype, t, n1);
+                 tree step = fold_convert (itype, fd->loops[j].step);
+                 if (TYPE_UNSIGNED (itype)
+                     && fd->loops[j].cond_code == GT_EXPR)
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                                    fold_build1 (NEGATE_EXPR, itype, t),
+                                    fold_build1 (NEGATE_EXPR, itype, step));
+                 else
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+                 t = fold_convert (type, t);
+                 t = fold_build2 (PLUS_EXPR, type, idx, t);
+                 t = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                               true, GSI_SAME_STMT);
+                 e = make_edge (e->dest, next_bb, EDGE_FALLTHRU);
+                 set_immediate_dominator (CDI_DOMINATORS, next_bb, cur_bb);
+                 cond_stmt
+                   = gimple_build_cond (LE_EXPR, t, stopval, NULL_TREE,
+                                        NULL_TREE);
+                 gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+                 e = split_block (gsi_bb (gsi2), cond_stmt);
+                 e->flags = EDGE_TRUE_VALUE;
+                 e->probability = profile_probability::likely ().guessed ();
+                 ne = make_edge (e->src, entry_bb, EDGE_FALSE_VALUE);
+                 ne->probability = e->probability.invert ();
+                 gsi2 = gsi_after_labels (e->dest);
+                 expand_omp_build_assign (&gsi2, idx, t);
+                 set_immediate_dominator (CDI_DOMINATORS, entry_bb, dom_bb);
+                 break;
+               }
+             e = split_block (cur_bb, last_stmt (cur_bb));
+
+             basic_block new_cur_bb = create_empty_bb (cur_bb);
+             add_bb_to_loop (new_cur_bb, cur_bb->loop_father);
+
+             gsi2 = gsi_after_labels (e->dest);
+             if (rect_p)
+               t = fold_build2 (PLUS_EXPR, type, vs[j],
+                                build_one_cst (type));
+             else
+               {
+                 tree step
+                   = fold_convert (itype, unshare_expr (fd->loops[j].step));
+                 t = fold_build2 (PLUS_EXPR, itype, vs[j], step);
+               }
+             t = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                           true, GSI_SAME_STMT);
+             expand_omp_build_assign (&gsi2, vs[j], t);
+
+             edge ne = split_block (e->dest, last_stmt (e->dest));
+             gsi2 = gsi_after_labels (ne->dest);
+
+             gcond *cond_stmt;
+             if (next_bb == entry_bb)
+               /* No need to actually check the outermost condition.  */
+               cond_stmt
+                 = gimple_build_cond (EQ_EXPR, boolean_true_node,
+                                      boolean_true_node,
+                                      NULL_TREE, NULL_TREE);
+             else
+               cond_stmt
+                 = gimple_build_cond (rect_p ? LT_EXPR
+                                             : fd->loops[j].cond_code,
+                                      vs[j], n2, NULL_TREE, NULL_TREE);
+             gsi_insert_before (&gsi2, cond_stmt, GSI_SAME_STMT);
+             edge e3, e4;
+             if (next_bb == entry_bb)
+               {
+                 e3 = find_edge (ne->dest, next_bb);
+                 e3->flags = EDGE_FALSE_VALUE;
+                 dom_bb = ne->dest;
+               }
+             else
+               e3 = make_edge (ne->dest, next_bb, EDGE_FALSE_VALUE);
+             e4 = make_edge (ne->dest, new_cur_bb, EDGE_TRUE_VALUE);
+             e4->probability = profile_probability::likely ().guessed ();
+             e3->probability = e4->probability.invert ();
+             basic_block esrc = e->src;
+             make_edge (e->src, ne->dest, EDGE_FALLTHRU);
+             cur_bb = new_cur_bb;
+             basic_block latch_bb = next_bb;
+             next_bb = e->dest;
+             remove_edge (e);
+             set_immediate_dominator (CDI_DOMINATORS, ne->dest, esrc);
+             set_immediate_dominator (CDI_DOMINATORS, latch_bb, ne->dest);
+             set_immediate_dominator (CDI_DOMINATORS, cur_bb, ne->dest);
+           }
+         for (int j = fd->last_nonrect; j >= fd->first_nonrect; j--)
+           {
+             tree itype = TREE_TYPE (fd->loops[j].v);
+             bool rect_p = (fd->loops[j].m1 == NULL_TREE
+                            && fd->loops[j].m2 == NULL_TREE
+                            && !fd->loops[j].non_rect_referenced);
+             if (j == fd->last_nonrect)
+               {
+                 t = fold_build2 (MINUS_EXPR, type, stopval, idx);
+                 t = fold_convert (itype, t);
+                 tree t2
+                   = fold_convert (itype, unshare_expr (fd->loops[j].step));
+                 t = fold_build2 (MULT_EXPR, itype, t, t2);
+                 t = fold_build2 (PLUS_EXPR, itype, n1, t);
+               }
+             else if (rect_p)
+               {
+                 t = fold_convert (itype, vs[j]);
+                 t = fold_build2 (MULT_EXPR, itype, t,
+                                  fold_convert (itype, fd->loops[j].step));
+                 if (POINTER_TYPE_P (vtype))
+                   t = fold_build_pointer_plus (fd->loops[j].n1, t);
+                 else
+                   t = fold_build2 (PLUS_EXPR, itype, fd->loops[j].n1, t);
+               }
+             else
+               t = vs[j];
+             t = force_gimple_operand_gsi (gsi, t, false,
+                                           NULL_TREE, true,
+                                           GSI_SAME_STMT);
+             stmt = gimple_build_assign (fd->loops[j].v, t);
+             gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+           }
+         if (gsi_end_p (*gsi))
+           *gsi = gsi_last_bb (gsi_bb (*gsi));
+         else
+           gsi_prev (gsi);
+         if (bb_triang)
+           {
+             e = split_block (gsi_bb (*gsi), gsi_stmt (*gsi));
+             make_edge (bb_triang, e->dest, EDGE_FALLTHRU);
+             *gsi = gsi_after_labels (e->dest);
+             if (!gsi_end_p (*gsi))
+               gsi_insert_before (gsi, gimple_build_nop (), GSI_NEW_STMT);
+             set_immediate_dominator (CDI_DOMINATORS, e->dest, bb_triang_dom);
+           }
+       }
+      else
+       {
+         t = fold_convert (itype, t);
+         t = fold_build2 (MULT_EXPR, itype, t,
+                          fold_convert (itype, fd->loops[i].step));
+         if (POINTER_TYPE_P (vtype))
+           t = fold_build_pointer_plus (fd->loops[i].n1, t);
+         else
+           t = fold_build2 (PLUS_EXPR, itype, fd->loops[i].n1, t);
+         t = force_gimple_operand_gsi (gsi, t,
+                                       DECL_P (fd->loops[i].v)
+                                       && TREE_ADDRESSABLE (fd->loops[i].v),
+                                       NULL_TREE, false,
+                                       GSI_CONTINUE_LINKING);
+         stmt = gimple_build_assign (fd->loops[i].v, t);
+         gsi_insert_after (gsi, stmt, GSI_CONTINUE_LINKING);
+       }
+      if (i != 0 && (i != fd->last_nonrect || fd->first_nonrect))
+       {
+         t = fold_build2 (TRUNC_DIV_EXPR, type, tem, counts[i]);
+         t = force_gimple_operand_gsi (gsi, t, false, NULL_TREE,
                                        false, GSI_CONTINUE_LINKING);
          stmt = gimple_build_assign (tem, t);
          gsi_insert_after (gsi, stmt, GSI_CONTINUE_LINKING);
        }
+      if (i == fd->last_nonrect)
+       i = fd->first_nonrect;
     }
+  if (fd->non_rect)
+    for (i = 0; i <= fd->last_nonrect; i++)
+      if (fd->loops[i].m2)
+       {
+         tree itype = TREE_TYPE (fd->loops[i].v);
+
+         tree t = fold_convert (itype, unshare_expr (fd->loops[i].m2));
+         t = fold_build2 (MULT_EXPR, itype,
+                          fd->loops[i - fd->loops[i].outer].v, t);
+         t = fold_build2 (PLUS_EXPR, itype, t,
+                          fold_convert (itype,
+                                        unshare_expr (fd->loops[i].n2)));
+         nonrect_bounds[i] = create_tmp_reg (itype, ".bound");
+         t = force_gimple_operand_gsi (gsi, t, false,
+                                       NULL_TREE, false,
+                                       GSI_CONTINUE_LINKING);
+         stmt = gimple_build_assign (nonrect_bounds[i], t);
+         gsi_insert_after (gsi, stmt, GSI_CONTINUE_LINKING);
+       }
 }
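
The sqrt path above recovers the outer iteration index in closed form: if
outer iteration c executes first_inner_iterations + factor * c inner
iterations, then c * first_inner_iterations + factor * c * (c - 1) / 2
iterations precede it, and solving that quadratic for a flattened index idx
gives c = floor ((sqrt (2 * factor * idx + t3 * t3) - t3) / factor) with
t3 = first_inner_iterations - factor / 2.  A standalone sketch of that
inversion (hypothetical names, not GCC internals):

/* Assumes FACTOR > 0; the generated code handles factor == 0 and bad
   floating-point estimates by branching to the fallback loop instead of
   nudging C as done here.  Link with -lm.  */
#include <math.h>
#include <stdio.h>

static unsigned long long
outer_from_flat (unsigned long long idx, unsigned long long first_inner,
                 unsigned long long factor)
{
  double t3 = (double) first_inner - (double) factor / 2.0;
  double disc = 2.0 * (double) factor * (double) idx + t3 * t3;
  double est = (sqrt (disc) - t3) / (double) factor;
  unsigned long long c = est > 0.0 ? (unsigned long long) est : 0;
  /* D = number of iterations preceding outer iteration C.  */
  unsigned long long d = factor * (c * (c - 1) / 2) + c * first_inner;
  if (idx < d)
    c--;
  else if (idx >= d + first_inner + factor * c)
    c++;
  return c;
}

int
main (void)
{
  /* for (i = 0; i < 10; i++) for (j = 0; j < 3 + 2 * i; j++):
     flat index 17 lies in outer iteration 3 (3 + 5 + 7 = 15 <= 17 < 24).  */
  printf ("%llu\n", outer_from_flat (17, 3, 2));
  return 0;
}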
 
 /* Helper function for expand_omp_for_*.  Generate code like:
@@ -1895,11 +3003,38 @@ expand_omp_for_init_vars (struct omp_for_data *fd, gimple_stmt_iterator *gsi,
     L12:
        V2 = N21;
        V1 += STEP1;
-       goto BODY_BB;  */
+       goto BODY_BB;
+   For non-rectangular loops, use temporaries stored in nonrect_bounds
+   for the upper bounds if M?2 multiplier is present.  Given e.g.
+   for (V1 = N11; V1 cond1 N12; V1 += STEP1)
+   for (V2 = N21; V2 cond2 N22; V2 += STEP2)
+   for (V3 = N31; V3 cond3 N32; V3 += STEP3)
+   for (V4 = N41 + M41 * V2; V4 cond4 N42 + M42 * V2; V4 += STEP4)
+   do:
+    L10:
+       V4 += STEP4;
+       if (V4 cond4 NONRECT_BOUND4) goto BODY_BB; else goto L11;
+    L11:
+       V4 = N41 + M41 * V2; // This can be left out if the loop
+                            // refers to the immediate parent loop
+       V3 += STEP3;
+       if (V3 cond3 N32) goto BODY_BB; else goto L12;
+    L12:
+       V3 = N31;
+       V2 += STEP2;
+       if (V2 cond2 N22) goto L120; else goto L13;
+    L120:
+       V4 = N41 + M41 * V2;
+       NONRECT_BOUND4 = N42 + M42 * V2;
+       if (V4 cond4 NONRECT_BOUND4) goto BODY_BB; else goto L12;
+    L13:
+       V2 = N21;
+       V1 += STEP1;
+       goto L120;  */
 
 static basic_block
-extract_omp_for_update_vars (struct omp_for_data *fd, basic_block cont_bb,
-                            basic_block body_bb)
+extract_omp_for_update_vars (struct omp_for_data *fd, tree *nonrect_bounds,
+                            basic_block cont_bb, basic_block body_bb)
 {
   basic_block last_bb, bb, collapse_bb = NULL;
   int i;
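
For orientation, a source-level nest of the shape described in the comment
above might look like this (hypothetical names); loop 4's bounds follow v2
two levels out, which is why its recomputed upper bound has to be kept in
NONRECT_BOUND4 rather than rechecked against a fixed expression:

void
nonrect_nest (int n11, int n12, int n21, int n22, int n31, int n32,
              int n41, int n42, int m41, int m42,
              void (*body) (int, int, int, int))
{
  #pragma omp parallel for collapse(4)
  for (int v1 = n11; v1 < n12; v1++)
    for (int v2 = n21; v2 < n22; v2++)
      for (int v3 = n31; v3 < n32; v3++)
        for (int v4 = n41 + m41 * v2; v4 < n42 + m42 * v2; v4++)
          body (v1, v2, v3, v4);
}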
@@ -1920,17 +3055,28 @@ extract_omp_for_update_vars (struct omp_for_data *fd, basic_block cont_bb,
       if (i < fd->collapse - 1)
        {
          e = make_edge (last_bb, bb, EDGE_FALSE_VALUE);
-         e->probability = profile_probability::guessed_always ().apply_scale (1, 8);
+         e->probability
+           = profile_probability::guessed_always ().apply_scale (1, 8);
 
-         t = fd->loops[i + 1].n1;
-         t = force_gimple_operand_gsi (&gsi, t,
-                                       DECL_P (fd->loops[i + 1].v)
-                                       && TREE_ADDRESSABLE (fd->loops[i
-                                                                      + 1].v),
-                                       NULL_TREE, false,
-                                       GSI_CONTINUE_LINKING);
-         stmt = gimple_build_assign (fd->loops[i + 1].v, t);
-         gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+         struct omp_for_data_loop *l = &fd->loops[i + 1];
+         if (l->m1 == NULL_TREE || l->outer != 1)
+           {
+             t = l->n1;
+             if (l->m1)
+               {
+                 tree t2
+                   = fold_build2 (MULT_EXPR, TREE_TYPE (t),
+                                  fd->loops[i + 1 - l->outer].v, l->m1);
+                 t = fold_build2 (PLUS_EXPR, TREE_TYPE (t), t2, t);
+               }
+             t = force_gimple_operand_gsi (&gsi, t,
+                                           DECL_P (l->v)
+                                           && TREE_ADDRESSABLE (l->v),
+                                           NULL_TREE, false,
+                                           GSI_CONTINUE_LINKING);
+             stmt = gimple_build_assign (l->v, t);
+             gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+           }
        }
       else
        collapse_bb = bb;
@@ -1948,9 +3094,83 @@ extract_omp_for_update_vars (struct omp_for_data *fd, basic_block cont_bb,
       stmt = gimple_build_assign (fd->loops[i].v, t);
       gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
 
+      if (fd->loops[i].non_rect_referenced)
+       {
+         basic_block update_bb = NULL, prev_bb = NULL;
+         for (int j = i + 1; j <= fd->last_nonrect; j++)
+           if (j - fd->loops[j].outer == i)
+             {
+               tree n1, n2;
+               struct omp_for_data_loop *l = &fd->loops[j];
+               basic_block this_bb = create_empty_bb (last_bb);
+               add_bb_to_loop (this_bb, last_bb->loop_father);
+               gimple_stmt_iterator gsi2 = gsi_start_bb (this_bb);
+               if (prev_bb)
+                 {
+                   e = make_edge (prev_bb, this_bb, EDGE_TRUE_VALUE);
+                   e->probability
+                     = profile_probability::guessed_always ().apply_scale (7,
+                                                                           8);
+                   set_immediate_dominator (CDI_DOMINATORS, this_bb, prev_bb);
+                 }
+               if (l->m1)
+                 {
+                   t = fold_build2 (MULT_EXPR, TREE_TYPE (l->m1), l->m1,
+                                    fd->loops[i].v);
+                   t = fold_build2 (PLUS_EXPR, TREE_TYPE (l->v), t, l->n1);
+                   n1 = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                                  false,
+                                                  GSI_CONTINUE_LINKING);
+                   stmt = gimple_build_assign (l->v, n1);
+                   gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING);
+                   n1 = l->v;
+                 }
+               else
+                 n1 = force_gimple_operand_gsi (&gsi2, l->n1, true,
+                                                NULL_TREE, false,
+                                                GSI_CONTINUE_LINKING);
+               if (l->m2)
+                 {
+                   t = fold_build2 (MULT_EXPR, TREE_TYPE (l->m2), l->m2,
+                                    fd->loops[i].v);
+                   t = fold_build2 (PLUS_EXPR, TREE_TYPE (nonrect_bounds[j]),
+                                    t, unshare_expr (l->n2));
+                   n2 = force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                                                  false,
+                                                  GSI_CONTINUE_LINKING);
+                   stmt = gimple_build_assign (nonrect_bounds[j], n2);
+                   gsi_insert_after (&gsi2, stmt, GSI_CONTINUE_LINKING);
+                   n2 = nonrect_bounds[j];
+                 }
+               else
+                 n2 = force_gimple_operand_gsi (&gsi2, unshare_expr (l->n2),
+                                                true, NULL_TREE, false,
+                                                GSI_CONTINUE_LINKING);
+               gcond *cond_stmt
+                 = gimple_build_cond (l->cond_code, n1, n2,
+                                      NULL_TREE, NULL_TREE);
+               gsi_insert_after (&gsi2, cond_stmt, GSI_CONTINUE_LINKING);
+               if (update_bb == NULL)
+                 update_bb = this_bb;
+               e = make_edge (this_bb, bb, EDGE_FALSE_VALUE);
+               e->probability
+                 = profile_probability::guessed_always ().apply_scale (1, 8);
+               if (prev_bb == NULL)
+                 set_immediate_dominator (CDI_DOMINATORS, this_bb, bb);
+               prev_bb = this_bb;
+             }
+         e = make_edge (prev_bb, body_bb, EDGE_TRUE_VALUE);
+         e->probability
+           = profile_probability::guessed_always ().apply_scale (7, 8);
+         body_bb = update_bb;
+       }
+
       if (i > 0)
        {
-         t = fd->loops[i].n2;
+         if (fd->loops[i].m2)
+           t = nonrect_bounds[i];
+         else
+           t = unshare_expr (fd->loops[i].n2);
          t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
                                        false, GSI_CONTINUE_LINKING);
          tree v = fd->loops[i].v;
@@ -1960,11 +3180,17 @@ extract_omp_for_update_vars (struct omp_for_data *fd, basic_block cont_bb,
          t = fold_build2 (fd->loops[i].cond_code, boolean_type_node, v, t);
          stmt = gimple_build_cond_empty (t);
          gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
+         if (walk_tree (gimple_cond_lhs_ptr (as_a <gcond *> (stmt)),
+                        expand_omp_regimplify_p, NULL, NULL)
+             || walk_tree (gimple_cond_rhs_ptr (as_a <gcond *> (stmt)),
+                           expand_omp_regimplify_p, NULL, NULL))
+           gimple_regimplify_operands (stmt, &gsi);
          e = make_edge (bb, body_bb, EDGE_TRUE_VALUE);
          e->probability = profile_probability::guessed_always ().apply_scale (7, 8);
        }
       else
        make_edge (bb, body_bb, EDGE_FALLTHRU);
+      set_immediate_dominator (CDI_DOMINATORS, bb, last_bb);
       last_bb = bb;
     }
 
@@ -2026,8 +3252,8 @@ expand_omp_ordered_sink (gimple_stmt_iterator *gsi, struct omp_for_data *fd,
              forward = tree_int_cst_sgn (step) != -1;
            }
          if (forward ^ OMP_CLAUSE_DEPEND_SINK_NEGATIVE (deps))
-           warning_at (loc, 0, "%<depend(sink)%> clause waiting for "
-                               "lexically later iteration");
+           warning_at (loc, 0, "%<depend%> clause with %<sink%> modifier "
+                               "waiting for lexically later iteration");
          break;
        }
       deps = TREE_CHAIN (deps);
@@ -2163,8 +3389,9 @@ expand_omp_ordered_sink (gimple_stmt_iterator *gsi, struct omp_for_data *fd,
                               build_int_cst (itype, 0));
          if (integer_zerop (t) && !warned_step)
            {
-             warning_at (loc, 0, "%<depend(sink)%> refers to iteration never "
-                                 "in the iteration space");
+             warning_at (loc, 0, "%<depend%> clause with %<sink%> modifier "
+                                 "refers to iteration never in the iteration "
+                                 "space");
              warned_step = true;
            }
          cond = fold_build2_loc (loc, BIT_AND_EXPR, boolean_type_node,
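
Both warnings concern doacross loops.  A minimal well-formed sketch
(hypothetical names) is below; writing depend(sink: i + 1) there instead
would wait on a lexically later iteration (the first warning), and a sink
offset no iteration can ever match, e.g. an odd offset in a loop with
step 2, triggers the second:

void
doacross (int n, int *a)
{
  #pragma omp parallel
  #pragma omp for ordered(1)
  for (int i = 1; i < n; i++)
    {
      #pragma omp ordered depend(sink: i - 1)
      a[i] = a[i - 1] + 1;
      #pragma omp ordered depend(source)
    }
}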
@@ -2382,7 +3609,7 @@ expand_omp_for_ordered_loops (struct omp_for_data *fd, tree *counts,
 
       if (e2)
        {
-         struct loop *loop = alloc_loop ();
+         class loop *loop = alloc_loop ();
          loop->header = new_header;
          loop->latch = e2->src;
          add_loop (loop, body_bb->loop_father);
@@ -2512,6 +3739,7 @@ expand_omp_for_generic (struct omp_region *region,
                        struct omp_for_data *fd,
                        enum built_in_function start_fn,
                        enum built_in_function next_fn,
+                       tree sched_arg,
                        gimple *inner_stmt)
 {
   tree type, istart0, iend0, iend;
@@ -2552,13 +3780,59 @@ expand_omp_for_generic (struct omp_region *region,
   l3_bb = BRANCH_EDGE (entry_bb)->dest;
   exit_bb = region->exit;
 
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
 
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
   if (fd->ordered
-      && omp_find_clause (gimple_omp_for_clauses (gsi_stmt (gsi)),
+      && omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
                          OMP_CLAUSE_LASTPRIVATE))
     ordered_lastprivate = false;
+  tree reductions = NULL_TREE;
+  tree mem = NULL_TREE, cond_var = NULL_TREE, condtemp = NULL_TREE;
+  tree memv = NULL_TREE;
+  if (fd->lastprivate_conditional)
+    {
+      tree c = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
+                               OMP_CLAUSE__CONDTEMP_);
+      if (fd->have_pointer_condtemp)
+       condtemp = OMP_CLAUSE_DECL (c);
+      c = omp_find_clause (OMP_CLAUSE_CHAIN (c), OMP_CLAUSE__CONDTEMP_);
+      cond_var = OMP_CLAUSE_DECL (c);
+    }
+  if (sched_arg)
+    {
+      if (fd->have_reductemp)
+       {
+         tree c = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
+                                   OMP_CLAUSE__REDUCTEMP_);
+         reductions = OMP_CLAUSE_DECL (c);
+         gcc_assert (TREE_CODE (reductions) == SSA_NAME);
+         gimple *g = SSA_NAME_DEF_STMT (reductions);
+         reductions = gimple_assign_rhs1 (g);
+         OMP_CLAUSE_DECL (c) = reductions;
+         entry_bb = gimple_bb (g);
+         edge e = split_block (entry_bb, g);
+         if (region->entry == entry_bb)
+           region->entry = e->dest;
+         gsi = gsi_last_bb (entry_bb);
+       }
+      else
+       reductions = null_pointer_node;
+      if (fd->have_pointer_condtemp)
+       {
+         tree type = TREE_TYPE (condtemp);
+         memv = create_tmp_var (type);
+         TREE_ADDRESSABLE (memv) = 1;
+         unsigned HOST_WIDE_INT sz
+           = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (type)));
+         sz *= fd->lastprivate_conditional;
+         expand_omp_build_assign (&gsi, memv, build_int_cst (type, sz),
+                                  false);
+         mem = build_fold_addr_expr (memv);
+       }
+      else
+       mem = null_pointer_node;
+    }
   if (fd->collapse > 1 || fd->ordered)
     {
       int first_zero_iter1 = -1, first_zero_iter2 = -1;
@@ -2577,12 +3851,12 @@ expand_omp_for_generic (struct omp_region *region,
          for (i = first_zero_iter1;
               i < (fd->ordered ? fd->ordered : fd->collapse); i++)
            if (SSA_VAR_P (counts[i]))
-             TREE_NO_WARNING (counts[i]) = 1;
+             suppress_warning (counts[i], OPT_Wuninitialized);
          gsi_prev (&gsi);
          e = split_block (entry_bb, gsi_stmt (gsi));
          entry_bb = e->dest;
          make_edge (zero_iter1_bb, entry_bb, EDGE_FALLTHRU);
-         gsi = gsi_last_bb (entry_bb);
+         gsi = gsi_last_nondebug_bb (entry_bb);
          set_immediate_dominator (CDI_DOMINATORS, entry_bb,
                                   get_immediate_dominator (CDI_DOMINATORS,
                                                            zero_iter1_bb));
@@ -2594,7 +3868,7 @@ expand_omp_for_generic (struct omp_region *region,
             be executed in that case, so just avoid uninit warnings.  */
          for (i = first_zero_iter2; i < fd->ordered; i++)
            if (SSA_VAR_P (counts[i]))
-             TREE_NO_WARNING (counts[i]) = 1;
+             suppress_warning (counts[i], OPT_Wuninitialized);
          if (zero_iter1_bb)
            make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU);
          else
@@ -2603,7 +3877,7 @@ expand_omp_for_generic (struct omp_region *region,
              e = split_block (entry_bb, gsi_stmt (gsi));
              entry_bb = e->dest;
              make_edge (zero_iter2_bb, entry_bb, EDGE_FALLTHRU);
-             gsi = gsi_last_bb (entry_bb);
+             gsi = gsi_last_nondebug_bb (entry_bb);
              set_immediate_dominator (CDI_DOMINATORS, entry_bb,
                                       get_immediate_dominator
                                         (CDI_DOMINATORS, zero_iter2_bb));
@@ -2745,7 +4019,18 @@ expand_omp_for_generic (struct omp_region *region,
            {
              t = fold_convert (fd->iter_type, fd->chunk_size);
              t = omp_adjust_chunk_size (t, fd->simd_schedule);
-             if (fd->ordered)
+             if (sched_arg)
+               {
+                 if (fd->ordered)
+                   t = build_call_expr (builtin_decl_explicit (start_fn),
+                                        8, t0, t1, sched_arg, t, t3, t4,
+                                        reductions, mem);
+                 else
+                   t = build_call_expr (builtin_decl_explicit (start_fn),
+                                        9, t0, t1, t2, sched_arg, t, t3, t4,
+                                        reductions, mem);
+               }
+             else if (fd->ordered)
                t = build_call_expr (builtin_decl_explicit (start_fn),
                                     5, t0, t1, t, t3, t4);
              else
@@ -2778,7 +4063,11 @@ expand_omp_for_generic (struct omp_region *region,
              tree bfn_decl = builtin_decl_explicit (start_fn);
              t = fold_convert (fd->iter_type, fd->chunk_size);
              t = omp_adjust_chunk_size (t, fd->simd_schedule);
-             t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
+             if (sched_arg)
+               t = build_call_expr (bfn_decl, 10, t5, t0, t1, t2, sched_arg,
+                                    t, t3, t4, reductions, mem);
+             else
+               t = build_call_expr (bfn_decl, 7, t5, t0, t1, t2, t, t3, t4);
            }
          else
            t = build_call_expr (builtin_decl_explicit (start_fn),
@@ -2792,11 +4081,23 @@ expand_omp_for_generic (struct omp_region *region,
                                true, GSI_SAME_STMT);
   if (arr && !TREE_STATIC (arr))
     {
-      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
-      TREE_THIS_VOLATILE (clobber) = 1;
+      tree clobber = build_clobber (TREE_TYPE (arr));
       gsi_insert_before (&gsi, gimple_build_assign (arr, clobber),
                         GSI_SAME_STMT);
     }
+  if (fd->have_pointer_condtemp)
+    expand_omp_build_assign (&gsi, condtemp, memv, false);
+  if (fd->have_reductemp)
+    {
+      gimple *g = gsi_stmt (gsi);
+      gsi_remove (&gsi, true);
+      release_ssa_name (gimple_assign_lhs (g));
+
+      entry_bb = region->entry;
+      gsi = gsi_last_nondebug_bb (entry_bb);
+
+      gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
+    }
   gsi_insert_after (&gsi, gimple_build_cond_empty (t), GSI_SAME_STMT);
 
   /* Remove the GIMPLE_OMP_FOR statement.  */
@@ -2856,6 +4157,35 @@ expand_omp_for_generic (struct omp_region *region,
                                NULL_TREE, false, GSI_CONTINUE_LINKING);
   assign_stmt = gimple_build_assign (startvar, t);
   gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+  if (cond_var)
+    {
+      tree itype = TREE_TYPE (cond_var);
+      /* For the lastprivate(conditional:) itervar, we need an iteration
+        counter that starts at unsigned non-zero and increases.
+        Prefer as few IVs as possible, so if we can use startvar
+        itself, use that, or startvar + constant (those would be
+        incremented with step), and as a last resort use s0 + 1
+        incremented by 1.  */
+      if ((fd->ordered && fd->collapse == 1)
+         || bias
+         || POINTER_TYPE_P (type)
+         || TREE_CODE (fd->loop.n1) != INTEGER_CST
+         || fd->loop.cond_code != LT_EXPR)
+       t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, istart0),
+                        build_int_cst (itype, 1));
+      else if (tree_int_cst_sgn (fd->loop.n1) == 1)
+       t = fold_convert (itype, t);
+      else
+       {
+         tree c = fold_convert (itype, fd->loop.n1);
+         c = fold_build2 (MINUS_EXPR, itype, build_int_cst (itype, 1), c);
+         t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, t), c);
+       }
+      t = force_gimple_operand_gsi (&gsi, t, false,
+                                   NULL_TREE, false, GSI_CONTINUE_LINKING);
+      assign_stmt = gimple_build_assign (cond_var, t);
+      gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+    }
 
   t = iend0;
   if (fd->ordered && fd->collapse == 1)
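
The cond_var bookkeeping above implements the iteration counter needed by
OpenMP 5.0's lastprivate(conditional:), where the variable's final value
comes from the last iteration that actually assigned it.  A usage sketch
with hypothetical names:

int
last_match (int n, const int *p)
{
  int x = -1;
  #pragma omp parallel for lastprivate(conditional: x)
  for (int i = 0; i < n; i++)
    if (p[i])
      x = i;
  return x;
}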
@@ -2902,9 +4232,8 @@ expand_omp_for_generic (struct omp_region *region,
          && !OMP_CLAUSE_LINEAR_NO_COPYIN (c))
        {
          tree d = OMP_CLAUSE_DECL (c);
-         bool is_ref = omp_is_reference (d);
          tree t = d, a, dest;
-         if (is_ref)
+         if (omp_privatize_by_reference (t))
            t = build_simple_mem_ref_loc (OMP_CLAUSE_LOCATION (c), t);
          tree type = TREE_TYPE (t);
          if (POINTER_TYPE_P (type))
@@ -2938,11 +4267,10 @@ expand_omp_for_generic (struct omp_region *region,
                           : POINTER_PLUS_EXPR, TREE_TYPE (t), v, a);
          t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
                                        false, GSI_CONTINUE_LINKING);
-         assign_stmt = gimple_build_assign (dest, t);
-         gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+         expand_omp_build_assign (&gsi, dest, t, true);
        }
   if (fd->collapse > 1)
-    expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
+    expand_omp_for_init_vars (fd, &gsi, counts, NULL, inner_stmt, startvar);
 
   if (fd->ordered)
     {
@@ -2992,13 +4320,18 @@ expand_omp_for_generic (struct omp_region *region,
          gsi = gsi_last_bb (l0_bb);
          expand_omp_build_assign (&gsi, counts[fd->collapse - 1],
                                   istart0, true);
-         gsi = gsi_last_bb (cont_bb);
-         t = fold_build2 (PLUS_EXPR, fd->iter_type, counts[fd->collapse - 1],
-                          build_int_cst (fd->iter_type, 1));
-         expand_omp_build_assign (&gsi, counts[fd->collapse - 1], t);
-         tree aref = build4 (ARRAY_REF, fd->iter_type, counts[fd->ordered],
-                             size_zero_node, NULL_TREE, NULL_TREE);
-         expand_omp_build_assign (&gsi, aref, counts[fd->collapse - 1]);
+         if (cont_bb)
+           {
+             gsi = gsi_last_bb (cont_bb);
+             t = fold_build2 (PLUS_EXPR, fd->iter_type,
+                              counts[fd->collapse - 1],
+                              build_int_cst (fd->iter_type, 1));
+             expand_omp_build_assign (&gsi, counts[fd->collapse - 1], t);
+             tree aref = build4 (ARRAY_REF, fd->iter_type,
+                                 counts[fd->ordered], size_zero_node,
+                                 NULL_TREE, NULL_TREE);
+             expand_omp_build_assign (&gsi, aref, counts[fd->collapse - 1]);
+           }
          t = counts[fd->collapse - 1];
        }
       else if (fd->collapse > 1)
@@ -3021,12 +4354,31 @@ expand_omp_for_generic (struct omp_region *region,
     {
       /* Code to control the increment and predicate for the sequential
         loop goes in the CONT_BB.  */
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       gcc_assert (gimple_code (cont_stmt) == GIMPLE_OMP_CONTINUE);
       vmain = gimple_omp_continue_control_use (cont_stmt);
       vback = gimple_omp_continue_control_def (cont_stmt);
 
+      if (cond_var)
+       {
+         tree itype = TREE_TYPE (cond_var);
+         tree t2;
+         if ((fd->ordered && fd->collapse == 1)
+              || bias
+              || POINTER_TYPE_P (type)
+              || TREE_CODE (fd->loop.n1) != INTEGER_CST
+              || fd->loop.cond_code != LT_EXPR)
+           t2 = build_int_cst (itype, 1);
+         else
+           t2 = fold_convert (itype, fd->loop.step);
+         t2 = fold_build2 (PLUS_EXPR, itype, cond_var, t2);
+         t2 = force_gimple_operand_gsi (&gsi, t2, false,
+                                        NULL_TREE, true, GSI_SAME_STMT);
+         assign_stmt = gimple_build_assign (cond_var, t2);
+         gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+       }
+
       if (!gimple_omp_for_combined_p (fd->for_stmt))
        {
          if (POINTER_TYPE_P (type))
@@ -3042,20 +4394,21 @@ expand_omp_for_generic (struct omp_region *region,
 
          if (fd->ordered && counts[fd->collapse - 1] == NULL_TREE)
            {
+             tree tem;
              if (fd->collapse > 1)
-               t = fd->loop.v;
+               tem = fd->loop.v;
              else
                {
-                 t = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->loops[0].v),
-                                  fd->loops[0].v, fd->loops[0].n1);
-                 t = fold_convert (fd->iter_type, t);
+                 tem = fold_build2 (MINUS_EXPR, TREE_TYPE (fd->loops[0].v),
+                                    fd->loops[0].v, fd->loops[0].n1);
+                 tem = fold_convert (fd->iter_type, tem);
                }
              tree aref = build4 (ARRAY_REF, fd->iter_type,
                                  counts[fd->ordered], size_zero_node,
                                  NULL_TREE, NULL_TREE);
-             t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-                                           true, GSI_SAME_STMT);
-             expand_omp_build_assign (&gsi, aref, t);
+             tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE,
+                                             true, GSI_SAME_STMT);
+             expand_omp_build_assign (&gsi, aref, tem);
            }
 
          t = build2 (fd->loop.cond_code, boolean_type_node,
@@ -3069,7 +4422,7 @@ expand_omp_for_generic (struct omp_region *region,
       gsi_remove (&gsi, true);
 
       if (fd->collapse > 1 && !gimple_omp_for_combined_p (fd->for_stmt))
-       collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
+       collapse_bb = extract_omp_for_update_vars (fd, NULL, cont_bb, l1_bb);
 
       /* Emit code to get the next parallel iteration in L2_BB.  */
       gsi = gsi_start_bb (l2_bb);
@@ -3087,7 +4440,7 @@ expand_omp_for_generic (struct omp_region *region,
     }
 
   /* Add the loop cleanup function.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
     t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
   else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
@@ -3095,17 +4448,24 @@ expand_omp_for_generic (struct omp_region *region,
   else
     t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
   gcall *call_stmt = gimple_build_call (t, 0);
-  if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
-  gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
   if (fd->ordered)
     {
       tree arr = counts[fd->ordered];
-      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
-      TREE_THIS_VOLATILE (clobber) = 1;
+      tree clobber = build_clobber (TREE_TYPE (arr));
       gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
                        GSI_SAME_STMT);
     }
+  if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+    {
+      gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
+      if (fd->have_reductemp)
+       {
+         gimple *g = gimple_build_assign (reductions, NOP_EXPR,
+                                          gimple_call_lhs (call_stmt));
+         gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+       }
+    }
+  gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
   gsi_remove (&gsi, true);
 
   /* Connect the new blocks.  */
@@ -3167,10 +4527,13 @@ expand_omp_for_generic (struct omp_region *region,
          gphi_iterator psi;
          for (psi = gsi_start_phis (l3_bb); !gsi_end_p (psi); gsi_next (&psi))
            {
-             source_location locus;
+             location_t locus;
              gphi *nphi;
              gphi *exit_phi = psi.phi ();
 
+             if (virtual_operand_p (gimple_phi_result (exit_phi)))
+               continue;
+
              edge l2_to_l3 = find_edge (l2_bb, l3_bb);
              tree exit_res = PHI_ARG_DEF_FROM_EDGE (exit_phi, l2_to_l3);
 
@@ -3193,7 +4556,7 @@ expand_omp_for_generic (struct omp_region *region,
              add_phi_arg (nphi, exit_res, l2_to_l0, UNKNOWN_LOCATION);
 
              add_phi_arg (inner_phi, new_res, l0_to_l1, UNKNOWN_LOCATION);
-           };
+           }
        }
 
       set_immediate_dominator (CDI_DOMINATORS, l2_bb,
@@ -3208,14 +4571,14 @@ expand_omp_for_generic (struct omp_region *region,
       /* We enter expand_omp_for_generic with a loop.  This original loop may
         have its own loop struct, or it may be part of an outer loop struct
         (which may be the fake loop).  */
-      struct loop *outer_loop = entry_bb->loop_father;
+      class loop *outer_loop = entry_bb->loop_father;
       bool orig_loop_has_loop_struct = l1_bb->loop_father != outer_loop;
 
       add_bb_to_loop (l2_bb, outer_loop);
 
       /* We've added a new loop around the original loop.  Allocate the
         corresponding loop struct.  */
-      struct loop *new_loop = alloc_loop ();
+      class loop *new_loop = alloc_loop ();
       new_loop->header = l0_bb;
       new_loop->latch = l2_bb;
       add_loop (new_loop, outer_loop);
@@ -3225,7 +4588,7 @@ expand_omp_for_generic (struct omp_region *region,
       if (!orig_loop_has_loop_struct
          && !gimple_omp_for_combined_p (fd->for_stmt))
        {
-         struct loop *orig_loop = alloc_loop ();
+         class loop *orig_loop = alloc_loop ();
          orig_loop->header = l1_bb;
          /* The loop may have multiple latches.  */
          add_loop (orig_loop, new_loop);
@@ -3233,6 +4596,127 @@ expand_omp_for_generic (struct omp_region *region,
     }
 }
 
+/* Helper function for expand_omp_for_static_nochunk.  If PTR is NULL,
+   compute the needed allocation size.  If !ALLOC, this is for the team
+   allocations, if ALLOC, for the thread allocation.  SZ is the initial
+   needed size for other purposes, ALLOC_ALIGN the guaranteed alignment
+   of the allocation in bytes and CNT the number of elements of each
+   array; for !ALLOC this is omp_get_num_threads (), for ALLOC the
+   number of iterations handled by the current thread.  If PTR is
+   non-NULL, it is the start of the allocation and this routine shall
+   assign to OMP_CLAUSE_DECL (c) of the _scantemp_ clauses pointers to
+   the corresponding arrays.  */
+
+static tree
+expand_omp_scantemp_alloc (tree clauses, tree ptr, unsigned HOST_WIDE_INT sz,
+                          unsigned HOST_WIDE_INT alloc_align, tree cnt,
+                          gimple_stmt_iterator *gsi, bool alloc)
+{
+  tree eltsz = NULL_TREE;
+  unsigned HOST_WIDE_INT preval = 0;
+  if (ptr && sz)
+    ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (ptr),
+                      ptr, size_int (sz));
+  for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+    if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE__SCANTEMP_
+       && !OMP_CLAUSE__SCANTEMP__CONTROL (c)
+       && (!OMP_CLAUSE__SCANTEMP__ALLOC (c)) != alloc)
+      {
+       tree pointee_type = TREE_TYPE (TREE_TYPE (OMP_CLAUSE_DECL (c)));
+       unsigned HOST_WIDE_INT al = TYPE_ALIGN_UNIT (pointee_type);
+       if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (pointee_type)))
+         {
+           unsigned HOST_WIDE_INT szl
+             = tree_to_uhwi (TYPE_SIZE_UNIT (pointee_type));
+           szl = least_bit_hwi (szl);
+           if (szl)
+             al = MIN (al, szl);
+         }
+       if (ptr == NULL_TREE)
+         {
+           if (eltsz == NULL_TREE)
+             eltsz = TYPE_SIZE_UNIT (pointee_type);
+           else
+             eltsz = size_binop (PLUS_EXPR, eltsz,
+                                 TYPE_SIZE_UNIT (pointee_type));
+         }
+       if (preval == 0 && al <= alloc_align)
+         {
+           unsigned HOST_WIDE_INT diff = ROUND_UP (sz, al) - sz;
+           sz += diff;
+           if (diff && ptr)
+             ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (ptr),
+                                ptr, size_int (diff));
+         }
+       else if (al > preval)
+         {
+           if (ptr)
+             {
+               ptr = fold_convert (pointer_sized_int_node, ptr);
+               ptr = fold_build2 (PLUS_EXPR, pointer_sized_int_node, ptr,
+                                  build_int_cst (pointer_sized_int_node,
+                                                 al - 1));
+               ptr = fold_build2 (BIT_AND_EXPR, pointer_sized_int_node, ptr,
+                                  build_int_cst (pointer_sized_int_node,
+                                                 -(HOST_WIDE_INT) al));
+               ptr = fold_convert (ptr_type_node, ptr);
+             }
+           else
+             sz += al - 1;
+         }
+       if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (pointee_type)))
+         preval = al;
+       else
+         preval = 1;
+       if (ptr)
+         {
+           expand_omp_build_assign (gsi, OMP_CLAUSE_DECL (c), ptr, false);
+           ptr = OMP_CLAUSE_DECL (c);
+           ptr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (ptr), ptr,
+                              size_binop (MULT_EXPR, cnt,
+                                          TYPE_SIZE_UNIT (pointee_type)));
+         }
+      }
+
+  if (ptr == NULL_TREE)
+    {
+      eltsz = size_binop (MULT_EXPR, eltsz, cnt);
+      if (sz)
+       eltsz = size_binop (PLUS_EXPR, eltsz, size_int (sz));
+      return eltsz;
+    }
+  else
+    return ptr;
+}
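
The pointer adjustment above is the usual align-up idiom, shown here in
isolation as a sketch (assumes AL is a power of two; -AL equals
~(AL - 1) in two's complement, matching the BIT_AND_EXPR built above):

#include <stdint.h>

/* Round PTR up to the next multiple of AL bytes.  */
static inline void *
align_up (void *ptr, uintptr_t al)
{
  return (void *) (((uintptr_t) ptr + al - 1) & -al);
}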
+
+/* Return the last _looptemp_ clause if one has been created for
+   lastprivate on distribute parallel for{, simd} or taskloop.
+   FD is the loop data and INNERC should be the second _looptemp_
+   clause (the one holding the end of the range).
+   This is followed by collapse - 1 _looptemp_ clauses for
+   counts[1] and up, and for triangular loops by 4 further
+   _looptemp_ clauses (one for counts[0], one for
+   first_inner_iterations, one for factor and one for adjn1).
+   After this there is optionally one _looptemp_ clause that this
+   function returns.  */
+
+static tree
+find_lastprivate_looptemp (struct omp_for_data *fd, tree innerc)
+{
+  gcc_assert (innerc);
+  int count = fd->collapse - 1;
+  if (fd->non_rect
+      && fd->last_nonrect == fd->first_nonrect + 1
+      && !TYPE_UNSIGNED (TREE_TYPE (fd->loops[fd->last_nonrect].v)))
+    count += 4;
+  for (int i = 0; i < count; i++)
+    {
+      innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
+                               OMP_CLAUSE__LOOPTEMP_);
+      gcc_assert (innerc);
+    }
+  return omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
+                         OMP_CLAUSE__LOOPTEMP_);
+}
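+
+/* E.g. for a triangular collapse(3) loop nest the chain walked above
+   is, in order (an illustrative layout): the end-of-range clause
+   INNERC, then counts[1] and counts[2], then counts[0],
+   first_inner_iterations, factor and adjn1, and finally the optional
+   lastprivate clause that is returned.  */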
+
 /* A subroutine of expand_omp_for.  Generate code for a parallel
    loop with static schedule and no specified chunk size.  Given
    parameters:
@@ -3275,16 +4759,19 @@ expand_omp_for_static_nochunk (struct omp_region *region,
                               struct omp_for_data *fd,
                               gimple *inner_stmt)
 {
-  tree n, q, s0, e0, e, t, tt, nthreads, threadid;
+  tree n, q, s0, e0, e, t, tt, nthreads = NULL_TREE, threadid;
   tree type, itype, vmain, vback;
   basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb;
   basic_block body_bb, cont_bb, collapse_bb = NULL;
-  basic_block fin_bb;
-  gimple_stmt_iterator gsi;
+  basic_block fin_bb, fourth_bb = NULL, fifth_bb = NULL, sixth_bb = NULL;
+  basic_block exit1_bb = NULL, exit2_bb = NULL, exit3_bb = NULL;
+  gimple_stmt_iterator gsi, gsip;
   edge ep;
   bool broken_loop = region->cont == NULL;
   tree *counts = NULL;
   tree n1, n2, step;
+  tree reductions = NULL_TREE;
+  tree cond_var = NULL_TREE, condtemp = NULL_TREE;
 
   itype = type = TREE_TYPE (fd->loop.v);
   if (POINTER_TYPE_P (type))
@@ -3307,8 +4794,10 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   exit_bb = region->exit;
 
   /* Iteration space partitioning goes in ENTRY_BB.  */
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
+  gsip = gsi;
+  gsi_prev (&gsip);
 
   if (fd->collapse > 1)
     {
@@ -3338,7 +4827,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       n2 = force_gimple_operand_gsi (&gsi, n2, true, NULL_TREE,
                                     true, GSI_SAME_STMT);
       gcond *cond_stmt = gimple_build_cond (fd->loop.cond_code, n1, n2,
-                                                NULL_TREE, NULL_TREE);
+                                           NULL_TREE, NULL_TREE);
       gsi_insert_before (&gsi, cond_stmt, GSI_SAME_STMT);
       if (walk_tree (gimple_cond_lhs_ptr (cond_stmt),
                     expand_omp_regimplify_p, NULL, NULL)
@@ -3368,6 +4857,106 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       gsi = gsi_last_bb (entry_bb);
     }
 
+  if (fd->lastprivate_conditional)
+    {
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      tree c = omp_find_clause (clauses, OMP_CLAUSE__CONDTEMP_);
+      if (fd->have_pointer_condtemp)
+       condtemp = OMP_CLAUSE_DECL (c);
+      c = omp_find_clause (OMP_CLAUSE_CHAIN (c), OMP_CLAUSE__CONDTEMP_);
+      cond_var = OMP_CLAUSE_DECL (c);
+    }
+  if (fd->have_reductemp
+      /* For scan, we don't want to reinitialize condtemp before the
+        second loop.  */
+      || (fd->have_pointer_condtemp && !fd->have_scantemp)
+      || fd->have_nonctrl_scantemp)
+    {
+      tree t1 = build_int_cst (long_integer_type_node, 0);
+      tree t2 = build_int_cst (long_integer_type_node, 1);
+      tree t3 = build_int_cstu (long_integer_type_node,
+                               (HOST_WIDE_INT_1U << 31) + 1);
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      gimple_stmt_iterator gsi2 = gsi_none ();
+      gimple *g = NULL;
+      tree mem = null_pointer_node, memv = NULL_TREE;
+      unsigned HOST_WIDE_INT condtemp_sz = 0;
+      unsigned HOST_WIDE_INT alloc_align = 0;
+      if (fd->have_reductemp)
+       {
+         gcc_assert (!fd->have_nonctrl_scantemp);
+         tree c = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_);
+         reductions = OMP_CLAUSE_DECL (c);
+         gcc_assert (TREE_CODE (reductions) == SSA_NAME);
+         g = SSA_NAME_DEF_STMT (reductions);
+         reductions = gimple_assign_rhs1 (g);
+         OMP_CLAUSE_DECL (c) = reductions;
+         gsi2 = gsi_for_stmt (g);
+       }
+      else
+       {
+         if (gsi_end_p (gsip))
+           gsi2 = gsi_after_labels (region->entry);
+         else
+           gsi2 = gsip;
+         reductions = null_pointer_node;
+       }
+      if (fd->have_pointer_condtemp || fd->have_nonctrl_scantemp)
+       {
+         tree type;
+         if (fd->have_pointer_condtemp)
+           type = TREE_TYPE (condtemp);
+         else
+           type = ptr_type_node;
+         memv = create_tmp_var (type);
+         TREE_ADDRESSABLE (memv) = 1;
+         unsigned HOST_WIDE_INT sz = 0;
+         tree size = NULL_TREE;
+         if (fd->have_pointer_condtemp)
+           {
+             sz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (type)));
+             sz *= fd->lastprivate_conditional;
+             condtemp_sz = sz;
+           }
+         if (fd->have_nonctrl_scantemp)
+           {
+             nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
+             gimple *g = gimple_build_call (nthreads, 0);
+             nthreads = create_tmp_var (integer_type_node);
+             gimple_call_set_lhs (g, nthreads);
+             gsi_insert_before (&gsi2, g, GSI_SAME_STMT);
+             nthreads = fold_convert (sizetype, nthreads);
+             alloc_align = TYPE_ALIGN_UNIT (long_long_integer_type_node);
+             size = expand_omp_scantemp_alloc (clauses, NULL_TREE, sz,
+                                               alloc_align, nthreads, NULL,
+                                               false);
+             size = fold_convert (type, size);
+           }
+         else
+           size = build_int_cst (type, sz);
+         expand_omp_build_assign (&gsi2, memv, size, false);
+         mem = build_fold_addr_expr (memv);
+       }
+      tree t
+       = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_LOOP_START),
+                          9, t1, t2, t2, t3, t1, null_pointer_node,
+                          null_pointer_node, reductions, mem);
+      force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                               true, GSI_SAME_STMT);
+      if (fd->have_pointer_condtemp)
+       expand_omp_build_assign (&gsi2, condtemp, memv, false);
+      if (fd->have_nonctrl_scantemp)
+       {
+         tree ptr = fd->have_pointer_condtemp ? condtemp : memv;
+         expand_omp_scantemp_alloc (clauses, ptr, condtemp_sz,
+                                    alloc_align, nthreads, &gsi2, false);
+       }
+      if (fd->have_reductemp)
+       {
+         gsi_remove (&gsi2, true);
+         release_ssa_name (gimple_assign_lhs (g));
+       }
+    }
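+  /* A sketch of the prologue emitted above for reductemp/condtemp/
+     scantemp (illustrative; exact operands depend on the clauses):
+       memv = <requested size in bytes>;
+       GOMP_loop_start (0, 1, 1, (1L << 31) + 1, 0, NULL, NULL,
+                        reductions, &memv);
+       condtemp = memv;
+     where the runtime is expected to store the team allocation back
+     through the last argument.  */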
   switch (gimple_omp_for_kind (fd->for_stmt))
     {
     case GF_OMP_FOR_KIND_FOR:
@@ -3439,7 +5028,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   gsi_insert_before (&gsi, cond_stmt, GSI_SAME_STMT);
 
   second_bb = split_block (entry_bb, cond_stmt)->dest;
-  gsi = gsi_last_bb (second_bb);
+  gsi = gsi_last_nondebug_bb (second_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
 
   gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)),
@@ -3449,9 +5038,75 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
 
   third_bb = split_block (second_bb, assign_stmt)->dest;
-  gsi = gsi_last_bb (third_bb);
+  gsi = gsi_last_nondebug_bb (third_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
 
+  if (fd->have_nonctrl_scantemp)
+    {
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      tree controlp = NULL_TREE, controlb = NULL_TREE;
+      for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE__SCANTEMP_
+           && OMP_CLAUSE__SCANTEMP__CONTROL (c))
+         {
+           if (TREE_TYPE (OMP_CLAUSE_DECL (c)) == boolean_type_node)
+             controlb = OMP_CLAUSE_DECL (c);
+           else
+             controlp = OMP_CLAUSE_DECL (c);
+           if (controlb && controlp)
+             break;
+         }
+      gcc_assert (controlp && controlb);
+      tree cnt = create_tmp_var (sizetype);
+      gimple *g = gimple_build_assign (cnt, NOP_EXPR, q);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      unsigned HOST_WIDE_INT alloc_align = TYPE_ALIGN_UNIT (ptr_type_node);
+      tree sz = expand_omp_scantemp_alloc (clauses, NULL_TREE, 0,
+                                          alloc_align, cnt, NULL, true);
+      tree size = create_tmp_var (sizetype);
+      expand_omp_build_assign (&gsi, size, sz, false);
+      tree cmp = fold_build2 (GT_EXPR, boolean_type_node,
+                             size, size_int (16384));
+      expand_omp_build_assign (&gsi, controlb, cmp);
+      g = gimple_build_cond (NE_EXPR, controlb, boolean_false_node,
+                            NULL_TREE, NULL_TREE);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      fourth_bb = split_block (third_bb, g)->dest;
+      gsi = gsi_last_nondebug_bb (fourth_bb);
+      /* FIXME: Once we have allocators, this should use the allocator.  */
+      g = gimple_build_call (builtin_decl_explicit (BUILT_IN_MALLOC), 1, size);
+      gimple_call_set_lhs (g, controlp);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      expand_omp_scantemp_alloc (clauses, controlp, 0, alloc_align, cnt,
+                                &gsi, true);
+      gsi_prev (&gsi);
+      g = gsi_stmt (gsi);
+      fifth_bb = split_block (fourth_bb, g)->dest;
+      gsi = gsi_last_nondebug_bb (fifth_bb);
+
+      g = gimple_build_call (builtin_decl_implicit (BUILT_IN_STACK_SAVE), 0);
+      gimple_call_set_lhs (g, controlp);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      tree alloca_decl = builtin_decl_explicit (BUILT_IN_ALLOCA_WITH_ALIGN);
+      for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE__SCANTEMP_
+           && OMP_CLAUSE__SCANTEMP__ALLOC (c))
+         {
+           tree tmp = create_tmp_var (sizetype);
+           tree pointee_type = TREE_TYPE (TREE_TYPE (OMP_CLAUSE_DECL (c)));
+           g = gimple_build_assign (tmp, MULT_EXPR, cnt,
+                                    TYPE_SIZE_UNIT (pointee_type));
+           gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+           g = gimple_build_call (alloca_decl, 2, tmp,
+                                  size_int (TYPE_ALIGN (pointee_type)));
+           gimple_call_set_lhs (g, OMP_CLAUSE_DECL (c));
+           gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+         }
+
+      sixth_bb = split_block (fifth_bb, g)->dest;
+      gsi = gsi_last_nondebug_bb (sixth_bb);
+    }
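+
+  /* Shape of the code generated above (an illustrative sketch):
+       controlb = size > 16384;
+       if (controlb)
+         controlp = malloc (size);                      // fourth_bb
+       else
+         {
+           controlp = __builtin_stack_save ();          // fifth_bb
+           <array> = __builtin_alloca_with_align (cnt * eltsz, align);
+         }
+     with the matching free/stack_restore emitted at the exit below.  */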
+
   t = build2 (MULT_EXPR, itype, q, threadid);
   t = build2 (PLUS_EXPR, itype, t, tt);
   s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
@@ -3486,15 +5141,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       if (fd->collapse > 1 && TREE_CODE (fd->loop.n2) != INTEGER_CST
          && gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_DISTRIBUTE)
        {
-         int i;
-         for (i = 1; i < fd->collapse; i++)
-           {
-             innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                                       OMP_CLAUSE__LOOPTEMP_);
-             gcc_assert (innerc);
-           }
-         innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                                   OMP_CLAUSE__LOOPTEMP_);
+         innerc = find_lastprivate_looptemp (fd, innerc);
          if (innerc)
            {
              /* If needed (distribute parallel for with lastprivate),
@@ -3511,7 +5158,12 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   t = fold_convert (itype, s0);
   t = fold_build2 (MULT_EXPR, itype, t, step);
   if (POINTER_TYPE_P (type))
-    t = fold_build_pointer_plus (n1, t);
+    {
+      t = fold_build_pointer_plus (n1, t);
+      if (!POINTER_TYPE_P (TREE_TYPE (startvar))
+         && TYPE_PRECISION (TREE_TYPE (startvar)) > TYPE_PRECISION (type))
+       t = fold_convert (signed_type_for (type), t);
+    }
   else
     t = fold_build2 (PLUS_EXPR, type, t, n1);
   t = fold_convert (TREE_TYPE (startvar), t);
@@ -3521,11 +5173,43 @@ expand_omp_for_static_nochunk (struct omp_region *region,
                                NULL_TREE, false, GSI_CONTINUE_LINKING);
   assign_stmt = gimple_build_assign (startvar, t);
   gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+  if (cond_var)
+    {
+      tree itype = TREE_TYPE (cond_var);
+      /* For a lastprivate(conditional:) itervar, we need an iteration
+         counter that starts at a non-zero unsigned value and increases.
+         Prefer as few IVs as possible: use startvar itself if we can,
+         or startvar + constant (both of which advance by step), and as
+         a last resort fall back to s0 + 1, incremented by 1 on each
+         iteration.  */
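+      /* For example (illustrative): with for (i = 2; i < n; i++) the
+         cond_var simply mirrors startvar (n1 == 2 is positive, hence
+         already non-zero); with for (i = -3; i < n; i++) it becomes
+         startvar + 4 so that it starts at 1; for pointer iterators or
+         non-constant n1 it falls back to s0 + 1.  */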
+      if (POINTER_TYPE_P (type)
+         || TREE_CODE (n1) != INTEGER_CST
+         || fd->loop.cond_code != LT_EXPR)
+       t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, s0),
+                        build_int_cst (itype, 1));
+      else if (tree_int_cst_sgn (n1) == 1)
+       t = fold_convert (itype, t);
+      else
+       {
+         tree c = fold_convert (itype, n1);
+         c = fold_build2 (MINUS_EXPR, itype, build_int_cst (itype, 1), c);
+         t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, t), c);
+       }
+      t = force_gimple_operand_gsi (&gsi, t, false,
+                                   NULL_TREE, false, GSI_CONTINUE_LINKING);
+      assign_stmt = gimple_build_assign (cond_var, t);
+      gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+    }
 
   t = fold_convert (itype, e0);
   t = fold_build2 (MULT_EXPR, itype, t, step);
   if (POINTER_TYPE_P (type))
-    t = fold_build_pointer_plus (n1, t);
+    {
+      t = fold_build_pointer_plus (n1, t);
+      if (!POINTER_TYPE_P (TREE_TYPE (startvar))
+         && TYPE_PRECISION (TREE_TYPE (startvar)) > TYPE_PRECISION (type))
+       t = fold_convert (signed_type_for (type), t);
+    }
   else
     t = fold_build2 (PLUS_EXPR, type, t, n1);
   t = fold_convert (TREE_TYPE (startvar), t);
@@ -3543,6 +5227,7 @@ expand_omp_for_static_nochunk (struct omp_region *region,
     }
   /* Handle linear clause adjustments.  */
   tree itercnt = NULL_TREE;
+  tree *nonrect_bounds = NULL;
   if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_FOR)
     for (tree c = gimple_omp_for_clauses (fd->for_stmt);
         c; c = OMP_CLAUSE_CHAIN (c))
@@ -3550,9 +5235,8 @@ expand_omp_for_static_nochunk (struct omp_region *region,
          && !OMP_CLAUSE_LINEAR_NO_COPYIN (c))
        {
          tree d = OMP_CLAUSE_DECL (c);
-         bool is_ref = omp_is_reference (d);
          tree t = d, a, dest;
-         if (is_ref)
+         if (omp_privatize_by_reference (t))
            t = build_simple_mem_ref_loc (OMP_CLAUSE_LOCATION (c), t);
          if (itercnt == NULL_TREE)
            {
@@ -3581,22 +5265,46 @@ expand_omp_for_static_nochunk (struct omp_region *region,
                           : POINTER_PLUS_EXPR, TREE_TYPE (t), t, a);
          t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
                                        false, GSI_CONTINUE_LINKING);
-         assign_stmt = gimple_build_assign (dest, t);
-         gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+         expand_omp_build_assign (&gsi, dest, t, true);
        }
   if (fd->collapse > 1)
-    expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
+    {
+      if (fd->non_rect)
+       {
+         nonrect_bounds = XALLOCAVEC (tree, fd->last_nonrect + 1);
+         memset (nonrect_bounds, 0, sizeof (tree) * (fd->last_nonrect + 1));
+       }
+      expand_omp_for_init_vars (fd, &gsi, counts, nonrect_bounds, inner_stmt,
+                               startvar);
+    }
 
   if (!broken_loop)
     {
       /* The code controlling the sequential loop replaces the
         GIMPLE_OMP_CONTINUE.  */
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       gcc_assert (gimple_code (cont_stmt) == GIMPLE_OMP_CONTINUE);
       vmain = gimple_omp_continue_control_use (cont_stmt);
       vback = gimple_omp_continue_control_def (cont_stmt);
 
+      if (cond_var)
+       {
+         tree itype = TREE_TYPE (cond_var);
+         tree t2;
+         if (POINTER_TYPE_P (type)
+             || TREE_CODE (n1) != INTEGER_CST
+             || fd->loop.cond_code != LT_EXPR)
+           t2 = build_int_cst (itype, 1);
+         else
+           t2 = fold_convert (itype, step);
+         t2 = fold_build2 (PLUS_EXPR, itype, cond_var, t2);
+         t2 = force_gimple_operand_gsi (&gsi, t2, false,
+                                        NULL_TREE, true, GSI_SAME_STMT);
+         assign_stmt = gimple_build_assign (cond_var, t2);
+         gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+       }
+
       if (!gimple_omp_for_combined_p (fd->for_stmt))
        {
          if (POINTER_TYPE_P (type))
@@ -3620,15 +5328,76 @@ expand_omp_for_static_nochunk (struct omp_region *region,
       gsi_remove (&gsi, true);
 
       if (fd->collapse > 1 && !gimple_omp_for_combined_p (fd->for_stmt))
-       collapse_bb = extract_omp_for_update_vars (fd, cont_bb, body_bb);
+       collapse_bb = extract_omp_for_update_vars (fd, nonrect_bounds,
+                                                  cont_bb, body_bb);
     }
 
   /* Replace the GIMPLE_OMP_RETURN with a barrier, or nothing.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
-      gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT);
+      if (fd->have_reductemp
+         || ((fd->have_pointer_condtemp || fd->have_scantemp)
+             && !fd->have_nonctrl_scantemp))
+       {
+         tree fn;
+         if (t)
+           fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+         else
+           fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+         gcall *g = gimple_build_call (fn, 0);
+         if (t)
+           {
+             gimple_call_set_lhs (g, t);
+             if (fd->have_reductemp)
+               gsi_insert_after (&gsi, gimple_build_assign (reductions,
+                                                            NOP_EXPR, t),
+                                 GSI_SAME_STMT);
+           }
+         gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+       }
+      else
+       gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT);
+    }
+  else if ((fd->have_pointer_condtemp || fd->have_scantemp)
+          && !fd->have_nonctrl_scantemp)
+    {
+      tree fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+      gcall *g = gimple_build_call (fn, 0);
+      gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+    }
+  if (fd->have_scantemp && !fd->have_nonctrl_scantemp)
+    {
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      tree controlp = NULL_TREE, controlb = NULL_TREE;
+      for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+       if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE__SCANTEMP_
+           && OMP_CLAUSE__SCANTEMP__CONTROL (c))
+         {
+           if (TREE_TYPE (OMP_CLAUSE_DECL (c)) == boolean_type_node)
+             controlb = OMP_CLAUSE_DECL (c);
+           else
+             controlp = OMP_CLAUSE_DECL (c);
+           if (controlb && controlp)
+             break;
+         }
+      gcc_assert (controlp && controlb);
+      gimple *g = gimple_build_cond (NE_EXPR, controlb, boolean_false_node,
+                                    NULL_TREE, NULL_TREE);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      exit1_bb = split_block (exit_bb, g)->dest;
+      gsi = gsi_after_labels (exit1_bb);
+      g = gimple_build_call (builtin_decl_explicit (BUILT_IN_FREE), 1,
+                            controlp);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      exit2_bb = split_block (exit1_bb, g)->dest;
+      gsi = gsi_after_labels (exit2_bb);
+      g = gimple_build_call (builtin_decl_implicit (BUILT_IN_STACK_RESTORE), 1,
+                            controlp);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      exit3_bb = split_block (exit2_bb, g)->dest;
+      gsi = gsi_after_labels (exit3_bb);
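+      /* The resulting exit shape is roughly (illustrative):
+           if (controlb)
+             free (controlp);
+           else
+             __builtin_stack_restore (controlp);  */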
     }
   gsi_remove (&gsi, true);
 
@@ -3638,8 +5407,34 @@ expand_omp_for_static_nochunk (struct omp_region *region,
   ep = find_edge (entry_bb, second_bb);
   ep->flags = EDGE_TRUE_VALUE;
   ep->probability = profile_probability::guessed_always ().apply_scale (1, 4);
-  find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
-  find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
+  if (fourth_bb)
+    {
+      ep = make_edge (third_bb, fifth_bb, EDGE_FALSE_VALUE);
+      ep->probability
+       = profile_probability::guessed_always ().apply_scale (1, 2);
+      ep = find_edge (third_bb, fourth_bb);
+      ep->flags = EDGE_TRUE_VALUE;
+      ep->probability
+       = profile_probability::guessed_always ().apply_scale (1, 2);
+      ep = find_edge (fourth_bb, fifth_bb);
+      redirect_edge_and_branch (ep, sixth_bb);
+    }
+  else
+    sixth_bb = third_bb;
+  find_edge (sixth_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
+  find_edge (sixth_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
+  if (exit1_bb)
+    {
+      ep = make_edge (exit_bb, exit2_bb, EDGE_FALSE_VALUE);
+      ep->probability
+       = profile_probability::guessed_always ().apply_scale (1, 2);
+      ep = find_edge (exit_bb, exit1_bb);
+      ep->flags = EDGE_TRUE_VALUE;
+      ep->probability
+       = profile_probability::guessed_always ().apply_scale (1, 2);
+      ep = find_edge (exit1_bb, exit2_bb);
+      redirect_edge_and_branch (ep, exit3_bb);
+    }
 
   if (!broken_loop)
     {
@@ -3667,14 +5462,24 @@ expand_omp_for_static_nochunk (struct omp_region *region,
 
   set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb);
   set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb);
-  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb);
+  if (fourth_bb)
+    {
+      set_immediate_dominator (CDI_DOMINATORS, fifth_bb, third_bb);
+      set_immediate_dominator (CDI_DOMINATORS, sixth_bb, third_bb);
+    }
+  set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, sixth_bb);
 
   set_immediate_dominator (CDI_DOMINATORS, body_bb,
                           recompute_dominator (CDI_DOMINATORS, body_bb));
   set_immediate_dominator (CDI_DOMINATORS, fin_bb,
                           recompute_dominator (CDI_DOMINATORS, fin_bb));
+  if (exit1_bb)
+    {
+      set_immediate_dominator (CDI_DOMINATORS, exit2_bb, exit_bb);
+      set_immediate_dominator (CDI_DOMINATORS, exit3_bb, exit_bb);
+    }
 
-  struct loop *loop = body_bb->loop_father;
+  class loop *loop = body_bb->loop_father;
   if (loop != entry_bb->loop_father)
     {
       gcc_assert (broken_loop || loop->header == body_bb);
@@ -3760,11 +5565,13 @@ expand_omp_for_static_chunk (struct omp_region *region,
   tree type, itype, vmain, vback, vextra;
   basic_block entry_bb, exit_bb, body_bb, seq_start_bb, iter_part_bb;
   basic_block trip_update_bb = NULL, cont_bb, collapse_bb = NULL, fin_bb;
-  gimple_stmt_iterator gsi;
+  gimple_stmt_iterator gsi, gsip;
   edge se;
   bool broken_loop = region->cont == NULL;
   tree *counts = NULL;
   tree n1, n2, step;
+  tree reductions = NULL_TREE;
+  tree cond_var = NULL_TREE, condtemp = NULL_TREE;
 
   itype = type = TREE_TYPE (fd->loop.v);
   if (POINTER_TYPE_P (type))
@@ -3791,8 +5598,10 @@ expand_omp_for_static_chunk (struct omp_region *region,
   exit_bb = region->exit;
 
   /* Trip and adjustment setup goes in ENTRY_BB.  */
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
+  gsip = gsi;
+  gsi_prev (&gsip);
 
   if (fd->collapse > 1)
     {
@@ -3852,6 +5661,69 @@ expand_omp_for_static_chunk (struct omp_region *region,
       gsi = gsi_last_bb (entry_bb);
     }
 
+  if (fd->lastprivate_conditional)
+    {
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      tree c = omp_find_clause (clauses, OMP_CLAUSE__CONDTEMP_);
+      if (fd->have_pointer_condtemp)
+       condtemp = OMP_CLAUSE_DECL (c);
+      c = omp_find_clause (OMP_CLAUSE_CHAIN (c), OMP_CLAUSE__CONDTEMP_);
+      cond_var = OMP_CLAUSE_DECL (c);
+    }
+  if (fd->have_reductemp || fd->have_pointer_condtemp)
+    {
+      tree t1 = build_int_cst (long_integer_type_node, 0);
+      tree t2 = build_int_cst (long_integer_type_node, 1);
+      tree t3 = build_int_cstu (long_integer_type_node,
+                               (HOST_WIDE_INT_1U << 31) + 1);
+      tree clauses = gimple_omp_for_clauses (fd->for_stmt);
+      gimple_stmt_iterator gsi2 = gsi_none ();
+      gimple *g = NULL;
+      tree mem = null_pointer_node, memv = NULL_TREE;
+      if (fd->have_reductemp)
+       {
+         tree c = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_);
+         reductions = OMP_CLAUSE_DECL (c);
+         gcc_assert (TREE_CODE (reductions) == SSA_NAME);
+         g = SSA_NAME_DEF_STMT (reductions);
+         reductions = gimple_assign_rhs1 (g);
+         OMP_CLAUSE_DECL (c) = reductions;
+         gsi2 = gsi_for_stmt (g);
+       }
+      else
+       {
+         if (gsi_end_p (gsip))
+           gsi2 = gsi_after_labels (region->entry);
+         else
+           gsi2 = gsip;
+         reductions = null_pointer_node;
+       }
+      if (fd->have_pointer_condtemp)
+       {
+         tree type = TREE_TYPE (condtemp);
+         memv = create_tmp_var (type);
+         TREE_ADDRESSABLE (memv) = 1;
+         unsigned HOST_WIDE_INT sz
+           = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (type)));
+         sz *= fd->lastprivate_conditional;
+         expand_omp_build_assign (&gsi2, memv, build_int_cst (type, sz),
+                                  false);
+         mem = build_fold_addr_expr (memv);
+       }
+      tree t
+       = build_call_expr (builtin_decl_explicit (BUILT_IN_GOMP_LOOP_START),
+                          9, t1, t2, t2, t3, t1, null_pointer_node,
+                          null_pointer_node, reductions, mem);
+      force_gimple_operand_gsi (&gsi2, t, true, NULL_TREE,
+                               true, GSI_SAME_STMT);
+      if (fd->have_pointer_condtemp)
+       expand_omp_build_assign (&gsi2, condtemp, memv, false);
+      if (fd->have_reductemp)
+       {
+         gsi_remove (&gsi2, true);
+         release_ssa_name (gimple_assign_lhs (g));
+       }
+    }
   switch (gimple_omp_for_kind (fd->for_stmt))
     {
     case GF_OMP_FOR_KIND_FOR:
@@ -3984,15 +5856,7 @@ expand_omp_for_static_chunk (struct omp_region *region,
       if (fd->collapse > 1 && TREE_CODE (fd->loop.n2) != INTEGER_CST
          && gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_DISTRIBUTE)
        {
-         int i;
-         for (i = 1; i < fd->collapse; i++)
-           {
-             innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                                       OMP_CLAUSE__LOOPTEMP_);
-             gcc_assert (innerc);
-           }
-         innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                                   OMP_CLAUSE__LOOPTEMP_);
+         innerc = find_lastprivate_looptemp (fd, innerc);
          if (innerc)
            {
              /* If needed (distribute parallel for with lastprivate),
@@ -4010,7 +5874,12 @@ expand_omp_for_static_chunk (struct omp_region *region,
   t = fold_convert (itype, s0);
   t = fold_build2 (MULT_EXPR, itype, t, step);
   if (POINTER_TYPE_P (type))
-    t = fold_build_pointer_plus (n1, t);
+    {
+      t = fold_build_pointer_plus (n1, t);
+      if (!POINTER_TYPE_P (TREE_TYPE (startvar))
+         && TYPE_PRECISION (TREE_TYPE (startvar)) > TYPE_PRECISION (type))
+       t = fold_convert (signed_type_for (type), t);
+    }
   else
     t = fold_build2 (PLUS_EXPR, type, t, n1);
   t = fold_convert (TREE_TYPE (startvar), t);
@@ -4020,11 +5889,43 @@ expand_omp_for_static_chunk (struct omp_region *region,
                                NULL_TREE, false, GSI_CONTINUE_LINKING);
   assign_stmt = gimple_build_assign (startvar, t);
   gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+  if (cond_var)
+    {
+      tree itype = TREE_TYPE (cond_var);
+      /* For a lastprivate(conditional:) itervar, we need an iteration
+         counter that starts at a non-zero unsigned value and increases.
+         Prefer as few IVs as possible: use startvar itself if we can,
+         or startvar + constant (both of which advance by step), and as
+         a last resort fall back to s0 + 1, incremented by 1 on each
+         iteration.  */
+      if (POINTER_TYPE_P (type)
+         || TREE_CODE (n1) != INTEGER_CST
+         || fd->loop.cond_code != LT_EXPR)
+       t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, s0),
+                        build_int_cst (itype, 1));
+      else if (tree_int_cst_sgn (n1) == 1)
+       t = fold_convert (itype, t);
+      else
+       {
+         tree c = fold_convert (itype, n1);
+         c = fold_build2 (MINUS_EXPR, itype, build_int_cst (itype, 1), c);
+         t = fold_build2 (PLUS_EXPR, itype, fold_convert (itype, t), c);
+       }
+      t = force_gimple_operand_gsi (&gsi, t, false,
+                                   NULL_TREE, false, GSI_CONTINUE_LINKING);
+      assign_stmt = gimple_build_assign (cond_var, t);
+      gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+    }
 
   t = fold_convert (itype, e0);
   t = fold_build2 (MULT_EXPR, itype, t, step);
   if (POINTER_TYPE_P (type))
-    t = fold_build_pointer_plus (n1, t);
+    {
+      t = fold_build_pointer_plus (n1, t);
+      if (!POINTER_TYPE_P (TREE_TYPE (startvar))
+         && TYPE_PRECISION (TREE_TYPE (startvar)) > TYPE_PRECISION (type))
+       t = fold_convert (signed_type_for (type), t);
+    }
   else
     t = fold_build2 (PLUS_EXPR, type, t, n1);
   t = fold_convert (TREE_TYPE (startvar), t);
@@ -4049,9 +5950,8 @@ expand_omp_for_static_chunk (struct omp_region *region,
          && !OMP_CLAUSE_LINEAR_NO_COPYIN (c))
        {
          tree d = OMP_CLAUSE_DECL (c);
-         bool is_ref = omp_is_reference (d);
          tree t = d, a, dest;
-         if (is_ref)
+         if (omp_privatize_by_reference (t))
            t = build_simple_mem_ref_loc (OMP_CLAUSE_LOCATION (c), t);
          tree type = TREE_TYPE (t);
          if (POINTER_TYPE_P (type))
@@ -4087,21 +5987,37 @@ expand_omp_for_static_chunk (struct omp_region *region,
                           : POINTER_PLUS_EXPR, TREE_TYPE (t), v, a);
          t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
                                        false, GSI_CONTINUE_LINKING);
-         assign_stmt = gimple_build_assign (dest, t);
-         gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
+         expand_omp_build_assign (&gsi, dest, t, true);
        }
   if (fd->collapse > 1)
-    expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
+    expand_omp_for_init_vars (fd, &gsi, counts, NULL, inner_stmt, startvar);
 
   if (!broken_loop)
     {
       /* The code controlling the sequential loop goes in CONT_BB,
         replacing the GIMPLE_OMP_CONTINUE.  */
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       vmain = gimple_omp_continue_control_use (cont_stmt);
       vback = gimple_omp_continue_control_def (cont_stmt);
 
+      if (cond_var)
+       {
+         tree itype = TREE_TYPE (cond_var);
+         tree t2;
+         if (POINTER_TYPE_P (type)
+             || TREE_CODE (n1) != INTEGER_CST
+             || fd->loop.cond_code != LT_EXPR)
+           t2 = build_int_cst (itype, 1);
+         else
+           t2 = fold_convert (itype, step);
+         t2 = fold_build2 (PLUS_EXPR, itype, cond_var, t2);
+         t2 = force_gimple_operand_gsi (&gsi, t2, false,
+                                        NULL_TREE, true, GSI_SAME_STMT);
+         assign_stmt = gimple_build_assign (cond_var, t2);
+         gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+       }
+
       if (!gimple_omp_for_combined_p (fd->for_stmt))
        {
          if (POINTER_TYPE_P (type))
@@ -4129,7 +6045,7 @@ expand_omp_for_static_chunk (struct omp_region *region,
       gsi_remove (&gsi, true);
 
       if (fd->collapse > 1 && !gimple_omp_for_combined_p (fd->for_stmt))
-       collapse_bb = extract_omp_for_update_vars (fd, cont_bb, body_bb);
+       collapse_bb = extract_omp_for_update_vars (fd, NULL, cont_bb, body_bb);
 
       /* Trip update code goes into TRIP_UPDATE_BB.  */
       gsi = gsi_start_bb (trip_update_bb);
@@ -4141,11 +6057,36 @@ expand_omp_for_static_chunk (struct omp_region *region,
     }
 
   /* Replace the GIMPLE_OMP_RETURN with a barrier, or nothing.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   if (!gimple_omp_return_nowait_p (gsi_stmt (gsi)))
     {
       t = gimple_omp_return_lhs (gsi_stmt (gsi));
-      gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT);
+      if (fd->have_reductemp || fd->have_pointer_condtemp)
+       {
+         tree fn;
+         if (t)
+           fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+         else
+           fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+         gcall *g = gimple_build_call (fn, 0);
+         if (t)
+           {
+             gimple_call_set_lhs (g, t);
+             if (fd->have_reductemp)
+               gsi_insert_after (&gsi, gimple_build_assign (reductions,
+                                                            NOP_EXPR, t),
+                                 GSI_SAME_STMT);
+           }
+         gsi_insert_after (&gsi, g, GSI_SAME_STMT);
+       }
+      else
+       gsi_insert_after (&gsi, omp_build_barrier (t), GSI_SAME_STMT);
+    }
+  else if (fd->have_pointer_condtemp)
+    {
+      tree fn = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+      gcall *g = gimple_build_call (fn, 0);
+      gsi_insert_after (&gsi, g, GSI_SAME_STMT);
     }
   gsi_remove (&gsi, true);
 
@@ -4203,7 +6144,7 @@ expand_omp_for_static_chunk (struct omp_region *region,
           gsi_next (&psi), ++i)
        {
          gphi *nphi;
-         source_location locus;
+         location_t locus;
 
          phi = psi.phi ();
          if (operand_equal_p (gimple_phi_arg_def (phi, 0),
@@ -4276,8 +6217,8 @@ expand_omp_for_static_chunk (struct omp_region *region,
 
   if (!broken_loop)
     {
-      struct loop *loop = body_bb->loop_father;
-      struct loop *trip_loop = alloc_loop ();
+      class loop *loop = body_bb->loop_father;
+      class loop *trip_loop = alloc_loop ();
       trip_loop->header = iter_part_bb;
       trip_loop->latch = trip_update_bb;
       add_loop (trip_loop, iter_part_bb->loop_father);
@@ -4302,193 +6243,6 @@ expand_omp_for_static_chunk (struct omp_region *region,
     }
 }
 
-/* A subroutine of expand_omp_for.  Generate code for _Cilk_for loop.
-   Given parameters:
-   for (V = N1; V cond N2; V += STEP) BODY;
-
-   where COND is "<" or ">" or "!=", we generate pseudocode
-
-   for (ind_var = low; ind_var < high; ind_var++)
-     {
-       V = n1 + (ind_var * STEP)
-
-       <BODY>
-     }
-
-   In the above pseudocode, low and high are function parameters of the
-   child function.  In the function below, we are inserting a temp.
-   variable that will be making a call to two OMP functions that will not be
-   found in the body of _Cilk_for (since OMP_FOR cannot be mixed
-   with _Cilk_for).  These functions are replaced with low and high
-   by the function that handles taskreg.  */
-
-
-static void
-expand_cilk_for (struct omp_region *region, struct omp_for_data *fd)
-{
-  bool broken_loop = region->cont == NULL;
-  basic_block entry_bb = region->entry;
-  basic_block cont_bb = region->cont;
-
-  gcc_assert (EDGE_COUNT (entry_bb->succs) == 2);
-  gcc_assert (broken_loop
-             || BRANCH_EDGE (entry_bb)->dest == FALLTHRU_EDGE (cont_bb)->dest);
-  basic_block l0_bb = FALLTHRU_EDGE (entry_bb)->dest;
-  basic_block l1_bb, l2_bb;
-
-  if (!broken_loop)
-    {
-      gcc_assert (BRANCH_EDGE (cont_bb)->dest == l0_bb);
-      gcc_assert (EDGE_COUNT (cont_bb->succs) == 2);
-      l1_bb = split_block (cont_bb, last_stmt (cont_bb))->dest;
-      l2_bb = BRANCH_EDGE (entry_bb)->dest;
-    }
-  else
-    {
-      BRANCH_EDGE (entry_bb)->flags &= ~EDGE_ABNORMAL;
-      l1_bb = split_edge (BRANCH_EDGE (entry_bb));
-      l2_bb = single_succ (l1_bb);
-    }
-  basic_block exit_bb = region->exit;
-  basic_block l2_dom_bb = NULL;
-
-  gimple_stmt_iterator gsi = gsi_last_bb (entry_bb);
-
-  /* Below statements until the "tree high_val = ..." are pseudo statements
-     used to pass information to be used by expand_omp_taskreg.
-     low_val and high_val will be replaced by the __low and __high
-     parameter from the child function.
-
-     The call_exprs part is a place-holder, it is mainly used
-     to distinctly identify to the top-level part that this is
-     where we should put low and high (reasoning given in header
-     comment).  */
-
-  gomp_parallel *par_stmt
-    = as_a <gomp_parallel *> (last_stmt (region->outer->entry));
-  tree child_fndecl = gimple_omp_parallel_child_fn (par_stmt);
-  tree t, low_val = NULL_TREE, high_val = NULL_TREE;
-  for (t = DECL_ARGUMENTS (child_fndecl); t; t = TREE_CHAIN (t))
-    {
-      if (id_equal (DECL_NAME (t), "__high"))
-       high_val = t;
-      else if (id_equal (DECL_NAME (t), "__low"))
-       low_val = t;
-    }
-  gcc_assert (low_val && high_val);
-
-  tree type = TREE_TYPE (low_val);
-  tree ind_var = create_tmp_reg (type, "__cilk_ind_var");
-  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
-
-  /* Not needed in SSA form right now.  */
-  gcc_assert (!gimple_in_ssa_p (cfun));
-  if (l2_dom_bb == NULL)
-    l2_dom_bb = l1_bb;
-
-  tree n1 = low_val;
-  tree n2 = high_val;
-
-  gimple *stmt = gimple_build_assign (ind_var, n1);
-
-  /* Replace the GIMPLE_OMP_FOR statement.  */
-  gsi_replace (&gsi, stmt, true);
-
-  if (!broken_loop)
-    {
-      /* Code to control the increment goes in the CONT_BB.  */
-      gsi = gsi_last_bb (cont_bb);
-      stmt = gsi_stmt (gsi);
-      gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE);
-      stmt = gimple_build_assign (ind_var, PLUS_EXPR, ind_var,
-                                 build_one_cst (type));
-
-      /* Replace GIMPLE_OMP_CONTINUE.  */
-      gsi_replace (&gsi, stmt, true);
-    }
-
-  /* Emit the condition in L1_BB.  */
-  gsi = gsi_after_labels (l1_bb);
-  t = fold_build2 (MULT_EXPR, TREE_TYPE (fd->loop.step),
-                  fold_convert (TREE_TYPE (fd->loop.step), ind_var),
-                  fd->loop.step);
-  if (POINTER_TYPE_P (TREE_TYPE (fd->loop.n1)))
-    t = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (fd->loop.n1),
-                    fd->loop.n1, fold_convert (sizetype, t));
-  else
-    t = fold_build2 (PLUS_EXPR, TREE_TYPE (fd->loop.n1),
-                    fd->loop.n1, fold_convert (TREE_TYPE (fd->loop.n1), t));
-  t = fold_convert (TREE_TYPE (fd->loop.v), t);
-  expand_omp_build_assign (&gsi, fd->loop.v, t);
-
-  /* The condition is always '<' since the runtime will fill in the low
-     and high values.  */
-  stmt = gimple_build_cond (LT_EXPR, ind_var, n2, NULL_TREE, NULL_TREE);
-  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
-
-  /* Remove GIMPLE_OMP_RETURN.  */
-  gsi = gsi_last_bb (exit_bb);
-  gsi_remove (&gsi, true);
-
-  /* Connect the new blocks.  */
-  remove_edge (FALLTHRU_EDGE (entry_bb));
-
-  edge e, ne;
-  if (!broken_loop)
-    {
-      remove_edge (BRANCH_EDGE (entry_bb));
-      make_edge (entry_bb, l1_bb, EDGE_FALLTHRU);
-
-      e = BRANCH_EDGE (l1_bb);
-      ne = FALLTHRU_EDGE (l1_bb);
-      e->flags = EDGE_TRUE_VALUE;
-    }
-  else
-    {
-      single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
-
-      ne = single_succ_edge (l1_bb);
-      e = make_edge (l1_bb, l0_bb, EDGE_TRUE_VALUE);
-
-    }
-  ne->flags = EDGE_FALSE_VALUE;
-  e->probability = profile_probability::guessed_always ().apply_scale (7, 8);
-  ne->probability = e->probability.invert ();
-
-  set_immediate_dominator (CDI_DOMINATORS, l1_bb, entry_bb);
-  set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
-  set_immediate_dominator (CDI_DOMINATORS, l0_bb, l1_bb);
-
-  if (!broken_loop)
-    {
-      struct loop *loop = alloc_loop ();
-      loop->header = l1_bb;
-      loop->latch = cont_bb;
-      add_loop (loop, l1_bb->loop_father);
-      loop->safelen = INT_MAX;
-    }
-
-  /* Pick the correct library function based on the precision of the
-     induction variable type.  */
-  tree lib_fun = NULL_TREE;
-  if (TYPE_PRECISION (type) == 32)
-    lib_fun = cilk_for_32_fndecl;
-  else if (TYPE_PRECISION (type) == 64)
-    lib_fun = cilk_for_64_fndecl;
-  else
-    gcc_unreachable ();
-
-  gcc_assert (fd->sched_kind == OMP_CLAUSE_SCHEDULE_CILKFOR);
-
-  /* WS_ARGS contains the library function flavor to call:
-     __libcilkrts_cilk_for_64 or __libcilkrts_cilk_for_32), and the
-     user-defined grain value.  If the user does not define one, then zero
-     is passed in by the parser.  */
-  vec_alloc (region->ws_args, 2);
-  region->ws_args->quick_push (lib_fun);
-  region->ws_args->quick_push (fd->chunk_size);
-}
-
 /* A subroutine of expand_omp_for.  Generate code for a simd non-worksharing
    loop.  Given parameters:
 
@@ -4505,49 +6259,8 @@ expand_cilk_for (struct omp_region *region, struct omp_for_data *fd)
        if (V cond N2) goto L0; else goto L2;
     L2:
 
-    For collapsed loops, given parameters:
-      collapse(3)
-      for (V1 = N11; V1 cond1 N12; V1 += STEP1)
-       for (V2 = N21; V2 cond2 N22; V2 += STEP2)
-         for (V3 = N31; V3 cond3 N32; V3 += STEP3)
-           BODY;
-
-    we generate pseudocode
-
-       if (cond3 is <)
-         adj = STEP3 - 1;
-       else
-         adj = STEP3 + 1;
-       count3 = (adj + N32 - N31) / STEP3;
-       if (cond2 is <)
-         adj = STEP2 - 1;
-       else
-         adj = STEP2 + 1;
-       count2 = (adj + N22 - N21) / STEP2;
-       if (cond1 is <)
-         adj = STEP1 - 1;
-       else
-         adj = STEP1 + 1;
-       count1 = (adj + N12 - N11) / STEP1;
-       count = count1 * count2 * count3;
-       V = 0;
-       V1 = N11;
-       V2 = N21;
-       V3 = N31;
-       goto L1;
-    L0:
-       BODY;
-       V += 1;
-       V3 += STEP3;
-       V2 += (V3 cond3 N32) ? 0 : STEP2;
-       V3 = (V3 cond3 N32) ? V3 : N31;
-       V1 += (V2 cond2 N22) ? 0 : STEP1;
-       V2 = (V2 cond2 N22) ? V2 : N21;
-    L1:
-       if (V < count) goto L0; else goto L2;
-    L2:
-
-      */
+    For collapsed loops, emit the outer loops as scalar
+    and only try to vectorize the innermost loop.  */
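+
+/* For instance, with collapse(2) the expansion is conceptually
+   (an illustrative sketch, not the exact emitted GIMPLE):
+
+     for (V1 = N11; V1 cond1 N12; V1 += STEP1)
+       for (V2 = N21; V2 cond2 N22; V2 += STEP2)
+         BODY;
+
+   with only the innermost loop annotated for vectorization.  */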
 
 static void
 expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
@@ -4562,22 +6275,37 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   tree *counts = NULL;
   int i;
   int safelen_int = INT_MAX;
+  bool dont_vectorize = false;
   tree safelen = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
                                  OMP_CLAUSE_SAFELEN);
   tree simduid = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
                                  OMP_CLAUSE__SIMDUID_);
+  tree ifc = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
+                             OMP_CLAUSE_IF);
+  tree simdlen = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
+                                 OMP_CLAUSE_SIMDLEN);
+  tree condtemp = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
+                                  OMP_CLAUSE__CONDTEMP_);
   tree n1, n2;
+  tree cond_var = condtemp ? OMP_CLAUSE_DECL (condtemp) : NULL_TREE;
 
   if (safelen)
     {
+      poly_uint64 val;
       safelen = OMP_CLAUSE_SAFELEN_EXPR (safelen);
-      if (TREE_CODE (safelen) != INTEGER_CST)
+      if (!poly_int_tree_p (safelen, &val))
        safelen_int = 0;
-      else if (tree_fits_uhwi_p (safelen) && tree_to_uhwi (safelen) < INT_MAX)
-       safelen_int = tree_to_uhwi (safelen);
+      else
+       safelen_int = MIN (constant_lower_bound (val), INT_MAX);
       if (safelen_int == 1)
        safelen_int = 0;
     }
+  if ((ifc && integer_zerop (OMP_CLAUSE_IF_EXPR (ifc)))
+      || (simdlen && integer_onep (OMP_CLAUSE_SIMDLEN_EXPR (simdlen))))
+    {
+      safelen_int = 0;
+      dont_vectorize = true;
+    }
   type = TREE_TYPE (fd->loop.v);
   entry_bb = region->entry;
   cont_bb = region->cont;
@@ -4601,12 +6329,14 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   exit_bb = region->exit;
   l2_dom_bb = NULL;
 
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
 
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
   /* Not needed in SSA form right now.  */
   gcc_assert (!gimple_in_ssa_p (cfun));
-  if (fd->collapse > 1)
+  if (fd->collapse > 1
+      && (gimple_omp_for_combined_into_p (fd->for_stmt)
+         || broken_loop))
     {
       int first_zero_iter = -1, dummy = -1;
       basic_block zero_iter_bb = l2_bb, dummy_bb = NULL;
@@ -4633,6 +6363,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
       n2 = OMP_CLAUSE_DECL (innerc);
     }
   tree step = fd->loop.step;
+  /* STEP may be adjusted below for SIMT; remember the original.  */
+  tree orig_step = step;
 
   bool is_simt = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
                                  OMP_CLAUSE__SIMT_);
@@ -4670,24 +6401,177 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
       step = fold_build2 (MULT_EXPR, TREE_TYPE (step), step, vf);
     }
 
-  expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+  tree n2var = NULL_TREE;
+  tree n2v = NULL_TREE;
+  tree *nonrect_bounds = NULL;
+  tree min_arg1 = NULL_TREE, min_arg2 = NULL_TREE;
   if (fd->collapse > 1)
     {
-      if (gimple_omp_for_combined_into_p (fd->for_stmt))
+      if (broken_loop || gimple_omp_for_combined_into_p (fd->for_stmt))
        {
+         if (fd->non_rect)
+           {
+             nonrect_bounds = XALLOCAVEC (tree, fd->last_nonrect + 1);
+             memset (nonrect_bounds, 0,
+                     sizeof (tree) * (fd->last_nonrect + 1));
+           }
+         expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+         gcc_assert (entry_bb == gsi_bb (gsi));
+         gcc_assert (fd->for_stmt == gsi_stmt (gsi));
          gsi_prev (&gsi);
-         expand_omp_for_init_vars (fd, &gsi, counts, NULL, n1);
-         gsi_next (&gsi);
+         entry_bb = split_block (entry_bb, gsi_stmt (gsi))->dest;
+         expand_omp_for_init_vars (fd, &gsi, counts, nonrect_bounds,
+                                   NULL, n1);
+         gsi = gsi_for_stmt (fd->for_stmt);
        }
+      if (broken_loop)
+       ;
+      else if (gimple_omp_for_combined_into_p (fd->for_stmt))
+       {
+         /* Compute in n2var the limit for the first pass of the
+            innermost loop, i.e. fd->loop.v + MIN (n2 - fd->loop.v, cnt),
+            where cnt is how many iterations the loop would have if all
+            further iterations were assigned to the current task.  */
+         n2var = create_tmp_var (type);
+         i = fd->collapse - 1;
+         tree itype = TREE_TYPE (fd->loops[i].v);
+         if (POINTER_TYPE_P (itype))
+           itype = signed_type_for (itype);
+         t = build_int_cst (itype, (fd->loops[i].cond_code == LT_EXPR
+                                    ? -1 : 1));
+         t = fold_build2 (PLUS_EXPR, itype,
+                          fold_convert (itype, fd->loops[i].step), t);
+         t = fold_build2 (PLUS_EXPR, itype, t,
+                          fold_convert (itype, fd->loops[i].n2));
+         if (fd->loops[i].m2)
+           {
+             tree t2 = fold_convert (itype,
+                                     fd->loops[i - fd->loops[i].outer].v);
+             tree t3 = fold_convert (itype, fd->loops[i].m2);
+             t2 = fold_build2 (MULT_EXPR, TREE_TYPE (t), t2, t3);
+             t = fold_build2 (PLUS_EXPR, itype, t, t2);
+           }
+         t = fold_build2 (MINUS_EXPR, itype, t,
+                          fold_convert (itype, fd->loops[i].v));
+         if (TYPE_UNSIGNED (itype) && fd->loops[i].cond_code == GT_EXPR)
+           t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                            fold_build1 (NEGATE_EXPR, itype, t),
+                            fold_build1 (NEGATE_EXPR, itype,
+                                         fold_convert (itype,
+                                                       fd->loops[i].step)));
+         else
+           t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
+                            fold_convert (itype, fd->loops[i].step));
+         t = fold_convert (type, t);
+         tree t2 = fold_build2 (MINUS_EXPR, type, n2, n1);
+         min_arg1 = create_tmp_var (type);
+         expand_omp_build_assign (&gsi, min_arg1, t2);
+         min_arg2 = create_tmp_var (type);
+         expand_omp_build_assign (&gsi, min_arg2, t);
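+         /* I.e. (illustrative note): min_arg1 = n2 - n1 and
+            min_arg2 = cnt, the two operands of the MIN in the formula
+            above used when forming n2var.  */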
+       }
+      else
+       {
+         if (TREE_CODE (n2) == INTEGER_CST)
+           {
+             /* Indicate for lastprivate handling that at least one iteration
+                has been performed, without wasting runtime.  */
+             if (integer_nonzerop (n2))
+               expand_omp_build_assign (&gsi, fd->loop.v,
+                                        fold_convert (type, n2));
+             else
+               /* Indicate that no iteration has been performed.  */
+               expand_omp_build_assign (&gsi, fd->loop.v,
+                                        build_one_cst (type));
+           }
+         else
+           {
+             expand_omp_build_assign (&gsi, fd->loop.v,
+                                      build_zero_cst (type));
+             expand_omp_build_assign (&gsi, n2, build_one_cst (type));
+           }
+         for (i = 0; i < fd->collapse; i++)
+           {
+             t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1);
+             if (fd->loops[i].m1)
+               {
+                 tree t2
+                   = fold_convert (TREE_TYPE (t),
+                                   fd->loops[i - fd->loops[i].outer].v);
+                 tree t3 = fold_convert (TREE_TYPE (t), fd->loops[i].m1);
+                 t2 = fold_build2 (MULT_EXPR, TREE_TYPE (t), t2, t3);
+                 t = fold_build2 (PLUS_EXPR, TREE_TYPE (t), t, t2);
+               }
+             expand_omp_build_assign (&gsi, fd->loops[i].v, t);
+             /* For normal non-combined collapsed loops just initialize
+                the outermost iterator in the entry_bb.  */
+             if (!broken_loop)
+               break;
+           }
+       }
+    }
+  else
+    expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+  tree altv = NULL_TREE, altn2 = NULL_TREE;
+  if (fd->collapse == 1
+      && !broken_loop
+      && TREE_CODE (orig_step) != INTEGER_CST)
+    {
+      /* The vectorizer currently punts on loops with a non-constant step
+         for the main IV (it cannot compute the number of iterations and
+         gives up because of that).  Since for OpenMP loops it is always
+         possible to compute the number of iterations upfront, use an
+         alternate IV as the loop iterator:
+         altn2 = n1 < n2 ? (n2 - n1 + step - 1) / step : 0;
+         for (i = n1, altv = 0; altv < altn2; altv++, i += step)  */
+      altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+      expand_omp_build_assign (&gsi, altv, build_zero_cst (TREE_TYPE (altv)));
+      tree itype = TREE_TYPE (fd->loop.v);
+      if (POINTER_TYPE_P (itype))
+       itype = signed_type_for (itype);
+      t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1));
+      t = fold_build2 (PLUS_EXPR, itype,
+                      fold_convert (itype, step), t);
+      t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2));
+      t = fold_build2 (MINUS_EXPR, itype, t,
+                      fold_convert (itype, fd->loop.v));
+      if (TYPE_UNSIGNED (itype) && fd->loop.cond_code == GT_EXPR)
+       t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                        fold_build1 (NEGATE_EXPR, itype, t),
+                        fold_build1 (NEGATE_EXPR, itype,
+                                     fold_convert (itype, step)));
+      else
+       t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
+                        fold_convert (itype, step));
+      t = fold_convert (TREE_TYPE (altv), t);
+      altn2 = create_tmp_var (TREE_TYPE (altv));
+      expand_omp_build_assign (&gsi, altn2, t);
+      tree t2 = fold_convert (TREE_TYPE (fd->loop.v), n2);
+      t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+                                    true, GSI_SAME_STMT);
+      t2 = fold_build2 (fd->loop.cond_code, boolean_type_node, fd->loop.v, t2);
+      gassign *g = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+                                       build_zero_cst (TREE_TYPE (altv)));
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+    }
+  else if (fd->collapse > 1
+          && !broken_loop
+          && !gimple_omp_for_combined_into_p (fd->for_stmt)
+          && TREE_CODE (fd->loops[fd->collapse - 1].step) != INTEGER_CST)
+    {
+      altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+      altn2 = create_tmp_var (TREE_TYPE (altv));
+    }
+  if (cond_var)
+    {
+      if (POINTER_TYPE_P (type)
+         || TREE_CODE (n1) != INTEGER_CST
+         || fd->loop.cond_code != LT_EXPR
+         || tree_int_cst_sgn (n1) != 1)
+       expand_omp_build_assign (&gsi, cond_var,
+                                build_one_cst (TREE_TYPE (cond_var)));
       else
-       for (i = 0; i < fd->collapse; i++)
-         {
-           tree itype = TREE_TYPE (fd->loops[i].v);
-           if (POINTER_TYPE_P (itype))
-             itype = signed_type_for (itype);
-           t = fold_convert (TREE_TYPE (fd->loops[i].v), fd->loops[i].n1);
-           expand_omp_build_assign (&gsi, fd->loops[i].v, t);
-         }
+       expand_omp_build_assign (&gsi, cond_var,
+                                fold_convert (TREE_TYPE (cond_var), n1));
     }
 
   /* Remove the GIMPLE_OMP_FOR statement.  */
@@ -4696,15 +6580,27 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   if (!broken_loop)
     {
       /* Code to control the increment goes in the CONT_BB.  */
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       stmt = gsi_stmt (gsi);
       gcc_assert (gimple_code (stmt) == GIMPLE_OMP_CONTINUE);
 
-      if (POINTER_TYPE_P (type))
-       t = fold_build_pointer_plus (fd->loop.v, step);
-      else
-       t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
-      expand_omp_build_assign (&gsi, fd->loop.v, t);
+      if (fd->collapse == 1
+         || gimple_omp_for_combined_into_p (fd->for_stmt))
+       {
+         if (POINTER_TYPE_P (type))
+           t = fold_build_pointer_plus (fd->loop.v, step);
+         else
+           t = fold_build2 (PLUS_EXPR, type, fd->loop.v, step);
+         expand_omp_build_assign (&gsi, fd->loop.v, t);
+       }
+      else if (TREE_CODE (n2) != INTEGER_CST)
+       expand_omp_build_assign (&gsi, fd->loop.v, build_one_cst (type));
+      if (altv)
+       {
+         t = fold_build2 (PLUS_EXPR, TREE_TYPE (altv), altv,
+                          build_one_cst (TREE_TYPE (altv)));
+         expand_omp_build_assign (&gsi, altv, t);
+       }
 
       if (fd->collapse > 1)
        {
@@ -4722,33 +6618,19 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
                               fd->loops[i].v, t);
            }
          expand_omp_build_assign (&gsi, fd->loops[i].v, t);
-
-         for (i = fd->collapse - 1; i > 0; i--)
-           {
-             tree itype = TREE_TYPE (fd->loops[i].v);
-             tree itype2 = TREE_TYPE (fd->loops[i - 1].v);
-             if (POINTER_TYPE_P (itype2))
-               itype2 = signed_type_for (itype2);
-             t = build3 (COND_EXPR, itype2,
-                         build2 (fd->loops[i].cond_code, boolean_type_node,
-                                 fd->loops[i].v,
-                                 fold_convert (itype, fd->loops[i].n2)),
-                         build_int_cst (itype2, 0),
-                         fold_convert (itype2, fd->loops[i - 1].step));
-             if (POINTER_TYPE_P (TREE_TYPE (fd->loops[i - 1].v)))
-               t = fold_build_pointer_plus (fd->loops[i - 1].v, t);
-             else
-               t = fold_build2 (PLUS_EXPR, itype2, fd->loops[i - 1].v, t);
-             expand_omp_build_assign (&gsi, fd->loops[i - 1].v, t);
-
-             t = build3 (COND_EXPR, itype,
-                         build2 (fd->loops[i].cond_code, boolean_type_node,
-                                 fd->loops[i].v,
-                                 fold_convert (itype, fd->loops[i].n2)),
-                         fd->loops[i].v,
-                         fold_convert (itype, fd->loops[i].n1));
-             expand_omp_build_assign (&gsi, fd->loops[i].v, t);
-           }
+       }
+      if (cond_var)
+       {
+         if (POINTER_TYPE_P (type)
+             || TREE_CODE (n1) != INTEGER_CST
+             || fd->loop.cond_code != LT_EXPR
+             || tree_int_cst_sgn (n1) != 1)
+           t = fold_build2 (PLUS_EXPR, TREE_TYPE (cond_var), cond_var,
+                            build_one_cst (TREE_TYPE (cond_var)));
+         else
+           t = fold_build2 (PLUS_EXPR, TREE_TYPE (cond_var), cond_var,
+                            fold_convert (TREE_TYPE (cond_var), step));
+         expand_omp_build_assign (&gsi, cond_var, t);
        }
 
       /* Remove GIMPLE_OMP_CONTINUE.  */
@@ -4758,14 +6640,40 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   /* Emit the condition in L1_BB.  */
   gsi = gsi_start_bb (l1_bb);
 
-  t = fold_convert (type, n2);
-  t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-                               false, GSI_CONTINUE_LINKING);
-  tree v = fd->loop.v;
-  if (DECL_P (v) && TREE_ADDRESSABLE (v))
-    v = force_gimple_operand_gsi (&gsi, v, true, NULL_TREE,
-                                 false, GSI_CONTINUE_LINKING);
-  t = build2 (fd->loop.cond_code, boolean_type_node, v, t);
+  if (altv)
+    t = build2 (LT_EXPR, boolean_type_node, altv, altn2);
+  else if (fd->collapse > 1
+          && !gimple_omp_for_combined_into_p (fd->for_stmt)
+          && !broken_loop)
+    {
+      i = fd->collapse - 1;
+      tree itype = TREE_TYPE (fd->loops[i].v);
+      if (fd->loops[i].m2)
+       t = n2v = create_tmp_var (itype);
+      else
+       t = fold_convert (itype, fd->loops[i].n2);
+      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                                   false, GSI_CONTINUE_LINKING);
+      tree v = fd->loops[i].v;
+      if (DECL_P (v) && TREE_ADDRESSABLE (v))
+       v = force_gimple_operand_gsi (&gsi, v, true, NULL_TREE,
+                                     false, GSI_CONTINUE_LINKING);
+      t = build2 (fd->loops[i].cond_code, boolean_type_node, v, t);
+    }
+  else
+    {
+      if (fd->collapse > 1 && !broken_loop)
+       t = n2var;
+      else
+       t = fold_convert (type, n2);
+      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                                   false, GSI_CONTINUE_LINKING);
+      tree v = fd->loop.v;
+      if (DECL_P (v) && TREE_ADDRESSABLE (v))
+       v = force_gimple_operand_gsi (&gsi, v, true, NULL_TREE,
+                                     false, GSI_CONTINUE_LINKING);
+      t = build2 (fd->loop.cond_code, boolean_type_node, v, t);
+    }
   cond_stmt = gimple_build_cond_empty (t);
   gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
   if (walk_tree (gimple_cond_lhs_ptr (cond_stmt), expand_omp_regimplify_p,
@@ -4781,7 +6689,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   if (is_simt)
     {
       gsi = gsi_start_bb (l2_bb);
-      step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step);
+      step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), orig_step, step);
       if (POINTER_TYPE_P (type))
        t = fold_build_pointer_plus (fd->loop.v, step);
       else
@@ -4790,7 +6698,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
     }
 
   /* Remove GIMPLE_OMP_RETURN.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   gsi_remove (&gsi, true);
 
   /* Connect the new blocks.  */
@@ -4830,15 +6738,216 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
       FALLTHRU_EDGE (entry_bb)->flags = EDGE_TRUE_VALUE;
       FALLTHRU_EDGE (entry_bb)->probability
         = profile_probability::guessed_always ().apply_scale (7, 8);
-      BRANCH_EDGE (entry_bb)->probability 
+      BRANCH_EDGE (entry_bb)->probability
         = FALLTHRU_EDGE (entry_bb)->probability.invert ();
       l2_dom_bb = entry_bb;
     }
   set_immediate_dominator (CDI_DOMINATORS, l2_bb, l2_dom_bb);
 
+  if (!broken_loop && fd->collapse > 1)
+    {
+      basic_block last_bb = l1_bb;
+      basic_block init_bb = NULL;
+      for (i = fd->collapse - 2; i >= 0; i--)
+       {
+         tree nextn2v = NULL_TREE;
+         if (EDGE_SUCC (last_bb, 0)->flags & EDGE_FALSE_VALUE)
+           e = EDGE_SUCC (last_bb, 0);
+         else
+           e = EDGE_SUCC (last_bb, 1);
+         basic_block bb = split_edge (e);
+         if (POINTER_TYPE_P (TREE_TYPE (fd->loops[i].v)))
+           {
+             t = fold_convert (sizetype, fd->loops[i].step);
+             t = fold_build_pointer_plus (fd->loops[i].v, t);
+           }
+         else
+           {
+             t = fold_convert (TREE_TYPE (fd->loops[i].v),
+                               fd->loops[i].step);
+             t = fold_build2 (PLUS_EXPR, TREE_TYPE (fd->loops[i].v),
+                              fd->loops[i].v, t);
+           }
+         gsi = gsi_after_labels (bb);
+         expand_omp_build_assign (&gsi, fd->loops[i].v, t);
+
+         bb = split_block (bb, last_stmt (bb))->dest;
+         gsi = gsi_start_bb (bb);
+         tree itype = TREE_TYPE (fd->loops[i].v);
+         if (fd->loops[i].m2)
+           t = nextn2v = create_tmp_var (itype);
+         else
+           t = fold_convert (itype, fd->loops[i].n2);
+         t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+                                       false, GSI_CONTINUE_LINKING);
+         tree v = fd->loops[i].v;
+         if (DECL_P (v) && TREE_ADDRESSABLE (v))
+           v = force_gimple_operand_gsi (&gsi, v, true, NULL_TREE,
+                                         false, GSI_CONTINUE_LINKING);
+         t = build2 (fd->loops[i].cond_code, boolean_type_node, v, t);
+         cond_stmt = gimple_build_cond_empty (t);
+         gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+         if (walk_tree (gimple_cond_lhs_ptr (cond_stmt),
+                        expand_omp_regimplify_p, NULL, NULL)
+             || walk_tree (gimple_cond_rhs_ptr (cond_stmt),
+                           expand_omp_regimplify_p, NULL, NULL))
+           {
+             gsi = gsi_for_stmt (cond_stmt);
+             gimple_regimplify_operands (cond_stmt, &gsi);
+           }
+         ne = single_succ_edge (bb);
+         ne->flags = EDGE_FALSE_VALUE;
+
+         init_bb = create_empty_bb (bb);
+         set_immediate_dominator (CDI_DOMINATORS, init_bb, bb);
+         add_bb_to_loop (init_bb, bb->loop_father);
+         e = make_edge (bb, init_bb, EDGE_TRUE_VALUE);
+         e->probability
+           = profile_probability::guessed_always ().apply_scale (7, 8);
+         ne->probability = e->probability.invert ();
+
+         gsi = gsi_after_labels (init_bb);
+         t = fold_convert (TREE_TYPE (fd->loops[i + 1].v),
+                           fd->loops[i + 1].n1);
+         if (fd->loops[i + 1].m1)
+           {
+             tree t2 = fold_convert (TREE_TYPE (t),
+                                     fd->loops[i + 1
+                                               - fd->loops[i + 1].outer].v);
+             tree t3 = fold_convert (TREE_TYPE (t), fd->loops[i + 1].m1);
+             t2 = fold_build2 (MULT_EXPR, TREE_TYPE (t), t2, t3);
+             t = fold_build2 (PLUS_EXPR, TREE_TYPE (t), t, t2);
+           }
+         expand_omp_build_assign (&gsi, fd->loops[i + 1].v, t);
+         if (fd->loops[i + 1].m2)
+           {
+             if (i + 2 == fd->collapse && (n2var || altv))
+               {
+                 gcc_assert (n2v == NULL_TREE);
+                 n2v = create_tmp_var (TREE_TYPE (fd->loops[i + 1].v));
+               }
+             t = fold_convert (TREE_TYPE (fd->loops[i + 1].v),
+                               fd->loops[i + 1].n2);
+             tree t2 = fold_convert (TREE_TYPE (t),
+                                     fd->loops[i + 1
+                                               - fd->loops[i + 1].outer].v);
+             tree t3 = fold_convert (TREE_TYPE (t), fd->loops[i + 1].m2);
+             t2 = fold_build2 (MULT_EXPR, TREE_TYPE (t), t2, t3);
+             t = fold_build2 (PLUS_EXPR, TREE_TYPE (t), t, t2);
+             expand_omp_build_assign (&gsi, n2v, t);
+           }
+         if (i + 2 == fd->collapse && n2var)
+           {
+             /* For composite simd, n2 is the first iteration the current
+                task shouldn't already handle, so we effectively want to use
+                for (V3 = N31; V < N2 && V3 < N32; V++, V3 += STEP3)
+                as the vectorized loop.  Except the vectorizer will not
+                vectorize that, so instead compute N2VAR as
+                N2VAR = V + MIN (N2 - V, COUNTS3) and use
+                for (V3 = N31; V < N2VAR; V++, V3 += STEP3)
+                as the loop to vectorize.  */
+             tree t2 = fold_build2 (MINUS_EXPR, type, n2, fd->loop.v);
+             if (fd->loops[i + 1].m1 || fd->loops[i + 1].m2)
+               {
+                 t = build_int_cst (itype, (fd->loops[i + 1].cond_code
+                                            == LT_EXPR ? -1 : 1));
+                 t = fold_build2 (PLUS_EXPR, itype,
+                                  fold_convert (itype,
+                                                fd->loops[i + 1].step), t);
+                 if (fd->loops[i + 1].m2)
+                   t = fold_build2 (PLUS_EXPR, itype, t, n2v);
+                 else
+                   t = fold_build2 (PLUS_EXPR, itype, t,
+                                    fold_convert (itype,
+                                                  fd->loops[i + 1].n2));
+                 t = fold_build2 (MINUS_EXPR, itype, t,
+                                  fold_convert (itype, fd->loops[i + 1].v));
+                 tree step = fold_convert (itype, fd->loops[i + 1].step);
+                 if (TYPE_UNSIGNED (itype)
+                     && fd->loops[i + 1].cond_code == GT_EXPR)
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                                    fold_build1 (NEGATE_EXPR, itype, t),
+                                    fold_build1 (NEGATE_EXPR, itype, step));
+                 else
+                   t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+                 t = fold_convert (type, t);
+               }
+             else
+               t = counts[i + 1];
+             expand_omp_build_assign (&gsi, min_arg1, t2);
+             expand_omp_build_assign (&gsi, min_arg2, t);
+             e = split_block (init_bb, last_stmt (init_bb));
+             gsi = gsi_after_labels (e->dest);
+             init_bb = e->dest;
+             remove_edge (FALLTHRU_EDGE (entry_bb));
+             make_edge (entry_bb, init_bb, EDGE_FALLTHRU);
+             set_immediate_dominator (CDI_DOMINATORS, init_bb, entry_bb);
+             set_immediate_dominator (CDI_DOMINATORS, l1_bb, init_bb);
+             t = fold_build2 (MIN_EXPR, type, min_arg1, min_arg2);
+             t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t);
+             expand_omp_build_assign (&gsi, n2var, t);
+           }
+         if (i + 2 == fd->collapse && altv)
+           {
+             /* The vectorizer currently punts on loops with a non-constant
+                step for the main IV (it cannot compute the number of
+                iterations and gives up because of that).  Since for OpenMP
+                loops it is always possible to compute the number of
+                iterations upfront, use an alternate IV as the loop
+                iterator.  */
+             expand_omp_build_assign (&gsi, altv,
+                                      build_zero_cst (TREE_TYPE (altv)));
+             tree itype = TREE_TYPE (fd->loops[i + 1].v);
+             if (POINTER_TYPE_P (itype))
+               itype = signed_type_for (itype);
+             t = build_int_cst (itype, (fd->loops[i + 1].cond_code == LT_EXPR
+                                        ? -1 : 1));
+             t = fold_build2 (PLUS_EXPR, itype,
+                              fold_convert (itype, fd->loops[i + 1].step), t);
+             t = fold_build2 (PLUS_EXPR, itype, t,
+                              fold_convert (itype,
+                                            fd->loops[i + 1].m2
+                                            ? n2v : fd->loops[i + 1].n2));
+             t = fold_build2 (MINUS_EXPR, itype, t,
+                              fold_convert (itype, fd->loops[i + 1].v));
+             tree step = fold_convert (itype, fd->loops[i + 1].step);
+             if (TYPE_UNSIGNED (itype)
+                 && fd->loops[i + 1].cond_code == GT_EXPR)
+               t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                                fold_build1 (NEGATE_EXPR, itype, t),
+                                fold_build1 (NEGATE_EXPR, itype, step));
+             else
+               t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+             t = fold_convert (TREE_TYPE (altv), t);
+             expand_omp_build_assign (&gsi, altn2, t);
+             tree t2 = fold_convert (TREE_TYPE (fd->loops[i + 1].v),
+                                     fd->loops[i + 1].m2
+                                     ? n2v : fd->loops[i + 1].n2);
+             t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             t2 = fold_build2 (fd->loops[i + 1].cond_code, boolean_type_node,
+                               fd->loops[i + 1].v, t2);
+             gassign *g
+               = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+                                      build_zero_cst (TREE_TYPE (altv)));
+             gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+           }
+         n2v = nextn2v;
+
+         make_edge (init_bb, last_bb, EDGE_FALLTHRU);
+         if (!gimple_omp_for_combined_into_p (fd->for_stmt))
+           {
+             e = find_edge (entry_bb, last_bb);
+             redirect_edge_succ (e, bb);
+             set_immediate_dominator (CDI_DOMINATORS, bb, entry_bb);
+             set_immediate_dominator (CDI_DOMINATORS, last_bb, init_bb);
+           }
+
+         last_bb = bb;
+       }
+    }
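
To ground the N2VAR comment in the block above, a hedged C rendering of the
rewrite (an illustration only, not emitted code; composite_chunk and body are
invented names, the rest follow the comment's N2/V/N31/STEP3/COUNTS3):

    extern void body (long, long);

    static void
    composite_chunk (long v, long n2, long n31, long step3, long counts3)
    {
      long min_arg1 = n2 - v;      /* outer iterations left for V */
      long min_arg2 = counts3;     /* trip count of the inner loop */
      long n2var = v + (min_arg1 < min_arg2 ? min_arg1 : min_arg2);
      /* Single bound the vectorizer can handle, instead of
         'V < N2 && V3 < N32'.  */
      for (long v3 = n31; v < n2var; v++, v3 += step3)
        body (v, v3);
    }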
   if (!broken_loop)
     {
-      struct loop *loop = alloc_loop ();
+      class loop *loop = alloc_loop ();
       loop->header = l1_bb;
       loop->latch = cont_bb;
       add_loop (loop, l1_bb->loop_father);
@@ -4851,14 +6960,22 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
       /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
         the loop.  */
       if ((flag_tree_loop_vectorize
-          || (!global_options_set.x_flag_tree_loop_vectorize
-              && !global_options_set.x_flag_tree_vectorize))
+          || !global_options_set.x_flag_tree_loop_vectorize)
          && flag_tree_loop_optimize
          && loop->safelen > 1)
        {
          loop->force_vectorize = true;
+         if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)))
+           {
+             unsigned HOST_WIDE_INT v
+               = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen));
+             if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen)
+               loop->simdlen = v;
+           }
          cfun->has_force_vectorize_loops = true;
        }
+      else if (dont_vectorize)
+       loop->dont_vectorize = true;
     }
   else if (simduid)
     cfun->has_simduid_loops = true;
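
A hedged example of the clauses the simdlen handling above services (assumed
source): the 8 lands in loop->simdlen because it is below INT_MAX and does not
exceed the safelen-derived loop->safelen.

    void
    f (float *a, float *b, int n)
    {
      #pragma omp simd simdlen(8) safelen(16)
      for (int i = 0; i < n; i++)
        a[i] += b[i];
    }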
@@ -4917,7 +7034,7 @@ expand_omp_taskloop_for_outer (struct omp_region *region,
   gcc_assert (BRANCH_EDGE (entry_bb)->dest == FALLTHRU_EDGE (cont_bb)->dest);
   exit_bb = region->exit;
 
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
   gimple *for_stmt = gsi_stmt (gsi);
   gcc_assert (gimple_code (for_stmt) == GIMPLE_OMP_FOR);
   if (fd->collapse > 1)
@@ -4937,7 +7054,7 @@ expand_omp_taskloop_for_outer (struct omp_region *region,
             be executed in that case, so just avoid uninit warnings.  */
          for (i = first_zero_iter; i < fd->collapse; i++)
            if (SSA_VAR_P (counts[i]))
-             TREE_NO_WARNING (counts[i]) = 1;
+             suppress_warning (counts[i], OPT_Wuninitialized);
          gsi_prev (&gsi);
          edge e = split_block (entry_bb, gsi_stmt (gsi));
          entry_bb = e->dest;
@@ -4981,15 +7098,7 @@ expand_omp_taskloop_for_outer (struct omp_region *region,
   tree endvar = OMP_CLAUSE_DECL (innerc);
   if (fd->collapse > 1 && TREE_CODE (fd->loop.n2) != INTEGER_CST)
     {
-      gcc_assert (innerc);
-      for (i = 1; i < fd->collapse; i++)
-       {
-         innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                                   OMP_CLAUSE__LOOPTEMP_);
-         gcc_assert (innerc);
-       }
-      innerc = omp_find_clause (OMP_CLAUSE_CHAIN (innerc),
-                               OMP_CLAUSE__LOOPTEMP_);
+      innerc = find_lastprivate_looptemp (fd, innerc);
       if (innerc)
        {
          /* If needed (inner taskloop has lastprivate clause), propagate
@@ -5012,16 +7121,16 @@ expand_omp_taskloop_for_outer (struct omp_region *region,
   assign_stmt = gimple_build_assign (endvar, t1);
   gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
   if (fd->collapse > 1)
-    expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
+    expand_omp_for_init_vars (fd, &gsi, counts, NULL, inner_stmt, startvar);
 
   /* Remove the GIMPLE_OMP_FOR statement.  */
   gsi = gsi_for_stmt (for_stmt);
   gsi_remove (&gsi, true);
 
-  gsi = gsi_last_bb (cont_bb);
+  gsi = gsi_last_nondebug_bb (cont_bb);
   gsi_remove (&gsi, true);
 
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   gsi_remove (&gsi, true);
 
   FALLTHRU_EDGE (entry_bb)->probability = profile_probability::always ();
@@ -5095,7 +7204,7 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
   exit_bb = region->exit;
 
   /* Iteration space partitioning goes in ENTRY_BB.  */
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
 
   if (fd->collapse > 1)
@@ -5167,14 +7276,26 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
        assign_stmt = gimple_build_assign (fd->loop.v, NOP_EXPR, e);
       gsi_insert_after (&gsi, assign_stmt, GSI_CONTINUE_LINKING);
     }
+
+  tree *nonrect_bounds = NULL;
   if (fd->collapse > 1)
-    expand_omp_for_init_vars (fd, &gsi, counts, inner_stmt, startvar);
+    {
+      if (fd->non_rect)
+       {
+         nonrect_bounds = XALLOCAVEC (tree, fd->last_nonrect + 1);
+         memset (nonrect_bounds, 0, sizeof (tree) * (fd->last_nonrect + 1));
+       }
+      gcc_assert (gsi_bb (gsi) == entry_bb);
+      expand_omp_for_init_vars (fd, &gsi, counts, nonrect_bounds, inner_stmt,
+                               startvar);
+      entry_bb = gsi_bb (gsi);
+    }
 
   if (!broken_loop)
     {
       /* The code controlling the sequential loop replaces the
         GIMPLE_OMP_CONTINUE.  */
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       gcc_assert (gimple_code (cont_stmt) == GIMPLE_OMP_CONTINUE);
       vmain = gimple_omp_continue_control_use (cont_stmt);
@@ -5203,7 +7324,8 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
       gsi_remove (&gsi, true);
 
       if (fd->collapse > 1 && !gimple_omp_for_combined_p (fd->for_stmt))
-       collapse_bb = extract_omp_for_update_vars (fd, cont_bb, body_bb);
+       collapse_bb = extract_omp_for_update_vars (fd, nonrect_bounds,
+                                                  cont_bb, body_bb);
     }
 
   /* Remove the GIMPLE_OMP_FOR statement.  */
@@ -5211,7 +7333,7 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
   gsi_remove (&gsi, true);
 
   /* Remove the GIMPLE_OMP_RETURN statement.  */
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   gsi_remove (&gsi, true);
 
   FALLTHRU_EDGE (entry_bb)->probability = profile_probability::always ();
@@ -5251,7 +7373,7 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
 
   if (!broken_loop && !gimple_omp_for_combined_p (fd->for_stmt))
     {
-      struct loop *loop = alloc_loop ();
+      class loop *loop = alloc_loop ();
       loop->header = body_bb;
       if (collapse_bb == NULL)
        loop->latch = cont_bb;
@@ -5306,6 +7428,21 @@ expand_omp_taskloop_for_inner (struct omp_region *region,
 static void
 expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 {
+  bool is_oacc_kernels_parallelized
+    = (lookup_attribute ("oacc kernels parallelized",
+                        DECL_ATTRIBUTES (current_function_decl)) != NULL);
+  {
+    bool is_oacc_kernels
+      = (lookup_attribute ("oacc kernels",
+                          DECL_ATTRIBUTES (current_function_decl)) != NULL);
+    if (is_oacc_kernels_parallelized)
+      gcc_checking_assert (is_oacc_kernels);
+  }
+  gcc_assert (gimple_in_ssa_p (cfun) == is_oacc_kernels_parallelized);
+  /* In the following, some of the 'gimple_in_ssa_p (cfun)' conditionals are
+     for SSA specifics, and some are for 'parloops' OpenACC
+     'kernels'-parallelized specifics.  */
+
   tree v = fd->loop.v;
   enum tree_code cond_code = fd->loop.cond_code;
   enum tree_code plus_code = PLUS_EXPR;
@@ -5327,16 +7464,24 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
       plus_code = POINTER_PLUS_EXPR;
       plus_type = sizetype;
     }
+  for (int ix = fd->collapse; ix--;)
+    {
+      tree diff_type2 = TREE_TYPE (fd->loops[ix].step);
+      if (TYPE_PRECISION (diff_type) < TYPE_PRECISION (diff_type2))
+       diff_type = diff_type2;
+    }
   if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
     diff_type = signed_type_for (diff_type);
+  if (TYPE_PRECISION (diff_type) < TYPE_PRECISION (integer_type_node))
+    diff_type = integer_type_node;
 
   basic_block entry_bb = region->entry; /* BB ending in OMP_FOR */
   basic_block exit_bb = region->exit; /* BB ending in OMP_RETURN */
   basic_block cont_bb = region->cont; /* BB ending in OMP_CONTINUE  */
   basic_block bottom_bb = NULL;
 
-  /* entry_bb has two sucessors; the branch edge is to the exit
-     block,  fallthrough edge to body.  */
+  /* entry_bb has two successors; the branch edge is to the exit
+     block, fallthrough edge to body.  */
   gcc_assert (EDGE_COUNT (entry_bb->succs) == 2
              && BRANCH_EDGE (entry_bb)->dest == exit_bb);
 
@@ -5392,7 +7537,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
   entry_bb = split->src;
 
   /* Chunk setup goes at end of entry_bb, replacing the omp_for.  */
-  gsi = gsi_last_bb (entry_bb);
+  gsi = gsi_last_nondebug_bb (entry_bb);
   gomp_for *for_stmt = as_a <gomp_for *> (gsi_stmt (gsi));
   loc = gimple_location (for_stmt);
 
@@ -5408,7 +7553,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
     {
       gcc_assert (!gimple_in_ssa_p (cfun) && up);
       counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
-      tree total = expand_oacc_collapse_init (fd, &gsi, counts,
+      tree total = expand_oacc_collapse_init (fd, &gsi, counts, diff_type,
                                              TREE_TYPE (fd->loop.n2), loc);
 
       if (SSA_VAR_P (fd->loop.n2))
@@ -5519,7 +7664,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 
   if (gimple_in_ssa_p (cfun))
     {
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
 
       offset = gimple_omp_continue_control_use (cont_stmt);
@@ -5570,7 +7715,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
 
       if (fd->collapse > 1 || fd->tiling)
-       expand_oacc_collapse_vars (fd, false, &gsi, counts, v);
+       expand_oacc_collapse_vars (fd, false, &gsi, counts, v, diff_type);
 
       if (fd->tiling)
        {
@@ -5630,9 +7775,18 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 
          split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
 
+         /* Add a dummy exit for the tiled block when cont_bb is missing.  */
+         if (cont_bb == NULL)
+           {
+             edge e = make_edge (body_bb, exit_bb, EDGE_FALSE_VALUE);
+             e->probability = profile_probability::even ();
+             split->probability = profile_probability::even ();
+           }
+
          /* Initialize the user's loop vars.  */
          gsi = gsi_start_bb (elem_body_bb);
-         expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset);
+         expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset,
+                                    diff_type);
        }
     }
 
@@ -5643,7 +7797,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
      occur, especially when noreturn routines are involved.  */
   if (cont_bb)
     {
-      gsi = gsi_last_bb (cont_bb);
+      gsi = gsi_last_nondebug_bb (cont_bb);
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       loc = gimple_location (cont_stmt);
 
@@ -5664,9 +7818,16 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
          cont_bb = split->dest;
 
          split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
-         make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+         split->probability = profile_probability::unlikely ().guessed ();
+         edge latch_edge
+           = make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+         latch_edge->probability = profile_probability::likely ().guessed ();
 
-         make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+         edge skip_edge = make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+         skip_edge->probability = profile_probability::unlikely ().guessed ();
+         edge loop_entry_edge = EDGE_SUCC (body_bb, 1 - skip_edge->dest_idx);
+         loop_entry_edge->probability
+           = profile_probability::likely ().guessed ();
 
          gsi = gsi_for_stmt (cont_stmt);
        }
@@ -5719,11 +7880,13 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 
          /* Fixup edges from bottom_bb.  */
          split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
-         make_edge (bottom_bb, head_bb, EDGE_TRUE_VALUE);
+         split->probability = profile_probability::unlikely ().guessed ();
+         edge latch_edge = make_edge (bottom_bb, head_bb, EDGE_TRUE_VALUE);
+         latch_edge->probability = profile_probability::likely ().guessed ();
        }
     }
 
-  gsi = gsi_last_bb (exit_bb);
+  gsi = gsi_last_nondebug_bb (exit_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
   loc = gimple_location (gsi_stmt (gsi));
 
@@ -5749,12 +7912,12 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
     {
       /* We now have one, two or three nested loops.  Update the loop
         structures.  */
-      struct loop *parent = entry_bb->loop_father;
-      struct loop *body = body_bb->loop_father;
+      class loop *parent = entry_bb->loop_father;
+      class loop *body = body_bb->loop_father;
 
       if (chunking)
        {
-         struct loop *chunk_loop = alloc_loop ();
+         class loop *chunk_loop = alloc_loop ();
          chunk_loop->header = head_bb;
          chunk_loop->latch = bottom_bb;
          add_loop (chunk_loop, parent);
@@ -5770,7 +7933,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 
       if (parent)
        {
-         struct loop *body_loop = alloc_loop ();
+         class loop *body_loop = alloc_loop ();
          body_loop->header = body_bb;
          body_loop->latch = cont_bb;
          add_loop (body_loop, parent);
@@ -5778,7 +7941,7 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
          if (fd->tiling)
            {
              /* Insert tiling's element loop.  */
-             struct loop *inner_loop = alloc_loop ();
+             class loop *inner_loop = alloc_loop ();
              inner_loop->header = elem_body_bb;
              inner_loop->latch = elem_cont_bb;
              add_loop (inner_loop, body_loop);
@@ -5795,14 +7958,55 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
   struct omp_for_data fd;
   struct omp_for_data_loop *loops;
 
-  loops
-    = (struct omp_for_data_loop *)
-      alloca (gimple_omp_for_collapse (last_stmt (region->entry))
-             * sizeof (struct omp_for_data_loop));
+  loops = XALLOCAVEC (struct omp_for_data_loop,
+                     gimple_omp_for_collapse (last_stmt (region->entry)));
   omp_extract_for_data (as_a <gomp_for *> (last_stmt (region->entry)),
                        &fd, loops);
   region->sched_kind = fd.sched_kind;
   region->sched_modifiers = fd.sched_modifiers;
+  region->has_lastprivate_conditional = fd.lastprivate_conditional != 0;
+  if (fd.non_rect && !gimple_omp_for_combined_into_p (fd.for_stmt))
+    {
+      for (int i = fd.first_nonrect; i <= fd.last_nonrect; i++)
+       if ((loops[i].m1 || loops[i].m2)
+           && (loops[i].m1 == NULL_TREE
+               || TREE_CODE (loops[i].m1) == INTEGER_CST)
+           && (loops[i].m2 == NULL_TREE
+               || TREE_CODE (loops[i].m2) == INTEGER_CST)
+           && TREE_CODE (loops[i].step) == INTEGER_CST
+           && TREE_CODE (loops[i - loops[i].outer].step) == INTEGER_CST)
+         {
+           tree t;
+           tree itype = TREE_TYPE (loops[i].v);
+           if (loops[i].m1 && loops[i].m2)
+             t = fold_build2 (MINUS_EXPR, itype, loops[i].m2, loops[i].m1);
+           else if (loops[i].m1)
+             t = fold_build1 (NEGATE_EXPR, itype, loops[i].m1);
+           else
+             t = loops[i].m2;
+           t = fold_build2 (MULT_EXPR, itype, t,
+                            fold_convert (itype,
+                                          loops[i - loops[i].outer].step));
+           if (TYPE_UNSIGNED (itype) && loops[i].cond_code == GT_EXPR)
+             t = fold_build2 (TRUNC_MOD_EXPR, itype,
+                              fold_build1 (NEGATE_EXPR, itype, t),
+                              fold_build1 (NEGATE_EXPR, itype,
+                                           fold_convert (itype,
+                                                         loops[i].step)));
+           else
+             t = fold_build2 (TRUNC_MOD_EXPR, itype, t,
+                              fold_convert (itype, loops[i].step));
+           if (integer_nonzerop (t))
+             error_at (gimple_location (fd.for_stmt),
+                       "invalid OpenMP non-rectangular loop step; "
+                       "%<(%E - %E) * %E%> is not a multiple of loop %d "
+                       "step %qE",
+                       loops[i].m2 ? loops[i].m2 : integer_zero_node,
+                       loops[i].m1 ? loops[i].m1 : integer_zero_node,
+                       loops[i - loops[i].outer].step, i + 1,
+                       loops[i].step);
+         }
+    }
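
A hedged pair of testcase-style examples for the new diagnostic (assumed
source): the check above requires (m2 - m1) times the outer step to be a
multiple of the inner step.

    void
    f (void)
    {
      /* OK: the inner bound advances by 1 per outer iteration and the
         inner step is 1.  */
      #pragma omp for collapse(2)
      for (int i = 0; i < 64; i++)
        for (int j = 0; j < i; j++)
          ;

      /* Rejected: the inner bound still advances by 1 per outer
         iteration, which is not a multiple of the inner step 2.  */
      #pragma omp for collapse(2)
      for (int i = 0; i < 64; i++)
        for (int j = 0; j < i; j += 2)
          ;
    }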
 
   gcc_assert (EDGE_COUNT (region->entry->succs) == 2);
   BRANCH_EDGE (region->entry)->flags &= ~EDGE_ABNORMAL;
@@ -5819,13 +8023,11 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
        original loops from being detected.  Fix that up.  */
     loops_state_set (LOOPS_NEED_FIXUP);
 
-  if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+  if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_SIMD)
     expand_omp_simd (region, &fd);
-  else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_CILKFOR)
-    expand_cilk_for (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
     {
-      gcc_assert (!inner_stmt);
+      gcc_assert (!inner_stmt && !fd.non_rect);
       expand_oacc_for (region, &fd);
     }
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_TASKLOOP)
@@ -5846,39 +8048,75 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
   else
     {
       int fn_index, start_ix, next_ix;
+      unsigned HOST_WIDE_INT sched = 0;
+      tree sched_arg = NULL_TREE;
 
       gcc_assert (gimple_omp_for_kind (fd.for_stmt)
-                 == GF_OMP_FOR_KIND_FOR);
+                 == GF_OMP_FOR_KIND_FOR && !fd.non_rect);
       if (fd.chunk_size == NULL
          && fd.sched_kind == OMP_CLAUSE_SCHEDULE_STATIC)
        fd.chunk_size = integer_zero_node;
-      gcc_assert (fd.sched_kind != OMP_CLAUSE_SCHEDULE_AUTO);
       switch (fd.sched_kind)
        {
        case OMP_CLAUSE_SCHEDULE_RUNTIME:
-         fn_index = 3;
+         if ((fd.sched_modifiers & OMP_CLAUSE_SCHEDULE_NONMONOTONIC) != 0
+             && fd.lastprivate_conditional == 0)
+           {
+             gcc_assert (!fd.have_ordered);
+             fn_index = 6;
+             sched = 4;
+           }
+         else if ((fd.sched_modifiers & OMP_CLAUSE_SCHEDULE_MONOTONIC) == 0
+                  && !fd.have_ordered
+                  && fd.lastprivate_conditional == 0)
+           fn_index = 7;
+         else
+           {
+             fn_index = 3;
+             sched = (HOST_WIDE_INT_1U << 31);
+           }
          break;
        case OMP_CLAUSE_SCHEDULE_DYNAMIC:
        case OMP_CLAUSE_SCHEDULE_GUIDED:
-         if ((fd.sched_modifiers & OMP_CLAUSE_SCHEDULE_NONMONOTONIC)
-             && !fd.ordered
-             && !fd.have_ordered)
+         if ((fd.sched_modifiers & OMP_CLAUSE_SCHEDULE_MONOTONIC) == 0
+             && !fd.have_ordered
+             && fd.lastprivate_conditional == 0)
            {
              fn_index = 3 + fd.sched_kind;
+             sched = (fd.sched_kind == OMP_CLAUSE_SCHEDULE_GUIDED) + 2;
              break;
            }
-         /* FALLTHRU */
-       default:
          fn_index = fd.sched_kind;
+         sched = (fd.sched_kind == OMP_CLAUSE_SCHEDULE_GUIDED) + 2;
+         sched += (HOST_WIDE_INT_1U << 31);
+         break;
+       case OMP_CLAUSE_SCHEDULE_STATIC:
+         gcc_assert (fd.have_ordered);
+         fn_index = 0;
+         sched = (HOST_WIDE_INT_1U << 31) + 1;
          break;
+       default:
+         gcc_unreachable ();
        }
       if (!fd.ordered)
-       fn_index += fd.have_ordered * 6;
+       fn_index += fd.have_ordered * 8;
       if (fd.ordered)
        start_ix = ((int)BUILT_IN_GOMP_LOOP_DOACROSS_STATIC_START) + fn_index;
       else
        start_ix = ((int)BUILT_IN_GOMP_LOOP_STATIC_START) + fn_index;
       next_ix = ((int)BUILT_IN_GOMP_LOOP_STATIC_NEXT) + fn_index;
+      if (fd.have_reductemp || fd.have_pointer_condtemp)
+       {
+         if (fd.ordered)
+           start_ix = (int)BUILT_IN_GOMP_LOOP_DOACROSS_START;
+         else if (fd.have_ordered)
+           start_ix = (int)BUILT_IN_GOMP_LOOP_ORDERED_START;
+         else
+           start_ix = (int)BUILT_IN_GOMP_LOOP_START;
+         sched_arg = build_int_cstu (long_integer_type_node, sched);
+         if (!fd.chunk_size)
+           fd.chunk_size = integer_zero_node;
+       }
       if (fd.iter_type == long_long_unsigned_type_node)
        {
          start_ix += ((int)BUILT_IN_GOMP_LOOP_ULL_STATIC_START
@@ -5887,7 +8125,8 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
                      - (int)BUILT_IN_GOMP_LOOP_STATIC_NEXT);
        }
       expand_omp_for_generic (region, &fd, (enum built_in_function) start_ix,
-                             (enum built_in_function) next_ix, inner_stmt);
+                             (enum built_in_function) next_ix, sched_arg,
+                             inner_stmt);
     }
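
The sched argument built above packs the schedule kind into the low bits, with
bit 31 marking a monotonic schedule: 0 runtime, 1 static, 2 dynamic, 3 guided,
4 nonmonotonic runtime.  A hedged decoder sketch (decode_sched is an invented
helper; the field split is read off the code above, not a libgomp API):

    #include <stdio.h>

    static void
    decode_sched (unsigned long long sched)
    {
      unsigned int kind = (unsigned int) (sched & 0x7fffffffu);
      printf ("kind %u%s\n", kind,
              (sched & (1ULL << 31)) ? " (monotonic)" : "");
    }

    int
    main (void)
    {
      decode_sched (4);                 /* schedule(nonmonotonic: runtime) */
      decode_sched ((1ULL << 31) + 1);  /* schedule(static), have_ordered */
      decode_sched ((1ULL << 31) + 2);  /* schedule(monotonic: dynamic) */
      return 0;
    }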
 
   if (gimple_in_ssa_p (cfun))
@@ -5950,7 +8189,7 @@ expand_omp_sections (struct omp_region *region)
       len = EDGE_COUNT (l0_bb->succs);
       gcc_assert (len > 0);
       e = EDGE_SUCC (l0_bb, len - 1);
-      si = gsi_last_bb (e->dest);
+      si = gsi_last_nondebug_bb (e->dest);
       l2 = NULL_TREE;
       if (gsi_end_p (si)
          || gimple_code (gsi_stmt (si)) != GIMPLE_OMP_SECTION)
@@ -5958,7 +8197,7 @@ expand_omp_sections (struct omp_region *region)
       else
        FOR_EACH_EDGE (e, ei, l0_bb->succs)
          {
-           si = gsi_last_bb (e->dest);
+           si = gsi_last_nondebug_bb (e->dest);
            if (gsi_end_p (si)
                || gimple_code (gsi_stmt (si)) != GIMPLE_OMP_SECTION)
              {
@@ -5983,11 +8222,70 @@ expand_omp_sections (struct omp_region *region)
 
   /* The call to GOMP_sections_start goes in ENTRY_BB, replacing the
      GIMPLE_OMP_SECTIONS statement.  */
-  si = gsi_last_bb (entry_bb);
+  si = gsi_last_nondebug_bb (entry_bb);
   sections_stmt = as_a <gomp_sections *> (gsi_stmt (si));
   gcc_assert (gimple_code (sections_stmt) == GIMPLE_OMP_SECTIONS);
   vin = gimple_omp_sections_control (sections_stmt);
-  if (!is_combined_parallel (region))
+  tree clauses = gimple_omp_sections_clauses (sections_stmt);
+  tree reductmp = omp_find_clause (clauses, OMP_CLAUSE__REDUCTEMP_);
+  tree condtmp = omp_find_clause (clauses, OMP_CLAUSE__CONDTEMP_);
+  tree cond_var = NULL_TREE;
+  if (reductmp || condtmp)
+    {
+      tree reductions = null_pointer_node, mem = null_pointer_node;
+      tree memv = NULL_TREE, condtemp = NULL_TREE;
+      gimple_stmt_iterator gsi = gsi_none ();
+      gimple *g = NULL;
+      if (reductmp)
+       {
+         reductions = OMP_CLAUSE_DECL (reductmp);
+         gcc_assert (TREE_CODE (reductions) == SSA_NAME);
+         g = SSA_NAME_DEF_STMT (reductions);
+         reductions = gimple_assign_rhs1 (g);
+         OMP_CLAUSE_DECL (reductmp) = reductions;
+         gsi = gsi_for_stmt (g);
+       }
+      else
+       gsi = si;
+      if (condtmp)
+       {
+         condtemp = OMP_CLAUSE_DECL (condtmp);
+         tree c = omp_find_clause (OMP_CLAUSE_CHAIN (condtmp),
+                                   OMP_CLAUSE__CONDTEMP_);
+         cond_var = OMP_CLAUSE_DECL (c);
+         tree type = TREE_TYPE (condtemp);
+         memv = create_tmp_var (type);
+         TREE_ADDRESSABLE (memv) = 1;
+         unsigned cnt = 0;
+         for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+           if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE
+               && OMP_CLAUSE_LASTPRIVATE_CONDITIONAL (c))
+             ++cnt;
+         unsigned HOST_WIDE_INT sz
+           = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (type))) * cnt;
+         expand_omp_build_assign (&gsi, memv, build_int_cst (type, sz),
+                                  false);
+         mem = build_fold_addr_expr (memv);
+       }
+      t = build_int_cst (unsigned_type_node, len - 1);
+      u = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS2_START);
+      stmt = gimple_build_call (u, 3, t, reductions, mem);
+      gimple_call_set_lhs (stmt, vin);
+      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+      if (condtmp)
+       {
+         expand_omp_build_assign (&gsi, condtemp, memv, false);
+         tree t = build2 (PLUS_EXPR, TREE_TYPE (cond_var),
+                          vin, build_one_cst (TREE_TYPE (cond_var)));
+         expand_omp_build_assign (&gsi, cond_var, t, false);
+       }
+      if (reductmp)
+       {
+         gsi_remove (&gsi, true);
+         release_ssa_name (gimple_assign_lhs (g));
+       }
+    }
+  else if (!is_combined_parallel (region))
     {
       /* If we are not inside a combined parallel+sections region,
         call GOMP_sections_start.  */
@@ -6001,13 +8299,16 @@ expand_omp_sections (struct omp_region *region)
       u = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS_NEXT);
       stmt = gimple_build_call (u, 0);
     }
-  gimple_call_set_lhs (stmt, vin);
-  gsi_insert_after (&si, stmt, GSI_SAME_STMT);
+  if (!reductmp && !condtmp)
+    {
+      gimple_call_set_lhs (stmt, vin);
+      gsi_insert_after (&si, stmt, GSI_SAME_STMT);
+    }
   gsi_remove (&si, true);
 
   /* The switch() statement replacing GIMPLE_OMP_SECTIONS_SWITCH goes in
      L0_BB.  */
-  switch_si = gsi_last_bb (l0_bb);
+  switch_si = gsi_last_nondebug_bb (l0_bb);
   gcc_assert (gimple_code (gsi_stmt (switch_si)) == GIMPLE_OMP_SECTIONS_SWITCH);
   if (exit_reachable)
     {
@@ -6049,7 +8350,7 @@ expand_omp_sections (struct omp_region *region)
       u = build_case_label (u, NULL, t);
       label_vec.quick_push (u);
 
-      si = gsi_last_bb (s_entry_bb);
+      si = gsi_last_nondebug_bb (s_entry_bb);
       gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_SECTION);
       gcc_assert (i < len || gimple_omp_section_last_p (gsi_stmt (si)));
       gsi_remove (&si, true);
@@ -6058,7 +8359,7 @@ expand_omp_sections (struct omp_region *region)
       if (s_exit_bb == NULL)
        continue;
 
-      si = gsi_last_bb (s_exit_bb);
+      si = gsi_last_nondebug_bb (s_exit_bb);
       gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_RETURN);
       gsi_remove (&si, true);
 
@@ -6084,20 +8385,26 @@ expand_omp_sections (struct omp_region *region)
       tree bfn_decl;
 
       /* Code to get the next section goes in L1_BB.  */
-      si = gsi_last_bb (l1_bb);
+      si = gsi_last_nondebug_bb (l1_bb);
       gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_CONTINUE);
 
       bfn_decl = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS_NEXT);
       stmt = gimple_build_call (bfn_decl, 0);
       gimple_call_set_lhs (stmt, vnext);
-      gsi_insert_after (&si, stmt, GSI_SAME_STMT);
+      gsi_insert_before (&si, stmt, GSI_SAME_STMT);
+      if (cond_var)
+       {
+         tree t = build2 (PLUS_EXPR, TREE_TYPE (cond_var),
+                          vnext, build_one_cst (TREE_TYPE (cond_var)));
+         expand_omp_build_assign (&si, cond_var, t, false);
+       }
       gsi_remove (&si, true);
 
       single_succ_edge (l1_bb)->flags = EDGE_FALLTHRU;
     }
 
   /* Cleanup function replaces GIMPLE_OMP_RETURN in EXIT_BB.  */
-  si = gsi_last_bb (l2_bb);
+  si = gsi_last_nondebug_bb (l2_bb);
   if (gimple_omp_return_nowait_p (gsi_stmt (si)))
     t = builtin_decl_explicit (BUILT_IN_GOMP_SECTIONS_END_NOWAIT);
   else if (gimple_omp_return_lhs (gsi_stmt (si)))
@@ -6113,7 +8420,7 @@ expand_omp_sections (struct omp_region *region)
   set_immediate_dominator (CDI_DOMINATORS, default_bb, l0_bb);
 }
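
Stepping back from the details: a hedged example of a sections construct that
takes the new GOMP_sections2_start path above (assumed source; cond_var
supplies the ordering among sections that actually assign x):

    int x;

    void
    f (int p, int q)
    {
      #pragma omp parallel sections lastprivate(conditional: x)
      {
        #pragma omp section
        if (p) x = 1;
        #pragma omp section
        if (q) x = 2;   /* lexically last section wins if both store */
      }
    }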
 
-/* Expand code for an OpenMP single directive.  We've already expanded
+/* Expand code for an OpenMP single or scope directive.  We've already expanded
    much of the code; here we simply place the GOMP_barrier call.  */
 
 static void
@@ -6125,12 +8432,13 @@ expand_omp_single (struct omp_region *region)
   entry_bb = region->entry;
   exit_bb = region->exit;
 
-  si = gsi_last_bb (entry_bb);
-  gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_SINGLE);
+  si = gsi_last_nondebug_bb (entry_bb);
+  gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_SINGLE
+             || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_SCOPE);
   gsi_remove (&si, true);
   single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
 
-  si = gsi_last_bb (exit_bb);
+  si = gsi_last_nondebug_bb (exit_bb);
   if (!gimple_omp_return_nowait_p (gsi_stmt (si)))
     {
       tree t = gimple_omp_return_lhs (gsi_stmt (si));
@@ -6153,25 +8461,86 @@ expand_omp_synch (struct omp_region *region)
   entry_bb = region->entry;
   exit_bb = region->exit;
 
-  si = gsi_last_bb (entry_bb);
+  si = gsi_last_nondebug_bb (entry_bb);
   gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_SINGLE
              || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_MASTER
+             || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_MASKED
              || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_TASKGROUP
              || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_ORDERED
              || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_CRITICAL
              || gimple_code (gsi_stmt (si)) == GIMPLE_OMP_TEAMS);
+  if (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_TEAMS
+      && gimple_omp_teams_host (as_a <gomp_teams *> (gsi_stmt (si))))
+    {
+      expand_omp_taskreg (region);
+      return;
+    }
   gsi_remove (&si, true);
   single_succ_edge (entry_bb)->flags = EDGE_FALLTHRU;
 
   if (exit_bb)
     {
-      si = gsi_last_bb (exit_bb);
+      si = gsi_last_nondebug_bb (exit_bb);
       gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_RETURN);
       gsi_remove (&si, true);
       single_succ_edge (exit_bb)->flags = EDGE_FALLTHRU;
     }
 }
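
The new GIMPLE_OMP_TEAMS case above routes host teams (a teams construct not
nested in target, allowed since OpenMP 5.0) through expand_omp_taskreg.  A
hedged example of such a construct (assumed source; work is an invented
function):

    extern void work (int);

    void
    f (void)
    {
      /* No enclosing target: this executes on the host and is outlined
         like a parallel region rather than offloaded.  */
      #pragma omp teams num_teams(4)
      work (0);
    }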
 
+/* Translate enum omp_memory_order to enum memmodel for the fail
+   clause embedded in it.  */
+
+static enum memmodel
+omp_memory_order_to_fail_memmodel (enum omp_memory_order mo)
+{
+  switch (mo & OMP_FAIL_MEMORY_ORDER_MASK)
+    {
+    case OMP_FAIL_MEMORY_ORDER_UNSPECIFIED:
+      switch (mo & OMP_MEMORY_ORDER_MASK)
+       {
+       case OMP_MEMORY_ORDER_RELAXED: return MEMMODEL_RELAXED;
+       case OMP_MEMORY_ORDER_ACQUIRE: return MEMMODEL_ACQUIRE;
+       case OMP_MEMORY_ORDER_RELEASE: return MEMMODEL_RELAXED;
+       case OMP_MEMORY_ORDER_ACQ_REL: return MEMMODEL_ACQUIRE;
+       case OMP_MEMORY_ORDER_SEQ_CST: return MEMMODEL_SEQ_CST;
+       default: break;
+       }
+      gcc_unreachable ();
+    case OMP_FAIL_MEMORY_ORDER_RELAXED: return MEMMODEL_RELAXED;
+    case OMP_FAIL_MEMORY_ORDER_ACQUIRE: return MEMMODEL_ACQUIRE;
+    case OMP_FAIL_MEMORY_ORDER_SEQ_CST: return MEMMODEL_SEQ_CST;
+    default: gcc_unreachable ();
+    }
+}
+
+/* Translate enum omp_memory_order to enum memmodel.  The two enums
+   use different numbers so that OMP_MEMORY_ORDER_UNSPECIFIED is 0
+   and omp_memory_order has the fail mode encoded in it too.  */
+
+static enum memmodel
+omp_memory_order_to_memmodel (enum omp_memory_order mo)
+{
+  enum memmodel ret, fail_ret;
+  switch (mo & OMP_MEMORY_ORDER_MASK)
+    {
+    case OMP_MEMORY_ORDER_RELAXED: ret = MEMMODEL_RELAXED; break;
+    case OMP_MEMORY_ORDER_ACQUIRE: ret = MEMMODEL_ACQUIRE; break;
+    case OMP_MEMORY_ORDER_RELEASE: ret = MEMMODEL_RELEASE; break;
+    case OMP_MEMORY_ORDER_ACQ_REL: ret = MEMMODEL_ACQ_REL; break;
+    case OMP_MEMORY_ORDER_SEQ_CST: ret = MEMMODEL_SEQ_CST; break;
+    default: gcc_unreachable ();
+    }
+  /* If we drop the -Winvalid-memory-model warning for C++17 P0418R2,
+     we can just return ret here unconditionally.  Otherwise, work around
+     it here and make sure fail memmodel is not stronger.  */
+  if ((mo & OMP_FAIL_MEMORY_ORDER_MASK) == OMP_FAIL_MEMORY_ORDER_UNSPECIFIED)
+    return ret;
+  fail_ret = omp_memory_order_to_fail_memmodel (mo);
+  if (fail_ret > ret)
+    return fail_ret;
+  return ret;
+}
+
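A hedged source-level reference for the two translations above (assumed
example): the memory-order clause goes through omp_memory_order_to_memmodel,
an explicit fail clause through omp_memory_order_to_fail_memmodel.

    double x, e, d;

    void
    f (void)
    {
      /* seq_cst -> MEMMODEL_SEQ_CST for the exchange, fail(relaxed) ->
         MEMMODEL_RELAXED for the comparison-failure path.  */
      #pragma omp atomic compare seq_cst fail(relaxed)
      if (x == e) { x = d; }
    }
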
 /* A subroutine of expand_omp_atomic.  Attempt to implement the atomic
    operation as a normal volatile load.  */
 
@@ -6186,7 +8555,7 @@ expand_omp_atomic_load (basic_block load_bb, tree addr,
   gimple *stmt;
   tree decl, call, type, itype;
 
-  gsi = gsi_last_bb (load_bb);
+  gsi = gsi_last_nondebug_bb (load_bb);
   stmt = gsi_stmt (gsi);
   gcc_assert (gimple_code (stmt) == GIMPLE_OMP_ATOMIC_LOAD);
   loc = gimple_location (stmt);
@@ -6203,11 +8572,9 @@ expand_omp_atomic_load (basic_block load_bb, tree addr,
   type = TREE_TYPE (loaded_val);
   itype = TREE_TYPE (TREE_TYPE (decl));
 
-  call = build_call_expr_loc (loc, decl, 2, addr,
-                             build_int_cst (NULL,
-                                            gimple_omp_atomic_seq_cst_p (stmt)
-                                            ? MEMMODEL_SEQ_CST
-                                            : MEMMODEL_RELAXED));
+  enum omp_memory_order omo = gimple_omp_atomic_memory_order (stmt);
+  tree mo = build_int_cst (NULL, omp_memory_order_to_memmodel (omo));
+  call = build_call_expr_loc (loc, decl, 2, addr, mo);
   if (!useless_type_conversion_p (type, itype))
     call = fold_build1_loc (loc, VIEW_CONVERT_EXPR, type, call);
   call = build2_loc (loc, MODIFY_EXPR, void_type_node, loaded_val, call);
@@ -6216,7 +8583,7 @@ expand_omp_atomic_load (basic_block load_bb, tree addr,
   gsi_remove (&gsi, true);
 
   store_bb = single_succ (load_bb);
-  gsi = gsi_last_bb (store_bb);
+  gsi = gsi_last_nondebug_bb (store_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_ATOMIC_STORE);
   gsi_remove (&gsi, true);
 
@@ -6242,14 +8609,14 @@ expand_omp_atomic_store (basic_block load_bb, tree addr,
   machine_mode imode;
   bool exchange;
 
-  gsi = gsi_last_bb (load_bb);
+  gsi = gsi_last_nondebug_bb (load_bb);
   stmt = gsi_stmt (gsi);
   gcc_assert (gimple_code (stmt) == GIMPLE_OMP_ATOMIC_LOAD);
 
   /* If the load value is needed, then this isn't a store but an exchange.  */
   exchange = gimple_omp_atomic_need_value_p (stmt);
 
-  gsi = gsi_last_bb (store_bb);
+  gsi = gsi_last_nondebug_bb (store_bb);
   stmt = gsi_stmt (gsi);
   gcc_assert (gimple_code (stmt) == GIMPLE_OMP_ATOMIC_STORE);
   loc = gimple_location (stmt);
@@ -6278,11 +8645,9 @@ expand_omp_atomic_store (basic_block load_bb, tree addr,
 
   if (!useless_type_conversion_p (itype, type))
     stored_val = fold_build1_loc (loc, VIEW_CONVERT_EXPR, itype, stored_val);
-  call = build_call_expr_loc (loc, decl, 3, addr, stored_val,
-                             build_int_cst (NULL,
-                                            gimple_omp_atomic_seq_cst_p (stmt)
-                                            ? MEMMODEL_SEQ_CST
-                                            : MEMMODEL_RELAXED));
+  enum omp_memory_order omo = gimple_omp_atomic_memory_order (stmt);
+  tree mo = build_int_cst (NULL, omp_memory_order_to_memmodel (omo));
+  call = build_call_expr_loc (loc, decl, 3, addr, stored_val, mo);
   if (exchange)
     {
       if (!useless_type_conversion_p (type, itype))
@@ -6294,7 +8659,7 @@ expand_omp_atomic_store (basic_block load_bb, tree addr,
   gsi_remove (&gsi, true);
 
   /* Remove the GIMPLE_OMP_ATOMIC_LOAD that we verified above.  */
-  gsi = gsi_last_bb (load_bb);
+  gsi = gsi_last_nondebug_bb (load_bb);
   gsi_remove (&gsi, true);
 
   if (gimple_in_ssa_p (cfun))
@@ -6323,7 +8688,6 @@ expand_omp_atomic_fetch_op (basic_block load_bb,
   enum tree_code code;
   bool need_old, need_new;
   machine_mode imode;
-  bool seq_cst;
 
   /* We expect to find the following sequences:
 
@@ -6341,15 +8705,24 @@ expand_omp_atomic_fetch_op (basic_block load_bb,
 
   gsi = gsi_after_labels (store_bb);
   stmt = gsi_stmt (gsi);
+  if (is_gimple_debug (stmt))
+    {
+      gsi_next_nondebug (&gsi);
+      if (gsi_end_p (gsi))
+       return false;
+      stmt = gsi_stmt (gsi);
+    }
   loc = gimple_location (stmt);
   if (!is_gimple_assign (stmt))
     return false;
-  gsi_next (&gsi);
+  gsi_next_nondebug (&gsi);
   if (gimple_code (gsi_stmt (gsi)) != GIMPLE_OMP_ATOMIC_STORE)
     return false;
   need_new = gimple_omp_atomic_need_value_p (gsi_stmt (gsi));
   need_old = gimple_omp_atomic_need_value_p (last_stmt (load_bb));
-  seq_cst = gimple_omp_atomic_seq_cst_p (last_stmt (load_bb));
+  enum omp_memory_order omo
+    = gimple_omp_atomic_memory_order (last_stmt (load_bb));
+  enum memmodel mo = omp_memory_order_to_memmodel (omo);
   gcc_checking_assert (!need_old || !need_new);
 
   if (!operand_equal_p (gimple_assign_lhs (stmt), stored_val, 0))
@@ -6408,7 +8781,7 @@ expand_omp_atomic_fetch_op (basic_block load_bb,
   if (!can_compare_and_swap_p (imode, true) || !can_atomic_load_p (imode))
     return false;
 
-  gsi = gsi_last_bb (load_bb);
+  gsi = gsi_last_nondebug_bb (load_bb);
   gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_ATOMIC_LOAD);
 
   /* OpenMP does not imply any barrier-like semantics on its atomic ops.
@@ -6416,32 +8789,285 @@ expand_omp_atomic_fetch_op (basic_block load_bb,
      use the RELAXED memory model.  */
   call = build_call_expr_loc (loc, decl, 3, addr,
                              fold_convert_loc (loc, itype, rhs),
-                             build_int_cst (NULL,
-                                            seq_cst ? MEMMODEL_SEQ_CST
-                                                    : MEMMODEL_RELAXED));
+                             build_int_cst (NULL, mo));
+
+  if (need_old || need_new)
+    {
+      lhs = need_old ? loaded_val : stored_val;
+      call = fold_convert_loc (loc, TREE_TYPE (lhs), call);
+      call = build2_loc (loc, MODIFY_EXPR, void_type_node, lhs, call);
+    }
+  else
+    call = fold_convert_loc (loc, void_type_node, call);
+  force_gimple_operand_gsi (&gsi, call, true, NULL_TREE, true, GSI_SAME_STMT);
+  gsi_remove (&gsi, true);
+
+  gsi = gsi_last_nondebug_bb (store_bb);
+  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_ATOMIC_STORE);
+  gsi_remove (&gsi, true);
+  gsi = gsi_last_nondebug_bb (store_bb);
+  stmt = gsi_stmt (gsi);
+  gsi_remove (&gsi, true);
+
+  if (gimple_in_ssa_p (cfun))
+    {
+      release_defs (stmt);
+      update_ssa (TODO_update_ssa_no_phi);
+    }
+
+  return true;
+}
+
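For orientation, a hedged example of what this matcher accepts and the call it
can become (assumed source; plain OpenMP atomics default to relaxed order):

    unsigned long counter;

    void
    bump (void)
    {
      /* The loaded value is unused and the stored value is
         'counter + 4', so this can be emitted as roughly
         __atomic_fetch_add (&counter, 4, __ATOMIC_RELAXED);  */
      #pragma omp atomic update
      counter += 4;
    }
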
+/* A subroutine of expand_omp_atomic.  Attempt to implement the atomic
+   compare and exchange as an ATOMIC_COMPARE_EXCHANGE internal function.
+   Returns false if the expression is not of the proper form.  */
+
+static bool
+expand_omp_atomic_cas (basic_block load_bb, tree addr,
+                      tree loaded_val, tree stored_val, int index)
+{
+  /* We expect to find the following sequences:
+
+   load_bb:
+       GIMPLE_OMP_ATOMIC_LOAD (tmp, mem)
+
+   store_bb:
+       val = tmp == e ? d : tmp;
+       GIMPLE_OMP_ATOMIC_STORE (val)
+
+     or in store_bb instead:
+       tmp2 = tmp == e;
+       val = tmp2 ? d : tmp;
+       GIMPLE_OMP_ATOMIC_STORE (val)
+
+     or:
+       tmp3 = VIEW_CONVERT_EXPR<integral_type>(tmp);
+       val = e == tmp3 ? d : tmp;
+       GIMPLE_OMP_ATOMIC_STORE (val)
+
+     etc.  */
+
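+  /* For instance (assumed example), '#pragma omp atomic compare' applied
+     to 'x = x == e ? d : x;' gimplifies into the first of the sequences
+     above.  */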
+
+  basic_block store_bb = single_succ (load_bb);
+  gimple_stmt_iterator gsi = gsi_last_nondebug_bb (store_bb);
+  gimple *store_stmt = gsi_stmt (gsi);
+  if (!store_stmt || gimple_code (store_stmt) != GIMPLE_OMP_ATOMIC_STORE)
+    return false;
+  gsi_prev_nondebug (&gsi);
+  if (gsi_end_p (gsi))
+    return false;
+  gimple *condexpr_stmt = gsi_stmt (gsi);
+  if (!is_gimple_assign (condexpr_stmt)
+      || gimple_assign_rhs_code (condexpr_stmt) != COND_EXPR)
+    return false;
+  if (!operand_equal_p (gimple_assign_lhs (condexpr_stmt), stored_val, 0))
+    return false;
+  gimple *cond_stmt = NULL;
+  gimple *vce_stmt = NULL;
+  gsi_prev_nondebug (&gsi);
+  if (!gsi_end_p (gsi))
+    {
+      cond_stmt = gsi_stmt (gsi);
+      if (!is_gimple_assign (cond_stmt))
+       return false;
+      if (gimple_assign_rhs_code (cond_stmt) == EQ_EXPR)
+       {
+         gsi_prev_nondebug (&gsi);
+         if (!gsi_end_p (gsi))
+           {
+             vce_stmt = gsi_stmt (gsi);
+             if (!is_gimple_assign (vce_stmt)
+                 || gimple_assign_rhs_code (vce_stmt) != VIEW_CONVERT_EXPR)
+               return false;
+           }
+       }
+      else if (gimple_assign_rhs_code (cond_stmt) == VIEW_CONVERT_EXPR)
+       std::swap (vce_stmt, cond_stmt);
+      else
+       return false;
+      if (vce_stmt)
+       {
+         tree vce_rhs = gimple_assign_rhs1 (vce_stmt);
+         if (TREE_CODE (vce_rhs) != VIEW_CONVERT_EXPR
+             || !operand_equal_p (TREE_OPERAND (vce_rhs, 0), loaded_val))
+           return false;
+         if (!INTEGRAL_TYPE_P (TREE_TYPE (vce_rhs))
+             || !SCALAR_FLOAT_TYPE_P (TREE_TYPE (loaded_val))
+             || !tree_int_cst_equal (TYPE_SIZE (TREE_TYPE (vce_rhs)),
+                                     TYPE_SIZE (TREE_TYPE (loaded_val))))
+           return false;
+         gsi_prev_nondebug (&gsi);
+         if (!gsi_end_p (gsi))
+           return false;
+       }
+    }
+  tree cond = gimple_assign_rhs1 (condexpr_stmt);
+  tree cond_op1, cond_op2;
+  if (cond_stmt)
+    {
+      if (!operand_equal_p (cond, gimple_assign_lhs (cond_stmt)))
+       return false;
+      cond_op1 = gimple_assign_rhs1 (cond_stmt);
+      cond_op2 = gimple_assign_rhs2 (cond_stmt);
+    }
+  else if (TREE_CODE (cond) != EQ_EXPR && TREE_CODE (cond) != NE_EXPR)
+    return false;
+  else
+    {
+      cond_op1 = TREE_OPERAND (cond, 0);
+      cond_op2 = TREE_OPERAND (cond, 1);
+    }
+  tree d;
+  if (TREE_CODE (cond) == NE_EXPR)
+    {
+      if (!operand_equal_p (gimple_assign_rhs2 (condexpr_stmt), loaded_val))
+       return false;
+      d = gimple_assign_rhs3 (condexpr_stmt);
+    }
+  else if (!operand_equal_p (gimple_assign_rhs3 (condexpr_stmt), loaded_val))
+    return false;
+  else
+    d = gimple_assign_rhs2 (condexpr_stmt);
+  tree e = vce_stmt ? gimple_assign_lhs (vce_stmt) : loaded_val;
+  if (operand_equal_p (e, cond_op1))
+    e = cond_op2;
+  else if (operand_equal_p (e, cond_op2))
+    e = cond_op1;
+  else
+    return false;
+
+  location_t loc = gimple_location (store_stmt);
+  gimple *load_stmt = last_stmt (load_bb);
+  bool need_new = gimple_omp_atomic_need_value_p (store_stmt);
+  bool need_old = gimple_omp_atomic_need_value_p (load_stmt);
+  bool weak = gimple_omp_atomic_weak_p (load_stmt);
+  enum omp_memory_order omo = gimple_omp_atomic_memory_order (load_stmt);
+  tree mo = build_int_cst (NULL, omp_memory_order_to_memmodel (omo));
+  tree fmo = build_int_cst (NULL, omp_memory_order_to_fail_memmodel (omo));
+  gcc_checking_assert (!need_old || !need_new);
+
+  enum built_in_function fncode
+    = (enum built_in_function) ((int) BUILT_IN_SYNC_VAL_COMPARE_AND_SWAP_N
+                               + index + 1);
+  tree cmpxchg = builtin_decl_explicit (fncode);
+  if (cmpxchg == NULL_TREE)
+    return false;
+  tree itype = TREE_TYPE (TREE_TYPE (cmpxchg));
+
+  if (!can_compare_and_swap_p (TYPE_MODE (itype), true)
+      || !can_atomic_load_p (TYPE_MODE (itype)))
+    return false;
+
+  tree type = TYPE_MAIN_VARIANT (TREE_TYPE (loaded_val));
+  if (SCALAR_FLOAT_TYPE_P (type) && !vce_stmt)
+    return false;
+
+  gsi = gsi_for_stmt (store_stmt);
+  if (!useless_type_conversion_p (itype, TREE_TYPE (e)))
+    {
+      tree ne = create_tmp_reg (itype);
+      gimple *g = gimple_build_assign (ne, NOP_EXPR, e);
+      gimple_set_location (g, loc);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      e = ne;
+    }
+  if (!useless_type_conversion_p (itype, TREE_TYPE (d)))
+    {
+      tree nd = create_tmp_reg (itype);
+      enum tree_code code;
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (d)))
+       {
+         code = VIEW_CONVERT_EXPR;
+         d = build1 (VIEW_CONVERT_EXPR, itype, d);
+       }
+      else
+       code = NOP_EXPR;
+      gimple *g = gimple_build_assign (nd, code, d);
+      gimple_set_location (g, loc);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+      d = nd;
+    }
+
+  tree ctype = build_complex_type (itype);
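+  /* The low byte of the FLAG argument encodes the access size in bytes;
+     adding 256 sets the bit that requests a weak compare-and-swap (see
+     the ATOMIC_COMPARE_EXCHANGE expander in internal-fn.c).  */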
+  int flag = int_size_in_bytes (itype) + (weak ? 256 : 0);
+  gimple *g
+    = gimple_build_call_internal (IFN_ATOMIC_COMPARE_EXCHANGE, 6, addr, e, d,
+                                 build_int_cst (integer_type_node, flag),
+                                 mo, fmo);
+  tree cres = create_tmp_reg (ctype);
+  gimple_call_set_lhs (g, cres);
+  gimple_set_location (g, loc);
+  gsi_insert_before (&gsi, g, GSI_SAME_STMT);
 
-  if (need_old || need_new)
+  if (cond_stmt || need_old || need_new)
     {
-      lhs = need_old ? loaded_val : stored_val;
-      call = fold_convert_loc (loc, TREE_TYPE (lhs), call);
-      call = build2_loc (loc, MODIFY_EXPR, void_type_node, lhs, call);
+      tree im = create_tmp_reg (itype);
+      g = gimple_build_assign (im, IMAGPART_EXPR,
+                              build1 (IMAGPART_EXPR, itype, cres));
+      gimple_set_location (g, loc);
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+
+      tree re = NULL_TREE;
+      if (need_old || need_new)
+       {
+         re = create_tmp_reg (itype);
+         g = gimple_build_assign (re, REALPART_EXPR,
+                                  build1 (REALPART_EXPR, itype, cres));
+         gimple_set_location (g, loc);
+         gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+       }
+
+      if (cond_stmt)
+       {
+         g = gimple_build_assign (gimple_assign_lhs (cond_stmt),
+                                  NOP_EXPR, im);
+         gimple_set_location (g, loc);
+         gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+       }
+      else if (need_new)
+       {
+         g = gimple_build_assign (create_tmp_reg (itype), COND_EXPR,
+                                  build2 (NE_EXPR, boolean_type_node,
+                                          im, build_zero_cst (itype)),
+                                  d, re);
+         gimple_set_location (g, loc);
+         gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+         re = gimple_assign_lhs (g);
+       }
+
+      if (need_old || need_new)
+       {
+         tree v = need_old ? loaded_val : stored_val;
+         enum tree_code code;
+         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (v)))
+           {
+             code = VIEW_CONVERT_EXPR;
+             re = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (v), re);
+           }
+         else if (!useless_type_conversion_p (TREE_TYPE (v), itype))
+           code = NOP_EXPR;
+         else
+           code = TREE_CODE (re);
+         g = gimple_build_assign (v, code, re);
+         gimple_set_location (g, loc);
+         gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+       }
     }
-  else
-    call = fold_convert_loc (loc, void_type_node, call);
-  force_gimple_operand_gsi (&gsi, call, true, NULL_TREE, true, GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
 
-  gsi = gsi_last_bb (store_bb);
-  gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_ATOMIC_STORE);
   gsi_remove (&gsi, true);
-  gsi = gsi_last_bb (store_bb);
-  stmt = gsi_stmt (gsi);
+  gsi = gsi_for_stmt (load_stmt);
   gsi_remove (&gsi, true);
-
-  if (gimple_in_ssa_p (cfun))
+  gsi = gsi_for_stmt (condexpr_stmt);
+  gsi_remove (&gsi, true);
+  if (cond_stmt)
     {
-      release_defs (stmt);
-      update_ssa (TODO_update_ssa_no_phi);
+      gsi = gsi_for_stmt (cond_stmt);
+      gsi_remove (&gsi, true);
+    }
+  if (vce_stmt)
+    {
+      gsi = gsi_for_stmt (vce_stmt);
+      gsi_remove (&gsi, true);
     }
 
   return true;
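When this routine succeeds, the whole region collapses into a single hardware compare-and-swap. A hand-written equivalent, as a sketch only (memory orders are fixed at seq_cst here, whereas the code above derives them from the construct's memory-order clause):

/* Sketch of what the emitted IFN_ATOMIC_COMPARE_EXCHANGE computes.
   The ifn's complex result corresponds to: real part = value loaded
   from *ADDR (used for need_old/need_new), imaginary part = success
   flag (used for COND_STMT's lhs).  */
static _Bool
cas_equivalent (int *addr, int *expected, int desired, _Bool weak)
{
  /* On failure, *EXPECTED is updated with the value actually seen.  */
  return __atomic_compare_exchange_n (addr, expected, desired, weak,
                                      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}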
@@ -6465,21 +9091,20 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
                            int index)
 {
   tree loadedi, storedi, initial, new_storedi, old_vali;
-  tree type, itype, cmpxchg, iaddr;
+  tree type, itype, cmpxchg, iaddr, atype;
   gimple_stmt_iterator si;
   basic_block loop_header = single_succ (load_bb);
   gimple *phi, *stmt;
   edge e;
   enum built_in_function fncode;
 
-  /* ??? We need a non-pointer interface to __atomic_compare_exchange in
-     order to use the RELAXED memory model effectively.  */
   fncode = (enum built_in_function)((int)BUILT_IN_SYNC_VAL_COMPARE_AND_SWAP_N
                                    + index + 1);
   cmpxchg = builtin_decl_explicit (fncode);
   if (cmpxchg == NULL_TREE)
     return false;
-  type = TYPE_MAIN_VARIANT (TREE_TYPE (TREE_TYPE (addr)));
+  type = TYPE_MAIN_VARIANT (TREE_TYPE (loaded_val));
+  atype = type;
   itype = TREE_TYPE (TREE_TYPE (cmpxchg));
 
   if (!can_compare_and_swap_p (TYPE_MODE (itype), true)
@@ -6487,8 +9112,12 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
     return false;
 
   /* Load the initial value, replacing the GIMPLE_OMP_ATOMIC_LOAD.  */
-  si = gsi_last_bb (load_bb);
+  si = gsi_last_nondebug_bb (load_bb);
   gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_ATOMIC_LOAD);
+  location_t loc = gimple_location (gsi_stmt (si));
+  enum omp_memory_order omo = gimple_omp_atomic_memory_order (gsi_stmt (si));
+  tree mo = build_int_cst (NULL, omp_memory_order_to_memmodel (omo));
+  tree fmo = build_int_cst (NULL, omp_memory_order_to_fail_memmodel (omo));
 
   /* For floating-point values, we'll need to view-convert them to integers
      so that we can perform the atomic compare and swap.  Simplify the
@@ -6499,6 +9128,7 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
 
       iaddr = create_tmp_reg (build_pointer_type_for_mode (itype, ptr_mode,
                                                           true));
+      atype = itype;
       iaddr_val
        = force_gimple_operand_gsi (&si,
                                    fold_convert (TREE_TYPE (iaddr), addr),
@@ -6519,13 +9149,17 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
   tree loaddecl = builtin_decl_explicit (fncode);
   if (loaddecl)
     initial
-      = fold_convert (TREE_TYPE (TREE_TYPE (iaddr)),
+      = fold_convert (atype,
                      build_call_expr (loaddecl, 2, iaddr,
                                       build_int_cst (NULL_TREE,
                                                      MEMMODEL_RELAXED)));
   else
-    initial = build2 (MEM_REF, TREE_TYPE (TREE_TYPE (iaddr)), iaddr,
-                     build_int_cst (TREE_TYPE (iaddr), 0));
+    {
+      tree off
+       = build_int_cst (build_pointer_type_for_mode (atype, ptr_mode,
+                                                     true), 0);
+      initial = build2 (MEM_REF, atype, iaddr, off);
+    }
 
   initial
     = force_gimple_operand_gsi (&si, initial, true, NULL_TREE, true,
@@ -6567,7 +9201,7 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
     }
   gsi_remove (&si, true);
 
-  si = gsi_last_bb (store_bb);
+  si = gsi_last_nondebug_bb (store_bb);
   gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_ATOMIC_STORE);
 
   if (iaddr == addr)
@@ -6580,7 +9214,15 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
                                  GSI_SAME_STMT);
 
   /* Build the compare&swap statement.  */
-  new_storedi = build_call_expr (cmpxchg, 3, iaddr, loadedi, storedi);
+  tree ctype = build_complex_type (itype);
+  int flag = int_size_in_bytes (itype);
+  new_storedi = build_call_expr_internal_loc (loc, IFN_ATOMIC_COMPARE_EXCHANGE,
+                                             ctype, 6, iaddr, loadedi,
+                                             storedi,
+                                             build_int_cst (integer_type_node,
+                                                            flag),
+                                             mo, fmo);
+  new_storedi = build1 (REALPART_EXPR, itype, new_storedi);
   new_storedi = force_gimple_operand_gsi (&si,
                                          fold_convert (TREE_TYPE (loadedi),
                                                        new_storedi),
@@ -6627,7 +9269,7 @@ expand_omp_atomic_pipeline (basic_block load_bb, basic_block store_bb,
   /* Remove GIMPLE_OMP_ATOMIC_STORE.  */
   gsi_remove (&si, true);
 
-  struct loop *loop = alloc_loop ();
+  class loop *loop = alloc_loop ();
   loop->header = loop_header;
   loop->latch = store_bb;
   add_loop (loop, loop_header->loop_father);
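The control flow registered above is the classic compare-and-swap retry loop. Schematically (a sketch: the addition stands in for whatever update the atomic region computes, and seq_cst stands in for the construct's actual memory order):

/* Schematic equivalent of the generated retry loop; not literal output.  */
void
pipeline_equivalent (int *addr, int operand)
{
  int old = __atomic_load_n (addr, __ATOMIC_RELAXED);  /* initial load  */
  int stored;
  do
    stored = old + operand;  /* placeholder for the region's update  */
  while (!__atomic_compare_exchange_n (addr, &old, stored, 0,
                                       __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}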
@@ -6670,22 +9312,27 @@ expand_omp_atomic_mutex (basic_block load_bb, basic_block store_bb,
   gassign *stmt;
   tree t;
 
-  si = gsi_last_bb (load_bb);
+  si = gsi_last_nondebug_bb (load_bb);
   gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_ATOMIC_LOAD);
 
   t = builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_START);
   t = build_call_expr (t, 0);
   force_gimple_operand_gsi (&si, t, true, NULL_TREE, true, GSI_SAME_STMT);
 
-  stmt = gimple_build_assign (loaded_val, build_simple_mem_ref (addr));
+  tree mem = build_simple_mem_ref (addr);
+  TREE_TYPE (mem) = TREE_TYPE (loaded_val);
+  TREE_OPERAND (mem, 1)
+    = fold_convert (build_pointer_type_for_mode (TREE_TYPE (mem), ptr_mode,
+                                                true),
+                   TREE_OPERAND (mem, 1));
+  stmt = gimple_build_assign (loaded_val, mem);
   gsi_insert_before (&si, stmt, GSI_SAME_STMT);
   gsi_remove (&si, true);
 
-  si = gsi_last_bb (store_bb);
+  si = gsi_last_nondebug_bb (store_bb);
   gcc_assert (gimple_code (gsi_stmt (si)) == GIMPLE_OMP_ATOMIC_STORE);
 
-  stmt = gimple_build_assign (build_simple_mem_ref (unshare_expr (addr)),
-                             stored_val);
+  stmt = gimple_build_assign (unshare_expr (mem), stored_val);
   gsi_insert_before (&si, stmt, GSI_SAME_STMT);
 
   t = builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_END);
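This last-resort expansion serializes the update under libgomp's global lock; the emitted code is equivalent to the following sketch (the update between load and store is whatever the region computed):

/* Equivalent of the mutex fallback (illustration only).  */
extern void GOMP_atomic_start (void);
extern void GOMP_atomic_end (void);

void
mutex_equivalent (int *addr, int stored_val)
{
  GOMP_atomic_start ();
  int loaded_val = *addr;  /* replaces GIMPLE_OMP_ATOMIC_LOAD  */
  (void) loaded_val;       /* feeds the region's update in real code  */
  *addr = stored_val;      /* replaces GIMPLE_OMP_ATOMIC_STORE  */
  GOMP_atomic_end ();
}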
@@ -6714,7 +9361,7 @@ expand_omp_atomic (struct omp_region *region)
   tree loaded_val = gimple_omp_atomic_load_lhs (load);
   tree addr = gimple_omp_atomic_load_rhs (load);
   tree stored_val = gimple_omp_atomic_store_val (store);
-  tree type = TYPE_MAIN_VARIANT (TREE_TYPE (TREE_TYPE (addr)));
+  tree type = TYPE_MAIN_VARIANT (TREE_TYPE (loaded_val));
   HOST_WIDE_INT index;
 
   /* Make sure the type is one of the supported sizes.  */
@@ -6728,17 +9375,18 @@ expand_omp_atomic (struct omp_region *region)
       if (exact_log2 (align) >= index)
        {
          /* Atomic load.  */
+         scalar_mode smode;
          if (loaded_val == stored_val
-             && (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_INT
-                 || GET_MODE_CLASS (TYPE_MODE (type)) == MODE_FLOAT)
-             && GET_MODE_BITSIZE (TYPE_MODE (type)) <= BITS_PER_WORD
+             && (is_int_mode (TYPE_MODE (type), &smode)
+                 || is_float_mode (TYPE_MODE (type), &smode))
+             && GET_MODE_BITSIZE (smode) <= BITS_PER_WORD
              && expand_omp_atomic_load (load_bb, addr, loaded_val, index))
            return;
 
          /* Atomic store.  */
-         if ((GET_MODE_CLASS (TYPE_MODE (type)) == MODE_INT
-              || GET_MODE_CLASS (TYPE_MODE (type)) == MODE_FLOAT)
-             && GET_MODE_BITSIZE (TYPE_MODE (type)) <= BITS_PER_WORD
+         if ((is_int_mode (TYPE_MODE (type), &smode)
+              || is_float_mode (TYPE_MODE (type), &smode))
+             && GET_MODE_BITSIZE (smode) <= BITS_PER_WORD
              && store_bb == single_succ (load_bb)
              && first_stmt (store_bb) == store
              && expand_omp_atomic_store (load_bb, addr, loaded_val,
@@ -6752,6 +9400,13 @@ expand_omp_atomic (struct omp_region *region)
                                             loaded_val, stored_val, index))
            return;
 
+         /* When possible, use ATOMIC_COMPARE_EXCHANGE ifn without a loop.  */
+         if (store_bb == single_succ (load_bb)
+             && !gimple_in_ssa_p (cfun)
+             && expand_omp_atomic_cas (load_bb, addr, loaded_val, stored_val,
+                                       index))
+           return;
+
          /* If we don't have specialized __sync builtins, try and implement
             as a compare and swap loop.  */
          if (expand_omp_atomic_pipeline (load_bb, store_bb, addr,
@@ -6771,14 +9426,14 @@ static void
 mark_loops_in_oacc_kernels_region (basic_block region_entry,
                                   basic_block region_exit)
 {
-  struct loop *outer = region_entry->loop_father;
+  class loop *outer = region_entry->loop_father;
   gcc_assert (region_exit == NULL || outer == region_exit->loop_father);
 
   /* Don't parallelize the kernels region if it contains more than one outer
      loop.  */
   unsigned int nr_outer_loops = 0;
-  struct loop *single_outer = NULL;
-  for (struct loop *loop = outer->inner; loop != NULL; loop = loop->next)
+  class loop *single_outer = NULL;
+  for (class loop *loop = outer->inner; loop != NULL; loop = loop->next)
     {
       gcc_assert (loop_outer (loop) == outer);
 
@@ -6795,124 +9450,17 @@ mark_loops_in_oacc_kernels_region (basic_block region_entry,
   if (nr_outer_loops != 1)
     return;
 
-  for (struct loop *loop = single_outer->inner;
+  for (class loop *loop = single_outer->inner;
        loop != NULL;
        loop = loop->inner)
     if (loop->next)
       return;
 
   /* Mark the loops in the region.  */
-  for (struct loop *loop = single_outer; loop != NULL; loop = loop->inner)
+  for (class loop *loop = single_outer; loop != NULL; loop = loop->inner)
     loop->in_oacc_kernels_region = true;
 }
 
-/* Types used to pass grid and workgroup sizes to kernel invocation.  */
-
-struct GTY(()) grid_launch_attributes_trees
-{
-  tree kernel_dim_array_type;
-  tree kernel_lattrs_dimnum_decl;
-  tree kernel_lattrs_grid_decl;
-  tree kernel_lattrs_group_decl;
-  tree kernel_launch_attributes_type;
-};
-
-static GTY(()) struct grid_launch_attributes_trees *grid_attr_trees;
-
-/* Create types used to pass kernel launch attributes to target.  */
-
-static void
-grid_create_kernel_launch_attr_types (void)
-{
-  if (grid_attr_trees)
-    return;
-  grid_attr_trees = ggc_alloc <grid_launch_attributes_trees> ();
-
-  tree dim_arr_index_type
-    = build_index_type (build_int_cst (integer_type_node, 2));
-  grid_attr_trees->kernel_dim_array_type
-    = build_array_type (uint32_type_node, dim_arr_index_type);
-
-  grid_attr_trees->kernel_launch_attributes_type = make_node (RECORD_TYPE);
-  grid_attr_trees->kernel_lattrs_dimnum_decl
-    = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("ndim"),
-                 uint32_type_node);
-  DECL_CHAIN (grid_attr_trees->kernel_lattrs_dimnum_decl) = NULL_TREE;
-
-  grid_attr_trees->kernel_lattrs_grid_decl
-    = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("grid_size"),
-                 grid_attr_trees->kernel_dim_array_type);
-  DECL_CHAIN (grid_attr_trees->kernel_lattrs_grid_decl)
-    = grid_attr_trees->kernel_lattrs_dimnum_decl;
-  grid_attr_trees->kernel_lattrs_group_decl
-    = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("group_size"),
-                 grid_attr_trees->kernel_dim_array_type);
-  DECL_CHAIN (grid_attr_trees->kernel_lattrs_group_decl)
-    = grid_attr_trees->kernel_lattrs_grid_decl;
-  finish_builtin_struct (grid_attr_trees->kernel_launch_attributes_type,
-                        "__gomp_kernel_launch_attributes",
-                        grid_attr_trees->kernel_lattrs_group_decl, NULL_TREE);
-}
-
-/* Insert before the current statement in GSI a store of VALUE to INDEX of
-   array (of type kernel_dim_array_type) FLD_DECL of RANGE_VAR.  VALUE must be
-   of type uint32_type_node.  */
-
-static void
-grid_insert_store_range_dim (gimple_stmt_iterator *gsi, tree range_var,
-                            tree fld_decl, int index, tree value)
-{
-  tree ref = build4 (ARRAY_REF, uint32_type_node,
-                    build3 (COMPONENT_REF,
-                            grid_attr_trees->kernel_dim_array_type,
-                            range_var, fld_decl, NULL_TREE),
-                    build_int_cst (integer_type_node, index),
-                    NULL_TREE, NULL_TREE);
-  gsi_insert_before (gsi, gimple_build_assign (ref, value), GSI_SAME_STMT);
-}
-
-/* Return a tree representation of a pointer to a structure with grid and
-   work-group size information.  Statements filling that information will be
-   inserted before GSI, TGT_STMT is the target statement which has the
-   necessary information in it.  */
-
-static tree
-grid_get_kernel_launch_attributes (gimple_stmt_iterator *gsi,
-                                      gomp_target *tgt_stmt)
-{
-  grid_create_kernel_launch_attr_types ();
-  tree lattrs = create_tmp_var (grid_attr_trees->kernel_launch_attributes_type,
-                               "__kernel_launch_attrs");
-
-  unsigned max_dim = 0;
-  for (tree clause = gimple_omp_target_clauses (tgt_stmt);
-       clause;
-       clause = OMP_CLAUSE_CHAIN (clause))
-    {
-      if (OMP_CLAUSE_CODE (clause) != OMP_CLAUSE__GRIDDIM_)
-       continue;
-
-      unsigned dim = OMP_CLAUSE__GRIDDIM__DIMENSION (clause);
-      max_dim = MAX (dim, max_dim);
-
-      grid_insert_store_range_dim (gsi, lattrs,
-                                  grid_attr_trees->kernel_lattrs_grid_decl,
-                                  dim, OMP_CLAUSE__GRIDDIM__SIZE (clause));
-      grid_insert_store_range_dim (gsi, lattrs,
-                                  grid_attr_trees->kernel_lattrs_group_decl,
-                                  dim, OMP_CLAUSE__GRIDDIM__GROUP (clause));
-    }
-
-  tree dimref = build3 (COMPONENT_REF, uint32_type_node, lattrs,
-                       grid_attr_trees->kernel_lattrs_dimnum_decl, NULL_TREE);
-  gcc_checking_assert (max_dim <= 2);
-  tree dimensions = build_int_cstu (uint32_type_node, max_dim + 1);
-  gsi_insert_before (gsi, gimple_build_assign (dimref, dimensions),
-                    GSI_SAME_STMT);
-  TREE_ADDRESSABLE (lattrs) = 1;
-  return build_fold_addr_expr (lattrs);
-}
-
 /* Build target argument identifier from the DEVICE identifier, value
    identifier ID and whether the element also has a SUBSEQUENT_PARAM.  */
 
@@ -7003,16 +9551,6 @@ get_target_arguments (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt)
                                           GOMP_TARGET_ARG_THREAD_LIMIT, t,
                                           &args);
 
-  /* Add HSA-specific grid sizes, if available.  */
-  if (omp_find_clause (gimple_omp_target_clauses (tgt_stmt),
-                      OMP_CLAUSE__GRIDDIM_))
-    {
-      int id = GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES;
-      t = get_target_argument_identifier (GOMP_DEVICE_HSA, true, id);
-      args.quick_push (t);
-      args.quick_push (grid_get_kernel_launch_attributes (gsi, tgt_stmt));
-    }
-
   /* Produce more, perhaps device specific, arguments here.  */
 
   tree argarray = create_tmp_var (build_array_type_nelts (ptr_type_node,
@@ -7047,13 +9585,15 @@ expand_omp_target (struct omp_region *region)
   gomp_target *entry_stmt;
   gimple *stmt;
   edge e;
-  bool offloaded, data_region;
+  bool offloaded;
+  int target_kind;
 
   entry_stmt = as_a <gomp_target *> (last_stmt (region->entry));
+  target_kind = gimple_omp_target_kind (entry_stmt);
   new_bb = region->entry;
 
   offloaded = is_gimple_omp_offloaded (entry_stmt);
-  switch (gimple_omp_target_kind (entry_stmt))
+  switch (target_kind)
     {
     case GF_OMP_TARGET_KIND_REGION:
     case GF_OMP_TARGET_KIND_UPDATE:
@@ -7061,15 +9601,17 @@ expand_omp_target (struct omp_region *region)
     case GF_OMP_TARGET_KIND_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
     case GF_OMP_TARGET_KIND_OACC_KERNELS:
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
-    case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
+    case GF_OMP_TARGET_KIND_OACC_ENTER_DATA:
+    case GF_OMP_TARGET_KIND_OACC_EXIT_DATA:
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
-      data_region = false;
-      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
     case GF_OMP_TARGET_KIND_DATA:
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
-      data_region = true;
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       break;
     default:
       gcc_unreachable ();
@@ -7091,16 +9633,44 @@ expand_omp_target (struct omp_region *region)
   entry_bb = region->entry;
   exit_bb = region->exit;
 
-  if (gimple_omp_target_kind (entry_stmt) == GF_OMP_TARGET_KIND_OACC_KERNELS)
-    {
-      mark_loops_in_oacc_kernels_region (region->entry, region->exit);
+  if (target_kind == GF_OMP_TARGET_KIND_OACC_KERNELS)
+    mark_loops_in_oacc_kernels_region (region->entry, region->exit);
 
-      /* Further down, both OpenACC kernels and OpenACC parallel constructs
-        will be mapped to BUILT_IN_GOACC_PARALLEL, and to distinguish the
-        two, there is an "oacc kernels" attribute set for OpenACC kernels.  */
+  /* From here on, all OpenACC compute constructs are mapped to
+     'BUILT_IN_GOACC_PARALLEL', and get their compute regions outlined.
+     To distinguish between them, we attach attributes.  */
+  switch (target_kind)
+    {
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL:
+      DECL_ATTRIBUTES (child_fn)
+       = tree_cons (get_identifier ("oacc parallel"),
+                    NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_KERNELS:
       DECL_ATTRIBUTES (child_fn)
        = tree_cons (get_identifier ("oacc kernels"),
                     NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+      DECL_ATTRIBUTES (child_fn)
+       = tree_cons (get_identifier ("oacc serial"),
+                    NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+      DECL_ATTRIBUTES (child_fn)
+       = tree_cons (get_identifier ("oacc parallel_kernels_parallelized"),
+                    NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
+      DECL_ATTRIBUTES (child_fn)
+       = tree_cons (get_identifier ("oacc parallel_kernels_gang_single"),
+                    NULL_TREE, DECL_ATTRIBUTES (child_fn));
+      break;
+    default:
+      /* Make sure we don't miss any.  */
+      gcc_checking_assert (!(is_gimple_omp_oacc (entry_stmt)
+                            && is_gimple_omp_offloaded (entry_stmt)));
+      break;
     }
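These markers are ordinary identifier attributes on the outlined child function, so later passes can test for them with lookup_attribute, e.g. (a hypothetical helper, mirroring the "oacc serial" check further below):

/* Hypothetical query of the marker attached above.  */
static bool
is_oacc_kernels_child_fn (tree child_fn)
{
  return lookup_attribute ("oacc kernels",
                           DECL_ATTRIBUTES (child_fn)) != NULL_TREE;
}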
 
   if (offloaded)
@@ -7179,7 +9749,7 @@ expand_omp_target (struct omp_region *region)
 
       /* Split ENTRY_BB at GIMPLE_*,
         so that it can be moved to the child function.  */
-      gsi = gsi_last_bb (entry_bb);
+      gsi = gsi_last_nondebug_bb (entry_bb);
       stmt = gsi_stmt (gsi);
       gcc_assert (stmt
                  && gimple_code (stmt) == gimple_code (entry_stmt));
@@ -7191,7 +9761,7 @@ expand_omp_target (struct omp_region *region)
       /* Convert GIMPLE_OMP_RETURN into a RETURN_EXPR.  */
       if (exit_bb)
        {
-         gsi = gsi_last_bb (exit_bb);
+         gsi = gsi_last_nondebug_bb (exit_bb);
          gcc_assert (!gsi_end_p (gsi)
                      && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
          stmt = gimple_build_return (NULL);
@@ -7199,11 +9769,6 @@ expand_omp_target (struct omp_region *region)
          gsi_remove (&gsi, true);
        }
 
-      /* Make sure to generate early debug for the function before
-         outlining anything.  */
-      if (! gimple_in_ssa_p (cfun))
-       (*debug_hooks->early_global_decl) (cfun->decl);
-
       /* Move the offloading region into CHILD_CFUN.  */
 
       block = gimple_block (entry_stmt);
@@ -7240,7 +9805,11 @@ expand_omp_target (struct omp_region *region)
 
       /* Add the new function to the offload table.  */
       if (ENABLE_OFFLOADING)
-       vec_safe_push (offload_funcs, child_fn);
+       {
+         if (in_lto_p)
+           DECL_PRESERVE_P (child_fn) = 1;
+         vec_safe_push (offload_funcs, child_fn);
+       }
 
       bool need_asm = DECL_ASSEMBLER_NAME_SET_P (current_function_decl)
                      && !DECL_ASSEMBLER_NAME_SET_P (child_fn);
@@ -7276,13 +9845,14 @@ expand_omp_target (struct omp_region *region)
          dump_function_header (dump_file, child_fn, dump_flags);
          dump_function_to_file (child_fn, dump_file, dump_flags);
        }
+
+      adjust_context_and_scope (region, gimple_block (entry_stmt), child_fn);
     }
 
   /* Emit a library call to launch the offloading region, or do data
      transfers.  */
-  tree t1, t2, t3, t4, device, cond, depend, c, clauses;
+  tree t1, t2, t3, t4, depend, c, clauses;
   enum built_in_function start_ix;
-  location_t clause_loc;
   unsigned int flags_i = 0;
 
   switch (gimple_omp_target_kind (entry_stmt))
@@ -7303,19 +9873,26 @@ expand_omp_target (struct omp_region *region)
       start_ix = BUILT_IN_GOMP_TARGET_ENTER_EXIT_DATA;
       flags_i |= GOMP_TARGET_FLAG_EXIT_DATA;
       break;
-    case GF_OMP_TARGET_KIND_OACC_KERNELS:
     case GF_OMP_TARGET_KIND_OACC_PARALLEL:
+    case GF_OMP_TARGET_KIND_OACC_KERNELS:
+    case GF_OMP_TARGET_KIND_OACC_SERIAL:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+    case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
       start_ix = BUILT_IN_GOACC_PARALLEL;
       break;
     case GF_OMP_TARGET_KIND_OACC_DATA:
     case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+    case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
       start_ix = BUILT_IN_GOACC_DATA_START;
       break;
     case GF_OMP_TARGET_KIND_OACC_UPDATE:
       start_ix = BUILT_IN_GOACC_UPDATE;
       break;
-    case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
-      start_ix = BUILT_IN_GOACC_ENTER_EXIT_DATA;
+    case GF_OMP_TARGET_KIND_OACC_ENTER_DATA:
+      start_ix = BUILT_IN_GOACC_ENTER_DATA;
+      break;
+    case GF_OMP_TARGET_KIND_OACC_EXIT_DATA:
+      start_ix = BUILT_IN_GOACC_EXIT_DATA;
       break;
     case GF_OMP_TARGET_KIND_OACC_DECLARE:
       start_ix = BUILT_IN_GOACC_DECLARE;
@@ -7326,54 +9903,74 @@ expand_omp_target (struct omp_region *region)
 
   clauses = gimple_omp_target_clauses (entry_stmt);
 
-  /* By default, the value of DEVICE is GOMP_DEVICE_ICV (let runtime
-     library choose) and there is no conditional.  */
-  cond = NULL_TREE;
-  device = build_int_cst (integer_type_node, GOMP_DEVICE_ICV);
-
-  c = omp_find_clause (clauses, OMP_CLAUSE_IF);
-  if (c)
-    cond = OMP_CLAUSE_IF_EXPR (c);
-
-  c = omp_find_clause (clauses, OMP_CLAUSE_DEVICE);
-  if (c)
+  tree device = NULL_TREE;
+  location_t device_loc = UNKNOWN_LOCATION;
+  tree goacc_flags = NULL_TREE;
+  if (is_gimple_omp_oacc (entry_stmt))
     {
-      /* Even if we pass it to all library function calls, it is currently only
-        defined/used for the OpenMP target ones.  */
-      gcc_checking_assert (start_ix == BUILT_IN_GOMP_TARGET
-                          || start_ix == BUILT_IN_GOMP_TARGET_DATA
-                          || start_ix == BUILT_IN_GOMP_TARGET_UPDATE
-                          || start_ix == BUILT_IN_GOMP_TARGET_ENTER_EXIT_DATA);
-
-      device = OMP_CLAUSE_DEVICE_ID (c);
-      clause_loc = OMP_CLAUSE_LOCATION (c);
+      /* By default, no GOACC_FLAGs are set.  */
+      goacc_flags = integer_zero_node;
     }
   else
-    clause_loc = gimple_location (entry_stmt);
-
-  c = omp_find_clause (clauses, OMP_CLAUSE_NOWAIT);
-  if (c)
-    flags_i |= GOMP_TARGET_FLAG_NOWAIT;
+    {
+      c = omp_find_clause (clauses, OMP_CLAUSE_DEVICE);
+      if (c)
+       {
+         device = OMP_CLAUSE_DEVICE_ID (c);
+         device_loc = OMP_CLAUSE_LOCATION (c);
+         if (OMP_CLAUSE_DEVICE_ANCESTOR (c))
+           sorry_at (device_loc, "%<ancestor%> not yet supported");
+       }
+      else
+       {
+         /* By default, the value of DEVICE is GOMP_DEVICE_ICV (let runtime
+            library choose).  */
+         device = build_int_cst (integer_type_node, GOMP_DEVICE_ICV);
+         device_loc = gimple_location (entry_stmt);
+       }
 
-  /* Ensure 'device' is of the correct type.  */
-  device = fold_convert_loc (clause_loc, integer_type_node, device);
+      c = omp_find_clause (clauses, OMP_CLAUSE_NOWAIT);
+      /* FIXME: in_reduction(...) nowait is not implemented yet; pretend
+        nowait doesn't appear.  */
+      if (c && omp_find_clause (clauses, OMP_CLAUSE_IN_REDUCTION))
+       c = NULL;
+      if (c)
+       flags_i |= GOMP_TARGET_FLAG_NOWAIT;
+    }
 
-  /* If we found the clause 'if (cond)', build
-     (cond ? device : GOMP_DEVICE_HOST_FALLBACK).  */
+  /* By default, there is no conditional.  */
+  tree cond = NULL_TREE;
+  c = omp_find_clause (clauses, OMP_CLAUSE_IF);
+  if (c)
+    cond = OMP_CLAUSE_IF_EXPR (c);
+  /* If we found the clause 'if (cond)', build:
+     OpenACC: goacc_flags = (cond ? goacc_flags
+                                  : goacc_flags | GOACC_FLAG_HOST_FALLBACK)
+     OpenMP: device = (cond ? device : GOMP_DEVICE_HOST_FALLBACK) */
   if (cond)
     {
+      tree *tp;
+      if (is_gimple_omp_oacc (entry_stmt))
+       tp = &goacc_flags;
+      else
+       {
+         /* Ensure 'device' is of the correct type.  */
+         device = fold_convert_loc (device_loc, integer_type_node, device);
+
+         tp = &device;
+       }
+
       cond = gimple_boolify (cond);
 
       basic_block cond_bb, then_bb, else_bb;
       edge e;
       tree tmp_var;
 
-      tmp_var = create_tmp_var (TREE_TYPE (device));
+      tmp_var = create_tmp_var (TREE_TYPE (*tp));
       if (offloaded)
        e = split_block_after_labels (new_bb);
       else
        {
-         gsi = gsi_last_bb (new_bb);
+         gsi = gsi_last_nondebug_bb (new_bb);
          gsi_prev (&gsi);
          e = split_block (new_bb, gsi_stmt (gsi));
        }
@@ -7391,13 +9988,20 @@ expand_omp_target (struct omp_region *region)
       gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
 
       gsi = gsi_start_bb (then_bb);
-      stmt = gimple_build_assign (tmp_var, device);
+      stmt = gimple_build_assign (tmp_var, *tp);
       gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
 
       gsi = gsi_start_bb (else_bb);
-      stmt = gimple_build_assign (tmp_var,
-                                 build_int_cst (integer_type_node,
-                                                GOMP_DEVICE_HOST_FALLBACK));
+      if (is_gimple_omp_oacc (entry_stmt))
+       stmt = gimple_build_assign (tmp_var,
+                                   BIT_IOR_EXPR,
+                                   *tp,
+                                   build_int_cst (integer_type_node,
+                                                  GOACC_FLAG_HOST_FALLBACK));
+      else
+       stmt = gimple_build_assign (tmp_var,
+                                   build_int_cst (integer_type_node,
+                                                  GOMP_DEVICE_HOST_FALLBACK));
       gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING);
 
       make_edge (cond_bb, then_bb, EDGE_TRUE_VALUE);
@@ -7407,14 +10011,17 @@ expand_omp_target (struct omp_region *region)
       make_edge (then_bb, new_bb, EDGE_FALLTHRU);
       make_edge (else_bb, new_bb, EDGE_FALLTHRU);
 
-      device = tmp_var;
-      gsi = gsi_last_bb (new_bb);
+      *tp = tmp_var;
+
+      gsi = gsi_last_nondebug_bb (new_bb);
     }
   else
     {
-      gsi = gsi_last_bb (new_bb);
-      device = force_gimple_operand_gsi (&gsi, device, true, NULL_TREE,
-                                        true, GSI_SAME_STMT);
+      gsi = gsi_last_nondebug_bb (new_bb);
+
+      if (device != NULL_TREE)
+       device = force_gimple_operand_gsi (&gsi, device, true, NULL_TREE,
+                                          true, GSI_SAME_STMT);
     }
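Either way, the value pushed further below reduces to a simple selection. A schematic summary of the basic blocks built above (assuming the macros from gomp-constants.h; not literal compiler output):

/* Host-fallback selection, schematically.  */
static int
select_device (int cond, int device)
{
  return cond ? device : GOMP_DEVICE_HOST_FALLBACK;
}

static int
select_goacc_flags (int cond, int goacc_flags)
{
  return cond ? goacc_flags : goacc_flags | GOACC_FLAG_HOST_FALLBACK;
}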
 
   t = gimple_omp_target_data_arg (entry_stmt);
@@ -7438,7 +10045,17 @@ expand_omp_target (struct omp_region *region)
   bool tagging = false;
   /* The maximum number used by any start_ix, without varargs.  */
   auto_vec<tree, 11> args;
-  args.quick_push (device);
+  if (is_gimple_omp_oacc (entry_stmt))
+    {
+      tree goacc_flags_m = fold_build1 (GOACC_FLAGS_MARSHAL_OP,
+                                       TREE_TYPE (goacc_flags), goacc_flags);
+      goacc_flags_m = force_gimple_operand_gsi (&gsi, goacc_flags_m, true,
+                                               NULL_TREE, true,
+                                               GSI_SAME_STMT);
+      args.quick_push (goacc_flags_m);
+    }
+  else
+    args.quick_push (device);
   if (offloaded)
     args.quick_push (build_fold_addr_expr (child_fn));
   args.quick_push (t1);
@@ -7465,10 +10082,22 @@ expand_omp_target (struct omp_region *region)
        args.quick_push (get_target_arguments (&gsi, entry_stmt));
       break;
     case BUILT_IN_GOACC_PARALLEL:
-      oacc_set_fn_attrib (child_fn, clauses, &args);
+      if (lookup_attribute ("oacc serial", DECL_ATTRIBUTES (child_fn)) != NULL)
+       {
+         tree dims = NULL_TREE;
+         unsigned int ix;
+
+         /* For serial constructs we set all dimensions to 1.  */
+         for (ix = GOMP_DIM_MAX; ix--;)
+           dims = tree_cons (NULL_TREE, integer_one_node, dims);
+         oacc_replace_fn_attrib (child_fn, dims);
+       }
+      else
+       oacc_set_fn_attrib (child_fn, clauses, &args);
       tagging = true;
       /* FALLTHRU */
-    case BUILT_IN_GOACC_ENTER_EXIT_DATA:
+    case BUILT_IN_GOACC_ENTER_DATA:
+    case BUILT_IN_GOACC_EXIT_DATA:
     case BUILT_IN_GOACC_UPDATE:
       {
        tree t_async = NULL_TREE;
@@ -7504,7 +10133,9 @@ expand_omp_target (struct omp_region *region)
                                              i_async));
          }
        if (t_async)
-         args.safe_push (t_async);
+         args.safe_push (force_gimple_operand_gsi (&gsi, t_async, true,
+                                                   NULL_TREE, true,
+                                                   GSI_SAME_STMT));
 
        /* Save the argument index, and ... */
        unsigned t_wait_idx = args.length ();
@@ -7517,9 +10148,12 @@ expand_omp_target (struct omp_region *region)
        for (; c; c = OMP_CLAUSE_CHAIN (c))
          if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_WAIT)
            {
-             args.safe_push (fold_convert_loc (OMP_CLAUSE_LOCATION (c),
-                                               integer_type_node,
-                                               OMP_CLAUSE_WAIT_EXPR (c)));
+             tree arg = fold_convert_loc (OMP_CLAUSE_LOCATION (c),
+                                          integer_type_node,
+                                          OMP_CLAUSE_WAIT_EXPR (c));
+             arg = force_gimple_operand_gsi (&gsi, arg, true, NULL_TREE, true,
+                                             GSI_SAME_STMT);
+             args.safe_push (arg);
              num_waits++;
            }
 
@@ -7554,313 +10188,6 @@ expand_omp_target (struct omp_region *region)
       gcc_assert (g && gimple_code (g) == GIMPLE_OMP_TARGET);
       gsi_remove (&gsi, true);
     }
-  if (data_region && region->exit)
-    {
-      gsi = gsi_last_bb (region->exit);
-      g = gsi_stmt (gsi);
-      gcc_assert (g && gimple_code (g) == GIMPLE_OMP_RETURN);
-      gsi_remove (&gsi, true);
-    }
-}
-
-/* Expand KFOR loop as an HSA gridified kernel, i.e. as a body only with
-   iteration variable derived from the thread number.  INTRA_GROUP means this
-   is an expansion of a loop iterating over work-items within a separate
-   iteration over groups.  */
-
-static void
-grid_expand_omp_for_loop (struct omp_region *kfor, bool intra_group)
-{
-  gimple_stmt_iterator gsi;
-  gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
-  gcc_checking_assert (gimple_omp_for_kind (for_stmt)
-                      == GF_OMP_FOR_KIND_GRID_LOOP);
-  size_t collapse = gimple_omp_for_collapse (for_stmt);
-  struct omp_for_data_loop *loops
-    = XALLOCAVEC (struct omp_for_data_loop,
-                 gimple_omp_for_collapse (for_stmt));
-  struct omp_for_data fd;
-
-  remove_edge (BRANCH_EDGE (kfor->entry));
-  basic_block body_bb = FALLTHRU_EDGE (kfor->entry)->dest;
-
-  gcc_assert (kfor->cont);
-  omp_extract_for_data (for_stmt, &fd, loops);
-
-  gsi = gsi_start_bb (body_bb);
-
-  for (size_t dim = 0; dim < collapse; dim++)
-    {
-      tree type, itype;
-      itype = type = TREE_TYPE (fd.loops[dim].v);
-      if (POINTER_TYPE_P (type))
-       itype = signed_type_for (type);
-
-      tree n1 = fd.loops[dim].n1;
-      tree step = fd.loops[dim].step;
-      n1 = force_gimple_operand_gsi (&gsi, fold_convert (type, n1),
-                                    true, NULL_TREE, true, GSI_SAME_STMT);
-      step = force_gimple_operand_gsi (&gsi, fold_convert (itype, step),
-                                      true, NULL_TREE, true, GSI_SAME_STMT);
-      tree threadid;
-      if (gimple_omp_for_grid_group_iter (for_stmt))
-       {
-         gcc_checking_assert (!intra_group);
-         threadid = build_call_expr (builtin_decl_explicit
-                                     (BUILT_IN_HSA_WORKGROUPID), 1,
-                                     build_int_cstu (unsigned_type_node, dim));
-       }
-      else if (intra_group)
-       threadid = build_call_expr (builtin_decl_explicit
-                                   (BUILT_IN_HSA_WORKITEMID), 1,
-                                   build_int_cstu (unsigned_type_node, dim));
-      else
-       threadid = build_call_expr (builtin_decl_explicit
-                                   (BUILT_IN_HSA_WORKITEMABSID), 1,
-                                   build_int_cstu (unsigned_type_node, dim));
-      threadid = fold_convert (itype, threadid);
-      threadid = force_gimple_operand_gsi (&gsi, threadid, true, NULL_TREE,
-                                          true, GSI_SAME_STMT);
-
-      tree startvar = fd.loops[dim].v;
-      tree t = fold_build2 (MULT_EXPR, itype, threadid, step);
-      if (POINTER_TYPE_P (type))
-       t = fold_build_pointer_plus (n1, t);
-      else
-       t = fold_build2 (PLUS_EXPR, type, t, n1);
-      t = fold_convert (type, t);
-      t = force_gimple_operand_gsi (&gsi, t,
-                                   DECL_P (startvar)
-                                   && TREE_ADDRESSABLE (startvar),
-                                   NULL_TREE, true, GSI_SAME_STMT);
-      gassign *assign_stmt = gimple_build_assign (startvar, t);
-      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
-    }
-  /* Remove the omp for statement.  */
-  gsi = gsi_last_bb (kfor->entry);
-  gsi_remove (&gsi, true);
-
-  /* Remove the GIMPLE_OMP_CONTINUE statement.  */
-  gsi = gsi_last_bb (kfor->cont);
-  gcc_assert (!gsi_end_p (gsi)
-             && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_CONTINUE);
-  gsi_remove (&gsi, true);
-
-  /* Replace the GIMPLE_OMP_RETURN with a barrier, if necessary.  */
-  gsi = gsi_last_bb (kfor->exit);
-  gcc_assert (!gsi_end_p (gsi)
-             && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
-  if (intra_group)
-    gsi_insert_before (&gsi, omp_build_barrier (NULL_TREE), GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
-
-  /* Fixup the much simpler CFG.  */
-  remove_edge (find_edge (kfor->cont, body_bb));
-
-  if (kfor->cont != body_bb)
-    set_immediate_dominator (CDI_DOMINATORS, kfor->cont, body_bb);
-  set_immediate_dominator (CDI_DOMINATORS, kfor->exit, kfor->cont);
-}
-
-/* Structure passed to grid_remap_kernel_arg_accesses so that it can remap
-   argument_decls.  */
-
-struct grid_arg_decl_map
-{
-  tree old_arg;
-  tree new_arg;
-};
-
-/* Invoked through walk_gimple_op, will remap all PARM_DECLs to the ones
-   pertaining to kernel function.  */
-
-static tree
-grid_remap_kernel_arg_accesses (tree *tp, int *walk_subtrees, void *data)
-{
-  struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
-  struct grid_arg_decl_map *adm = (struct grid_arg_decl_map *) wi->info;
-  tree t = *tp;
-
-  if (t == adm->old_arg)
-    *tp = adm->new_arg;
-  *walk_subtrees = !TYPE_P (t) && !DECL_P (t);
-  return NULL_TREE;
-}
-
-/* If TARGET region contains a kernel body for loop, remove its region from the
-   TARGET and expand it in HSA gridified kernel fashion.  */
-
-static void
-grid_expand_target_grid_body (struct omp_region *target)
-{
-  if (!hsa_gen_requested_p ())
-    return;
-
-  gomp_target *tgt_stmt = as_a <gomp_target *> (last_stmt (target->entry));
-  struct omp_region **pp;
-
-  for (pp = &target->inner; *pp; pp = &(*pp)->next)
-    if ((*pp)->type == GIMPLE_OMP_GRID_BODY)
-      break;
-
-  struct omp_region *gpukernel = *pp;
-
-  tree orig_child_fndecl = gimple_omp_target_child_fn (tgt_stmt);
-  if (!gpukernel)
-    {
-      /* HSA cannot handle OACC stuff.  */
-      if (gimple_omp_target_kind (tgt_stmt) != GF_OMP_TARGET_KIND_REGION)
-       return;
-      gcc_checking_assert (orig_child_fndecl);
-      gcc_assert (!omp_find_clause (gimple_omp_target_clauses (tgt_stmt),
-                                   OMP_CLAUSE__GRIDDIM_));
-      cgraph_node *n = cgraph_node::get (orig_child_fndecl);
-
-      hsa_register_kernel (n);
-      return;
-    }
-
-  gcc_assert (omp_find_clause (gimple_omp_target_clauses (tgt_stmt),
-                              OMP_CLAUSE__GRIDDIM_));
-  tree inside_block
-    = gimple_block (first_stmt (single_succ (gpukernel->entry)));
-  *pp = gpukernel->next;
-  for (pp = &gpukernel->inner; *pp; pp = &(*pp)->next)
-    if ((*pp)->type == GIMPLE_OMP_FOR)
-      break;
-
-  struct omp_region *kfor = *pp;
-  gcc_assert (kfor);
-  gomp_for *for_stmt = as_a <gomp_for *> (last_stmt (kfor->entry));
-  gcc_assert (gimple_omp_for_kind (for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP);
-  *pp = kfor->next;
-  if (kfor->inner)
-    {
-      if (gimple_omp_for_grid_group_iter (for_stmt))
-       {
-         struct omp_region **next_pp;
-         for (pp = &kfor->inner; *pp; pp = next_pp)
-           {
-             next_pp = &(*pp)->next;
-             if ((*pp)->type != GIMPLE_OMP_FOR)
-               continue;
-             gomp_for *inner = as_a <gomp_for *> (last_stmt ((*pp)->entry));
-             gcc_assert (gimple_omp_for_kind (inner)
-                         == GF_OMP_FOR_KIND_GRID_LOOP);
-             grid_expand_omp_for_loop (*pp, true);
-             *pp = (*pp)->next;
-             next_pp = pp;
-           }
-       }
-      expand_omp (kfor->inner);
-    }
-  if (gpukernel->inner)
-    expand_omp (gpukernel->inner);
-
-  tree kern_fndecl = copy_node (orig_child_fndecl);
-  DECL_NAME (kern_fndecl) = clone_function_name (kern_fndecl, "kernel");
-  SET_DECL_ASSEMBLER_NAME (kern_fndecl, DECL_NAME (kern_fndecl));
-  tree tgtblock = gimple_block (tgt_stmt);
-  tree fniniblock = make_node (BLOCK);
-  BLOCK_ABSTRACT_ORIGIN (fniniblock) = tgtblock;
-  BLOCK_SOURCE_LOCATION (fniniblock) = BLOCK_SOURCE_LOCATION (tgtblock);
-  BLOCK_SOURCE_END_LOCATION (fniniblock) = BLOCK_SOURCE_END_LOCATION (tgtblock);
-  BLOCK_SUPERCONTEXT (fniniblock) = kern_fndecl;
-  DECL_INITIAL (kern_fndecl) = fniniblock;
-  push_struct_function (kern_fndecl);
-  cfun->function_end_locus = gimple_location (tgt_stmt);
-  init_tree_ssa (cfun);
-  pop_cfun ();
-
-  /* Make sure to generate early debug for the function before
-     outlining anything.  */
-  if (! gimple_in_ssa_p (cfun))
-    (*debug_hooks->early_global_decl) (cfun->decl);
-
-  tree old_parm_decl = DECL_ARGUMENTS (kern_fndecl);
-  gcc_assert (!DECL_CHAIN (old_parm_decl));
-  tree new_parm_decl = copy_node (DECL_ARGUMENTS (kern_fndecl));
-  DECL_CONTEXT (new_parm_decl) = kern_fndecl;
-  DECL_ARGUMENTS (kern_fndecl) = new_parm_decl;
-  gcc_assert (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (kern_fndecl))));
-  DECL_RESULT (kern_fndecl) = copy_node (DECL_RESULT (kern_fndecl));
-  DECL_CONTEXT (DECL_RESULT (kern_fndecl)) = kern_fndecl;
-  struct function *kern_cfun = DECL_STRUCT_FUNCTION (kern_fndecl);
-  kern_cfun->curr_properties = cfun->curr_properties;
-
-  grid_expand_omp_for_loop (kfor, false);
-
-  /* Remove the omp for statement.  */
-  gimple_stmt_iterator gsi = gsi_last_bb (gpukernel->entry);
-  gsi_remove (&gsi, true);
-  /* Replace the GIMPLE_OMP_RETURN at the end of the kernel region with a real
-     return.  */
-  gsi = gsi_last_bb (gpukernel->exit);
-  gcc_assert (!gsi_end_p (gsi)
-             && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
-  gimple *ret_stmt = gimple_build_return (NULL);
-  gsi_insert_after (&gsi, ret_stmt, GSI_SAME_STMT);
-  gsi_remove (&gsi, true);
-
-  /* Statements in the first BB in the target construct have been produced by
-     target lowering and must be copied inside the GPUKERNEL, with the two
-     exceptions of the first OMP statement and the OMP_DATA assignment
-     statement.  */
-  gsi = gsi_start_bb (single_succ (gpukernel->entry));
-  tree data_arg = gimple_omp_target_data_arg (tgt_stmt);
-  tree sender = data_arg ? TREE_VEC_ELT (data_arg, 0) : NULL;
-  for (gimple_stmt_iterator tsi = gsi_start_bb (single_succ (target->entry));
-       !gsi_end_p (tsi); gsi_next (&tsi))
-    {
-      gimple *stmt = gsi_stmt (tsi);
-      if (is_gimple_omp (stmt))
-       break;
-      if (sender
-         && is_gimple_assign (stmt)
-         && TREE_CODE (gimple_assign_rhs1 (stmt)) == ADDR_EXPR
-         && TREE_OPERAND (gimple_assign_rhs1 (stmt), 0) == sender)
-       continue;
-      gimple *copy = gimple_copy (stmt);
-      gsi_insert_before (&gsi, copy, GSI_SAME_STMT);
-      gimple_set_block (copy, fniniblock);
-    }
-
-  move_sese_region_to_fn (kern_cfun, single_succ (gpukernel->entry),
-                         gpukernel->exit, inside_block);
-
-  cgraph_node *kcn = cgraph_node::get_create (kern_fndecl);
-  kcn->mark_force_output ();
-  cgraph_node *orig_child = cgraph_node::get (orig_child_fndecl);
-
-  hsa_register_kernel (kcn, orig_child);
-
-  cgraph_node::add_new_function (kern_fndecl, true);
-  push_cfun (kern_cfun);
-  cgraph_edge::rebuild_edges ();
-
-  /* Re-map any mention of the PARM_DECL of the original function to the
-     PARM_DECL of the new one.
-
-     TODO: It would be great if lowering produced references into the GPU
-     kernel decl straight away and we did not have to do this.  */
-  struct grid_arg_decl_map adm;
-  adm.old_arg = old_parm_decl;
-  adm.new_arg = new_parm_decl;
-  basic_block bb;
-  FOR_EACH_BB_FN (bb, kern_cfun)
-    {
-      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         gimple *stmt = gsi_stmt (gsi);
-         struct walk_stmt_info wi;
-         memset (&wi, 0, sizeof (wi));
-         wi.info = &adm;
-         walk_gimple_op (stmt, grid_remap_kernel_arg_accesses, &wi);
-       }
-    }
-  pop_cfun ();
-
-  return;
 }
 
 /* Expand the parallel region tree rooted at REGION.  Expansion
@@ -7882,8 +10209,6 @@ expand_omp (struct omp_region *region)
         region.  */
       if (region->type == GIMPLE_OMP_PARALLEL)
        determine_parallel_type (region);
-      else if (region->type == GIMPLE_OMP_TARGET)
-       grid_expand_target_grid_body (region);
 
       if (region->type == GIMPLE_OMP_FOR
          && gimple_omp_for_combined_p (last_stmt (region->entry)))
@@ -7917,6 +10242,7 @@ expand_omp (struct omp_region *region)
          break;
 
        case GIMPLE_OMP_SINGLE:
+       case GIMPLE_OMP_SCOPE:
          expand_omp_single (region);
          break;
 
@@ -7937,6 +10263,7 @@ expand_omp (struct omp_region *region)
          }
          /* FALLTHRU */
        case GIMPLE_OMP_MASTER:
+       case GIMPLE_OMP_MASKED:
        case GIMPLE_OMP_TASKGROUP:
        case GIMPLE_OMP_CRITICAL:
        case GIMPLE_OMP_TEAMS:
@@ -7979,7 +10306,7 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
   gimple *stmt;
   basic_block son;
 
-  gsi = gsi_last_bb (bb);
+  gsi = gsi_last_nondebug_bb (bb);
   if (!gsi_end_p (gsi) && is_gimple_omp (gsi_stmt (gsi)))
     {
       struct omp_region *region;
@@ -8027,17 +10354,22 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
              switch (gimple_omp_target_kind (stmt))
                {
                case GF_OMP_TARGET_KIND_REGION:
-               case GF_OMP_TARGET_KIND_DATA:
                case GF_OMP_TARGET_KIND_OACC_PARALLEL:
                case GF_OMP_TARGET_KIND_OACC_KERNELS:
-               case GF_OMP_TARGET_KIND_OACC_DATA:
-               case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+               case GF_OMP_TARGET_KIND_OACC_SERIAL:
+               case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+               case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
                  break;
                case GF_OMP_TARGET_KIND_UPDATE:
                case GF_OMP_TARGET_KIND_ENTER_DATA:
                case GF_OMP_TARGET_KIND_EXIT_DATA:
+               case GF_OMP_TARGET_KIND_DATA:
+               case GF_OMP_TARGET_KIND_OACC_DATA:
+               case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+               case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
                case GF_OMP_TARGET_KIND_OACC_UPDATE:
-               case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
+               case GF_OMP_TARGET_KIND_OACC_ENTER_DATA:
+               case GF_OMP_TARGET_KIND_OACC_EXIT_DATA:
                case GF_OMP_TARGET_KIND_OACC_DECLARE:
                  /* ..., other than for those stand-alone directives...  */
                  region = NULL;
@@ -8053,6 +10385,10 @@ build_omp_regions_1 (basic_block bb, struct omp_region *parent,
            /* #pragma omp ordered depend is also just a stand-alone
               directive.  */
            region = NULL;
+         else if (code == GIMPLE_OMP_TASK
+                  && gimple_omp_task_taskwait_p (stmt))
+           /* #pragma omp taskwait depend(...) is a stand-alone directive.  */
+           region = NULL;
          /* ..., this directive becomes the parent for a new region.  */
          if (region)
            parent = region;
@@ -8166,7 +10502,7 @@ public:
   /* opt_pass methods: */
   virtual unsigned int execute (function *)
     {
-      bool gate = ((flag_cilkplus != 0 || flag_openacc != 0 || flag_openmp != 0
+      bool gate = ((flag_openacc != 0 || flag_openmp != 0
                    || flag_openmp_simd != 0)
                   && !seen_error ());
 
@@ -8243,19 +10579,26 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
   switch (code)
     {
     case GIMPLE_OMP_PARALLEL:
-    case GIMPLE_OMP_TASK:
     case GIMPLE_OMP_FOR:
     case GIMPLE_OMP_SINGLE:
     case GIMPLE_OMP_TEAMS:
     case GIMPLE_OMP_MASTER:
+    case GIMPLE_OMP_MASKED:
+    case GIMPLE_OMP_SCOPE:
     case GIMPLE_OMP_TASKGROUP:
     case GIMPLE_OMP_CRITICAL:
     case GIMPLE_OMP_SECTION:
-    case GIMPLE_OMP_GRID_BODY:
       cur_region = new_omp_region (bb, code, cur_region);
       fallthru = true;
       break;
 
+    case GIMPLE_OMP_TASK:
+      cur_region = new_omp_region (bb, code, cur_region);
+      fallthru = true;
+      if (gimple_omp_task_taskwait_p (last))
+       cur_region = cur_region->outer;
+      break;
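The taskwait form special-cased here is the stand-alone directive from OpenMP 5.0, for example (illustrative source only):

/* A taskwait with a depend clause is stand-alone: the region opened
   above is closed again immediately.  */
void
g (int *x)
{
  #pragma omp task depend(out: x[0])
  x[0] = 1;
  #pragma omp taskwait depend(in: x[0])
}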
+
     case GIMPLE_OMP_ORDERED:
       cur_region = new_omp_region (bb, code, cur_region);
       fallthru = true;
@@ -8271,17 +10614,22 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
       switch (gimple_omp_target_kind (last))
        {
        case GF_OMP_TARGET_KIND_REGION:
-       case GF_OMP_TARGET_KIND_DATA:
        case GF_OMP_TARGET_KIND_OACC_PARALLEL:
        case GF_OMP_TARGET_KIND_OACC_KERNELS:
-       case GF_OMP_TARGET_KIND_OACC_DATA:
-       case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+       case GF_OMP_TARGET_KIND_OACC_SERIAL:
+       case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_PARALLELIZED:
+       case GF_OMP_TARGET_KIND_OACC_PARALLEL_KERNELS_GANG_SINGLE:
          break;
        case GF_OMP_TARGET_KIND_UPDATE:
        case GF_OMP_TARGET_KIND_ENTER_DATA:
        case GF_OMP_TARGET_KIND_EXIT_DATA:
+       case GF_OMP_TARGET_KIND_DATA:
+       case GF_OMP_TARGET_KIND_OACC_DATA:
+       case GF_OMP_TARGET_KIND_OACC_HOST_DATA:
+       case GF_OMP_TARGET_KIND_OACC_DATA_KERNELS:
        case GF_OMP_TARGET_KIND_OACC_UPDATE:
-       case GF_OMP_TARGET_KIND_OACC_ENTER_EXIT_DATA:
+       case GF_OMP_TARGET_KIND_OACC_ENTER_DATA:
+       case GF_OMP_TARGET_KIND_OACC_EXIT_DATA:
        case GF_OMP_TARGET_KIND_OACC_DECLARE:
          cur_region = cur_region->outer;
          break;
@@ -8385,5 +10733,3 @@ omp_make_gimple_edges (basic_block bb, struct omp_region **region,
 
   return fallthru;
 }
-
-#include "gt-omp-expand.h"