ass = gimple_build_assign (chunk_no, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
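+ /* The extra trailing argument is the "noalias" flag read by the
+ IFN_GOACC_LOOP lowering below; integer_one_node here means the loop is
+ assumed free of aliasing and may be distributed across all threads.  */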
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_CHUNKS),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, chunk_max);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
else
chunk_size = chunk_no;
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_STEP),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
/* Loop offset & bound go into head_bb. */
gsi = gsi_start_bb (head_bb);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_OFFSET),
- dir, range, s,
- chunk_size, gwv, chunk_no);
+ dir, range, s, chunk_size, gwv, chunk_no,
+ integer_one_node);
gimple_call_set_lhs (call, offset_init);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_BOUND),
- dir, range, s,
- chunk_size, gwv, offset_init);
+ dir, range, s, chunk_size, gwv,
+ offset_init, integer_one_node);
gimple_call_set_lhs (call, bound);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, chunk);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, chunk,
+ integer_one_node);
gimple_call_set_lhs (call, e_offset);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, e_offset);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, e_offset,
+ integer_one_node);
gimple_call_set_lhs (call, e_bound);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
- element_s, chunk, e_gwv);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv,
+ integer_one_node);
gimple_call_set_lhs (call, e_step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
tree vf_by_vectorizer = NULL_TREE;
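+ /* Last IFN_GOACC_LOOP argument; zero means the loop may access aliased
+ storage, in which case the expansion below confines execution to a
+ single thread.  */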
+ tree noalias = NULL_TREE;
/* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
if (!lhs)
switch (code)
{
- default: gcc_unreachable ();
+ default:
+ gcc_unreachable ();
case IFN_GOACC_LOOP_CHUNKS:
+ noalias = gimple_call_arg (call, 6);
if (!chunking)
- r = build_int_cst (type, 1);
+ r = build_int_cst (type, 1);
else
- {
- /* chunk_max
- = (range - dir) / (chunks * step * num_threads) + dir */
- tree per = oacc_thread_numbers (false, mask, &seq);
- per = fold_convert (type, per);
- chunk_size = fold_convert (type, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, step);
- r = fold_build2 (MINUS_EXPR, type, range, dir);
- r = fold_build2 (PLUS_EXPR, type, r, per);
- r = build2 (TRUNC_DIV_EXPR, type, r, per);
- }
+ {
+ /* chunk_max
+ = (range - dir) / (chunks * step * num_threads) + dir */
+ tree per = oacc_thread_numbers (false, mask, &seq);
+ per = fold_convert (type, per);
+ noalias = fold_convert (type, noalias);
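+ /* If aliasing may prevent parallelization (noalias == 0), clamp the
+ effective thread count to one when computing chunk_max.  */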
+ per = fold_build2 (MULT_EXPR, type, per, noalias);
+ per = fold_build2 (MAX_EXPR, type, per,
+ build_int_cst (type, 1));
+ chunk_size = fold_convert (type, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, step);
+ r = fold_build2 (MINUS_EXPR, type, range, dir);
+ r = fold_build2 (PLUS_EXPR, type, r, per);
+ r = build2 (TRUNC_DIV_EXPR, type, r, per);
+ }
break;
case IFN_GOACC_LOOP_STEP:
+ noalias = gimple_call_arg (call, 6);
{
- if (vf_by_vectorizer)
- r = step;
- else
- {
- /* If striding, step by the entire compute volume, otherwise
- step by the inner volume. */
- unsigned volume = striding ? mask : inner_mask;
-
- r = oacc_thread_numbers (false, volume, &seq);
- r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
- }
+ if (vf_by_vectorizer)
+ r = step;
+ else
+ {
+ /* If striding, step by the entire compute volume, otherwise
+ step by the inner volume. */
+ unsigned volume = striding ? mask : inner_mask;
+
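+ /* Scale the step by the thread count, but clamp it to a single
+ thread's step when aliasing may prevent parallelization
+ (noalias == 0).  */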
+ noalias = fold_convert (type, noalias);
+ r = oacc_thread_numbers (false, volume, &seq);
+ r = fold_convert (type, r);
+ r = build2 (MULT_EXPR, type, r, noalias);
+ r = build2 (MAX_EXPR, type, r, build_int_cst (type, 1));
+ r = build2 (MULT_EXPR, type, r, step);
+ }
+ break;
}
- break;
-
- case IFN_GOACC_LOOP_OFFSET:
- if (vf_by_vectorizer)
- {
- /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
- the loop. */
- if (flag_tree_loop_vectorize
- || !global_options_set.x_flag_tree_loop_vectorize)
- {
- /* Enable vectorization on non-SIMT targets. */
- basic_block bb = gsi_bb (gsi);
- class loop *chunk_loop = bb->loop_father;
- class loop *inner_loop = chunk_loop->inner;
-
- /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
- so we know that the outer chunking loop will be executed just
- once and the inner loop is the one which must be
- vectorized (unless it has been optimized out for some
- reason). */
- gcc_assert (!chunking);
-
- if (inner_loop)
- {
- inner_loop->force_vectorize = true;
- inner_loop->safelen = INT_MAX;
-
- cfun->has_force_vectorize_loops = true;
- }
- }
- /* ...and expand the abstract loops such that the vectorizer can
- work on them more effectively.
-
- It might be nicer to merge this code with the "!striding" case
- below, particularly if chunking support is added. */
- tree warppos
- = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
- warppos = fold_convert (diff_type, warppos);
-
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
-
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
- r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
- }
- else if (striding)
- {
- r = oacc_thread_numbers (true, mask, &seq);
- r = fold_convert (diff_type, r);
- }
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
- r = oacc_thread_numbers (true, outer_mask, &seq);
- r = fold_convert (diff_type, r);
- r = build2 (MULT_EXPR, diff_type, r, span);
-
- tree inner = oacc_thread_numbers (true, inner_mask, &seq);
- inner = fold_convert (diff_type, inner);
- r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
-
- if (chunking)
- {
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- tree per
- = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
- per = build2 (MULT_EXPR, diff_type, per, chunk);
-
- r = build2 (PLUS_EXPR, diff_type, r, per);
- }
- }
- r = fold_build2 (MULT_EXPR, diff_type, r, step);
- if (type != diff_type)
- r = fold_convert (type, r);
- break;
-
- case IFN_GOACC_LOOP_BOUND:
- if (vf_by_vectorizer)
- {
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
- tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
- vf_by_vectorizer);
- vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
- tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
- vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
- r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
- range, vecend);
- }
- else if (striding)
- r = range;
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
-
- r = fold_build2 (MULT_EXPR, diff_type, span, step);
+ case IFN_GOACC_LOOP_OFFSET:
+ noalias = gimple_call_arg (call, 7);
+ if (vf_by_vectorizer)
+ {
+ /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
+ the loop. */
+ if (flag_tree_loop_vectorize
+ || !global_options_set.x_flag_tree_loop_vectorize)
+ {
+ /* Enable vectorization on non-SIMT targets. */
+ basic_block bb = gsi_bb (gsi);
+ class loop *chunk_loop = bb->loop_father;
+ class loop *inner_loop = chunk_loop->inner;
+
+ /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+ so we know that the outer chunking loop will be executed
+ just once and the inner loop is the one which must be
+ vectorized (unless it has been optimized out for some
+ reason). */
+ gcc_assert (!chunking);
+
+ if (inner_loop)
+ {
+ inner_loop->force_vectorize = true;
+ inner_loop->safelen = INT_MAX;
+
+ cfun->has_force_vectorize_loops = true;
+ }
+ }
+
+ /* ...and expand the abstract loops such that the vectorizer can
+ work on them more effectively.
+
+ It might be nicer to merge this code with the "!striding" case
+ below, particularly if chunking support is added. */
+ tree warppos
+ = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+ warppos = fold_convert (diff_type, warppos);
+
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+ tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+ chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+ r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
+ }
+ else if (striding)
+ {
+ r = oacc_thread_numbers (true, mask, &seq);
+ r = fold_convert (diff_type, r);
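+ /* When aliasing may prevent parallelization (noalias == 0), push the
+ offset past the range in every thread but the first so that only
+ one thread executes the loop.  */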
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, r,
+ build_int_cst (diff_type, 0));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ build_int_cst (TREE_TYPE (noalias), 0));
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ tree tmp4 = build2 (MULT_EXPR, diff_type, tmp3, range);
+ r = build2 (PLUS_EXPR, diff_type, r, tmp4);
+ }
+ else
+ {
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ /* Curtail the range in all but one thread when aliasing may
+ prevent parallelization.  */
+ tree n = oacc_thread_numbers (true, mask, &seq);
+ n = fold_convert (diff_type, n);
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, n,
+ build_int_cst (diff_type, 0));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ build_int_cst (TREE_TYPE (noalias), 0));
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ range = build2 (MULT_EXPR, diff_type, tmp3, range);
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+ r = oacc_thread_numbers (true, outer_mask, &seq);
+ r = fold_convert (diff_type, r);
+ r = build2 (PLUS_EXPR, diff_type, r, range);
+ r = build2 (MULT_EXPR, diff_type, r, span);
+
+ tree inner = oacc_thread_numbers (true, inner_mask, &seq);
+
+ inner = fold_convert (diff_type, inner);
+ r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+ if (chunking)
+ {
+ tree chunk
+ = fold_convert (diff_type, gimple_call_arg (call, 6));
+ tree per
+ = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+ per = build2 (MULT_EXPR, diff_type, per, chunk);
+
+ r = build2 (PLUS_EXPR, diff_type, r, per);
+ }
+ }
+ r = fold_build2 (MULT_EXPR, diff_type, r, step);
+ if (type != diff_type)
+ r = fold_convert (type, r);
+ break;
- tree offset = gimple_call_arg (call, 6);
- r = build2 (PLUS_EXPR, diff_type, r,
- fold_convert (diff_type, offset));
- r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
- diff_type, r, range);
- }
- if (diff_type != type)
- r = fold_convert (type, r);
- break;
+ case IFN_GOACC_LOOP_BOUND:
+ if (vf_by_vectorizer)
+ {
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+ tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+ vf_by_vectorizer);
+ vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+ tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+ vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+ r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
+ diff_type, range, vecend);
+ }
+ else if (striding)
+ r = range;
+ else
+ {
+ noalias = fold_convert (diff_type, gimple_call_arg (call, 7));
+
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
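+ /* As in the OFFSET expansion, treat the volume as a single thread
+ when aliasing may prevent parallelization (noalias == 0).  */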
+ volume = fold_build2 (MULT_EXPR, diff_type, volume, noalias);
+ volume = fold_build2 (MAX_EXPR, diff_type, volume,
+ build_int_cst (diff_type, 1));
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+
+ r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+ tree offset = gimple_call_arg (call, 6);
+ r = build2 (PLUS_EXPR, diff_type, r,
+ fold_convert (diff_type, offset));
+ r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type, r,
+ range);
+ }
+ if (diff_type != type)
+ r = fold_convert (type, r);
+ break;
}
gimplify_assign (lhs, r, &seq);