[og10] openacc: Adjust loop lowering for AMD GCN

author Julian Brown <julian@codesourcery.com>

Fri, 6 Nov 2020 23:17:29 +0000 (15:17 -0800)

committer Kwok Cheung Yeung <kcy@codesourcery.com>

Tue, 21 Jun 2022 13:11:29 +0000 (14:11 +0100)
author Julian Brown <julian@codesourcery.com>
Fri, 6 Nov 2020 23:17:29 +0000 (15:17 -0800)
committer Kwok Cheung Yeung <kcy@codesourcery.com>
Tue, 21 Jun 2022 13:11:29 +0000 (14:11 +0100)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp

index 862c5c2dd31e8fa81bc8df00833866a02da68c1f..74b7f799ccaa6da7b4f6ee27c4cfd6c98bec20c7 100644 (file)
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,10 @@
+2021-01-13  Julian Brown  <julian@codesourcery.com>
+
+       * omp-offload.cc (oacc_thread_numbers): Add VF_BY_VECTORIZER parameter.
+       Add overloaded wrapper for previous arguments & behaviour.
+       (oacc_xform_loop): Lower vector loops to iterate a multiple of
+       omp_max_vf times over contiguous steps on non-SIMT targets.
+
  2020-07-27  Andrew Stubbs  <ams@codesourcery.com>
  
         * dwarf2cfi.cc (get_cfa_from_loc_descr): Support register spans
diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc

index 6131479b16f0715b6bdea44e2b35bfd50acabe14..ac4d4cfaff6abb145f539deabaefc8f87fbba310 100644 (file)
--- a/gcc/omp-offload.cc
+++ b/gcc/omp-offload.cc
@@ -491,11 +491,13 @@ oacc_dim_call (bool pos, int dim, gimple_seq *seq)
  }
  
  /* Find the number of threads (POS = false), or thread number (POS =
-   true) for an OpenACC region partitioned as MASK.  Setup code
+   true) for an OpenACC region partitioned as MASK.  If VF_BY_VECTORIZER is
+   non-NULL, use it as the size of the vector dimension (and zero as the
+   position within it) instead of calling the builtin function.  Setup code
     required for the calculation is added to SEQ.  */
  
  static tree
-oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+oacc_thread_numbers (bool pos, int mask, tree vf_by_vectorizer, gimple_seq *seq)
  {
    tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
    unsigned ix;
@@ -508,13 +510,15 @@ oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
           {
             /* We had an outer index, so scale that by the size of
                this dimension.  */
-           tree n = oacc_dim_call (false, ix, seq);
+           tree n = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+                    ? vf_by_vectorizer : oacc_dim_call (false, ix, seq);
             res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
           }
         if (pos)
           {
             /* Determine index in this dimension.  */
-           tree id = oacc_dim_call (true, ix, seq);
+           tree id = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+                     ? integer_zero_node :  oacc_dim_call (true, ix, seq);
             if (res)
               res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
             else
@@ -528,6 +532,12 @@ oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
    return res;
  }
  
+static tree
+oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+{
+  return oacc_thread_numbers (pos, mask, NULL_TREE, seq);
+}
+
  /* Transform IFN_GOACC_LOOP calls to actual code.  See
     expand_oacc_for for where these are generated.  At the vector
     level, we stride loops, such that each member of a warp will
@@ -555,6 +565,7 @@ oacc_xform_loop (gcall *call)
    bool chunking = false, striding = true;
    unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
    unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
+  tree vf_by_vectorizer = NULL_TREE;
  
    /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
    if (!lhs)
@@ -582,16 +593,39 @@ oacc_xform_loop (gcall *call)
        striding = integer_onep (chunk_size);
        chunking = !striding;
      }
+
+  if (!chunking
+      && !targetm.simt.vf
+      && (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+    {
+      poly_uint64 max_vf = omp_max_vf ();
+      vf_by_vectorizer = build_int_cst (integer_type_node, max_vf);
+    }
+
  #endif
  
-  /* striding=true, chunking=true
+  /* For SIMT targets:
+
+     striding=true, chunking=true
         -> invalid.
       striding=true, chunking=false
         -> chunks=1
       striding=false,chunking=true
         -> chunks=ceil (range/(chunksize*threads*step))
       striding=false,chunking=false
-       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
+       -> chunk_size=ceil(range/(threads*step)),chunks=1
+
+     For non-SIMT targets:
+
+      striding=N/A, chunking=true
+       -> as above, for now.
+      striding=N/A, chunking=false
+       -> chunks=1
+          threads=gangs*workers*vf
+          chunk_size=ceil(range/(threads*step))
+          inner chunking loop steps by "step", vf*chunk_size times.
+  */
+
    push_gimplify_context (true);
  
    switch (code)
@@ -610,49 +644,83 @@ oacc_xform_loop (gcall *call)
           chunk_size = fold_convert (type, chunk_size);
           per = fold_build2 (MULT_EXPR, type, per, chunk_size);
           per = fold_build2 (MULT_EXPR, type, per, step);
-         r = build2 (MINUS_EXPR, type, range, dir);
-         r = build2 (PLUS_EXPR, type, r, per);
+         r = fold_build2 (MINUS_EXPR, type, range, dir);
+         r = fold_build2 (PLUS_EXPR, type, r, per);
           r = build2 (TRUNC_DIV_EXPR, type, r, per);
         }
        break;
  
      case IFN_GOACC_LOOP_STEP:
        {
-       /* If striding, step by the entire compute volume, otherwise
-          step by the inner volume.  */
-       unsigned volume = striding ? mask : inner_mask;
+       if (vf_by_vectorizer)
+         r = step;
+       else
+         {
+           /* If striding, step by the entire compute volume, otherwise
+              step by the inner volume.  */
+           unsigned volume = striding ? mask : inner_mask;
  
-       r = oacc_thread_numbers (false, volume, &seq);
-       r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+           r = oacc_thread_numbers (false, volume, &seq);
+           r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+         }
        }
        break;
  
      case IFN_GOACC_LOOP_OFFSET:
-      /* Enable vectorization on non-SIMT targets.  */
-      if (!targetm.simt.vf
-         && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+      if (vf_by_vectorizer)
+       {
           /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
              the loop.  */
-         && (flag_tree_loop_vectorize
-             || !OPTION_SET_P (flag_tree_loop_vectorize)))
-       {
-         basic_block bb = gsi_bb (gsi);
-         class loop *parent = bb->loop_father;
-         class loop *body = parent->inner;
-
-         parent->force_vectorize = true;
-         parent->safelen = INT_MAX;
-
-         /* "Chunking loops" may have inner loops.  */
-         if (parent->inner)
+         if (flag_tree_loop_vectorize
+             || !OPTION_SET_P (flag_tree_loop_vectorize))
             {
-             body->force_vectorize = true;
-             body->safelen = INT_MAX;
+             /* Enable vectorization on non-SIMT targets.  */
+             basic_block bb = gsi_bb (gsi);
+             class loop *chunk_loop = bb->loop_father;
+             class loop *inner_loop = chunk_loop->inner;
+
+             /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+                so we know that the outer chunking loop will be executed just
+                once and the inner loop is the one which must be
+                vectorized (unless it has been optimized out for some
+                reason).  */
+             gcc_assert (!chunking);
+
+             if (inner_loop)
+               {
+                 inner_loop->force_vectorize = true;
+                 inner_loop->safelen = INT_MAX;
+
+                 cfun->has_force_vectorize_loops = true;
+               }
             }
  
-         cfun->has_force_vectorize_loops = true;
+         /* ...and expand the abstract loops such that the vectorizer can
+            work on them more effectively.
+
+            It might be nicer to merge this code with the "!striding" case
+            below, particularly if chunking support is added.  */
+         tree warppos
+           = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+         warppos = fold_convert (diff_type, warppos);
+
+         tree volume
+           = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+         volume = fold_convert (diff_type, volume);
+
+         tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+         chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+         chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+         chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+                                   per);
+
+         warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+         tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+         chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+         r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
         }
-      if (striding)
+      else if (striding)
         {
           r = oacc_thread_numbers (true, mask, &seq);
           r = fold_convert (diff_type, r);
@@ -670,7 +738,7 @@ oacc_xform_loop (gcall *call)
           else
             {
               tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+             /* chunk_size = (range - dir + per) / per.  */
               chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
               chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
               chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
@@ -702,7 +770,28 @@ oacc_xform_loop (gcall *call)
        break;
  
      case IFN_GOACC_LOOP_BOUND:
-      if (striding)
+      if (vf_by_vectorizer)
+       {
+         tree volume
+           = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+         volume = fold_convert (diff_type, volume);
+
+         tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+         chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+         chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+         chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+                                   per);
+
+         vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+         tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+                                     vf_by_vectorizer);
+         vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+         tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+         vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+         r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
+                          range, vecend);
+       }
+      else if (striding)
         r = range;
        else
         {
@@ -717,7 +806,7 @@ oacc_xform_loop (gcall *call)
           else
             {
               tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+             /* chunk_size = (range - dir + per) / per.  */
               chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
               chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
               chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp

index f891f8ea2525b734947f902502efe1249bc0566e..c3c9c7b5486a5d4f05cbd2bdad4ee2b4f6be217a 100644 (file)
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,13 @@
+2021-01-13  Julian Brown  <julian@codesourcery.com>
+
+       * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for loop
+       lowering changes.
+       * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
+
  2020-07-16  Tobias Burnus  <tobias@codesourcery.com>
  
         * testsuite/libgomp.oacc-fortran/firstprivate-int.f90: Use
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c

index d3f6ea24e7e3739733e74cf2ec0ee20258d4d925..18d56f6dd4bacaf93d234ef969d9617a4692eebe 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -67,12 +67,23 @@ int main ()
        int expected = ix;
        if(ondev)
         {
-         int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-                          / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+         int use_vectorsize = 64;
+#else
+         int use_vectorsize = vectorsize;
+#endif
+         int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+                          / (gangsize * workersize * use_vectorsize);
           
+#ifdef ACC_DEVICE_TYPE_radeon
+         int g = ix / (chunk_size * workersize * use_vectorsize);
+         int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+         int v = 0;
+#else
           int g = ix / (chunk_size * workersize * vectorsize);
           int w = (ix / vectorsize) % workersize;
           int v = ix % vectorsize;
+#endif
  
           expected = (g << 16) | (w << 8) | v;
         }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c

index 4099d6072da98389d58648c5cb9a56c36b0eb52b..e29e89dc00c428e8f5188980e0c4125e42ea2820 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -64,12 +64,23 @@ int main ()
        int val = ix;
        if (ondev)
         {
-         int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-                          / (gangsize * workersize * vectorsize);
-         
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+         int use_vectorsize = 64;
+#else
+         int use_vectorsize = vectorsize;
+#endif
+         int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+                          / (gangsize * workersize * use_vectorsize);
+
+#ifdef ACC_DEVICE_TYPE_radeon
+         int g = ix / (chunk_size * workersize * use_vectorsize);
+         int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+         int v = 0;
+#else
           int g = ix / (chunk_size * vectorsize * workersize);
           int w = ix / vectorsize % workersize;
           int v = ix % vectorsize;
+#endif
  
           val = (g << 16) | (w << 8) | v;
         }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c

index fadb2627f73ef78dd2241f5e7c6aabea9aa05568..616cf50202c8a16f28ddfbc744ca1390a5059cfe 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -63,8 +63,24 @@ int main ()
        if(ondev)
         {
           int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+         int use_vecsize = 64;
+#  else
+         int use_vecsize = vectorsize;
+#  endif
+         /* For Radeon, the loop is split into contiguous blocks of
+            chunk_size * vector_size, with chunk_size selected to cover the
+            whole iteration space.  Each block is then autovectorized where
+            possible.  */
+         int chunk_size = (N + workersize * use_vecsize - 1)
+                          / (workersize * use_vecsize);
+         int w = ix / (chunk_size * use_vecsize);
+         int v = 0;
+#else
           int w = (ix / vectorsize) % workersize;
           int v = ix % vectorsize;
+#endif
  
           val = (g << 16) | (w << 8) | v;
         }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c

index 77326068a666db754f38988b38fc228a350617f8..560b748f1fe654be74cccf0f095855e41c02c065 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -65,8 +65,24 @@ int main ()
        if(ondev)
         {
           int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+         int use_vecsize = 64;
+#  else
+         int use_vecsize = vectorsize;
+#  endif
+         /* For Radeon, the loop is split into contiguous blocks of
+            chunk_size * vector_size, with chunk_size selected to cover the
+            whole iteration space.  Each block is then autovectorized where
+            possible.  */
+         int chunk_size = (N + workersize * use_vecsize - 1)
+                          / (workersize * use_vecsize);
+         int w = ix / (chunk_size * use_vecsize);
+         int v = 0;
+#else
           int w = (ix / vectorsize) % workersize;
           int v = ix % vectorsize;
+#endif
  
           expected = (g << 16) | (w << 8) | v;
         }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c

index 81e08119214a1619d01ca5033f8bc0e0b2c8c8fe..59249a071e58487e0e8b8e383e650ebf8273c667 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -62,12 +62,23 @@ int main ()
        int expected = ix;
        if(ondev)
         {
-         int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-                          / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+         int use_vectorsize = 64;
+#else
+         int use_vectorsize = vectorsize;
+#endif
+         int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+                          / (gangsize * workersize * use_vectorsize);
           
-         int g = ix / (chunk_size * vectorsize * workersize);
+#ifdef ACC_DEVICE_TYPE_radeon
+         int g = ix / (chunk_size * workersize * use_vectorsize);
+         int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+         int v = 0;
+#else
+         int g = ix / (chunk_size * workersize * vectorsize);
           int w = (ix / vectorsize) % workersize;
           int v = ix % vectorsize;
+#endif
  
           expected = (g << 16) | (w << 8) | v;
         }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c

index 647d075bb0068753f31507c7a034b5627376225b..8eada237665a298a4aa85ec3c16149b5dc47940f 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -61,8 +61,24 @@ int main ()
        if(ondev)
         {
           int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+         int use_vecsize = 64;
+#  else
+         int use_vecsize = vectorsize;
+#  endif
+         /* For Radeon, the loop is split into contiguous blocks of
+            chunk_size * vector_size, with chunk_size selected to cover the
+            whole iteration space.  Each block is then autovectorized where
+            possible.  */
+         int chunk_size = (N + workersize * use_vecsize - 1)
+                          / (workersize * use_vecsize);
+         int w = ix / (chunk_size * use_vecsize);
+         int v = 0;
+#else
           int w = (ix / vectorsize) % workersize;
           int v = ix % vectorsize;
+#endif
  
           expected = (g << 16) | (w << 8) | v;
         }
author	Julian Brown <julian@codesourcery.com>
	Fri, 6 Nov 2020 23:17:29 +0000 (15:17 -0800)
committer	Kwok Cheung Yeung <kcy@codesourcery.com>
	Tue, 21 Jun 2022 13:11:29 +0000 (14:11 +0100)
gcc/ChangeLog.omp		patch \| blob \| blame \| history
gcc/omp-offload.cc		patch \| blob \| blame \| history
libgomp/ChangeLog.omp		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c		patch \| blob \| blame \| history
libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c		patch \| blob \| blame \| history