]> git.ipfire.org Git - thirdparty/gcc.git/commitdiff
nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution
author Thomas Schwinge <thomas@codesourcery.com>
Mon, 12 Dec 2022 21:05:37 +0000 (22:05 +0100)
committer Thomas Schwinge <thomas@codesourcery.com>
Fri, 20 Jan 2023 20:18:22 +0000 (21:18 +0100)
For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'), as the 0xffffffff
mask isn't correct if not all 32 threads of a warp are active.  The same
issue/fix, I suppose but have not verified, would apply if we were to allow for
OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.

We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0xffffffff',
which evidently does the right thing.  (I've tested '-muniform-simt'
code executing single-threaded.)

gcc/
* config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
non-full-warp execution.
gcc/testsuite/
* gcc.target/nvptx/nvptx.exp
(check_effective_target_default_ptx_isa_version_at_least_6_0):
New.
* gcc.target/nvptx/uniform-simt-5.c: New.
libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Assert what we know about
'blockDimX'.

gcc/ChangeLog.omp
gcc/config/nvptx/nvptx.md
gcc/testsuite/ChangeLog.omp
gcc/testsuite/gcc.target/nvptx/nvptx.exp
gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c [new file with mode: 0644]
libgomp/ChangeLog.omp
libgomp/plugin/plugin-nvptx.c

index 2d4b75134138607e8963843250213030022b6e34..382cd5c80c2257a7a5d0dc7032d7aebaf4de3bf4 100644 (file)
@@ -1,3 +1,8 @@
+2023-01-20  Thomas Schwinge  <thomas@codesourcery.com>
+
+       * config/nvptx/nvptx.md (nvptx_uniform_warp_check): Make fit for
+       non-full-warp execution.
+
 2023-01-19  Tobias Burnus  <tobias@codesourcery.com>
 
        Backported from master:
index 04c150b89820025a655a3c346a9be3b0f869b091..d27126556ce07816f8c16895fee3cd9c3961131a 100644 (file)
       "{",
       "\\t"              ".reg.b32"        "\\t" "%%r_act;",
       "%.\\t"            "vote.ballot.b32" "\\t" "%%r_act,1;",
+      /* For '%r_exp', we essentially need 'activemask.b32', but that is "Introduced in PTX ISA version 6.2", and this code here is used only 'if (!TARGET_PTX_6_0)'.  Thus, emulate it.
+         TODO Is that actually correct?  Wouldn't 'activemask.b32' rather replace our 'vote.ballot.b32' given that it registers the *currently active threads*?  */
+      /* Compute the "membermask" of all threads of the warp that are expected to be converged here.
+        For OpenACC, '%ntid.x' is 'vector_length', which per 'nvptx_goacc_validate_dims' always is a multiple of 32.
+        For OpenMP, '%ntid.x' always is 32.
+        Thus, this is typically 0xffffffff, but the computation also yields the right mask for the case that not all 32 threads of the warp have been launched.
+        This assumes that lane IDs are assigned in ascending order.  */
+      //TODO Can we rely on '1 << 32 == 0', and '0 - 1 == 0xffffffff'?
+      //TODO https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
+      //TODO https://stackoverflow.com/questions/54055195/activemask-vs-ballot-sync
+      "\\t"              ".reg.b32"        "\\t" "%%r_exp;",
+      "%.\\t"            "mov.b32"         "\\t" "%%r_exp, %%ntid.x;",
+      "%.\\t"            "shl.b32"         "\\t" "%%r_exp, 1, %%r_exp;",
+      "%.\\t"            "sub.u32"         "\\t" "%%r_exp, %%r_exp, 1;",
       "\\t"              ".reg.pred"       "\\t" "%%r_do_abort;",
       "\\t"              "mov.pred"        "\\t" "%%r_do_abort,0;",
       "%.\\t"            "setp.ne.b32"     "\\t" "%%r_do_abort,%%r_act,"
-                                                 "0xffffffff;",
+                                                 "%%r_exp;",
       "@ %%r_do_abort\\t" "trap;",
       "@ %%r_do_abort\\t" "exit;",
       "}",
index d4b483b124b3659258986f11453cfafdd47a6bcb..7339bf41482d4fe99f8cda05152ae21b2de1c353 100644 (file)
@@ -1,3 +1,10 @@
+2023-01-20  Thomas Schwinge  <thomas@codesourcery.com>
+
+       * gcc.target/nvptx/nvptx.exp
+       (check_effective_target_default_ptx_isa_version_at_least_6_0):
+       New.
+       * gcc.target/nvptx/uniform-simt-5.c: New.
+
 2023-01-16  Tobias Burnus  <tobias@codesourcery.com>
 
        Backported from master:
index e9622ae7aaa8d5a550fb20981ef2b763a92af735..17e03daeb7e0288ddbb82271200f5d45144e73fe 100644 (file)
@@ -49,6 +49,11 @@ proc check_effective_target_default_ptx_isa_version_at_least { major minor } {
     return $res
 }
 
+# Return 1 if code by default compiles for at least PTX ISA version 6.0.
+proc check_effective_target_default_ptx_isa_version_at_least_6_0 { } {
+    return [check_effective_target_default_ptx_isa_version_at_least 6 0]
+}
+
 # Return 1 if code with PTX ISA version major.minor or higher can be run.
 proc check_effective_target_runtime_ptx_isa_version_at_least { major minor } {
     set name runtime_ptx_isa_version_${major}_${minor}
diff --git a/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c b/gcc/testsuite/gcc.target/nvptx/uniform-simt-5.c
new file mode 100644 (file)
index 0000000..b2f7819
--- /dev/null
@@ -0,0 +1,28 @@
+/* Verify that '-muniform-simt' code may be executed single-threaded.
+
+   { dg-do run }
+   { dg-options {-save-temps -O2 -muniform-simt} } */
+
+enum memmodel
+{
+  MEMMODEL_RELAXED = 0
+};
+
+unsigned long long int v64;
+unsigned long long int *p64 = &v64;
+
+int
+main()
+{
+  /* The atomic operation is what triggers uniform-SIMT processing.  */
+  __atomic_fetch_add (p64, v64, MEMMODEL_RELAXED);
+
+  return 0;
+}
+
+/* Per 'omp_simt_exit':
+     - 'nvptx_warpsync'
+       { dg-final { scan-assembler-times {bar\.warp\.sync\t0xffffffff;} 1 { target default_ptx_isa_version_at_least_6_0 } } }
+     - 'nvptx_uniform_warp_check'
+       { dg-final { scan-assembler-times {vote\.ballot\.b32\t%r_act,1;} 1 { target { ! default_ptx_isa_version_at_least_6_0 } } } }
+*/
index 33aa4b01350b677af7e69fbebc4f41c351233886..4447b74a2abd1b9e0e3ee97abea9c66718c498a4 100644 (file)
@@ -1,5 +1,8 @@
 2023-01-20  Thomas Schwinge  <thomas@codesourcery.com>
 
+       * plugin/plugin-nvptx.c (nvptx_exec): Assert what we know about
+       'blockDimX'.
+
        PR target/85463
        * config/nvptx/error.c (exit): Don't override.
        * testsuite/libgomp.oacc-fortran/error_stop-1.f: Update.
index 4a1b9f579e400195faaea083506c0c3853ec2a02..b2fabc61cc88b1e56e09dae4455eff88de1f29bc 100644 (file)
@@ -998,6 +998,9 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                                            api_info);
     }
 
+  /* Per 'nvptx_goacc_validate_dims'.  */
+  assert (dims[GOMP_DIM_VECTOR] % warp_size == 0);
+
   kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,