From 36dc87133fa96de4dd6b82887c79dc879e985381 Mon Sep 17 00:00:00 2001
From: vries <vries@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Mon, 30 Jul 2018 08:17:26 +0000
Subject: [PATCH] [libgomp, nvptx] Handle per-function max-threads-per-block in
 default dims

Currently parallel-loop-1.c fails at -O0 on a Quadro M1200, because one of the
kernel launch configurations exceeds the resources available in the device, due
to the default dimensions chosen by the runtime.

This patch fixes that by taking the per-function max_threads_per_block into
account when using the default dimensions.

2018-07-30  Tom de Vries  <tdevries@suse.de>

	* plugin/plugin-nvptx.c (MIN, MAX): Redefine.
	(nvptx_exec): Ensure worker and vector default dims don't exceed
	targ_fn->max_threads_per_block.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@263062 138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog             |  6 ++++++
 libgomp/plugin/plugin-nvptx.c | 29 +++++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 1d218f4be6c0..6cd30bbf49d0 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,9 @@
+2018-07-30  Tom de Vries  <tdevries@suse.de>
+
+	* plugin/plugin-nvptx.c (MIN, MAX): Redefine.
+	(nvptx_exec): Ensure worker and vector default dims don't exceed
+	targ_fn->max_threads_per_block.
+
 2018-07-30  Tom de Vries  <tdevries@suse.de>
 
 	* plugin/plugin-nvptx.c (struct ptx_device): Add default_dims field.
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5c522aaf2819..b6ec5f88d59a 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -141,6 +141,11 @@ init_cuda_lib (void)
 
 #include "secure_getenv.h"
 
+#undef MIN
+#undef MAX
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
 /* Convenience macros for the frequently used CUDA library call and
    error handling sequence as well as CUDA library calls that
    do the error checking themselves or don't do it at all.  */
@@ -1135,6 +1140,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   void *kargs[1];
   void *hp, *dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
+  int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";
 
   function = targ_fn->fn;
@@ -1175,7 +1181,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
 	  int gang, worker, vector;
 	  {
-	    int warp_size = nvthd->ptx_dev->warp_size;
 	    int block_size = nvthd->ptx_dev->max_threads_per_block;
 	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
 	    int dev_size = nvthd->ptx_dev->num_sms;
@@ -1213,9 +1218,25 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 	}
       pthread_mutex_unlock (&ptx_dev_lock);
 
-      for (i = 0; i != GOMP_DIM_MAX; i++)
-	if (!dims[i])
-	  dims[i] = nvthd->ptx_dev->default_dims[i];
+      {
+	bool default_dim_p[GOMP_DIM_MAX];
+	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  {
+	    default_dim_p[i] = !dims[i];
+	    if (default_dim_p[i])
+	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	  }
+
+	if (default_dim_p[GOMP_DIM_VECTOR])
+	  dims[GOMP_DIM_VECTOR]
+	    = MIN (dims[GOMP_DIM_VECTOR],
+		   (targ_fn->max_threads_per_block / warp_size * warp_size));
+
+	if (default_dim_p[GOMP_DIM_WORKER])
+	  dims[GOMP_DIM_WORKER]
+	    = MIN (dims[GOMP_DIM_WORKER],
+		   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+      }
     }
 
   /* Check if the accelerator has sufficient hardware resources to
-- 
2.47.2