From: Julian Brown <julian@codesourcery.com>
Date: Tue, 26 Feb 2019 21:18:36 +0000 (-0800)
Subject: Adjustments and additions to testcases
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c40efbf10d4ce8be1f30715f586da648746fd868;p=thirdparty%2Fgcc.git

Adjustments and additions to testcases

Some additions of redundant "present" clauses dropped.

2018-10-22  Cesar Philippidis  <cesar@codesourcery.com>

	gcc/testsuite/
	* g++.dg/goacc/loop-1.c: New test.
	* g++.dg/goacc/loop-2.c: New test.
	* g++.dg/goacc/loop-3.c: New test.

2018-10-22  James Norris  <jnorris@codesourcery.com>
	    Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update parallel
	regions to denote variables copyied in via acc enter data as
	present.
	* testsuite/libgomp.oacc-fortran/data-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/data-4.f90: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/subr.h: Reimplement.
	* testsuite/libgomp.oacc-c-c++-common/subr.ptx: Regenerated PTX.
	* testsuite/libgomp.oacc-c-c++-common/timer.h: Removed.
	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: Change async checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Rework kernel i/f.
	* testsuite/libgomp.oacc-c-c++-common/lib-72.c: Rework kernel i/f and
	change async checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: Rework kernel i/f and
	timing checks.
	* testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-80.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: New test.

(cherry picked from openacc-gcc-9-branch commit
18e5e71a03b69316246a417f5d957cbaa64245a5)
---

diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp
index 9f9086f82e81..11be8f7ed887 100644
--- a/gcc/testsuite/ChangeLog.omp
+++ b/gcc/testsuite/ChangeLog.omp
@@ -1,3 +1,9 @@
+2018-10-22  Cesar Philippidis  <cesar@codesourcery.com>
+
+	* g++.dg/goacc/loop-1.c: New test.
+	* g++.dg/goacc/loop-2.c: New test.
+	* g++.dg/goacc/loop-3.c: New test.
+
 2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
             Nathan Sidwell  <nathan@acm.org>
             Julian Brown  <julian@codesourcery.com>
diff --git a/gcc/testsuite/g++.dg/goacc/loop-1.c b/gcc/testsuite/g++.dg/goacc/loop-1.c
new file mode 100644
index 000000000000..51b20b0e2da0
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-1.c
@@ -0,0 +1,23 @@
+void
+f (int i, float j, int k)
+{
+#pragma acc parallel num_gangs (i) num_workers (i) vector_length (i)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (j) /* { dg-error "'num_gangs' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (j) /* { dg-error "'num_workers' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (j) /* { dg-error "'vector_length' expression must be integral" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/loop-2.c b/gcc/testsuite/g++.dg/goacc/loop-2.c
new file mode 100644
index 000000000000..ddfb4804353a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-2.c
@@ -0,0 +1,70 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10)
+  for (i = 0; i < 20; ++i)
+    ;
+
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, *) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop gang (static: 5, num: 10, static: *) /* { dg-error "duplicate 'num' argument" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (static: 234) /* { dg-error "expected 'num' before" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234)
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop worker (num: 234, num: 12) /* { dg-error "duplicate operand to clause" } */
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels
+#pragma acc loop vector /* { dg-error "gang, worker and vector must occur in this order in a loop nest" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop worker
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker (length: 20) /* { dg-error "expected 'num' before 'length'" } */
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector (length: 10)
+    for (j = 0; j < 25; ++j)
+      ;
+
+#pragma acc kernels
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+#pragma acc loop vector
+    for (j = 0; j < 25; ++j)
+      ;
+}
diff --git a/gcc/testsuite/g++.dg/goacc/loop-3.c b/gcc/testsuite/g++.dg/goacc/loop-3.c
new file mode 100644
index 000000000000..c43b4f33e62a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/goacc/loop-3.c
@@ -0,0 +1,43 @@
+void
+f (int i, int j, int k)
+{
+#pragma acc kernels num_gangs (10) /* { dg-error "'num_gangs' is not valid" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels num_workers (10) /* { dg-error "'num_workers' is not valid" } */
+#pragma acc loop worker
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc kernels vector_length (10) /* { dg-error "'vector_length' is not valid" } */
+#pragma acc loop vector
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10) num_workers (20) vector_length (32)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (i) num_workers (j) vector_length (k)
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_gangs (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel num_workers (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+
+#pragma acc parallel vector_length (10, i) /* { dg-error "expected '\\)' before ',' token" } */
+#pragma acc loop gang
+  for (i = 0; i < 20; ++i)
+    ;
+}
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index f810768a4c1c..1ff3517e67cb 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,33 @@
+2018-10-22  James Norris  <jnorris@codesourcery.com>
+	    Cesar Philippidis  <cesar@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update parallel
+	regions to denote variables copyied in via acc enter data as
+	present.
+	* testsuite/libgomp.oacc-fortran/data-3.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/data-4.f90: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/subr.h: Reimplement.
+	* testsuite/libgomp.oacc-c-c++-common/subr.ptx: Regenerated PTX.
+	* testsuite/libgomp.oacc-c-c++-common/timer.h: Removed.
+	* testsuite/libgomp.oacc-c-c++-common/lib-69.c: Change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-70.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-71.c: Rework kernel i/f.
+	* testsuite/libgomp.oacc-c-c++-common/lib-72.c: Rework kernel i/f and
+	change async checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-73.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-74.c: Rework kernel i/f and
+	timing checks.
+	* testsuite/libgomp.oacc-c-c++-common/lib-75.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-76.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-77.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-78.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-79.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-80.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-81.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-82.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/lib-93.c: New test.
+
 2018-12-13  Cesar Philippidis  <cesar@codesourcery.com>
             Nathan Sidwell  <nathan@acm.org>
             Julian Brown  <julian@codesourcery.com>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
index c10bebaab136..455ca5b45278 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-69.c
@@ -9,46 +9,14 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuModuleLoad (&module, "./subr.ptx");
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuModuleLoad failed: %d\n", r);
@@ -62,20 +30,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
     abort ();
@@ -90,7 +44,7 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -118,11 +72,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
index 912b266ec390..ee06898af791 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-70.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,47 +11,17 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 3;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t diff;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +36,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -96,9 +53,29 @@ main (int argc, char **argv)
 	  abort ();
     }
 
+  gettimeofday (&tv1, NULL);
+
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[0], NULL, 0);
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+      abort ();
+    }
+
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxLaunch failed: %d\n", r);
+      abort ();
+    }
+
+  gettimeofday (&tv2, NULL);
+
+  diff = tv2.tv_sec - tv1.tv_sec;
+
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,7 +89,7 @@ main (int argc, char **argv)
 	}
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  sleep ((diff + 1) * N);
 
   for (i = 0; i < N; i++)
     {
@@ -123,10 +100,6 @@ main (int argc, char **argv)
 	}
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
index e383ba04d699..920ff5f27ffb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-72.c
@@ -10,45 +10,13 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -63,20 +31,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
     {
@@ -87,7 +41,7 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
     
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -100,7 +54,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize () failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -108,11 +67,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
index 43a8b7e63957..4fa9d5a6c289 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-73.c
@@ -1,6 +1,7 @@
 /* { dg-do run { target openacc_nvidia_accel_selected } } */
 /* { dg-additional-options "-lcuda" } */
 
+#include <sys/time.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -10,47 +11,15 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  const int N = 10;
+  const int N = 6;
   int i;
   CUstream streams[N];
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -65,20 +34,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   for (i = 0; i < N; i++)
     {
       streams[i] = (CUstream) acc_get_cuda_stream (i);
@@ -98,13 +53,12 @@ main (int argc, char **argv)
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
     }
 
   if (acc_async_test_all () != 0)
@@ -113,7 +67,12 @@ main (int argc, char **argv)
       abort ();
     }
 
-  sleep ((int) (dtime / 1000.0f) + 1);
+  r = cuCtxSynchronize ();
+  if (r != CUDA_SUCCESS)
+    {
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+      abort ();
+    }
 
   if (acc_async_test_all () != 1)
     {
@@ -121,11 +80,6 @@ main (int argc, char **argv)
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
index 0efcf0d52229..e25d8944622e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-74.c
@@ -5,77 +5,53 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  gettimeofday (&tv2, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -91,11 +67,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -103,38 +77,31 @@ main (int argc, char **argv)
     }
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (atime < dtime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait (0);
-  /* Test unseen async-argument.  */
-  acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.010 < atime)
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
index 194221187482..53e285f38399 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-75.c
@@ -6,78 +6,55 @@
 #include <stdlib.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -93,16 +70,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -112,27 +84,18 @@ main (int argc, char **argv)
       acc_wait (0);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
index 11d9d621f87f..787dcb886ef7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-76.c
@@ -6,78 +6,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
+  gettimeofday (&tv2, NULL);
 
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,16 +75,11 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -117,27 +89,19 @@ main (int argc, char **argv)
       acc_wait (i);
     }
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
index 4f58fb23cfe6..0bed15fa7b35 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-78.c
@@ -6,77 +6,53 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  gettimeofday (&tv2, NULL);
 
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   stream = (CUstream) acc_get_cuda_stream (0);
   if (stream != NULL)
@@ -92,11 +68,9 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
+  gettimeofday (&tv1, NULL);
 
-  start_timer (0);
-
-  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
       fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -105,33 +79,30 @@ main (int argc, char **argv)
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (atime < dtime)
+  if (t2 > (t1 + (t1 * 0.10)))
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all ();
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.010 < atime)
+  if (t2 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
index b2e2687e4e76..16eb446acd4c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-79.c
@@ -6,80 +6,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime, hitime, lotime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
-
-  devnum = 2;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
       abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
       abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 200.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
-
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
+  gettimeofday (&tv2, NULL);
 
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
   if (r != CUDA_SUCCESS)
@@ -105,16 +80,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (0, stream))
     abort ();
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -156,7 +126,7 @@ main (int argc, char **argv)
 
   acc_wait (1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
   if (acc_async_test (0) != 1)
     abort ();
@@ -164,25 +134,16 @@ main (int argc, char **argv)
   if (acc_async_test (1) != 1)
     abort ();
 
-  hitime = dtime * N;
-  hitime += hitime * 0.02;
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  lotime = dtime * N;
-  lotime -= lotime * 0.02;
+  t1 *= N;
 
-  if (atime > hitime || atime < lotime)
+  if (((abs (t2 - t1) / t1) * 100.0) > 1.0)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
-  free (a);
-  acc_free (d_a);
-
   acc_shutdown (acc_device_nvidia);
 
   exit (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
index d5f18f00319c..77de9ba29044 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c
@@ -6,78 +6,55 @@
 #include <unistd.h>
 #include <openacc.h>
 #include <cuda.h>
-#include "timer.h"
+#include <sys/time.h>
 
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 2;
   int i;
   CUstream *streams, stream;
-  unsigned long *a, *d_a, dticks;
-  int nbytes;
-  float atime, dtime;
-  void *kargs[2];
-  int clkrate;
-  int devnum, nprocs;
+  struct timeval tv1, tv2;
+  time_t t1, t2;
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
+  r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
+      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
       abort ();
     }
 
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
+  r = cuModuleGetFunction (&delay, module, "delay");
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
+      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
       abort ();
     }
 
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
+  gettimeofday (&tv1, NULL);
 
-  r = cuModuleLoad (&module, "subr.ptx");
+  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, 0);
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
+	abort ();
     }
 
-  r = cuModuleGetFunction (&delay, module, "delay");
+  r = cuCtxSynchronize ();
   if (r != CUDA_SUCCESS)
     {
-      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
-      abort ();
+      fprintf (stderr, "cuCtxSynchronize failed: %d\n", r);
+	abort ();
     }
 
-  nbytes = nprocs * sizeof (unsigned long);
-
-  dtime = 500.0;
-
-  dticks = (unsigned long) (dtime * clkrate);
+  gettimeofday (&tv2, NULL);
 
-  N = nprocs;
-
-  a = (unsigned long *) malloc (nbytes);
-  d_a = (unsigned long *) acc_malloc (nbytes);
-
-  acc_map_data (a, d_a, nbytes);
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
   streams = (CUstream *) malloc (N * sizeof (void *));
 
@@ -98,11 +75,6 @@ main (int argc, char **argv)
 	  abort ();
     }
 
-  init_timers (1);
-
-  kargs[0] = (void *) &d_a;
-  kargs[1] = (void *) &dticks;
-
   stream = (CUstream) acc_get_cuda_stream (N);
   if (stream != NULL)
     abort ();
@@ -117,11 +89,11 @@ main (int argc, char **argv)
   if (!acc_set_cuda_stream (N, stream))
     abort ();
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   for (i = 0; i < N; i++)
     {
-      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
+      r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], NULL, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
@@ -129,6 +101,10 @@ main (int argc, char **argv)
 	}
     }
 
+  gettimeofday (&tv2, NULL);
+
+  t2 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
   acc_wait_all_async (N);
 
   for (i = 0; i <= N; i++)
@@ -145,15 +121,13 @@ main (int argc, char **argv)
 	abort ();
     }
 
-  atime = stop_timer (0);
-
-  if (atime < dtime)
+  if ((t1 * N) < t2)
     {
-      fprintf (stderr, "actual time < delay time\n");
+      fprintf (stderr, "too long 1\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   stream = (CUstream) acc_get_cuda_stream (N + 1);
   if (stream != NULL)
@@ -173,35 +147,33 @@ main (int argc, char **argv)
 
   acc_wait (N + 1);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
+
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
 
-  if (0.10 < atime)
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 2\n");
       abort ();
     }
 
-  start_timer (0);
+  gettimeofday (&tv1, NULL);
 
   acc_wait_all_async (N);
 
   acc_wait (N);
 
-  atime = stop_timer (0);
+  gettimeofday (&tv2, NULL);
 
-  if (0.10 < atime)
+  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);
+
+  if (t1 > 1000)
     {
-      fprintf (stderr, "actual time too long\n");
+      fprintf (stderr, "too long 3\n");
       abort ();
     }
 
-  acc_unmap_data (a);
-
-  fini_timers ();
-
   free (streams);
-  free (a);
-  acc_free (d_a);
 
   acc_shutdown (acc_device_nvidia);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
index be30a7f28ac0..ecf7488a0b90 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-82.c
@@ -10,46 +10,18 @@
 int
 main (int argc, char **argv)
 {
-  CUdevice dev;
   CUfunction delay2;
   CUmodule module;
   CUresult r;
-  int N;
+  const int N = 32;
   int i;
   CUstream *streams;
-  unsigned long **a, **d_a, *tid, ticks;
+  unsigned long **a, **d_a, *tid;
   int nbytes;
-  void *kargs[3];
-  int clkrate;
-  int devnum, nprocs;
+  void *kargs[2];
 
   acc_init (acc_device_nvidia);
 
-  devnum = acc_get_device_num (acc_device_nvidia);
-
-  r = cuDeviceGet (&dev, devnum);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
-      abort ();
-    }
-
-  r =
-    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-			  dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
-      abort ();
-    }
-
   r = cuModuleLoad (&module, "subr.ptx");
   if (r != CUDA_SUCCESS)
     {
@@ -66,10 +38,6 @@ main (int argc, char **argv)
 
   nbytes = sizeof (int);
 
-  ticks = (unsigned long) (200.0 * clkrate);
-
-  N = nprocs;
-
   streams = (CUstream *) malloc (N * sizeof (void *));
 
   a = (unsigned long **) malloc (N * sizeof (unsigned long *));
@@ -103,8 +71,7 @@ main (int argc, char **argv)
   for (i = 0; i < N; i++)
     {
       kargs[0] = (void *) &d_a[i];
-      kargs[1] = (void *) &ticks;
-      kargs[2] = (void *) &tid[i];
+      kargs[1] = (void *) &tid[i];
 
       r = cuLaunchKernel (delay2, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
       if (r != CUDA_SUCCESS)
@@ -112,8 +79,6 @@ main (int argc, char **argv)
 	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
 	  abort ();
 	}
-
-      ticks = (unsigned long) (50.0 * clkrate);
     }
 
   acc_wait_all_async (0);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
new file mode 100644
index 000000000000..bc60a16c64f1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/lib-93.c
@@ -0,0 +1,19 @@
+/* { dg-do run { target { ! openacc_nvidia_accel_configured } } } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+int
+main (void)
+{
+  fprintf (stderr, "CheCKpOInT\n");
+  acc_init (acc_device_nvidia);
+
+  acc_shutdown (acc_device_nvidia);
+
+  return 0;
+}
+
+/* { dg-output "CheCKpOInT(\n|\r\n|\r).*" } */
+/* { dg-output "device type nvidia not supported" } */
+/* { dg-shouldfail "" } */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
index ca29f0889e4e..0c9096fd8e80 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.h
@@ -1,46 +1,24 @@
 
-#ifdef ACC_DEVICE_TYPE_nvidia
-
 #pragma acc routine nohost
-static int clock (void)
-{
-  int thetime;
-
-  asm __volatile__ ("mov.u32 %0, %%clock;" : "=r"(thetime));
-
-  return thetime;
-}
-
-#endif
-
 void
-delay (unsigned long *d_o, unsigned long delay)
+delay ()
 {
-  int start, ticks;
+  int i, sum;
+  const int N = 500000;
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
-
-  return;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 }
 
+#pragma acc routine nohost
 void
-delay2 (unsigned long *d_o, unsigned long delay, unsigned long tid)
+delay2 (unsigned long *d_o, unsigned long tid)
 {
-  int start, ticks;
+  int i, sum;
+  const int N = 500000;
 
-  start = clock ();
-
-  ticks = 0;
-
-  while (ticks < delay)
-    ticks = clock () - start;
+  for (i = 0; i < N; i++)
+    sum = sum + 1;
 
   d_o[0] = tid;
-
-  return;
 }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
index 6f748fcaf9ef..88b63bfb74b3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/subr.ptx
@@ -1,148 +1,90 @@
-// BEGIN PREAMBLE
-	.version	3.1
-	.target	sm_30
+	.version 3.1
+	.target sm_30
 	.address_size 64
-// END PREAMBLE
 
-// BEGIN FUNCTION DEF: clock
-.func (.param.u32 %out_retval)clock
-{
-.reg.u32 %retval;
-	.reg.u64 %hr10;
-	.reg.u32 %r22;
-	.reg.u32 %r23;
-	.reg.u32 %r24;
-	.local.align 8 .b8 %frame[8];
-	// #APP 
-// 7 "subr.c" 1
-	mov.u32 %r24, %clock;
-// 0 "" 2
-	// #NO_APP 
-		st.local.u32	[%frame], %r24;
-		ld.local.u32	%r22, [%frame];
-		mov.u32	%r23, %r22;
-		mov.u32	%retval, %r23;
-	st.param.u32	[%out_retval], %retval;
-	ret;
-	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay
-.visible .entry delay(.param.u64 %in_ar1, .param.u64 %in_ar2)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u32 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u64 %r31;
-	.reg.pred %r32;
-	.local.align 8 .b8 %frame[24];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-		mov.u64	%r24, %ar1;
-		st.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
+	.visible .entry delay
 	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r26, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r26;
-		mov.u32	%r27, 0;
-		st.local.u32	[%frame], %r27;
-		bra	$L4;
-$L5:
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r28, [%retval_in];
-}
-		mov.u32	%r23, %r28;
-		ld.local.u32	%r30, [%frame+4];
-		sub.u32	%r29, %r23, %r30;
-		st.local.u32	[%frame], %r29;
-$L4:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r31, [%frame+16];
-		setp.lo.u64 %r32,%r22,%r31;
-	@%r32	bra	$L5;
+	.reg .u64 %hr10;
+	.reg .u32 %r22;
+	.reg .u32 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .pred %r30;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[16];
+	cvta.local.u64 %frame,%farray;
+	mov.u32 %r22,500000;
+	st.u32 [%frame+8],%r22;
+	mov.u32 %r23,0;
+	st.u32 [%frame],%r23;
+	bra $L2;
+	$L3:
+	ld.u32 %r25,[%frame+4];
+	add.u32 %r24,%r25,1;
+	st.u32 [%frame+4],%r24;
+	ld.u32 %r27,[%frame];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame],%r26;
+	$L2:
+	ld.u32 %r28,[%frame];
+	ld.u32 %r29,[%frame+8];
+	setp.lt.s32 %r30,%r28,%r29;
+	@%r30 
+	bra $L3;
 	ret;
 	}
-// END FUNCTION DEF
-// BEGIN GLOBAL FUNCTION DEF: delay2
-.visible .entry delay2(.param.u64 %in_ar1, .param.u64 %in_ar2, .param.u64 %in_ar3)
-{
-	.reg.u64 %ar1;
-	.reg.u64 %ar2;
-	.reg.u64 %ar3;
-	.reg.u64 %hr10;
-	.reg.u64 %r22;
-	.reg.u32 %r23;
-	.reg.u64 %r24;
-	.reg.u64 %r25;
-	.reg.u64 %r26;
-	.reg.u32 %r27;
-	.reg.u32 %r28;
-	.reg.u32 %r29;
-	.reg.u32 %r30;
-	.reg.u32 %r31;
-	.reg.u64 %r32;
-	.reg.pred %r33;
-	.reg.u64 %r34;
-	.reg.u64 %r35;
-	.local.align 8 .b8 %frame[32];
-	ld.param.u64 %ar1, [%in_ar1];
-	ld.param.u64 %ar2, [%in_ar2];
-	ld.param.u64 %ar3, [%in_ar3];
-		mov.u64	%r24, %ar1;
-		st.local.u64	[%frame+8], %r24;
-		mov.u64	%r25, %ar2;
-		st.local.u64	[%frame+16], %r25;
-		mov.u64	%r26, %ar3;
-		st.local.u64	[%frame+24], %r26;
-	{
-		.param.u32 %retval_in;
-	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r27, [%retval_in];
-}
-		st.local.u32	[%frame+4], %r27;
-		mov.u32	%r28, 0;
-		st.local.u32	[%frame], %r28;
-		bra	$L8;
-$L9:
-	{
-		.param.u32 %retval_in;
+
+	.visible .entry delay2 (.param .u64 %in_ar1, .param .u64 %in_ar2)
 	{
-		call (%retval_in), clock;
-	}
-		ld.param.u32	%r29, [%retval_in];
-}
-		mov.u32	%r23, %r29;
-		ld.local.u32	%r31, [%frame+4];
-		sub.u32	%r30, %r23, %r31;
-		st.local.u32	[%frame], %r30;
-$L8:
-		ld.local.s32	%r22, [%frame];
-		ld.local.u64	%r32, [%frame+16];
-		setp.lo.u64 %r33,%r22,%r32;
-	@%r33	bra	$L9;
-		ld.local.u64	%r34, [%frame+8];
-		ld.local.u64	%r35, [%frame+24];
-		st.u64	[%r34], %r35;
+	.reg .u64 %ar1;
+	.reg .u64 %ar2;
+	.reg .u64 %hr10;
+	.reg .u64 %r22;
+	.reg .u64 %r23;
+	.reg .u32 %r24;
+	.reg .u32 %r25;
+	.reg .u32 %r26;
+	.reg .u32 %r27;
+	.reg .u32 %r28;
+	.reg .u32 %r29;
+	.reg .u32 %r30;
+	.reg .u32 %r31;
+	.reg .pred %r32;
+	.reg .u64 %r33;
+	.reg .u64 %r34;
+	.reg .u64 %frame;
+	.local .align 8 .b8 %farray[32];
+	cvta.local.u64 %frame,%farray;
+	ld.param.u64 %ar1,[%in_ar1];
+	ld.param.u64 %ar2,[%in_ar2];
+	mov.u64 %r22,%ar1;
+	st.u64 [%frame+16],%r22;
+	mov.u64 %r23,%ar2;
+	st.u64 [%frame+24],%r23;
+	mov.u32 %r24,500000;
+	st.u32 [%frame+8],%r24;
+	mov.u32 %r25,0;
+	st.u32 [%frame],%r25;
+	bra $L5;
+	$L6:
+	ld.u32 %r27,[%frame+4];
+	add.u32 %r26,%r27,1;
+	st.u32 [%frame+4],%r26;
+	ld.u32 %r29,[%frame];
+	add.u32 %r28,%r29,1;
+	st.u32 [%frame],%r28;
+	$L5:
+	ld.u32 %r30,[%frame];
+	ld.u32 %r31,[%frame+8];
+	setp.lt.s32 %r32,%r30,%r31;
+	@%r32 
+	bra $L6;
+	ld.u64 %r33,[%frame+16];
+	ld.u64 %r34,[%frame+24];
+	st.u64 [%r33],%r34;
 	ret;
 	}
-// END FUNCTION DEF
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
index 53749da5a0de..e69de29bb2d1 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/timer.h
@@ -1,103 +0,0 @@
-
-#include <stdio.h>
-#include <cuda.h>
-
-static int _Tnum_timers;
-static CUevent *_Tstart_events, *_Tstop_events;
-static CUstream _Tstream;
-
-void
-init_timers (int ntimers)
-{
-  int i;
-  CUresult r;
-
-  _Tnum_timers = ntimers;
-
-  _Tstart_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-  _Tstop_events = (CUevent *) malloc (_Tnum_timers * sizeof (CUevent));
-
-  r = cuStreamCreate (&_Tstream, CU_STREAM_DEFAULT);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
-      abort ();
-    }
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      r = cuEventCreate (&_Tstart_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-
-      r = cuEventCreate (&_Tstop_events[i], CU_EVENT_DEFAULT);
-      if (r != CUDA_SUCCESS)
-	{
-	  fprintf (stderr, "cuEventCreate failed: %d\n", r);
-	  abort ();
-	}
-    }
-}
-
-void
-fini_timers (void)
-{
-  int i;
-
-  for (i = 0; i < _Tnum_timers; i++)
-    {
-      cuEventDestroy (_Tstart_events[i]);
-      cuEventDestroy (_Tstop_events[i]);
-    }
-
-  cuStreamDestroy (_Tstream);
-
-  free (_Tstart_events);
-  free (_Tstop_events);
-}
-
-void
-start_timer (int timer)
-{
-  CUresult r;
-
-  r = cuEventRecord (_Tstart_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-}
-
-float
-stop_timer (int timer)
-{
-  CUresult r;
-  float etime;
-
-  r = cuEventRecord (_Tstop_events[timer], _Tstream);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventRecord failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventSynchronize (_Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventSynchronize failed: %d\n", r);
-      abort ();
-    }
-
-  r = cuEventElapsedTime (&etime, _Tstart_events[timer], _Tstop_events[timer]);
-  if (r != CUDA_SUCCESS)
-    {
-      fprintf (stderr, "cuEventElapsedTime failed: %d\n", r);
-      abort ();
-    }
-
-  return etime;
-}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90 b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
index 19eb4bd6a450..b5586be06090 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/data-3.f90
@@ -55,7 +55,8 @@ program asyncwait
   c(:) = 0.0
   d(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -76,7 +77,8 @@ program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N))
 
   do i = 1, N
      if (a(i) .ne. 3.0) STOP 5
@@ -91,7 +93,8 @@ program asyncwait
   d(:) = 0.0
   e(:) = 0.0
 
-  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) create (d(1:N)) copyin (e(1:N))
+  !$acc enter data copyin (a(1:N)) create (b(1:N)) create (c(1:N)) &
+  !$acc& create (d(1:N)) copyin (e(1:N))
 
   !$acc parallel async (1)
   do i = 1, N
@@ -118,7 +121,8 @@ program asyncwait
   !$acc end parallel
 
   !$acc wait (1)
-  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) copyout (d(1:N)) copyout (e(1:N))
+  !$acc exit data copyout (a(1:N)) copyout (b(1:N)) copyout (c(1:N)) &
+  !$acc& copyout (d(1:N)) copyout (e(1:N))
   !$acc exit data delete (N)
 
   do i = 1, N