From: Julian Brown <julian@codesourcery.com>
Date: Fri, 12 Jul 2019 21:40:34 +0000 (-0700)
Subject: [og9] AMD GCN offloading support
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=868d3ad10f2dfd532a494bfe1513200eb361a6de;p=thirdparty%2Fgcc.git

[og9] AMD GCN offloading support

	gcc/
	* config.gcc (amdgcn-*-*): Add default option for gfx906.
	* config/gcn/mkoffload.c: New.
	* config/gcn/offload.h: New.

	libgcc/
	* Makefile.in: Allow disabling of emutls.
	* config/gcn/gomp_print.c: New.
	* config/gcn/reduction.c: New.
	* config/gcn/t-amdgcn (LIB2ADD): Add gomp_print.c and reduction.c.
	Disable emutls.c.
	* config/gcn/t-gcn-hsa: New.

	libgomp/
	* Makefile.am (libgomp_la_SOURCES): Add gomp_print.c.
	* Makefile.in: Regenerate.
	* affinity-fmt.c: Rename calls to gomp_write_string from
	gomp_print_string.
	* config.h.in (PLUGIN_GCN): Add #undef.
	* config/nvptx/libgomp-plugin.c: Rename to...
	* config/accel/libgomp-plugin.c: ...this.
	* config/nvptx/lock.c: Rename to...
	* config/accel/lock.c: ...this.
	* config/nvptx/mutex.c: Rename to...
	* config/accel/mutex.c: ...this.
	* config/nvptx/mutex.h: Rename to...
	* config/accel/mutex.h: ...this.
	* config/nvptx/oacc-async.c: Rename to...
	* config/accel/oacc-async.c: ...this.
	* config/nvptx/oacc-cuda.c: Rename to...
	* config/accel/oacc-cuda.c: ...this.
	* config/nvptx/oacc-host.c: Rename to...
	* config/accel/oacc-host.c: ...this.
	* config/nvptx/oacc-init.c: Rename to...
	* config/accel/oacc-init.c: ...this.
	* config/nvptx/oacc-mem.c: Rename to...
	* config/accel/oacc-mem.c: ...this.
	* config/nvptx/oacc-plugin.c: Rename to...
	* config/accel/oacc-plugin.c: ...this.
	* config/nvptx/omp-lock.h: Rename to...
	* config/accel/omp-lock.h: ...this.
	* config/nvptx/openacc.f90: Rename to...
	* config/accel/openacc.f90: ...this.  Add acc_device_hsa and
	acc_device_gcn.
	* config/nvptx/pool.h: Rename to...
	* config/accel/pool.h: ...this.
	* config/nvptx/proc.c: Rename to...
	* config/accel/proc.c: ...this.  Add omp_get_num_procs alias.
	* config/nvptx/ptrlock.c: Rename to...
	* config/accel/ptrlock.c: ...this.
	* config/nvptx/ptrlock.h: Rename to...
	* config/accel/ptrlock.h: ...this.
	* config/nvptx/sem.c: Rename to...
	* config/accel/sem.c: ...this.
	* config/nvptx/sem.h: Rename to...
	* config/accel/sem.h: ...this.
	* config/nvptx/thread-stacksize.h: Rename to...
	* config/accel/thread-stacksize.h: ...this.
	* config/gcn/affinity-fmt.c: New.
	* config/gcn/bar.c: New.
	* config/gcn/bar.h: New.
	* config/gcn/doacross.h: New.
	* config/gcn/gomp_print.c: New.
	* config/gcn/icv-device.c: New.
	* config/gcn/simple-bar.h: New.
	* config/gcn/target.c: New.
	* config/gcn/task.c: New.
	* config/gcn/team.c: New.
	* config/gcn/time.c: New.
	* config/linux/gomp_print.c: New.
	* configure.ac (amdgcn*-*-*): Disable pthreads.
	* configure: Regenerated.
	* configure.tgt (nvptx*-*-*): Add 'accel' config_path.
	(amdgcn*-*-*): Set config_path.
	* fortran.c (omp_display_affinity_): Rename calls to gomp_write_string
	from gomp_print_string.
	* libgomp-plugin.h (enum offload_target_type): Add
	OFFLOAD_TARGET_TYPE_GCN.
	(GOMP_OFFLOAD_openacc_async_construct): Change parameter type to int.
	* libgomp.h (gcn_thrs, set_gcn_thrs, gomp_thread): Add for __AMDGCN__.
	(gomp_print_string): Rename to...
	(gomp_write_string): ...this.
	* libgomp.map (GOMP_4.5): Add gomp_print_string, gomp_print_integer,
	gomp_print_double.
	* oacc-async.c (lookup_goacc_asyncqueue): Pass target_id to async queue
	construct function.
	* oacc-host.c (host_openacc_async_construct): Add dummy device
	parameter.
	* oacc-init.c (name_of_acc_device_t): Add acc_device_gcn.
	* oacc-int.h (goacc_thread): Add dummy implementation for __AMDGCN__.
	* oacc-parallel.c (GOACC_enter_exit_data): Support acc_async_noval and
	zero-length array sections.
	* omp.h.in (gomp_print_string, gomp_print_integer, gomp_print_double):
	Add prototypes.
	* omp_lib.f90.in (gomp_print_string, gomp_print_integer,
	gomp_print_double): Add interfaces.
	* openacc.f90 (openacc_kinds): Add acc_device_gcn.  Bump
	acc_device_current code.
	* openacc.h (acc_device_t): Add acc_device_gcn, bump acc_device_current
	code.
	* openacc_lib.h (acc_device_hsa, acc_device_gcn): Add.
	* plugin/Makefrag.am (PLUGIN_GCN): Support building GCN plugin.
	* plugin/configfrag.am (PLUGIN_GCN, PLUGIN_GCN_CPPFLAGS,
	PLUGIN_GCN_LDFLAGS, PLUGIN_GCN_LIBS): Add.  Add support for GCN plugin.
	* plugin/plugin-gcn.c: New.
	* target.c (stdio.h): Include unconditionally.
	(gomp_copy_host2dev): Add function comment.
	(copy_host2dev_immediate): New function.
	(gomp_map_pointer, gomp_map_vars_internal): Use
	copy_host2dev_immediate where appropriate.
	(offload_target_to_plugin_name): Support gcn.
	* team.c (gomp_free_pool_helper): Support gcn.
	* testsuite/Makefile.in: Regenerated.
	* testsuite/lib/libgomp.exp
	(check_effective_target_openacc_amdgcn_accel_present): New.
	(check_effective_target_openacc_amdgcn_accel_selected): New.
	* testsuite/libgomp.c/c.exp (generate_tests, test_lists,
	generated_tests): New.
	(tests): Add generated tests.
	* testsuite/libgomp.c/for-1.h: New.
	* testsuite/libgomp.c/for-2.h: New.
	* testsuite/libgomp.c/for-3.h: New.
	* testsuite/libgomp.c/for-3.list: New.
	* testsuite/libgomp.c/for-5.c: New.
	* testsuite/libgomp.c/for-5.list: New.
	* testsuite/libgomp.c/for-6.c: New.
	* testsuite/libgomp.c/for-6.list: New.
	* testsuite/libgomp.c/target-print-1.c: New.
	* testsuite/libgomp.fortran/target-print-1.f90: New.
	* testsuite/libgomp.oacc-c++/c++.exp (amdgcn*): Add support for AMD GCN.
	* testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c: Adjust for
	portability.
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip unsuitable
	test for AMD GCN.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for
	portability.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/private-variables-2.c: New.
	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: Skip for AMD GCN.
	* testsuite/libgomp.oacc-c/c.exp (amdgcn*): Add support for AMD GCN.
	* testsuite/libgomp.oacc-c/offload-target-1.c: Add AMD GCN support.
	* testsuite/libgomp.oacc-c/print-1.c: New.
	* testsuite/libgomp.oacc-fortran/fortran.exp (amdgcn*): Add AMD GCN
	support.
	* testsuite/libgomp.oacc-fortran/atomic_capture-1.f90: Adjust for
	portability.
	* testsuite/libgomp.oacc-fortran/collapse-1.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/collapse-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/error_stop-1.f: Support AMD GCN.
	* testsuite/libgomp.oacc-fortran/error_stop-2.f: Support AMD GCN.
	* testsuite/libgomp.oacc-fortran/error_stop-3.f: Support AMD GCN.
	* testsuite/libgomp.oacc-fortran/print-1.f90: New.

(cherry picked from openacc-gcc-9-branch commit
dfe3cbfb88da3984ed8c791e941059ad514af0b2)
---

diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 9e1e9315923a..4a806549d50c 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,10 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* config.gcc (amdgcn-*-*): Add default option for gfx906.
+	* config/gcn/mkoffload.c: New.
+	* config/gcn/offload.h: New.
+
 2019-06-25  Kwok Cheung Yeung  <kcy@codesourcery.com>
             Andrew Stubbs  <ams@codesourcery.com>
 
diff --git a/gcc/config.gcc b/gcc/config.gcc
index aff3bfad3d1c..38e60c9bee8a 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -4126,7 +4126,7 @@ case "${target}" in
 		for which in arch tune; do
 			eval "val=\$with_$which"
 			case ${val} in
-			"" | carrizo | fiji | gfx900 )
+			"" | carrizo | fiji | gfx900 | gfx906 )
 				# OK
 				;;
 			*)
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
new file mode 100644
index 000000000000..f26b90239694
--- /dev/null
+++ b/gcc/config/gcn/mkoffload.c
@@ -0,0 +1,702 @@
+/* Offload image generation tool for AMD GCN.
+
+   Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Munges GCN assembly into a C source file defining the GCN code as a
+   string.
+
+   This is not a complete assembler.  We presume the source is well
+   formed from the compiler and can die horribly if it is not.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "obstack.h"
+#include "diagnostic.h"
+#include "intl.h"
+#include <libgen.h>
+#include "collect-utils.h"
+#include "gomp-constants.h"
+
+const char tool_name[] = "gcn mkoffload";
+
+#define COMMENT_PREFIX "#"
+
+struct id_map
+{
+  id_map *next;
+  char *gcn_name;
+};
+
+static id_map *func_ids, **funcs_tail = &func_ids;
+static id_map *var_ids, **vars_tail = &var_ids;
+
+/* Files to unlink.  */
+static const char *gcn_s1_name;
+static const char *gcn_s2_name;
+static const char *gcn_o_name;
+static const char *gcn_cfile_name;
+
+enum offload_abi offload_abi = OFFLOAD_ABI_UNSET;
+
+/* Delete tempfiles.  */
+
+void
+tool_cleanup (bool from_signal ATTRIBUTE_UNUSED)
+{
+  if (gcn_cfile_name)
+    maybe_unlink (gcn_cfile_name);
+  if (gcn_s1_name)
+    maybe_unlink (gcn_s1_name);
+  if (gcn_s2_name)
+    maybe_unlink (gcn_s2_name);
+  if (gcn_o_name)
+    maybe_unlink (gcn_o_name);
+}
+
+static void
+mkoffload_cleanup (void)
+{
+  tool_cleanup (false);
+}
+
+/* Unlink FILE unless requested otherwise.  */
+
+void
+maybe_unlink (const char *file)
+{
+  if (!save_temps)
+    {
+      if (unlink_if_ordinary (file) && errno != ENOENT)
+	fatal_error (input_location, "deleting file %s: %m", file);
+    }
+  else if (verbose)
+    fprintf (stderr, "[Leaving %s]\n", file);
+}
+
+/* Add or change the value of an environment variable, outputting the
+   change to standard error if in verbose mode.  */
+
+static void
+xputenv (const char *string)
+{
+  if (verbose)
+    fprintf (stderr, "%s\n", string);
+  putenv (CONST_CAST (char *, string));
+}
+
+/* Read the whole input file.  It will be NUL terminated (but
+   remember, there could be a NUL in the file itself).  */
+
+static const char *
+read_file (FILE *stream, size_t *plen)
+{
+  size_t alloc = 16384;
+  size_t base = 0;
+  char *buffer;
+
+  if (!fseek (stream, 0, SEEK_END))
+    {
+      /* Get the file size.  */
+      long s = ftell (stream);
+      if (s >= 0)
+	alloc = s + 100;
+      fseek (stream, 0, SEEK_SET);
+    }
+  buffer = XNEWVEC (char, alloc);
+
+  for (;;)
+    {
+      size_t n = fread (buffer + base, 1, alloc - base - 1, stream);
+
+      if (!n)
+	break;
+      base += n;
+      if (base + 1 == alloc)
+	{
+	  alloc *= 2;
+	  buffer = XRESIZEVEC (char, buffer, alloc);
+	}
+    }
+  buffer[base] = 0;
+  *plen = base;
+  return buffer;
+}
+
+/* Parse STR, saving found tokens into PVALUES and return their number.
+   Tokens are assumed to be delimited by ':'.  If STR is NULL, *PVALUES
+   is set to NULL and zero is returned.  */
+
+static unsigned
+parse_env_var (const char *str, char ***pvalues)
+{
+  const char *curval, *nextval;
+  char **values;
+  unsigned num = 1, i;
+
+  *pvalues = NULL;
+  if (str == NULL)
+    return 0;
+
+  for (curval = strchr (str, ':'); curval; curval = strchr (curval + 1, ':'))
+    num++;
+
+  values = (char **) xmalloc (num * sizeof (char *));
+  curval = str;
+  nextval = curval + strcspn (curval, ":");
+
+  for (i = 0; i < num; i++)
+    {
+      size_t l = nextval - curval;
+      values[i] = (char *) xmalloc (l + 1);
+      memcpy (values[i], curval, l);
+      values[i][l] = 0;
+      /* Stop at the terminator; never step past the end of STR.  */
+      if (*nextval == '\0')
+	break;
+      curval = nextval + 1;
+      nextval = curval + strcspn (curval, ":");
+    }
+  *pvalues = values;
+  return num;
+}
+
+/* Auxiliary function that frees elements of PTR and PTR itself.
+   N is number of elements to be freed.  If PTR is NULL, nothing is freed.
+   If an element is NULL, subsequent elements are not freed.  */
+
+static void
+free_array_of_ptrs (void **ptr, unsigned n)
+{
+  unsigned i;
+  if (!ptr)
+    return;
+  for (i = 0; i < n; i++)
+    {
+      if (!ptr[i])
+	break;
+      free (ptr[i]);
+    }
+  free (ptr);
+  return;
+}
+
+/* Check whether NAME can be accessed in MODE.  This is like access,
+   except that it never considers directories to be executable.  */
+
+static int
+access_check (const char *name, int mode)
+{
+  if (mode == X_OK)
+    {
+      struct stat st;
+
+      if (stat (name, &st) < 0 || S_ISDIR (st.st_mode))
+	return -1;
+    }
+
+  return access (name, mode);
+}
+
+/* Parse an input assembler file, extract the offload tables etc.,
+   and output (1) the assembler code, minus the tables (which can contain
+   problematic relocations), and (2) a C file with the offload tables
+   encoded as structured data.  */
+
+static void
+process_asm (FILE *in, FILE *out, FILE *cfile)
+{
+  int fn_count = 0, var_count = 0, dims_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  obstack_init (&fns_os);
+  obstack_init (&vars_os);
+  obstack_init (&varsizes_os);
+  obstack_init (&dims_os);
+
+  struct oaccdims
+  {
+    int d[3];
+    char *name;
+  } dim;
+
+  /* Always add _init_array and _fini_array as kernels.  */
+  obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
+  obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
+  fn_count += 2;
+
+  char buf[1000];
+  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  while (fgets (buf, sizeof (buf), in))
+    {
+      switch (state)
+	{
+	case IN_CODE:
+	  {
+	    if (sscanf (buf, " ;; OPENACC-DIMS: %d, %d, %d : %ms\n",
+			&dim.d[0], &dim.d[1], &dim.d[2], &dim.name) == 4)
+	      {
+		obstack_grow (&dims_os, &dim, sizeof (dim));
+		dims_count++;
+	      }
+	    break;
+	  }
+	case IN_VARS:
+	  {
+	    char *varname;
+	    unsigned varsize;
+	    if (sscanf (buf, " .8byte %ms\n", &varname) == 1)
+	      {
+		obstack_ptr_grow (&vars_os, varname);
+		if (!fgets (buf, sizeof (buf), in)
+		    || sscanf (buf, " .8byte %u\n", &varsize) != 1)
+		  abort ();
+		obstack_int_grow (&varsizes_os, varsize);
+		var_count++;
+
+		/* The HSA Runtime cannot locate the symbol if it is not
+		   exported from the kernel.  */
+		fprintf (out, "\t.global %s\n", varname);
+	      }
+	    break;
+	  }
+	case IN_FUNCS:
+	  {
+	    char *funcname;
+	    if (sscanf (buf, "\t.8byte\t%ms\n", &funcname) == 1)
+	      {
+		obstack_ptr_grow (&fns_os, funcname);
+		fn_count++;
+		continue;
+	      }
+	    break;
+	  }
+	}
+
+      char dummy;
+      if (sscanf (buf, " .section .gnu.offload_vars%c", &dummy) > 0)
+	state = IN_VARS;
+      else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
+	state = IN_FUNCS;
+      else if (sscanf (buf, " .section %c", &dummy) > 0
+	       || sscanf (buf, " .text%c", &dummy) > 0
+	       || sscanf (buf, " .bss%c", &dummy) > 0
+	       || sscanf (buf, " .data%c", &dummy) > 0
+	       || sscanf (buf, " .ident %c", &dummy) > 0)
+	state = IN_CODE;
+
+      if (state == IN_CODE)
+	fputs (buf, out);
+    }
+
+  char **fns = XOBFINISH (&fns_os, char **);
+  struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+
+  fprintf (cfile, "#include <stdlib.h>\n");
+  fprintf (cfile, "#include <stdbool.h>\n\n");
+
+  char **vars = XOBFINISH (&vars_os, char **);
+  unsigned *varsizes = XOBFINISH (&varsizes_os, unsigned *);
+  fprintf (cfile,
+	   "static const struct global_var_info {\n"
+	   "  const char *name;\n"
+	   "  void *address;\n"
+	   "} vars[] = {\n");
+  int i;
+  for (i = 0; i < var_count; ++i)
+    {
+      const char *sep = i < var_count - 1 ? "," : " ";
+      fprintf (cfile, "  { \"%s\", NULL }%s /* size: %u */\n", vars[i], sep,
+	       varsizes[i]);
+    }
+  fprintf (cfile, "};\n\n");
+
+  obstack_free (&vars_os, NULL);
+  obstack_free (&varsizes_os, NULL);
+
+  /* Dump out function idents.  */
+  fprintf (cfile, "static const struct hsa_kernel_description {\n"
+	   "  const char *name;\n"
+	   "  unsigned omp_data_size;\n"
+	   "  bool gridified_kernel_p;\n"
+	   "  unsigned kernel_dependencies_count;\n"
+	   "  const char **kernel_dependencies;\n"
+	   "  int oacc_dims[3];\n"
+	   "} gcn_kernels[] = {\n  ");
+  dim.d[0] = dim.d[1] = dim.d[2] = 0;
+  const char *comma;
+  for (comma = "", i = 0; i < fn_count; comma = ",\n  ", i++)
+    {
+      /* Find if we recorded dimensions for this function.  */
+      int *d = dim.d;		/* Previously zeroed.  */
+      for (int j = 0; j < dims_count; j++)
+	if (strcmp (fns[i], dims[j].name) == 0)
+	  {
+	    d = dims[j].d;
+	    break;
+	  }
+
+      fprintf (cfile, "%s{\"%s\", 0, 0, 0, NULL, {%d, %d, %d}}", comma,
+	       fns[i], d[0], d[1], d[2]);
+
+      free (fns[i]);
+    }
+  fprintf (cfile, "\n};\n\n");
+
+  obstack_free (&fns_os, NULL);
+  for (i = 0; i < dims_count; i++)
+    free (dims[i].name);
+  obstack_free (&dims_os, NULL);
+}
+
+/* Embed an object file into a C source file.  */
+
+static void
+process_obj (FILE *in, FILE *cfile)
+{
+  size_t len = 0;
+  const char *input = read_file (in, &len);
+
+  /* Dump out an array containing the binary.
+     FIXME: do this with objcopy.  */
+  fprintf (cfile, "static unsigned char gcn_code[] = {");
+  for (size_t i = 0; i < len; i += 17)
+    {
+      fprintf (cfile, "\n\t");
+      for (size_t j = i; j < i + 17 && j < len; j++)
+	fprintf (cfile, "%3u,", (unsigned char) input[j]);
+    }
+  fprintf (cfile, "\n};\n\n");
+  /* The image has been written out in full; release the input buffer.  */
+  free (CONST_CAST (char *, input));
+
+  fprintf (cfile,
+	   "static const struct gcn_image {\n"
+	   "  char magic[4];\n"
+	   "  size_t size;\n"
+	   "  void *image;\n"
+	   "} gcn_image = {\n"
+	   "  \"GCN\",\n"
+	   "  %zu,\n"
+	   "  gcn_code\n"
+	   "};\n\n",
+	   len);
+
+  fprintf (cfile,
+	   "static const struct gcn_image_desc {\n"
+	   "  const struct gcn_image *gcn_image;\n"
+	   "  unsigned kernel_count;\n"
+	   "  const struct hsa_kernel_description *kernel_infos;\n"
+	   "  unsigned global_variable_count;\n"
+	   "  const struct global_var_info *global_variables;\n"
+	   "} target_data = {\n"
+	   "  &gcn_image,\n"
+	   "  sizeof (gcn_kernels) / sizeof (gcn_kernels[0]),\n"
+	   "  gcn_kernels,\n"
+	   "  sizeof (vars) / sizeof (vars[0]),\n"
+	   "  vars\n"
+	   "};\n\n");
+
+  fprintf (cfile,
+	   "#ifdef __cplusplus\n"
+	   "extern \"C\" {\n"
+	   "#endif\n"
+	   "extern void GOMP_offload_register_ver"
+	   " (unsigned, const void *, int, const void *);\n"
+	   "extern void GOMP_offload_unregister_ver"
+	   " (unsigned, const void *, int, const void *);\n"
+	   "#ifdef __cplusplus\n"
+	   "}\n"
+	   "#endif\n\n");
+
+  fprintf (cfile, "extern const void *const __OFFLOAD_TABLE__[];\n\n");
+
+  fprintf (cfile, "static __attribute__((constructor)) void init (void)\n"
+	   "{\n"
+	   "  GOMP_offload_register_ver (%#x, __OFFLOAD_TABLE__,"
+	   " %d/*GCN*/, &target_data);\n"
+	   "};\n",
+	   GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+	   GOMP_DEVICE_GCN);
+
+  fprintf (cfile, "static __attribute__((destructor)) void fini (void)\n"
+	   "{\n"
+	   "  GOMP_offload_unregister_ver (%#x, __OFFLOAD_TABLE__,"
+	   " %d/*GCN*/, &target_data);\n"
+	   "};\n",
+	   GOMP_VERSION_PACK (GOMP_VERSION, GOMP_VERSION_GCN),
+	   GOMP_DEVICE_GCN);
+}
+
+/* Compile a C file using the host compiler.  */
+
+static void
+compile_native (const char *infile, const char *outfile, const char *compiler)
+{
+  const char *collect_gcc_options = getenv ("COLLECT_GCC_OPTIONS");
+  if (!collect_gcc_options)
+    fatal_error (input_location,
+		 "environment variable COLLECT_GCC_OPTIONS must be set");
+
+  struct obstack argv_obstack;
+  obstack_init (&argv_obstack);
+  obstack_ptr_grow (&argv_obstack, compiler);
+  if (save_temps)
+    obstack_ptr_grow (&argv_obstack, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&argv_obstack, "-v");
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      obstack_ptr_grow (&argv_obstack, "-m64");
+      break;
+    case OFFLOAD_ABI_ILP32:
+      obstack_ptr_grow (&argv_obstack, "-m32");
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  obstack_ptr_grow (&argv_obstack, infile);
+  obstack_ptr_grow (&argv_obstack, "-c");
+  obstack_ptr_grow (&argv_obstack, "-o");
+  obstack_ptr_grow (&argv_obstack, outfile);
+  obstack_ptr_grow (&argv_obstack, NULL);
+
+  const char **new_argv = XOBFINISH (&argv_obstack, const char **);
+  fork_execute (new_argv[0], CONST_CAST (char **, new_argv), true);
+  obstack_free (&argv_obstack, NULL);
+}
+
+int
+main (int argc, char **argv)
+{
+  FILE *in = stdin;
+  FILE *out = stdout;
+  FILE *cfile = stdout;
+  const char *outname = 0, *offloadsrc = 0;
+
+  progname = "mkoffload";
+  diagnostic_initialize (global_dc, 0);
+
+  if (atexit (mkoffload_cleanup) != 0)
+    fatal_error (input_location, "atexit failed");
+
+  char *collect_gcc = getenv ("COLLECT_GCC");
+  if (collect_gcc == NULL)
+    fatal_error (input_location, "COLLECT_GCC must be set.");
+  const char *gcc_path = dirname (ASTRDUP (collect_gcc));
+  const char *gcc_exec = basename (ASTRDUP (collect_gcc));
+
+  size_t len = (strlen (gcc_path) + 1 + strlen (GCC_INSTALL_NAME) + 1);
+  char *driver = XALLOCAVEC (char, len);
+
+  if (strcmp (gcc_exec, collect_gcc) == 0)
+    /* collect_gcc has no path, so it was found in PATH.  Make sure we also
+       find accel-gcc in PATH.  */
+    gcc_path = NULL;
+
+  int driver_used = 0;
+  if (gcc_path != NULL)
+    driver_used = sprintf (driver, "%s/", gcc_path);
+  sprintf (driver + driver_used, "%s", GCC_INSTALL_NAME);
+
+  bool found = false;
+  if (gcc_path == NULL)
+    found = true;
+  else if (access_check (driver, X_OK) == 0)
+    found = true;
+  else
+    {
+      /* Don't use alloca pointer with XRESIZEVEC.  */
+      driver = NULL;
+      /* Look in all COMPILER_PATHs for GCC_INSTALL_NAME.  */
+      char **paths = NULL;
+      unsigned n_paths;
+      n_paths = parse_env_var (getenv ("COMPILER_PATH"), &paths);
+      for (unsigned i = 0; i < n_paths; i++)
+	{
+	  len = strlen (paths[i]) + 1 + strlen (GCC_INSTALL_NAME) + 1;
+	  driver = XRESIZEVEC (char, driver, len);
+	  sprintf (driver, "%s/%s", paths[i], GCC_INSTALL_NAME);
+	  if (access_check (driver, X_OK) == 0)
+	    {
+	      found = true;
+	      break;
+	    }
+	}
+      free_array_of_ptrs ((void **) paths, n_paths);
+    }
+
+  if (!found)
+    fatal_error (input_location,
+		 "offload compiler %s not found", GCC_INSTALL_NAME);
+
+  /* We may be called with all the arguments stored in some file and
+     passed with @file.  Expand them into argv before processing.  */
+  expandargv (&argc, &argv);
+
+  /* Scan the argument vector.  */
+  bool fopenmp = false;
+  bool fopenacc = false;
+  for (int i = 1; i < argc; i++)
+    {
+#define STR "-foffload-abi="
+      if (strncmp (argv[i], STR, strlen (STR)) == 0)
+	{
+	  if (strcmp (argv[i] + strlen (STR), "lp64") == 0)
+	    offload_abi = OFFLOAD_ABI_LP64;
+	  else if (strcmp (argv[i] + strlen (STR), "ilp32") == 0)
+	    offload_abi = OFFLOAD_ABI_ILP32;
+	  else
+	    fatal_error (input_location,
+			 "unrecognizable argument of option " STR);
+	}
+#undef STR
+      else if (strcmp (argv[i], "-fopenmp") == 0)
+	fopenmp = true;
+      else if (strcmp (argv[i], "-fopenacc") == 0)
+	fopenacc = true;
+      else if (strcmp (argv[i], "-save-temps") == 0)
+	save_temps = true;
+      else if (strcmp (argv[i], "-v") == 0)
+	verbose = true;
+    }
+  if (!(fopenacc ^ fopenmp))
+    fatal_error (input_location, "either -fopenacc or -fopenmp must be set");
+
+  const char *abi;
+  switch (offload_abi)
+    {
+    case OFFLOAD_ABI_LP64:
+      abi = "-m64";
+      break;
+    case OFFLOAD_ABI_ILP32:
+      abi = "-m32";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  gcn_s1_name = make_temp_file (".mkoffload.1.s");
+  gcn_s2_name = make_temp_file (".mkoffload.2.s");
+  gcn_o_name = make_temp_file (".mkoffload.hsaco");
+  gcn_cfile_name = make_temp_file (".c");
+
+  /* Build arguments for compiler pass.  */
+  struct obstack cc_argv_obstack;
+  obstack_init (&cc_argv_obstack);
+  obstack_ptr_grow (&cc_argv_obstack, driver);
+  obstack_ptr_grow (&cc_argv_obstack, "-S");
+
+  if (save_temps)
+    obstack_ptr_grow (&cc_argv_obstack, "-save-temps");
+  if (verbose)
+    obstack_ptr_grow (&cc_argv_obstack, "-v");
+  obstack_ptr_grow (&cc_argv_obstack, abi);
+  obstack_ptr_grow (&cc_argv_obstack, "-xlto");
+  if (fopenmp)
+    obstack_ptr_grow (&cc_argv_obstack, "-mgomp");
+
+  for (int ix = 1; ix != argc; ix++)
+    {
+      if (!strcmp (argv[ix], "-o") && ix + 1 != argc)
+	outname = argv[++ix];
+      else
+	{
+	  obstack_ptr_grow (&cc_argv_obstack, argv[ix]);
+
+	  if (argv[ix][0] != '-')
+	    offloadsrc = argv[ix];
+	}
+    }
+
+  obstack_ptr_grow (&cc_argv_obstack, "-o");
+  obstack_ptr_grow (&cc_argv_obstack, gcn_s1_name);
+  obstack_ptr_grow (&cc_argv_obstack,
+		    concat ("-mlocal-symbol-id=", offloadsrc, NULL));
+  obstack_ptr_grow (&cc_argv_obstack, NULL);
+  const char **cc_argv = XOBFINISH (&cc_argv_obstack, const char **);
+
+  /* Build arguments for assemble/link pass.  */
+  struct obstack ld_argv_obstack;
+  obstack_init (&ld_argv_obstack);
+  obstack_ptr_grow (&ld_argv_obstack, driver);
+  obstack_ptr_grow (&ld_argv_obstack, gcn_s2_name);
+  obstack_ptr_grow (&ld_argv_obstack, "-lgomp");
+
+  for (int i = 1; i < argc; i++)
+    if (strncmp (argv[i], "-l", 2) == 0
+	|| strncmp (argv[i], "-Wl", 3) == 0
+	|| strncmp (argv[i], "-march", 6) == 0)
+      obstack_ptr_grow (&ld_argv_obstack, argv[i]);
+
+  obstack_ptr_grow (&ld_argv_obstack, "-o");
+  obstack_ptr_grow (&ld_argv_obstack, gcn_o_name);
+  obstack_ptr_grow (&ld_argv_obstack, NULL);
+  const char **ld_argv = XOBFINISH (&ld_argv_obstack, const char **);
+
+  /* Clean up unhelpful environment variables.  */
+  char *execpath = getenv ("GCC_EXEC_PREFIX");
+  char *cpath = getenv ("COMPILER_PATH");
+  char *lpath = getenv ("LIBRARY_PATH");
+  unsetenv ("GCC_EXEC_PREFIX");
+  unsetenv ("COMPILER_PATH");
+  unsetenv ("LIBRARY_PATH");
+
+  /* Run the compiler pass.  */
+  fork_execute (cc_argv[0], CONST_CAST (char **, cc_argv), true);
+  obstack_free (&cc_argv_obstack, NULL);
+
+  in = fopen (gcn_s1_name, "r");
+  if (!in)
+    fatal_error (input_location, "cannot open intermediate gcn asm file");
+
+  out = fopen (gcn_s2_name, "w");
+  if (!out)
+    fatal_error (input_location, "cannot open '%s'", gcn_s2_name);
+
+  cfile = fopen (gcn_cfile_name, "w");
+  if (!cfile)
+    fatal_error (input_location, "cannot open '%s'", gcn_cfile_name);
+
+  process_asm (in, out, cfile);
+
+  fclose (in);
+  fclose (out);
+
+  /* Run the assemble/link pass.  */
+  fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true);
+  obstack_free (&ld_argv_obstack, NULL);
+
+  in = fopen (gcn_o_name, "r");
+  if (!in)
+    fatal_error (input_location, "cannot open intermediate gcn obj file");
+
+  process_obj (in, cfile);
+
+  fclose (in);
+  fclose (cfile);
+
+  xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL));
+  xputenv (concat ("COMPILER_PATH=", cpath, NULL));
+  xputenv (concat ("LIBRARY_PATH=", lpath, NULL));
+
+  compile_native (gcn_cfile_name, outname, collect_gcc);
+
+  return 0;
+}
diff --git a/gcc/config/gcn/offload.h b/gcc/config/gcn/offload.h
new file mode 100644
index 000000000000..795ee3f0e0e4
--- /dev/null
+++ b/gcc/config/gcn/offload.h
@@ -0,0 +1,35 @@
+/* Support for AMD GCN offloading.
+
+   Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_GCN_OFFLOAD_H
+#define GCC_GCN_OFFLOAD_H
+
+/* Support for OpenACC acc_on_device.  */
+
+#include "gomp-constants.h"
+
+#define ACCEL_COMPILER_acc_device GOMP_DEVICE_GCN
+
+#endif
diff --git a/include/gomp-constants.h b/include/gomp-constants.h
index 22f9520524de..0e15cfb303ec 100644
--- a/include/gomp-constants.h
+++ b/include/gomp-constants.h
@@ -219,7 +219,8 @@ enum gomp_map_kind
 #define GOMP_DEVICE_NVIDIA_PTX		5
 #define GOMP_DEVICE_INTEL_MIC		6
 #define GOMP_DEVICE_HSA			7
-#define GOMP_DEVICE_CURRENT		8
+#define GOMP_DEVICE_GCN			8
+#define GOMP_DEVICE_CURRENT		9
 
 #define GOMP_DEVICE_ICV			-1
 #define GOMP_DEVICE_HOST_FALLBACK	-2
@@ -276,6 +277,7 @@ enum gomp_map_kind
 #define GOMP_VERSION_NVIDIA_PTX 1
 #define GOMP_VERSION_INTEL_MIC 0
 #define GOMP_VERSION_HSA 0
+#define GOMP_VERSION_GCN 0
 
 #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
 #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
diff --git a/libgcc/ChangeLog.omp b/libgcc/ChangeLog.omp
index 45fc6aaf5301..a237a7008379 100644
--- a/libgcc/ChangeLog.omp
+++ b/libgcc/ChangeLog.omp
@@ -1,3 +1,13 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* Makefile.in: Allow disabling of emutls.
+	* config/gcn/gomp_print.c: New.
+	* config/gcn/reduction.c: New.
+	* config/gcn/t-amdgcn (LIB2ADD): Add gomp_print.c and reduction.c.
+	Disable emutls.c.
+	* config/gcn/t-gcn-hsa: New.
+
 2019-06-25  Andrew Stubbs  <ams@codesourcery.com>
 
 	Backport from mainline:
diff --git a/libgcc/Makefile.in b/libgcc/Makefile.in
index ea390a5bbeae..3261ab05cd94 100644
--- a/libgcc/Makefile.in
+++ b/libgcc/Makefile.in
@@ -430,9 +430,11 @@ LIB2ADD += enable-execute-stack.c
 # While emutls.c has nothing to do with EH, it is in LIB2ADDEH*
 # instead of LIB2ADD because that's the way to be sure on some targets
 # (e.g. *-*-darwin*) only one copy of it is linked.
+ifneq ($(enable_emutls),no)
 LIB2ADDEH += $(srcdir)/emutls.c
 LIB2ADDEHSTATIC += $(srcdir)/emutls.c
 LIB2ADDEHSHARED += $(srcdir)/emutls.c
+endif
 
 # Library members defined in libgcc2.c.
 lib2funcs = _muldi3 _negdi2 _lshrdi3 _ashldi3 _ashrdi3 _cmpdi2 _ucmpdi2	   \
diff --git a/libgcc/config/gcn/gomp_print.c b/libgcc/config/gcn/gomp_print.c
new file mode 100644
index 000000000000..5d1a3fc811d4
--- /dev/null
+++ b/libgcc/config/gcn/gomp_print.c
@@ -0,0 +1,101 @@
+/* Newlib may not have been built yet, so declare what we need by hand.  */
+typedef __INT64_TYPE__ int64_t;
+typedef __SIZE_TYPE__ size_t;  /* Unsigned, to match the strncpy builtin.  */
+extern char *strncpy (char *dst, const char *src, size_t length);
+extern void exit (int);
+
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
+/* This struct must match the one used by gcn-run and libgomp.
+   It holds all the data output from a kernel (besides mapping data).
+
+   The base address pointer can be found at kernargs+16.
+
+   The next_output counter must be atomically incremented for each
+   print output.  Only when the print data is fully written can the
+   "written" flag be set.  */
+struct output {
+  int return_value;  /* Not touched by the print routines in this file.  */
+  unsigned int next_output;  /* Number of queue slots handed out so far.  */
+  struct printf_data {
+    int written;  /* Set to 1 once the rest of the entry is complete.  */
+    char msg[128];  /* Message text, always NUL-terminated.  */
+    int type;  /* 0: integer, 1: double, 2: string.  */
+    union {  /* Payload; "type" says which member is live.  */
+      int64_t ivalue;
+      double dvalue;
+      char text[128];
+    };
+  } queue[1024];  /* Slot N is stored at queue[N % 1024].  */
+  unsigned int consumed;  /* Polled by producers; presumably advanced by host.  */
+};
+
+static struct printf_data *
+reserve_print_slot (void) {
+  /* Claim the next print-queue entry.  The kernargs pointer is in s[8:9];
+     this will break if the enable_sgpr_* flags are ever changed.  */
+  char *kernargs;
+  asm ("s_mov_b64 %0, s[8:9]" : "=Sg"(kernargs));
+
+  /* The pointer to the output data is stored 16 bytes into kernargs.  */
+  struct output *data = *(struct output **)(kernargs + 16);
+
+  /* Reserve the slot: every caller receives a distinct, increasing index.  */
+  unsigned int index = __atomic_fetch_add (&data->next_output, 1,
+					   __ATOMIC_ACQUIRE);
+
+  /* Once the queue has wrapped, sleep-spin while the host catches up.  */
+  if (index >= 1024)
+    while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+	   <= (index - 1024))
+      asm ("s_sleep 64");
+
+  if ((unsigned int)(index + 1) < data->consumed)
+    {
+      /* Overflow: "consumed" is already beyond this index; give up.  */
+      exit (1);
+    }
+  return &(data->queue[index%1024]);
+}
+
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+  strncpy (slot->text, value, sizeof (slot->text) - 1);
+  slot->text[sizeof (slot->text) - 1] = '\0';
+  slot->type = 2; /* Payload is the string in "text".  */
+
+  /* Publish the entry only after every field above has been filled in.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
+
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+  slot->ivalue = value;
+  slot->type = 0; /* Payload is the integer in "ivalue".  */
+
+  /* Release-store the flag last so the filled-in entry is visible first.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
+
+void
+gomp_print_double (const char *msg, double value)
+{
+  struct printf_data *slot = reserve_print_slot ();
+  strncpy (slot->msg, msg, sizeof (slot->msg) - 1);
+  slot->msg[sizeof (slot->msg) - 1] = '\0';
+  slot->dvalue = value;
+  slot->type = 1; /* Payload is the double in "dvalue".  */
+
+  /* Release-store the flag last so the filled-in entry is visible first.  */
+  __atomic_store_n (&slot->written, 1, __ATOMIC_RELEASE);
+}
diff --git a/libgcc/config/gcn/reduction.c b/libgcc/config/gcn/reduction.c
new file mode 100644
index 000000000000..eafcee8ca8a9
--- /dev/null
+++ b/libgcc/config/gcn/reduction.c
@@ -0,0 +1,30 @@
+/* Oversized reductions lock variable
+   Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* We use a global lock variable for reductions on objects larger than
+   64 bits.  Until and unless proven that lock contention for
+   different reductions is a problem, a single lock will suffice.  */
+
+unsigned volatile __reduction_lock = 0;
diff --git a/libgcc/config/gcn/t-amdgcn b/libgcc/config/gcn/t-amdgcn
index adbd866a1d94..c04de3ce4bc1 100644
--- a/libgcc/config/gcn/t-amdgcn
+++ b/libgcc/config/gcn/t-amdgcn
@@ -1,7 +1,11 @@
+LIB2ADD += $(srcdir)/config/gcn/gomp_print.c
+
 LIB2ADD += $(srcdir)/config/gcn/lib2-divmod.c \
 	   $(srcdir)/config/gcn/lib2-divmod-hi.c \
 	   $(srcdir)/config/gcn/unwind-gcn.c
 
+LIB2ADD += $(srcdir)/config/gcn/reduction.c
+
 LIB2ADDEH=
 LIB2FUNCS_EXCLUDE=__main
 
@@ -13,5 +17,10 @@ LIBGCC2_DEBUG_CFLAGS = -g0
 crt0.o: $(srcdir)/config/gcn/crt0.c
 	$(crt_compile) -c $<
 
-# Prevent building "advanced" stuff (for example, gcov support).
+# Prevent building "advanced" stuff (for example, gcov support).  We don't
+# support it, and it may cause the build to fail, because of alloca usage, for
+# example.
 INHIBIT_LIBC_CFLAGS = -Dinhibit_libc
+
+# Disable emutls.c (temporarily?)
+enable_emutls = no
diff --git a/libgcc/config/gcn/t-gcn-hsa b/libgcc/config/gcn/t-gcn-hsa
new file mode 100644
index 000000000000..1600a586ac4d
--- /dev/null
+++ b/libgcc/config/gcn/t-gcn-hsa
@@ -0,0 +1,52 @@
+#  Copyright (C) 2016-2019 Free Software Foundation, Inc.
+#
+#  This file is free software; you can redistribute it and/or modify it under
+#  the terms of the GNU General Public License as published by the Free
+#  Software Foundation; either version 3 of the License, or (at your option)
+#  any later version.
+#
+#  This file is distributed in the hope that it will be useful, but WITHOUT
+#  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+#  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with GCC; see the file COPYING3.  If not see
+#  <http://www.gnu.org/licenses/>.
+
+GTM_H += $(HASH_TABLE_H)
+
+driver-gcn.o: $(srcdir)/config/gcn/driver-gcn.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+CFLAGS-mkoffload.o += $(DRIVER_DEFINES) \
+	-DGCC_INSTALL_NAME=\"$(GCC_INSTALL_NAME)\"
+mkoffload.o: $(srcdir)/config/gcn/mkoffload.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += mkoffload.o
+
+mkoffload$(exeext): mkoffload.o collect-utils.o libcommon-target.a \
+		      $(LIBIBERTY) $(LIBDEPS)
+	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ \
+	  mkoffload.o collect-utils.o libcommon-target.a $(LIBIBERTY) $(LIBS)
+
+CFLAGS-gcn-run.o += -DVERSION_STRING=$(PKGVERSION_s)
+COMPILE-gcn-run.o = $(filter-out -fno-rtti,$(COMPILE))
+gcn-run.o: $(srcdir)/config/gcn/gcn-run.c
+	$(COMPILE-gcn-run.o) -x c -std=gnu11 -Wno-error=pedantic $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-run.o
+
+gcn-run$(exeext): gcn-run.o
+	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
+
+MULTILIB_OPTIONS = march=gfx900 march=gfx906
+MULTILIB_DIRNAMES = gfx900 gfx906
+
+PASSES_EXTRA += $(srcdir)/config/gcn/gcn-passes.def
+gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+ALL_HOST_OBJS += gcn-tree.o
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp
index caf5f6e2fb1b..e6c81d4f43f4 100644
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,160 @@
+2019-07-31  Julian Brown  <julian@codesourcery.com>
+	    Andrew Stubbs  <ams@codesourcery.com>
+
+	* Makefile.am (libgomp_la_SOURCES): Add gomp_print.c.
+	* Makefile.in: Regenerate.
+	* affinity-fmt.c: Rename calls to gomp_write_string from
+	gomp_print_string.
+	* config.h.in (PLUGIN_GCN): Add #undef.
+	* config/nvptx/libgomp-plugin.c: Rename to...
+	* config/accel/libgomp-plugin.c: ...this.
+	* config/nvptx/lock.c: Rename to...
+	* config/accel/lock.c: ...this.
+	* config/nvptx/mutex.c: Rename to...
+	* config/accel/mutex.c: ...this.
+	* config/nvptx/mutex.h: Rename to...
+	* config/accel/mutex.h: ...this.
+	* config/nvptx/oacc-async.c: Rename to...
+	* config/accel/oacc-async.c: ...this.
+	* config/nvptx/oacc-cuda.c: Rename to...
+	* config/accel/oacc-cuda.c: ...this.
+	* config/nvptx/oacc-host.c: Rename to...
+	* config/accel/oacc-host.c: ...this.
+	* config/nvptx/oacc-init.c: Rename to...
+	* config/accel/oacc-init.c: ...this.
+	* config/nvptx/oacc-mem.c: Rename to...
+	* config/accel/oacc-mem.c: ...this.
+	* config/nvptx/oacc-plugin.c: Rename to...
+	* config/accel/oacc-plugin.c: ...this.
+	* config/nvptx/omp-lock.h: Rename to...
+	* config/accel/omp-lock.h: ...this.
+	* config/nvptx/openacc.f90: Rename to...
+	* config/accel/openacc.f90: ...this.  Add acc_device_hsa and
+	acc_device_gcn.
+	* config/nvptx/pool.h: Rename to...
+	* config/accel/pool.h: ...this.
+	* config/nvptx/proc.c: Rename to...
+	* config/accel/proc.c: ...this.  Add omp_get_num_procs alias.
+	* config/nvptx/ptrlock.c: Rename to...
+	* config/accel/ptrlock.c: ...this.
+	* config/nvptx/ptrlock.h: Rename to...
+	* config/accel/ptrlock.h: ...this.
+	* config/nvptx/sem.c: Rename to...
+	* config/accel/sem.c: ...this.
+	* config/nvptx/sem.h: Rename to...
+	* config/accel/sem.h: ...this.
+	* config/nvptx/thread-stacksize.h: Rename to...
+	* config/accel/thread-stacksize.h: ...this.
+	* config/gcn/affinity-fmt.c: New.
+	* config/gcn/bar.c: New.
+	* config/gcn/bar.h: New.
+	* config/gcn/doacross.h: New.
+	* config/gcn/gomp_print.c: New.
+	* config/gcn/icv-device.c: New.
+	* config/gcn/simple-bar.h: New.
+	* config/gcn/target.c: New.
+	* config/gcn/task.c: New.
+	* config/gcn/team.c: New.
+	* config/gcn/time.c: New.
+	* config/linux/gomp_print.c: New.
+	* configure.ac (amdgcn*-*-*): Disable pthreads.
+	* configure: Regenerated.
+	* configure.tgt (nvptx*-*-*): Add 'accel' config_path.
+	(amdgcn*-*-*): Set config_path.
+	* fortran.c (omp_display_affinity_): Rename calls to gomp_write_string
+	from gomp_print_string.
+	* libgomp-plugin.h (enum offload_target_type): Add
+	OFFLOAD_TARGET_TYPE_GCN.
+	(GOMP_OFFLOAD_openacc_async_construct): Change parameter type to int.
+	* libgomp.h (gcn_thrs, set_gcn_thrs, gomp_thread): Add for __AMDGCN__.
+	(gomp_print_string): Rename to...
+	(gomp_write_string): ...this.
+	* libgomp.map (GOMP_4.5): Add gomp_print_string, gomp_print_integer,
+	gomp_print_double.
+	* oacc-async.c (lookup_goacc_asyncqueue): Pass target_id to async queue
+	construct function.
+	* oacc-host.c (host_openacc_async_construct): Add dummy device
+	parameter.
+	* oacc-init.c (name_of_acc_device_t): Add acc_device_gcn.
+	* oacc-int.h (goacc_thread): Add dummy implementation for __AMDGCN__.
+	* oacc-parallel.c (GOACC_enter_exit_data): Support acc_async_noval and
+	zero-length array sections.
+	* omp.h.in (gomp_print_string, gomp_print_integer, gomp_print_double):
+	Add prototypes.
+	* omp_lib.f90.in (gomp_print_string, gomp_print_integer,
+	gomp_print_double): Add interfaces.
+	* openacc.f90 (openacc_kinds): Add acc_device_gcn.  Bump
+	acc_device_current code.
+	* openacc.h (acc_device_t): Add acc_device_gcn, bump acc_device_current
+	code.
+	* openacc_lib.h (acc_device_hsa, acc_device_gcn): Add.
+	* plugin/Makefrag.am (PLUGIN_GCN): Support building GCN plugin.
+	* plugin/configfrag.am (PLUGIN_GCN, PLUGIN_GCN_CPPFLAGS,
+	PLUGIN_GCN_LDFLAGS, PLUGIN_GCN_LIBS): Add.  Add support for GCN plugin.
+	* plugin/plugin-gcn.c: New.
+	* target.c (stdio.h): Include unconditionally.
+	(gomp_copy_host2dev): Add function comment.
+	(copy_host2dev_immediate): New function.
+	(gomp_map_pointer, gomp_map_vars_internal): Use
+	copy_host2dev_immediate where appropriate.
+	(offload_target_to_plugin_name): Support gcn.
+	* team.c (gomp_free_pool_helper): Support gcn.
+	* testsuite/Makefile.in: Regenerated.
+	* testsuite/lib/libgomp.exp
+	(check_effective_target_openacc_amdgcn_accel_present): New.
+	(check_effective_target_openacc_amdgcn_accel_selected): New.
+	* testsuite/libgomp.c/c.exp (generate_tests, test_lists,
+	generated_tests): New.
+	(tests): Add generated tests.
+	* testsuite/libgomp.c/for-1.h: New.
+	* testsuite/libgomp.c/for-2.h: New.
+	* testsuite/libgomp.c/for-3.h: New.
+	* testsuite/libgomp.c/for-3.list: New.
+	* testsuite/libgomp.c/for-5.c: New.
+	* testsuite/libgomp.c/for-5.list: New.
+	* testsuite/libgomp.c/for-6.c: New.
+	* testsuite/libgomp.c/for-6.list: New.
+	* testsuite/libgomp.c/target-print-1.c: New.
+	* testsuite/libgomp.fortran/target-print-1.f90: New.
+	* testsuite/libgomp.oacc-c++/c++.exp (amdgcn*): Add support for AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip unsuitable
+	test for AMD GCN.
+	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/serial-dims.c: Likewise.
+	* testsuite/libgomp.oacc-c-c++-common/private-variables-2.c: New.
+	* testsuite/libgomp.oacc-c-c++-common/tile-1.c: Skip for AMD GCN.
+	* testsuite/libgomp.oacc-c/c.exp (amdgcn*): Add support for AMD GCN.
+	* testsuite/libgomp.oacc-c/offload-target-1.c: Add AMD GCN support.
+	* testsuite/libgomp.oacc-c/print-1.c: New.
+	* testsuite/libgomp.oacc-fortran/fortran.exp (amdgcn*): Add AMD GCN
+	support.
+	* testsuite/libgomp.oacc-fortran/atomic_capture-1.f90: Adjust for
+	portability.
+	* testsuite/libgomp.oacc-fortran/collapse-1.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/collapse-2.f90: Likewise.
+	* testsuite/libgomp.oacc-fortran/error_stop-1.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/error_stop-2.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/error_stop-3.f: Support AMD GCN.
+	* testsuite/libgomp.oacc-fortran/print-1.f90: New.
+
 2019-01-23  Thomas Schwinge <thomas@codesourcery.com>
 
 	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: Update.
diff --git a/libgomp/Makefile.am b/libgomp/Makefile.am
index 00848cdc792e..ba9b56c4cfae 100644
--- a/libgomp/Makefile.am
+++ b/libgomp/Makefile.am
@@ -72,7 +72,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
 	proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
 	splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
 	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c oacc-profiling.c \
+	affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
 	oacc-profiling-acc_register_library.c
 
 include $(top_srcdir)/plugin/Makefrag.am
diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index 63f919cb7ca3..650437f25573 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -120,7 +120,8 @@ host_triplet = @host@
 target_triplet = @target@
 @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
 @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la
-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90
+@PLUGIN_GCN_TRUE@am__append_3 = libgomp-plugin-gcn.la
+@USE_FORTRAN_TRUE@am__append_4 = openacc.f90
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -180,15 +181,26 @@ am__installdirs = "$(DESTDIR)$(toolexeclibdir)" "$(DESTDIR)$(infodir)" \
 	"$(DESTDIR)$(toolexeclibdir)"
 LTLIBRARIES = $(toolexeclib_LTLIBRARIES)
 am__DEPENDENCIES_1 =
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \
+@PLUGIN_GCN_TRUE@	$(am__DEPENDENCIES_1)
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS =  \
+@PLUGIN_GCN_TRUE@	libgomp_plugin_gcn_la-plugin-gcn.lo
+libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libgomp_plugin_gcn_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(libgomp_plugin_gcn_la_LDFLAGS) $(LDFLAGS) -o $@
+@PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_rpath = -rpath \
+@PLUGIN_GCN_TRUE@	$(toolexeclibdir)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_DEPENDENCIES = libgomp.la \
 @PLUGIN_HSA_TRUE@	$(am__DEPENDENCIES_1)
 @PLUGIN_HSA_TRUE@am_libgomp_plugin_hsa_la_OBJECTS =  \
 @PLUGIN_HSA_TRUE@	libgomp_plugin_hsa_la-plugin-hsa.lo
 libgomp_plugin_hsa_la_OBJECTS = $(am_libgomp_plugin_hsa_la_OBJECTS)
-AM_V_lt = $(am__v_lt_@AM_V@)
-am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
-am__v_lt_0 = --silent
-am__v_lt_1 = 
 libgomp_plugin_hsa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	$(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
 	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
@@ -216,7 +228,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
 	target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
 	oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
 	oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
-	teams.lo oacc-profiling.lo \
+	teams.lo gomp_print.lo oacc-profiling.lo \
 	oacc-profiling-acc_register_library.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
@@ -265,7 +277,8 @@ AM_V_FCLD = $(am__v_FCLD_@AM_V@)
 am__v_FCLD_ = $(am__v_FCLD_@AM_DEFAULT_V@)
 am__v_FCLD_0 = @echo "  FCLD    " $@;
 am__v_FCLD_1 = 
-SOURCES = $(libgomp_plugin_hsa_la_SOURCES) \
+SOURCES = $(libgomp_plugin_gcn_la_SOURCES) \
+	$(libgomp_plugin_hsa_la_SOURCES) \
 	$(libgomp_plugin_nvptx_la_SOURCES) $(libgomp_la_SOURCES)
 AM_V_DVIPS = $(am__v_DVIPS_@AM_V@)
 am__v_DVIPS_ = $(am__v_DVIPS_@AM_DEFAULT_V@)
@@ -429,6 +442,10 @@ PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
 PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
 PLUGIN_HSA = @PLUGIN_HSA@
 PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
 PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -502,6 +519,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -529,7 +547,8 @@ libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
 AM_CPPFLAGS = $(addprefix -I, $(search_path)) $(LIBFFIINCS)
 AM_CFLAGS = $(XCFLAGS)
 AM_LDFLAGS = $(XLDFLAGS) $(SECTION_LDFLAGS) $(OPT_LDFLAGS)
-toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2)
+toolexeclib_LTLIBRARIES = libgomp.la $(am__append_1) $(am__append_2) \
+	$(am__append_3)
 nodist_toolexeclib_HEADERS = libgomp.spec
 
 # -Wc is only a libtool option.
@@ -554,8 +573,8 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 	affinity.c target.c splay-tree.c libgomp-plugin.c \
 	oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
 	oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	affinity-fmt.c teams.c oacc-profiling.c \
-	oacc-profiling-acc_register_library.c $(am__append_3)
+	affinity-fmt.c teams.c gomp_print.c oacc-profiling.c \
+	oacc-profiling-acc_register_library.c $(am__append_4)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -578,6 +597,18 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
 @PLUGIN_HSA_TRUE@	$(lt_host_flags) $(PLUGIN_HSA_LDFLAGS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
+
+# AMD GCN plugin
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+@PLUGIN_GCN_TRUE@	-D_GNU_SOURCE
+
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LDFLAGS =  \
+@PLUGIN_GCN_TRUE@	$(libgomp_plugin_gcn_version_info) \
+@PLUGIN_GCN_TRUE@	$(lt_host_flags) $(PLUGIN_GCN_LDFLAGS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
 nodist_noinst_HEADERS = libgomp_f.h
 nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
 @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
@@ -714,6 +745,9 @@ clean-toolexeclibLTLIBRARIES:
 	  rm -f $${locs}; \
 	}
 
+libgomp-plugin-gcn.la: $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_gcn_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libgomp_plugin_gcn_la_LINK) $(am_libgomp_plugin_gcn_la_rpath) $(libgomp_plugin_gcn_la_OBJECTS) $(libgomp_plugin_gcn_la_LIBADD) $(LIBS)
+
 libgomp-plugin-hsa.la: $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_DEPENDENCIES) $(EXTRA_libgomp_plugin_hsa_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libgomp_plugin_hsa_la_LINK) $(am_libgomp_plugin_hsa_la_rpath) $(libgomp_plugin_hsa_la_OBJECTS) $(libgomp_plugin_hsa_la_LIBADD) $(LIBS)
 
@@ -739,11 +773,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gomp_print.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
@@ -796,6 +832,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
 
+libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-plugin-gcn.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='plugin/plugin-gcn.c' object='libgomp_plugin_gcn_la-plugin-gcn.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c
+
 libgomp_plugin_hsa_la-plugin-hsa.lo: plugin/plugin-hsa.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_hsa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_hsa_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_hsa_la-plugin-hsa.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo -c -o libgomp_plugin_hsa_la-plugin-hsa.lo `test -f 'plugin/plugin-hsa.c' || echo '$(srcdir)/'`plugin/plugin-hsa.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Tpo $(DEPDIR)/libgomp_plugin_hsa_la-plugin-hsa.Plo
diff --git a/libgomp/affinity-fmt.c b/libgomp/affinity-fmt.c
index d9c6e181233a..cd5783f4658e 100644
--- a/libgomp/affinity-fmt.c
+++ b/libgomp/affinity-fmt.c
@@ -38,7 +38,7 @@
 #endif
 
 void
-gomp_print_string (const char *str, size_t len)
+gomp_write_string (const char *str, size_t len)
 {
   fwrite (str, 1, len, stderr);
 }
@@ -462,13 +462,13 @@ omp_display_affinity (const char *format)
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
       return;
     }
   b = gomp_malloc (ret + 1);
   ialias_call (omp_capture_affinity) (b, ret + 1, format);
   b[ret] = '\n';
-  gomp_print_string (b, ret + 1);
+  gomp_write_string (b, ret + 1);
   free (b);
 }
 
@@ -483,13 +483,13 @@ gomp_display_affinity_thread (gomp_thread_handle handle,
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
       return;
     }
   b = gomp_malloc (ret + 1);
   gomp_display_affinity (b, ret + 1, gomp_affinity_format_var,
   			 handle, ts, place);
   b[ret] = '\n';
-  gomp_print_string (b, ret + 1);
+  gomp_write_string (b, ret + 1);
   free (b);
 }
diff --git a/libgomp/config.h.in b/libgomp/config.h.in
index 5ad5f532af19..778a59eb1b7d 100644
--- a/libgomp/config.h.in
+++ b/libgomp/config.h.in
@@ -170,6 +170,9 @@
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
+/* Define to 1 if the GCN plugin is built, 0 if not. */
+#undef PLUGIN_GCN
+
 /* Define to 1 if the HSA plugin is built, 0 if not. */
 #undef PLUGIN_HSA
 
diff --git a/libgomp/config/nvptx/libgomp-plugin.c b/libgomp/config/accel/libgomp-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/libgomp-plugin.c
rename to libgomp/config/accel/libgomp-plugin.c
diff --git a/libgomp/config/nvptx/lock.c b/libgomp/config/accel/lock.c
similarity index 100%
rename from libgomp/config/nvptx/lock.c
rename to libgomp/config/accel/lock.c
diff --git a/libgomp/config/nvptx/mutex.c b/libgomp/config/accel/mutex.c
similarity index 100%
rename from libgomp/config/nvptx/mutex.c
rename to libgomp/config/accel/mutex.c
diff --git a/libgomp/config/nvptx/mutex.h b/libgomp/config/accel/mutex.h
similarity index 100%
rename from libgomp/config/nvptx/mutex.h
rename to libgomp/config/accel/mutex.h
diff --git a/libgomp/config/nvptx/oacc-async.c b/libgomp/config/accel/oacc-async.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-async.c
rename to libgomp/config/accel/oacc-async.c
diff --git a/libgomp/config/nvptx/oacc-cuda.c b/libgomp/config/accel/oacc-cuda.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-cuda.c
rename to libgomp/config/accel/oacc-cuda.c
diff --git a/libgomp/config/nvptx/oacc-host.c b/libgomp/config/accel/oacc-host.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-host.c
rename to libgomp/config/accel/oacc-host.c
diff --git a/libgomp/config/nvptx/oacc-init.c b/libgomp/config/accel/oacc-init.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-init.c
rename to libgomp/config/accel/oacc-init.c
diff --git a/libgomp/config/nvptx/oacc-mem.c b/libgomp/config/accel/oacc-mem.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-mem.c
rename to libgomp/config/accel/oacc-mem.c
diff --git a/libgomp/config/nvptx/oacc-plugin.c b/libgomp/config/accel/oacc-plugin.c
similarity index 100%
rename from libgomp/config/nvptx/oacc-plugin.c
rename to libgomp/config/accel/oacc-plugin.c
diff --git a/libgomp/config/nvptx/omp-lock.h b/libgomp/config/accel/omp-lock.h
similarity index 100%
rename from libgomp/config/nvptx/omp-lock.h
rename to libgomp/config/accel/omp-lock.h
diff --git a/libgomp/config/nvptx/openacc.f90 b/libgomp/config/accel/openacc.f90
similarity index 96%
rename from libgomp/config/nvptx/openacc.f90
rename to libgomp/config/accel/openacc.f90
index a7f690e15727..c2331d893ba7 100644
--- a/libgomp/config/nvptx/openacc.f90
+++ b/libgomp/config/accel/openacc.f90
@@ -51,6 +51,8 @@ module openacc_kinds
   ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
   integer (acc_device_kind), parameter :: acc_device_not_host = 4
   integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+  integer (acc_device_kind), parameter :: acc_device_hsa = 7  ! GOMP_DEVICE_HSA
+  integer (acc_device_kind), parameter :: acc_device_gcn = 8  ! GOMP_DEVICE_GCN
 
 end module
 
diff --git a/libgomp/config/nvptx/pool.h b/libgomp/config/accel/pool.h
similarity index 100%
rename from libgomp/config/nvptx/pool.h
rename to libgomp/config/accel/pool.h
diff --git a/libgomp/config/nvptx/proc.c b/libgomp/config/accel/proc.c
similarity index 98%
rename from libgomp/config/nvptx/proc.c
rename to libgomp/config/accel/proc.c
index 8ca0b0a1ee1a..fb2d4830df73 100644
--- a/libgomp/config/nvptx/proc.c
+++ b/libgomp/config/accel/proc.c
@@ -39,3 +39,4 @@ omp_get_num_procs (void)
 {
   return gomp_icv (false)->nthreads_var;
 }
+ialias (omp_get_num_procs)
diff --git a/libgomp/config/nvptx/ptrlock.c b/libgomp/config/accel/ptrlock.c
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.c
rename to libgomp/config/accel/ptrlock.c
diff --git a/libgomp/config/nvptx/ptrlock.h b/libgomp/config/accel/ptrlock.h
similarity index 100%
rename from libgomp/config/nvptx/ptrlock.h
rename to libgomp/config/accel/ptrlock.h
diff --git a/libgomp/config/nvptx/sem.c b/libgomp/config/accel/sem.c
similarity index 100%
rename from libgomp/config/nvptx/sem.c
rename to libgomp/config/accel/sem.c
diff --git a/libgomp/config/nvptx/sem.h b/libgomp/config/accel/sem.h
similarity index 100%
rename from libgomp/config/nvptx/sem.h
rename to libgomp/config/accel/sem.h
diff --git a/libgomp/config/nvptx/thread-stacksize.h b/libgomp/config/accel/thread-stacksize.h
similarity index 100%
rename from libgomp/config/nvptx/thread-stacksize.h
rename to libgomp/config/accel/thread-stacksize.h
diff --git a/libgomp/config/gcn/affinity-fmt.c b/libgomp/config/gcn/affinity-fmt.c
new file mode 100644
index 000000000000..3585f414460f
--- /dev/null
+++ b/libgomp/config/gcn/affinity-fmt.c
@@ -0,0 +1,51 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for GCN,
+   while the GCN newlib implementation does not support those functions.
+   Override the configure test results here.  */
+#undef HAVE_GETPID
+#undef HAVE_GETHOSTNAME
+
+/* The GCN newlib implementation does not support fwrite, but it does support
+   write.  Map fwrite to write.  */
+#undef fwrite
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
+
+#include "../../affinity-fmt.c"
+
diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
new file mode 100644
index 000000000000..592dacd26f27
--- /dev/null
+++ b/libgomp/config/gcn/bar.c
@@ -0,0 +1,230 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+   mechanism for libgomp.  This type is private to the library.  This
+   implementation uses atomic instructions and s_barrier instruction.  It
+   uses MEMMODEL_RELAXED here because barriers are within workgroups and
+   therefore don't need to flush caches.  */
+
+#include <limits.h>
+#include "libgomp.h"
+
+
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Last arrival: re-arm the count, then publish the next generation.  */
+      bar->awaited = bar->total;
+      unsigned int next_gen = bar->generation + BAR_INCR;
+      __atomic_store_n (&bar->generation, next_gen, MEMMODEL_RELAXED);
+    }
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that the interface allows a thread that
+   is not the last one to hit the barrier to return immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+  /* This implementation always waits: deferring to gomp_barrier_wait does
+     not use the optimization opportunity the interface contract allows for
+     all-but-last participants.  config/linux/bar.c handles this better.  */
+  gomp_barrier_wait (bar);
+}
+
+void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  unsigned int gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state &= ~BAR_CANCELLED;
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+	  asm ("s_barrier" ::: "memory");
+	  return;
+	}
+    }
+
+  /* Rendezvous at s_barrier until the generation advances by BAR_INCR.  */
+  state &= ~BAR_CANCELLED;
+  int retry = 100;
+  do
+    {
+      if (retry-- == 0)
+	{
+	  /* It really shouldn't happen that barriers get out of sync, but
+	     if they do then this will loop until they realign, so we need
+	     to avoid an infinite loop where the thread just isn't there.  */
+	  gomp_print_string ("Barrier sync failed (another thread died?);",
+			     " aborting.");
+	  abort ();
+	}
+
+      asm ("s_barrier" ::: "memory");
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
+	}
+      /* With STATE flag-free, any flag left in GEN forces another round.  */
+    }
+  while (gen != state + BAR_INCR);
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t final_state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (gomp_barrier_last_thread (final_state), 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, final_state);
+}
+
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+				   gomp_barrier_state_t state)
+{
+  unsigned int gen;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      /* BAR_CANCELLED should never be set in state here, because
+	 cancellation means that at least one of the threads has been
+	 cancelled, thus on a cancellable barrier we should never see
+	 all threads to arrive.  */
+      struct gomp_thread *thr = gomp_thread ();
+      struct gomp_team *team = thr->ts.team;
+
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+      if (__builtin_expect (team->task_count, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  state &= ~BAR_WAS_LAST;
+	}
+      else
+	{
+	  state += BAR_INCR - BAR_WAS_LAST;
+	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
+	  asm ("s_barrier" ::: "memory");
+	  return false;
+	}
+    }
+
+  if (__builtin_expect (state & BAR_CANCELLED, 0))
+    return true;
+
+  /* Returns true as soon as cancellation is seen, false once released.  */
+  int retry = 100;
+  do
+    {
+      if (retry-- == 0)
+	{
+	  /* It really shouldn't happen that barriers get out of sync, but
+	     if they do then this will loop until they realign, so we need
+	     to avoid an infinite loop where the thread just isn't there.  */
+	  gomp_print_string ("Barrier sync failed (another thread died?);",
+			     " aborting.");
+	  abort ();
+	}
+
+      asm ("s_barrier" ::: "memory");
+      gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+      if (__builtin_expect (gen & BAR_CANCELLED, 0))
+	return true;
+      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
+	{
+	  gomp_barrier_handle_tasks (state);
+	  gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+	}
+      /* Otherwise loop until the generation has advanced by BAR_INCR.  */
+    }
+  while (gen != state + BAR_INCR);
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  /* Set BAR_CANCELLED under task_lock; only the first canceller wakes.  */
+  gomp_mutex_lock (&team->task_lock);
+  bool already_cancelled = (team->barrier.generation & BAR_CANCELLED) != 0;
+  if (!already_cancelled)
+    team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+
+  if (!already_cancelled)
+    gomp_team_barrier_wake (&team->barrier, INT_MAX);
+}
diff --git a/libgomp/config/gcn/bar.h b/libgomp/config/gcn/bar.h
new file mode 100644
index 000000000000..ec8851ba0787
--- /dev/null
+++ b/libgomp/config/gcn/bar.h
@@ -0,0 +1,168 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is an AMD GCN specific implementation of a barrier synchronization
+   mechanism for libgomp.  This type is private to the library.  This
+   implementation uses atomic instructions and s_barrier instruction.  It
+   uses MEMMODEL_RELAXED here because barriers are within workgroups and
+   therefore don't need to flush caches.  */
+
+#ifndef GOMP_BARRIER_H
+#define GOMP_BARRIER_H 1
+
+#include "mutex.h"
+
+typedef struct
+{
+  unsigned total;		/* Threads that take part in the barrier.  */
+  unsigned generation;		/* Counter in high bits, BAR_* flags low.  */
+  unsigned awaited;		/* Arrivals still outstanding this round.  */
+  unsigned awaited_final;	/* Likewise, for the final team barrier.  */
+} gomp_barrier_t;
+
+typedef unsigned int gomp_barrier_state_t;
+
+/* The generation field contains a counter in the high bits, with a few
+   low bits dedicated to flags.  Note that TASK_PENDING and WAS_LAST can
+   share space because WAS_LAST is never stored back to generation.  */
+#define BAR_TASK_PENDING	1
+#define BAR_WAS_LAST		1
+#define BAR_WAITING_FOR_TASK	2
+#define BAR_CANCELLED		4
+#define BAR_INCR		8
+
+static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
+{
+  /* Start at generation zero with all COUNT threads still to arrive at
+     both the ordinary and the final barrier.  */
+  bar->total = bar->awaited = bar->awaited_final = count;
+  bar->generation = 0;
+}
+
+static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
+{
+  __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_RELAXED);
+  bar->total = count;
+}
+
+static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
+{
+}
+
+extern void gomp_barrier_wait (gomp_barrier_t *);
+extern void gomp_barrier_wait_last (gomp_barrier_t *);
+extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
+extern void gomp_team_barrier_wait (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
+extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
+					gomp_barrier_state_t);
+extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
+extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
+					       gomp_barrier_state_t);
+extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
+struct gomp_team;
+extern void gomp_team_barrier_cancel (struct gomp_team *);
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_start (gomp_barrier_t *bar)
+{
+  unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+  ret &= -BAR_INCR | BAR_CANCELLED;
+  /* OpenMP API version 3.1 section 2.8.6 (flush Construct) requires an
+     implicit flush during a barrier region.  The decrement below is only
+     MEMMODEL_RELAXED; per the comment at the top of this file, barriers
+     are within workgroups and so need no cache flush.
+     NOTE(review): confirm RELAXED is sufficient for the implicit flush.  */
+  if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_RELAXED) == 0)
+    ret |= BAR_WAS_LAST;
+  return ret;
+}
+
+static inline gomp_barrier_state_t
+gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
+{
+  return gomp_barrier_wait_start (bar);
+}
+
+/* This is like gomp_barrier_wait_start, except it decrements
+   bar->awaited_final rather than bar->awaited and should be used
+   for the gomp_team_end barrier only.  */
+static inline gomp_barrier_state_t
+gomp_barrier_wait_final_start (gomp_barrier_t *bar)
+{
+  /* Snapshot the generation, keeping the counter and the cancel flag.  */
+  unsigned int gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
+  gomp_barrier_state_t state = gen & (-BAR_INCR | BAR_CANCELLED);
+  if (__atomic_sub_fetch (&bar->awaited_final, 1, MEMMODEL_RELAXED) == 0)
+    state |= BAR_WAS_LAST;
+  return state;
+}
+
+static inline bool
+gomp_barrier_last_thread (gomp_barrier_state_t state)
+{
+  return (state & BAR_WAS_LAST) != 0;
+}
+
+/* All the inlines below must be called with team->task_lock
+   held.  */
+
+static inline void
+gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
+{
+  bar->generation &= ~BAR_TASK_PENDING;
+}
+
+static inline void
+gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  bar->generation |= BAR_WAITING_FOR_TASK;
+}
+
+static inline bool
+gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
+{
+  return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
+}
+
+static inline bool
+gomp_team_barrier_cancelled (gomp_barrier_t *bar)
+{
+  return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
+}
+
+static inline void
+gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  bar->generation = (state & -BAR_INCR) + BAR_INCR;
+}
+
+#endif /* GOMP_BARRIER_H */
diff --git a/libgomp/config/gcn/doacross.h b/libgomp/config/gcn/doacross.h
new file mode 100644
index 000000000000..2bff18ae1a8c
--- /dev/null
+++ b/libgomp/config/gcn/doacross.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is the AMD GCN implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+
+static inline int
+cpu_relax (void)
+{
+  /* A plain memory barrier would do, but a sleep seems likely to let the
+     wavefront yield (unverified).  1 is the shortest sleep: 1*64 cycles.
+     Always returns 0, which doacross_spin uses as its array index.  */
+  asm volatile ("s_sleep\t1" ::: "memory");
+  return 0;
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* Prevent compiler from optimizing based on bounds of containing object.  */
+  asm ("" : "+r" (addr));
+  do
+    {
+      /* Sleep briefly, then reload ADDR[0] (cpu_relax returns 0).  An
+	 alternative might lower the priority with s_setprio meanwhile.  */
+      int i = cpu_relax ();
+      cur = addr[i];
+    }
+  while (cur <= expected);
+}
+
+#endif /* GOMP_DOACROSS_H */
diff --git a/libgomp/config/gcn/gomp_print.c b/libgomp/config/gcn/gomp_print.c
new file mode 100644
index 000000000000..1c755730f087
--- /dev/null
+++ b/libgomp/config/gcn/gomp_print.c
@@ -0,0 +1,2 @@
+/* The GCN gomp_print routines live in libgcc where they are available
+   to stand-alone toolchains configured without libgomp.  */
diff --git a/libgomp/config/gcn/icv-device.c b/libgomp/config/gcn/icv-device.c
new file mode 100644
index 000000000000..cbb9dfa11330
--- /dev/null
+++ b/libgomp/config/gcn/icv-device.c
@@ -0,0 +1,72 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+   expected to replace.  */
+
+#include "libgomp.h"
+
+void
+omp_set_default_device (int device_num __attribute__((unused)))
+{
+}
+
+int
+omp_get_default_device (void)
+{
+  return 0;
+}
+
+int
+omp_get_num_devices (void)
+{
+  return 0;
+}
+
+int
+omp_get_num_teams (void)
+{
+  return gomp_num_teams_var + 1;  /* GOMP_teams stores the count minus 1.  */
+}
+
+int __attribute__ ((__optimize__ ("O2")))
+omp_get_team_num (void)
+{
+  return __builtin_gcn_dim_pos (0);  /* Workgroup id; see GOMP_teams.  */
+}
+
+int
+omp_is_initial_device (void)
+{
+  /* AMD GCN is an accelerator-only target.  */
+  return 0;
+}
+
+ialias (omp_set_default_device)
+ialias (omp_get_default_device)
+ialias (omp_get_num_devices)
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
+ialias (omp_is_initial_device)
diff --git a/libgomp/config/gcn/simple-bar.h b/libgomp/config/gcn/simple-bar.h
new file mode 100644
index 000000000000..802e0f5c3017
--- /dev/null
+++ b/libgomp/config/gcn/simple-bar.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a simplified barrier that is suitable for thread pool
+   synchronization.  Only a subset of full barrier API (bar.h) is exposed.
+   Here in the AMD GCN-specific implementation, we expect that thread pool
+   corresponds to the wavefronts within a work group.  */
+
+#ifndef GOMP_SIMPLE_BARRIER_H
+#define GOMP_SIMPLE_BARRIER_H 1
+
+/* AMD GCN has no use for this type.  */
+typedef int gomp_simple_barrier_t;
+
+/* GCN barriers block all wavefronts, so the count is not interesting.  */
+static inline void
+gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
+{
+}
+
+static inline void
+gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
+{
+}
+
+static inline void
+gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
+{
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+static inline void
+gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
+{
+  /* GCN has no way to signal a barrier without waiting.  */
+  asm volatile ("s_barrier" ::: "memory");
+}
+
+#endif /* GOMP_SIMPLE_BARRIER_H */
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
new file mode 100644
index 000000000000..5ec577787726
--- /dev/null
+++ b/libgomp/config/gcn/target.c
@@ -0,0 +1,49 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <limits.h>
+
+void
+GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
+{
+  /* A zero THREAD_LIMIT leaves the ICV untouched.  */
+  if (thread_limit != 0)
+    gomp_icv (true)->thread_limit_var
+      = thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+
+  unsigned int num_workgroups = __builtin_gcn_dim_size (0);
+  unsigned int workgroup_id = __builtin_gcn_dim_pos (0);
+  /* Zero or too many teams requested: use every launched workgroup.  */
+  if (num_teams == 0 || num_teams >= num_workgroups)
+    num_teams = num_workgroups;
+  else if (workgroup_id >= num_teams)
+    {
+      /* This workgroup is surplus to the request; retire it now.  */
+      gomp_free_thread (gcn_thrs ());
+      exit (0);
+    }
+  gomp_num_teams_var = num_teams - 1;
+}
diff --git a/libgomp/config/gcn/task.c b/libgomp/config/gcn/task.c
new file mode 100644
index 000000000000..a13565034b6c
--- /dev/null
+++ b/libgomp/config/gcn/task.c
@@ -0,0 +1,39 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the maintenance of tasks in response to task
+   creation and termination.  */
+
+#include "libgomp.h"
+
+/* AMD GCN is an accelerator-only target, so this should never be called.  */
+
+bool
+gomp_target_task_fn (void *data)
+{
+  __builtin_unreachable ();
+}
+
+#include "../../task.c"
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
new file mode 100644
index 000000000000..79aec65a24e9
--- /dev/null
+++ b/libgomp/config/gcn/team.c
@@ -0,0 +1,202 @@
+/* Copyright (C) 2017-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles maintenance of threads on AMD GCN.  */
+
+#include "libgomp.h"
+#include <stdlib.h>
+#include <string.h>
+
+static void gomp_thread_start (struct gomp_thread_pool *);
+void gomp_print_string (const char *msg, const char *val);
+
+/* This externally visible function handles target region entry.  It
+   sets up a per-team thread pool and transfers control by returning to
+   the kernel in the master thread or gomp_thread_start in other threads.
+
+   The name of this function is part of the interface with the compiler: for
+   each OpenMP kernel the compiler configures the stack, then calls here.
+
+   Likewise, gomp_gcn_exit_kernel is called during the kernel epilogue.  */
+
+void
+gomp_gcn_enter_kernel (void)
+{
+  int self = __builtin_gcn_dim_pos (1);
+  int nthreads = __builtin_gcn_dim_size (1);
+
+  if (self != 0)
+    {
+      /* Workers wait here until thread 0 has published the pool.  */
+      asm ("s_barrier" ::: "memory");
+      gomp_thread_start (gcn_thrs ()[0].thread_pool);
+      /* gomp_thread_start does not return.  */
+    }
+  else
+    {
+      gomp_global_icv.nthreads_var = nthreads;
+      /* Starting additional threads is not supported.  */
+      gomp_global_icv.dyn_var = true;
+
+      set_gcn_thrs (calloc (nthreads, sizeof (struct gomp_thread)));
+      if (!gcn_thrs ())
+	goto oom;
+
+      struct gomp_thread_pool *pool = malloc (sizeof (*pool));
+      if (!pool)
+	goto oom;
+
+      pool->threads = malloc (nthreads * sizeof (*pool->threads));
+      if (!pool->threads)
+	goto oom;
+
+      for (int i = 0; i < nthreads; i++)
+	pool->threads[i] = &gcn_thrs ()[i];
+      pool->threads_size = nthreads;
+      pool->threads_used = nthreads;
+      pool->threads_busy = 1;
+      pool->last_team = NULL;
+      gomp_simple_barrier_init (&pool->threads_dock, nthreads);
+      gcn_thrs ()[0].thread_pool = pool;
+      asm ("s_barrier" ::: "memory");
+      return;  /* Return to kernel.  */
+    }
+
+oom:
+  gomp_print_string ("GCN heap exhausted; try setting GCN_HEAP_SIZE.", "");
+  abort ();
+}
+
+void
+gomp_gcn_exit_kernel (void)
+{
+  gomp_free_thread (gcn_thrs ());
+}
+
+/* This function contains the idle loop in which a thread waits
+   to be called up to become part of a team.  */
+
+static void
+gomp_thread_start (struct gomp_thread_pool *pool)
+{
+  struct gomp_thread *self = gomp_thread ();
+  int idle_wakeups_left = 99;
+
+  gomp_sem_init (&self->release, 0);
+  self->thread_pool = pool;
+
+  /* This loop is left only when "fn" is assigned "gomp_free_pool_helper",
+     which contains "s_endpgm", or by aborting once an endless no-op loop
+     is suspected (this happens when the thread master crashes).  */
+  for (;;)
+    {
+      gomp_simple_barrier_wait (&pool->threads_dock);
+      if (self->fn == NULL)
+	{
+	  /* Woken without work; tolerate that only a few times.  */
+	  if (idle_wakeups_left-- > 0)
+	    continue;
+
+	  gomp_print_string ("team master not responding;",
+			     " slave thread aborting");
+	  abort ();
+	}
+
+      /* Run the assigned function, then mark this thread idle again.  */
+      self->fn (self->data);
+      self->fn = NULL;
+
+      struct gomp_task *task = self->task;
+      gomp_team_barrier_wait_final (&self->ts.team->barrier);
+      gomp_finish_task (task);
+    }
+}
+
+/* Launch a team of NTHREADS; the caller becomes team member 0.  */
+
+void
+gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
+		 unsigned flags, struct gomp_team *team,
+		 struct gomp_taskgroup *taskgroup)
+{
+  struct gomp_thread *thr, *nthr;
+  struct gomp_task *task;
+  struct gomp_task_icv *icv;
+  struct gomp_thread_pool *pool;
+  unsigned long nthreads_var;
+
+  thr = gomp_thread ();
+  pool = thr->thread_pool;
+  task = thr->task;
+  icv = task ? &task->icv : &gomp_global_icv;
+
+  /* Always save the previous state, even if this isn't a nested team.
+     In particular, we should save any work share state from an outer
+     orphaned work share construct.  */
+  team->prev_ts = thr->ts;
+
+  thr->ts.team = team;
+  thr->ts.team_id = 0;
+  ++thr->ts.level;
+  if (nthreads > 1)
+    ++thr->ts.active_level;
+  thr->ts.work_share = &team->work_shares[0];
+  thr->ts.last_work_share = NULL;
+  thr->ts.single_count = 0;
+  thr->ts.static_trip = 0;
+  thr->task = &team->implicit_task[0];
+  nthreads_var = icv->nthreads_var;
+  gomp_init_task (thr->task, task, icv);
+  team->implicit_task[0].icv.nthreads_var = nthreads_var;
+  team->implicit_task[0].taskgroup = taskgroup;
+
+  if (nthreads == 1)
+    return;  /* No other team members to set up or release.  */
+
+  /* Hand FN, DATA and the team state to each idle pool thread.  */
+  for (unsigned i = 1; i < nthreads; ++i)
+    {
+      nthr = pool->threads[i];
+      nthr->ts.team = team;
+      nthr->ts.work_share = &team->work_shares[0];
+      nthr->ts.last_work_share = NULL;
+      nthr->ts.team_id = i;
+      nthr->ts.level = team->prev_ts.level + 1;
+      nthr->ts.active_level = thr->ts.active_level;
+      nthr->ts.single_count = 0;
+      nthr->ts.static_trip = 0;
+      nthr->task = &team->implicit_task[i];
+      gomp_init_task (nthr->task, task, icv);
+      team->implicit_task[i].icv.nthreads_var = nthreads_var;
+      team->implicit_task[i].taskgroup = taskgroup;
+      nthr->fn = fn;
+      nthr->data = data;
+      team->ordered_release[i] = &nthr->release;
+    }
+
+  gomp_simple_barrier_wait (&pool->threads_dock);  /* Release them.  */
+}
+
+#include "../../team.c"
diff --git a/libgomp/config/gcn/time.c b/libgomp/config/gcn/time.c
new file mode 100644
index 000000000000..f189e55889c4
--- /dev/null
+++ b/libgomp/config/gcn/time.c
@@ -0,0 +1,52 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file implements timer routines for AMD GCN.  */
+
+#include "libgomp.h"
+
+/* According to AMD:
+    dGPU RTC is 27MHz
+    AGPU RTC is 100MHz
+   FIXME: DTRT on an APU.  */
+#define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */
+
+double
+omp_get_wtime (void)
+{
+  uint64_t clock;
+  /* Volatile: every call must re-read the RTC; never CSE or hoist it.  */
+  asm volatile ("s_memrealtime %0\n\t" "s_waitcnt 0" : "=r" (clock));
+  return clock * RTC_TICKS;
+}
+
+double
+omp_get_wtick (void)
+{
+  return RTC_TICKS;  /* Seconds per timer tick, i.e. the wtime resolution.  */
+}
+
+ialias (omp_get_wtime)
+ialias (omp_get_wtick)
diff --git a/libgomp/config/linux/gomp_print.c b/libgomp/config/linux/gomp_print.c
new file mode 100644
index 000000000000..811bdd6e9a93
--- /dev/null
+++ b/libgomp/config/linux/gomp_print.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <stdint.h>
+
+void
+gomp_print_string (const char *msg, const char *value)
+{
+  printf ("%s%s\n", msg, value);  /* MSG, then VALUE, then a newline.  */
+}
+
+void
+gomp_print_integer (const char *msg, int64_t value)
+{
+  printf ("%s%lld\n", msg, (long long) value);  /* int64_t != long on ILP32.  */
+}
+
+void
+gomp_print_double (const char *msg, double value)
+{
+  printf ("%s%f\n", msg, value);  /* "%f": fixed notation, six decimals.  */
+}
diff --git a/libgomp/configure b/libgomp/configure
index afceea5cf880..39da8af45461 100755
--- a/libgomp/configure
+++ b/libgomp/configure
@@ -661,6 +661,8 @@ LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE
 LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE
 OPT_LDFLAGS
 SECTION_LDFLAGS
+PLUGIN_GCN_FALSE
+PLUGIN_GCN_TRUE
 PLUGIN_HSA_FALSE
 PLUGIN_HSA_TRUE
 PLUGIN_NVPTX_FALSE
@@ -669,6 +671,10 @@ offload_additional_lib_paths
 offload_additional_options
 offload_targets
 offload_plugins
+PLUGIN_GCN_LIBS
+PLUGIN_GCN_LDFLAGS
+PLUGIN_GCN_CPPFLAGS
+PLUGIN_GCN
 PLUGIN_HSA_LIBS
 PLUGIN_HSA_LDFLAGS
 PLUGIN_HSA_CPPFLAGS
@@ -11396,7 +11402,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11399 "configure"
+#line 11405 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11502,7 +11508,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11505 "configure"
+#line 11511 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -15315,7 +15321,7 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
-  nvptx*-*-*)
+  nvptx*-*-* | amdgcn*-*-*)
     # NVPTX does not support Pthreads, has its own code replacement.
     libgomp_use_pthreads=no
     # NVPTX is an accelerator-only target
@@ -15645,6 +15651,15 @@ PLUGIN_HSA_LIBS=
 
 
 
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+
+
+
+
+
 # Parse '--enable-offload-targets', figure out the corresponding libgomp
 # plugins, and configure to find the corresponding offload compilers.
 # 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -15756,6 +15771,29 @@ rm -f core conftest.err conftest.$ac_objext \
             ;;
         esac
         ;;
+
+      amdgcn*)
+	case "${target}" in
+	  x86_64-*-*)
+	    case " ${CC} ${CFLAGS} " in
+	      *" -m32 "*)
+		PLUGIN_GCN=0
+		;;
+	      *)
+		tgt_name=gcn
+		PLUGIN_GCN=$tgt
+		PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+		PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+		PLUGIN_GCN_LIBS="-ldl"
+		PLUGIN_GCN=1
+		;;
+	      esac
+	    ;;
+	  *-*-*)
+	    PLUGIN_GCN=0
+	     ;;
+	esac
+	;;
       *)
 	as_fn_error $? "unknown offload target specified" "$LINENO" 5
 	;;
@@ -15820,6 +15858,19 @@ cat >>confdefs.h <<_ACEOF
 #define PLUGIN_HSA $PLUGIN_HSA
 _ACEOF
 
+ if test $PLUGIN_GCN = 1; then
+  PLUGIN_GCN_TRUE=
+  PLUGIN_GCN_FALSE='#'
+else
+  PLUGIN_GCN_TRUE='#'
+  PLUGIN_GCN_FALSE=
+fi
+
+
+cat >>confdefs.h <<_ACEOF
+#define PLUGIN_GCN $PLUGIN_GCN
+_ACEOF
+
 
 if test "$HSA_RUNTIME_LIB" != ""; then
   HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
@@ -17445,6 +17496,10 @@ if test -z "${PLUGIN_HSA_TRUE}" && test -z "${PLUGIN_HSA_FALSE}"; then
   as_fn_error $? "conditional \"PLUGIN_HSA\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${PLUGIN_GCN_TRUE}" && test -z "${PLUGIN_GCN_FALSE}"; then
+  as_fn_error $? "conditional \"PLUGIN_GCN\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_TRUE}" && test -z "${LIBGOMP_BUILD_VERSIONED_SHLIB_FALSE}"; then
   as_fn_error $? "conditional \"LIBGOMP_BUILD_VERSIONED_SHLIB\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
diff --git a/libgomp/configure.ac b/libgomp/configure.ac
index 2350b78f310e..33e183909b30 100644
--- a/libgomp/configure.ac
+++ b/libgomp/configure.ac
@@ -177,7 +177,7 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
-  nvptx*-*-*)
+  nvptx*-*-* | amdgcn*-*-*)
     # NVPTX does not support Pthreads, has its own code replacement.
     libgomp_use_pthreads=no
     # NVPTX is an accelerator-only target
diff --git a/libgomp/configure.tgt b/libgomp/configure.tgt
index b88bf72fe3de..06ee115ece92 100644
--- a/libgomp/configure.tgt
+++ b/libgomp/configure.tgt
@@ -154,7 +154,7 @@ case "${target}" in
 	;;
 
   nvptx*-*-*)
-	config_path="nvptx"
+	config_path="nvptx accel"
 	;;
 
   *-*-rtems*)
@@ -164,6 +164,10 @@ case "${target}" in
 	fi
 	;;
 
+  amdgcn*-*-*)
+	config_path="gcn accel"
+	;;
+
   *)
 	;;
 
diff --git a/libgomp/fortran.c b/libgomp/fortran.c
index 4d544be1c99b..fdc324d129d7 100644
--- a/libgomp/fortran.c
+++ b/libgomp/fortran.c
@@ -626,7 +626,7 @@ omp_display_affinity_ (const char *format, size_t format_len)
   if (ret < sizeof buf)
     {
       buf[ret] = '\n';
-      gomp_print_string (buf, ret + 1);
+      gomp_write_string (buf, ret + 1);
     }
   else
     {
@@ -635,7 +635,7 @@ omp_display_affinity_ (const char *format, size_t format_len)
 			     format_len ? fmt : gomp_affinity_format_var,
 			     gomp_thread_self (), &thr->ts, thr->place);
       b[ret] = '\n';
-      gomp_print_string (b, ret + 1);
+      gomp_write_string (b, ret + 1);
       free (b);
     }
   if (fmt && fmt != fmt_buf)
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index da8ce260564a..bd63c422b0ce 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -50,7 +50,8 @@ enum offload_target_type
   /* OFFLOAD_TARGET_TYPE_HOST_NONSHM = 3 removed.  */
   OFFLOAD_TARGET_TYPE_NVIDIA_PTX = 5,
   OFFLOAD_TARGET_TYPE_INTEL_MIC = 6,
-  OFFLOAD_TARGET_TYPE_HSA = 7
+  OFFLOAD_TARGET_TYPE_HSA = 7,
+  OFFLOAD_TARGET_TYPE_GCN = 8
 };
 
 /* Container type for passing device properties.  */
@@ -120,7 +121,7 @@ extern void GOMP_OFFLOAD_openacc_exec (void (*) (void *), size_t, void **,
 				       void **, unsigned *, void *);
 extern void *GOMP_OFFLOAD_openacc_create_thread_data (int);
 extern void GOMP_OFFLOAD_openacc_destroy_thread_data (void *);
-extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (void);
+extern struct goacc_asyncqueue *GOMP_OFFLOAD_openacc_async_construct (int);
 extern bool GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *);
 extern int GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *);
 extern bool GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 31403ba67a9c..803f72db9223 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -692,6 +692,24 @@ static inline struct gomp_thread *gomp_thread (void)
   asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
   return nvptx_thrs + tid;
 }
+#elif defined __AMDGCN__
+static inline struct gomp_thread *gcn_thrs (void)
+{
+  /* The thread-array pointer sits at the bottom of LDS (address 4).  */
+  struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+  return *thrs;
+}
+static inline void set_gcn_thrs (struct gomp_thread *val)
+{
+  /* Store VAL in the LDS slot (address 4) that gcn_thrs reads back.  */
+  struct gomp_thread * __lds *thrs = (struct gomp_thread * __lds *)4;
+  *thrs = val;
+}
+static inline struct gomp_thread *gomp_thread (void)
+{
+  int tid = __builtin_gcn_dim_pos(1);
+  return gcn_thrs () + tid;  /* Entry TID of the array kept in LDS.  */
+}
 #elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct gomp_thread gomp_tls_data;
 static inline struct gomp_thread *gomp_thread (void)
@@ -751,7 +769,7 @@ extern void gomp_display_affinity_place (char *, size_t, size_t *, int);
 
 /* affinity-fmt.c */
 
-extern void gomp_print_string (const char *str, size_t len);
+extern void gomp_write_string (const char *str, size_t len);
 extern void gomp_set_affinity_format (const char *, size_t);
 extern void gomp_display_string (char *, size_t, size_t *, const char *,
 				 size_t);
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 3be4a0cd37eb..02596b0e2656 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -312,6 +312,9 @@ GOMP_4.5 {
 	GOMP_loop_ull_nonmonotonic_guided_start;
 	GOMP_parallel_loop_nonmonotonic_dynamic;
 	GOMP_parallel_loop_nonmonotonic_guided;
+	gomp_print_string;
+	gomp_print_integer;
+	gomp_print_double;
 } GOMP_4.0.1;
 
 GOMP_5.0 {
diff --git a/libgomp/oacc-async.c b/libgomp/oacc-async.c
index 1760e8c90c63..2b24ae7adc28 100644
--- a/libgomp/oacc-async.c
+++ b/libgomp/oacc-async.c
@@ -100,7 +100,8 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
 
   if (!dev->openacc.async.asyncqueue[async])
     {
-      dev->openacc.async.asyncqueue[async] = dev->openacc.async.construct_func ();
+      dev->openacc.async.asyncqueue[async]
+	= dev->openacc.async.construct_func (dev->target_id);
 
       if (!dev->openacc.async.asyncqueue[async])
 	{
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index beeca287e15b..21f73302f035 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -257,7 +257,7 @@ host_openacc_async_queue_callback (struct goacc_asyncqueue *aq
 }
 
 static struct goacc_asyncqueue *
-host_openacc_async_construct (void)
+host_openacc_async_construct (int device __attribute__((unused)))
 {
   /* Non-NULL 0xffff... value as opaque dummy.  */
   return (struct goacc_asyncqueue *) -1;
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 766f59f39635..c9896e7ca24f 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -110,8 +110,9 @@ name_of_acc_device_t (enum acc_device_t type)
     case acc_device_host: return "host";
     case acc_device_not_host: return "not_host";
     case acc_device_nvidia: return "nvidia";
-    case /* not supported */ _acc_device_intel_mic:
+    case acc_device_gcn: return "gcn";
     case /* not supported */ _acc_device_hsa:
+    case /* not supported */ _acc_device_intel_mic:
     default: gomp_fatal ("unknown device type %u", (unsigned) type);
     }
 }
diff --git a/libgomp/oacc-int.h b/libgomp/oacc-int.h
index 5ca9944601e2..9dc6c8a57139 100644
--- a/libgomp/oacc-int.h
+++ b/libgomp/oacc-int.h
@@ -82,7 +82,14 @@ struct goacc_thread
   void *target_tls;
 };
 
-#if defined HAVE_TLS || defined USE_EMUTLS
+#ifdef __AMDGCN__
+static inline struct goacc_thread *
+goacc_thread (void)
+{
+  /* Unused in the offload libgomp for OpenACC: return a dummy value.  */
+  return 0;
+}
+#elif defined HAVE_TLS || defined USE_EMUTLS
 extern __thread struct goacc_thread *goacc_tls_data;
 static inline struct goacc_thread *
 goacc_thread (void)
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index a374f91f0d73..6b089763e9a4 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -732,6 +732,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  || kind == GOMP_MAP_FORCE_FROM)
 	finalize = true;
     }
+  else if (num_waits == acc_async_noval)
+    acc_wait_all_async (async);
 
   /* Determine if this is an "acc enter data".  */
   for (i = 0; i < mapnum; ++i)
@@ -749,7 +751,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  || kind == GOMP_MAP_FORCE_TO
 	  || kind == GOMP_MAP_TO
 	  || kind == GOMP_MAP_ALLOC
-	  || kind == GOMP_MAP_DECLARE_ALLOCATE)
+	  || kind == GOMP_MAP_DECLARE_ALLOCATE
+	  || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
 	{
 	  data_enter = true;
 	  break;
@@ -761,7 +764,8 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
 	  || kind == GOMP_MAP_FORCE_DETACH
 	  || kind == GOMP_MAP_FROM
 	  || kind == GOMP_MAP_FORCE_FROM
-	  || kind == GOMP_MAP_DECLARE_DEALLOCATE)
+	  || kind == GOMP_MAP_DECLARE_DEALLOCATE
+	  || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
 	break;
 
       gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -866,6 +870,10 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		case GOMP_MAP_FORCE_ALLOC:
 		  acc_create_async (hostaddrs[i], sizes[i], async);
 		  break;
+		case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+		  if (hostaddrs[i] != NULL)
+		    acc_create_async (hostaddrs[i], 1, async);
+		  break;
 		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
 		  if (hostaddrs[i])
@@ -986,6 +994,15 @@ GOACC_enter_exit_data (int flags_m, size_t mapnum,
 			acc_delete_async (hostaddrs[i], sizes[i], async);
 		    }
 		  break;
+		case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
+		  if (acc_is_present (hostaddrs[i], 1))
+		    {
+		      if (finalize)
+			acc_delete_finalize_async (hostaddrs[i], 1, async);
+		      else
+			acc_delete_async (hostaddrs[i], 1, async);
+		    }
+		  break;
 		case GOMP_MAP_DETACH:
 		case GOMP_MAP_FORCE_DETACH:
 		case GOMP_MAP_FORCE_PRESENT:
@@ -1196,6 +1213,7 @@ GOACC_update (int flags_m, size_t mapnum,
 	{
 	case GOMP_MAP_POINTER:
 	case GOMP_MAP_TO_PSET:
+	case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
 	  break;
 
 	case GOMP_MAP_ALWAYS_POINTER:
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index d7ac71400ad6..0d8f4763b550 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -188,6 +188,18 @@ extern __SIZE_TYPE__ omp_capture_affinity (char *, __SIZE_TYPE__, const char *)
 extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW;
 extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW;
 
+/************************************************************************/
+/* Libgomp extensions.                                                  */
+
+#include <stdint.h>
+
+/* Print a message, and value, possibly from a GPU-offloaded function.
+   Primarily intended for debug messages.
+   Maximum message & value length is 128 bytes.  */
+void gomp_print_string (const char *msg, const char *value);
+void gomp_print_integer (const char *msg, int64_t value);
+void gomp_print_double (const char *msg, double value);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index c4e7c0dff4ad..65aec54da6b9 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -484,4 +484,30 @@
           end function
         end interface
 
+        ! Libgomp extensions.
+
+        interface
+          subroutine gomp_print_string (msg, str) bind(C)
+            use iso_c_binding, only: c_char
+            character (kind=c_char) :: msg(*)
+            character (kind=c_char) :: str(*)
+          end subroutine gomp_print_string
+        end interface
+
+        interface
+          subroutine gomp_print_integer (msg, i) bind(C)
+            use iso_c_binding, only: c_char, c_int64_t
+            character (kind=c_char) :: msg(*)
+            integer (kind=c_int64_t), value :: i
+          end subroutine gomp_print_integer
+        end interface
+
+        interface
+          subroutine gomp_print_double (msg, d) bind(C)
+            use iso_c_binding, only: c_char, c_double
+            character (kind=c_char) :: msg(*)
+            real (kind=c_double), value :: d
+          end subroutine gomp_print_double
+        end interface
+
       end module omp_lib
diff --git a/libgomp/openacc.f90 b/libgomp/openacc.f90
index 8b514c54e398..e675843a4114 100644
--- a/libgomp/openacc.f90
+++ b/libgomp/openacc.f90
@@ -37,7 +37,7 @@ module openacc_kinds
   integer, parameter :: acc_device_kind = int32
 
   public :: acc_device_none, acc_device_default, acc_device_host
-  public :: acc_device_not_host, acc_device_nvidia
+  public :: acc_device_not_host, acc_device_nvidia, acc_device_gcn
 
   ! Keep in sync with include/gomp-constants.h.
   integer (acc_device_kind), parameter :: acc_device_none = 0
@@ -46,7 +46,9 @@ module openacc_kinds
   ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
   integer (acc_device_kind), parameter :: acc_device_not_host = 4
   integer (acc_device_kind), parameter :: acc_device_nvidia = 5
-  integer (acc_device_kind), parameter :: acc_device_current = 8
+  integer (acc_device_kind), parameter :: acc_device_hsa = 7
+  integer (acc_device_kind), parameter :: acc_device_gcn = 8
+  integer (acc_device_kind), parameter :: acc_device_current = 9
 
   public :: acc_device_property
 
diff --git a/libgomp/openacc.h b/libgomp/openacc.h
index 26084dc5ddd3..83e7c46a70d6 100644
--- a/libgomp/openacc.h
+++ b/libgomp/openacc.h
@@ -57,7 +57,8 @@ typedef enum acc_device_t {
   acc_device_nvidia = 5,
   /* not supported */ _acc_device_intel_mic = 6,
   /* not supported */ _acc_device_hsa = 7,
-  acc_device_current = 8,
+  acc_device_gcn = 8,
+  acc_device_current = 9,
   _ACC_device_hwm,
   /* Ensure enumeration is layout compatible with int.  */
   _ACC_highest = __INT_MAX__,
diff --git a/libgomp/openacc_lib.h b/libgomp/openacc_lib.h
index 9c26fecff41c..3284b961631c 100644
--- a/libgomp/openacc_lib.h
+++ b/libgomp/openacc_lib.h
@@ -42,6 +42,8 @@
 !     removed.
       integer (acc_device_kind), parameter :: acc_device_not_host = 4
       integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+      integer (acc_device_kind), parameter :: acc_device_hsa = 7
+      integer (acc_device_kind), parameter :: acc_device_gcn = 8
 
       integer, parameter :: acc_handle_kind = 4
 
diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am
index 168ef59de413..45ed043e333d 100644
--- a/libgomp/plugin/Makefrag.am
+++ b/libgomp/plugin/Makefrag.am
@@ -52,3 +52,17 @@ libgomp_plugin_hsa_la_LDFLAGS += $(PLUGIN_HSA_LDFLAGS)
 libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
 endif
+
+if PLUGIN_GCN
+# AMD GCN plugin
+libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+	-D_GNU_SOURCE
+libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
+	$(lt_host_flags)
+libgomp_plugin_gcn_la_LDFLAGS += $(PLUGIN_GCN_LDFLAGS)
+libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
+endif
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
index 13ca26f47d99..6fedd28eccc8 100644
--- a/libgomp/plugin/configfrag.ac
+++ b/libgomp/plugin/configfrag.ac
@@ -137,6 +137,15 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS)
 AC_SUBST(PLUGIN_HSA_LDFLAGS)
 AC_SUBST(PLUGIN_HSA_LIBS)
 
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+AC_SUBST(PLUGIN_GCN)
+AC_SUBST(PLUGIN_GCN_CPPFLAGS)
+AC_SUBST(PLUGIN_GCN_LDFLAGS)
+AC_SUBST(PLUGIN_GCN_LIBS)
+
 # Parse '--enable-offload-targets', figure out the corresponding libgomp
 # plugins, and configure to find the corresponding offload compilers.
 # 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -237,6 +246,29 @@ if test x"$enable_offload_targets" != x; then
             ;;
         esac
         ;;
+
+      amdgcn*)
+	case "${target}" in
+	  x86_64-*-*)
+	    case " ${CC} ${CFLAGS} " in
+	      *" -m32 "*)
+		PLUGIN_GCN=0
+		;;
+	      *)
+		tgt_name=gcn
+		PLUGIN_GCN=$tgt
+		PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+		PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+		PLUGIN_GCN_LIBS="-ldl"
+		PLUGIN_GCN=1
+		;;
+	      esac
+	    ;;
+	  *-*-*)
+	    PLUGIN_GCN=0
+	     ;;
+	esac
+	;;
       *)
 	AC_MSG_ERROR([unknown offload target specified])
 	;;
@@ -275,6 +307,9 @@ AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
 AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
   [Define to 1 if the HSA plugin is built, 0 if not.])
+AM_CONDITIONAL([PLUGIN_GCN], [test $PLUGIN_GCN = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_GCN], [$PLUGIN_GCN],
+  [Define to 1 if the GCN plugin is built, 0 if not.])
 
 if test "$HSA_RUNTIME_LIB" != ""; then
   HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
new file mode 100644
index 000000000000..b059348c7bf6
--- /dev/null
+++ b/libgomp/plugin/plugin-gcn.c
@@ -0,0 +1,3482 @@
+/* Plugin for AMD GCN execution.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <hsa.h>
+#include <dlfcn.h>
+#include <signal.h>
+#include "libgomp-plugin.h"
+#include "gomp-constants.h"
+#include <elf.h>
+#include "oacc-plugin.h"
+#include "oacc-int.h"
+#include <assert.h>
+
+#define obstack_chunk_alloc GOMP_PLUGIN_malloc
+#define obstack_chunk_free free
+#include "obstack.h"
+
+/* These probably won't be in elf.h for a while.  */
+#define R_AMDGPU_NONE		0
+#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
+#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
+#define R_AMDGPU_ABS64		3	/* S + A  */
+#define R_AMDGPU_REL32		4	/* S + A - P  */
+#define R_AMDGPU_REL64		5	/* S + A - P  */
+#define R_AMDGPU_ABS32		6	/* S + A  */
+#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
+#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
+#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
+/* Relocation number 12 is reserved; deliberately not given a macro.  */
+#define R_AMDGPU_RELATIVE64	13	/* B + A  */
+
+/* Secure getenv() which returns NULL if running as SUID/SGID.  */
+#ifndef HAVE_SECURE_GETENV
+#ifdef HAVE___SECURE_GETENV
+#define secure_getenv __secure_getenv
+#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
+  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
+
+#include <unistd.h>
+
+/* Implementation of secure_getenv() for targets where it is not provided but
+   we have at least means to test real and effective IDs. */
+
+static char *
+secure_getenv (const char *name)
+{
+  /* Differing real/effective IDs mean SUID/SGID: hide the environment.  */
+  if (getuid () != geteuid () || getgid () != getegid ())
+    return NULL;
+  return getenv (name);
+}
+
+#else
+#define secure_getenv getenv
+#endif
+#endif
+
+struct gcn_thread
+{
+  int async;
+};
+
+static inline struct gcn_thread *
+gcn_thread (void)
+{
+  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+/* As the HSA runtime is dlopened, the following structure defines the
+   function pointers used by this plug-in.  */
+
+struct hsa_runtime_fn_info
+{
+  /* HSA runtime.  */
+  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
+					const char **status_string);
+  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
+					 hsa_agent_info_t attribute,
+					 void *value);
+  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
+				      hsa_isa_info_t attribute,
+				      uint32_t index,
+				      void *value);
+  hsa_status_t (*hsa_init_fn) (void);
+  hsa_status_t (*hsa_iterate_agents_fn)
+    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
+  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
+					  hsa_region_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_queue_create_fn)
+    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
+     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+     void *data, uint32_t private_segment_size,
+     uint32_t group_segment_size, hsa_queue_t **queue);
+  hsa_status_t (*hsa_agent_iterate_regions_fn)
+    (hsa_agent_t agent,
+     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
+  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
+  hsa_status_t (*hsa_executable_create_fn)
+    (hsa_profile_t profile, hsa_executable_state_t executable_state,
+     const char *options, hsa_executable_t *executable);
+  hsa_status_t (*hsa_executable_global_variable_define_fn)
+    (hsa_executable_t executable, const char *variable_name, void *address);
+  hsa_status_t (*hsa_executable_load_code_object_fn)
+    (hsa_executable_t executable, hsa_agent_t agent,
+     hsa_code_object_t code_object, const char *options);
+  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
+					   const char *options);
+  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
+					uint32_t num_consumers,
+					const hsa_agent_t *consumers,
+					hsa_signal_t *signal);
+  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
+					  void **ptr);
+  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
+  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
+  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_executable_get_symbol_fn)
+    (hsa_executable_t executable, const char *module_name,
+     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
+     hsa_executable_symbol_t *symbol);
+  hsa_status_t (*hsa_executable_symbol_get_info_fn)
+    (hsa_executable_symbol_t executable_symbol,
+     hsa_executable_symbol_info_t attribute, void *value);
+  hsa_status_t (*hsa_executable_iterate_symbols_fn)
+    (hsa_executable_t executable,
+     hsa_status_t (*callback)(hsa_executable_t executable,
+			      hsa_executable_symbol_t symbol, void *data),
+     void *data);
+  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
+						    uint64_t value);
+  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
+  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
+    (hsa_signal_t signal, hsa_signal_condition_t condition,
+     hsa_signal_value_t compare_value, uint64_t timeout_hint,
+     hsa_wait_state_t wait_state_hint);
+  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
+
+  hsa_status_t (*hsa_code_object_deserialize_fn)
+    (void *serialized_code_object, size_t serialized_code_object_size,
+     const char *options, hsa_code_object_t *code_object);
+};
+
+/* HSA runtime functions that are initialized in init_hsa_context.  */
+
+static struct hsa_runtime_fn_info hsa_fns;
+
+/* Keep the following GOMP prefixed structures in sync with respective parts of
+   the compiler.  */
+
+/* Structure describing the run-time and grid properties of an HSA kernel
+   launch.  */
+
+struct GOMP_kernel_launch_attributes
+{
+  /* Number of dimensions the workload has.  Maximum number is 3.  */
+  uint32_t ndim;
+  /* Size of the grid in the three respective dimensions.  */
+  uint32_t gdims[3];
+  /* Size of work-groups in the respective dimensions.  */
+  uint32_t wdims[3];
+};
+
+/* Collection of information needed for a dispatch of a kernel from a
+   kernel.  */
+
+struct GOMP_hsa_kernel_dispatch
+{
+  /* Pointer to a command queue associated with a kernel dispatch agent.  */
+  void *queue;
+  /* Pointer to reserved memory for OMP data struct copying.  */
+  void *omp_data_memory;
+  /* Pointer to a memory space used for kernel arguments passing.  */
+  void *kernarg_address;
+  /* Kernel object.  */
+  uint64_t object;
+  /* Synchronization signal used for dispatch synchronization.  */
+  uint64_t signal;
+  /* Private segment size.  */
+  uint32_t private_segment_size;
+  /* Group segment size.  */
+  uint32_t group_segment_size;
+  /* Number of children kernel dispatches.  */
+  uint64_t kernel_dispatch_count;
+  /* Debug purpose argument.  */
+  uint64_t debug;
+  /* Levels-var ICV.  */
+  uint64_t omp_level;
+  /* Kernel dispatch structures created for children kernel dispatches.  */
+  struct GOMP_hsa_kernel_dispatch **children_dispatches;
+  /* Number of threads.  */
+  uint32_t omp_num_threads;
+};
+
+/* Structure of the default kernargs segment, supporting gomp_print_*.
+   This will only be used if the requested space is less than 9 bytes.  */
+
+struct kernargs {
+  /* Leave space for the real kernel arguments.
+     OpenACC and OpenMP only use one pointer.  */
+  int64_t dummy1;
+  int64_t dummy2;
+
+  /* A pointer to struct output, below, for console output data.  */
+  int64_t out_ptr;
+
+  /* A pointer to struct heap, below.  */
+  int64_t heap_ptr;
+
+  /* Output data: a fixed array of 1024 message records.  */
+  struct output {
+    int return_value;
+    unsigned int next_output;  /* NOTE(review): presumably next free slot.  */
+    struct printf_data {
+      int written;  /* NOTE(review): presumably a "record ready" flag.  */
+      char msg[128];  /* Message text; at most 128 bytes.  */
+      int type;  /* NOTE(review): presumably selects the union member.  */
+      union {
+	int64_t ivalue;
+	double dvalue;
+	char text[128];
+      };
+    } queue[1024];
+    unsigned int consumed;  /* NOTE(review): presumably records read so far.  */
+  } output_data;
+};
+
+/* Heap space, allocated target-side, provided for use of newlib malloc.
+   Each module should have its own heap allocated.
+   Beware that heap usage increases with OpenMP teams.  */
+static size_t gcn_kernel_heap_size = 100*1024*1024;  /* 100MB.  */
+struct heap {
+  int64_t size;  /* NOTE(review): presumably the byte size of DATA.  */
+  char data[0];  /* Zero-length array (GNU C): a header-only struct.  */
+};
+
+/* GCN specific definition of asynchronous queues.  */
+
+#define ASYNC_QUEUE_SIZE 64
+#define DRAIN_QUEUE_SYNCHRONOUS_P false
+#define DEBUG_QUEUES 0
+#define DEBUG_THREAD_SLEEP 0
+#define DEBUG_THREAD_SIGNAL 0
+
+struct kernel_launch
+{
+  struct kernel_info *kernel;  /* The kernel this launch refers to.  */
+  void *vars;  /* NOTE(review): presumably the kernel's argument block.  */
+  struct GOMP_kernel_launch_attributes kla;  /* Grid/work-group sizes.  */
+};
+
+struct callback
+{
+  void (*fn)(void *);
+  void *data;
+};
+
+struct queue_entry
+{
+  int type;
+  union {
+    struct kernel_launch launch;
+    struct callback callback;
+  } u;
+};
+
+/* An asynchronous queue: a fixed-size circular buffer of entries together
+   with the thread and synchronization objects used to process it.  */
+
+struct goacc_asyncqueue
+{
+  /* The agent this queue belongs to.  */
+  struct agent_info *agent;
+  hsa_queue_t *hsa_queue;
+
+  /* Thread joined by finalize_async_thread.  */
+  pthread_t thread_drain_queue;
+  /* Held while the queue fields below are read or modified.  */
+  pthread_mutex_t mutex;
+  /* Signalled when an entry is pushed or when a stop is requested.  */
+  pthread_cond_t queue_cond_in;
+  /* Waited on by finalize_async_thread until the stop is acknowledged.  */
+  pthread_cond_t queue_cond_out;
+  /* Circular buffer: the QUEUE_N pending entries start at index QUEUE_FIRST
+     and wrap around modulo ASYNC_QUEUE_SIZE.  */
+  struct queue_entry queue[ASYNC_QUEUE_SIZE];
+  int queue_first;
+  int queue_n;
+  /* Shutdown state: finalize_async_thread sets this to 1 to request a stop
+     and waits for it to become 2.  NOTE(review): presumably the drain thread
+     stores 2 just before it exits -- confirm against the thread function.  */
+  int drain_queue_stop;
+
+  /* Identifier of this queue, used in diagnostics, and links to the
+     neighbouring queues in a doubly linked list.  */
+  int id;
+  struct goacc_asyncqueue *prev;
+  struct goacc_asyncqueue *next;
+};
+
+/* Part of the libgomp plugin interface.  Return the name of the accelerator,
+   which is "gcn".  The result points to a string literal, so the caller must
+   neither modify nor free it.  */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+  return "gcn";
+}
+
+/* Part of the libgomp plugin interface.  Return the capability flags of the
+   GCN accelerator: OpenMP 4.0 and OpenACC 2.0 offloading.  */
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+  unsigned int caps = GOMP_OFFLOAD_CAP_OPENMP_400;
+
+  caps |= GOMP_OFFLOAD_CAP_OPENACC_200;
+
+  /* FIXME: Enable shared memory for APU, but not discrete GPU.
+     caps |= GOMP_OFFLOAD_CAP_SHARED_MEM;  */
+
+  return caps;
+}
+
+/* Part of the libgomp plugin interface.  Identify as a GCN accelerator by
+   returning OFFLOAD_TARGET_TYPE_GCN.  */
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+  return OFFLOAD_TARGET_TYPE_GCN;
+}
+
+/* Part of the libgomp plugin interface.  Return the libgomp version number
+   we're compatible with, which is GOMP_VERSION.  There is no requirement for
+   cross-version compatibility.  */
+
+unsigned
+GOMP_OFFLOAD_version (void)
+{
+  return GOMP_VERSION;
+}
+
+/* Flag to decide whether print to stderr information about what is going on.
+   Set in init_debug depending on environment variables.  */
+
+static bool debug;
+
+/* Flag to decide if the runtime should suppress a possible fallback to host
+   execution.  */
+
+static bool suppress_host_fallback;
+
+/* Path of the HSA runtime shared library that is dlopened by this plug-in.
+   Taken from HSA_RUNTIME_LIB in the environment, else a built-in default.  */
+
+static const char *hsa_runtime_lib;
+
+/* Flag to decide if the runtime should support also CPU devices (can be
+   a simulator).  */
+
+static bool support_cpu_devices;
+
+/* Runtime dimension overrides.  Zero indicates default.  */
+
+static int override_x_dim = 0;
+static int override_z_dim = 0;
+
+/* Initialize the plugin's run-time settings from the environment:
+
+     GCN_DEBUG                   enable debug output (any value).
+     GCN_SUPPRESS_HOST_FALLBACK  make failure to open the runtime fatal.
+     HSA_RUNTIME_LIB             path of the HSA runtime library to dlopen.
+     GCN_SUPPORT_CPU_DEVICES     also accept CPU agents.
+     GCN_NUM_TEAMS / GCN_NUM_GANGS      override for the X dimension.
+     GCN_NUM_THREADS / GCN_NUM_WORKERS  override for the Z dimension.
+     GCN_HEAP_SIZE               size in bytes of the per-module device heap.
+
+   All lookups use secure_getenv.  For the dimension overrides the first
+   variable of each pair wins if both are set.  */
+
+static void
+init_environment_variables (void)
+{
+  debug = secure_getenv ("GCN_DEBUG") != NULL;
+
+  suppress_host_fallback
+    = secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK") != NULL;
+
+  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
+  if (hsa_runtime_lib == NULL)
+    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
+
+  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES") != NULL;
+
+  const char *x = secure_getenv ("GCN_NUM_TEAMS");
+  if (!x)
+    x = secure_getenv ("GCN_NUM_GANGS");
+  if (x)
+    override_x_dim = atoi (x);
+
+  const char *z = secure_getenv ("GCN_NUM_THREADS");
+  if (!z)
+    z = secure_getenv ("GCN_NUM_WORKERS");
+  if (z)
+    override_z_dim = atoi (z);
+
+  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
+  if (heap)
+    {
+      /* Keep the parsed value signed until it has been validated: storing a
+	 negative number straight into a size_t would turn it into an
+	 enormous heap request.  Zero, negative and unparsable values leave
+	 the default untouched.  */
+      long tmp = atol (heap);
+      if (tmp > 0)
+	gcn_kernel_heap_size = tmp;
+    }
+}
+
+/* Print a message to stderr if the debug flag is set (GCN_DEBUG in the
+   environment, see init_environment_variables).  The arguments are those of
+   printf.  The expansion deliberately has no trailing semicolon, so that
+   "HSA_DPRINT (...);" is exactly one statement and may be used as the body
+   of an unbraced if/else.  */
+
+#define HSA_DPRINT(...) \
+  do \
+  { \
+    if (debug) \
+      { \
+	fprintf (stderr, __VA_ARGS__); \
+      } \
+  } \
+  while (false)
+
+/* Flush stderr if GCN_DEBUG value is set to true.  */
+
+#define HSA_FLUSH()				\
+  do {						\
+    if (debug)					\
+      fflush (stderr);				\
+  } while (0)
+
+/* Print a logging message with PREFIX to stderr, and flush it, if the debug
+   flag (GCN_DEBUG in the environment) is set.  */
+
+#define HSA_LOG(prefix, ...)			\
+  do						\
+    {						\
+      HSA_DPRINT (prefix);			\
+      HSA_DPRINT (__VA_ARGS__);			\
+      HSA_FLUSH ();				\
+    } while (false)
+
+/* Print a debugging message to stderr.  */
+
+#define HSA_DEBUG(...) HSA_LOG ("GCN debug: ", __VA_ARGS__)
+
+/* Print a warning message to stderr.  */
+
+#define HSA_WARNING(...) HSA_LOG ("GCN warning: ", __VA_ARGS__)
+
+/* Print HSA warning STR with the runtime's description of the HSA STATUS
+   code.  Does nothing unless debugging output is enabled.  */
+
+static void
+hsa_warn (const char *str, hsa_status_t status)
+{
+  if (!debug)
+    return;
+
+  /* hsa_status_string leaves its output untouched when it does not recognize
+     STATUS, so start with a printable fallback rather than an uninitialized
+     pointer.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+
+  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
+	   hsa_error_msg);
+}
+
+/* Report a fatal error STR together with the HSA error corresponding to STATUS
+   and terminate execution of the current process.  */
+
+static void
+hsa_fatal (const char *str, hsa_status_t status)
+{
+  /* hsa_status_string leaves its output untouched when it does not recognize
+     STATUS, so start with a printable fallback rather than an uninitialized
+     pointer.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+}
+
+/* Like hsa_fatal, except only report error message, and return FALSE
+   for propagating error processing to outside of plugin.  The return value
+   is always FALSE so that callers can write "return hsa_error (...)".  */
+
+static bool
+hsa_error (const char *str, hsa_status_t status)
+{
+  /* hsa_status_string leaves its output untouched when it does not recognize
+     STATUS, so start with a printable fallback rather than an uninitialized
+     pointer.  */
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+  return false;
+}
+
+/* Description of one kernel within an image; gcn_image_desc.kernel_infos
+   points to these.  NOTE(review): the fields presumably correspond to the
+   kernel_info fields of similar name -- confirm where the image is loaded.  */
+
+struct hsa_kernel_description
+{
+  /* Name of the kernel.  */
+  const char *name;
+  unsigned omp_data_size;
+  bool gridified_kernel_p;
+  /* Number of entries in KERNEL_DEPENDENCIES.  */
+  unsigned kernel_dependencies_count;
+  const char **kernel_dependencies;
+  int oacc_dims[3];  /* Only present for GCN kernels.  */
+};
+
+/* Name and address of one global variable; gcn_image_desc.global_variables
+   points to these.  */
+
+struct global_var_info
+{
+  const char *name;
+  void *address;
+};
+
+/* Data passed by the static initializer of a compilation unit containing GCN
+   object code to GOMP_offload_register.  */
+
+struct gcn_image_desc
+{
+  /* The code object itself.  The anonymous union currently has a single
+     member.  */
+  union {
+    struct gcn_image {
+      char magic[4];  /* Will be "GCN" for GCN code objects.  */
+      /* NOTE(review): presumably the size in bytes of the data at IMAGE --
+	 confirm against the loader.  */
+      size_t size;
+      void *image;
+    } *gcn_image;
+  };
+  /* Number of elements in KERNEL_INFOS.  */
+  const unsigned kernel_count;
+  struct hsa_kernel_description *kernel_infos;
+  /* Number of elements in GLOBAL_VARIABLES.  */
+  const unsigned global_variable_count;
+  struct global_var_info *global_variables;
+};
+
+struct agent_info;
+
+/* Information required to identify, finalize and run any given kernel.  */
+
+struct kernel_info
+{
+  /* Name of the kernel, required to locate it within the GCN object-code
+     module.  */
+  const char *name;
+  /* Size of memory space for OMP data.  */
+  unsigned omp_data_size;
+  /* The specific agent the kernel has been or will be finalized for and run
+     on.  */
+  struct agent_info *agent;
+  /* The specific module where the kernel takes place.  */
+  struct module_info *module;
+  /* Mutex enforcing that at most one thread ever initializes a kernel for
+     use.  A thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t init_mutex;
+  /* Flag indicating whether the kernel has been initialized and all fields
+     below it contain valid data.  */
+  bool initialized;
+  /* Flag indicating that the kernel has a problem that blocks an execution.  */
+  bool initialization_failed;
+  /* The object to be put into the dispatch queue.  */
+  uint64_t object;
+  /* Required size of kernel arguments.  */
+  uint32_t kernarg_segment_size;
+  /* Required size of group segment.  */
+  uint32_t group_segment_size;
+  /* Required size of private segment.  */
+  uint32_t private_segment_size;
+  /* List of all kernel dependencies.  */
+  const char **dependencies;
+  /* Number of dependencies.  */
+  unsigned dependencies_count;
+  /* Maximum OMP data size necessary for kernel from kernel dispatches.  */
+  unsigned max_omp_data_size;
+  /* True if the kernel is gridified.  */
+  bool gridified_kernel_p;
+};
+
+/* Information about a particular GCN module, its image and kernels.  The
+   structure ends in a flexible array member, so it must be allocated with
+   room for KERNEL_COUNT kernel_info elements.  */
+
+struct module_info
+{
+  /* The description with which the program has registered the image.  */
+  struct gcn_image_desc *image_desc;
+  /* GCN heap allocation.  */
+  struct heap *heap;
+  /* Physical boundaries of the loaded module.  */
+  Elf64_Addr phys_address_start;
+  Elf64_Addr phys_address_end;
+
+  /* NOTE(review): presumably records whether INIT_ARRAY_FUNC has already
+     been run for this module, with FINI_ARRAY_FUNC as its counterpart at
+     unload time -- confirm against the code that uses these fields.  */
+  bool constructors_run_p;
+  struct kernel_info *init_array_func, *fini_array_func;
+
+  /* Number of kernels in this module.  */
+  int kernel_count;
+  /* An array of kernel_info structures describing each kernel in this
+     module.  */
+  struct kernel_info kernels[];
+};
+
+/* Description of an HSA GPU agent and the program associated with it.  */
+
+struct agent_info
+{
+  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
+  hsa_agent_t id;
+  /* The user-visible device number.  */
+  int device_id;
+  /* Whether the agent has been initialized.  The fields below are usable only
+     if it has been.  */
+  bool initialized;
+  /* Precomputed check for problem architectures.  */
+  bool gfx900_p;
+
+  /* Command queues of the agent.  */
+  hsa_queue_t *sync_queue;
+  struct goacc_asyncqueue *async_queues, *omp_async_queue;
+  pthread_mutex_t async_queues_mutex;
+
+  /* The HSA memory region from which to allocate kernel arguments.  */
+  hsa_region_t kernarg_region;
+
+  /* Read-write lock that protects kernels which are running or about to be run
+     from interference with loading and unloading of images.  Needs to be
+     locked for reading while a kernel is being run, and for writing if the
+     list of modules is manipulated (and thus the HSA program invalidated).  */
+  pthread_rwlock_t module_rwlock;
+
+  /* The module associated with this agent.  NOTE(review): the comment used to
+     say "kernel"; the field lives in agent_info, so "agent" is presumably
+     what was meant -- confirm.  */
+  struct module_info *module;
+
+  /* Mutex enforcing that only one thread will finalize the HSA program.  A
+     thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t prog_mutex;
+  /* Flag whether the HSA program that consists of all the modules has been
+     finalized.  */
+  bool prog_finalized;
+  /* HSA executable - the finalized program that is used to locate kernels.  */
+  hsa_executable_t executable;
+};
+
+static bool create_and_finalize_hsa_program (struct agent_info *);
+
+/* Information about the whole HSA environment and all of its agents.  */
+
+struct hsa_context_info
+{
+  /* Whether the structure has been initialized.  */
+  bool initialized;
+  /* Number of usable GPU HSA agents in the system.  */
+  int agent_count;
+  /* Array of agent_info structures describing the individual HSA agents.  */
+  struct agent_info *agents;
+};
+
+/* The single instance of hsa_context_info, set up by init_hsa_context.  */
+
+static struct hsa_context_info hsa_context;
+
+/* Open the HSA runtime library named by hsa_runtime_lib and look up every
+   entry point the plugin uses, storing each one in the hsa_fns member of the
+   same name with an "_fn" suffix.  Return TRUE if the library could be opened
+   and all symbols were found, FALSE otherwise.  */
+
+static bool
+init_hsa_runtime_functions (void)
+{
+  /* Look up FUNCTION in the library and store it in hsa_fns.  Note that this
+     expands to two statements and returns from the enclosing function on
+     failure; the library handle is not closed in that case.  */
+#define DLSYM_FN(function) \
+  hsa_fns.function##_fn = dlsym (handle, #function); \
+  if (hsa_fns.function##_fn == NULL) \
+    return false;
+  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
+  if (handle == NULL)
+    return false;
+
+  DLSYM_FN (hsa_status_string)
+  DLSYM_FN (hsa_system_get_info)
+  DLSYM_FN (hsa_agent_get_info)
+  DLSYM_FN (hsa_init)
+  DLSYM_FN (hsa_iterate_agents)
+  DLSYM_FN (hsa_region_get_info)
+  DLSYM_FN (hsa_queue_create)
+  DLSYM_FN (hsa_agent_iterate_regions)
+  DLSYM_FN (hsa_executable_destroy)
+  DLSYM_FN (hsa_executable_create)
+  DLSYM_FN (hsa_executable_global_variable_define)
+  DLSYM_FN (hsa_executable_load_code_object)
+  DLSYM_FN (hsa_executable_freeze)
+  DLSYM_FN (hsa_signal_create)
+  DLSYM_FN (hsa_memory_allocate)
+  DLSYM_FN (hsa_memory_copy)
+  DLSYM_FN (hsa_memory_free)
+  DLSYM_FN (hsa_signal_destroy)
+  DLSYM_FN (hsa_executable_get_symbol)
+  DLSYM_FN (hsa_executable_symbol_get_info)
+  DLSYM_FN (hsa_executable_iterate_symbols)
+  DLSYM_FN (hsa_queue_add_write_index_release)
+  DLSYM_FN (hsa_queue_load_read_index_acquire)
+  DLSYM_FN (hsa_signal_wait_acquire)
+  DLSYM_FN (hsa_signal_store_relaxed)
+  DLSYM_FN (hsa_signal_store_release)
+  DLSYM_FN (hsa_signal_load_acquire)
+  DLSYM_FN (hsa_queue_destroy)
+  DLSYM_FN (hsa_code_object_deserialize)
+  return true;
+#undef DLSYM_FN
+}
+
+/* Debugging aid: print the endianness of the HSA system and whether the
+   IMAGES extension is available.  A query that fails is reported as
+   "FAILED".  */
+
+static void
+dump_hsa_system_info (void)
+{
+  hsa_endianness_t byte_order;
+  hsa_status_t rc
+    = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
+				      &byte_order);
+  if (rc != HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
+  else if (byte_order == HSA_ENDIANNESS_LITTLE)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
+  else if (byte_order == HSA_ENDIANNESS_BIG)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
+  else
+    HSA_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
+
+  uint8_t ext_bits[128];
+  rc = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
+				       &ext_bits);
+  if (rc != HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+  else if (ext_bits[0] & (1 << HSA_EXTENSION_IMAGES))
+    HSA_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
+}
+
+/* Debugging aid: print MACHINE_MODEL in symbolic form, labelled with S.  */
+
+static void
+dump_machine_model (hsa_machine_model_t machine_model, const char *s)
+{
+  if (machine_model == HSA_MACHINE_MODEL_SMALL)
+    HSA_DEBUG ("%s: SMALL\n", s);
+  else if (machine_model == HSA_MACHINE_MODEL_LARGE)
+    HSA_DEBUG ("%s: LARGE\n", s);
+  else
+    HSA_DEBUG ("%s: UNKNOWN\n", s);
+}
+
+/* Debugging aid: print PROFILE in symbolic form, labelled with S.  */
+
+static void
+dump_profile (hsa_profile_t profile, const char *s)
+{
+  if (profile == HSA_PROFILE_FULL)
+    HSA_DEBUG ("%s: FULL\n", s);
+  else if (profile == HSA_PROFILE_BASE)
+    HSA_DEBUG ("%s: BASE\n", s);
+  else
+    HSA_DEBUG ("%s: UNKNOWN\n", s);
+}
+
+static void dump_hsa_regions (hsa_agent_t agent);
+
+/* Callback of hsa_iterate_agents used for debugging output: print the name,
+   vendor, machine model, profile, device type and size limits of AGENT,
+   followed by a dump of its memory regions.  A query that fails is reported
+   as "FAILED".  DATA is unused.  Always return HSA_STATUS_SUCCESS so that the
+   iteration continues with the next agent.  */
+
+static hsa_status_t
+dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_NAME: FAILED\n");
+
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
+
+  hsa_machine_model_t machine_model;
+  status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
+				     &machine_model);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
+
+  hsa_profile_t profile;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
+					  &profile);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_PROFILE: FAILED\n");
+
+  hsa_device_type_t device_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+					  &device_type);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      switch (device_type)
+	{
+	case HSA_DEVICE_TYPE_CPU:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_GPU:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_DSP:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
+	  break;
+	default:
+	  HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
+	  break;
+	}
+    }
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_DEVICE: FAILED\n");
+
+  uint32_t size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
+					  &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
+
+  /* The HSA runtime specifies this attribute as uint16_t[3], one limit per
+     work-group dimension.  A single uint32_t is too small to receive it, so
+     the runtime would write past the end of the variable.  */
+  uint16_t max_dim[3];
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+					  max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u, %u, %u\n",
+	       (unsigned) max_dim[0], (unsigned) max_dim[1],
+	       (unsigned) max_dim[2]);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
+
+  uint32_t max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
+					  &max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
+
+  /* The HSA runtime specifies this attribute as hsa_dim3_t, i.e. three
+     consecutive uint32_t values (x, y, z).  An array of three uint32_t has
+     the same layout and is large enough to receive it.  */
+  uint32_t grid_max_dim[3];
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
+					  grid_max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u, %u, %u\n", grid_max_dim[0],
+	       grid_max_dim[1], grid_max_dim[2]);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
+
+  uint32_t grid_max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
+					  &grid_max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
+  else
+    HSA_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
+
+  dump_hsa_regions (agent);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Return true if AGENT can be used by this plugin: it must be a GPU (or a
+   CPU when support_cpu_devices is set), support kernel dispatch, and accept
+   concurrent submissions from different threads.  Any failing query makes
+   the agent unsuitable.  */
+
+static bool
+suitable_hsa_agent_p (hsa_agent_t agent)
+{
+  hsa_status_t rc;
+
+  hsa_device_type_t kind;
+  rc = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE, &kind);
+  if (rc != HSA_STATUS_SUCCESS)
+    return false;
+
+  if (kind == HSA_DEVICE_TYPE_CPU)
+    {
+      if (!support_cpu_devices)
+	return false;
+    }
+  else if (kind != HSA_DEVICE_TYPE_GPU)
+    return false;
+
+  uint32_t feature_bits = 0;
+  rc = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
+				      &feature_bits);
+  if (rc != HSA_STATUS_SUCCESS)
+    return false;
+  if ((feature_bits & HSA_AGENT_FEATURE_KERNEL_DISPATCH) == 0)
+    return false;
+
+  hsa_queue_type_t qtype;
+  rc = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
+				      &qtype);
+  if (rc != HSA_STATUS_SUCCESS)
+    return false;
+
+  return qtype == HSA_QUEUE_TYPE_MULTI;
+}
+
+/* Callback of hsa_iterate_agents.  Add one to agent_count in hsa_context for
+   every AGENT accepted by suitable_hsa_agent_p.  DATA is unused.  Always
+   return HSA_STATUS_SUCCESS so that all agents are visited.  */
+
+static hsa_status_t
+count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
+{
+  bool usable = suitable_hsa_agent_p (agent);
+
+  if (usable)
+    hsa_context.agent_count += 1;
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
+   id to the describing structure in the hsa context.  The index of the
+   structure is pointed to by DATA, increment it afterwards.  The index is
+   advanced even when it is already out of range, so that the caller can
+   detect the mismatch by comparing it with hsa_context.agent_count.  Always
+   return HSA_STATUS_SUCCESS so that all agents are visited.  */
+
+static hsa_status_t
+assign_agent_ids (hsa_agent_t agent, void *data)
+{
+  if (suitable_hsa_agent_p (agent))
+    {
+      int *agent_index = (int *) data;
+      /* The array was sized by an earlier counting pass.  Should this pass
+	 see more suitable agents than that one did, do not write past the
+	 end of the array.  */
+      if (*agent_index < hsa_context.agent_count)
+	hsa_context.agents[*agent_index].id = agent;
+      ++*agent_index;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Ask the drain thread of AQ to stop, wait until the queue reaches the
+   stopped state (drain_queue_stop == 2) and join the thread.  Return
+   immediately, without joining, if the queue is already in that state.
+   Terminate the process via GOMP_PLUGIN_fatal if the join fails.  */
+
+static void
+finalize_async_thread (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+  /* Already stopped; nothing to do.  */
+  if (aq->drain_queue_stop == 2)
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      return;
+    }
+
+  /* Request the stop ...  */
+  aq->drain_queue_stop = 1;
+
+  /* ... and wake whoever is waiting on the input condition so the request is
+     noticed.  */
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("Signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  /* Wait for the acknowledgement.  The state is rechecked after every
+     wake-up because pthread_cond_wait may return spuriously.  */
+  while (aq->drain_queue_stop != 2)
+    {
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
+		   " to sleep\n", aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
+		   aq->agent->device_id, aq->id);
+    }
+
+  HSA_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
+	     aq->id);
+  pthread_mutex_unlock (&aq->mutex);
+
+  /* The mutex has been released above, before joining.  */
+  int err = pthread_join (aq->thread_drain_queue, NULL);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
+		       aq->agent->device_id, aq->id, strerror (err));
+  HSA_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
+}
+
+/* Initialize hsa_context if it has not already been done: read the
+   environment, load and initialize the HSA runtime, then count the suitable
+   agents and record their ids.  Return TRUE on success.  On failure return
+   FALSE with hsa_context left uninitialized, so that a later call starts
+   again from scratch.  If the runtime cannot be loaded and host fallback has
+   been suppressed, terminate the process instead.  */
+
+static bool
+init_hsa_context (void)
+{
+  hsa_status_t status;
+  int agent_index = 0;
+
+  if (hsa_context.initialized)
+    return true;
+  init_environment_variables ();
+  if (!init_hsa_runtime_functions ())
+    {
+      HSA_DEBUG ("Run-time could not be dynamically opened\n");
+      if (suppress_host_fallback)
+	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+      return false;
+    }
+  status = hsa_fns.hsa_init_fn ();
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Run-time could not be initialized", status);
+  HSA_DEBUG ("HSA run-time initialized for GCN\n");
+
+  if (debug)
+    dump_hsa_system_info ();
+
+  /* count_gpu_agents only ever increments the counter, so clear whatever an
+     earlier, failed attempt may have left behind.  */
+  hsa_context.agent_count = 0;
+  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("GCN GPU devices could not be enumerated", status);
+  HSA_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
+
+  hsa_context.agents
+    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
+				  * sizeof (struct agent_info));
+  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
+  if (status != HSA_STATUS_SUCCESS
+      || agent_index != hsa_context.agent_count)
+    {
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_error ("GCN GPU devices could not be enumerated", status);
+      else
+	GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
+      /* Do not leak the array; a retry allocates a fresh one.  */
+      free (hsa_context.agents);
+      hsa_context.agents = NULL;
+      return false;
+    }
+
+  if (debug)
+    {
+      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
+      if (status != HSA_STATUS_SUCCESS)
+	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
+    }
+
+  hsa_context.initialized = true;
+  return true;
+}
+
+/* Verify that hsa_context has already been initialized and return the
+   agent_info structure describing device number N.  Return NULL, after
+   reporting the problem with GOMP_PLUGIN_error, if the context is not
+   initialized, if N is not a valid device number, or if that agent has not
+   been initialized.  */
+
+static struct agent_info *
+get_agent_info (int n)
+{
+  if (!hsa_context.initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
+      return NULL;
+    }
+  /* N is signed, so a negative value must be rejected as well; it would
+     otherwise index before the start of the agents array.  */
+  if (n < 0 || n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
+      return NULL;
+    }
+  if (!hsa_context.agents[n].initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
+      return NULL;
+    }
+  return &hsa_context.agents[n];
+}
+
+/* Callback of dispatch queues to report errors.  STATUS is passed on to
+   hsa_fatal, which terminates the process; QUEUE and DATA are unused.  */
+
+static void
+queue_callback (hsa_status_t status,
+		hsa_queue_t *queue __attribute__ ((unused)),
+		void *data __attribute__ ((unused)))
+{
+  hsa_fatal ("Asynchronous queue error", status);
+}
+
+/* Callback of hsa_agent_iterate_regions used for debugging output: print the
+   segment, global flags, size and allocation properties of REGION.  A query
+   that fails is reported as "FAILED".  DATA is unused.  Always return
+   HSA_STATUS_SUCCESS so that the iteration continues with the next
+   region.  */
+
+static hsa_status_t
+dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  hsa_region_segment_t segment;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+					   &segment);
+  /* SEGMENT holds a value only if the query succeeded; remember that here
+     because STATUS is reused by the queries below.  */
+  bool global_p = (status == HSA_STATUS_SUCCESS
+		   && segment == HSA_REGION_SEGMENT_GLOBAL);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (segment == HSA_REGION_SEGMENT_GLOBAL)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
+      else if (segment == HSA_REGION_SEGMENT_READONLY)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
+      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
+      else if (segment == HSA_REGION_SEGMENT_GROUP)
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
+      else
+	HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
+    }
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SEGMENT: FAILED\n");
+
+  /* The global flags are only queried for regions in the global segment.  */
+  if (global_p)
+    {
+      uint32_t flags;
+      status
+	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+					  &flags);
+      if (status == HSA_STATUS_SUCCESS)
+	{
+	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
+	    HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
+	}
+      else
+	HSA_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
+    }
+
+  size_t size;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_SIZE: FAILED\n");
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
+
+  bool alloc_allowed;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
+				      &alloc_allowed);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
+
+  /* Granule and alignment are only printed for regions that allow runtime
+     allocation.  */
+  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
+    return HSA_STATUS_SUCCESS;
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
+
+  size_t align;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
+				      &align);
+  if (status == HSA_STATUS_SUCCESS)
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
+  else
+    HSA_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Debugging aid: run dump_hsa_region on every memory region of AGENT and
+   report an error if the iteration does not complete successfully.  */
+
+static void
+dump_hsa_regions (hsa_agent_t agent)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_agent_iterate_regions_fn (agent, dump_hsa_region, NULL);
+
+  if (rc == HSA_STATUS_SUCCESS)
+    return;
+
+  hsa_error ("Dumping hsa regions failed", rc);
+}
+
+/* Return a newly allocated, NUL-terminated copy of the name of SYMBOL, to be
+   released with free.  Return NULL, after reporting the error, if either the
+   length or the name itself cannot be obtained.  */
+
+static char *
+get_executable_symbol_name (hsa_executable_symbol_t symbol)
+{
+  uint32_t name_len;
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_symbol_get_info_fn
+	(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_len);
+  if (rc != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get length of symbol name", rc);
+      return NULL;
+    }
+
+  /* One extra byte for the terminator appended below.  */
+  char *name = GOMP_PLUGIN_malloc (name_len + 1);
+
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+	 (symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
+  if (rc == HSA_STATUS_SUCCESS)
+    {
+      name[name_len] = '\0';
+      return name;
+    }
+
+  hsa_error ("Could not get symbol name", rc);
+  free (name);
+  return NULL;
+}
+
+/* Helper function for dump_executable_symbols: print the name of SYMBOL as
+   a debug message.  EXECUTABLE and DATA are not used.  Always return
+   HSA_STATUS_SUCCESS, even if the name could not be obtained, so that the
+   iteration continues.  */
+
+static hsa_status_t
+dump_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t symbol,
+			void *data __attribute__((unused)))
+{
+  char *sym_name = get_executable_symbol_name (symbol);
+
+  if (sym_name == NULL)
+    return HSA_STATUS_SUCCESS;
+
+  HSA_DEBUG ("executable symbol: %s\n", sym_name);
+  free (sym_name);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Dump all global symbols in EXECUTABLE.  Terminate the process if the
+   iteration over the symbols fails.  */
+
+static void
+dump_executable_symbols (hsa_executable_t executable)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 dump_executable_symbol,
+						 NULL);
+
+  if (rc == HSA_STATUS_SUCCESS)
+    return;
+
+  hsa_fatal ("Could not dump HSA executable symbols", rc);
+}
+
+/* Helper function for find_executable_symbol: store SYMBOL in the
+   hsa_executable_symbol_t that DATA points to and stop the iteration by
+   returning HSA_STATUS_INFO_BREAK.  EXECUTABLE is not used.  */
+
+static hsa_status_t
+find_executable_symbol_1 (hsa_executable_t executable,
+			  hsa_executable_symbol_t symbol,
+			  void *data)
+{
+  *(hsa_executable_symbol_t *) data = symbol;
+
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  The
+   symbol saved is the first one the iteration visits.  If the iteration ends
+   without having been broken off, report an error and return false.  */
+
+static bool
+find_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t *symbol)
+{
+  hsa_status_t rc
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 find_executable_symbol_1,
+						 symbol);
+
+  /* The helper breaks off the iteration as soon as it has seen a symbol.  */
+  if (rc == HSA_STATUS_INFO_BREAK)
+    return true;
+
+  hsa_error ("Could not find executable symbol", rc);
+  return false;
+}
+
+/* Callback of hsa_agent_iterate_regions.  If REGION lies in the global
+   segment and has the KERNARG flag set, store it in the hsa_region_t that
+   DATA points to and break off the iteration.  Otherwise return
+   HSA_STATUS_SUCCESS to continue with the next region.  A failing query ends
+   the iteration with that error status.  */
+
+static hsa_status_t
+get_kernarg_memory_region (hsa_region_t region, void *data)
+{
+  hsa_region_segment_t seg;
+  hsa_status_t rc
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT, &seg);
+  if (rc != HSA_STATUS_SUCCESS)
+    return rc;
+  if (seg != HSA_REGION_SEGMENT_GLOBAL)
+    return HSA_STATUS_SUCCESS;
+
+  uint32_t region_flags;
+  rc = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+				       &region_flags);
+  if (rc != HSA_STATUS_SUCCESS)
+    return rc;
+  if ((region_flags & HSA_REGION_GLOBAL_FLAG_KERNARG) == 0)
+    return HSA_STATUS_SUCCESS;
+
+  *(hsa_region_t *) data = region;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Part of the libgomp plugin interface.  Return the number of usable devices
+   on the system, initializing the HSA context first if necessary.  Return
+   zero if that initialization fails.  */
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+  return init_hsa_context () ? hsa_context.agent_count : 0;
+}
+
+/* Part of the libgomp plugin interface.  Return property PROP of device
+   number DEVICE.
+
+   The memory properties yield the size of the agent's kernarg region in
+   .val; the amount of free memory is not known, so the same figure is
+   reported for it.  The name, vendor and driver properties yield a string in
+   .ptr.  The name and vendor strings live in static buffers that are
+   overwritten by the next query of the same property; the caller must not
+   free them.
+
+   A zero value is returned if DEVICE is not usable (get_agent_info has then
+   reported the error), if PROP is not handled here, or if the HSA query
+   fails.  */
+
+union gomp_device_property_value
+GOMP_OFFLOAD_get_property (int device, int prop)
+{
+  union gomp_device_property_value propval = { .val = 0 };
+
+  struct agent_info *agent = get_agent_info (device);
+  if (agent == NULL)
+    return propval;
+
+  /* One buffer per string property, so that asking for the vendor does not
+     clobber a name returned earlier, and vice versa.  */
+  static char name_buf[64];
+  static char vendor_buf[64];
+  size_t size;
+  hsa_status_t status;
+
+  switch (prop)
+    {
+    case GOMP_DEVICE_PROPERTY_FREE_MEMORY:
+      /* Not known: fall through.  */
+    case GOMP_DEVICE_PROPERTY_MEMORY:
+      status = hsa_fns.hsa_region_get_info_fn (agent->kernarg_region,
+					       HSA_REGION_INFO_SIZE, &size);
+      /* SIZE is only written on success.  */
+      if (status == HSA_STATUS_SUCCESS)
+	propval.val = size;
+      break;
+    case GOMP_DEVICE_PROPERTY_NAME:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+					      name_buf);
+      if (status == HSA_STATUS_SUCCESS)
+	propval.ptr = name_buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_VENDOR:
+      status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+					      HSA_AGENT_INFO_VENDOR_NAME,
+					      vendor_buf);
+      if (status == HSA_STATUS_SUCCESS)
+	propval.ptr = vendor_buf;
+      break;
+    case GOMP_DEVICE_PROPERTY_DRIVER:
+      propval.ptr = "HSA Runtime";
+      break;
+    }
+
+  return propval;
+}
+
+/* Append an entry to the asynchronous queue AQ that launches KERNEL with
+   argument block VARS and launch attributes *KLA, then wake a waiter on the
+   queue's input condition.  *KLA is copied into the entry, so it need not
+   outlive the call.  KERNEL must belong to the same agent as AQ.  Terminate
+   the process if the queue is already full.  */
+
+static void
+queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
+		   void *vars, struct GOMP_kernel_launch_attributes *kla)
+{
+  assert (aq->agent == kernel->agent);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  /* queue_n is modified under the mutex (see below), so the overflow check
+     has to be made while holding it as well.  */
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    GOMP_PLUGIN_fatal ("ran out of async queue in thread %d:%d",
+		       aq->agent->device_id, aq->id);
+
+  /* The queue is a circular buffer; this is the first free slot.  */
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  /* Type 0 marks a kernel-launch entry.  */
+  aq->queue[queue_last].type = 0;
+  aq->queue[queue_last].u.launch.kernel = kernel;
+  aq->queue[queue_last].u.launch.vars = vars;
+  aq->queue[queue_last].u.launch.kla = *kla;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Append a callback entry that will invoke FN (DATA) to the async queue AQ,
+   then signal queue_cond_in.  Calls GOMP_PLUGIN_fatal if the queue is
+   already full.  */
+
+static void
+queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
+		     void *data)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  /* queue_n is modified by the draining thread under this mutex, so the
+     capacity check has to be made while holding it.  */
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    GOMP_PLUGIN_fatal ("Async thread %d:%d: error: queue overflowed",
+		       aq->agent->device_id, aq->id);
+
+  /* The queue is a ring buffer: the free slot follows the last used one.  */
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  /* Type 1 marks a callback entry (see execute_queue_entry).  */
+  aq->queue[queue_last].type = 1;
+  aq->queue[queue_last].u.callback.fn = fn;
+  aq->queue[queue_last].u.callback.data = data;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+static void run_kernel (struct kernel_info *kernel, void *vars,
+			struct GOMP_kernel_launch_attributes *kla,
+			struct goacc_asyncqueue *aq, bool module_locked);
+
+/* Run the element stored in slot INDEX of the async queue AQ.  Type 0
+   elements launch the recorded kernel, type 1 elements invoke the recorded
+   callback; anything else is a fatal error.  */
+
+static void
+execute_queue_entry (struct goacc_asyncqueue *aq, int index)
+{
+  struct queue_entry *item = &aq->queue[index];
+
+  switch (item->type)
+    {
+    case 0:
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      run_kernel (item->u.launch.kernel, item->u.launch.vars,
+		  &item->u.launch.kla, aq, false);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    case 1:
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      item->u.callback.fn (item->u.callback.data);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    default:
+      GOMP_PLUGIN_fatal ("Unknown queue element");
+    }
+}
+
+/* Thread entry point that consumes the async queue passed as THREAD_ARG
+   (a struct goacc_asyncqueue *).  Entries are executed one at a time, in
+   order, until drain_queue_stop becomes non-zero.  On exit the function
+   sets drain_queue_stop to 2 and broadcasts queue_cond_out.  Always
+   returns NULL.  */
+
+static void *
+drain_queue (void *thread_arg)
+{
+  struct goacc_asyncqueue *aq = thread_arg;
+
+  /* In synchronous mode this thread does no work: it marks itself as
+     finished straight away (see drain_queue_synchronous).  */
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      aq->drain_queue_stop = 2;
+      return NULL;
+    }
+
+  pthread_mutex_lock (&aq->mutex);
+
+  /* The mutex is held at the top of every iteration.  */
+  while (true)
+    {
+      if (aq->drain_queue_stop)
+	break;
+
+      if (aq->queue_n > 0)
+	{
+	  /* Run the entry without holding the mutex so that producers can
+	     keep pushing while it executes.  */
+	  pthread_mutex_unlock (&aq->mutex);
+	  execute_queue_entry (aq, aq->queue_first);
+
+	  /* Retire the entry and tell waiters on queue_cond_out that the
+	     queue has advanced.  */
+	  pthread_mutex_lock (&aq->mutex);
+	  aq->queue_first = ((aq->queue_first + 1)
+			     % ASYNC_QUEUE_SIZE);
+	  aq->queue_n--;
+
+	  if (DEBUG_THREAD_SIGNAL)
+	    HSA_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_broadcast (&aq->queue_cond_out);
+	  pthread_mutex_unlock (&aq->mutex);
+
+	  if (DEBUG_QUEUES)
+	    HSA_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
+		       aq->id);
+	  /* Re-take the mutex for the next iteration's checks.  */
+	  pthread_mutex_lock (&aq->mutex);
+	}
+      else
+	{
+	  /* Nothing queued: sleep until a producer signals queue_cond_in.
+	     The loop re-checks the state after every wake-up.  */
+	  if (DEBUG_THREAD_SLEEP)
+	    HSA_DEBUG ("Async thread %d:%d: going to sleep\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
+	  if (DEBUG_THREAD_SLEEP)
+	    HSA_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
+		       aq->agent->device_id, aq->id);
+	}
+    }
+
+  /* Still holding the mutex here.  The value 2 is the same one stored in
+     the synchronous early exit above; presumably it tells whoever requested
+     the stop that the thread is done -- confirm against the caller.  */
+  aq->drain_queue_stop = 2;
+  if (DEBUG_THREAD_SIGNAL)
+    HSA_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_broadcast (&aq->queue_cond_out);
+  pthread_mutex_unlock (&aq->mutex);
+
+  HSA_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
+  return NULL;
+}
+
+/* Execute every entry currently in the async queue AQ on the calling
+   thread, in order, holding the queue mutex for the whole run.  */
+
+static void
+drain_queue_synchronous (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  /* Each iteration runs the head entry, advances the ring-buffer head and
+     then drops the element count.  */
+  for (; aq->queue_n > 0; aq->queue_n--)
+    {
+      execute_queue_entry (aq, aq->queue_first);
+      aq->queue_first = (aq->queue_first + 1) % ASYNC_QUEUE_SIZE;
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Part of the libgomp plugin interface.  Initialize agent number N so that it
+   can be used for computation.  Return TRUE on success, and also when the
+   agent has already been initialized.  On failure an error is reported and
+   FALSE is returned.  */
+
+bool
+GOMP_OFFLOAD_init_device (int n)
+{
+  if (!init_hsa_context ())
+    return false;
+  /* Reject negative device numbers as well as ones past the end; either
+     would index outside the agents array.  */
+  if (n < 0 || n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
+      return false;
+    }
+  struct agent_info *agent = &hsa_context.agents[n];
+
+  if (agent->initialized)
+    return true;
+
+  agent->device_id = n;
+
+  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->prog_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      return false;
+    }
+  agent->async_queues = NULL;
+  agent->omp_async_queue = NULL;
+
+  uint32_t queue_size;
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+					  &queue_size);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error requesting maximum queue size of the GCN agent",
+		      status);
+
+  /* Remember whether this is a gfx900 device; parse_target_attributes uses
+     the flag to pick a smaller default thread count.  */
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+					  buf);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error querying the name of the agent", status);
+  agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
+
+  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
+					HSA_QUEUE_TYPE_MULTI, queue_callback,
+					NULL, UINT32_MAX, UINT32_MAX,
+					&agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error creating command queue", status);
+
+  /* Use an all-ones handle as the "not found" marker.  Success is judged by
+     whether the iteration callback replaced it, not by the returned status.  */
+  agent->kernarg_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+						 get_kernarg_memory_region,
+						 &agent->kernarg_region);
+  if (agent->kernarg_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
+			 "arguments");
+      return false;
+    }
+  HSA_DEBUG ("Selected kernel arguments memory region:\n");
+  dump_hsa_region (agent->kernarg_region, NULL);
+
+  HSA_DEBUG ("GCN agent %d initialized\n", n);
+
+  agent->initialized = true;
+  return true;
+}
+
+/* Tear down AGENT's finalized HSA program, if there is one: destroy the
+   executable, mark every kernel of the loaded module as uninitialized,
+   release the module heap and clear agent->prog_finalized.  Return TRUE on
+   success (including when there was nothing to destroy).  */
+
+static bool
+destroy_hsa_program (struct agent_info *agent)
+{
+  if (!agent->prog_finalized)
+    return true;
+
+  HSA_DEBUG ("Destroying the current GCN program.\n");
+
+  hsa_status_t rc = hsa_fns.hsa_executable_destroy_fn (agent->executable);
+  if (rc != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not destroy GCN executable", rc);
+
+  struct module_info *mod = agent->module;
+  if (mod)
+    {
+      /* Kernels must be looked up again in the next executable.  */
+      int k = 0;
+      while (k < mod->kernel_count)
+	mod->kernels[k++].initialized = false;
+
+      if (mod->heap)
+	{
+	  hsa_fns.hsa_memory_free_fn (mod->heap);
+	  mod->heap = NULL;
+	}
+    }
+
+  agent->prog_finalized = false;
+  return true;
+}
+
+/* Fill in KERNEL from the kernel description D and record AGENT and MODULE
+   as its owners, then create its initialization mutex.  Return true on
+   success; report an error and return false if the mutex cannot be
+   created.  */
+
+static bool
+init_basic_kernel_info (struct kernel_info *kernel,
+			struct hsa_kernel_description *d,
+			struct agent_info *agent,
+			struct module_info *module)
+{
+  /* Ownership.  */
+  kernel->module = module;
+  kernel->agent = agent;
+
+  /* Fields copied straight from the description.  */
+  kernel->name = d->name;
+  kernel->omp_data_size = d->omp_data_size;
+  kernel->gridified_kernel_p = d->gridified_kernel_p;
+  kernel->dependencies = d->kernel_dependencies;
+  kernel->dependencies_count = d->kernel_dependencies_count;
+
+  if (pthread_mutex_init (&kernel->init_mutex, NULL) == 0)
+    return true;
+
+  GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
+  return false;
+}
+
+static void init_kernel (struct kernel_info *kernel);
+
+/* Part of the libgomp plugin interface.  Load GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA onto device ORD and
+   return references to kernel descriptors and global variables in
+   *TARGET_TABLE.  The "_init_array" and "_fini_array" kernels are recorded
+   in the module but are not entered in the table.  Returns the number of
+   table entries written, or -1 on error.  */
+
+int
+GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
+			 struct addr_pair **target_table)
+{
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return -1;
+    }
+
+  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
+  struct agent_info *agent;
+  struct addr_pair *pair;
+  struct module_info *module;
+  struct kernel_info *kernel;
+  int kernel_count = image_desc->kernel_count;
+  unsigned var_count = image_desc->global_variable_count;
+
+  /* We have the magic code for a native GCN ELF kernel, not something
+     else.  Check this before locking or allocating anything so that a
+     rejected image leaves no state behind.  */
+  if (strcmp (image_desc->gcn_image->magic, "GCN") != 0)
+    return -1;
+
+  agent = get_agent_info (ord);
+  if (!agent)
+    return -1;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return -1;
+    }
+  if (agent->prog_finalized
+      && !destroy_hsa_program (agent))
+    {
+      /* Do not leave the agent write-locked on failure.  */
+      pthread_rwlock_unlock (&agent->module_rwlock);
+      return -1;
+    }
+
+  HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
+  HSA_DEBUG ("Encountered %u global variables in an image\n", var_count);
+  /* Size the table for every kernel and variable.  The constructor and
+     destructor kernels are skipped below, so this is an upper bound that
+     cannot underflow when an image lacks them.  */
+  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count)
+			     * sizeof (struct addr_pair));
+  *target_table = pair;
+  module = (struct module_info *)
+    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
+				+ kernel_count * sizeof (struct kernel_info));
+  module->image_desc = image_desc;
+  module->kernel_count = kernel_count;
+  module->heap = NULL;
+  module->constructors_run_p = false;
+
+  kernel = &module->kernels[0];
+
+  /* Set up the per-kernel records and enter the ordinary kernels in the
+     address table.  */
+  for (int i = 0; i < kernel_count; i++)
+    {
+      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
+      if (!init_basic_kernel_info (kernel, d, agent, module))
+	{
+	  /* Release what was allocated above and drop the write lock.  */
+	  free (module);
+	  free (*target_table);
+	  *target_table = NULL;
+	  pthread_rwlock_unlock (&agent->module_rwlock);
+	  return -1;
+	}
+      if (strcmp (d->name, "_init_array") == 0)
+	module->init_array_func = kernel;
+      else if (strcmp (d->name, "_fini_array") == 0)
+	module->fini_array_func = kernel;
+      else
+	{
+	  pair->start = (uintptr_t) kernel;
+	  pair->end = (uintptr_t) (kernel + 1);
+	  pair++;
+	}
+      kernel++;
+    }
+
+  agent->module = module;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return -1;
+    }
+
+  if (!create_and_finalize_hsa_program (agent))
+    return -1;
+
+  /* Append the device address range of every global variable.  */
+  for (unsigned i = 0; i < var_count; i++)
+    {
+      struct global_var_info *v = &image_desc->global_variables[i];
+      HSA_DEBUG ("Looking for variable %s\n", v->name);
+
+      hsa_status_t status;
+      hsa_executable_symbol_t var_symbol;
+      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						     v->name, agent->id,
+						     0, &var_symbol);
+
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not find symbol for variable in the code object",
+		   status);
+
+      uint64_t var_addr;
+      uint32_t var_size;
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable from its symbol", status);
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable size from its symbol", status);
+
+      pair->start = var_addr;
+      pair->end = var_addr + var_size;
+      HSA_DEBUG ("Found variable %s at %p with size %u\n", v->name,
+		 (void *)var_addr, var_size);
+      pair++;
+    }
+
+  /* Ensure that constructors are run first.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->init_array_func)
+    {
+      init_kernel (module->init_array_func);
+      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
+    }
+  module->constructors_run_p = true;
+
+  /* Report exactly the entries written.  The skipped constructor and
+     destructor kernels must not be counted, or the caller would read past
+     the filled part of the table.  */
+  return (int) (pair - *target_table);
+}
+
+/* Find the load offset of the code loaded into AGENT's executable, save it
+   to *LOAD_OFFSET, and return true.  If not found, return false.
+
+   The offset is computed as the device address of one executable symbol
+   minus the value of the same-named symbol in the symbol table of the ELF
+   IMAGE.  SECTIONS is the image's section header table.  MODULE is
+   currently unused.  */
+
+static bool
+find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
+		  struct module_info *module, Elf64_Ehdr *image,
+		  Elf64_Shdr *sections)
+{
+  bool res = false;
+
+  hsa_status_t status;
+
+  hsa_executable_symbol_t symbol;
+  if (!find_executable_symbol (agent->executable, &symbol))
+    return false;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not extract symbol address", status);
+      return false;
+    }
+
+  char *symbol_name = get_executable_symbol_name (symbol);
+  if (symbol_name == NULL)
+    return false;
+
+  /* Find the kernel function in ELF, and calculate actual load offset.
+     Stop at the first match so that the symbol value is subtracted only
+     once, and skip tables with a zero entry size, which could never
+     advance.  */
+  for (int i = 0; i < image->e_shnum && !res; i++)
+    if (sections[i].sh_type == SHT_SYMTAB && sections[i].sh_entsize != 0)
+      {
+	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
+	char *strings = (char *)image + strtab->sh_offset;
+
+	for (size_t offset = 0;
+	     offset < sections[i].sh_size;
+	     offset += sections[i].sh_entsize)
+	  {
+	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+					  + sections[i].sh_offset
+					  + offset);
+	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
+	      {
+		*load_offset -= sym->st_value;
+		res = true;
+		break;
+	      }
+	  }
+      }
+
+  free (symbol_name);
+  return res;
+}
+
+/* Create and finalize the program consisting of all loaded modules.
+
+   Under AGENT's prog_mutex this creates an HSA executable, loads the
+   agent's module into it (with ELF relocation sections hidden from the
+   runtime loader), allocates the module heap, freezes the executable and
+   then applies the hidden relocations itself.  Returns true on success.
+   Note that agent->prog_finalized is set on every exit path, including
+   failures.  */
+
+static bool
+create_and_finalize_hsa_program (struct agent_info *agent)
+{
+  hsa_status_t status;
+  int reloc_count = 0;
+  bool res = true;
+  if (pthread_mutex_lock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+      return false;
+    }
+  if (agent->prog_finalized)
+    goto final;
+
+  status
+    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
+					HSA_EXECUTABLE_STATE_UNFROZEN,
+					"", &agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not create GCN executable", status);
+      goto fail;
+    }
+
+  struct obstack unmodified_sections_os;
+  obstack_init (&unmodified_sections_os);
+
+  /* Load any GCN modules.  */
+  struct module_info *module = agent->module;
+  if (module)
+    {
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+
+      /* Hide relocations from the HSA runtime loader.
+	 Keep a copy of the unmodified section headers to use later.  */
+      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+						  + image->e_shoff);
+      /* GOMP_PLUGIN_malloc is used here instead of an unchecked plain
+	 malloc; as elsewhere in this file the result is released with
+	 free.  */
+      Elf64_Shdr *sections
+	= GOMP_PLUGIN_malloc (sizeof (Elf64_Shdr) * image->e_shnum);
+      memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
+      for (int i = image->e_shnum - 1; i >= 0; i--)
+	{
+	  if (image_sections[i].sh_type == SHT_RELA
+	      || image_sections[i].sh_type == SHT_REL)
+	    /* Change section type to something harmless.  */
+	    image_sections[i].sh_type = SHT_NOTE;
+	}
+      obstack_ptr_grow (&unmodified_sections_os, sections);
+
+      hsa_code_object_t co = { 0 };
+      status = hsa_fns.hsa_code_object_deserialize_fn
+	(module->image_desc->gcn_image->image,
+	 module->image_desc->gcn_image->size,
+	 NULL, &co);
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not deserialize GCN code object", status);
+	  goto fail;
+	}
+
+      status = hsa_fns.hsa_executable_load_code_object_fn
+	(agent->executable, agent->id, co, "");
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not load GCN code object", status);
+	  goto fail;
+	}
+
+      if (!module->heap)
+	{
+	  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+						   gcn_kernel_heap_size,
+						   (void**)&module->heap);
+	  if (status != HSA_STATUS_SUCCESS)
+	    {
+	      hsa_error ("Could not allocate memory for GCN heap", status);
+	      goto fail;
+	    }
+
+	  module->heap->size = gcn_kernel_heap_size;
+	}
+
+    }
+  Elf64_Shdr **unmodified_sections = obstack_finish (&unmodified_sections_os);
+
+  if (debug)
+    dump_executable_symbols (agent->executable);
+
+  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not freeze the GCN executable", status);
+      goto fail;
+    }
+
+  int s = 0;
+  if (agent->module)
+    {
+      struct module_info *module = agent->module;
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+      Elf64_Shdr *sections = unmodified_sections[s++];
+
+      Elf64_Addr load_offset;
+      if (!find_load_offset (&load_offset, agent, module, image, sections))
+	goto fail;
+
+      /* Record the physical load address range.
+	 We need this for data copies later.  */
+      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
+      Elf64_Addr low = ~0, high = 0;
+      for (int i = 0; i < image->e_phnum; i++)
+	if (segments[i].p_memsz > 0)
+	  {
+	    /* Compare the END of each segment with the running maximum.
+	       Comparing the start would miss a segment based at address 0
+	       and any segment that begins below HIGH but ends above it.  */
+	    Elf64_Addr seg_end = (segments[i].p_paddr
+				  + segments[i].p_memsz - 1);
+	    if (segments[i].p_paddr < low)
+	      low = segments[i].p_paddr;
+	    if (seg_end > high)
+	      high = seg_end;
+	  }
+      module->phys_address_start = low + load_offset;
+      module->phys_address_end = high + load_offset;
+
+      // Find dynamic symbol table
+      Elf64_Shdr *dynsym = NULL;
+      for (int i = 0; i < image->e_shnum; i++)
+	if (sections[i].sh_type == SHT_DYNSYM)
+	  {
+	    dynsym = &sections[i];
+	    break;
+	  }
+
+      /* Fix up relocations.  Only SHT_RELA sections are processed.  */
+      for (int i = 0; i < image->e_shnum; i++)
+	{
+	  if (sections[i].sh_type == SHT_RELA)
+	    for (size_t offset = 0;
+		 offset < sections[i].sh_size;
+		 offset += sections[i].sh_entsize)
+	      {
+		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+						  + sections[i].sh_offset
+						  + offset);
+		Elf64_Sym *sym =
+		  (dynsym
+		   ? (Elf64_Sym*)((char*)image
+				  + dynsym->sh_offset
+				  + (dynsym->sh_entsize
+				     * ELF64_R_SYM (reloc->r_info)))
+		   : NULL);
+
+		/* S: symbol value, P: device address being patched,
+		   A: addend, B: load base, V: value to store, SIZE: number
+		   of bytes to store.  */
+		int64_t S = (sym ? sym->st_value : 0);
+		int64_t P = reloc->r_offset + load_offset;
+		int64_t A = reloc->r_addend;
+		int64_t B = load_offset;
+		int64_t V, size;
+		switch (ELF64_R_TYPE (reloc->r_info))
+		  {
+		  case R_AMDGPU_ABS32_LO:
+		    V = (S + A) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS32_HI:
+		    V = (S + A) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS64:
+		    V = S + A;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_REL32:
+		    V = S + A - P;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL64:
+		    /* FIXME
+		       LLD seems to emit REL64 where the assembler has
+		       ABS64.  This is clearly wrong because it's not what the
+		       compiler is expecting.  Let's assume, for now, that
+		       it's a bug.  In any case, GCN kernels are always self
+		       contained and therefore relative relocations will have
+		       been resolved already, so this should be a safe
+		       workaround.  */
+		    V = S + A/* - P*/;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_ABS32:
+		    V = S + A;
+		    size = 4;
+		    break;
+		    /* TODO R_AMDGPU_GOTPCREL */
+		    /* TODO R_AMDGPU_GOTPCREL32_LO */
+		    /* TODO R_AMDGPU_GOTPCREL32_HI */
+		  case R_AMDGPU_REL32_LO:
+		    V = (S + A - P) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL32_HI:
+		    V = (S + A - P) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_RELATIVE64:
+		    V = B + A;
+		    size = 8;
+		    break;
+		  default:
+		    fprintf (stderr, "Error: unsupported relocation type.\n");
+		    exit (1);
+		  }
+		/* Store the low SIZE bytes of V at the relocated address.  */
+		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
+		if (status != HSA_STATUS_SUCCESS)
+		  {
+		    hsa_error ("Failed to fix up relocation", status);
+		    goto fail;
+		  }
+		reloc_count++;
+	      }
+	}
+
+      free (sections);
+    }
+  obstack_free (&unmodified_sections_os, NULL);
+
+  HSA_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
+	     agent->device_id, reloc_count);
+
+final:
+  agent->prog_finalized = true;
+
+  if (pthread_mutex_unlock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+      res = false;
+    }
+
+  return res;
+
+fail:
+  res = false;
+  goto final;
+}
+
+/* Create kernel dispatch data structure for given KERNEL.
+
+   Allocates the dispatch record, an OMP data area of OMP_DATA_SIZE bytes
+   (none when OMP_DATA_SIZE is zero), a completion signal and the kernel
+   argument block.  Kernels that declare any dependencies are rejected with
+   a fatal error.  The result is released with release_kernel_dispatch.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_single_kernel_dispatch (struct kernel_info *kernel,
+			       unsigned omp_data_size)
+{
+  struct agent_info *agent = kernel->agent;
+  struct GOMP_hsa_kernel_dispatch *shadow
+    = GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
+
+  shadow->omp_data_memory
+    = omp_data_size > 0 ? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
+  unsigned dispatch_count = kernel->dependencies_count;
+  if (dispatch_count != 0)
+    GOMP_PLUGIN_fatal ("kernel->dependencies_count != 0");
+  shadow->kernel_dispatch_count = 0;
+
+  shadow->object = kernel->object;
+
+  hsa_signal_t sync_signal;
+  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating the GCN sync signal", status);
+
+  shadow->signal = sync_signal.handle;
+  shadow->private_segment_size = kernel->private_segment_size;
+  shadow->group_segment_size = kernel->group_segment_size;
+
+  /* Ensure that there is space for the gomp_print data.
+     See also gcn-run.c, in GCC.  The gomp_print area is only set up when
+     the kernel asks for at most 8 bytes of arguments.  */
+  size_t kss = kernel->kernarg_segment_size;
+  bool use_gomp_print = (kss <= 8);
+
+  /* The return_value slot is written unconditionally below, so the block
+     must always be large enough to hold a whole struct kernargs.  */
+  if (kss < sizeof (struct kernargs))
+    kss = sizeof (struct kernargs);
+
+  status
+    = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+				      kss,
+				      &shadow->kernarg_address);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  if (use_gomp_print)
+    {
+      /* Zero-initialize the output_data (minimum needed).  */
+      kernargs->out_ptr = (int64_t)&kernargs->output_data;
+      kernargs->output_data.next_output = 0;
+      for (unsigned i = 0;
+	   i < (sizeof (kernargs->output_data.queue)
+		/ sizeof (kernargs->output_data.queue[0]));
+	   i++)
+	kernargs->output_data.queue[i].written = 0;
+      kernargs->output_data.consumed = 0;
+
+      /* Pass in the heap location.  */
+      kernargs->heap_ptr = (int64_t)kernel->module->heap;
+    }
+
+  /* NOTE(review): for kernels whose own arguments exceed 8 bytes this
+     store lands inside the kernel's argument area -- confirm how
+     run_kernel uses return_value in that case.  */
+  kernargs->output_data.return_value = 0xcafe0000;
+
+  return shadow;
+}
+
+/* Output any data written by gomp_print_*.
+   Nothing is done unless KERNEL requested at most 8 bytes of kernel
+   arguments, i.e. unless the gomp_print area of KERNARGS is in use.
+   Entries are printed starting at the "consumed" index and the index is
+   advanced past each printed entry, so a later call resumes where this one
+   stopped.  When FINAL is false printing stops at the first entry whose
+   "written" flag is clear; when FINAL is true every entry up to
+   next_output is printed.  */
+static void
+gomp_print_output (struct kernel_info *kernel, struct kernargs *kernargs,
+		   bool final)
+{
+  if (kernel->kernarg_segment_size > 8)
+    return;
+
+  const unsigned int slots = (sizeof (kernargs->output_data.queue)
+			      / sizeof (kernargs->output_data.queue[0]));
+  unsigned int head = __atomic_load_n (&kernargs->output_data.consumed,
+				       __ATOMIC_ACQUIRE);
+  unsigned int tail = kernargs->output_data.next_output;
+
+  if (head > tail)
+    {
+      /* Overflow.  */
+      if (final)
+	printf ("GCN print buffer overflowed.\n");
+      return;
+    }
+
+  unsigned int pos = head;
+  while (pos < tail)
+    {
+      struct printf_data *rec = &kernargs->output_data.queue[pos % slots];
+
+      if (!rec->written && !final)
+	break;
+
+      switch (rec->type)
+	{
+	case 0:
+	  printf ("%.128s%ld\n", rec->msg, rec->ivalue);
+	  break;
+	case 1:
+	  printf ("%.128s%f\n", rec->msg, rec->dvalue);
+	  break;
+	case 2:
+	  printf ("%.128s%.128s\n", rec->msg, rec->text);
+	  break;
+	case 3:
+	  printf ("%.128s%.128s", rec->msg, rec->text);
+	  break;
+	default:
+	  printf ("GCN print buffer error!\n");
+	  break;
+	}
+
+      /* Hand the slot back and publish the new consumer position.  */
+      rec->written = 0;
+      pos++;
+      __atomic_store_n (&kernargs->output_data.consumed, pos,
+			__ATOMIC_RELEASE);
+    }
+
+  fflush (stdout);
+}
+
+/* Free everything owned by the kernel dispatch SHADOW: its kernel argument
+   block, its completion signal, its OMP data area and finally the record
+   itself.  */
+
+static void
+release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
+{
+  HSA_DEBUG ("Released kernel dispatch: %p has value: %lu (%p)\n", shadow,
+	     shadow->debug, (void *) shadow->debug);
+
+  /* HSA-side resources first.  */
+  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+
+  hsa_signal_t done_signal = { .handle = shadow->signal };
+  hsa_fns.hsa_signal_destroy_fn (done_signal);
+
+  /* Then the host allocations.  */
+  free (shadow->omp_data_memory);
+  free (shadow);
+}
+
+/* Look KERNEL up in its agent's executable and cache its code object handle
+   and its kernarg, group and private segment sizes.  *MAX_OMP_DATA_SIZE is
+   raised to the kernel's omp_data_size if that is larger.  If the symbol
+   cannot be found, a warning is issued and kernel->initialization_failed is
+   set; failures of the later queries are fatal.  */
+
+static void
+init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
+{
+  struct agent_info *owner = kernel->agent;
+  hsa_executable_symbol_t sym;
+  hsa_status_t rc;
+
+  rc = hsa_fns.hsa_executable_get_symbol_fn (owner->executable, NULL,
+					     kernel->name, owner->id,
+					     0, &sym);
+  if (rc != HSA_STATUS_SUCCESS)
+    {
+      hsa_warn ("Could not find symbol for kernel in the code object", rc);
+      fprintf (stderr, "not found name: '%s'\n", kernel->name);
+      dump_executable_symbols (owner->executable);
+      kernel->initialization_failed = true;
+      return;
+    }
+  HSA_DEBUG ("Located kernel %s\n", kernel->name);
+
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+    (sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not extract a kernel object from its symbol", rc);
+
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+    (sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+     &kernel->kernarg_segment_size);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel argument size", rc);
+
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+    (sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+     &kernel->group_segment_size);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel group segment size", rc);
+
+  rc = hsa_fns.hsa_executable_symbol_get_info_fn
+    (sym, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+     &kernel->private_segment_size);
+  if (rc != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel private segment size",
+	       rc);
+
+  HSA_DEBUG ("Kernel structure for %s fully initialized with "
+	     "following segment sizes: \n", kernel->name);
+  HSA_DEBUG ("  group_segment_size: %u\n",
+	     (unsigned) kernel->group_segment_size);
+  HSA_DEBUG ("  private_segment_size: %u\n",
+	     (unsigned) kernel->private_segment_size);
+  HSA_DEBUG ("  kernarg_segment_size: %u\n",
+	     (unsigned) kernel->kernarg_segment_size);
+  HSA_DEBUG ("  omp_data_size: %u\n", kernel->omp_data_size);
+  HSA_DEBUG ("  gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
+
+  /* Track the largest OMP data area seen so far.  */
+  if (*max_omp_data_size < kernel->omp_data_size)
+    *max_omp_data_size = kernel->omp_data_size;
+}
+
+/* Indent stream F by INDENT spaces.  */
+
+static void
+indent_stream (FILE *f, unsigned indent)
+{
+  /* A '*' field width consumes an argument of type int, so convert
+     explicitly rather than passing an unsigned value.  */
+  fprintf (f, "%*s", (int) indent, "");
+}
+
+/* Write the fields of the kernel DISPATCH record to stderr, one per line,
+   each line indented by INDENT spaces, followed by a blank line.  */
+
+static void
+print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch,
+		       unsigned indent)
+{
+  FILE *out = stderr;
+
+  /* Pointers.  */
+  indent_stream (out, indent);
+  fprintf (out, "this: %p\n", dispatch);
+  indent_stream (out, indent);
+  fprintf (out, "queue: %p\n", dispatch->queue);
+  indent_stream (out, indent);
+  fprintf (out, "omp_data_memory: %p\n", dispatch->omp_data_memory);
+  indent_stream (out, indent);
+  fprintf (out, "kernarg_address: %p\n", dispatch->kernarg_address);
+
+  /* Handles.  */
+  indent_stream (out, indent);
+  fprintf (out, "object: %lu\n", dispatch->object);
+  indent_stream (out, indent);
+  fprintf (out, "signal: %lu\n", dispatch->signal);
+
+  /* Sizes and counts.  */
+  indent_stream (out, indent);
+  fprintf (out, "private_segment_size: %u\n",
+	   dispatch->private_segment_size);
+  indent_stream (out, indent);
+  fprintf (out, "group_segment_size: %u\n", dispatch->group_segment_size);
+  indent_stream (out, indent);
+  fprintf (out, "children dispatches: %lu\n",
+	   dispatch->kernel_dispatch_count);
+  indent_stream (out, indent);
+  fprintf (out, "omp_num_threads: %u\n", dispatch->omp_num_threads);
+
+  fprintf (out, "\n");
+}
+
+/* Build a dispatch record for KERNEL with an OMP data area of OMP_DATA_SIZE
+   bytes, then fill in the defaults: 64 OMP threads, a zero debug word, and
+   an OMP level of 1 for gridified kernels or 0 otherwise.  */
+
+static struct GOMP_hsa_kernel_dispatch *
+create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
+{
+  struct GOMP_hsa_kernel_dispatch *dispatch;
+
+  dispatch = create_single_kernel_dispatch (kernel, omp_data_size);
+  dispatch->omp_num_threads = 64;
+  dispatch->debug = 0;
+
+  if (kernel->gridified_kernel_p)
+    dispatch->omp_level = 1;
+  else
+    dispatch->omp_level = 0;
+
+  return dispatch;
+}
+
+/* Prepare KERNEL for its first run, at most once.  The work is done under
+   the kernel's init_mutex; a kernel already marked initialized is left
+   alone, and one whose lookup fails stays unmarked.  The program must have
+   been created, finalized and frozen by create_and_finalize_hsa_program
+   beforehand.  */
+
+static void
+init_kernel (struct kernel_info *kernel)
+{
+  if (pthread_mutex_lock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
+
+  if (!kernel->initialized)
+    {
+      /* Also records the maximum OMP data size needed by a dispatch of
+	 this kernel.  */
+      init_single_kernel (kernel, &kernel->max_omp_data_size);
+
+      if (!kernel->initialization_failed)
+	{
+	  HSA_DEBUG ("\n");
+
+	  kernel->initialized = true;
+	}
+    }
+
+  if (pthread_mutex_unlock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+		       "mutex");
+}
+
+/* Clamp THREADS, the requested number of OMP threads / OACC workers, to
+   the largest grid size used by this plugin and return the result.  */
+
+static int
+limit_worker_threads (int threads)
+{
+  /* FIXME Do something more intelligent here.
+     GCN can always run 4 threads within a Compute Unit, but
+     more than that depends on register usage.  */
+  return (threads > 16) ? 16 : threads;
+}
+
+/* Parse the target attributes INPUT provided by the compiler and return true
+   if we should run anything at all.  If INPUT is NULL, fill DEF with default
+   values, then store INPUT or DEF into *RESULT.  */
+
+static bool
+parse_target_attributes (void **input,
+			 struct GOMP_kernel_launch_attributes *def,
+			 struct GOMP_kernel_launch_attributes **result,
+			 struct agent_info *agent)
+{
+  if (!input)
+    GOMP_PLUGIN_fatal ("No target arguments provided");
+
+  bool grid_attrs_found = false;
+  bool gcn_dims_found = false;
+  int gcn_teams = 0;
+  int gcn_threads = 0;
+  while (*input)
+    {
+      intptr_t id = (intptr_t) *input++, val;
+
+      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+	val = (intptr_t) *input++;
+      else
+	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+
+      val = (val > INT_MAX) ? INT_MAX : val;
+
+      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
+	  && ((id & GOMP_TARGET_ARG_ID_MASK)
+	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
+	{
+	  grid_attrs_found = true;
+	  break;
+	}
+      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
+	{
+	  gcn_dims_found = true;
+	  switch (id & GOMP_TARGET_ARG_ID_MASK)
+	    {
+	    case GOMP_TARGET_ARG_NUM_TEAMS:
+	      gcn_teams = val;
+	      break;
+	    case GOMP_TARGET_ARG_THREAD_LIMIT:
+	      gcn_threads = limit_worker_threads (val);
+	      break;
+	    default:
+	      ;
+	    }
+	}
+    }
+
+  if (gcn_dims_found)
+    {
+      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
+	{
+	  gcn_threads = 4;
+	  HSA_DEBUG ("VEGA BUG WORKAROUND: reducing default number of "
+		     "threads to 4 per team.\n");
+	  HSA_DEBUG (" - If this is not a Vega 10 device, please use "
+		     "GCN_NUM_THREADS=16\n");
+	}
+
+      def->ndim = 3;
+      /* Fiji has 64 CUs.  */
+      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : 64;
+      /* Each thread is 64 work items wide.  */
+      def->gdims[1] = 64;
+      /* A work group can have 16 wavefronts.  */
+      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
+      def->wdims[0] = 1; /* Single team per work-group.  */
+      def->wdims[1] = 64;
+      def->wdims[2] = 16;
+      *result = def;
+      return true;
+    }
+  else if (!grid_attrs_found)
+    {
+      def->ndim = 1;
+      def->gdims[0] = 1;
+      def->gdims[1] = 1;
+      def->gdims[2] = 1;
+      def->wdims[0] = 1;
+      def->wdims[1] = 1;
+      def->wdims[2] = 1;
+      *result = def;
+      HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
+      return true;
+    }
+
+  struct GOMP_kernel_launch_attributes *kla;
+  kla = (struct GOMP_kernel_launch_attributes *) *input;
+  *result = kla;
+  if (kla->ndim == 0 || kla->ndim > 3)
+    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
+
+  HSA_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+  unsigned i;
+  for (i = 0; i < kla->ndim; i++)
+    {
+      HSA_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
+		 kla->gdims[i], kla->wdims[i]);
+      if (kla->gdims[i] == 0)
+	return false;
+    }
+  return true;
+}
+
+/* Pick the work-group size for a dimension of size GRID: the requested GROUP,
+   or a default chosen by dimension count NDIM when GROUP is zero.  */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+  /* TODO: Provide a default via environment or device characteristics.  */
+  if (group == 0)
+    switch (ndim)
+      {
+      case 1:
+	group = 64;
+	break;
+      case 2:
+	group = 8;
+	break;
+      default:
+	group = 4;
+      }
+  return group > grid ? grid : group;  /* Never exceed the grid.  */
+}
+
+/* Return true if the HSA runtime is able to run the kernel FN_PTR.  */
+
+bool
+GOMP_OFFLOAD_can_run (void *fn_ptr)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  init_kernel (kernel);
+  if (!kernel->initialization_failed)
+    return true;
+
+  /* Initialization failed, so the only option left is to run on the host;
+     that is fatal when suppress_host_fallback is set.  */
+  if (suppress_host_fallback)
+    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+
+  HSA_DEBUG ("GCN target cannot be launched, doing a host fallback\n");
+  return false;
+}
+
+/* Atomically store HEADER (low 16 bits) and REST (high 16 bits) to PACKET.
+   REST is widened first: shifting the promoted int overflows for >= 0x8000.  */
+void
+packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
+{
+  __atomic_store_n (packet, header | ((uint32_t) rest << 16), __ATOMIC_RELEASE);
+}
+
+/* Run KERNEL on its agent, pass VARS to it as arguments and take
+   launch attributes from KLA.  MODULE_LOCKED indicates that the caller
+   already holds the lock and run_kernel need not lock it again.
+   If AQ is NULL then agent->sync_queue will be used.  */
+
+static void
+run_kernel (struct kernel_info *kernel, void *vars,
+	    struct GOMP_kernel_launch_attributes *kla,
+	    struct goacc_asyncqueue *aq, bool module_locked)
+{
+  HSA_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
+	     (aq ? aq->id : 0));
+  HSA_DEBUG ("GCN launch attribs: gdims:[");
+  int i;
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->gdims[i]);
+    }
+  HSA_DPRINT ("], normalized gdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->gdims[i] / kla->wdims[i]);
+    }
+  HSA_DPRINT ("], wdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	HSA_DPRINT (", ");
+      HSA_DPRINT ("%u", kla->wdims[i]);
+    }
+  HSA_DPRINT ("]\n");
+  HSA_FLUSH ();
+
+  struct agent_info *agent = kernel->agent;
+  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
+
+  if (!agent->initialized)
+    GOMP_PLUGIN_fatal ("Agent must be initialized");
+
+  if (!kernel->initialized)
+    GOMP_PLUGIN_fatal ("Called kernel must be initialized");
+  /* Per-launch dispatch state; released again once the kernel is done.  */
+  struct GOMP_hsa_kernel_dispatch *shadow
+    = create_kernel_dispatch (kernel, kernel->max_omp_data_size);
+
+  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
+  shadow->queue = command_q;
+
+  if (debug)
+    {
+      fprintf (stderr, "\nKernel has following dependencies:\n");
+      print_kernel_dispatch (shadow, 2);
+    }
+  /* Claim the next packet slot by advancing the queue's write index.  */
+  uint64_t index
+    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
+  HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
+
+  /* Wait until the queue is not full before writing the packet.   */
+  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
+	 >= command_q->size)
+    ;
+
+  /* Do not allow the dimensions to be overridden when running
+     constructors or destructors.  */
+  struct module_info *module = kernel->module;
+  bool init_fini_p = kernel == module->init_array_func
+		     || kernel == module->fini_array_func;
+  int override_x = init_fini_p ? 0 : override_x_dim;
+  int override_z = init_fini_p ? 0 : override_z_dim;
+
+  hsa_kernel_dispatch_packet_t *packet;
+  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
+	   + index % command_q->size;
+  /* Zero all but the first 4 bytes, which packet_store_release fills last.  */
+  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
+  packet->grid_size_x = override_x ? : kla->gdims[0];  /* Override wins.  */
+  packet->workgroup_size_x = get_group_size (kla->ndim,
+					     packet->grid_size_x,
+					     kla->wdims[0]);
+
+  if (kla->ndim >= 2)
+    {
+      packet->grid_size_y = kla->gdims[1];
+      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+						 kla->wdims[1]);
+    }
+  else
+    {
+      packet->grid_size_y = 1;
+      packet->workgroup_size_y = 1;
+    }
+
+  if (kla->ndim == 3)
+    {
+      packet->grid_size_z = limit_worker_threads (override_z
+						  ? : kla->gdims[2]);
+      packet->workgroup_size_z = get_group_size (kla->ndim,
+						 packet->grid_size_z,
+						 kla->wdims[2]);
+    }
+  else
+    {
+      packet->grid_size_z = 1;
+      packet->workgroup_size_z = 1;
+    }
+
+  HSA_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
+	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
+	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
+	     packet->grid_size_x / packet->workgroup_size_x,
+	     packet->grid_size_y / packet->workgroup_size_y,
+	     packet->grid_size_z / packet->workgroup_size_z,
+	     packet->workgroup_size_x, packet->workgroup_size_y,
+	     packet->workgroup_size_z);
+
+  packet->private_segment_size = kernel->private_segment_size;
+  packet->group_segment_size = kernel->group_segment_size;
+  packet->kernel_object = kernel->object;
+  packet->kernarg_address = shadow->kernarg_address;
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  packet->completion_signal = s;
+  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);  /* Waited on below.  */
+  memcpy (shadow->kernarg_address, &vars, sizeof (vars));  /* The pointer.  */
+
+  /* PR hsa/70337.  */
+  size_t vars_size = sizeof (vars);
+  if (kernel->kernarg_segment_size > vars_size)
+    {
+      if (kernel->kernarg_segment_size != vars_size
+	  + sizeof (struct hsa_kernel_runtime *))
+	GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
+      memcpy (packet->kernarg_address + vars_size, &shadow,
+	      sizeof (struct hsa_kernel_runtime *));
+    }
+
+  HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
+
+  uint16_t header;
+  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+  HSA_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
+	     agent->device_id);
+
+  packet_store_release ((uint32_t *) packet, header,
+			(uint16_t) kla->ndim
+			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
+
+  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
+				       index);
+
+  HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
+
+  /* Root signal waits with 1ms timeout.  */
+  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
+					     1000 * 1000,
+					     HSA_WAIT_STATE_BLOCKED) != 0)
+    {
+      gomp_print_output (kernel, shadow->kernarg_address, false);
+    }
+  gomp_print_output (kernel, shadow->kernarg_address, true);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
+
+  release_kernel_dispatch (shadow);
+
+  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
+  /* The upper 16 bits of the return value encode how the kernel ended.  */
+  unsigned int upper = (return_value & ~0xffff) >> 16;
+  if (upper == 0xcafe)
+    ; // exit not called, normal termination.
+  else if (upper == 0xffff)
+    ; // exit called.
+  else
+    {
+      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
+			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
+			 return_value);
+      abort ();
+    }
+
+  if (upper == 0xffff)
+    {
+      unsigned int signal = (return_value >> 8) & 0xff;  /* Bits 8-15.  */
+
+      if (signal == SIGABRT)
+	{
+	  HSA_DEBUG ("GCN Kernel aborted\n");
+	  abort ();
+	}
+      else if (signal != 0)
+	{
+	  HSA_DEBUG ("GCN Kernel received unknown signal\n");
+	  abort ();
+	}
+
+      HSA_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
+      exit (return_value & 0xff);  /* Exits the host process too.  */
+    }
+}
+
+/* Part of the libgomp plugin interface.  Launch the kernel described by the
+   kernel_info structure FN_PTR points to, handing it the pointer array VARS,
+   and wait for it to finish.  DEVICE is only used to check that FN_PTR was
+   mapped for the matching agent; ARGS supplies the launch dimensions.  */
+
+void
+GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+  struct agent_info *agent = get_agent_info (device);
+  struct GOMP_kernel_launch_attributes defaults, *attrs;
+  assert (agent == kernel->agent);
+
+  /* A false result means some grid dimension is zero: nothing to launch.  */
+  if (parse_target_attributes (args, &defaults, &attrs, agent))
+    run_kernel (kernel, vars, attrs, NULL, false);
+  else
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+    }
+}
+
+/* Lazily create AGENT's single OpenMP async queue the first time it is needed.
+   FIXME: is this thread-safe if two threads call this function?  */
+static void
+maybe_init_omp_async (struct agent_info *agent)
+{
+  int dev = agent->device_id;
+  if (!agent->omp_async_queue)
+    agent->omp_async_queue = GOMP_OFFLOAD_openacc_async_construct (dev);
+}
+
+/* Part of the libgomp plugin interface.  Run a kernel like GOMP_OFFLOAD_run
+   does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
+   has finished.  */
+
+void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			void **args, void *async_data)
+{
+  HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
+  struct agent_info *agent = get_agent_info (device);
+  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+  assert (agent == kernel->agent);
+
+  if (!parse_target_attributes (args, &def, &kla, agent))
+    {
+      HSA_DEBUG ("Will not run GCN kernel because the grid size is zero\n");
+      return;  /* The completion callback is not queued on this path.  */
+    }
+  /* NOTE(review): KLA may be &DEF (stack); confirm queue_push_launch copies.  */
+  maybe_init_omp_async (agent);
+  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
+  queue_push_callback (agent->omp_async_queue,
+		       GOMP_PLUGIN_target_task_completion, async_data);
+}
+
+/* Deinitialize all information associated with MODULE and kernels within
+   it.  LOCKED is passed to run_kernel as MODULE_LOCKED.  TRUE on success.  */
+
+static bool
+destroy_module (struct module_info *module, bool locked)
+{
+  /* Run destructors before destroying module.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->fini_array_func)
+    {
+      init_kernel (module->fini_array_func);
+      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
+    }
+  module->constructors_run_p = false;
+
+  int i;
+  for (i = 0; i < module->kernel_count; i++)
+    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
+      {
+	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
+			   "mutex");
+	return false;  /* Remaining kernels' mutexes are left untouched.  */
+      }
+
+  return true;
+}
+
+/* Part of the libgomp plugin interface.  Unload GCN object-code module
+   described by struct gcn_image_desc in TARGET_DATA from agent number N.
+   Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
+{
+  /* NOTE(review): the check uses GOMP_VERSION_HSA while the message reports
+     GOMP_VERSION_GCN as the expected value -- confirm which is intended.  */
+  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return false;
+    }
+
+  struct agent_info *agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return false;
+    }
+
+  /* From here on every path must drop the write lock again, so failures are
+     recorded in OK rather than returned directly.  */
+  bool ok = false;
+  if (!agent->module || agent->module->image_desc != target_data)
+    GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
+		       "loaded before");
+  else if (destroy_module (agent->module, true))
+    {
+      free (agent->module);
+      agent->module = NULL;
+      ok = destroy_hsa_program (agent);
+    }
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return false;
+    }
+  return ok;
+}
+
+/* Part of the libgomp plugin interface.  Deinitialize all information and
+   status associated with agent number N.  We do not attempt any
+   synchronization, assuming the user and libgomp will not attempt
+   deinitialization of a device that is in any way being used at the same
+   time.  Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_fini_device (int n)
+{
+  struct agent_info *agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (!agent->initialized)
+    return true;  /* Not initialized: nothing to tear down.  */
+
+  if (agent->omp_async_queue)
+    {
+      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
+      agent->omp_async_queue = NULL;  /* Destruct's result is not checked.  */
+    }
+
+  if (agent->module)
+    {
+      if (!destroy_module (agent->module, false))
+        return false;
+      free (agent->module);
+      agent->module = NULL;
+    }
+
+  if (!destroy_hsa_program (agent))
+    return false;
+
+  /*release_agent_shared_libraries (agent);*/
+
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error destroying command queue", status);
+
+  if (pthread_mutex_destroy (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_rwlock_destroy (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
+      return false;
+    }
+
+  if (pthread_mutex_destroy (&agent->async_queues_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
+      return false;
+    }
+  agent->initialized = false;  /* Only reached if every step succeeded.  */
+  return true;
+}
+
+static void *
+GOMP_OFFLOAD_alloc_by_agent (struct agent_info *agent, size_t size)
+{
+  HSA_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
+
+  /* Zero-size allocations are invalid, so in order to return a valid pointer
+     we need to pass a valid size.  One source of zero-size allocations is
+     kernargs for kernels that have no inputs or outputs (the kernel may
+     only use gomp_print, for example).  */
+  if (size == 0)
+    size = 4;
+  /* Memory comes from AGENT's kernarg region; NULL is returned on failure.  */
+  void *ptr;
+  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+							size, &ptr);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not allocate device memory", status);
+      return NULL;
+    }
+
+  return ptr;
+}
+
+void *
+GOMP_OFFLOAD_alloc (int n, size_t size)
+{
+  /* Look up agent number N and allocate SIZE bytes on its behalf.  */
+  return GOMP_OFFLOAD_alloc_by_agent (get_agent_info (n), size);
+}
+
+bool
+GOMP_OFFLOAD_free (int device, void *ptr)
+{
+  HSA_DEBUG ("Freeing memory on device %d\n", device);
+
+  /* DEVICE is only used for the debug message above; PTR alone is handed to
+     the HSA runtime.  Returns false if the runtime reports a failure.  */
+  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
+  if (status == HSA_STATUS_SUCCESS)
+    return true;
+
+  hsa_error ("Could not free device memory", status);
+  return false;
+}
+
+/* Return true if PTR lies inside the address range [phys_address_start,
+   phys_address_end] of the module currently loaded on AGENT, if any.  */
+
+static bool
+image_address_p (struct agent_info *agent, const void *ptr)
+{
+  Elf64_Addr where = (Elf64_Addr)ptr;
+
+  if (!agent->module)
+    return false;
+  /* Both ends of the range are treated as inclusive.  */
+  return (where >= agent->module->phys_address_start
+	  && where <= agent->module->phys_address_end);
+}
+
+struct copy_data
+{
+  void *dst;			/* Destination address.  */
+  const void *src;		/* Source address.  */
+  size_t len;			/* Number of bytes to copy.  */
+  bool use_hsa_memory_copy;	/* Use hsa_memory_copy rather than memcpy.  */
+  struct goacc_asyncqueue *aq;	/* Queue this copy was pushed onto.  */
+};
+
+static void
+copy_data (void *data_)
+{
+  struct copy_data *data = (struct copy_data *)data_;
+  HSA_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
+	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
+	     data->dst);
+  if (data->use_hsa_memory_copy)  /* The HSA copy's status is not checked.  */
+    hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len);
+  else
+    memcpy (data->dst, data->src, data->len);
+  free (data);  /* The request was allocated by queue_push_copy.  */
+}
+
+static void
+queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
+		 size_t len, bool use_hsa_memory_copy)
+{
+  if (DEBUG_QUEUES)
+    HSA_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
+	       aq->agent->device_id, aq->id, len, src, dst);
+  struct copy_data *data  /* Describes the copy; freed by copy_data.  */
+    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
+  data->dst = dst;
+  data->src = src;  /* Only the pointer is kept, not the bytes.  */
+  data->len = len;
+  data->use_hsa_memory_copy = use_hsa_memory_copy;
+  data->aq = aq;
+  queue_push_callback (aq, copy_data, data);  /* Copy happens later.  */
+}
+
+bool
+GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
+	     src, dst);
+  /* memcpy only works for addresses allocated with hsa_memory_allocate,
+     but hsa_memory_copy seems unable to read from .rodata variables.
+     So choose by whether SRC lies inside the loaded image.  */
+  if (image_address_p (get_agent_info (device), src))
+    hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  else
+    memcpy (dst, src, n);
+  return true;  /* The HSA copy's status is not checked.  */
+}
+
+bool
+GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
+{
+  HSA_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
+	     device, dst);
+  /* memcpy only works for addresses allocated with hsa_memory_allocate,
+     but hsa_memory_copy seems unable to read from .rodata variables.  */
+  if (image_address_p (get_agent_info (device), dst))  /* Choose by DST.  */
+    hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  else
+    memcpy (dst, src, n);
+  return true;  /* The HSA copy's status is not checked.  */
+}
+
+/* Part of the libgomp plugin interface.  Copy N bytes within DEVICE.  */
+
+bool
+GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
+{
+  struct gcn_thread *thread_data = gcn_thread ();
+  /* If this thread is in asynchronous mode, queue the copy instead.  */
+  if (thread_data && !async_synchronous_p (thread_data->async))
+    {
+      struct agent_info *agent = get_agent_info (device);
+      maybe_init_omp_async (agent);
+      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
+      return true;  /* Queued only; the copy has not happened yet.  */
+    }
+
+  HSA_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
+	     device, src, device, dst);
+  /* We can assume that dev2dev moves are always within allocated memory.  */
+  memcpy (dst, src, n);
+  return true;
+}
+
+static int
+queue_empty (struct goacc_asyncqueue *aq)
+{
+  /* Sample the entry count under the queue mutex; 1 means empty.  */
+  pthread_mutex_lock (&aq->mutex);
+  int empty = (aq->queue_n == 0);
+  pthread_mutex_unlock (&aq->mutex);
+  return empty;
+}
+
+static void
+wait_queue (struct goacc_asyncqueue *aq)
+{
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      drain_queue_synchronous (aq);
+      return;
+    }
+  /* Otherwise sleep on queue_cond_out until the queue has no entries left.  */
+  pthread_mutex_lock (&aq->mutex);
+
+  while (aq->queue_n > 0)  /* Rechecked after every wake-up.  */
+    {
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
+		   aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	HSA_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
+		   aq->id);
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+  HSA_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
+}
+
+static void
+gomp_offload_free (void *ptr)
+{
+  HSA_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
+  GOMP_OFFLOAD_free (0, ptr);  /* The device number is only logged.  */
+}
+
+/* Launch OpenACC KERNEL with MAPNUM arguments taken from DEVADDRS (or from
+   HOSTADDRS where the device address is NULL) and launch dimensions DIMS,
+   either synchronously or, if ASYNC, via queue AQ.  DIMS may be updated.  */
+static void
+gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
+	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
+	  struct goacc_asyncqueue *aq)
+{
+  if (!GOMP_OFFLOAD_can_run (kernel))
+    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
+  // For some reason, devaddrs must be double-indirect on the target
+  void **ind_da = GOMP_OFFLOAD_alloc_by_agent (kernel->agent,
+					       sizeof (void*) * mapnum);
+  if (!ind_da)
+    GOMP_PLUGIN_fatal ("Could not allocate the GCN device address table");
+  for (size_t i = 0; i < mapnum; i++)
+    ind_da[i] = devaddrs[i] ? devaddrs[i] : hostaddrs[i];
+
+  struct hsa_kernel_description *hsa_kernel_desc = NULL;
+  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
+    {
+      struct hsa_kernel_description *d
+	= &kernel->module->image_desc->kernel_infos[i];
+      if (d->name == kernel->name)
+	{
+	  hsa_kernel_desc = d;
+	  break;
+	}
+    }
+  if (!hsa_kernel_desc)
+    GOMP_PLUGIN_fatal ("No GCN kernel description for %s", kernel->name);
+
+  /* Dimensions may be determined statically (hsa_kernel_desc->oacc_dims[])
+     or passed to this invocation at runtime (dims[]).  Static dimensions
+     take priority over dynamic ones when present (non-zero).  */
+  for (int i = 0; i < 3; i++)
+    if (hsa_kernel_desc->oacc_dims[i] > 0)
+      dims[i] = hsa_kernel_desc->oacc_dims[i];
+
+  /* If any dimension remains 0 then we get to pick a number.  There is no
+     correct answer without a clue about the problem size, so pick defaults;
+     64 gangs matches a typical Fiji device.  */
+  if (dims[0] == 0) dims[0] = 64; /* Gangs.  */
+  if (dims[1] == 0) dims[1] = 16; /* Workers.  */
+
+  /* The incoming dimensions are expressed in terms of gangs, workers, and
+     vectors.  The HSA dimensions are expressed in terms of "work-items",
+     which means multiples of vector lanes.
+
+     The "grid size" specifies the size of the problem space, and the
+     "work-group size" specifies how much of that we want a single compute
+     unit to chew on at once.
+
+     The three dimensions do not really correspond to hardware, but the
+     important thing is that the HSA runtime will launch as many
+     work-groups as it takes to process the entire grid, and each
+     work-group will contain as many wave-fronts as it takes to process
+     the work-items in that group.
+
+     Essentially, as long as we set the Y dimension to 64 (the number of
+     vector lanes in hardware), and the Z group size to the maximum (16),
+     then we will get the gangs (X) and workers (Z) launched as we expect.
+
+     The reason for the apparent reversal of vector and worker dimension
+     order is to do with the way the run-time distributes work-items across
+     v1 and v2.  */
+  struct GOMP_kernel_launch_attributes kla =
+    {3,
+     /* Grid size.  */
+     {dims[0], 64, dims[1]},
+     /* Work-group size.  */
+     {1,       64, 16}
+    };
+
+  if (!async)
+    {
+      run_kernel (kernel, ind_da, &kla, NULL, false);
+      gomp_offload_free (ind_da);
+    }
+  else
+    {
+      queue_push_launch (aq, kernel, ind_da, &kla);
+      if (DEBUG_QUEUES)
+	HSA_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
+		   aq->agent->device_id, aq->id, ind_da);
+      queue_push_callback (aq, gomp_offload_free, ind_da);
+    }
+}
+
+void
+GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
+			   void **hostaddrs, void **devaddrs, unsigned *dims,
+			   void *targ_mem_desc)
+{
+  /* FN_PTR is really a pointer to a kernel_info.  Run it synchronously,
+     so no async queue is passed.  */
+  gcn_exec ((struct kernel_info *) fn_ptr, mapnum, hostaddrs, devaddrs, dims,
+	    targ_mem_desc, false, NULL);
+}
+
+void
+GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
+{
+  /* FN_PTR is really a pointer to a kernel_info.  The launch is pushed onto
+     AQ rather than run synchronously.  */
+  gcn_exec ((struct kernel_info *) fn_ptr, mapnum, hostaddrs, devaddrs, dims,
+	    targ_mem_desc, true, aq);
+}
+
+/* Part of the libgomp plugin interface.  Create a new async queue on DEVICE.
+   Returns NULL if its mutex or condition variables cannot be initialized.  */
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device)
+{
+  struct agent_info *agent = get_agent_info (device);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
+  aq->agent = agent;
+  aq->queue_first = 0;
+  aq->queue_n = 0;
+  aq->drain_queue_stop = 0;
+
+  /* Set up the synchronization objects before AQ is linked into the agent's
+     list, so that a failure can discard AQ and release the list mutex.  */
+  const char *failure = NULL;
+  if (pthread_mutex_init (&aq->mutex, NULL))
+    failure = "Failed to initialize a GCN agent queue mutex";
+  else if (pthread_cond_init (&aq->queue_cond_in, NULL)
+	   || pthread_cond_init (&aq->queue_cond_out, NULL))
+    failure = "Failed to initialize a GCN agent queue cond";
+  if (failure)
+    {
+      GOMP_PLUGIN_error ("%s", failure);
+      free (aq);
+      pthread_mutex_unlock (&agent->async_queues_mutex);
+      return NULL;
+    }
+
+  aq->prev = NULL;
+  aq->next = agent->async_queues;
+  if (aq->next)
+    {
+      aq->next->prev = aq;
+      aq->id = aq->next->id + 1;
+    }
+  else
+    aq->id = 1;
+  agent->async_queues = aq;
+
+  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
+						     ASYNC_QUEUE_SIZE,
+						     HSA_QUEUE_TYPE_MULTI,
+						     queue_callback, NULL,
+						     UINT32_MAX, UINT32_MAX,
+						     &aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating command queue", status);
+
+  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
+		       strerror (err));
+  HSA_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
+	     aq->id);
+
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return aq;
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = aq->agent;
+
+  finalize_async_thread (aq);  /* Presumably stops AQ's drain thread.  */
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  int err;
+  if ((err = pthread_mutex_destroy (&aq->mutex)))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_in))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_out))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Error destroying command queue", status);
+      goto fail;
+    }
+  /* Unlink AQ from the agent's doubly-linked list of queues.  */
+  if (aq->prev)
+    aq->prev->next = aq->next;
+  if (aq->next)
+    aq->next->prev = aq->prev;
+  if (agent->async_queues == aq)
+    agent->async_queues = aq->next;
+
+  HSA_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
+
+  free (aq);
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return true;
+  /* On failure AQ stays linked in the list and is not freed.  */
+fail:
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return false;
+}
+
+int
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+{
+  return queue_empty (aq);  /* 1 if AQ has no pending entries, else 0.  */
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  wait_queue (aq);  /* Blocks until AQ has no entries left.  */
+  return true;  /* Always reports success.  */
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+				      struct goacc_asyncqueue *aq2)
+{
+  /* FIXME: what should happen here????  */
+  wait_queue (aq1);  /* For now the caller blocks until AQ1 is empty...  */
+  wait_queue (aq2);  /* ...and then until AQ2 is empty as well.  */
+  return true;
+}
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+					   void (*fn) (void *), void *data)
+{
+  queue_push_callback (aq, fn, data);  /* Append FN (DATA) to AQ.  */
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+  /* hsa_memory_copy is requested only when DST is inside the image.  */
+  queue_push_copy (aq, dst, src, n, image_address_p (agent, dst));
+  return true;  /* Queued only; the copy itself happens later.  */
+}
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+  /* hsa_memory_copy is requested only when SRC is inside the image.  */
+  queue_push_copy (aq, dst, src, n, image_address_p (agent, src));
+  return true;  /* Queued only; the copy itself happens later.  */
+}
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
+{
+  struct gcn_thread *thread_data
+    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
+
+  thread_data->async = GOMP_ASYNC_SYNC;  /* Threads start out synchronous.  */
+
+  return (void *) thread_data;  /* Freed by ..._destroy_thread_data.  */
+}
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+  free (data);  /* DATA is the block malloc'd by ..._create_thread_data.  */
+}
diff --git a/libgomp/target.c b/libgomp/target.c
index c81e5ababb74..4645894f869c 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -39,7 +39,7 @@
 #include <string.h>
 #include <assert.h>
 #include <errno.h>
-#ifdef RC_CHECKING
+#if defined(RC_CHECKING)
 #include <stdio.h>
 #endif
 
@@ -302,6 +302,12 @@ gomp_to_device_kind_p (int kind)
     }
 }
 
+/* Copy host memory to an offload device.  In asynchronous mode (if AQ is
+   non-NULL), this is only safe when the source memory is a global or heap
+   location (otherwise a copy may take place from a dangling pointer to an
+   expired stack frame).  Use copy_host2dev_immediate for copies from stack
+   locations.  */
+
 attribute_hidden void
 gomp_copy_host2dev (struct gomp_device_descr *devicep,
 		    struct goacc_asyncqueue *aq,
@@ -340,6 +346,17 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
     gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz);
 }
 
+/* Use this variant for host-to-device copies from stack locations that may not
+   be live at the time an asynchronous copy operation takes place.  */
+
+static void
+copy_host2dev_immediate (struct gomp_device_descr *devicep, void *d,
+			 const void *h, size_t sz,
+			 struct gomp_coalesce_buf *cbuf)
+{
+  gomp_copy_host2dev (devicep, NULL, d, h, sz, cbuf);
+}
+
 attribute_hidden void
 gomp_copy_dev2host (struct gomp_device_descr *devicep,
 		    struct goacc_asyncqueue *aq,
@@ -600,10 +617,10 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
   if (cur_node.host_start == (uintptr_t) NULL)
     {
       cur_node.tgt_offset = (uintptr_t) NULL;
-      gomp_copy_host2dev (devicep, aq,
-			  (void *) (tgt->tgt_start + target_offset),
-			  (void *) &cur_node.tgt_offset,
-			  sizeof (void *), cbuf);
+      copy_host2dev_immediate (devicep,
+			       (void *) (tgt->tgt_start + target_offset),
+			       (void *) &cur_node.tgt_offset,
+			       sizeof (void *), cbuf);
       return;
     }
   /* Add bias to the pointer value.  */
@@ -622,8 +639,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq,
      array section.  Now subtract bias to get what we want
      to initialize the pointer with.  */
   cur_node.tgt_offset -= bias;
-  gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset),
-		      (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
+  copy_host2dev_immediate (devicep, (void *) (tgt->tgt_start + target_offset),
+			   (void *) &cur_node.tgt_offset, sizeof (void *),
+			   cbuf);
 }
 
 static void
@@ -1442,13 +1460,13 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i - 1);
 		if (cur_node.tgt_offset)
 		  cur_node.tgt_offset -= sizes[i];
-		gomp_copy_host2dev (devicep, aq,
-				    (void *) (n->tgt->tgt_start
-					      + n->tgt_offset
-					      + cur_node.host_start
-					      - n->host_start),
-				    (void *) &cur_node.tgt_offset,
-				    sizeof (void *), cbufp);
+		copy_host2dev_immediate (devicep,
+					 (void *) (n->tgt->tgt_start
+						   + n->tgt_offset
+						   + cur_node.host_start
+						   - n->host_start),
+					 (void *) &cur_node.tgt_offset,
+					 sizeof (void *), cbufp);
 		cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
 				      + cur_node.host_start - n->host_start;
 		continue;
@@ -1687,8 +1705,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
 		    /* We intentionally do not use coalescing here, as it's not
 		       data allocated by the current call to this function.  */
-		    gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset,
-					&tgt_addr, sizeof (void *), NULL);
+		    copy_host2dev_immediate (devicep, (void *) n->tgt_offset,
+					     &tgt_addr, sizeof (void *), NULL);
 		  }
 		array++;
 	      }
@@ -1810,10 +1828,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
       for (i = 0; i < mapnum; i++)
 	{
 	  cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
-	  gomp_copy_host2dev (devicep, aq,
-			      (void *) (tgt->tgt_start + i * sizeof (void *)),
-			      (void *) &cur_node.tgt_offset, sizeof (void *),
-			      cbufp);
+	  copy_host2dev_immediate (devicep,
+	    (void *) (tgt->tgt_start + i * sizeof (void *)),
+	    (void *) &cur_node.tgt_offset, sizeof (void *), cbufp);
 	}
     }
 
@@ -3725,6 +3742,8 @@ offload_target_to_plugin_name (const char *offload_target)
     return "nvptx";
   else if (strncmp (offload_target, "hsa", 3) == 0)
     return "hsa";
+  else if (strstr (offload_target, "gcn") != NULL)
+    return "gcn";
   else
     gomp_fatal ("Unknown offload target: %s", offload_target);
 }
diff --git a/libgomp/team.c b/libgomp/team.c
index c422da3701df..b26caaaaec68 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -239,6 +239,9 @@ gomp_free_pool_helper (void *thread_pool)
   pthread_exit (NULL);
 #elif defined(__nvptx__)
   asm ("exit;");
+#elif defined(__AMDGCN__)
+  asm ("s_dcache_wb\n\t"
+       "s_endpgm");
 #else
 #error gomp_free_pool_helper must terminate the thread
 #endif
diff --git a/libgomp/testsuite/Makefile.in b/libgomp/testsuite/Makefile.in
index db794f35d6de..9c4d7b37c6e4 100644
--- a/libgomp/testsuite/Makefile.in
+++ b/libgomp/testsuite/Makefile.in
@@ -207,6 +207,10 @@ PACKAGE_URL = @PACKAGE_URL@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
 PERL = @PERL@
+PLUGIN_GCN = @PLUGIN_GCN@
+PLUGIN_GCN_CPPFLAGS = @PLUGIN_GCN_CPPFLAGS@
+PLUGIN_GCN_LDFLAGS = @PLUGIN_GCN_LDFLAGS@
+PLUGIN_GCN_LIBS = @PLUGIN_GCN_LIBS@
 PLUGIN_HSA = @PLUGIN_HSA@
 PLUGIN_HSA_CPPFLAGS = @PLUGIN_HSA_CPPFLAGS@
 PLUGIN_HSA_LDFLAGS = @PLUGIN_HSA_LDFLAGS@
@@ -280,6 +284,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index a0fe4071cf9a..19bee806fb08 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -445,3 +445,28 @@ proc check_effective_target_hsa_offloading_selected {} {
 	check_effective_target_hsa_offloading_selected_nocache
     }]
 }
+
+# Return 1 if at least one AMD GCN board is present.  The probe program is
+# handed to check_runtime; it exits with status 0 only when
+# acc_get_num_devices reports at least one device of type acc_device_gcn.
+
+proc check_effective_target_openacc_amdgcn_accel_present { } {
+    return [check_runtime openacc_amdgcn_accel_present {
+	#include <openacc.h>
+	int main () {
+	return !(acc_get_num_devices (acc_device_gcn) > 0);
+	}
+    } "" ]
+}
+
+# Return 1 if at least one AMD GCN board is present, and the AMD GCN device
+# type is selected by default (judged by $offload_target_openacc matching
+# the glob pattern "amdgcn*"); return 0 otherwise.
+
+proc check_effective_target_openacc_amdgcn_accel_selected { } {
+    global offload_target_openacc
+    if { ![check_effective_target_openacc_amdgcn_accel_present] } {
+	return 0
+    }
+    return [string match "amdgcn*" $offload_target_openacc]
+}
diff --git a/libgomp/testsuite/libgomp.c/c.exp b/libgomp/testsuite/libgomp.c/c.exp
index 31bdd5795dc2..463d953f53df 100644
--- a/libgomp/testsuite/libgomp.c/c.exp
+++ b/libgomp/testsuite/libgomp.c/c.exp
@@ -23,10 +23,81 @@ dg-init
 # Turn on OpenMP.
 lappend ALWAYS_CFLAGS "additional_flags=-fopenmp"
 
+# Generate one test file per DO_TEST entry in TEST_LIST; return their paths.
+proc generate_tests { test_list } {
+    global srcdir
+    global subdir
+
+    # Name of the companion source file, relative to $srcdir/$subdir.  Use
+    # literal string operations so that regexp metacharacters in the path
+    # (e.g. "c++") are not misinterpreted.
+    set base_file [file rootname $test_list]
+    set base_file [string map [list "$srcdir/$subdir/" ""] $base_file]
+    set c_file $base_file.c
+
+    # Collect the dg directives of the source file; they are copied to the
+    # top of every generated test.
+    set dg_lines {}
+    set fp [open "$srcdir/$subdir/$c_file" r]
+    while {[gets $fp line] >= 0} {
+	if {[regexp -line -- "^/\\* \{ dg-" $line]} {
+	    lappend dg_lines $line
+	}
+    }
+    close $fp
+    set dg_directives [join $dg_lines "\n"]
+
+    # Extract NAME from every "DO_TEST (NAME)" entry; explicit matching
+    # keeps blank lines and stray whitespace from yielding empty names.
+    set fp [open "$test_list" r]
+    set file_data [read $fp]
+    close $fp
+    set tests {}
+    set entry_re {DO_TEST\s*\(\s*(\w+)\s*\)}
+    foreach {entry name} [regexp -all -inline -- $entry_re $file_data] {
+	lappend tests $name
+    }
+
+    # Create directory to generate files.
+    set test_dir [pwd]
+    set generated_dir $test_dir/generated/libgomp.c
+    file mkdir $generated_dir
+
+    # Each generated file selects one test via ONE_TEST and TEST_NR (its
+    # 1-based position in TEST_LIST), then includes the original source.
+    set new_files {}
+    set i 1
+    foreach test $tests {
+	set new_file "$generated_dir/$base_file-$test.c"
+
+	set fp [open "$new_file" w]
+	puts $fp "$dg_directives"
+	puts $fp "#define ONE_TEST $test"
+	puts $fp "#define TEST_NR $i"
+	puts $fp "#include \"$srcdir/$subdir/$c_file\""
+	close $fp
+
+	incr i
+	lappend new_files $new_file
+    }
+
+    return $new_files
+}
+
+# Generate tests for each .list file
+set test_lists [find $srcdir/$subdir *.list]
+set generated_tests []
+foreach test_list $test_lists {
+    set generated_tests [concat \
+			     $generated_tests \
+			     [generate_tests $test_list]]
+}
+
 # Gather a list of all tests.
 set tests [lsort [concat \
 		      [find $srcdir/$subdir *.c] \
-		      [find $srcdir/$subdir/../libgomp.c-c++-common *.c]]]
+		      [find $srcdir/$subdir/../libgomp.c-c++-common *.c] \
+		      $generated_tests]]
 
 set ld_library_path $always_ld_library_path
 append ld_library_path [gcc-set-multilib-library-path $GCC_UNDER_TEST]
diff --git a/libgomp/testsuite/libgomp.c/for-1.h b/libgomp/testsuite/libgomp.c/for-1.h
new file mode 100644
index 000000000000..fa82c5b20d74
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-1.h
@@ -0,0 +1,25 @@
+#define S
+#define N(x) M(x, G, static)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(static, 32)
+#define N(x) M(x, G, static32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(auto)
+#define N(x) M(x, G, auto)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(guided, 32)
+#define N(x) M(x, G, guided32)
+#include "for-2.h"
+#undef S
+#undef N
+#define S schedule(runtime)
+#define N(x) M(x, G, runtime)
+#include "for-2.h"
+#undef S
+#undef N
diff --git a/libgomp/testsuite/libgomp.c/for-2.h b/libgomp/testsuite/libgomp.c/for-2.h
new file mode 100644
index 000000000000..6d8e34974fdd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-2.h
@@ -0,0 +1,313 @@
+#ifndef VARS
+#define VARS
+int a[1500];
+float b[10][15][10];
+__attribute__((noreturn)) void
+noreturn (void)
+{
+  for (;;);
+}
+#endif
+#ifndef SC
+#define SC
+#endif
+#ifndef OMPTGT
+#define OMPTGT
+#endif
+#ifndef OMPTO
+#define OMPTO(v) do {} while (0)
+#endif
+#ifndef OMPFROM
+#define OMPFROM(v) do {} while (0)
+#endif
+
+__attribute__((noinline, noclone)) void
+N(f0) (void)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 0; i < 1500; i++)
+    a[i] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f1) (void)
+{
+  OMPTGT
+#pragma omp F S
+  for (unsigned int i = __INT_MAX__; i < 3000U + __INT_MAX__; i += 2)
+    a[(i - __INT_MAX__) >> 1] -= 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f2) (void)
+{
+  unsigned long long i;
+  OMPTGT
+#pragma omp F S
+  for (i = __LONG_LONG_MAX__ + 4500ULL - 27;
+       i > __LONG_LONG_MAX__ - 27ULL; i -= 3)
+    a[(i + 26LL - __LONG_LONG_MAX__) / 3] -= 4;
+}
+
+__attribute__((noinline, noclone)) void
+N(f3) (long long n1, long long n2, long long s3)
+{
+  OMPTGT
+#pragma omp F S
+  for (long long i = n1 + 23; i > n2 - 25; i -= s3)
+    a[i + 48] += 7;
+}
+
+__attribute__((noinline, noclone)) void
+N(f4) (void)
+{
+  unsigned int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 30; i < 20; i += 2)
+    a[i] += 10;
+}
+
+__attribute__((noinline, noclone)) void
+N(f5) (int n11, int n12, int n21, int n22, int n31, int n32,
+       int s1, int s2, int s3)
+{
+  SC int v1, v2, v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = n11; v1 < n12; v1 += s1)
+    for (v2 = n21; v2 < n22; v2 += s2)
+      for (v3 = n31; v3 < n32; v3 += s3)
+	b[v1][v2][v3] += 2.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f6) (int n11, int n12, int n21, int n22, long long n31, long long n32,
+       int s1, int s2, long long int s3)
+{
+  SC int v1, v2;
+  SC long long v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = n11; v1 > n12; v1 += s1)
+    for (v2 = n21; v2 > n22; v2 += s2)
+      for (v3 = n31; v3 > n32; v3 += s3)
+	b[v1][v2 / 2][v3] -= 4.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f7) (void)
+{
+  SC unsigned int v1, v3;
+  SC unsigned long long v2;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = 0; v1 < 20; v1 += 2)
+    for (v2 = __LONG_LONG_MAX__ + 16ULL;
+	 v2 > __LONG_LONG_MAX__ - 29ULL; v2 -= 3)
+      for (v3 = 10; v3 > 0; v3--)
+	b[v1 >> 1][(v2 - __LONG_LONG_MAX__ + 64) / 3 - 12][v3 - 1] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f8) (void)
+{
+  SC long long v1, v2, v3;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (v1 = 0; v1 < 20; v1 += 2)
+    for (v2 = 30; v2 < 20; v2++)
+      for (v3 = 10; v3 < 0; v3--)
+	b[v1][v2][v3] += 5.5;
+}
+
+__attribute__((noinline, noclone)) void
+N(f9) (void)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 20; i < 10; i++)
+    {
+      a[i] += 2;
+      noreturn ();
+      a[i] -= 4;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f10) (void)
+{
+  SC int i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = 0; i < 10; i++)
+    for (int j = 10; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 4;
+	  noreturn ();
+	  b[i][j][k] -= 8;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f11) (int n)
+{
+  int i;
+  OMPTGT
+#pragma omp F S
+  for (i = 20; i < n; i++)
+    {
+      a[i] += 8;
+      noreturn ();
+      a[i] -= 16;
+    }
+}
+
+__attribute__((noinline, noclone)) void
+N(f12) (int n)
+{
+  SC int i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = 0; i < 10; i++)
+    for (int j = n; j < 8; j++)
+      for (long k = -10; k < 10; k++)
+	{
+	  b[i][j][k] += 16;
+	  noreturn ();
+	  b[i][j][k] -= 32;
+	}
+}
+
+__attribute__((noinline, noclone)) void
+N(f13) (void)
+{
+  int *i;
+  OMPTGT
+#pragma omp F S
+  for (i = a; i < &a[1500]; i++)
+    i[0] += 2;
+}
+
+__attribute__((noinline, noclone)) void
+N(f14) (void)
+{
+  SC float *i;
+  OMPTGT
+#pragma omp F S collapse(3)
+  for (i = &b[0][0][0]; i < &b[0][0][10]; i++)
+    for (float *j = &b[0][15][0]; j > &b[0][0][0]; j -= 10)
+      for (float *k = &b[0][0][10]; k > &b[0][0][0]; --k)
+	b[i - &b[0][0][0]][(j - &b[0][0][0]) / 10 - 1][(k - &b[0][0][0]) - 1]
+	  -= 3.5;
+}
+
+__attribute__((noinline, noclone)) int
+N(test) (void)
+{
+  int i, j, k;
+  for (i = 0; i < 1500; i++)
+    a[i] = i - 25;
+  OMPTO (a);
+  N(f0) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 23)
+      return 1;
+  N(f1) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 25)
+      return 1;
+  N(f2) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 29)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, -1LL + 25 - 48, 1LL);
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f3) (1500LL - 1 - 23 - 48, 1500LL - 1, 7LL);
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  N(f4) ();
+  OMPFROM (a);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	b[i][j][k] = i - 2.5 + 1.5 * j - 1.5 * k;
+  OMPTO (b);
+  N(f5) (0, 10, 0, 15, 0, 10, 1, 1, 1);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f5) (0, 10, 30, 15, 0, 10, 4, 5, 6);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f6) (9, -1, 29, 0, 9, -1, -1, -2, -1);
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 4.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f7) ();
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f8) ();
+  OMPFROM (b);
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f9) ();
+  N(f10) ();
+  N(f11) (10);
+  N(f12) (12);
+  OMPFROM (a);
+  OMPFROM (b);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 22)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
+	  return 1;
+  N(f13) ();
+  N(f14) ();
+  OMPFROM (a);
+  OMPFROM (b);
+  for (i = 0; i < 1500; i++)
+    if (a[i] != i - 20)
+      return 1;
+  for (i = 0; i < 10; i++)
+    for (j = 0; j < 15; j++)
+      for (k = 0; k < 10; k++)
+	if (b[i][j][k] != i - 2.5 + 1.5 * j - 1.5 * k)
+	  return 1;
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-3.c b/libgomp/testsuite/libgomp.c/for-3.c
new file mode 100644
index 000000000000..d040f11b8e61
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-3.c
@@ -0,0 +1,123 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#if TEST_ALL || TEST_NR == 1
+#define F distribute
+#define G d
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F distribute
+#define G d_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F distribute simd
+#define G ds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F distribute simd
+#define G ds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F distribute parallel for
+#define G dpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F distribute parallel for dist_schedule(static, 128)
+#define G dpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F distribute parallel for simd
+#define G dpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F distribute parallel for simd dist_schedule(static, 128)
+#define G dpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#pragma omp end declare target
+
+int
+main ()
+{
+  int err = 0;
+
+  #pragma omp target teams reduction(|:err)
+  {
+#define DO_TEST_1(test) \
+    do {	      \
+      err |= test (); \
+    } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1(test);
+#include "for-3.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+  }
+
+  if (err)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-3.list b/libgomp/testsuite/libgomp.c/for-3.list
new file mode 100644
index 000000000000..6fb25a581dc6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-3.list
@@ -0,0 +1,24 @@
+DO_TEST (test_d_normal)
+DO_TEST (test_d_ds128_normal)
+DO_TEST (test_ds_normal)
+DO_TEST (test_ds_ds128_normal)
+DO_TEST (test_dpf_static)
+DO_TEST (test_dpf_static32)
+DO_TEST (test_dpf_auto)
+DO_TEST (test_dpf_guided32)
+DO_TEST (test_dpf_runtime)
+DO_TEST (test_dpf_ds128_static)
+DO_TEST (test_dpf_ds128_static32)
+DO_TEST (test_dpf_ds128_auto)
+DO_TEST (test_dpf_ds128_guided32)
+DO_TEST (test_dpf_ds128_runtime)
+DO_TEST (test_dpfs_static)
+DO_TEST (test_dpfs_static32)
+DO_TEST (test_dpfs_auto)
+DO_TEST (test_dpfs_guided32)
+DO_TEST (test_dpfs_runtime)
+DO_TEST (test_dpfs_ds128_static)
+DO_TEST (test_dpfs_ds128_static32)
+DO_TEST (test_dpfs_ds128_auto)
+DO_TEST (test_dpfs_ds128_guided32)
+DO_TEST (test_dpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/for-5.c b/libgomp/testsuite/libgomp.c/for-5.c
new file mode 100644
index 000000000000..8f1257927310
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-5.c
@@ -0,0 +1,161 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || (1 <= TEST_NR && TEST_NR <= 5)
+#define F target parallel for
+#define G tpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 6
+#define F target simd
+#define G t_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (7 <= TEST_NR && TEST_NR <= 11)
+#define F target parallel for simd
+#define G tpf_simd
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 12
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 13
+#define F target teams distribute
+#define G ttd_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 14
+#define F target teams distribute simd
+#define G ttds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 15
+#define F target teams distribute simd
+#define G ttds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (16 <= TEST_NR && TEST_NR <= 20)
+#define F target teams distribute parallel for
+#define G ttdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (21 <= TEST_NR && TEST_NR <= 25)
+#define F target teams distribute parallel for dist_schedule(static, 128)
+#define G ttdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (26 <= TEST_NR && TEST_NR <= 30)
+#define F target teams distribute parallel for simd
+#define G ttdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (31 <= TEST_NR && TEST_NR <= 35)
+#define F target teams distribute parallel for simd dist_schedule(static, 128)
+#define G ttdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+int
+main ()
+{
+#define DO_TEST_1(test)				\
+  do {						\
+    if (test ())				\
+      abort ();					\
+  } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-5.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-5.list b/libgomp/testsuite/libgomp.c/for-5.list
new file mode 100644
index 000000000000..48d0c3aa7f6f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-5.list
@@ -0,0 +1,35 @@
+DO_TEST (test_tpf_static)
+DO_TEST (test_tpf_static32)
+DO_TEST (test_tpf_auto)
+DO_TEST (test_tpf_guided32)
+DO_TEST (test_tpf_runtime)
+DO_TEST (test_t_simd_normal)
+DO_TEST (test_tpf_simd_static)
+DO_TEST (test_tpf_simd_static32)
+DO_TEST (test_tpf_simd_auto)
+DO_TEST (test_tpf_simd_guided32)
+DO_TEST (test_tpf_simd_runtime)
+DO_TEST (test_ttd_normal)
+DO_TEST (test_ttd_ds128_normal)
+DO_TEST (test_ttds_normal)
+DO_TEST (test_ttds_ds128_normal)
+DO_TEST (test_ttdpf_static)
+DO_TEST (test_ttdpf_static32)
+DO_TEST (test_ttdpf_auto)
+DO_TEST (test_ttdpf_guided32)
+DO_TEST (test_ttdpf_runtime)
+DO_TEST (test_ttdpf_ds128_static)
+DO_TEST (test_ttdpf_ds128_static32)
+DO_TEST (test_ttdpf_ds128_auto)
+DO_TEST (test_ttdpf_ds128_guided32)
+DO_TEST (test_ttdpf_ds128_runtime)
+DO_TEST (test_ttdpfs_static)
+DO_TEST (test_ttdpfs_static32)
+DO_TEST (test_ttdpfs_auto)
+DO_TEST (test_ttdpfs_guided32)
+DO_TEST (test_ttdpfs_runtime)
+DO_TEST (test_ttdpfs_ds128_static)
+DO_TEST (test_ttdpfs_ds128_static32)
+DO_TEST (test_ttdpfs_ds128_auto)
+DO_TEST (test_ttdpfs_ds128_guided32)
+DO_TEST (test_ttdpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/for-6.c b/libgomp/testsuite/libgomp.c/for-6.c
new file mode 100644
index 000000000000..50a866a3cf7b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-6.c
@@ -0,0 +1,135 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#ifndef ONE_TEST
+#define TEST_ALL 1
+#else
+#define TEST_ALL 0
+#endif
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPTGT
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPTGT DO_PRAGMA (omp target)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#if TEST_ALL || TEST_NR == 1
+#define F teams distribute
+#define G td
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 2
+#define F teams distribute
+#define G td_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 3
+#define F teams distribute simd
+#define G tds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || TEST_NR == 4
+#define F teams distribute simd
+#define G tds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (5 <= TEST_NR && TEST_NR <= 9)
+#define F teams distribute parallel for
+#define G tdpf
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (10 <= TEST_NR && TEST_NR <= 14)
+#define F teams distribute parallel for dist_schedule(static, 128)
+#define G tdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (15 <= TEST_NR && TEST_NR <= 19)
+#define F teams distribute parallel for simd
+#define G tdpfs
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+#if TEST_ALL || (20 <= TEST_NR && TEST_NR <= 24)
+#define F teams distribute parallel for simd dist_schedule(static, 128)
+#define G tdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+#endif
+
+int
+main ()
+{
+#define DO_TEST_1(test)				\
+  do {						\
+    if (test ())				\
+      abort ();					\
+  } while (0)
+
+#ifdef ONE_TEST
+  DO_TEST_1 (ONE_TEST);
+#else
+#define DO_TEST(test) DO_TEST_1 (test);
+#include "for-6.list"
+#undef DO_TEST
+#endif
+#undef DO_TEST_1
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-6.list b/libgomp/testsuite/libgomp.c/for-6.list
new file mode 100644
index 000000000000..438ecff0a2f3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-6.list
@@ -0,0 +1,24 @@
+DO_TEST (test_td_normal)
+DO_TEST (test_td_ds128_normal)
+DO_TEST (test_tds_normal)
+DO_TEST (test_tds_ds128_normal)
+DO_TEST (test_tdpf_static)
+DO_TEST (test_tdpf_static32)
+DO_TEST (test_tdpf_auto)
+DO_TEST (test_tdpf_guided32)
+DO_TEST (test_tdpf_runtime)
+DO_TEST (test_tdpf_ds128_static)
+DO_TEST (test_tdpf_ds128_static32)
+DO_TEST (test_tdpf_ds128_auto)
+DO_TEST (test_tdpf_ds128_guided32)
+DO_TEST (test_tdpf_ds128_runtime)
+DO_TEST (test_tdpfs_static)
+DO_TEST (test_tdpfs_static32)
+DO_TEST (test_tdpfs_auto)
+DO_TEST (test_tdpfs_guided32)
+DO_TEST (test_tdpfs_runtime)
+DO_TEST (test_tdpfs_ds128_static)
+DO_TEST (test_tdpfs_ds128_static32)
+DO_TEST (test_tdpfs_ds128_auto)
+DO_TEST (test_tdpfs_ds128_guided32)
+DO_TEST (test_tdpfs_ds128_runtime)
diff --git a/libgomp/testsuite/libgomp.c/target-print-1.c b/libgomp/testsuite/libgomp.c/target-print-1.c
new file mode 100644
index 000000000000..5857b875ced8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-print-1.c
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works.  */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+int
+main ()
+{
+#pragma omp target
+    {
+      printf ("The answer is %d\n", var);
+    }
+}
diff --git a/libgomp/testsuite/libgomp.fortran/target-print-1.f90 b/libgomp/testsuite/libgomp.fortran/target-print-1.f90
new file mode 100644
index 000000000000..f4f00e233736
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-print-1.f90
@@ -0,0 +1,15 @@
+! Ensure that formatted output (write) on the offload device works.
+
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+  implicit none
+  integer :: var = 42
+
+!$omp target
+  write (0, '("The answer is ", I2)') var
+!$omp end target
+
+end program main
diff --git a/libgomp/testsuite/libgomp.oacc-c++/c++.exp b/libgomp/testsuite/libgomp.oacc-c++/c++.exp
index b8b44518b861..1285a6a1c6d6 100644
--- a/libgomp/testsuite/libgomp.oacc-c++/c++.exp
+++ b/libgomp/testsuite/libgomp.oacc-c++/c++.exp
@@ -111,6 +111,10 @@ if { $lang_test_file_found } {
 
 		set acc_mem_shared 0
 	    }
+	    amdgcn* {
+		set acc_mem_shared 0
+		set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+	    }
 	    default {
 		error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	    }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
index 842f2de4722d..6a24a2dde6a3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/atomic_capture-2.c
@@ -37,11 +37,9 @@ main(int argc, char **argv)
       imin = idata[i] < imin ? idata[i] : imin;
     }
 
-  if (imax != 1234 || imin != 0)
+  if (imax != 1234 || imin < 0 || imin > 1)
     abort ();
 
-  return 0;
-
   igot = 0;
   iexp = 32;
 
@@ -443,17 +441,16 @@ main(int argc, char **argv)
     }
   }
 
+  int ones = 0, zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (idata[i] != 1)
-	  abort ();
-      }
-    else
-      {
-	if (idata[i] != 0)
-	  abort ();
-      }
+    if (idata[i] == 1)
+      ones++;
+    else if (idata[i] == 0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (iexp != igot)
     abort ();
@@ -491,17 +488,16 @@ main(int argc, char **argv)
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (idata[i] != 0)
-	  abort ();
-      }
-    else
-      {
-	if (idata[i] != 1)
-	  abort ();
-      }
+    if (idata[i] == 1)
+      ones++;
+    else if (idata[i] == 0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (iexp != igot)
     abort ();
@@ -579,7 +575,7 @@ main(int argc, char **argv)
   if (lexp != lgot)
     abort ();
 
-  lgot = 2LL;
+  lgot = 2LL << N;
   lexp = 2LL;
 
 #pragma acc data copy (lgot, ldata[0:N])
@@ -587,7 +583,7 @@ main(int argc, char **argv)
 #pragma acc parallel loop
     for (i = 0; i < N; i++)
       {
-        long long expr = 1LL << N;
+        long long expr = 2LL;
 
 #pragma acc atomic capture
         { lgot = lgot / expr; ldata[i] = lgot; }
@@ -1450,17 +1446,16 @@ main(int argc, char **argv)
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (fdata[i] != 1.0)
-	  abort ();
-      }
-    else
-      {
-	if (fdata[i] != 0.0)
-	  abort ();
-      }
+    if (fdata[i] == 1.0)
+      ones++;
+    else if (fdata[i] == 0.0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (fexp != fgot)
     abort ();
@@ -1498,17 +1493,16 @@ main(int argc, char **argv)
       }
   }
 
+  ones = zeros = 0;
+
   for (i = 0; i < N; i++)
-    if (i % 2 == 0)
-      {
-	if (fdata[i] != 0.0)
-	  abort ();
-      }
-    else
-      {
-	if (fdata[i] != 1.0)
-	  abort ();
-      }
+    if (fdata[i] == 1.0)
+      ones++;
+    else if (fdata[i] == 0.0)
+      zeros++;
+
+  if (ones != N / 2 || zeros != N / 2)
+    abort ();
 
   if (fexp != fgot)
     abort ();
@@ -1569,7 +1563,7 @@ main(int argc, char **argv)
     abort ();
 
   fgot = 8192.0*8192.0*64.0;
-  fexp = 1.0;
+  fexp = fgot;
 
 #pragma acc data copy (fgot, fdata[0:N])
   {
@@ -1586,15 +1580,15 @@ main(int argc, char **argv)
   if (fexp != fgot)
     abort ();
 
-  fgot = 4.0;
-  fexp = 4.0;
+  fgot = 2.0 * (1LL << N);
+  fexp = 2.0;
 
 #pragma acc data copy (fgot, fdata[0:N])
   {
 #pragma acc parallel loop
     for (i = 0; i < N; i++)
       {
-        long long expr = 1LL << N;
+        long long expr = 2LL;
 
 #pragma acc atomic capture
         { fgot = fgot / expr; fdata[i] = fgot; }
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index 9642b3919424..05946c900f1a 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+   { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index 766e5782b463..5c8430120618 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -9,11 +9,13 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int gangsize, workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	    copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
 #pragma acc loop gang worker vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -32,6 +34,10 @@ int main ()
 	else
 	  ary[ix] = ix;
       }
+
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,11 +45,12 @@ int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * workersize * vectorsize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 0bec6e19510a..9c4a85f7b16b 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -8,8 +8,10 @@ int main ()
   int ix;
   int ondev = 0;
   int t = 0, h = 0;
-  
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
+  int gangsize, workersize, vectorsize;
+
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
+	copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
 #pragma acc loop gang worker vector reduction(+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -28,18 +30,22 @@ int main ()
 	  }
 	t += val;
       }
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
     {
       int val = ix;
-      if(ondev)
+      if (ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = ix / vectorsize % workersize;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
index da4921d15f98..1173c1f57bb8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c
@@ -9,8 +9,9 @@ int main ()
   int ix;
   int ondev = 0;
   int t = 0,  h = 0;
+  int vectorsize;
 
-#pragma acc parallel vector_length(32) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
   {
 #pragma acc loop vector reduction (+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -29,6 +30,7 @@ int main ()
 	  }
 	t += val;
       }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -38,7 +40,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
index 15e2bc2f83bf..84c2296a7b12 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c
@@ -9,8 +9,9 @@ int main ()
   int ix;
   int ondev = 0;
   int q = 0,  h = 0;
+  int vectorsize;
 
-#pragma acc parallel vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
   {
     int t = q;
     
@@ -32,6 +33,7 @@ int main ()
 	t += val;
       }
     q = t;
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -41,7 +43,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
index a1bb845987d1..7344fa8bf001 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c
@@ -9,8 +9,10 @@ int main ()
   int ix;
   int ondev = 0;
   int t = 0,  h = 0;
+  int workersize;
 
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize)
   {
 #pragma acc loop worker reduction(+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -29,6 +31,7 @@ int main ()
 	  }
 	t += val;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -37,7 +40,7 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  val = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
index ae43bb47d8f6..d99877ab8a9d 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c
@@ -9,8 +9,10 @@ int main ()
   int ix;
   int ondev = 0;
   int q = 0,  h = 0;
+  int workersize;
 
-#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
+	    copyout(workersize)
   {
     int t = q;
     
@@ -32,6 +34,7 @@ int main ()
 	t += val;
       }
     q = t;
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  val = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index 71d3969f7b63..c360ad11e7cb 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -8,8 +8,10 @@ int main ()
   int ix;
   int ondev = 0;
   int t = 0, h = 0;
+  int workersize, vectorsize;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
 #pragma acc loop worker vector reduction (+:t)
     for (unsigned ix = 0; ix < N; ix++)
@@ -28,6 +30,8 @@ int main ()
 	  }
 	t += val;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -36,8 +40,8 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
index 6010cd2498a6..8c858f305633 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c
@@ -9,11 +9,13 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
   {
 #pragma acc loop vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,7 @@ int main ()
 	else
 	  ary[ix] = ix;
       }
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
index 10b80f197de5..8731c805b798 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c
@@ -9,12 +9,14 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
-  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 16 } */
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
+  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "vector" { target *-*-* } 17 } */
   {
 #pragma acc loop worker
     for (unsigned ix = 0; ix < N; ix++)
@@ -32,6 +34,7 @@ int main ()
 	else
 	  ary[ix] = ix;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -40,7 +43,7 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  expected = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index cd4cc994b826..fd4e4cf5ea9c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -9,11 +9,13 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
 #pragma acc loop worker vector
     for (unsigned ix = 0; ix < N; ix++)
@@ -31,6 +33,8 @@ int main ()
 	else
 	  ary[ix] = ix;
       }
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -39,8 +43,8 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index c4430cecf71d..ec63e3fe2c91 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -12,7 +12,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
   else
     __builtin_abort ();
@@ -23,7 +24,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
   else
     __builtin_abort ();
@@ -34,7 +36,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
   else
     __builtin_abort ();
@@ -177,9 +180,8 @@ int main ()
 	if (vectors_actual != 32)
 	  __builtin_abort ();
       }
-    else
-      if (vectors_actual != 1)
-	__builtin_abort ();
+    else if (vectors_actual != 1)
+      __builtin_abort ();
     if (gangs_min != 0 || gangs_max != 0
 	|| workers_min != 0 || workers_max != 0
 	|| vectors_min != 0 || vectors_max != vectors_actual - 1)
@@ -325,6 +327,10 @@ int main ()
 	  /* We're actually executing with num_workers (32).  */
 	  /* workers_actual = 32; */
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  workers_actual = 4;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -404,6 +410,13 @@ int main ()
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  /* Because of the way vectors are implemented for GCN, a vector loop
+	     containing a seq routine call will not vectorize calls to that
+	     routine.  Hence, we'll only get one "vector".  */
+	  vectors_actual = 1;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
@@ -430,6 +443,9 @@ int main ()
        in the following case.  So, limit ourselves here.  */
     if (acc_get_device_type () == acc_device_nvidia)
       gangs = 3;
+    /* The same appears to be true for GCN.  */
+    if (acc_get_device_type () == acc_device_gcn)
+      gangs = 3;
     int gangs_actual = gangs;
 #define WORKERS 3
     int workers_actual = WORKERS;
@@ -456,6 +472,11 @@ int main ()
 	  /* The GCC nvptx back end enforces vector_length (32).  */
 	  vectors_actual = 32;
 	}
+      else if (acc_on_device (acc_device_gcn))
+	{
+	  /* See above comments about GCN vectors_actual.  */
+	  vectors_actual = 1;
+	}
       else
 	__builtin_abort ();
 #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c
new file mode 100644
index 000000000000..51fb394a0b13
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-variables-2.c
@@ -0,0 +1,217 @@
+#include <assert.h>
+
+/* Worker propagation: plain scalar variables.  */
+/* X is assigned outside the worker loops and read inside both of them.  */
+void
+worker_bcast_1 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;			/* Known start value, checked below.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+  {
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int j;
+	int x = (i ^ 3) * 3;	/* Value read by the first worker loop.  */
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x * j;
+
+	x = (i | 5) * 5;	/* New value for the second worker loop.  */
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += x * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;	/* Same index the device loops updated.  */
+	assert (arr[idx] == idx + (i ^ 3) * 3 * j + (i | 5) * 5 * j);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int	/* noinline: stays a real call.  */
+select_var (int s, int x, int y)
+{
+  if (s)
+    return x;			/* S nonzero: pick X.  */
+  else
+    return y;			/* S zero: pick Y.  */
+}
+
+/* Worker propagation: scalars through function calls.  */
+/* X and Y are passed by value to select_var inside the worker loops.  */
+void
+worker_bcast_2 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;			/* Known start value, checked below.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int x, y, z;		/* Z is never used.  */
+
+	x = i * 5;		/* Set outside both loops below.  */
+	y = i * 7;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_var (j & 1, x, y) * j;
+
+	#pragma acc loop worker vector
+	for (j = 0; j < 32; j++)	/* Call below swaps X and Y.  */
+	  arr[i * 32 + j] += select_var (j & 1, y, x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;	/* Same index the device loops updated.  */
+	int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+		     + ((j & 1) ? i * 7 : i * 5) * j;
+        assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int	/* noinline: stays a real call.  */
+select_addr (int s, int *x, int *y)
+{
+  if (s)
+    return *x;			/* S nonzero: load through X.  */
+  else
+    return *y;			/* S zero: load through Y.  */
+}
+
+/* Worker propagation: addresses of locals through function calls.  */
+/* &X and &Y are passed to select_addr, which loads through one of them.  */
+void
+worker_bcast_3 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;			/* Known start value, checked below.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32) vector_length(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+        int x, y, z;		/* Z is never used.  */
+
+	x = i * 5;		/* Set outside both loops below.  */
+	y = i * 7;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += select_addr (j & 1, &x, &y) * j;
+
+	#pragma acc loop worker vector
+	for (j = 0; j < 32; j++)	/* Call below swaps &X and &Y.  */
+	  arr[i * 32 + j] += select_addr (j & 1, &y, &x) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;	/* Same index the device loops updated.  */
+	int answer = idx + ((j & 1) ? i * 5 : i * 7) * j
+		     + ((j & 1) ? i * 7 : i * 5) * j;
+	assert (arr[idx] == answer);
+      }
+}
+
+#pragma acc routine seq
+__attribute__((noinline)) static int *	/* noinline: stays a real call.  */
+select_ptr (int s, int *x, int *y)
+{
+  if (s)
+    return x;			/* S nonzero: pointer X itself.  */
+  else
+    return y;			/* S zero: pointer Y itself.  */
+}
+
+/* Worker propagation: writes through pointers.  */
+/* R aliases X or Y and is written through between the two worker loops.  */
+void
+worker_bcast_4 (void)
+{
+  int i, arr[32 * 32];
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;			/* Known start value, checked below.  */
+
+  #pragma acc parallel copy(arr) num_gangs(32) num_workers(32)
+  {
+    int j;
+
+    #pragma acc loop gang
+    for (i = 0; i < 32; i++)
+      {
+	int x, y, z;		/* Z is never used.  */
+	int *p, *q, *r;
+
+	p = &x;			/* *P reads X.  */
+	q = &y;			/* *Q reads Y.  */
+
+	x = i * 5;
+	y = i * 7;
+	r = select_ptr (i & 1, p, q);	/* P when I is odd, otherwise Q.  */
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+
+	/* This write can affect either x or y: both should be broadcast into
+	   the next loop.  */
+	(*r) += 20;
+
+	#pragma acc loop worker
+	for (j = 0; j < 32; j++)
+	  arr[i * 32 + j] += (x + y + 2 * (*p) + (*q)) * j;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++)
+      {
+	int idx = i * 32 + j;	/* Same index the device loops updated.  */
+	int x = i * 5, y = i * 7;	/* Values before the += 20.  */
+	int answer = idx + (3 * x + 2 * y) * j
+		     + ((i & 1) ? (3 * (x + 20) + 2 * y)
+				: (3 * x + 2 * (y + 20))) * j;
+	assert (arr[idx] == answer);
+      }
+}
+
+/* Run the four worker-propagation tests in order.  */
+int main ()
+{
+  worker_bcast_1 ();
+  worker_bcast_2 ();
+  worker_bcast_3 ();
+  worker_bcast_4 ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index a97e046b687f..da13d84908a8 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -30,14 +30,18 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int gangsize, workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     gang (ary);
+    gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -45,11 +49,12 @@ int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + 32*32*32 - 1) / (32*32*32);
+	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
+			   / (gangsize * workersize * vectorsize);
 	  
-	  int g = ix / (chunk_size * 32 * 32);
-	  int w = ix / 32 % 32;
-	  int v = ix % 32;
+	  int g = ix / (chunk_size * vectorsize * workersize);
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
index b1e3e3a596af..dd7bb6cdcd1e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-v-1.c
@@ -30,14 +30,17 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
+	    copyout(vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     vector (ary);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -47,7 +50,7 @@ int main ()
 	{
 	  int g = 0;
 	  int w = 0;
-	  int v = ix % 32;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
index e14947cdf94f..bf9228105c5f 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-w-1.c
@@ -31,14 +31,17 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize)
   {
     ondev = acc_on_device (acc_device_not_host);
     worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -47,7 +50,7 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = ix % 32;
+	  int w = ix % workersize;
 	  int v = 0;
 
 	  expected = (g << 16) | (w << 8) | v;
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 23dbc1ae4014..73696e4e59a3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -30,14 +30,18 @@ int main ()
   int ix;
   int exit = 0;
   int ondev = 0;
+  int workersize, vectorsize;
 
   for (ix = 0; ix < N;ix++)
     ary[ix] = -1;
   
-#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev)
+#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
+	    copyout(workersize, vectorsize)
   {
     ondev = acc_on_device (acc_device_not_host);
     worker (ary);
+    workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
+    vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
   }
 
   for (ix = 0; ix < N; ix++)
@@ -46,8 +50,8 @@ int main ()
       if(ondev)
 	{
 	  int g = 0;
-	  int w = (ix / 32) % 32;
-	  int v = ix % 32;
+	  int w = (ix / vectorsize) % workersize;
+	  int v = ix % vectorsize;
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
index 886214843f14..b469eedd1bcd 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c
@@ -2,8 +2,14 @@
 #include <openacc.h>
 #include <gomp-constants.h>
 
+#ifdef ACC_DEVICE_TYPE_gcn
+/* FIXME: Max. number of workers may increase for GCN in the future.  */
+#define NUM_WORKERS 4
+#define NUM_VECTORS 1
+#else
 #define NUM_WORKERS 16
 #define NUM_VECTORS 32
+#endif
 #define WIDTH 64
 #define HEIGHT 32
 
@@ -37,7 +43,8 @@ int DoWorkVec (int nw)
       ary[ix][jx] = 0xdeadbeef;
 
   printf ("spawning %d ...", nw); fflush (stdout);
-  
+
+/* { dg-warning "region contains vector partitioned code but is not vector partitioned" "vector" { target openacc_amdgcn_accel_selected } 48 } */
 #pragma acc parallel num_workers(nw) vector_length (NUM_VECTORS) copy (ary)
   {
     WorkVec ((int *)ary, WIDTH, HEIGHT, nw, NUM_VECTORS);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c
index dc6e1eac938d..d4692091b84e 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/serial-dims.c
@@ -11,7 +11,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
   else
     __builtin_abort ();
@@ -22,7 +23,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
   else
     __builtin_abort ();
@@ -33,7 +35,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+	   || acc_on_device ((int) acc_device_gcn))
     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
   else
     __builtin_abort ();
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
index 5130591dd818..076e3cd75fe3 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/tile-1.c
@@ -1,3 +1,6 @@
+/* AMD GCN does not use 32-lane vectors.
+   { dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
+
 /* { dg-additional-options "-fopenacc-dim=32" } */
 
 #include <stdio.h>
diff --git a/libgomp/testsuite/libgomp.oacc-c/c.exp b/libgomp/testsuite/libgomp.oacc-c/c.exp
index aba6665b5cb0..f7005ebba48f 100644
--- a/libgomp/testsuite/libgomp.oacc-c/c.exp
+++ b/libgomp/testsuite/libgomp.oacc-c/c.exp
@@ -72,6 +72,10 @@ foreach offload_target [concat [split $offload_targets ":"] "disable"] {
 
 	    set acc_mem_shared 0
 	}
+	amdgcn* {
+	    set acc_mem_shared 0
+	    set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+	}
 	default {
 	    error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c b/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c
index b62a587ec08f..7265f48deb94 100644
--- a/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c/offload-targets-1.c
@@ -26,6 +26,9 @@ int main ()
 #if defined ACC_DEVICE_TYPE_nvidia
   offload_target_requested = ACC_DEVICE_TYPE_nvidia;
   acc_device_type_requested = acc_device_nvidia;
+#elif defined ACC_DEVICE_TYPE_gcn
+  offload_target_requested = ACC_DEVICE_TYPE_gcn;
+  acc_device_type_requested = acc_device_gcn;
 #elif defined ACC_DEVICE_TYPE_host
   offload_target_requested = ACC_DEVICE_TYPE_host;
   acc_device_type_requested = acc_device_host;
diff --git a/libgomp/testsuite/libgomp.oacc-c/print-1.c b/libgomp/testsuite/libgomp.oacc-c/print-1.c
new file mode 100644
index 000000000000..593885b5c2cd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c/print-1.c
@@ -0,0 +1,17 @@
+/* Ensure that printf on the offload device works.  */
+
+/* { dg-do run } */
+/* { dg-output "The answer is 42(\n|\r\n|\r)+" } */
+
+#include <stdio.h>
+
+int var = 42;
+
+int
+main ()
+{
+#pragma acc parallel
+    {
+      printf ("The answer is %d\n", var);	/* Text matched by dg-output.  */
+    }
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
index 5a4a1e03f64a..9dd2339509ca 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/atomic_capture-1.f90
@@ -275,7 +275,7 @@ program main
   if (ltmp .neqv. .not. lexp) STOP 33
   if (lgot .neqv. lexp) STOP 34
 
-  igot = 1
+  igot = 0
   iexp = N
 
   !$acc parallel loop copy (igot, itmp)
@@ -287,12 +287,16 @@ program main
     end do
   !$acc end parallel loop
 
+  itmp = 0
+  do i = 1, N
+     if (iarr(i) == 0 .and. itmp == 0) itmp = i
+  end do
   do i = 1, N
-     if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 35
+     if (iarr(i) == 0 .and. i /= itmp) STOP 35
   end do
   if (igot /= iexp) STOP 36
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -304,8 +308,12 @@ program main
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 37
+     if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+      if (iarr(i) == N + 1 .and. i /= itmp) STOP 37
   end do
   if (igot /= iexp) STOP 38
 
@@ -314,7 +322,7 @@ program main
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       iarr(i) = igot
       igot = iand (igot, iexpr)
@@ -322,9 +330,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 39
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 39
   if (igot /= iexp) STOP 40
 
   igot = 0
@@ -340,10 +352,14 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= 0)) STOP 41
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 42
+  if (igot /= iexp) STOP 41
+  if (itmp /= 496) STOP 42
 
   igot = -1
   iexp = 0 
@@ -358,12 +374,16 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 43
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 44
+  if (igot /= iexp) STOP 43
+  if (itmp /= 528) STOP 44
 
-  igot = 1
+  igot = 0
   iexp = N
 
   !$acc parallel loop copy (igot, itmp)
@@ -375,12 +395,16 @@ program main
     end do
   !$acc end parallel loop
 
+  itmp = 0
+  do i = 1, N
+     if (iarr(i) == 0 .and. itmp == 0) itmp = i
+  end do
   do i = 1, N
-     if (.not. (1 <= iarr(i) .and. iarr(i) < iexp)) STOP 45
+     if (iarr(i) == 0 .and. itmp /= i) STOP 45
   end do
   if (igot /= iexp) STOP 46
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -392,8 +416,12 @@ program main
     end do
   !$acc end parallel loop
 
+  itmp = 0
   do i = 1, N
-     if (.not. (iarr(i) == 1 .or. iarr(i) == N)) STOP 47
+     if (iarr(i) == N + 1 .and. itmp == 0) itmp = i
+  end do
+  do i = 1, N
+      if (iarr(i) == N + 1 .and. i /= itmp) STOP 47
   end do
   if (igot /= iexp) STOP 48
 
@@ -402,7 +430,7 @@ program main
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       iarr(i) = igot
       igot = iand (iexpr, igot)
@@ -410,14 +438,18 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 49
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 49
   if (igot /= iexp) STOP 50
 
   igot = 0
   iexp = -1 
-	!!
+
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
       iexpr = lshift (1, i)
@@ -428,10 +460,14 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= 0)) STOP 51
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 52
+  if (igot /= iexp) STOP 51
+  if (itmp /= 496) STOP 52
 
   igot = -1
   iexp = 0 
@@ -446,10 +482,14 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) < 0)) STOP 53
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
-  if (igot /= iexp) STOP 54
+  if (igot /= iexp) STOP 53
+  if (itmp /= 528) STOP 54
 
   fgot = 1234.0
   fexp = 1266.0
@@ -720,7 +760,7 @@ program main
   end do
   if (igot /= iexp) STOP 88
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -733,7 +773,7 @@ program main
   !$acc end parallel loop
 
   do i = 1, N
-     if (.not. (iarr(i) == iexp)) STOP 89
+     if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 89
   end do
   if (igot /= iexp) STOP 90
 
@@ -742,7 +782,7 @@ program main
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       igot = iand (igot, iexpr)
       iarr(i) = igot
@@ -750,9 +790,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 91
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 91
   if (igot /= iexp) STOP 92
 
   igot = 0
@@ -768,9 +812,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= -1)) STOP 93
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 93
   if (igot /= iexp) STOP 94
 
   igot = -1
@@ -786,9 +834,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 95
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 95
   if (igot /= iexp) STOP 96
 
   igot = 1
@@ -808,7 +860,7 @@ program main
   end do
   if (igot /= iexp) STOP 98
 
-  igot = N
+  igot = N + 1
   iexp = 1
 
   !$acc parallel loop copy (igot, itmp)
@@ -821,7 +873,7 @@ program main
   !$acc end parallel loop
 
   do i = 1, N
-     if (.not. (iarr(i) == iexp )) STOP 99
+     if (iarr(i) .lt. 1 .or. iarr(i) .gt. N) STOP 99
   end do
   if (igot /= iexp) STOP 100
 
@@ -830,7 +882,7 @@ program main
 
   !$acc parallel loop copy (igot, itmp)
     do i = 0, N - 1
-      iexpr = ibclr (-2, i)
+      iexpr = ibclr (-1, i)
   !$acc atomic capture
       igot = iand (iexpr, igot)
       iarr(i) = igot
@@ -838,9 +890,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= 0)) STOP 101
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 101
   if (igot /= iexp) STOP 102
 
   igot = 0
@@ -856,9 +912,13 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) >= iexp)) STOP 103
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 528) STOP 103
   if (igot /= iexp) STOP 104
 
   igot = -1
@@ -874,9 +934,12 @@ program main
     end do
   !$acc end parallel loop
 
-  do i = 1, N
-     if (.not. (iarr(i - 1) <= iexp)) STOP 105
+  itmp = 0
+  do i = 0, N - 1
+     do j = 0, N - 1
+        if (btest (iarr(i), j)) itmp = itmp + 1
+     end do
   end do
+  if (itmp /= 496) STOP 105
   if (igot /= iexp) STOP 106
-
 end program
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
index 918c5d0d5b1c..659e5db5d0bd 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/collapse-1.f90
@@ -6,7 +6,7 @@ program collapse1
   l = .false.
   a(:, :, :) = 0
   !$acc parallel
-  !$acc loop collapse(4 - 1)
+  !$acc loop collapse(4 - 1) gang(static:*)
     do i = 1, 3
       do j = 4, 6
         do k = 5, 7
@@ -14,7 +14,7 @@ program collapse1
         end do
       end do
     end do
-  !$acc loop collapse(2) reduction(.or.:l)
+  !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
     do i = 1, 3
       do j = 4, 6
         do k = 5, 7
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
index 98b6987750ec..409d9794e00c 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
+++ b/libgomp/testsuite/libgomp.oacc-fortran/collapse-2.f90
@@ -7,13 +7,13 @@ program collapse2
   l = .false.
   a(:, :, :) = 0
   !$acc parallel
-  !$acc loop collapse(4 - 1)
+  !$acc loop collapse(4 - 1) gang(static:*)
     do 164 i = 1, 3
       do 164 j = 4, 6
         do 164 k = 5, 7
           a(i, j, k) = i + j + k
 164      end do
-  !$acc loop collapse(2) reduction(.or.:l)
+  !$acc loop collapse(3) gang(static:*) reduction(.or.:l)
 firstdo: do i = 1, 3
       do j = 4, 6
         do k = 5, 7
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
index 4965e674c27b..95810a6ae93c 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-1.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP (\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
index 7103fdb5d8ed..ce59bbda3c37 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-2.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP 35(\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
index 9c217f14ea1b..9b606c83ad94 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
+++ b/libgomp/testsuite/libgomp.oacc-fortran/error_stop-3.f
@@ -15,6 +15,6 @@
 ! { dg-output "ERROR STOP SiGN(\n|\r\n|\r)+" }
 ! PR85463.  The "minimal" libgfortran implementation used with nvptx
 ! offloading is a little bit different.
-! { dg-output "Error termination.*" { target { ! openacc_nvidia_accel_selected } } }
+! { dg-output "Error termination.*" { target { { ! openacc_nvidia_accel_selected } && { ! openacc_amdgcn_accel_selected } } } }
 ! { dg-output "libgomp: cuStreamSynchronize error.*" { target openacc_nvidia_accel_selected } }
 ! { dg-shouldfail "" }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
index 56694a6eef55..0f8d1db48951 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
+++ b/libgomp/testsuite/libgomp.oacc-fortran/fortran.exp
@@ -90,6 +90,10 @@ if { $lang_test_file_found } {
 
 		set acc_mem_shared 0
 	    }
+	    amdgcn* {
+		set acc_mem_shared 0
+		set tagopt "-DACC_DEVICE_TYPE_gcn=\"$offload_target_openacc\""
+	    }
 	    default {
 		error "Unknown OpenACC device type: $openacc_device_type (offload target: $offload_target)"
 	    }
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90
new file mode 100644
index 000000000000..9a94195b6f11
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/print-1.f90
@@ -0,0 +1,15 @@
+! Ensure that a formatted write on the offload device works: the statement is
+! executed inside an OpenACC parallel region.
+! { dg-do run }
+! { dg-output "The answer is 42(\n|\r\n|\r)+" }
+! { dg-xfail-if "no write for nvidia" { openacc_nvidia_accel_selected } }
+
+program main
+  implicit none
+  integer :: var = 42
+
+!$acc parallel
+  write (0, '("The answer is ", I2)') var
+!$acc end parallel
+
+end program main