amdgcn, libgomp: Manually allocated stacks

author Andrew Stubbs <ams@codesourcery.com>

Thu, 1 Dec 2022 17:30:21 +0000 (17:30 +0000)

committer Andrew Stubbs <ams@codesourcery.com>

Mon, 6 Feb 2023 17:54:24 +0000 (17:54 +0000)
author Andrew Stubbs <ams@codesourcery.com>
Thu, 1 Dec 2022 17:30:21 +0000 (17:30 +0000)
committer Andrew Stubbs <ams@codesourcery.com>
Mon, 6 Feb 2023 17:54:24 +0000 (17:54 +0000)
diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp

index 2ec9724274c334069e2e3edfc47e40c93457e9a1..f7d0c439245f521950ef4e3506f06aa9bc13f325 100644 (file)
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,23 @@
+2023-02-06  Andrew Stubbs  <ams@codesourcery.com>
+
+       * config/gcn/gcn-run.cc: Include libgomp-gcn.h.
+       (struct kernargs): Replace the common content with kernargs_abi.
+       (struct heap): Delete.
+       (main): Read GCN_STACK_SIZE envvar.
+       Allocate space for the device stacks.
+       Write the new kernargs fields.
+       * config/gcn/gcn.cc (gcn_option_override): Remove stack_size_opt.
+       (default_requested_args): Remove PRIVATE_SEGMENT_BUFFER_ARG and
+       PRIVATE_SEGMENT_WAVE_OFFSET_ARG.
+       (gcn_addr_space_convert): Mask the QUEUE_PTR_ARG content.
+       (gcn_expand_prologue): Move the TARGET_PACKED_WORK_ITEMS to the top.
+       Set up the stacks from the values in the kernargs, not private.
+       (gcn_expand_builtin_1): Match the stack configuration in the prologue.
+       (gcn_hsa_declare_function_name): Turn off the private segment.
+       (gcn_conditional_register_usage): Ensure QUEUE_PTR is fixed.
+       * config/gcn/gcn.h (FIXED_REGISTERS): Fix the QUEUE_PTR register.
+       * config/gcn/gcn.opt (mstack-size): Change the description.
+
  2023-02-02  Paul-Antoine Arras  <pa@codesourcery.com>
  
         Backported from master:
diff --git a/gcc/config/gcn/gcn-run.cc b/gcc/config/gcn/gcn-run.cc

index f0d816a4f7a58e975321c14c71dd4e700a0c11af..6da769685bf40e3e0fd93855481172e03f9f9bc3 100644 (file)
--- a/gcc/config/gcn/gcn-run.cc
+++ b/gcc/config/gcn/gcn-run.cc
@@ -35,6 +35,7 @@
  #include <signal.h>
  
  #include "hsa.h"
+#include "../../../libgomp/config/gcn/libgomp-gcn.h"
  
  #ifndef HSA_RUNTIME_LIB
  #define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
@@ -487,39 +488,16 @@ device_malloc (size_t size, hsa_region_t region)
     automatically assign the exit value to *return_value.  */
  struct kernargs
  {
-  /* Kernargs.  */
-  int32_t argc;
-  int64_t argv;
-  int64_t out_ptr;
-  int64_t heap_ptr;
-
-  /* Output data.  */
-  struct output
-  {
-    int return_value;
-    unsigned int next_output;
-    struct printf_data
-    {
-      int written;
-      char msg[128];
-      int type;
-      union
-      {
-       int64_t ivalue;
-       double dvalue;
-       char text[128];
-      };
-    } queue[1024];
-    unsigned int consumed;
-  } output_data;
+  union {
+    struct {
+      int32_t argc;
+      int64_t argv;
+    } args;
+    struct kernargs_abi abi;
+  };
+  struct output output_data;
  };
  
-struct heap
-{
-  int64_t size;
-  char data[0];
-} heap;
-
  /* Print any console output from the kernel.
     We print all entries from "consumed" to the next entry without a "written"
     flag, or "next_output" is reached.  The buffer is circular, but the
@@ -687,6 +665,16 @@ main (int argc, char *argv[])
    for (int i = 0; i < kernel_argc; i++)
      args_size += strlen (kernel_argv[i]) + 1;
  
+  /* The device stack can be adjusted via an environment variable.  */
+  char *envvar = getenv ("GCN_STACK_SIZE");
+  int stack_size = 1 * 1024 * 1024;  /* 1MB default.  */
+  if (envvar)
+    {
+      int val = atoi (envvar);
+      if (val)
+       stack_size = val;
+    }
+
    /* Allocate device memory for both function parameters and the argv
       data.  */
    struct kernargs *kernargs = device_malloc (sizeof (*kernargs),
@@ -702,11 +690,12 @@ main (int argc, char *argv[])
    XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device,
                                             HSA_ACCESS_PERMISSION_RW),
         "Assign heap to device agent");
+  void *stack = device_malloc (stack_size, heap_region);
  
    /* Write the data to the target.  */
-  kernargs->argc = kernel_argc;
-  kernargs->argv = (int64_t) args->argv_data;
-  kernargs->out_ptr = (int64_t) &kernargs->output_data;
+  kernargs->args.argc = kernel_argc;
+  kernargs->args.argv = (int64_t) args->argv_data;
+  kernargs->abi.out_ptr = (int64_t) &kernargs->output_data;
    kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */
    kernargs->output_data.next_output = 0;
    for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue)
@@ -721,8 +710,11 @@ main (int argc, char *argv[])
        memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1);
        offset += arg_len;
      }
-  kernargs->heap_ptr = (int64_t) heap;
+  kernargs->abi.heap_ptr = (int64_t) heap;
    hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size));
+  kernargs->abi.arena_ptr = 0;
+  kernargs->abi.stack_ptr = (int64_t) stack;
+  kernargs->abi.stack_size_per_thread = stack_size;
  
    /* Run constructors on the GPU.  */
    run (init_array_kernel, kernargs);
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc

index b9a18613e1589b3660ad2384268daa640b76a6a4..0b21dbd256e578b520897cc82914af287613eb32 100644 (file)
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -144,21 +144,6 @@ gcn_option_override (void)
        : ISA_UNKNOWN);
    gcc_assert (gcn_isa != ISA_UNKNOWN);
  
-  /* The default stack size needs to be small for offload kernels because
-     there may be many, many threads.  Also, a smaller stack gives a
-     measureable performance boost.  But, a small stack is insufficient
-     for running the testsuite, so we use a larger default for the stand
-     alone case.  */
-  if (stack_size_opt == -1)
-    {
-      if (flag_openacc || flag_openmp)
-       /* 512 bytes per work item = 32kB total.  */
-       stack_size_opt = 512 * 64;
-      else
-       /* 1MB total.  */
-       stack_size_opt = 1048576;
-    }
-
    /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and
       worker broadcasts.  */
    if (gang_private_size_opt == -1)
@@ -239,11 +224,9 @@ static const struct gcn_kernel_arg_type
  };
  
  static const long default_requested_args
-       = (1 << PRIVATE_SEGMENT_BUFFER_ARG)
-         | (1 << DISPATCH_PTR_ARG)
+       = (1 << DISPATCH_PTR_ARG)
           | (1 << QUEUE_PTR_ARG)
           | (1 << KERNARG_SEGMENT_PTR_ARG)
-         | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG)
           | (1 << WORKGROUP_ID_X_ARG)
           | (1 << WORK_ITEM_ID_X_ARG)
           | (1 << WORK_ITEM_ID_Y_ARG)
@@ -1876,10 +1859,14 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
  
    if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
      {
-      rtx queue = gen_rtx_REG (DImode,
-                              cfun->machine->args.reg[QUEUE_PTR_ARG]);
+      /* The high bits of the QUEUE_PTR_ARG register are used by
+        GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out.  */
+      rtx queue_reg = gen_rtx_REG (DImode,
+                                  cfun->machine->args.reg[QUEUE_PTR_ARG]);
+      rtx queue_ptr = gen_reg_rtx (DImode);
+      emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff)));
        rtx group_seg_aperture_hi = gen_rtx_MEM (SImode,
-                                    gen_rtx_PLUS (DImode, queue,
+                                    gen_rtx_PLUS (DImode, queue_ptr,
                                                    gen_int_mode (64, SImode)));
        rtx tmp = gen_reg_rtx (DImode);
  
@@ -2532,6 +2519,11 @@ gcn_conditional_register_usage (void)
        fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1;
        fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1;
      }
+  if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
+    {
+      fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG]] = 1;
+      fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG] + 1] = 1;
+    }
    if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0)
      fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1;
    if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0)
@@ -3377,10 +3369,56 @@ gcn_expand_prologue ()
      }
    else
      {
-      rtx wave_offset = gen_rtx_REG (SImode,
-                                    cfun->machine->args.
-                                    reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
+      if (TARGET_PACKED_WORK_ITEMS)
+       {
+         /* v0 conatins the X, Y and Z dimensions all in one.
+            Expand them out for ABI compatibility.  */
+         /* TODO: implement and use zero_extract.  */
+         rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+         emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+                                   gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
+         emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
+         emit_insn (gen_prologue_use (v1));
+
+         rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
+         emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+                                   gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
+         emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
+         emit_insn (gen_prologue_use (v2));
+       }
+
+      /* We no longer use the private segment for the stack (it's not
+        accessible to reverse offload), so we must calculate a wave offset
+        from the grid dimensions and stack size, which is calculated on the
+        host, and passed in the kernargs region.
+        See libgomp-gcn.h for details.  */
+      rtx wave_offset = gen_rtx_REG (SImode, FIRST_PARM_REG);
+
+      rtx num_waves_mem = gcn_oacc_dim_size (1);
+      rtx num_waves = gen_rtx_REG (SImode, FIRST_PARM_REG+1);
+      set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
+      emit_move_insn (num_waves, num_waves_mem);
+
+      rtx workgroup_num = gcn_oacc_dim_pos (0);
+      rtx wave_num = gen_rtx_REG (SImode, FIRST_PARM_REG+2);
+      emit_move_insn(wave_num, gcn_oacc_dim_pos (1));
  
+      rtx thread_id = gen_rtx_REG (SImode, FIRST_PARM_REG+3);
+      emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
+      emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
+
+      rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
+                                    [KERNARG_SEGMENT_PTR_ARG]);
+      rtx stack_size_mem = gen_rtx_MEM (SImode,
+                                       gen_rtx_PLUS (DImode, kernarg_reg,
+                                                     GEN_INT (52)));
+      set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
+      emit_move_insn (wave_offset, stack_size_mem);
+
+      emit_insn (gen_mulsi3 (wave_offset, wave_offset, thread_id));
+
+      /* The FLAT_SCRATCH_INIT is not usually needed, but can be enabled
+        via the function attributes.  */
        if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG))
         {
           rtx fs_init_lo =
@@ -3417,10 +3455,12 @@ gcn_expand_prologue ()
        HOST_WIDE_INT sp_adjust = (offsets->local_vars
                                  + offsets->outgoing_args_size);
  
-      /* Initialise FP and SP from the buffer descriptor in s[0:3].  */
-      emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0));
-      emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1),
-                                gen_int_mode (0xffff, SImode)));
+      /* Initialize FP and SP from space allocated on the host.  */
+      rtx stack_addr_mem = gen_rtx_MEM (DImode,
+                                       gen_rtx_PLUS (DImode, kernarg_reg,
+                                                     GEN_INT (40)));
+      set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
+      emit_move_insn (fp, stack_addr_mem);
        rtx scc = gen_rtx_REG (BImode, SCC_REG);
        emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc));
        emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc));
@@ -3476,25 +3516,6 @@ gcn_expand_prologue ()
      emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
    }
  
-  if (TARGET_PACKED_WORK_ITEMS
-      && cfun && cfun->machine && !cfun->machine->normal_function)
-  {
-    /* v0 conatins the X, Y and Z dimensions all in one.
-       Expand them out for ABI compatibility.  */
-    /* TODO: implement and use zero_extract.  */
-    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
-    emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
-              gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
-    emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
-    emit_insn (gen_prologue_use (v1));
-
-    rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
-    emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
-              gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
-    emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
-    emit_insn (gen_prologue_use (v2));
-  }
-
    if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
      {
        /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel.  */
@@ -4537,26 +4558,53 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
            cf. struct hsa_kernel_dispatch_packet_s in the HSA doc.  */
         rtx ptr;
         if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
-           && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+           && cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0)
           {
-           rtx size_rtx = gen_rtx_REG (DImode,
-                            cfun->machine->args.reg[DISPATCH_PTR_ARG]);
-           size_rtx = gen_rtx_MEM (SImode,
-                                   gen_rtx_PLUS (DImode, size_rtx,
-                                                 GEN_INT (6*2 + 3*4)));
-           size_rtx = gen_rtx_MULT (SImode, size_rtx, GEN_INT (64));
-
-           ptr = gen_rtx_REG (DImode,
-                   cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
-           ptr = gen_rtx_AND (DImode, ptr, GEN_INT (0x0000ffffffffffff));
-           ptr = gen_rtx_PLUS (DImode, ptr, size_rtx);
-           if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
-             {
-               rtx off;
-               off = gen_rtx_REG (SImode,
-                     cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
-               ptr = gen_rtx_PLUS (DImode, ptr, off);
-             }
+           rtx num_waves_mem = gcn_oacc_dim_size (1);
+           rtx num_waves = gen_reg_rtx (SImode);
+           set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT);
+           emit_move_insn (num_waves, num_waves_mem);
+
+           rtx workgroup_num = gcn_oacc_dim_pos (0);
+           rtx wave_num = gen_reg_rtx (SImode);
+           emit_move_insn(wave_num, gcn_oacc_dim_pos (1));
+
+           rtx thread_id = gen_reg_rtx (SImode);
+           emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num));
+           emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num));
+
+           rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
+                                          [KERNARG_SEGMENT_PTR_ARG]);
+           rtx stack_size_mem = gen_rtx_MEM (SImode,
+                                             gen_rtx_PLUS (DImode,
+                                                           kernarg_reg,
+                                                           GEN_INT (52)));
+           set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT);
+           rtx stack_size = gen_reg_rtx (SImode);
+           emit_move_insn (stack_size, stack_size_mem);
+
+           rtx wave_offset = gen_reg_rtx (SImode);
+           emit_insn (gen_mulsi3 (wave_offset, stack_size, thread_id));
+
+           rtx stack_limit_offset = gen_reg_rtx (SImode);
+           emit_insn (gen_addsi3 (stack_limit_offset, wave_offset,
+                                  stack_size));
+
+           rtx stack_limit_offset_di = gen_reg_rtx (DImode);
+           emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 4),
+                           const0_rtx);
+           emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 0),
+                           stack_limit_offset);
+
+           rtx stack_addr_mem = gen_rtx_MEM (DImode,
+                                             gen_rtx_PLUS (DImode,
+                                                           kernarg_reg,
+                                                           GEN_INT (40)));
+           set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT);
+           rtx stack_addr = gen_reg_rtx (DImode);
+           emit_move_insn (stack_addr, stack_addr_mem);
+
+           ptr = gen_rtx_PLUS (DImode, stack_addr, stack_limit_offset_di);
           }
         else
           {
@@ -4584,11 +4632,11 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
            whether it was the first call.  */
         rtx result = gen_reg_rtx (BImode);
         emit_move_insn (result, const0_rtx);
-       if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+       if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0)
           {
             rtx not_first = gen_label_rtx ();
             rtx reg = gen_rtx_REG (DImode,
-                       cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
+                       cfun->machine->args.reg[QUEUE_PTR_ARG]);
             reg = gcn_operand_part (DImode, reg, 1);
             rtx cmp = force_reg (SImode,
                                  gen_rtx_LSHIFTRT (SImode, reg, GEN_INT (16)));
@@ -6180,16 +6228,13 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
            "\t  .amdhsa_reserve_vcc\t1\n"
            "\t  .amdhsa_reserve_flat_scratch\t0\n"
            "\t  .amdhsa_reserve_xnack_mask\t%i\n"
-          "\t  .amdhsa_private_segment_fixed_size\t%i\n"
+          "\t  .amdhsa_private_segment_fixed_size\t0\n"
            "\t  .amdhsa_group_segment_fixed_size\t%u\n"
            "\t  .amdhsa_float_denorm_mode_32\t3\n"
            "\t  .amdhsa_float_denorm_mode_16_64\t3\n",
            vgpr,
            sgpr,
            xnack_enabled,
-          /* workitem_private_segment_bytes_size needs to be
-             one 64th the wave-front stack size.  */
-          stack_size_opt / 64,
            LDS_SIZE);
    if (gcn_arch == PROCESSOR_GFX90a)
      fprintf (file,
@@ -6214,7 +6259,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
            "            .kernarg_segment_size: %i\n"
            "            .kernarg_segment_align: %i\n"
            "            .group_segment_fixed_size: %u\n"
-          "            .private_segment_fixed_size: %i\n"
+          "            .private_segment_fixed_size: 0\n"
            "            .wavefront_size: 64\n"
            "            .sgpr_count: %i\n"
            "            .vgpr_count: %i\n"
@@ -6222,7 +6267,6 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
            cfun->machine->kernarg_segment_byte_size,
            cfun->machine->kernarg_segment_alignment,
            LDS_SIZE,
-          stack_size_opt / 64,
            sgpr, vgpr);
    if (gcn_arch == PROCESSOR_GFX90a)
      fprintf (file, "            .agpr_count: 0\n"); // AGPRs are not used, yet
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h

index 1cc5981d904ef2c89a6dac228ab2afde5bf6cf92..0658cdd0d7e1cb5f3f474403eece0ad4dd48e89d 100644 (file)
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -183,7 +183,7 @@
  \f
  #define FIXED_REGISTERS {                          \
      /* Scalars.  */                                \
-    1, 1, 0, 0, 1, 1, 1, 1, 1, 1,                  \
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1,                  \
  /*             fp    sp    lr.  */                 \
      1, 1, 0, 0, 0, 0, 1, 1, 0, 0,                  \
  /*  exec_save, cc_save */                          \
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt

index 759f7a064c93b3ee0df67b926faab60adbec316f..c93ee458c9cf5d0644c22877c932ec4c62de095b 100644 (file)
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -69,7 +69,7 @@ int stack_size_opt = -1
  
  mstack-size=
  Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
--mstack-size=<number>  Set the private segment size per wave-front, in bytes.
+Obsolete; use GCN_STACK_SIZE at runtime.
  
  int gang_private_size_opt = -1
  
diff --git a/gcc/testsuite/ChangeLog.omp b/gcc/testsuite/ChangeLog.omp

index bbd81327c6e304b3f41da4a6badd978e9eb6ac7b..c5549aa8995a2fcaaa7a05c1f88609b74a341695 100644 (file)
--- a/gcc/testsuite/ChangeLog.omp
+++ b/gcc/testsuite/ChangeLog.omp
@@ -1,3 +1,11 @@
+2023-02-06  Andrew Stubbs  <ams@codesourcery.com>
+
+       * gcc.c-torture/execute/pr47237.c: Xfail on amdgcn.
+       * gcc.dg/builtin-apply3.c: Xfail for amdgcn.
+       * gcc.dg/builtin-apply4.c: Xfail for amdgcn.
+       * gcc.dg/torture/stackalign/builtin-apply-3.c: Xfail for amdgcn.
+       * gcc.dg/torture/stackalign/builtin-apply-4.c: Xfail for amdgcn.
+
  2023-02-02  Paul-Antoine Arras  <pa@codesourcery.com>
  
         Backported from master:
diff --git a/gcc/testsuite/gcc.c-torture/execute/pr47237.c b/gcc/testsuite/gcc.c-torture/execute/pr47237.c

index 98124065b2fc36355a2e4e6d2ba15e76b447cfdc..944bdb7c93aee6b78629001639f6b1faee93f623 100644 (file)
--- a/gcc/testsuite/gcc.c-torture/execute/pr47237.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr47237.c
@@ -1,4 +1,4 @@
-/* { dg-xfail-if "can cause stack underflow" { nios2-*-* } } */
+/* { dg-xfail-run-if "can cause stack underflow" { nios2-*-* amdgcn-*-* } } */
  /* { dg-require-effective-target untyped_assembly } */
  #define INTEGER_ARG  5
  
diff --git a/gcc/testsuite/gcc.dg/builtin-apply3.c b/gcc/testsuite/gcc.dg/builtin-apply3.c

index 37c5209b91cb296430e4ed17c5e122cb987fd77f..8fc20030ed76c7cd6e848d584c4d25995cae8686 100644 (file)
--- a/gcc/testsuite/gcc.dg/builtin-apply3.c
+++ b/gcc/testsuite/gcc.dg/builtin-apply3.c
@@ -6,6 +6,7 @@
  
  /* { dg-do run } */
  /* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
  
  
  #define INTEGER_ARG  5
diff --git a/gcc/testsuite/gcc.dg/builtin-apply4.c b/gcc/testsuite/gcc.dg/builtin-apply4.c

index cca9187a1d36620b34d1cf8b2228db20f6bfbfb9..aa491c18de4f46469310a2072b22407d3f927c4a 100644 (file)
--- a/gcc/testsuite/gcc.dg/builtin-apply4.c
+++ b/gcc/testsuite/gcc.dg/builtin-apply4.c
@@ -3,6 +3,7 @@
  /* { dg-additional-options "-mno-mmx" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */
  /* { dg-do run } */
  /* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
  
  extern void abort (void);
  
diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c

index 37c5209b91cb296430e4ed17c5e122cb987fd77f..8fc20030ed76c7cd6e848d584c4d25995cae8686 100644 (file)
--- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c
+++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c
@@ -6,6 +6,7 @@
  
  /* { dg-do run } */
  /* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
  
  
  #define INTEGER_ARG  5
diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c

index 78b10322edc49662c899c010d5c76aa708162de3..94b20123724de571211843cd3d5723b5abd87fba 100644 (file)
--- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c
+++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c
@@ -2,6 +2,7 @@
  /* { dg-do run } */
  /* { dg-additional-options "-fgnu89-inline" } */
  /* { dg-require-effective-target untyped_assembly } */
+/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */
  
  extern void abort (void);
  
diff --git a/include/ChangeLog.omp b/include/ChangeLog.omp

index 7c2a3fa71344be79d82e18da12858535a64db371..222da13e03876666c0cac13c36d06dc45c3e7a38 100644 (file)
--- a/include/ChangeLog.omp
+++ b/include/ChangeLog.omp
@@ -1,3 +1,7 @@
+2023-02-06  Andrew Stubbs  <ams@codesourcery.com>
+
+       * gomp-constants.h (GOMP_VERSION_GCN): Bump.
+
  2022-10-24  Tobias Burnus  <tobias@codesourcery.com>
  
         Backport from mainline:
diff --git a/include/gomp-constants.h b/include/gomp-constants.h

index dd753a8210644690e8f95a0bc219c440e84532b8..5b32d114e1c066f77050b7d686771fd07b80ba89 100644 (file)
--- a/include/gomp-constants.h
+++ b/include/gomp-constants.h
@@ -312,7 +312,7 @@ enum gomp_map_kind
  #define GOMP_VERSION   2
  #define GOMP_VERSION_NVIDIA_PTX 1
  #define GOMP_VERSION_INTEL_MIC 0
-#define GOMP_VERSION_GCN 2
+#define GOMP_VERSION_GCN 3
  
  #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV))
  #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff)
diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp

index 2b716f67501e24a89cab5ab9142b0a1f4120ef81..eb8e4abd1838bfde548123ace1e50b2d310936d9 100644 (file)
--- a/libgomp/ChangeLog.omp
+++ b/libgomp/ChangeLog.omp
@@ -1,3 +1,38 @@
+2023-02-06  Andrew Stubbs  <ams@codesourcery.com>
+
+       * config/gcn/libgomp-gcn.h (DEFAULT_GCN_STACK_SIZE): New define.
+       (DEFAULT_TEAM_ARENA_SIZE): New define.
+       (struct heap): Move to this file.
+       (struct kernargs_abi): Likewise.
+       * config/gcn/team.c (gomp_gcn_enter_kernel): Use team arena size from
+       the kernargs.
+       * libgomp.h: Include libgomp-gcn.h.
+       (TEAM_ARENA_SIZE): Remove.
+       (team_malloc): Update the error message.
+       * plugin/plugin-gcn.c (struct kernargs): Move common content to
+       struct kernargs_abi.
+       (struct agent_info): Rename team arenas to ephemeral memories.
+       (struct team_arena_list): Rename ....
+       (struct ephemeral_memories_list): to this.
+       (struct heap): Delete.
+       (team_arena_size): New variable.
+       (stack_size): New variable.
+       (print_kernel_dispatch): Update debug messages.
+       (init_environment_variables): Read GCN_TEAM_ARENA_SIZE.
+       Read GCN_STACK_SIZE.
+       (get_team_arena): Rename ...
+       (configure_ephemeral_memories): ... to this, and set up stacks.
+       (release_team_arena): Rename ...
+       (release_ephemeral_memories): ... to this.
+       (destroy_team_arenas): Rename ...
+       (destroy_ephemeral_memories): ... to this.
+       (create_kernel_dispatch): Add num_threads parameter.
+       Adjust for kernargs_abi refactor and ephemeral memories.
+       (release_kernel_dispatch): Adjust for ephemeral memories.
+       (run_kernel): Pass thread-count to create_kernel_dispatch.
+       (GOMP_OFFLOAD_init_device): Adjust for ephemeral memories.
+       (GOMP_OFFLOAD_fini_device): Adjust for ephemeral memories.
+
  2023-02-03  Tobias Burnus  <tobias@codesourcery.com>
  
         Backported from master:
diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h

index b59cbee288e946ac87c5571c6ee0654cdf59a82b..1521166baa3fcad6eb567820cefdee957ac7ce2e 100644 (file)
--- a/libgomp/config/gcn/libgomp-gcn.h
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -30,6 +30,40 @@
  #ifndef LIBGOMP_GCN_H
  #define LIBGOMP_GCN_H 1
  
+#define DEFAULT_GCN_STACK_SIZE (32*1024)
+#define DEFAULT_TEAM_ARENA_SIZE (64*1024)
+
+struct heap
+{
+  int64_t size;
+  char data[0];
+};
+
+/* This struct defines the (unofficial) ABI-defined values the compiler
+   expects to find in first bytes of the kernargs space.
+   The plugin may choose to place additional data later in the kernargs
+   memory allocation, but those are not in any fixed location.  */
+struct kernargs_abi {
+  /* Leave space for the real kernel arguments.
+     OpenACC and OpenMP only use one pointer.  */
+  int64_t dummy1;
+  int64_t dummy2;
+
+  /* A pointer to struct output, below, for console output data.  */
+  int64_t out_ptr;             /* Offset 16.  */
+
+  /* A pointer to struct heap.  */
+  int64_t heap_ptr;            /* Offset 24.  */
+
+  /* A pointer to the ephemeral memory areas.
+     The team arena is only needed for OpenMP.
+     Each should have enough space for all the teams and threads.  */
+  int64_t arena_ptr;           /* Offset 32.  */
+  int64_t stack_ptr;           /* Offset 40.  */
+  int arena_size_per_team;     /* Offset 48.  */
+  int stack_size_per_thread;   /* Offset 52.  */
+};
+
  /* This struct is also used in Newlib's libc/sys/amdgcn/write.c.  */
  struct output
  {
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c

index 7cf4c0be3cc0cd8af30ae0207bc630280b22957b..ffdc09b7f353bab9310472f56ab60c1aeac51ba3 100644 (file)
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -60,14 +60,16 @@ gomp_gcn_enter_kernel (void)
        /* Initialize the team arena for optimized memory allocation.
           The arena has been allocated on the host side, and the address
           passed in via the kernargs.  Each team takes a small slice of it.  */
-      void **kernargs = (void**) __builtin_gcn_kernarg_ptr ();
-      void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+      struct kernargs_abi *kernargs =
+       (struct kernargs_abi*) __builtin_gcn_kernarg_ptr ();
+      void *team_arena = ((void*)kernargs->arena_ptr
+                         + kernargs->arena_size_per_team * teamid);
        void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
        void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
        void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
        *arena_start = team_arena;
        *arena_free = team_arena;
-      *arena_end = team_arena + TEAM_ARENA_SIZE;
+      *arena_end = team_arena + kernargs->arena_size_per_team;
  
        /* Allocate and initialize the team-local-storage data.  */
        struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h

index 7d55f3cf825bd809f98a06370877f1d90983fcd2..a0af66e396b2ace41d8acc9e6c5828aa9f21ba78 100644 (file)
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -112,8 +112,8 @@ extern void gomp_aligned_free (void *);
  /* Optimized allocators for team-specific data that will die with the team.  */
  
  #ifdef __AMDGCN__
+#include "libgomp-gcn.h"
  /* The arena is initialized in config/gcn/team.c.  */
-#define TEAM_ARENA_SIZE  64*1024  /* Must match the value in plugin-gcn.c.  */
  #define TEAM_ARENA_START 16  /* LDS offset of free pointer.  */
  #define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
  #define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
@@ -135,7 +135,8 @@ team_malloc (size_t size)
      {
        /* While this is experimental, let's make sure we know when OOM
          happens.  */
-      const char msg[] = "GCN team arena exhausted\n";
+      const char msg[] = "GCN team arena exhausted;"
+                        " configure with GCN_TEAM_ARENA_SIZE=bytes\n";
        write (2, msg, sizeof(msg)-1);
  
        /* Fall back to using the heap (slowly).  */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c

index 36fab3951d53a7faf604394a28e8deb0a13ae645..0e22aec87cb4dd25223b206ecc6cc9e6a04b013d 100644 (file)
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -252,20 +252,7 @@ struct kernel_dispatch
     in libgomp target code.  */
  
  struct kernargs {
-  /* Leave space for the real kernel arguments.
-     OpenACC and OpenMP only use one pointer.  */
-  int64_t dummy1;
-  int64_t dummy2;
-
-  /* A pointer to struct output, below, for console output data.  */
-  int64_t out_ptr;
-
-  /* A pointer to struct heap, below.  */
-  int64_t heap_ptr;
-
-  /* A pointer to an ephemeral memory arena.
-    Only needed for OpenMP.  */
-  int64_t arena_ptr;
+  struct kernargs_abi abi;
  
    /* Output data.  */
    struct output output_data;
@@ -441,9 +428,9 @@ struct agent_info
    /* The HSA memory region from which to allocate device data.  */
    hsa_region_t data_region;
  
-  /* Allocated team arenas.  */
-  struct team_arena_list *team_arena_list;
-  pthread_mutex_t team_arena_write_lock;
+  /* Allocated ephemeral memories (team arena and stack space).  */
+  struct ephemeral_memories_list *ephemeral_memories_list;
+  pthread_mutex_t ephemeral_memories_write_lock;
  
    /* Read-write lock that protects kernels which are running or about to be run
       from interference with loading and unloading of images.  Needs to be
@@ -525,17 +512,18 @@ struct module_info
  };
  
  /* A linked list of memory arenas allocated on the device.
-   These are only used by OpenMP, as a means to optimize per-team malloc.  */
+   These are used by OpenMP, as a means to optimize per-team malloc,
+   and for host-accessible stack space.  */
  
-struct team_arena_list
+struct ephemeral_memories_list
  {
-  struct team_arena_list *next;
+  struct ephemeral_memories_list *next;
  
-  /* The number of teams determines the size of the allocation.  */
-  int num_teams;
-  /* The device address of the arena itself.  */
-  void *arena;
-  /* A flag to prevent two asynchronous kernels trying to use the same arena.
+  /* The size is determined by the number of teams and threads.  */
+  size_t size;
+  /* The device address allocated memory.  */
+  void *address;
+  /* A flag to prevent two asynchronous kernels trying to use the same memory.
       The mutex is locked until the kernel exits.  */
    pthread_mutex_t in_use;
  };
@@ -554,15 +542,6 @@ struct hsa_context_info
    char driver_version_s[30];
  };
  
-/* Format of the on-device heap.
-
-   This must match the definition in Newlib and gcn-run.  */
-
-struct heap {
-  int64_t size;
-  char data[0];
-};
-
  /* }}}  */
  /* {{{ Global variables  */
  
@@ -580,6 +559,11 @@ static struct hsa_runtime_fn_info hsa_fns;
  
  static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
  
+/* Ephemeral memory sizes for each kernel launch.  */
+
+static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
+static int stack_size = DEFAULT_GCN_STACK_SIZE;
+
  /* Flag to decide whether print to stderr information about what is going on.
     Set in init_debug depending on environment variables.  */
  
@@ -1053,9 +1037,13 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
    fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
    fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
    fprintf (stderr, "%*sheap address: %p\n", indent, "",
-          (void*)kernargs->heap_ptr);
-  fprintf (stderr, "%*sarena address: %p\n", indent, "",
-          (void*)kernargs->arena_ptr);
+          (void*)kernargs->abi.heap_ptr);
+  fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent,
+          "", (void*)kernargs->abi.arena_ptr,
+          kernargs->abi.arena_size_per_team);
+  fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent,
+          "", (void*)kernargs->abi.stack_ptr,
+          kernargs->abi.stack_size_per_thread);
    fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
    fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
            dispatch->private_segment_size);
@@ -1115,6 +1103,22 @@ init_environment_variables (void)
        if (tmp)
         gcn_kernel_heap_size = tmp;
      }
+
+  const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE");
+  if (arena)
+    {
+      int tmp = atoi (arena);
+      if (tmp)
+       team_arena_size = tmp;;
+    }
+
+  const char *stack = secure_getenv ("GCN_STACK_SIZE");
+  if (stack)
+    {
+      int tmp = atoi (stack);
+      if (tmp)
+       stack_size = tmp;;
+    }
  }
  
  /* Return malloc'd string with name of SYMBOL.  */
@@ -1729,85 +1733,103 @@ isa_code(const char *isa) {
  /* }}}  */
  /* {{{ Run  */
  
-/* Create or reuse a team arena.
+/* Create or reuse a team arena and stack space.
   
     Team arenas are used by OpenMP to avoid calling malloc multiple times
     while setting up each team.  This is purely a performance optimization.
  
-   Allocating an arena also costs performance, albeit on the host side, so
-   this function will reuse an existing arena if a large enough one is idle.
-   The arena is released, but not deallocated, when the kernel exits.  */
+   The stack space is used by all kernels.  We must allocate it in such a
+   way that the reverse offload implmentation can access the data.
  
-static void *
-get_team_arena (struct agent_info *agent, int num_teams)
+   Allocating this memory costs performance, so this function will reuse an
+   existing allocation if a large enough one is idle.
+   The memory lock is released, but not deallocated, when the kernel exits.  */
+
+static void
+configure_ephemeral_memories (struct kernel_info *kernel,
+                             struct kernargs_abi *kernargs, int num_teams,
+                             int num_threads)
  {
-  struct team_arena_list **next_ptr = &agent->team_arena_list;
-  struct team_arena_list *item;
+  struct agent_info *agent = kernel->agent;
+  struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list;
+  struct ephemeral_memories_list *item;
+
+  int actual_arena_size = (kernel->kind == KIND_OPENMP
+                          ? team_arena_size : 0);
+  int actual_arena_total_size = actual_arena_size * num_teams;
+  size_t size = (actual_arena_total_size
+                + num_teams * num_threads * stack_size);
  
    for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
      {
-      if (item->num_teams < num_teams)
+      if (item->size < size)
         continue;
  
-      if (pthread_mutex_trylock (&item->in_use))
-       continue;
-
-      return item->arena;
+      if (pthread_mutex_trylock (&item->in_use) == 0)
+       break;
      }
  
-  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
-
-  if (pthread_mutex_lock (&agent->team_arena_write_lock))
+  if (!item)
      {
-      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
-      return false;
-    }
-  item = malloc (sizeof (*item));
-  item->num_teams = num_teams;
-  item->next = NULL;
-  *next_ptr = item;
+      GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads"
+                " (%zd bytes)\n", (actual_arena_size ? "arena and " : ""),
+                num_teams, num_threads, size);
  
-  if (pthread_mutex_init (&item->in_use, NULL))
-    {
-      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
-      return false;
-    }
-  if (pthread_mutex_lock (&item->in_use))
-    {
-      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
-      return false;
-    }
-  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
-    {
-      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
-      return false;
-    }
+      if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock))
+       {
+         GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+         return;
+       }
+      item = malloc (sizeof (*item));
+      item->size = size;
+      item->next = NULL;
+      *next_ptr = item;
  
-  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
-  hsa_status_t status;
-  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
-                                          TEAM_ARENA_SIZE*num_teams,
-                                          &item->arena);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
-  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
-                                              HSA_ACCESS_PERMISSION_RW);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not assign arena memory to device", status);
+      if (pthread_mutex_init (&item->in_use, NULL))
+       {
+         GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex");
+         return;
+       }
+      if (pthread_mutex_lock (&item->in_use))
+       {
+         GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+         return;
+       }
+      if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock))
+       {
+         GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+         return;
+       }
+
+      hsa_status_t status;
+      status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size,
+                                              &item->address);
+      if (status != HSA_STATUS_SUCCESS)
+       hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
+      status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id,
+                                                  HSA_ACCESS_PERMISSION_RW);
+      if (status != HSA_STATUS_SUCCESS)
+       hsa_fatal ("Could not assign arena & stack memory to device", status);
+    }
  
-  return item->arena;
+  kernargs->arena_ptr = (actual_arena_total_size
+                        ? (uint64_t)item->address
+                        : 0);
+  kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size;
+  kernargs->arena_size_per_team = actual_arena_size;
+  kernargs->stack_size_per_thread = stack_size;
  }
  
-/* Mark a team arena available for reuse.  */
+/* Mark an ephemeral memory space available for reuse.  */
  
  static void
-release_team_arena (struct agent_info* agent, void *arena)
+release_ephemeral_memories (struct agent_info* agent, void *address)
  {
-  struct team_arena_list *item;
+  struct ephemeral_memories_list *item;
  
-  for (item = agent->team_arena_list; item; item = item->next)
+  for (item = agent->ephemeral_memories_list; item; item = item->next)
      {
-      if (item->arena == arena)
+      if (item->address == address)
         {
           if (pthread_mutex_unlock (&item->in_use))
             GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
@@ -1820,22 +1842,22 @@ release_team_arena (struct agent_info* agent, void *arena)
  /* Clean up all the allocated team arenas.  */
  
  static bool
-destroy_team_arenas (struct agent_info *agent)
+destroy_ephemeral_memories (struct agent_info *agent)
  {
-  struct team_arena_list *item, *next;
+  struct ephemeral_memories_list *item, *next;
  
-  for (item = agent->team_arena_list; item; item = next)
+  for (item = agent->ephemeral_memories_list; item; item = next)
      {
        next = item->next;
-      hsa_fns.hsa_memory_free_fn (item->arena);
+      hsa_fns.hsa_memory_free_fn (item->address);
        if (pthread_mutex_destroy (&item->in_use))
         {
-         GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+         GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
           return false;
         }
        free (item);
      }
-  agent->team_arena_list = NULL;
+  agent->ephemeral_memories_list = NULL;
  
    return true;
  }
@@ -1907,7 +1929,8 @@ alloc_by_agent (struct agent_info *agent, size_t size)
     the necessary device signals and memory allocations.  */
  
  static struct kernel_dispatch *
-create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
+create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
+                       int num_threads)
  {
    struct agent_info *agent = kernel->agent;
    struct kernel_dispatch *shadow
@@ -1942,7 +1965,7 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
    struct kernargs *kernargs = shadow->kernarg_address;
  
    /* Zero-initialize the output_data (minimum needed).  */
-  kernargs->out_ptr = (int64_t)&kernargs->output_data;
+  kernargs->abi.out_ptr = (int64_t)&kernargs->output_data;
    kernargs->output_data.next_output = 0;
    for (unsigned i = 0;
         i < (sizeof (kernargs->output_data.queue)
@@ -1952,13 +1975,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
    kernargs->output_data.consumed = 0;
  
    /* Pass in the heap location.  */
-  kernargs->heap_ptr = (int64_t)kernel->module->heap;
+  kernargs->abi.heap_ptr = (int64_t)kernel->module->heap;
  
-  /* Create an arena.  */
-  if (kernel->kind == KIND_OPENMP)
-    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
-  else
-    kernargs->arena_ptr = 0;
+  /* Create the ephemeral memory spaces.  */
+  configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads);
  
    /* Ensure we can recognize unset return values.  */
    kernargs->output_data.return_value = 0xcafe0000;
@@ -2042,9 +2062,10 @@ release_kernel_dispatch (struct kernel_dispatch *shadow)
    GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
  
    struct kernargs *kernargs = shadow->kernarg_address;
-  void *arena = (void *)kernargs->arena_ptr;
-  if (arena)
-    release_team_arena (shadow->agent, arena);
+  void *addr = (void *)kernargs->abi.arena_ptr;
+  if (!addr)
+    addr = (void *)kernargs->abi.stack_ptr;
+  release_ephemeral_memories (shadow->agent, addr);
  
    hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
  
@@ -2274,7 +2295,8 @@ run_kernel (struct kernel_info *kernel, void *vars,
              packet->workgroup_size_z);
  
    struct kernel_dispatch *shadow
-    = create_kernel_dispatch (kernel, packet->grid_size_x);
+    = create_kernel_dispatch (kernel, packet->grid_size_x,
+                             packet->grid_size_z);
    shadow->queue = command_q;
  
    if (debug)
@@ -3414,14 +3436,14 @@ GOMP_OFFLOAD_init_device (int n)
        GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
        return false;
      }
-  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
+  if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL))
      {
        GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
        return false;
      }
    agent->async_queues = NULL;
    agent->omp_async_queue = NULL;
-  agent->team_arena_list = NULL;
+  agent->ephemeral_memories_list = NULL;
  
    uint32_t queue_size;
    hsa_status_t status;
@@ -3774,7 +3796,7 @@ GOMP_OFFLOAD_fini_device (int n)
        agent->module = NULL;
      }
  
-  if (!destroy_team_arenas (agent))
+  if (!destroy_ephemeral_memories (agent))
      return false;
  
    if (!destroy_hsa_program (agent))
@@ -3800,9 +3822,9 @@ GOMP_OFFLOAD_fini_device (int n)
        GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
        return false;
      }
-  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
+  if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock))
      {
-      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+      GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex");
        return false;
      }
    agent->initialized = false;
author	Andrew Stubbs <ams@codesourcery.com>
	Thu, 1 Dec 2022 17:30:21 +0000 (17:30 +0000)
committer	Andrew Stubbs <ams@codesourcery.com>
	Mon, 6 Feb 2023 17:54:24 +0000 (17:54 +0000)
gcc/ChangeLog.omp		patch \| blob \| blame \| history
gcc/config/gcn/gcn-run.cc		patch \| blob \| blame \| history
gcc/config/gcn/gcn.cc		patch \| blob \| blame \| history
gcc/config/gcn/gcn.h		patch \| blob \| blame \| history
gcc/config/gcn/gcn.opt		patch \| blob \| blame \| history
gcc/testsuite/ChangeLog.omp		patch \| blob \| blame \| history
gcc/testsuite/gcc.c-torture/execute/pr47237.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/builtin-apply3.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/builtin-apply4.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c		patch \| blob \| blame \| history
include/ChangeLog.omp		patch \| blob \| blame \| history
include/gomp-constants.h		patch \| blob \| blame \| history
libgomp/ChangeLog.omp		patch \| blob \| blame \| history
libgomp/config/gcn/libgomp-gcn.h		patch \| blob \| blame \| history
libgomp/config/gcn/team.c		patch \| blob \| blame \| history
libgomp/libgomp.h		patch \| blob \| blame \| history
libgomp/plugin/plugin-gcn.c		patch \| blob \| blame \| history