* config/nvptx/nvptx.c (nvptx_sese_number, nvptx_sese_pseudo): Don't

[thirdparty/gcc.git] / gcc / config / nvptx / nvptx.c
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c

index 74ca0f585aa418c852b9e82e993db58f54b0cd56..c53a1ae9f26abf9d05231b9bb84753e4e9403a2f 100644 (file)
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -1,5 +1,5 @@
  /* Target code for NVPTX.
-   Copyright (C) 2014-2018 Free Software Foundation, Inc.
+   Copyright (C) 2014-2019 Free Software Foundation, Inc.
     Contributed by Bernd Schmidt <bernds@codesourcery.com>
  
     This file is part of GCC.
@@ -59,6 +59,7 @@
  #include "builtins.h"
  #include "omp-general.h"
  #include "omp-low.h"
+#include "omp-offload.h"
  #include "gomp-constants.h"
  #include "dumpfile.h"
  #include "internal-fn.h"
@@ -81,8 +82,21 @@
  #define WORKAROUND_PTXJIT_BUG_2 1
  #define WORKAROUND_PTXJIT_BUG_3 1
  
+/* The PTX concept CTA (Concurrent Thread Array) maps on the CUDA concept thread
+   block, which has had a maximum number of threads of 1024 since CUDA version
+   2.x.  */
+#define PTX_CTA_SIZE 1024
+
+#define PTX_CTA_NUM_BARRIERS 16
  #define PTX_WARP_SIZE 32
-#define PTX_VECTOR_LENGTH 32
+
+#define PTX_PER_CTA_BARRIER 0
+#define PTX_NUM_PER_CTA_BARRIERS 1
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
+
+#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
+#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
  #define PTX_WORKER_LENGTH 32
  #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
  
@@ -127,14 +141,16 @@ struct tree_hasher : ggc_cache_ptr_hash<tree_node>
  static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
  static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
  
-/* Buffer needed to broadcast across workers.  This is used for both
-   worker-neutering and worker broadcasting.  It is shared by all
-   functions emitted.  The buffer is placed in shared memory.  It'd be
-   nice if PTX supported common blocks, because then this could be
-   shared across TUs (taking the largest size).  */
-static unsigned worker_bcast_size;
-static unsigned worker_bcast_align;
-static GTY(()) rtx worker_bcast_sym;
+/* Buffer needed to broadcast across workers and vectors.  This is
+   used for both worker-neutering and worker broadcasting, and
+   vector-neutering and boardcasting when vector_length > 32.  It is
+   shared by all functions emitted.  The buffer is placed in shared
+   memory.  It'd be nice if PTX supported common blocks, because then
+   this could be shared across TUs (taking the largest size).  */
+static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_partition;
+static unsigned oacc_bcast_align;
+static GTY(()) rtx oacc_bcast_sym;
  
  /* Buffer needed for worker reductions.  This has to be distinct from
     the worker broadcast array, as both may be live concurrently.  */
@@ -142,6 +158,14 @@ static unsigned worker_red_size;
  static unsigned worker_red_align;
  static GTY(()) rtx worker_red_sym;
  
+/* Buffer needed for vector reductions, when vector_length >
+   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
+   array, as both may be live concurrently.  */
+static unsigned vector_red_size;
+static unsigned vector_red_align;
+static unsigned vector_red_partition;
+static GTY(()) rtx vector_red_sym;
+
  /* Global lock variable, needed for 128bit worker & gang reductions.  */
  static GTY(()) tree global_lock_var;
  
@@ -151,6 +175,8 @@ static bool need_softstack_decl;
  /* True if any function references __nvptx_uni.  */
  static bool need_unisimt_decl;
  
+static int nvptx_mach_max_workers ();
+
  /* Allocate a new, cleared machine_function structure.  */
  
  static struct machine_function *
@@ -168,7 +194,7 @@ static void
  diagnose_openacc_conflict (bool optval, const char *optname)
  {
    if (flag_openacc && optval)
-    error ("option %s is not supported together with -fopenacc", optname);
+    error ("option %s is not supported together with %<-fopenacc%>", optname);
  }
  
  /* Implement TARGET_OPTION_OVERRIDE.  */
@@ -207,14 +233,20 @@ nvptx_option_override (void)
    declared_libfuncs_htab
      = hash_table<declared_libfunc_hasher>::create_ggc (17);
  
-  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
-  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
-  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
+  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
+  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  oacc_bcast_partition = 0;
  
    worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
    SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
    worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  
+  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
+  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
+  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+  vector_red_partition = 0;
+
    diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
    diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
    diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
@@ -1093,8 +1125,59 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
  {
    fprintf (file, "\t{\n");
    fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+    {
+      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
+      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+    }
    fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
    fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+    {
+      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
+      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
+      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
+              "// vector reduction buffer\n",
+              REGNO (cfun->machine->red_partition),
+              vector_red_partition);
+    }
+  /* Verify vector_red_size.  */
+  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
+             <= vector_red_size);
+  fprintf (file, "\t}\n");
+}
+
+/* Emit code to initialize OpenACC worker broadcast and synchronization
+   registers.  */
+
+static void
+nvptx_init_oacc_workers (FILE *file)
+{
+  fprintf (file, "\t{\n");
+  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
+  if (cfun->machine->bcast_partition)
+    {
+      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
+      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+    }
+  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
+  if (cfun->machine->bcast_partition)
+    {
+      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
+      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
+      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
+      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
+              "// vector broadcast offset\n",
+              REGNO (cfun->machine->bcast_partition),
+              oacc_bcast_partition);
+    }
+  /* Verify oacc_bcast_size.  */
+  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
+             <= oacc_bcast_size);
+  if (cfun->machine->sync_bar)
+    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
+            "// vector synchronization barrier\n",
+            REGNO (cfun->machine->sync_bar));
    fprintf (file, "\t}\n");
  }
  
@@ -1297,6 +1380,13 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
         fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
                 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
      }
+
+  /* Restore the vector reduction partition register, if necessary.
+     FIXME: Find out when and why this is necessary, and fix it.  */
+  if (cfun->machine->red_partition)
+    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
+      = cfun->machine->red_partition;
+
    /* Declare the pseudos we have as ptx registers.  */
    int maxregs = max_reg_num ();
    for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
@@ -1324,6 +1414,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
    if (cfun->machine->unisimt_predicate
        || (cfun->machine->has_simtreg && !crtl->is_leaf))
      nvptx_init_unisimt_predicate (file);
+  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
+    nvptx_init_oacc_workers (file);
  }
  
  /* Output code for switching uniform-simt state.  ENTERING indicates whether
@@ -1383,7 +1475,7 @@ nvptx_output_softstack_switch (FILE *file, bool entering,
        fputs (";\n", file);
        if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
         fprintf (file,
-                "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
+                "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
                  bits, regno, regno, UINTVAL (align));
      }
    if (cfun->machine->has_softstack)
@@ -1747,14 +1839,14 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
     across the vectors of a single warp.  */
  
  static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_warp_bcast (rtx reg)
  {
    return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
  }
  
  /* Structure used when generating a worker-level spill or fill.  */
  
-struct wcast_data_t
+struct broadcast_data_t
  {
    rtx base;  /* Register holding base addr of buffer.  */
    rtx ptr;  /* Iteration var,  if needed.  */
@@ -1778,7 +1870,8 @@ enum propagate_mask
     how many loop iterations will be executed (0 for not a loop).  */
     
  static rtx
-nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
+                       broadcast_data_t *data, bool vector)
  {
    rtx  res;
    machine_mode mode = GET_MODE (reg);
@@ -1792,7 +1885,7 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
         start_sequence ();
         if (pm & PM_read)
           emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
-       emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+       emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
         if (pm & PM_write)
           emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
         res = get_insns ();
@@ -1808,10 +1901,10 @@ nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
           {
             unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
  
-           if (align > worker_bcast_align)
-             worker_bcast_align = align;
-           data->offset = (data->offset + align - 1) & ~(align - 1);
+           oacc_bcast_align = MAX (oacc_bcast_align, align);
+           data->offset = ROUND_UP (data->offset, align);
             addr = data->base;
+           gcc_assert (data->base != NULL);
             if (data->offset)
               addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
           }
@@ -1931,8 +2024,7 @@ nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
      {
        val >>= part * BITS_PER_UNIT;
        part = init_frag.size - init_frag.offset;
-      if (part > size)
-       part = size;
+      part = MIN (part, size);
  
        unsigned HOST_WIDE_INT partial
         = val << (init_frag.offset * BITS_PER_UNIT);
@@ -1995,8 +2087,7 @@ nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
    if (init_frag.offset)
      {
        unsigned part = init_frag.size - init_frag.offset;
-      if (part > size)
-       part = (unsigned) size;
+      part = MIN (part, (unsigned)size);
        size -= part;
        nvptx_assemble_value (0, part);
      }
@@ -2868,6 +2959,57 @@ nvptx_reorg_uniform_simt ()
      }
  }
  
+/* Offloading function attributes.  */
+
+struct offload_attrs
+{
+  unsigned mask;
+  int num_gangs;
+  int num_workers;
+  int vector_length;
+};
+
+/* Define entries for cfun->machine->axis_dim.  */
+
+#define MACH_VECTOR_LENGTH 0
+#define MACH_MAX_WORKERS 1
+
+static void populate_offload_attrs (offload_attrs *oa);
+
+static void
+init_axis_dim (void)
+{
+  offload_attrs oa;
+  int max_workers;
+
+  populate_offload_attrs (&oa);
+
+  if (oa.num_workers == 0)
+    max_workers = PTX_CTA_SIZE / oa.vector_length;
+  else
+    max_workers = oa.num_workers;
+
+  cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
+  cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
+  cfun->machine->axis_dim_init_p = true;
+}
+
+static int ATTRIBUTE_UNUSED
+nvptx_mach_max_workers ()
+{
+  if (!cfun->machine->axis_dim_init_p)
+    init_axis_dim ();
+  return cfun->machine->axis_dim[MACH_MAX_WORKERS];
+}
+
+static int ATTRIBUTE_UNUSED
+nvptx_mach_vector_length ()
+{
+  if (!cfun->machine->axis_dim_init_p)
+    init_axis_dim ();
+  return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
+}
+
  /* Loop structure of the function.  The entire function is described as
     a NULL loop.  */
  
@@ -3015,6 +3157,19 @@ nvptx_split_blocks (bb_insn_map_t *map)
      }
  }
  
+/* Return true if MASK contains parallelism that requires shared
+   memory to broadcast.  */
+
+static bool
+nvptx_needs_shared_bcast (unsigned mask)
+{
+  bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+  bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+    && nvptx_mach_vector_length () != PTX_WARP_SIZE;
+
+  return worker || large_vector;
+}
+
  /* BLOCK is a basic block containing a head or tail instruction.
     Locate the associated prehead or pretail instruction, which must be
     in the single predecessor block.  */
@@ -3090,7 +3245,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
             par = new parallel (par, mask);
             par->forked_block = block;
             par->forked_insn = end;
-           if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+           if (nvptx_needs_shared_bcast (mask))
               par->fork_insn
                 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
           }
@@ -3103,9 +3258,10 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
             unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
  
             gcc_assert (par->mask == mask);
+           gcc_assert (par->join_block == NULL);
             par->join_block = block;
             par->join_insn = end;
-           if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+           if (nvptx_needs_shared_bcast (mask))
               par->joining_insn
                 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
             par = par->parent;
@@ -3396,7 +3552,7 @@ nvptx_sese_number (int n, int p, int dir, basic_block b,
        size_t offset = (dir > 0 ? offsetof (edge_def, dest)
                        : offsetof (edge_def, src));
        edge e;
-      edge_iterator (ei);
+      edge_iterator ei;
  
        FOR_EACH_EDGE (e, ei, edges)
         {
@@ -3419,7 +3575,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
                    vec<edge, va_gc> *edges, size_t offset)
  {
    edge e;
-  edge_iterator (ei);
+  edge_iterator ei;
    int hi_back = depth;
    pseudo_node_t node_back (0, depth);
    int hi_child = depth;
@@ -3802,11 +3958,11 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
     regions and (b) only propagating stack entries that are used.  The
     latter might be quite hard to determine.  */
  
-typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
  
  static bool
  nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
-                propagate_mask rw, propagator_fn fn, void *data)
+                propagate_mask rw, propagator_fn fn, void *data, bool vector)
  {
    bitmap live = DF_LIVE_IN (block);
    bitmap_iterator iterator;
@@ -3841,7 +3997,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
           
           emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
           /* Allow worker function to initialize anything needed.  */
-         rtx init = fn (tmp, PM_loop_begin, fs, data);
+         rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
           if (init)
             emit_insn (init);
           emit_label (label);
@@ -3850,7 +4006,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
         }
        if (rw & PM_read)
         emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
-      emit_insn (fn (tmp, rw, fs, data));
+      emit_insn (fn (tmp, rw, fs, data, vector));
        if (rw & PM_write)
         emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
        if (fs)
@@ -3858,7 +4014,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
           emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
           emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
           emit_insn (gen_br_true_uni (pred, label));
-         rtx fini = fn (tmp, PM_loop_end, fs, data);
+         rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
           if (fini)
             emit_insn (fini);
           emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
@@ -3878,7 +4034,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
  
         if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
           {
-           rtx bcast = fn (reg, rw, 0, data);
+           rtx bcast = fn (reg, rw, 0, data, vector);
  
             insn = emit_insn_after (bcast, insn);
             empty = false;
@@ -3887,16 +4043,17 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
    return empty;
  }
  
-/* Worker for nvptx_vpropagate.  */
+/* Worker for nvptx_warp_propagate.  */
  
  static rtx
-vprop_gen (rtx reg, propagate_mask pm,
-          unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+warp_prop_gen (rtx reg, propagate_mask pm,
+              unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
+              bool ARG_UNUSED (vector))
  {
    if (!(pm & PM_read_write))
      return 0;
    
-  return nvptx_gen_vcast (reg);
+  return nvptx_gen_warp_bcast (reg);
  }
  
  /* Propagate state that is live at start of BLOCK across the vectors
@@ -3904,26 +4061,27 @@ vprop_gen (rtx reg, propagate_mask pm,
     IS_CALL and return as for nvptx_propagate.  */
  
  static bool
-nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
  {
-  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write,
+                         warp_prop_gen, 0, false);
  }
  
-/* Worker for nvptx_wpropagate.  */
+/* Worker for nvptx_shared_propagate.  */
  
  static rtx
-wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
+                bool vector)
  {
-  wcast_data_t *data = (wcast_data_t *)data_;
+  broadcast_data_t *data = (broadcast_data_t *)data_;
  
    if (pm & PM_loop_begin)
      {
        /* Starting a loop, initialize pointer.    */
        unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
  
-      if (align > worker_bcast_align)
-       worker_bcast_align = align;
-      data->offset = (data->offset + align - 1) & ~(align - 1);
+      oacc_bcast_align = MAX (oacc_bcast_align, align);
+      data->offset = ROUND_UP (data->offset, align);
  
        data->ptr = gen_reg_rtx (Pmode);
  
@@ -3936,7 +4094,7 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
        return clobber;
      }
    else
-    return nvptx_gen_wcast (reg, pm, rep, data);
+    return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
  }
  
  /* Spill or fill live state that is live at start of BLOCK.  PRE_P
@@ -3945,36 +4103,60 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
     INSN.  IS_CALL and return as for nvptx_propagate.  */
  
  static bool
-nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
+nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
+                       rtx_insn *insn, bool vector)
  {
-  wcast_data_t data;
+  broadcast_data_t data;
  
    data.base = gen_reg_rtx (Pmode);
    data.offset = 0;
    data.ptr = NULL_RTX;
  
    bool empty = nvptx_propagate (is_call, block, insn,
-                               pre_p ? PM_read : PM_write, wprop_gen, &data);
+                               pre_p ? PM_read : PM_write, shared_prop_gen,
+                               &data, vector);
    gcc_assert (empty == !data.offset);
    if (data.offset)
      {
+      rtx bcast_sym = oacc_bcast_sym;
+
        /* Stuff was emitted, initialize the base pointer now.  */
-      rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+      if (vector && nvptx_mach_max_workers () > 1)
+       {
+         if (!cfun->machine->bcast_partition)
+           {
+             /* It would be nice to place this register in
+                DATA_AREA_SHARED.  */
+             cfun->machine->bcast_partition = gen_reg_rtx (DImode);
+           }
+         if (!cfun->machine->sync_bar)
+           cfun->machine->sync_bar = gen_reg_rtx (SImode);
+
+         bcast_sym = cfun->machine->bcast_partition;
+       }
+
+      rtx init = gen_rtx_SET (data.base, bcast_sym);
        emit_insn_after (init, insn);
  
-      if (worker_bcast_size < data.offset)
-       worker_bcast_size = data.offset;
+      unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
+      unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+                          ? nvptx_mach_max_workers () + 1
+                          : 1);
+
+      oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+      oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
      }
    return empty;
  }
  
-/* Emit a worker-level synchronization barrier.  We use different
-   markers for before and after synchronizations.  */
+/* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
+   which is an integer or a register.  THREADS is the number of threads
+   controlled by the barrier.  */
  
  static rtx
-nvptx_wsync (bool after)
+nvptx_cta_sync (rtx lock, int threads)
  {
-  return gen_nvptx_barsync (GEN_INT (after));
+  return gen_nvptx_barsync (lock, GEN_INT (threads));
  }
  
  #if WORKAROUND_PTXJIT_BUG
@@ -4221,8 +4403,10 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
        {
         rtx_code_label *label = gen_label_rtx ();
         rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
-       rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
-       rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
+       rtx_insn **mode_jump
+         = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
+       rtx_insn **mode_label
+         = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
  
         if (!pred)
           {
@@ -4256,10 +4440,7 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
               emit_insn_after (gen_exit (), label_insn);
           }
  
-       if (mode == GOMP_DIM_VECTOR)
-         vector_label = label_insn;
-       else
-         worker_label = label_insn;
+       *mode_label = label_insn;
        }
  
    /* Now deal with propagating the branch condition.  */
@@ -4267,7 +4448,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
      {
        rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
  
-      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+      if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
+         && nvptx_mach_vector_length () == PTX_WARP_SIZE)
         {
           /* Vector mode only, do a shuffle.  */
  #if WORKAROUND_PTXJIT_BUG
@@ -4327,31 +4509,57 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
           emit_insn_before (gen_rtx_SET (tmp, pvar), label);
           emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
  #endif
-         emit_insn_before (nvptx_gen_vcast (pvar), tail);
+         emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
         }
        else
         {
           /* Includes worker mode, do spill & fill.  By construction
              we should never have worker mode only. */
-         wcast_data_t data;
-
-         data.base = worker_bcast_sym;
+         broadcast_data_t data;
+         unsigned size = GET_MODE_SIZE (SImode);
+         bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
+         bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
+         rtx barrier = GEN_INT (0);
+         int threads = 0;
+
+         data.base = oacc_bcast_sym;
           data.ptr = 0;
  
-         if (worker_bcast_size < GET_MODE_SIZE (SImode))
-           worker_bcast_size = GET_MODE_SIZE (SImode);
+         bool use_partitioning_p = (vector && !worker
+                                    && nvptx_mach_max_workers () > 1
+                                    && cfun->machine->bcast_partition);
+         if (use_partitioning_p)
+           {
+             data.base = cfun->machine->bcast_partition;
+             barrier = cfun->machine->sync_bar;
+             threads = nvptx_mach_vector_length ();
+           }
+         gcc_assert (data.base != NULL);
+         gcc_assert (barrier);
+
+         unsigned int psize = ROUND_UP (size, oacc_bcast_align);
+         unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+                              ? nvptx_mach_max_workers () + 1
+                              : 1);
+
+         oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+         oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
  
           data.offset = 0;
-         emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+         emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
+                                                   vector),
                             before);
+
           /* Barrier so other workers can see the write.  */
-         emit_insn_before (nvptx_wsync (false), tail);
+         emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
           data.offset = 0;
-         emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+         emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
+                                                   vector),
+                           tail);
           /* This barrier is needed to avoid worker zero clobbering
              the broadcast buffer before all the other workers have
              had a chance to read this instance of it.  */
-         emit_insn_before (nvptx_wsync (true), tail);
+         emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
         }
  
        extract_insn (tail);
@@ -4465,22 +4673,43 @@ nvptx_process_pars (parallel *par)
      }
  
    bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+  bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
+  bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+                     && nvptx_mach_vector_length () > PTX_WARP_SIZE);
+
+  if (worker || large_vector)
+    {
+      nvptx_shared_propagate (false, is_call, par->forked_block,
+                             par->forked_insn, !worker);
+      bool no_prop_p
+       = nvptx_shared_propagate (true, is_call, par->forked_block,
+                                 par->fork_insn, !worker);
+      bool empty_loop_p
+       = !is_call && (NEXT_INSN (par->forked_insn)
+                      && NEXT_INSN (par->forked_insn) == par->joining_insn);
+      rtx barrier = GEN_INT (0);
+      int threads = 0;
+
+      if (!worker && cfun->machine->sync_bar)
+       {
+         barrier = cfun->machine->sync_bar;
+         threads = nvptx_mach_vector_length ();
+       }
  
-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-    {
-      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
-      bool empty = nvptx_wpropagate (true, is_call,
-                                    par->forked_block, par->fork_insn);
-
-      if (!empty || !is_call)
+      if (no_prop_p && empty_loop_p)
+       ;
+      else if (no_prop_p && is_call)
+       ;
+      else
         {
           /* Insert begin and end synchronizations.  */
-         emit_insn_before (nvptx_wsync (false), par->forked_insn);
-         emit_insn_before (nvptx_wsync (true), par->join_insn);
+         emit_insn_before (nvptx_cta_sync (barrier, threads),
+                           par->forked_insn);
+         emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
         }
      }
    else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
+    nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
  
    /* Now do siblings.  */
    if (par->next)
@@ -4565,6 +4794,41 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
      nvptx_neuter_pars (par->next, modes, outer);
  }
  
+static void
+populate_offload_attrs (offload_attrs *oa)
+{
+  tree attr = oacc_get_fn_attrib (current_function_decl);
+  tree dims = TREE_VALUE (attr);
+  unsigned ix;
+
+  oa->mask = 0;
+
+  for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+    {
+      tree t = TREE_VALUE (dims);
+      int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
+      tree allowed = TREE_PURPOSE (dims);
+
+      if (size != 1 && !(allowed && integer_zerop (allowed)))
+       oa->mask |= GOMP_DIM_MASK (ix);
+
+      switch (ix)
+       {
+       case GOMP_DIM_GANG:
+         oa->num_gangs = size;
+         break;
+
+       case GOMP_DIM_WORKER:
+         oa->num_workers = size;
+         break;
+
+       case GOMP_DIM_VECTOR:
+         oa->vector_length = size;
+         break;
+       }
+    }
+}
+
  #if WORKAROUND_PTXJIT_BUG_2
  /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
     is needed in the nvptx target because the branches generated for
@@ -4746,27 +5010,19 @@ nvptx_reorg (void)
      {
        /* If we determined this mask before RTL expansion, we could
          elide emission of some levels of forks and joins.  */
-      unsigned mask = 0;
-      tree dims = TREE_VALUE (attr);
-      unsigned ix;
+      offload_attrs oa;
  
-      for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
-       {
-         int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
-         tree allowed = TREE_PURPOSE (dims);
+      populate_offload_attrs (&oa);
  
-         if (size != 1 && !(allowed && integer_zerop (allowed)))
-           mask |= GOMP_DIM_MASK (ix);
-       }
        /* If there is worker neutering, there must be vector
          neutering.  Otherwise the hardware will fail.  */
-      gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-                 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+                 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
  
        /* Discover & process partitioned regions.  */
        parallel *pars = nvptx_discover_pars (&bb_insn_map);
        nvptx_process_pars (pars);
-      nvptx_neuter_pars (pars, mask, 0);
+      nvptx_neuter_pars (pars, oa.mask, 0);
        delete pars;
      }
  
@@ -4944,10 +5200,11 @@ nvptx_file_start (void)
    fputs ("// END PREAMBLE\n", asm_out_file);
  }
  
-/* Emit a declaration for a worker-level buffer in .shared memory.  */
+/* Emit a declaration for a worker and vector-level buffer in .shared
+   memory.  */
  
  static void
-write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
+write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
  {
    const char *name = XSTR (sym, 0);
  
@@ -4968,14 +5225,18 @@ nvptx_file_end (void)
      nvptx_record_fndecl (decl);
    fputs (func_decls.str().c_str(), asm_out_file);
  
-  if (worker_bcast_size)
-    write_worker_buffer (asm_out_file, worker_bcast_sym,
-                        worker_bcast_align, worker_bcast_size);
+  if (oacc_bcast_size)
+    write_shared_buffer (asm_out_file, oacc_bcast_sym,
+                        oacc_bcast_align, oacc_bcast_size);
  
    if (worker_red_size)
-    write_worker_buffer (asm_out_file, worker_red_sym,
+    write_shared_buffer (asm_out_file, worker_red_sym,
                          worker_red_align, worker_red_size);
  
+  if (vector_red_size)
+    write_shared_buffer (asm_out_file, vector_red_sym,
+                        vector_red_align, vector_red_size);
+
    if (need_softstack_decl)
      {
        write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
@@ -5021,33 +5282,68 @@ nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
    return target;
  }
  
-/* Worker reduction address expander.  */
+const char *
+nvptx_output_red_partition (rtx dst, rtx offset)
+{
+  const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
+  const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
+
+  if (offset == const0_rtx)
+    fprintf (asm_out_file, zero_offset, REGNO (dst),
+            REGNO (cfun->machine->red_partition));
+  else
+    fprintf (asm_out_file, with_offset, REGNO (dst),
+            REGNO (cfun->machine->red_partition), UINTVAL (offset));
+
+  return "";
+}
+
+/* Shared-memory reduction address expander.  */
  
  static rtx
-nvptx_expand_worker_addr (tree exp, rtx target,
-                         machine_mode ARG_UNUSED (mode), int ignore)
+nvptx_expand_shared_addr (tree exp, rtx target,
+                         machine_mode ARG_UNUSED (mode), int ignore,
+                         int vector)
  {
    if (ignore)
      return target;
  
    unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
-  if (align > worker_red_align)
-    worker_red_align = align;
-
    unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
    unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
-  if (size + offset > worker_red_size)
-    worker_red_size = size + offset;
-
    rtx addr = worker_red_sym;
-  if (offset)
+
+  if (vector)
      {
-      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
-      addr = gen_rtx_CONST (Pmode, addr);
+      offload_attrs oa;
+
+      populate_offload_attrs (&oa);
+
+      unsigned int psize = ROUND_UP (size + offset, align);
+      unsigned int pnum = nvptx_mach_max_workers ();
+      vector_red_partition = MAX (vector_red_partition, psize);
+      vector_red_size = MAX (vector_red_size, psize * pnum);
+      vector_red_align = MAX (vector_red_align, align);
+
+      if (cfun->machine->red_partition == NULL)
+       cfun->machine->red_partition = gen_reg_rtx (Pmode);
+
+      addr = gen_reg_rtx (Pmode);
+      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
      }
+  else
+    {
+      worker_red_align = MAX (worker_red_align, align);
+      worker_red_size = MAX (worker_red_size, size + offset);
  
-  emit_move_insn (target, addr);
+      if (offset)
+       {
+         addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
+         addr = gen_rtx_CONST (Pmode, addr);
+       }
+   }
  
+  emit_move_insn (target, addr);
    return target;
  }
  
@@ -5095,6 +5391,7 @@ enum nvptx_builtins
    NVPTX_BUILTIN_SHUFFLE,
    NVPTX_BUILTIN_SHUFFLELL,
    NVPTX_BUILTIN_WORKER_ADDR,
+  NVPTX_BUILTIN_VECTOR_ADDR,
    NVPTX_BUILTIN_CMP_SWAP,
    NVPTX_BUILTIN_CMP_SWAPLL,
    NVPTX_BUILTIN_MAX
@@ -5132,6 +5429,8 @@ nvptx_init_builtins (void)
    DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
    DEF (WORKER_ADDR, "worker_addr",
         (PTRVOID, ST, UINT, UINT, NULL_TREE));
+  DEF (VECTOR_ADDR, "vector_addr",
+       (PTRVOID, ST, UINT, UINT, NULL_TREE));
    DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
    DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
  
@@ -5160,7 +5459,10 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
        return nvptx_expand_shuffle (exp, target, mode, ignore);
  
      case NVPTX_BUILTIN_WORKER_ADDR:
-      return nvptx_expand_worker_addr (exp, target, mode, ignore);
+      return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
+
+    case NVPTX_BUILTIN_VECTOR_ADDR:
+      return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
  
      case NVPTX_BUILTIN_CMP_SWAP:
      case NVPTX_BUILTIN_CMP_SWAPLL:
@@ -5178,20 +5480,83 @@ nvptx_simt_vf ()
    return PTX_WARP_SIZE;
  }
  
-/* Validate compute dimensions of an OpenACC offload or routine, fill
-   in non-unity defaults.  FN_LEVEL indicates the level at which a
-   routine might spawn a loop.  It is negative for non-routines.  If
-   DECL is null, we are validating the default dimensions.  */
+static bool
+nvptx_welformed_vector_length_p (int l)
+{
+  gcc_assert (l > 0);
+  return l % PTX_WARP_SIZE == 0;
+}
+
+static void
+nvptx_apply_dim_limits (int dims[])
+{
+  /* Check that the vector_length is not too large.  */
+  if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
+    dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
+
+  /* Check that the number of workers is not too large.  */
+  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
+    dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+
+  /* Ensure that num_worker * vector_length <= cta size.  */
+  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
+      && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
+    dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+  /* If we need a per-worker barrier ... .  */
+  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
+      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+    /* Don't use more barriers than available.  */
+    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+                                PTX_NUM_PER_WORKER_BARRIERS);
+}
+
+/* Return true if FNDECL contains calls to vector-partitionable routines.  */
  
  static bool
-nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
+has_vector_partitionable_routine_calls_p (tree fndecl)
+{
+  if (!fndecl)
+    return false;
+
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
+    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
+        gsi_next_nondebug (&i))
+      {
+       gimple *stmt = gsi_stmt (i);
+       if (gimple_code (stmt) != GIMPLE_CALL)
+         continue;
+
+       tree callee = gimple_call_fndecl (stmt);
+       if (!callee)
+         continue;
+
+       tree attrs  = oacc_get_fn_attrib (callee);
+       if (attrs == NULL_TREE)
+         return false;
+
+       int partition_level = oacc_fn_attrib_level (attrs);
+       bool seq_routine_p = partition_level == GOMP_DIM_MAX;
+       if (!seq_routine_p)
+         return true;
+      }
+
+  return false;
+}
+
+/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
+   DIMS has changed.  */
+
+static void
+nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
  {
-  bool changed = false;
    bool oacc_default_dims_p = false;
    bool oacc_min_dims_p = false;
    bool offload_region_p = false;
    bool routine_p = false;
    bool routine_seq_p = false;
+  int default_vector_length = -1;
  
    if (decl == NULL_TREE)
      {
@@ -5212,80 +5577,153 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
    else
      gcc_unreachable ();
  
+  if (oacc_min_dims_p)
+    {
+      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
+      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
+      gcc_assert (dims[GOMP_DIM_GANG] == 1);
+
+      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+      return;
+    }
+
    if (routine_p)
      {
-      /* OpenACC routines in C arrive here with the following attributes
-        (omitting the 'omp declare target'):
-        seq   : __attribute__((oacc function (0 1, 0 1, 0 1)))
-        vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
-        worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
-        gang  : __attribute__((oacc function (1 0, 1 0, 1 0)))
-
-        If we take f.i. the oacc function attribute of the worker routine
-        (0 1, 1 0, 1 0), then:
-        - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
-          meaning: worker routine, that is:
-          - can't contain gang loop (0),
-          - can contain worker loop (1),
-          - can contain vector loop (1).
-        - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
-        dimensions: gang: 1, worker: 0, vector: 0.
-
-        OTOH, routines in Fortran arrive here with these attributes:
-        seq   : __attribute__((oacc function (0 0, 0 0, 0 0)))
-        vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
-        worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
-        gang  : __attribute__((oacc function (1 0, 1 0, 1 0)))
-        that is, the same as for C but with the dimensions set to 0.
-
-        This is due to a bug in the Fortran front-end: PR72741.  Work around
-        this bug by forcing the dimensions to be the same in Fortran as for C,
-        to be able to handle C and Fortran routines uniformly in this
-        function.  */
-      dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
-      dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
-      dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
-      changed = true;
-    }
-
-  /* The vector size must be 32, unless this is a SEQ routine.  */
-  if ((offload_region_p || oacc_default_dims_p
-       || (routine_p && !routine_seq_p))
-      && dims[GOMP_DIM_VECTOR] >= 0
-      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
-    {
-      if ((offload_region_p || oacc_default_dims_p)
-         && dims[GOMP_DIM_VECTOR] >= 0)
-       warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
-                   dims[GOMP_DIM_VECTOR]
-                   ? G_("using vector_length (%d), ignoring %d")
-                   : G_("using vector_length (%d), ignoring runtime setting"),
-                   PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
-      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
-      changed = true;
-    }
-
-  /* Check the num workers is not too large.  */
-  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
+      if (!routine_seq_p)
+       dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+      return;
+    }
+
+  if (oacc_default_dims_p)
      {
-      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
-                 "using num_workers (%d), ignoring %d",
-                 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
-      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
-      changed = true;
+      /* -1  : not set
+         0  : set at runtime, f.i. -fopenacc-dims=-
+         >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
+      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
+      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
+      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
+
+      /* But -fopenacc-dims=- is not yet supported on trunk.  */
+      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
+      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
+      gcc_assert (dims[GOMP_DIM_GANG] != 0);
      }
  
-  if (oacc_default_dims_p || oacc_min_dims_p)
+  if (offload_region_p)
      {
-      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
+      /* -1   : not set
+         0   : set using variable, f.i. num_gangs (n)
+         >= 1: set using constant, f.i. num_gangs (1).  */
+      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
+      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
+      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
+    }
+
+  if (offload_region_p)
+    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+  else
+    /* oacc_default_dims_p.  */
+    default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
+
+  int old_dims[GOMP_DIM_MAX];
+  unsigned int i;
+  for (i = 0; i < GOMP_DIM_MAX; ++i)
+    old_dims[i] = dims[i];
+
+  const char *vector_reason = NULL;
+  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
+    {
+      default_vector_length = PTX_WARP_SIZE;
+
+      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+       {
+         vector_reason = G_("using vector_length (%d) due to call to"
+                            " vector-partitionable routine, ignoring %d");
+         dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+       }
+    }
+
+  if (dims[GOMP_DIM_VECTOR] == 0)
+    {
+      vector_reason = G_("using vector_length (%d), ignoring runtime setting");
+      dims[GOMP_DIM_VECTOR] = default_vector_length;
+    }
+
+  if (dims[GOMP_DIM_VECTOR] > 0
+      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
+    dims[GOMP_DIM_VECTOR] = default_vector_length;
+
+  nvptx_apply_dim_limits (dims);
+
+  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
+    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+               vector_reason != NULL
+               ? vector_reason
+               : G_("using vector_length (%d), ignoring %d"),
+               dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
+
+  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
+    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+               G_("using num_workers (%d), ignoring %d"),
+               dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
+
+  if (oacc_default_dims_p)
+    {
+      if (dims[GOMP_DIM_VECTOR] < 0)
+       dims[GOMP_DIM_VECTOR] = default_vector_length;
        if (dims[GOMP_DIM_WORKER] < 0)
         dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
        if (dims[GOMP_DIM_GANG] < 0)
         dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
-      changed = true;
+      nvptx_apply_dim_limits (dims);
+    }
+
+  if (offload_region_p)
+    {
+      for (i = 0; i < GOMP_DIM_MAX; i++)
+       {
+         if (!(dims[i] < 0))
+           continue;
+
+         if ((used & GOMP_DIM_MASK (i)) == 0)
+           /* Function oacc_validate_dims will apply the minimal dimension.  */
+           continue;
+
+         dims[i] = (i == GOMP_DIM_VECTOR
+                    ? default_vector_length
+                    : oacc_get_default_dim (i));
+       }
+
+      nvptx_apply_dim_limits (dims);
      }
+}
+
+/* Validate compute dimensions of an OpenACC offload or routine, fill
+   in non-unity defaults.  FN_LEVEL indicates the level at which a
+   routine might spawn a loop.  It is negative for non-routines.  If
+   DECL is null, we are validating the default dimensions.  */
+
+static bool
+nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
+{
+  int old_dims[GOMP_DIM_MAX];
+  unsigned int i;
+
+  for (i = 0; i < GOMP_DIM_MAX; ++i)
+    old_dims[i] = dims[i];
+
+  nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
  
-  return changed;
+  gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
+  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
+    gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
+
+  for (i = 0; i < GOMP_DIM_MAX; ++i)
+    if (old_dims[i] != dims[i])
+      return true;
+
+  return false;
  }
  
  /* Return maximum dimension size, or zero for unbounded.  */
@@ -5296,7 +5734,7 @@ nvptx_dim_limit (int axis)
    switch (axis)
      {
      case GOMP_DIM_VECTOR:
-      return PTX_VECTOR_LENGTH;
+      return PTX_MAX_VECTOR_LENGTH;
  
      default:
        break;
@@ -5329,10 +5767,13 @@ nvptx_goacc_fork_join (gcall *call, const int dims[],
     data at that location.  */
  
  static tree
-nvptx_get_worker_red_addr (tree type, tree offset)
+nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
  {
+  enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
+  if (vector)
+    addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
    machine_mode mode = TYPE_MODE (type);
-  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
+  tree fndecl = nvptx_builtin_decl (addr_dim, true);
    tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
    tree align = build_int_cst (unsigned_type_node,
                               GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
@@ -5648,7 +6089,7 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
  /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
  
  static void
-nvptx_goacc_reduction_setup (gcall *call)
+nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
  {
    gimple_stmt_iterator gsi = gsi_for_stmt (call);
    tree lhs = gimple_call_lhs (call);
@@ -5667,11 +6108,13 @@ nvptx_goacc_reduction_setup (gcall *call)
         var = build_simple_mem_ref (ref_to_res);
      }
    
-  if (level == GOMP_DIM_WORKER)
+  if (level == GOMP_DIM_WORKER
+      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
      {
        /* Store incoming value to worker reduction buffer.  */
        tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+                                            level == GOMP_DIM_VECTOR);
        tree ptr = make_ssa_name (TREE_TYPE (call));
  
        gimplify_assign (ptr, call, &seq);
@@ -5690,7 +6133,7 @@ nvptx_goacc_reduction_setup (gcall *call)
  /* NVPTX implementation of GOACC_REDUCTION_INIT. */
  
  static void
-nvptx_goacc_reduction_init (gcall *call)
+nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
  {
    gimple_stmt_iterator gsi = gsi_for_stmt (call);
    tree lhs = gimple_call_lhs (call);
@@ -5704,7 +6147,7 @@ nvptx_goacc_reduction_init (gcall *call)
    
    push_gimplify_context (true);
  
-  if (level == GOMP_DIM_VECTOR)
+  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
      {
        /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
        tree tid = make_ssa_name (integer_type_node);
@@ -5764,7 +6207,8 @@ nvptx_goacc_reduction_init (gcall *call)
             init = var;
         }
  
-      gimplify_assign (lhs, init, &seq);
+      if (lhs != NULL_TREE)
+       gimplify_assign (lhs, init, &seq);
      }
  
    pop_gimplify_context (NULL);
@@ -5774,7 +6218,7 @@ nvptx_goacc_reduction_init (gcall *call)
  /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
  
  static void
-nvptx_goacc_reduction_fini (gcall *call)
+nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
  {
    gimple_stmt_iterator gsi = gsi_for_stmt (call);
    tree lhs = gimple_call_lhs (call);
@@ -5788,12 +6232,12 @@ nvptx_goacc_reduction_fini (gcall *call)
  
    push_gimplify_context (true);
  
-  if (level == GOMP_DIM_VECTOR)
+  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
      {
        /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
          but that requires a method of emitting a unified jump at the
          gimple level.  */
-      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
+      for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
         {
           tree other_var = make_ssa_name (TREE_TYPE (var));
           nvptx_generate_vector_shuffle (gimple_location (call),
@@ -5809,11 +6253,12 @@ nvptx_goacc_reduction_fini (gcall *call)
      {
        tree accum = NULL_TREE;
  
-      if (level == GOMP_DIM_WORKER)
+      if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
         {
           /* Get reduction buffer address.  */
           tree offset = gimple_call_arg (call, 5);
-         tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+         tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+                                                level == GOMP_DIM_VECTOR);
           tree ptr = make_ssa_name (TREE_TYPE (call));
  
           gimplify_assign (ptr, call, &seq);
@@ -5844,7 +6289,7 @@ nvptx_goacc_reduction_fini (gcall *call)
  /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
  
  static void
-nvptx_goacc_reduction_teardown (gcall *call)
+nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
  {
    gimple_stmt_iterator gsi = gsi_for_stmt (call);
    tree lhs = gimple_call_lhs (call);
@@ -5853,11 +6298,13 @@ nvptx_goacc_reduction_teardown (gcall *call)
    gimple_seq seq = NULL;
    
    push_gimplify_context (true);
-  if (level == GOMP_DIM_WORKER)
+  if (level == GOMP_DIM_WORKER
+      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
      {
        /* Read the worker reduction buffer.  */
        tree offset = gimple_call_arg (call, 5);
-      tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
+      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+                                            level == GOMP_DIM_VECTOR);
        tree ptr = make_ssa_name (TREE_TYPE (call));
  
        gimplify_assign (ptr, call, &seq);
@@ -5888,23 +6335,26 @@ static void
  nvptx_goacc_reduction (gcall *call)
  {
    unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+  offload_attrs oa;
+
+  populate_offload_attrs (&oa);
  
    switch (code)
      {
      case IFN_GOACC_REDUCTION_SETUP:
-      nvptx_goacc_reduction_setup (call);
+      nvptx_goacc_reduction_setup (call, &oa);
        break;
  
      case IFN_GOACC_REDUCTION_INIT:
-      nvptx_goacc_reduction_init (call);
+      nvptx_goacc_reduction_init (call, &oa);
        break;
  
      case IFN_GOACC_REDUCTION_FINI:
-      nvptx_goacc_reduction_fini (call);
+      nvptx_goacc_reduction_fini (call, &oa);
        break;
  
      case IFN_GOACC_REDUCTION_TEARDOWN:
-      nvptx_goacc_reduction_teardown (call);
+      nvptx_goacc_reduction_teardown (call, &oa);
        break;
  
      default:
@@ -5980,6 +6430,19 @@ nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
    return false;
  }
  
+static GTY(()) tree nvptx_previous_fndecl;
+
+static void
+nvptx_set_current_function (tree fndecl)
+{
+  if (!fndecl || fndecl == nvptx_previous_fndecl)
+    return;
+
+  nvptx_previous_fndecl = fndecl;
+  vector_red_partition = 0;
+  oacc_bcast_partition = 0;
+}
+
  #undef TARGET_OPTION_OVERRIDE
  #define TARGET_OPTION_OVERRIDE nvptx_option_override
  
@@ -6116,6 +6579,9 @@ nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
  #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
  #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
  
+#undef TARGET_SET_CURRENT_FUNCTION
+#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  
  #include "gt-nvptx.h"