/* Target code for NVPTX.
- Copyright (C) 2014-2018 Free Software Foundation, Inc.
+ Copyright (C) 2014-2019 Free Software Foundation, Inc.
Contributed by Bernd Schmidt <bernds@codesourcery.com>
This file is part of GCC.
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
+#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1
+/* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept
+ thread block, which has had a maximum of 1024 threads since CUDA version
+ 2.x. */
+#define PTX_CTA_SIZE 1024
+
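+/* The PTX ISA provides 16 barriers per CTA, numbered 0 to 15. */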
+#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32
-#define PTX_VECTOR_LENGTH 32
+
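+/* Barrier 0 is reserved for CTA-wide synchronization; the remaining
+ barriers are allocated one per worker, to synchronize the threads of a
+ single worker when vector_length > PTX_WARP_SIZE. */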
+#define PTX_PER_CTA_BARRIER 0
+#define PTX_NUM_PER_CTA_BARRIERS 1
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
+
+#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
+#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
-/* Buffer needed to broadcast across workers. This is used for both
- worker-neutering and worker broadcasting. It is shared by all
- functions emitted. The buffer is placed in shared memory. It'd be
- nice if PTX supported common blocks, because then this could be
- shared across TUs (taking the largest size). */
-static unsigned worker_bcast_size;
-static unsigned worker_bcast_align;
-static GTY(()) rtx worker_bcast_sym;
+/* Buffer needed to broadcast across workers and vectors. This is
+ used for both worker-neutering and worker broadcasting, and for
+ vector-neutering and vector broadcasting when vector_length > 32.
+ It is shared by all functions emitted. The buffer is placed in
+ shared memory. It'd be nice if PTX supported common blocks, because
+ then this could be shared across TUs (taking the largest size). */
+static unsigned oacc_bcast_size;
+static unsigned oacc_bcast_partition;
+static unsigned oacc_bcast_align;
+static GTY(()) rtx oacc_bcast_sym;
/* Buffer needed for worker reductions. This has to be distinct from
the worker broadcast array, as both may be live concurrently. */
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;
+/* Buffer needed for vector reductions, when vector_length >
+ PTX_WARP_SIZE. This has to be distinct from the worker broadcast
+ array, as both may be live concurrently. */
+static unsigned vector_red_size;
+static unsigned vector_red_align;
+static unsigned vector_red_partition;
+static GTY(()) rtx vector_red_sym;
+
/* Global lock variable, needed for 128bit worker & gang reductions. */
static GTY(()) tree global_lock_var;
/* True if any function references __nvptx_uni. */
static bool need_unisimt_decl;
+static int nvptx_mach_max_workers ();
+
/* Allocate a new, cleared machine_function structure. */
static struct machine_function *
diagnose_openacc_conflict (bool optval, const char *optname)
{
if (flag_openacc && optval)
- error ("option %s is not supported together with -fopenacc", optname);
+ error ("option %s is not supported together with %<-fopenacc%>", optname);
}
/* Implement TARGET_OPTION_OVERRIDE. */
declared_libfuncs_htab
= hash_table<declared_libfunc_hasher>::create_ggc (17);
- worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
- SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
- worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+ oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
+ SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
+ oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+ oacc_bcast_partition = 0;
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+ vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
+ SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
+ vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+ vector_red_partition = 0;
+
diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
{
fprintf (file, "\t{\n");
fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
+ if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+ {
+ fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
+ fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+ }
fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
+ if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
+ {
+ fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
+ fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
+ fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
+ "// vector reduction buffer\n",
+ REGNO (cfun->machine->red_partition),
+ vector_red_partition);
+ }
+ /* Verify vector_red_size. */
+ gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
+ <= vector_red_size);
+ fprintf (file, "\t}\n");
+}
+
+/* Emit code to initialize OpenACC worker broadcast and synchronization
+ registers. */
+
+static void
+nvptx_init_oacc_workers (FILE *file)
+{
+ fprintf (file, "\t{\n");
+ fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
+ if (cfun->machine->bcast_partition)
+ {
+ fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
+ fprintf (file, "\t\t.reg.u64\t%%y64;\n");
+ }
+ fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
+ if (cfun->machine->bcast_partition)
+ {
+ fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
+ fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
+ fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
+ fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
+ "// vector broadcast offset\n",
+ REGNO (cfun->machine->bcast_partition),
+ oacc_bcast_partition);
+ }
+ /* Verify oacc_bcast_size. */
+ gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
+ <= oacc_bcast_size);
+ if (cfun->machine->sync_bar)
+ fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
+ "// vector synchronization barrier\n",
+ REGNO (cfun->machine->sync_bar));
fprintf (file, "\t}\n");
}
fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
}
+
+ /* Restore the vector reduction partition register, if necessary.
+ FIXME: Find out when and why this is necessary, and fix it. */
+ if (cfun->machine->red_partition)
+ regno_reg_rtx[REGNO (cfun->machine->red_partition)]
+ = cfun->machine->red_partition;
+
/* Declare the pseudos we have as ptx registers. */
int maxregs = max_reg_num ();
for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
if (cfun->machine->unisimt_predicate
|| (cfun->machine->has_simtreg && !crtl->is_leaf))
nvptx_init_unisimt_predicate (file);
+ if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
+ nvptx_init_oacc_workers (file);
}
/* Output code for switching uniform-simt state. ENTERING indicates whether
fputs (";\n", file);
if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
fprintf (file,
- "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
+ "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
bits, regno, regno, UINTVAL (align));
}
if (cfun->machine->has_softstack)
across the vectors of a single warp. */
static rtx
-nvptx_gen_vcast (rtx reg)
+nvptx_gen_warp_bcast (rtx reg)
{
return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
}
/* Structure used when generating a worker-level spill or fill. */
-struct wcast_data_t
+struct broadcast_data_t
{
rtx base; /* Register holding base addr of buffer. */
rtx ptr; /* Iteration var, if needed. */
how many loop iterations will be executed (0 for not a loop). */
static rtx
-nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
+nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
+ broadcast_data_t *data, bool vector)
{
rtx res;
machine_mode mode = GET_MODE (reg);
start_sequence ();
if (pm & PM_read)
emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
- emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
+ emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
if (pm & PM_write)
emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
res = get_insns ();
{
unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
- if (align > worker_bcast_align)
- worker_bcast_align = align;
- data->offset = (data->offset + align - 1) & ~(align - 1);
+ oacc_bcast_align = MAX (oacc_bcast_align, align);
+ data->offset = ROUND_UP (data->offset, align);
addr = data->base;
+ gcc_assert (data->base != NULL);
if (data->offset)
addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
}
{
val >>= part * BITS_PER_UNIT;
part = init_frag.size - init_frag.offset;
- if (part > size)
- part = size;
+ part = MIN (part, size);
unsigned HOST_WIDE_INT partial
= val << (init_frag.offset * BITS_PER_UNIT);
if (init_frag.offset)
{
unsigned part = init_frag.size - init_frag.offset;
- if (part > size)
- part = (unsigned) size;
+ part = MIN (part, (unsigned) size);
size -= part;
nvptx_assemble_value (0, part);
}
}
}
+/* Offloading function attributes. */
+
+struct offload_attrs
+{
+ unsigned mask;
+ int num_gangs;
+ int num_workers;
+ int vector_length;
+};
+
+/* Define entries for cfun->machine->axis_dim. */
+
+#define MACH_VECTOR_LENGTH 0
+#define MACH_MAX_WORKERS 1
+
+static void populate_offload_attrs (offload_attrs *oa);
+
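+/* Initialize cfun->machine->axis_dim with the vector length and the
+ maximum number of workers of the current function, derived from its
+ launch dimensions. */
+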
+static void
+init_axis_dim (void)
+{
+ offload_attrs oa;
+ int max_workers;
+
+ populate_offload_attrs (&oa);
+
+ if (oa.num_workers == 0)
+ max_workers = PTX_CTA_SIZE / oa.vector_length;
+ else
+ max_workers = oa.num_workers;
+
+ cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
+ cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
+ cfun->machine->axis_dim_init_p = true;
+}
+
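+/* Return the maximum number of workers of the current function,
+ initializing axis_dim on first use. */
+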
+static int ATTRIBUTE_UNUSED
+nvptx_mach_max_workers ()
+{
+ if (!cfun->machine->axis_dim_init_p)
+ init_axis_dim ();
+ return cfun->machine->axis_dim[MACH_MAX_WORKERS];
+}
+
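+/* Return the vector length of the current function, initializing
+ axis_dim on first use. */
+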
+static int ATTRIBUTE_UNUSED
+nvptx_mach_vector_length ()
+{
+ if (!cfun->machine->axis_dim_init_p)
+ init_axis_dim ();
+ return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
+}
+
/* Loop structure of the function. The entire function is described as
a NULL loop. */
}
}
+/* Return true if MASK contains parallelism that requires shared
+ memory to broadcast. */
+
+static bool
+nvptx_needs_shared_bcast (unsigned mask)
+{
+ bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+ bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ && nvptx_mach_vector_length () != PTX_WARP_SIZE;
+
+ return worker || large_vector;
+}
+
/* BLOCK is a basic block containing a head or tail instruction.
Locate the associated prehead or pretail instruction, which must be
in the single predecessor block. */
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
- if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ if (nvptx_needs_shared_bcast (mask))
par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
gcc_assert (par->mask == mask);
+ gcc_assert (par->join_block == NULL);
par->join_block = block;
par->join_insn = end;
- if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ if (nvptx_needs_shared_bcast (mask))
par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent;
size_t offset = (dir > 0 ? offsetof (edge_def, dest)
: offsetof (edge_def, src));
edge e;
- edge_iterator (ei);
+ edge_iterator ei;
FOR_EACH_EDGE (e, ei, edges)
{
vec<edge, va_gc> *edges, size_t offset)
{
edge e;
- edge_iterator (ei);
+ edge_iterator ei;
int hi_back = depth;
pseudo_node_t node_back (0, depth);
int hi_child = depth;
regions and (b) only propagating stack entries that are used. The
latter might be quite hard to determine. */
-typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
+typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
- propagate_mask rw, propagator_fn fn, void *data)
+ propagate_mask rw, propagator_fn fn, void *data, bool vector)
{
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
/* Allow worker function to initialize anything needed. */
- rtx init = fn (tmp, PM_loop_begin, fs, data);
+ rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
if (init)
emit_insn (init);
emit_label (label);
}
if (rw & PM_read)
emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
- emit_insn (fn (tmp, rw, fs, data));
+ emit_insn (fn (tmp, rw, fs, data, vector));
if (rw & PM_write)
emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
if (fs)
emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
emit_insn (gen_br_true_uni (pred, label));
- rtx fini = fn (tmp, PM_loop_end, fs, data);
+ rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
if (fini)
emit_insn (fini);
emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{
- rtx bcast = fn (reg, rw, 0, data);
+ rtx bcast = fn (reg, rw, 0, data, vector);
insn = emit_insn_after (bcast, insn);
empty = false;
return empty;
}
-/* Worker for nvptx_vpropagate. */
+/* Worker for nvptx_warp_propagate. */
static rtx
-vprop_gen (rtx reg, propagate_mask pm,
- unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
+warp_prop_gen (rtx reg, propagate_mask pm,
+ unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
+ bool ARG_UNUSED (vector))
{
if (!(pm & PM_read_write))
return 0;
- return nvptx_gen_vcast (reg);
+ return nvptx_gen_warp_bcast (reg);
}
/* Propagate state that is live at start of BLOCK across the vectors
IS_CALL and return as for nvptx_propagate. */
static bool
-nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
+nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
{
- return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
+ return nvptx_propagate (is_call, block, insn, PM_read_write,
+ warp_prop_gen, 0, false);
}
-/* Worker for nvptx_wpropagate. */
+/* Worker for nvptx_shared_propagate. */
static rtx
-wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
+shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
+ bool vector)
{
- wcast_data_t *data = (wcast_data_t *)data_;
+ broadcast_data_t *data = (broadcast_data_t *)data_;
if (pm & PM_loop_begin)
{
/* Starting a loop, initialize pointer. */
unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
- if (align > worker_bcast_align)
- worker_bcast_align = align;
- data->offset = (data->offset + align - 1) & ~(align - 1);
+ oacc_bcast_align = MAX (oacc_bcast_align, align);
+ data->offset = ROUND_UP (data->offset, align);
data->ptr = gen_reg_rtx (Pmode);
return clobber;
}
else
- return nvptx_gen_wcast (reg, pm, rep, data);
+ return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
}
/* Spill or fill live state that is live at start of BLOCK. PRE_P
INSN. IS_CALL and return as for nvptx_propagate. */
static bool
-nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
+nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
+ rtx_insn *insn, bool vector)
{
- wcast_data_t data;
+ broadcast_data_t data;
data.base = gen_reg_rtx (Pmode);
data.offset = 0;
data.ptr = NULL_RTX;
bool empty = nvptx_propagate (is_call, block, insn,
- pre_p ? PM_read : PM_write, wprop_gen, &data);
+ pre_p ? PM_read : PM_write, shared_prop_gen,
+ &data, vector);
gcc_assert (empty == !data.offset);
if (data.offset)
{
+ rtx bcast_sym = oacc_bcast_sym;
+
/* Stuff was emitted, initialize the base pointer now. */
- rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
+ if (vector && nvptx_mach_max_workers () > 1)
+ {
+ if (!cfun->machine->bcast_partition)
+ {
+ /* It would be nice to place this register in
+ DATA_AREA_SHARED. */
+ cfun->machine->bcast_partition = gen_reg_rtx (DImode);
+ }
+ if (!cfun->machine->sync_bar)
+ cfun->machine->sync_bar = gen_reg_rtx (SImode);
+
+ bcast_sym = cfun->machine->bcast_partition;
+ }
+
+ rtx init = gen_rtx_SET (data.base, bcast_sym);
emit_insn_after (init, insn);
- if (worker_bcast_size < data.offset)
- worker_bcast_size = data.offset;
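+ /* When vector_length > warp size, the broadcast buffer is split into
+ one partition per worker plus one for worker-level broadcasts;
+ otherwise a single partition suffices. */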
+ unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
+ unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+ ? nvptx_mach_max_workers () + 1
+ : 1);
+
+ oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+ oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
}
return empty;
}
-/* Emit a worker-level synchronization barrier. We use different
- markers for before and after synchronizations. */
+/* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
+ which is an integer or a register. THREADS is the number of threads
+ controlled by the barrier. */
static rtx
-nvptx_wsync (bool after)
+nvptx_cta_sync (rtx lock, int threads)
{
- return gen_nvptx_barsync (GEN_INT (after));
+ return gen_nvptx_barsync (lock, GEN_INT (threads));
}
#if WORKAROUND_PTXJIT_BUG
{
rtx_code_label *label = gen_label_rtx ();
rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
- rtx_insn **mode_jump = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
- rtx_insn **mode_label = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
+ rtx_insn **mode_jump
+ = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
+ rtx_insn **mode_label
+ = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
if (!pred)
{
emit_insn_after (gen_exit (), label_insn);
}
- if (mode == GOMP_DIM_VECTOR)
- vector_label = label_insn;
- else
- worker_label = label_insn;
+ *mode_label = label_insn;
}
/* Now deal with propagating the branch condition. */
{
rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
- if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
+ if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
+ && nvptx_mach_vector_length () == PTX_WARP_SIZE)
{
/* Vector mode only, do a shuffle. */
#if WORKAROUND_PTXJIT_BUG
emit_insn_before (gen_rtx_SET (tmp, pvar), label);
emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
#endif
- emit_insn_before (nvptx_gen_vcast (pvar), tail);
+ emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
}
else
{
/* Includes worker mode, do spill & fill. By construction
we should never have worker mode only. */
- wcast_data_t data;
-
- data.base = worker_bcast_sym;
+ broadcast_data_t data;
+ unsigned size = GET_MODE_SIZE (SImode);
+ bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask);
+ bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask);
+ rtx barrier = GEN_INT (0);
+ int threads = 0;
+
+ data.base = oacc_bcast_sym;
data.ptr = 0;
- if (worker_bcast_size < GET_MODE_SIZE (SImode))
- worker_bcast_size = GET_MODE_SIZE (SImode);
+ bool use_partitioning_p = (vector && !worker
+ && nvptx_mach_max_workers () > 1
+ && cfun->machine->bcast_partition);
+ if (use_partitioning_p)
+ {
+ data.base = cfun->machine->bcast_partition;
+ barrier = cfun->machine->sync_bar;
+ threads = nvptx_mach_vector_length ();
+ }
+ gcc_assert (data.base != NULL);
+ gcc_assert (barrier);
+
+ unsigned int psize = ROUND_UP (size, oacc_bcast_align);
+ unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
+ ? nvptx_mach_max_workers () + 1
+ : 1);
+
+ oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
+ oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
data.offset = 0;
- emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
+ emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
+ vector),
before);
+
/* Barrier so other workers can see the write. */
- emit_insn_before (nvptx_wsync (false), tail);
+ emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
data.offset = 0;
- emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
+ emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
+ vector),
+ tail);
/* This barrier is needed to avoid worker zero clobbering
the broadcast buffer before all the other workers have
had a chance to read this instance of it. */
- emit_insn_before (nvptx_wsync (true), tail);
+ emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
}
extract_insn (tail);
}
bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+ bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
+ bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+ && nvptx_mach_vector_length () > PTX_WARP_SIZE);
+
+ if (worker || large_vector)
+ {
+ nvptx_shared_propagate (false, is_call, par->forked_block,
+ par->forked_insn, !worker);
+ bool no_prop_p
+ = nvptx_shared_propagate (true, is_call, par->forked_block,
+ par->fork_insn, !worker);
+ bool empty_loop_p
+ = !is_call && (NEXT_INSN (par->forked_insn)
+ && NEXT_INSN (par->forked_insn) == par->joining_insn);
+ rtx barrier = GEN_INT (0);
+ int threads = 0;
+
+ if (!worker && cfun->machine->sync_bar)
+ {
+ barrier = cfun->machine->sync_bar;
+ threads = nvptx_mach_vector_length ();
+ }
- if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
- {
- nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
- bool empty = nvptx_wpropagate (true, is_call,
- par->forked_block, par->fork_insn);
-
- if (!empty || !is_call)
+ if (no_prop_p && empty_loop_p)
+ ;
+ else if (no_prop_p && is_call)
+ ;
+ else
{
/* Insert begin and end synchronizations. */
- emit_insn_before (nvptx_wsync (false), par->forked_insn);
- emit_insn_before (nvptx_wsync (true), par->join_insn);
+ emit_insn_before (nvptx_cta_sync (barrier, threads),
+ par->forked_insn);
+ emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
}
}
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
- nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
+ nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
/* Now do siblings. */
if (par->next)
nvptx_neuter_pars (par->next, modes, outer);
}
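+/* Populate OA with the partitioning mask and the launch dimensions
+ recorded in the "oacc function" attribute of the current function. */
+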
+static void
+populate_offload_attrs (offload_attrs *oa)
+{
+ tree attr = oacc_get_fn_attrib (current_function_decl);
+ tree dims = TREE_VALUE (attr);
+ unsigned ix;
+
+ oa->mask = 0;
+
+ for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ {
+ tree t = TREE_VALUE (dims);
+ int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
+ tree allowed = TREE_PURPOSE (dims);
+
+ if (size != 1 && !(allowed && integer_zerop (allowed)))
+ oa->mask |= GOMP_DIM_MASK (ix);
+
+ switch (ix)
+ {
+ case GOMP_DIM_GANG:
+ oa->num_gangs = size;
+ break;
+
+ case GOMP_DIM_WORKER:
+ oa->num_workers = size;
+ break;
+
+ case GOMP_DIM_VECTOR:
+ oa->vector_length = size;
+ break;
+ }
+ }
+}
+
#if WORKAROUND_PTXJIT_BUG_2
/* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
is needed in the nvptx target because the branches generated for
{
/* If we determined this mask before RTL expansion, we could
elide emission of some levels of forks and joins. */
- unsigned mask = 0;
- tree dims = TREE_VALUE (attr);
- unsigned ix;
+ offload_attrs oa;
- for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
- {
- int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
- tree allowed = TREE_PURPOSE (dims);
+ populate_offload_attrs (&oa);
- if (size != 1 && !(allowed && integer_zerop (allowed)))
- mask |= GOMP_DIM_MASK (ix);
- }
/* If there is worker neutering, there must be vector
neutering. Otherwise the hardware will fail. */
- gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
- || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+ gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
/* Discover & process partitioned regions. */
parallel *pars = nvptx_discover_pars (&bb_insn_map);
nvptx_process_pars (pars);
- nvptx_neuter_pars (pars, mask, 0);
+ nvptx_neuter_pars (pars, oa.mask, 0);
delete pars;
}
fputs ("// END PREAMBLE\n", asm_out_file);
}
-/* Emit a declaration for a worker-level buffer in .shared memory. */
+/* Emit a declaration for a worker- and vector-level buffer in .shared
+ memory. */
static void
-write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
+write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
const char *name = XSTR (sym, 0);
nvptx_record_fndecl (decl);
fputs (func_decls.str().c_str(), asm_out_file);
- if (worker_bcast_size)
- write_worker_buffer (asm_out_file, worker_bcast_sym,
- worker_bcast_align, worker_bcast_size);
+ if (oacc_bcast_size)
+ write_shared_buffer (asm_out_file, oacc_bcast_sym,
+ oacc_bcast_align, oacc_bcast_size);
if (worker_red_size)
- write_worker_buffer (asm_out_file, worker_red_sym,
+ write_shared_buffer (asm_out_file, worker_red_sym,
worker_red_align, worker_red_size);
+ if (vector_red_size)
+ write_shared_buffer (asm_out_file, vector_red_sym,
+ vector_red_align, vector_red_size);
+
if (need_softstack_decl)
{
write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
return target;
}
-/* Worker reduction address expander. */
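+/* Output the assembly of a nvptx_red_partition instruction: set DST to
+ the address OFFSET bytes into this worker's partition of the vector
+ reduction buffer, whose base is held in cfun->machine->red_partition. */
+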
+const char *
+nvptx_output_red_partition (rtx dst, rtx offset)
+{
+ const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
+ const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
+
+ if (offset == const0_rtx)
+ fprintf (asm_out_file, zero_offset, REGNO (dst),
+ REGNO (cfun->machine->red_partition));
+ else
+ fprintf (asm_out_file, with_offset, REGNO (dst),
+ REGNO (cfun->machine->red_partition), (int) UINTVAL (offset));
+
+ return "";
+}
+
+/* Shared-memory reduction address expander. */
static rtx
-nvptx_expand_worker_addr (tree exp, rtx target,
- machine_mode ARG_UNUSED (mode), int ignore)
+nvptx_expand_shared_addr (tree exp, rtx target,
+ machine_mode ARG_UNUSED (mode), int ignore,
+ int vector)
{
if (ignore)
return target;
unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
- if (align > worker_red_align)
- worker_red_align = align;
-
unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
- if (size + offset > worker_red_size)
- worker_red_size = size + offset;
-
rtx addr = worker_red_sym;
- if (offset)
+
+ if (vector)
{
- addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
- addr = gen_rtx_CONST (Pmode, addr);
+ offload_attrs oa;
+
+ populate_offload_attrs (&oa);
+
+ unsigned int psize = ROUND_UP (size + offset, align);
+ unsigned int pnum = nvptx_mach_max_workers ();
+ vector_red_partition = MAX (vector_red_partition, psize);
+ vector_red_size = MAX (vector_red_size, psize * pnum);
+ vector_red_align = MAX (vector_red_align, align);
+
+ if (cfun->machine->red_partition == NULL)
+ cfun->machine->red_partition = gen_reg_rtx (Pmode);
+
+ addr = gen_reg_rtx (Pmode);
+ emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
}
+ else
+ {
+ worker_red_align = MAX (worker_red_align, align);
+ worker_red_size = MAX (worker_red_size, size + offset);
- emit_move_insn (target, addr);
+ if (offset)
+ {
+ addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
+ addr = gen_rtx_CONST (Pmode, addr);
+ }
+ }
+ emit_move_insn (target, addr);
return target;
}
NVPTX_BUILTIN_SHUFFLE,
NVPTX_BUILTIN_SHUFFLELL,
NVPTX_BUILTIN_WORKER_ADDR,
+ NVPTX_BUILTIN_VECTOR_ADDR,
NVPTX_BUILTIN_CMP_SWAP,
NVPTX_BUILTIN_CMP_SWAPLL,
NVPTX_BUILTIN_MAX
DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
DEF (WORKER_ADDR, "worker_addr",
(PTRVOID, ST, UINT, UINT, NULL_TREE));
+ DEF (VECTOR_ADDR, "vector_addr",
+ (PTRVOID, ST, UINT, UINT, NULL_TREE));
DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
return nvptx_expand_shuffle (exp, target, mode, ignore);
case NVPTX_BUILTIN_WORKER_ADDR:
- return nvptx_expand_worker_addr (exp, target, mode, ignore);
+ return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
+
+ case NVPTX_BUILTIN_VECTOR_ADDR:
+ return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
case NVPTX_BUILTIN_CMP_SWAP:
case NVPTX_BUILTIN_CMP_SWAPLL:
return PTX_WARP_SIZE;
}
-/* Validate compute dimensions of an OpenACC offload or routine, fill
- in non-unity defaults. FN_LEVEL indicates the level at which a
- routine might spawn a loop. It is negative for non-routines. If
- DECL is null, we are validating the default dimensions. */
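+/* Return true if vector length L is well-formed for this target, i.e. a
+ positive multiple of the warp size. */
+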
+static bool
+nvptx_welformed_vector_length_p (int l)
+{
+ gcc_assert (l > 0);
+ return l % PTX_WARP_SIZE == 0;
+}
+
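+/* Clamp the per-axis launch dimensions DIMS to the limits of the
+ target: the maximum vector length and worker count, the CTA size,
+ and the number of available per-worker barriers. */
+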
+static void
+nvptx_apply_dim_limits (int dims[])
+{
+ /* Check that the vector_length is not too large. */
+ if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
+ dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
+
+ /* Check that the number of workers is not too large. */
+ if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
+ dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
+
+ /* Ensure that num_worker * vector_length <= cta size. */
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
+ && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+ /* If we need a per-worker barrier ... */
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
+ && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+ /* Don't use more barriers than available. */
+ dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+ PTX_NUM_PER_WORKER_BARRIERS);
+}
+
+/* Return true if FNDECL contains calls to vector-partitionable routines. */
static bool
-nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
+has_vector_partitionable_routine_calls_p (tree fndecl)
+{
+ if (!fndecl)
+ return false;
+
+ basic_block bb;
+ FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
+ for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
+ gsi_next_nondebug (&i))
+ {
+ gimple *stmt = gsi_stmt (i);
+ if (gimple_code (stmt) != GIMPLE_CALL)
+ continue;
+
+ tree callee = gimple_call_fndecl (stmt);
+ if (!callee)
+ continue;
+
+ tree attrs = oacc_get_fn_attrib (callee);
+ if (attrs == NULL_TREE)
+ return false;
+
+ int partition_level = oacc_fn_attrib_level (attrs);
+ bool seq_routine_p = partition_level == GOMP_DIM_MAX;
+ if (!seq_routine_p)
+ return true;
+ }
+
+ return false;
+}
+
+/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
+ DIMS has changed. */
+
+static void
+nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
{
- bool changed = false;
bool oacc_default_dims_p = false;
bool oacc_min_dims_p = false;
bool offload_region_p = false;
bool routine_p = false;
bool routine_seq_p = false;
+ int default_vector_length = -1;
if (decl == NULL_TREE)
{
else
gcc_unreachable ();
+ if (oacc_min_dims_p)
+ {
+ gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
+ gcc_assert (dims[GOMP_DIM_WORKER] == 1);
+ gcc_assert (dims[GOMP_DIM_GANG] == 1);
+
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+ return;
+ }
+
if (routine_p)
{
- /* OpenACC routines in C arrive here with the following attributes
- (omitting the 'omp declare target'):
- seq : __attribute__((oacc function (0 1, 0 1, 0 1)))
- vector: __attribute__((oacc function (0 1, 0 1, 1 0)))
- worker: __attribute__((oacc function (0 1, 1 0, 1 0)))
- gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
-
- If we take f.i. the oacc function attribute of the worker routine
- (0 1, 1 0, 1 0), then:
- - the slice (0, 1, 1) is interpreted by oacc_fn_attrib_level as
- meaning: worker routine, that is:
- - can't contain gang loop (0),
- - can contain worker loop (1),
- - can contain vector loop (1).
- - the slice (1, 0, 0) is interpreted by oacc_validate_dims as the
- dimensions: gang: 1, worker: 0, vector: 0.
-
- OTOH, routines in Fortran arrive here with these attributes:
- seq : __attribute__((oacc function (0 0, 0 0, 0 0)))
- vector: __attribute__((oacc function (0 0, 0 0, 1 0)))
- worker: __attribute__((oacc function (0 0, 1 0, 1 0)))
- gang : __attribute__((oacc function (1 0, 1 0, 1 0)))
- that is, the same as for C but with the dimensions set to 0.
-
- This is due to a bug in the Fortran front-end: PR72741. Work around
- this bug by forcing the dimensions to be the same in Fortran as for C,
- to be able to handle C and Fortran routines uniformly in this
- function. */
- dims[GOMP_DIM_VECTOR] = fn_level > GOMP_DIM_VECTOR ? 1 : 0;
- dims[GOMP_DIM_WORKER] = fn_level > GOMP_DIM_WORKER ? 1 : 0;
- dims[GOMP_DIM_GANG] = fn_level > GOMP_DIM_GANG ? 1 : 0;
- changed = true;
- }
-
- /* The vector size must be 32, unless this is a SEQ routine. */
- if ((offload_region_p || oacc_default_dims_p
- || (routine_p && !routine_seq_p))
- && dims[GOMP_DIM_VECTOR] >= 0
- && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
- {
- if ((offload_region_p || oacc_default_dims_p)
- && dims[GOMP_DIM_VECTOR] >= 0)
- warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
- dims[GOMP_DIM_VECTOR]
- ? G_("using vector_length (%d), ignoring %d")
- : G_("using vector_length (%d), ignoring runtime setting"),
- PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
- dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
- changed = true;
- }
-
- /* Check the num workers is not too large. */
- if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
+ if (!routine_seq_p)
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+ return;
+ }
+
+ if (oacc_default_dims_p)
{
- warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
- "using num_workers (%d), ignoring %d",
- PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
- dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
- changed = true;
+ /* -1 : not set
+ 0 : set at runtime, f.i. -fopenacc-dims=-
+ >= 1: set at compile time, f.i. -fopenacc-dims=1. */
+ gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
+ gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
+ gcc_assert (dims[GOMP_DIM_GANG] >= -1);
+
+ /* But -fopenacc-dims=- is not yet supported on trunk. */
+ gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
+ gcc_assert (dims[GOMP_DIM_WORKER] != 0);
+ gcc_assert (dims[GOMP_DIM_GANG] != 0);
}
- if (oacc_default_dims_p || oacc_min_dims_p)
+ if (offload_region_p)
{
- dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
+ /* -1 : not set
+ 0 : set using variable, f.i. num_gangs (n)
+ >= 1: set using constant, f.i. num_gangs (1). */
+ gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
+ gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
+ gcc_assert (dims[GOMP_DIM_GANG] >= -1);
+ }
+
+ if (offload_region_p)
+ default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
+ else
+ /* oacc_default_dims_p. */
+ default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
+
+ int old_dims[GOMP_DIM_MAX];
+ unsigned int i;
+ for (i = 0; i < GOMP_DIM_MAX; ++i)
+ old_dims[i] = dims[i];
+
+ const char *vector_reason = NULL;
+ if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
+ {
+ default_vector_length = PTX_WARP_SIZE;
+
+ if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+ {
+ vector_reason = G_("using vector_length (%d) due to call to"
+ " vector-partitionable routine, ignoring %d");
+ dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+ }
+ }
+
+ if (dims[GOMP_DIM_VECTOR] == 0)
+ {
+ vector_reason = G_("using vector_length (%d), ignoring runtime setting");
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
+ }
+
+ if (dims[GOMP_DIM_VECTOR] > 0
+ && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
+
+ nvptx_apply_dim_limits (dims);
+
+ if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+ vector_reason != NULL
+ ? vector_reason
+ : G_("using vector_length (%d), ignoring %d"),
+ dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
+
+ if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
+ warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
+ G_("using num_workers (%d), ignoring %d"),
+ dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
+
+ if (oacc_default_dims_p)
+ {
+ if (dims[GOMP_DIM_VECTOR] < 0)
+ dims[GOMP_DIM_VECTOR] = default_vector_length;
if (dims[GOMP_DIM_WORKER] < 0)
dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
if (dims[GOMP_DIM_GANG] < 0)
dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
- changed = true;
+ nvptx_apply_dim_limits (dims);
+ }
+
+ if (offload_region_p)
+ {
+ for (i = 0; i < GOMP_DIM_MAX; i++)
+ {
+ if (!(dims[i] < 0))
+ continue;
+
+ if ((used & GOMP_DIM_MASK (i)) == 0)
+ /* Function oacc_validate_dims will apply the minimal dimension. */
+ continue;
+
+ dims[i] = (i == GOMP_DIM_VECTOR
+ ? default_vector_length
+ : oacc_get_default_dim (i));
+ }
+
+ nvptx_apply_dim_limits (dims);
}
+}
+
+/* Validate compute dimensions of an OpenACC offload or routine, fill
+ in non-unity defaults. FN_LEVEL indicates the level at which a
+ routine might spawn a loop. It is negative for non-routines. If
+ DECL is null, we are validating the default dimensions. */
+
+static bool
+nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
+{
+ int old_dims[GOMP_DIM_MAX];
+ unsigned int i;
+
+ for (i = 0; i < GOMP_DIM_MAX; ++i)
+ old_dims[i] = dims[i];
+
+ nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
- return changed;
+ gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
+ gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
+
+ for (i = 0; i < GOMP_DIM_MAX; ++i)
+ if (old_dims[i] != dims[i])
+ return true;
+
+ return false;
}
/* Return maximum dimension size, or zero for unbounded. */
switch (axis)
{
case GOMP_DIM_VECTOR:
- return PTX_VECTOR_LENGTH;
+ return PTX_MAX_VECTOR_LENGTH;
default:
break;
data at that location. */
static tree
-nvptx_get_worker_red_addr (tree type, tree offset)
+nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
{
+ enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
+ if (vector)
+ addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
machine_mode mode = TYPE_MODE (type);
- tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
+ tree fndecl = nvptx_builtin_decl (addr_dim, true);
tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
tree align = build_int_cst (unsigned_type_node,
GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
/* NVPTX implementation of GOACC_REDUCTION_SETUP. */
static void
-nvptx_goacc_reduction_setup (gcall *call)
+nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
gimple_stmt_iterator gsi = gsi_for_stmt (call);
tree lhs = gimple_call_lhs (call);
var = build_simple_mem_ref (ref_to_res);
}
- if (level == GOMP_DIM_WORKER)
+ if (level == GOMP_DIM_WORKER
+ || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
{
/* Store incoming value to worker reduction buffer. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+ level == GOMP_DIM_VECTOR);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
/* NVPTX implementation of GOACC_REDUCTION_INIT. */
static void
-nvptx_goacc_reduction_init (gcall *call)
+nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
gimple_stmt_iterator gsi = gsi_for_stmt (call);
tree lhs = gimple_call_lhs (call);
push_gimplify_context (true);
- if (level == GOMP_DIM_VECTOR)
+ if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
{
/* Initialize vector-non-zeroes to INIT_VAL (OP). */
tree tid = make_ssa_name (integer_type_node);
init = var;
}
- gimplify_assign (lhs, init, &seq);
+ if (lhs != NULL_TREE)
+ gimplify_assign (lhs, init, &seq);
}
pop_gimplify_context (NULL);
/* NVPTX implementation of GOACC_REDUCTION_FINI. */
static void
-nvptx_goacc_reduction_fini (gcall *call)
+nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
{
gimple_stmt_iterator gsi = gsi_for_stmt (call);
tree lhs = gimple_call_lhs (call);
push_gimplify_context (true);
- if (level == GOMP_DIM_VECTOR)
+ if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
{
/* Emit binary shuffle tree. TODO. Emit this as an actual loop,
but that requires a method of emitting a unified jump at the
gimple level. */
- for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
+ for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
{
tree other_var = make_ssa_name (TREE_TYPE (var));
nvptx_generate_vector_shuffle (gimple_location (call),
{
tree accum = NULL_TREE;
- if (level == GOMP_DIM_WORKER)
+ if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
{
/* Get reduction buffer address. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+ level == GOMP_DIM_VECTOR);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
static void
-nvptx_goacc_reduction_teardown (gcall *call)
+nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
{
gimple_stmt_iterator gsi = gsi_for_stmt (call);
tree lhs = gimple_call_lhs (call);
gimple_seq seq = NULL;
push_gimplify_context (true);
- if (level == GOMP_DIM_WORKER)
+ if (level == GOMP_DIM_WORKER
+ || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
{
/* Read the worker reduction buffer. */
tree offset = gimple_call_arg (call, 5);
- tree call = nvptx_get_worker_red_addr(TREE_TYPE (var), offset);
+ tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
+ level == GOMP_DIM_VECTOR);
tree ptr = make_ssa_name (TREE_TYPE (call));
gimplify_assign (ptr, call, &seq);
nvptx_goacc_reduction (gcall *call)
{
unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+ offload_attrs oa;
+
+ populate_offload_attrs (&oa);
switch (code)
{
case IFN_GOACC_REDUCTION_SETUP:
- nvptx_goacc_reduction_setup (call);
+ nvptx_goacc_reduction_setup (call, &oa);
break;
case IFN_GOACC_REDUCTION_INIT:
- nvptx_goacc_reduction_init (call);
+ nvptx_goacc_reduction_init (call, &oa);
break;
case IFN_GOACC_REDUCTION_FINI:
- nvptx_goacc_reduction_fini (call);
+ nvptx_goacc_reduction_fini (call, &oa);
break;
case IFN_GOACC_REDUCTION_TEARDOWN:
- nvptx_goacc_reduction_teardown (call);
+ nvptx_goacc_reduction_teardown (call, &oa);
break;
default:
return false;
}
+static GTY(()) tree nvptx_previous_fndecl;
+
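+/* Implement TARGET_SET_CURRENT_FUNCTION. Reset the per-function
+ broadcast and reduction partition sizes when switching functions. */
+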
+static void
+nvptx_set_current_function (tree fndecl)
+{
+ if (!fndecl || fndecl == nvptx_previous_fndecl)
+ return;
+
+ nvptx_previous_fndecl = fndecl;
+ vector_red_partition = 0;
+ oacc_bcast_partition = 0;
+}
+
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override
#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
+#undef TARGET_SET_CURRENT_FUNCTION
+#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-nvptx.h"