-/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
- This file is part of the GNU OpenMP Library (libgomp).
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
/* This file handles the ORDERED construct. */
#include "libgomp.h"
+#include <stdarg.h>
+#include <string.h>
+#include "doacross.h"
/* This function is called when first allocating an iteration block. That
Either way we get correct results.
However, there is an implicit flush on entry to an ordered region,
so we do need to have a barrier here. If we were taking a lock
- this could be MEMMODEL_RELEASE since the acquire would be coverd
+ this could be MEMMODEL_RELEASE since the acquire would be covered
by the lock. */
__atomic_thread_fence (MEMMODEL_ACQ_REL);
However, the current implementation has a flaw in that it does not allow
the next thread into the ORDERED section immediately after the current
- thread exits the ORDERED section in its last iteration. The existance
+ thread exits the ORDERED section in its last iteration. The existence
of this function allows the implementation to change. */
void
GOMP_ordered_end (void)
{
  /* Intentionally empty: per the comment above, the current scheme of
     blocking in GOMP_ordered_start needs no work on exit.  The function
     is kept so the implementation can change without an ABI break.  */
}
+
+/* DOACROSS initialization. */
+
+#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
+
/* Initialize doacross bookkeeping for the current work share.  NCOUNTS is
   the number of ordered loops, COUNTS their iteration counts, CHUNK_SIZE
   the schedule's chunk size, and EXTRA a number of additional bytes to
   allocate for the caller's use after the structure.  */
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
                    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      /* No cross-thread ordering is needed; just provide the EXTRA
         payload if the caller asked for one.  */
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  /* Decide whether the whole iteration vector can be flattened into a
     single unsigned long, using ceil(log2(counts[i])) (at least 1) bits
     per loop.  */
  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            /* Doesn't fit; fall back to one long per loop.  */
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  /* One array slot ("entry") per chunk of the outermost loop; the
     mapping must agree with GOMP_doacross_{post,wait}.  */
  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  /* Pad each entry to a 64-byte boundary to avoid false sharing between
     entries updated by different threads.  */
  elt_sz = (elt_sz + 63) & ~63UL;

  /* The + 63 slack lets the array start be aligned up to 64 below.  */
  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      /* Flattened form: record each loop's shift; the outermost loop
         (index 0) ends up in the most significant bits.  */
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      /* schedule(static) without a chunk: counts[0] iterations split
         into t pieces of q+1 iterations followed by pieces of q;
         boundary is the first iteration of the size-q region.  */
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
+
+/* DOACROSS POST operation. */
+
/* Post (publish) completion of the iteration whose ordered loop
   iteration variables are given by COUNTS, releasing threads blocked in
   GOMP_doacross_wait on it.  */
void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): a full
     barrier is all that's required.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the entry this thread/chunk publishes into; must mirror the
     entry computation in GOMP_doacross_wait.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      /* Whole iteration vector packed into one unsigned long; a single
         atomic store publishes it.  +1 so that a stored value of 0
         means "nothing posted yet".  */
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      /* Skip the store if the value is already there (possible for
         repeated posts), but still order prior writes.  */
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  /* Non-flattened: one unsigned long per loop; store innermost-first so
     a waiter scanning outermost-first never sees an inconsistently
     advanced prefix.  */
  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}
+
+/* DOACROSS WAIT operation. */
+
/* Wait until the iteration identified by FIRST and the following
   ncounts-1 long varargs (the awaited iteration's ordered loop
   variables) has been posted via GOMP_doacross_post.  */
void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): just a
     barrier.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Map FIRST to the entry of the thread/chunk that executes the
     awaited iteration; must mirror GOMP_doacross_post.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          /* static without chunk: t entries of q+1 iterations precede
             entries of q iterations; boundary == t * (q + 1).  */
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      /* Posted values are vector+1, so strictly-greater means the
         awaited iteration already completed.  */
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  /* Non-flattened: spin until the posted vector is lexicographically
     greater than the awaited vector (each compared as value+1),
     re-reading the varargs on every retry.  */
  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              /* Posted vector is lexicographically larger: done.  */
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            /* Not yet posted far enough; retry.  */
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, size_t extra)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_work_share *ws = thr->ts.work_share;
+ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+ unsigned long ent, num_ents, elt_sz, shift_sz;
+ struct gomp_doacross_work_share *doacross;
+
+ if (team == NULL || team->nthreads == 1)
+ {
+ empty:
+ if (!extra)
+ ws->doacross = NULL;
+ else
+ {
+ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
+ doacross->extra = (void *) (doacross + 1);
+ ws->doacross = doacross;
+ }
+ return;
+ }
+
+ for (i = 0; i < ncounts; i++)
+ {
+ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
+ if (counts[i] == 0)
+ goto empty;
+
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int this_bits;
+ if (counts[i] == 1)
+ this_bits = 1;
+ else
+ this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+ - __builtin_clzll (counts[i] - 1);
+ if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+ {
+ bits[i] = this_bits;
+ num_bits += this_bits;
+ }
+ else
+ num_bits = MAX_COLLAPSED_BITS + 1;
+ }
+ }
+
+ if (ws->sched == GFS_STATIC)
+ num_ents = team->nthreads;
+ else if (ws->sched == GFS_GUIDED)
+ num_ents = counts[0];
+ else
+ num_ents = (counts[0] - 1) / chunk_size + 1;
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ elt_sz = sizeof (unsigned long);
+ shift_sz = ncounts * sizeof (unsigned int);
+ }
+ else
+ {
+ if (sizeof (gomp_ull) == sizeof (unsigned long))
+ elt_sz = sizeof (gomp_ull) * ncounts;
+ else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+ elt_sz = sizeof (unsigned long) * 2 * ncounts;
+ else
+ abort ();
+ shift_sz = 0;
+ }
+ elt_sz = (elt_sz + 63) & ~63UL;
+
+ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+ + shift_sz);
+ doacross->chunk_size_ull = chunk_size;
+ doacross->elt_sz = elt_sz;
+ doacross->ncounts = ncounts;
+ doacross->flattened = false;
+ doacross->boundary = 0;
+ doacross->array = (unsigned char *)
+ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+ & ~(uintptr_t) 63);
+ if (extra)
+ {
+ doacross->extra = doacross->array + num_ents * elt_sz;
+ memset (doacross->extra, '\0', extra);
+ }
+ else
+ doacross->extra = NULL;
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int shift_count = 0;
+ doacross->flattened = true;
+ for (i = ncounts; i > 0; i--)
+ {
+ doacross->shift_counts[i - 1] = shift_count;
+ shift_count += bits[i - 1];
+ }
+ for (ent = 0; ent < num_ents; ent++)
+ *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+ }
+ else
+ for (ent = 0; ent < num_ents; ent++)
+ memset (doacross->array + ent * elt_sz, '\0',
+ sizeof (unsigned long) * ncounts);
+ if (ws->sched == GFS_STATIC && chunk_size == 0)
+ {
+ gomp_ull q = counts[0] / num_ents;
+ gomp_ull t = counts[0] % num_ents;
+ doacross->boundary_ull = t * (q + 1);
+ doacross->q_ull = q;
+ doacross->t = t;
+ }
+ ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation. */
+
/* Post (publish) completion of the iteration whose ordered loop
   iteration variables are given by COUNTS, unsigned long long variant,
   releasing threads blocked in GOMP_doacross_ull_wait on it.  */
void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): a full
     barrier is all that's required.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the entry this thread/chunk publishes into; must mirror the
     entry computation in GOMP_doacross_ull_wait.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      /* Whole vector packed into one unsigned long; +1 so a stored 0
         means "nothing posted yet".  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* gomp_ull fits in a word: one atomic slot per loop, stored
         innermost-first (see GOMP_doacross_post).  */
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      /* gomp_ull is two words: store each count+1 as two halves, low
         half at odd index 2*i+1 first, then high half at even index
         2*i, matching the hi-then-lo read order in the waiter.  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
+
+/* DOACROSS WAIT operation. */
+
/* Wait until the iteration identified by FIRST and the following
   ncounts-1 gomp_ull varargs has been posted via
   GOMP_doacross_ull_post.  */
void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): just a
     barrier.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Map FIRST to the entry of the thread/chunk executing the awaited
     iteration; must mirror GOMP_doacross_ull_post.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          /* static without chunk: t entries of q+1 iterations precede
             entries of q iterations; boundary_ull == t * (q + 1).  */
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      /* Posted values are vector+1, so strictly-greater means the
         awaited iteration already completed.  */
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* One word per loop: spin until the posted vector is
         lexicographically greater than the awaited vector (each value
         compared as value+1), re-reading the varargs on every retry.  */
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  /* Posted vector is lexicographically larger: done.  */
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                /* Not yet posted far enough; retry.  */
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      /* Two words per loop (hi half at even, lo half at odd index, as
         stored by GOMP_doacross_ull_post): compare high halves first,
         then low halves, with the same lexicographic rule.  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  /* Posted vector is lexicographically larger: done.  */
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}