-/* Copyright (C) 2005-2013 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2024 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
- This file is part of the GNU OpenMP Library (libgomp).
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
/* This file handles the ORDERED construct. */
#include "libgomp.h"
+#include <stdarg.h>
+#include <string.h>
+#include "doacross.h"
/* This function is called when first allocating an iteration block. That
Either way we get correct results.
However, there is an implicit flush on entry to an ordered region,
so we do need to have a barrier here. If we were taking a lock
- this could be MEMMODEL_RELEASE since the acquire would be coverd
+ this could be MEMMODEL_RELEASE since the acquire would be covered
by the lock. */
__atomic_thread_fence (MEMMODEL_ACQ_REL);
However, the current implementation has a flaw in that it does not allow
the next thread into the ORDERED section immediately after the current
- thread exits the ORDERED section in its last iteration. The existance
+ thread exits the ORDERED section in its last iteration. The existence
of this function allows the implementation to change. */
void
GOMP_ordered_end (void)
{
  /* Intentionally empty: per the comment above, the current scheme of
     blocking in GOMP_ordered_start needs no work on exit.  The function
     is kept so the implementation can change without an ABI break.  */
}
+
+/* DOACROSS initialization. */
+
+#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
+
/* Initialize doacross bookkeeping for the current work share.  NCOUNTS is
   the number of ordered loops, COUNTS their iteration counts, CHUNK_SIZE
   the schedule's chunk size, and EXTRA a number of additional bytes to
   allocate for the caller's use after the structure.  */
void
gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
                    size_t extra)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
  unsigned long ent, num_ents, elt_sz, shift_sz;
  struct gomp_doacross_work_share *doacross;

  if (team == NULL || team->nthreads == 1)
    {
    empty:
      /* No cross-thread ordering is needed; just provide the EXTRA
         payload if the caller asked for one.  */
      if (!extra)
        ws->doacross = NULL;
      else
        {
          doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
          doacross->extra = (void *) (doacross + 1);
          ws->doacross = doacross;
        }
      return;
    }

  /* Decide whether the whole iteration vector can be flattened into a
     single unsigned long, using ceil(log2(counts[i])) (at least 1) bits
     per loop.  */
  for (i = 0; i < ncounts; i++)
    {
      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
      if (counts[i] == 0)
        goto empty;

      if (num_bits <= MAX_COLLAPSED_BITS)
        {
          unsigned int this_bits;
          if (counts[i] == 1)
            this_bits = 1;
          else
            this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
                        - __builtin_clzl (counts[i] - 1);
          if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
            {
              bits[i] = this_bits;
              num_bits += this_bits;
            }
          else
            /* Doesn't fit; fall back to one long per loop.  */
            num_bits = MAX_COLLAPSED_BITS + 1;
        }
    }

  /* One array slot ("entry") per chunk of the outermost loop; the
     mapping must agree with GOMP_doacross_{post,wait}.  */
  if (ws->sched == GFS_STATIC)
    num_ents = team->nthreads;
  else if (ws->sched == GFS_GUIDED)
    num_ents = counts[0];
  else
    num_ents = (counts[0] - 1) / chunk_size + 1;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      elt_sz = sizeof (unsigned long);
      shift_sz = ncounts * sizeof (unsigned int);
    }
  else
    {
      elt_sz = sizeof (unsigned long) * ncounts;
      shift_sz = 0;
    }
  /* Pad each entry to a 64-byte boundary to avoid false sharing between
     entries updated by different threads.  */
  elt_sz = (elt_sz + 63) & ~63UL;

  /* The + 63 slack lets the array start be aligned up to 64 below.  */
  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
                          + shift_sz + extra);
  doacross->chunk_size = chunk_size;
  doacross->elt_sz = elt_sz;
  doacross->ncounts = ncounts;
  doacross->flattened = false;
  doacross->array = (unsigned char *)
                    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
                     & ~(uintptr_t) 63);
  if (extra)
    {
      doacross->extra = doacross->array + num_ents * elt_sz;
      memset (doacross->extra, '\0', extra);
    }
  else
    doacross->extra = NULL;
  if (num_bits <= MAX_COLLAPSED_BITS)
    {
      /* Flattened form: record each loop's shift; the outermost loop
         (index 0) ends up in the most significant bits.  */
      unsigned int shift_count = 0;
      doacross->flattened = true;
      for (i = ncounts; i > 0; i--)
        {
          doacross->shift_counts[i - 1] = shift_count;
          shift_count += bits[i - 1];
        }
      for (ent = 0; ent < num_ents; ent++)
        *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
    }
  else
    for (ent = 0; ent < num_ents; ent++)
      memset (doacross->array + ent * elt_sz, '\0',
              sizeof (unsigned long) * ncounts);
  if (ws->sched == GFS_STATIC && chunk_size == 0)
    {
      /* schedule(static) without a chunk: counts[0] iterations split
         into t pieces of q+1 iterations followed by pieces of q;
         boundary is the first iteration of the size-q region.  */
      unsigned long q = counts[0] / num_ents;
      unsigned long t = counts[0] % num_ents;
      doacross->boundary = t * (q + 1);
      doacross->q = q;
      doacross->t = t;
    }
  ws->doacross = doacross;
}
+
+/* DOACROSS POST operation. */
+
/* Post (publish) completion of the iteration whose ordered loop
   iteration variables are given by COUNTS, releasing threads blocked in
   GOMP_doacross_wait on it.  */
void
GOMP_doacross_post (long *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): a full
     barrier is all that's required.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the entry this thread/chunk publishes into; must mirror the
     entry computation in GOMP_doacross_wait.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      /* Whole iteration vector packed into one unsigned long; a single
         atomic store publishes it.  +1 so that a stored value of 0
         means "nothing posted yet".  */
      unsigned long flattened
        = (unsigned long) counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) counts[i]
                     << doacross->shift_counts[i];
      flattened++;
      /* Skip the store if the value is already there (possible for
         repeated posts), but still order prior writes.  */
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  /* Non-flattened: one unsigned long per loop; store innermost-first so
     a waiter scanning outermost-first never sees an inconsistently
     advanced prefix.  */
  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  for (i = doacross->ncounts; i-- > 0; )
    {
      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
        __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
    }
}
+
+/* DOACROSS WAIT operation. */
+
/* Wait until the iteration identified by FIRST and the following
   ncounts-1 long varargs (the awaited iteration's ordered loop
   variables) has been posted via GOMP_doacross_post.  */
void
GOMP_doacross_wait (long first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): just a
     barrier.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Map FIRST to the entry of the thread/chunk that executes the
     awaited iteration; must mirror GOMP_doacross_post.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size == 0)
        {
          /* static without chunk: t entries of q+1 iterations precede
             entries of q iterations; boundary == t * (q + 1).  */
          if (first < doacross->boundary)
            ent = first / (doacross->q + 1);
          else
            ent = (first - doacross->boundary) / doacross->q
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size;
  unsigned long *array = (unsigned long *) (doacross->array
                                            + ent * doacross->elt_sz);

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long flattened
        = (unsigned long) first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= (unsigned long) va_arg (ap, long)
                     << doacross->shift_counts[i];
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      /* Posted values are vector+1, so strictly-greater means the
         awaited iteration already completed.  */
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  /* Non-flattened: spin until the posted vector is lexicographically
     greater than the awaited vector (each compared as value+1),
     re-reading the varargs on every retry.  */
  do
    {
      va_start (ap, first);
      for (i = 0; i < doacross->ncounts; i++)
        {
          unsigned long thisv
            = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
          unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
          if (thisv < cur)
            {
              /* Posted vector is lexicographically larger: done.  */
              i = doacross->ncounts;
              break;
            }
          if (thisv > cur)
            /* Not yet posted far enough; retry.  */
            break;
        }
      va_end (ap);
      if (i == doacross->ncounts)
        break;
      cpu_relax ();
    }
  while (1);
  __sync_synchronize ();
}
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
+ gomp_ull chunk_size, size_t extra)
+{
+ struct gomp_thread *thr = gomp_thread ();
+ struct gomp_team *team = thr->ts.team;
+ struct gomp_work_share *ws = thr->ts.work_share;
+ unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+ unsigned long ent, num_ents, elt_sz, shift_sz;
+ struct gomp_doacross_work_share *doacross;
+
+ if (team == NULL || team->nthreads == 1)
+ {
+ empty:
+ if (!extra)
+ ws->doacross = NULL;
+ else
+ {
+ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
+ doacross->extra = (void *) (doacross + 1);
+ ws->doacross = doacross;
+ }
+ return;
+ }
+
+ for (i = 0; i < ncounts; i++)
+ {
+ /* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
+ if (counts[i] == 0)
+ goto empty;
+
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int this_bits;
+ if (counts[i] == 1)
+ this_bits = 1;
+ else
+ this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+ - __builtin_clzll (counts[i] - 1);
+ if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+ {
+ bits[i] = this_bits;
+ num_bits += this_bits;
+ }
+ else
+ num_bits = MAX_COLLAPSED_BITS + 1;
+ }
+ }
+
+ if (ws->sched == GFS_STATIC)
+ num_ents = team->nthreads;
+ else if (ws->sched == GFS_GUIDED)
+ num_ents = counts[0];
+ else
+ num_ents = (counts[0] - 1) / chunk_size + 1;
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ elt_sz = sizeof (unsigned long);
+ shift_sz = ncounts * sizeof (unsigned int);
+ }
+ else
+ {
+ if (sizeof (gomp_ull) == sizeof (unsigned long))
+ elt_sz = sizeof (gomp_ull) * ncounts;
+ else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+ elt_sz = sizeof (unsigned long) * 2 * ncounts;
+ else
+ abort ();
+ shift_sz = 0;
+ }
+ elt_sz = (elt_sz + 63) & ~63UL;
+
+ doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+ + shift_sz);
+ doacross->chunk_size_ull = chunk_size;
+ doacross->elt_sz = elt_sz;
+ doacross->ncounts = ncounts;
+ doacross->flattened = false;
+ doacross->boundary = 0;
+ doacross->array = (unsigned char *)
+ ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+ & ~(uintptr_t) 63);
+ if (extra)
+ {
+ doacross->extra = doacross->array + num_ents * elt_sz;
+ memset (doacross->extra, '\0', extra);
+ }
+ else
+ doacross->extra = NULL;
+ if (num_bits <= MAX_COLLAPSED_BITS)
+ {
+ unsigned int shift_count = 0;
+ doacross->flattened = true;
+ for (i = ncounts; i > 0; i--)
+ {
+ doacross->shift_counts[i - 1] = shift_count;
+ shift_count += bits[i - 1];
+ }
+ for (ent = 0; ent < num_ents; ent++)
+ *(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+ }
+ else
+ for (ent = 0; ent < num_ents; ent++)
+ memset (doacross->array + ent * elt_sz, '\0',
+ sizeof (unsigned long) * ncounts);
+ if (ws->sched == GFS_STATIC && chunk_size == 0)
+ {
+ gomp_ull q = counts[0] / num_ents;
+ gomp_ull t = counts[0] % num_ents;
+ doacross->boundary_ull = t * (q + 1);
+ doacross->q_ull = q;
+ doacross->t = t;
+ }
+ ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation. */
+
/* Post (publish) completion of the iteration whose ordered loop
   iteration variables are given by COUNTS, unsigned long long variant,
   releasing threads blocked in GOMP_doacross_ull_wait on it.  */
void
GOMP_doacross_ull_post (gomp_ull *counts)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): a full
     barrier is all that's required.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Select the entry this thread/chunk publishes into; must mirror the
     entry computation in GOMP_doacross_ull_wait.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    ent = thr->ts.team_id;
  else if (ws->sched == GFS_GUIDED)
    ent = counts[0];
  else
    ent = counts[0] / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      /* Whole vector packed into one unsigned long; +1 so a stored 0
         means "nothing posted yet".  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened
        = counts[0] << doacross->shift_counts[0];

      for (i = 1; i < doacross->ncounts; i++)
        flattened |= counts[i] << doacross->shift_counts[i];
      flattened++;
      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
        __atomic_thread_fence (MEMMODEL_RELEASE);
      else
        __atomic_store_n (array, flattened, MEMMODEL_RELEASE);
      return;
    }

  __atomic_thread_fence (MEMMODEL_ACQUIRE);
  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* gomp_ull fits in a word: one atomic slot per loop, stored
         innermost-first (see GOMP_doacross_post).  */
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
        }
    }
  else
    {
      /* gomp_ull is two words: store each count+1 as two halves, low
         half at odd index 2*i+1 first, then high half at even index
         2*i, matching the hi-then-lo read order in the waiter.  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);

      for (i = doacross->ncounts; i-- > 0; )
        {
          gomp_ull cull = counts[i] + 1UL;
          unsigned long c = (unsigned long) cull;
          if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
          c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
          if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
            __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
        }
    }
}
+
+/* DOACROSS WAIT operation. */
+
/* Wait until the iteration identified by FIRST and the following
   ncounts-1 gomp_ull varargs has been posted via
   GOMP_doacross_ull_post.  */
void
GOMP_doacross_ull_wait (gomp_ull first, ...)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_doacross_work_share *doacross = ws->doacross;
  va_list ap;
  unsigned long ent;
  unsigned int i;

  /* Not initialized (single thread, or some count was 0): just a
     barrier.  */
  if (__builtin_expect (doacross == NULL, 0)
      || __builtin_expect (doacross->array == NULL, 0))
    {
      __sync_synchronize ();
      return;
    }

  /* Map FIRST to the entry of the thread/chunk executing the awaited
     iteration; must mirror GOMP_doacross_ull_post.  */
  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
    {
      if (ws->chunk_size_ull == 0)
        {
          /* static without chunk: t entries of q+1 iterations precede
             entries of q iterations; boundary_ull == t * (q + 1).  */
          if (first < doacross->boundary_ull)
            ent = first / (doacross->q_ull + 1);
          else
            ent = (first - doacross->boundary_ull) / doacross->q_ull
                  + doacross->t;
        }
      else
        ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
    }
  else if (ws->sched == GFS_GUIDED)
    ent = first;
  else
    ent = first / doacross->chunk_size_ull;

  if (__builtin_expect (doacross->flattened, 1))
    {
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      gomp_ull flattened = first << doacross->shift_counts[0];
      unsigned long cur;

      va_start (ap, first);
      for (i = 1; i < doacross->ncounts; i++)
        flattened |= va_arg (ap, gomp_ull)
                     << doacross->shift_counts[i];
      /* Posted values are vector+1, so strictly-greater means the
         awaited iteration already completed.  */
      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
      if (flattened < cur)
        {
          __atomic_thread_fence (MEMMODEL_RELEASE);
          va_end (ap);
          return;
        }
      doacross_spin (array, flattened, cur);
      __atomic_thread_fence (MEMMODEL_RELEASE);
      va_end (ap);
      return;
    }

  if (sizeof (gomp_ull) == sizeof (unsigned long))
    {
      /* One word per loop: spin until the posted vector is
         lexicographically greater than the awaited vector (each value
         compared as value+1), re-reading the varargs on every retry.  */
      gomp_ull *array = (gomp_ull *) (doacross->array
                                      + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
              if (thisv < cur)
                {
                  /* Posted vector is lexicographically larger: done.  */
                  i = doacross->ncounts;
                  break;
                }
              if (thisv > cur)
                /* Not yet posted far enough; retry.  */
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  else
    {
      /* Two words per loop (hi half at even, lo half at odd index, as
         stored by GOMP_doacross_ull_post): compare high halves first,
         then low halves, with the same lexicographic rule.  */
      unsigned long *array = (unsigned long *) (doacross->array
                                                + ent * doacross->elt_sz);
      do
        {
          va_start (ap, first);
          for (i = 0; i < doacross->ncounts; i++)
            {
              gomp_ull thisv
                = (i ? va_arg (ap, gomp_ull) : first) + 1;
              unsigned long t
                = thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
              unsigned long cur
                = __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  /* Posted vector is lexicographically larger: done.  */
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
              t = thisv;
              cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
              if (t < cur)
                {
                  i = doacross->ncounts;
                  break;
                }
              if (t > cur)
                break;
            }
          va_end (ap);
          if (i == doacross->ncounts)
            break;
          cpu_relax ();
        }
      while (1);
    }
  __sync_synchronize ();
}