[thirdparty/glibc.git] / nptl / sem_waitcommon.c

/* sem_waitcommon -- wait on a semaphore, shared code.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Paul Mackerras <paulus@au.ibm.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <kernel-features.h>
#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <internaltypes.h>
#include <semaphore.h>
#include <sys/time.h>

#include <pthreadP.h>
#include <shlib-compat.h>
#include <atomic.h>


/* The semaphore provides two main operations: sem_post adds a token to the
   semaphore; sem_wait grabs a token from the semaphore, potentially waiting
   until there is a token available.  A sem_wait needs to synchronize with
   the sem_post that provided the token, so that whatever lead to the sem_post
   happens before the code after sem_wait.

   Conceptually, available tokens can simply be counted; let's call that the
   value of the semaphore.  However, we also want to know whether there might
   be a sem_wait that is blocked on the value because it was zero (using a
   futex with the value being the futex variable); if there is no blocked
   sem_wait, sem_post does not need to execute a futex_wake call.  Therefore,
   we also need to count the number of potentially blocked sem_wait calls
   (which we call nwaiters).

   What makes this tricky is that POSIX requires that a semaphore can be
   destroyed as soon as the last remaining sem_wait has returned, and no
   other sem_wait or sem_post calls are executing concurrently.  However, the
   sem_post call whose token was consumed by the last sem_wait is considered
   to have finished once it provided the token to the sem_wait.
   Thus, sem_post must not access the semaphore struct anymore after it has
   made a token available; IOW, it needs to be able to atomically provide
   a token and check whether any blocked sem_wait calls might exist.

   This is straightforward to do if the architecture provides 64b atomics
   because we can just put both the value and nwaiters into one variable that
   we access atomically: This is the data field, the value is in the
   least-significant 32 bits, and nwaiters in the other bits.  When sem_post
   makes a value available, it can atomically check nwaiters.

   If we have only 32b atomics available, we cannot put both nwaiters and
   value into one 32b value because then we might have too few bits for both
   of those counters.  Therefore, we need to use two distinct fields.

   To allow sem_post to atomically make a token available and check for
   blocked sem_wait calls, we use one bit in value to indicate whether
   nwaiters is nonzero.  That allows sem_post to use basically the same
   algorithm as with 64b atomics, but requires sem_wait to update the bit; it
   can't do this atomically with another access to nwaiters, but it can compute
   a conservative value for the bit because it's benign if the bit is set
   even if nwaiters is zero (all we get is an unnecessary futex wake call by
   sem_post).
   Specifically, sem_wait will unset the bit speculatively if it believes that
   there is no other concurrently executing sem_wait.  If it misspeculated,
   it will have to clean up by waking any other sem_wait call (i.e., what
   sem_post would do otherwise).  This does not conflict with the destruction
   requirement because the semaphore must not be destructed while any sem_wait
   is still executing.  */

#if !__HAVE_64B_ATOMICS
static void
__sem_wait_32_finish (struct new_sem *sem);
#endif

static void
__sem_wait_cleanup (void *arg)
{
  struct new_sem *sem = (struct new_sem *) arg;

#if __HAVE_64B_ATOMICS
  /* Stop being registered as a waiter.  See below for MO.  */
  atomic_fetch_add_relaxed (&sem->data, -((uint64_t) 1 << SEM_NWAITERS_SHIFT));
#else
  __sem_wait_32_finish (sem);
#endif
}

/* Wait until at least one token is available, possibly with a timeout.
   This is in a separate function in order to make sure gcc
   puts the call site into an exception region, and thus the
   cleanups get properly run.  TODO still necessary?  Other futex_wait
   users don't seem to need it.  */
static int
__attribute__ ((noinline))
do_futex_wait (struct new_sem *sem, clockid_t clockid,
	       const struct __timespec64 *abstime)
{
  int err;

#if __HAVE_64B_ATOMICS
  err = __futex_abstimed_wait_cancelable64 (
      (unsigned int *) &sem->data + SEM_VALUE_OFFSET, 0,
      clockid, abstime,
      sem->private);
#else
  err = __futex_abstimed_wait_cancelable64 (&sem->value, SEM_NWAITERS_MASK,
					    clockid, abstime, sem->private);
#endif

  return err;
}

/* Fast path: Try to grab a token without blocking.  */
static int
__new_sem_wait_fast (struct new_sem *sem, int definitive_result)
{
  /* We need acquire MO if we actually grab a token, so that this
     synchronizes with all token providers (i.e., the RMW operation we read
     from or all those before it in modification order; also see sem_post).
     We do not need to guarantee any ordering if we observed that there is
     no token (POSIX leaves it unspecified whether functions that fail
     synchronize memory); thus, relaxed MO is sufficient for the initial load
     and the failure path of the CAS.  If the weak CAS fails and we need a
     definitive result, retry.  */
#if __HAVE_64B_ATOMICS
  uint64_t d = atomic_load_relaxed (&sem->data);
  do
    {
      if ((d & SEM_VALUE_MASK) == 0)
	break;
      if (atomic_compare_exchange_weak_acquire (&sem->data, &d, d - 1))
	return 0;
    }
  while (definitive_result);
  return -1;
#else
  unsigned int v = atomic_load_relaxed (&sem->value);
  do
    {
      if ((v >> SEM_VALUE_SHIFT) == 0)
	break;
      if (atomic_compare_exchange_weak_acquire (&sem->value,
	  &v, v - (1 << SEM_VALUE_SHIFT)))
	return 0;
    }
  while (definitive_result);
  return -1;
#endif
}

/* Slow path that blocks.  */
static int
__attribute__ ((noinline))
__new_sem_wait_slow64 (struct new_sem *sem, clockid_t clockid,
		       const struct __timespec64 *abstime)
{
  int err = 0;

#if __HAVE_64B_ATOMICS
  /* Add a waiter.  Relaxed MO is sufficient because we can rely on the
     ordering provided by the RMW operations we use.  */
  uint64_t d = atomic_fetch_add_relaxed (&sem->data,
      (uint64_t) 1 << SEM_NWAITERS_SHIFT);

  pthread_cleanup_push (__sem_wait_cleanup, sem);

  /* Wait for a token to be available.  Retry until we can grab one.  */
  for (;;)
    {
      /* If there is no token available, sleep until there is.  */
      if ((d & SEM_VALUE_MASK) == 0)
	{
	  err = do_futex_wait (sem, clockid, abstime);
	  /* A futex return value of 0 or EAGAIN is due to a real or spurious
	     wake-up, or due to a change in the number of tokens.  We retry in
	     these cases.
	     If we timed out, forward this to the caller.
	     EINTR is returned if we are interrupted by a signal; we
	     forward this to the caller.  (See futex_wait and related
	     documentation.  Before Linux 2.6.22, EINTR was also returned on
	     spurious wake-ups; we only support more recent Linux versions,
	     so do not need to consider this here.)  */
	  if (err == ETIMEDOUT || err == EINTR || err == EOVERFLOW)
	    {
	      __set_errno (err);
	      err = -1;
	      /* Stop being registered as a waiter.  */
	      atomic_fetch_add_relaxed (&sem->data,
		  -((uint64_t) 1 << SEM_NWAITERS_SHIFT));
	      break;
	    }
	  /* Relaxed MO is sufficient; see below.  */
	  d = atomic_load_relaxed (&sem->data);
	}
      else
	{
	  /* Try to grab both a token and stop being a waiter.  We need
	     acquire MO so this synchronizes with all token providers (i.e.,
	     the RMW operation we read from or all those before it in
	     modification order; also see sem_post).  On the failure path,
	     relaxed MO is sufficient because we only eventually need the
	     up-to-date value; the futex_wait or the CAS perform the real
	     work.  */
	  if (atomic_compare_exchange_weak_acquire (&sem->data,
	      &d, d - 1 - ((uint64_t) 1 << SEM_NWAITERS_SHIFT)))
	    {
	      err = 0;
	      break;
	    }
	}
    }

  pthread_cleanup_pop (0);
#else
  /* The main difference to the 64b-atomics implementation is that we need to
     access value and nwaiters in separate steps, and that the nwaiters bit
     in the value can temporarily not be set even if nwaiters is nonzero.
     We work around incorrectly unsetting the nwaiters bit by letting sem_wait
     set the bit again and waking the number of waiters that could grab a
     token.  There are two additional properties we need to ensure:
     (1) We make sure that whenever unsetting the bit, we see the increment of
     nwaiters by the other thread that set the bit.  IOW, we will notice if
     we make a mistake.
     (2) When setting the nwaiters bit, we make sure that we see the unsetting
     of the bit by another waiter that happened before us.  This avoids having
     to blindly set the bit whenever we need to block on it.  We set/unset
     the bit while having incremented nwaiters (i.e., are a registered
     waiter), and the problematic case only happens when one waiter indeed
     followed another (i.e., nwaiters was never larger than 1); thus, this
     works similarly as with a critical section using nwaiters (see the MOs
     and related comments below).

     An alternative approach would be to unset the bit after decrementing
     nwaiters; however, that would result in needing Dekker-like
     synchronization and thus full memory barriers.  We also would not be able
     to prevent misspeculation, so this alternative scheme does not seem
     beneficial.  */
  unsigned int v;

  /* Add a waiter.  We need acquire MO so this synchronizes with the release
     MO we use when decrementing nwaiters below; it ensures that if another
     waiter unset the bit before us, we see that and set it again.  Also see
     property (2) above.  */
  atomic_fetch_add_acquire (&sem->nwaiters, 1);

  pthread_cleanup_push (__sem_wait_cleanup, sem);

  /* Wait for a token to be available.  Retry until we can grab one.  */
  /* We do not need any ordering wrt. to this load's reads-from, so relaxed
     MO is sufficient.  The acquire MO above ensures that in the problematic
     case, we do see the unsetting of the bit by another waiter.  */
  v = atomic_load_relaxed (&sem->value);
  do
    {
      do
	{
	  /* We are about to block, so make sure that the nwaiters bit is
	     set.  We need release MO on the CAS to ensure that when another
	     waiter unsets the nwaiters bit, it will also observe that we
	     incremented nwaiters in the meantime (also see the unsetting of
	     the bit below).  Relaxed MO on CAS failure is sufficient (see
	     above).  */
	  do
	    {
	      if ((v & SEM_NWAITERS_MASK) != 0)
		break;
	    }
	  while (!atomic_compare_exchange_weak_release (&sem->value,
	      &v, v | SEM_NWAITERS_MASK));
	  /* If there is no token, wait.  */
	  if ((v >> SEM_VALUE_SHIFT) == 0)
	    {
	      /* See __HAVE_64B_ATOMICS variant.  */
	      err = do_futex_wait (sem, clockid, abstime);
	      if (err == ETIMEDOUT || err == EINTR)
		{
		  __set_errno (err);
		  err = -1;
		  goto error;
		}
	      err = 0;
	      /* We blocked, so there might be a token now.  Relaxed MO is
		 sufficient (see above).  */
	      v = atomic_load_relaxed (&sem->value);
	    }
	}
      /* If there is no token, we must not try to grab one.  */
      while ((v >> SEM_VALUE_SHIFT) == 0);
    }
  /* Try to grab a token.  We need acquire MO so this synchronizes with
     all token providers (i.e., the RMW operation we read from or all those
     before it in modification order; also see sem_post).  */
  while (!atomic_compare_exchange_weak_acquire (&sem->value,
      &v, v - (1 << SEM_VALUE_SHIFT)));

error:
  pthread_cleanup_pop (0);

  __sem_wait_32_finish (sem);
#endif

  return err;
}

/* Stop being a registered waiter (non-64b-atomics code only).  */
#if !__HAVE_64B_ATOMICS
static void
__sem_wait_32_finish (struct new_sem *sem)
{
  /* The nwaiters bit is still set, try to unset it now if this seems
     necessary.  We do this before decrementing nwaiters so that the unsetting
     is visible to other waiters entering after us.  Relaxed MO is sufficient
     because we are just speculating here; a stronger MO would not prevent
     misspeculation.  */
  unsigned int wguess = atomic_load_relaxed (&sem->nwaiters);
  if (wguess == 1)
    /* We might be the last waiter, so unset.  This needs acquire MO so that
       it syncronizes with the release MO when setting the bit above; if we
       overwrite someone else that set the bit, we'll read in the following
       decrement of nwaiters at least from that release sequence, so we'll
       see if the other waiter is still active or if another writer entered
       in the meantime (i.e., using the check below).  */
    atomic_fetch_and_acquire (&sem->value, ~SEM_NWAITERS_MASK);

  /* Now stop being a waiter, and see whether our guess was correct.
     This needs release MO so that it synchronizes with the acquire MO when
     a waiter increments nwaiters; this makes sure that newer writers see that
     we reset the waiters_present bit.  */
  unsigned int wfinal = atomic_fetch_add_release (&sem->nwaiters, -1);
  if (wfinal > 1 && wguess == 1)
    {
      /* We guessed wrong, and so need to clean up after the mistake and
         unblock any waiters that could have not been woken.  There is no
         additional ordering that we need to set up, so relaxed MO is
         sufficient.  */
      unsigned int v = atomic_fetch_or_relaxed (&sem->value,
						SEM_NWAITERS_MASK);
      /* If there are available tokens, then wake as many waiters.  If there
         aren't any, then there is no need to wake anyone because there is
         none to grab for another waiter.  If tokens become available
         subsequently, then the respective sem_post calls will do the wake-up
         due to us having set the nwaiters bit again.  */
      v >>= SEM_VALUE_SHIFT;
      if (v > 0)
	futex_wake (&sem->value, v, sem->private);
    }
}
#endif
Commit	Line	Data
042e1521	1	/* sem_waitcommon -- wait on a semaphore, shared code.
2b778ceb	2	Copyright (C) 2003-2021 Free Software Foundation, Inc.
042e1521 CD	3	This file is part of the GNU C Library.
	4	Contributed by Paul Mackerras <paulus@au.ibm.com>, 2003.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
	17	License along with the GNU C Library; if not, see
5a82c748	18	<https://www.gnu.org/licenses/>. */
042e1521	19
c2f5813a	20	#include <kernel-features.h>
042e1521 CD	21	#include <errno.h>
042e1521 CD	22	#include <sysdep.h>
a2f0363f	23	#include <futex-internal.h>
042e1521 CD	24	#include <internaltypes.h>
	25	#include <semaphore.h>
	26	#include <sys/time.h>
	27
	28	#include <pthreadP.h>
	29	#include <shlib-compat.h>
	30	#include <atomic.h>
	31
042e1521 CD	32
	33	/* The semaphore provides two main operations: sem_post adds a token to the
	34	semaphore; sem_wait grabs a token from the semaphore, potentially waiting
	35	until there is a token available. A sem_wait needs to synchronize with
	36	the sem_post that provided the token, so that whatever lead to the sem_post
	37	happens before the code after sem_wait.
	38
	39	Conceptually, available tokens can simply be counted; let's call that the
	40	value of the semaphore. However, we also want to know whether there might
	41	be a sem_wait that is blocked on the value because it was zero (using a
	42	futex with the value being the futex variable); if there is no blocked
	43	sem_wait, sem_post does not need to execute a futex_wake call. Therefore,
	44	we also need to count the number of potentially blocked sem_wait calls
	45	(which we call nwaiters).
	46
	47	What makes this tricky is that POSIX requires that a semaphore can be
	48	destroyed as soon as the last remaining sem_wait has returned, and no
	49	other sem_wait or sem_post calls are executing concurrently. However, the
	50	sem_post call whose token was consumed by the last sem_wait is considered
	51	to have finished once it provided the token to the sem_wait.
	52	Thus, sem_post must not access the semaphore struct anymore after it has
	53	made a token available; IOW, it needs to be able to atomically provide
	54	a token and check whether any blocked sem_wait calls might exist.
	55
	56	This is straightforward to do if the architecture provides 64b atomics
	57	because we can just put both the value and nwaiters into one variable that
	58	we access atomically: This is the data field, the value is in the
	59	least-significant 32 bits, and nwaiters in the other bits. When sem_post
	60	makes a value available, it can atomically check nwaiters.
	61
	62	If we have only 32b atomics available, we cannot put both nwaiters and
	63	value into one 32b value because then we might have too few bits for both
	64	of those counters. Therefore, we need to use two distinct fields.
	65
	66	To allow sem_post to atomically make a token available and check for
	67	blocked sem_wait calls, we use one bit in value to indicate whether
	68	nwaiters is nonzero. That allows sem_post to use basically the same
	69	algorithm as with 64b atomics, but requires sem_wait to update the bit; it
	70	can't do this atomically with another access to nwaiters, but it can compute
	71	a conservative value for the bit because it's benign if the bit is set
	72	even if nwaiters is zero (all we get is an unnecessary futex wake call by
	73	sem_post).
	74	Specifically, sem_wait will unset the bit speculatively if it believes that
	75	there is no other concurrently executing sem_wait. If it misspeculated,
	76	it will have to clean up by waking any other sem_wait call (i.e., what
	77	sem_post would do otherwise). This does not conflict with the destruction
	78	requirement because the semaphore must not be destructed while any sem_wait
	79	is still executing. */
	80
042e1521 CD	81	#if !__HAVE_64B_ATOMICS
	82	static void
	83	__sem_wait_32_finish (struct new_sem *sem);
	84	#endif
	85
	86	static void
	87	__sem_wait_cleanup (void *arg)
	88	{
	89	struct new_sem sem = (struct new_sem ) arg;
	90
	91	#if __HAVE_64B_ATOMICS
	92	/* Stop being registered as a waiter. See below for MO. */
22971c35	93	atomic_fetch_add_relaxed (&sem->data, -((uint64_t) 1 << SEM_NWAITERS_SHIFT));
042e1521 CD	94	#else
	95	__sem_wait_32_finish (sem);
	96	#endif
	97	}
	98
	99	/* Wait until at least one token is available, possibly with a timeout.
	100	This is in a separate function in order to make sure gcc
	101	puts the call site into an exception region, and thus the
	102	cleanups get properly run. TODO still necessary? Other futex_wait
	103	users don't seem to need it. */
	104	static int
	105	__attribute__ ((noinline))
6615f779	106	do_futex_wait (struct new_sem *sem, clockid_t clockid,
b8d3e8fb	107	const struct __timespec64 *abstime)
042e1521 CD	108	{
	109	int err;
	110
	111	#if __HAVE_64B_ATOMICS
b8d3e8fb	112	err = __futex_abstimed_wait_cancelable64 (
99d01ffc	113	(unsigned int *) &sem->data + SEM_VALUE_OFFSET, 0,
6615f779	114	clockid, abstime,
a2f0363f	115	sem->private);
042e1521	116	#else
b8d3e8fb LM	117	err = __futex_abstimed_wait_cancelable64 (&sem->value, SEM_NWAITERS_MASK,
b8d3e8fb LM	118	clockid, abstime, sem->private);
042e1521 CD	119	#endif
	120
	121	return err;
	122	}
	123
	124	/* Fast path: Try to grab a token without blocking. */
	125	static int
	126	__new_sem_wait_fast (struct new_sem *sem, int definitive_result)
	127	{
	128	/* We need acquire MO if we actually grab a token, so that this
	129	synchronizes with all token providers (i.e., the RMW operation we read
	130	from or all those before it in modification order; also see sem_post).
	131	We do not need to guarantee any ordering if we observed that there is
	132	no token (POSIX leaves it unspecified whether functions that fail
	133	synchronize memory); thus, relaxed MO is sufficient for the initial load
	134	and the failure path of the CAS. If the weak CAS fails and we need a
	135	definitive result, retry. */
	136	#if __HAVE_64B_ATOMICS
e0da28a1	137	uint64_t d = atomic_load_relaxed (&sem->data);
042e1521 CD	138	do
	139	{
	140	if ((d & SEM_VALUE_MASK) == 0)
	141	break;
	142	if (atomic_compare_exchange_weak_acquire (&sem->data, &d, d - 1))
	143	return 0;
	144	}
	145	while (definitive_result);
	146	return -1;
	147	#else
	148	unsigned int v = atomic_load_relaxed (&sem->value);
	149	do
	150	{
	151	if ((v >> SEM_VALUE_SHIFT) == 0)
	152	break;
	153	if (atomic_compare_exchange_weak_acquire (&sem->value,
	154	&v, v - (1 << SEM_VALUE_SHIFT)))
	155	return 0;
	156	}
	157	while (definitive_result);
	158	return -1;
	159	#endif
	160	}
	161
	162	/* Slow path that blocks. */
	163	static int
	164	__attribute__ ((noinline))
b8d3e8fb LM	165	__new_sem_wait_slow64 (struct new_sem *sem, clockid_t clockid,
b8d3e8fb LM	166	const struct __timespec64 *abstime)
042e1521 CD	167	{
	168	int err = 0;
	169
	170	#if __HAVE_64B_ATOMICS
	171	/* Add a waiter. Relaxed MO is sufficient because we can rely on the
	172	ordering provided by the RMW operations we use. */
22971c35 L	173	uint64_t d = atomic_fetch_add_relaxed (&sem->data,
22971c35 L	174	(uint64_t) 1 << SEM_NWAITERS_SHIFT);
042e1521 CD	175
	176	pthread_cleanup_push (__sem_wait_cleanup, sem);
	177
	178	/* Wait for a token to be available. Retry until we can grab one. */
	179	for (;;)
	180	{
	181	/* If there is no token available, sleep until there is. */
	182	if ((d & SEM_VALUE_MASK) == 0)
	183	{
6615f779	184	err = do_futex_wait (sem, clockid, abstime);
042e1521 CD	185	/* A futex return value of 0 or EAGAIN is due to a real or spurious
	186	wake-up, or due to a change in the number of tokens. We retry in
	187	these cases.
	188	If we timed out, forward this to the caller.
1aa8d144 TR	189	EINTR is returned if we are interrupted by a signal; we
	190	forward this to the caller. (See futex_wait and related
	191	documentation. Before Linux 2.6.22, EINTR was also returned on
	192	spurious wake-ups; we only support more recent Linux versions,
	193	so do not need to consider this here.) */
aa69f19a	194	if (err == ETIMEDOUT \|\| err == EINTR \|\| err == EOVERFLOW)
042e1521 CD	195	{
	196	__set_errno (err);
	197	err = -1;
	198	/* Stop being registered as a waiter. */
	199	atomic_fetch_add_relaxed (&sem->data,
22971c35	200	-((uint64_t) 1 << SEM_NWAITERS_SHIFT));
042e1521 CD	201	break;
	202	}
	203	/* Relaxed MO is sufficient; see below. */
	204	d = atomic_load_relaxed (&sem->data);
	205	}
	206	else
	207	{
	208	/* Try to grab both a token and stop being a waiter. We need
	209	acquire MO so this synchronizes with all token providers (i.e.,
	210	the RMW operation we read from or all those before it in
	211	modification order; also see sem_post). On the failure path,
	212	relaxed MO is sufficient because we only eventually need the
	213	up-to-date value; the futex_wait or the CAS perform the real
	214	work. */
	215	if (atomic_compare_exchange_weak_acquire (&sem->data,
22971c35	216	&d, d - 1 - ((uint64_t) 1 << SEM_NWAITERS_SHIFT)))
042e1521 CD	217	{
	218	err = 0;
	219	break;
	220	}
	221	}
	222	}
	223
	224	pthread_cleanup_pop (0);
	225	#else
	226	/* The main difference to the 64b-atomics implementation is that we need to
	227	access value and nwaiters in separate steps, and that the nwaiters bit
	228	in the value can temporarily not be set even if nwaiters is nonzero.
	229	We work around incorrectly unsetting the nwaiters bit by letting sem_wait
	230	set the bit again and waking the number of waiters that could grab a
	231	token. There are two additional properties we need to ensure:
	232	(1) We make sure that whenever unsetting the bit, we see the increment of
	233	nwaiters by the other thread that set the bit. IOW, we will notice if
	234	we make a mistake.
	235	(2) When setting the nwaiters bit, we make sure that we see the unsetting
	236	of the bit by another waiter that happened before us. This avoids having
	237	to blindly set the bit whenever we need to block on it. We set/unset
	238	the bit while having incremented nwaiters (i.e., are a registered
	239	waiter), and the problematic case only happens when one waiter indeed
	240	followed another (i.e., nwaiters was never larger than 1); thus, this
	241	works similarly as with a critical section using nwaiters (see the MOs
	242	and related comments below).
	243
	244	An alternative approach would be to unset the bit after decrementing
	245	nwaiters; however, that would result in needing Dekker-like
	246	synchronization and thus full memory barriers. We also would not be able
	247	to prevent misspeculation, so this alternative scheme does not seem
	248	beneficial. */
	249	unsigned int v;
	250
	251	/* Add a waiter. We need acquire MO so this synchronizes with the release
	252	MO we use when decrementing nwaiters below; it ensures that if another
	253	waiter unset the bit before us, we see that and set it again. Also see
	254	property (2) above. */
	255	atomic_fetch_add_acquire (&sem->nwaiters, 1);
	256
	257	pthread_cleanup_push (__sem_wait_cleanup, sem);
	258
	259	/* Wait for a token to be available. Retry until we can grab one. */
	260	/* We do not need any ordering wrt. to this load's reads-from, so relaxed
	261	MO is sufficient. The acquire MO above ensures that in the problematic
	262	case, we do see the unsetting of the bit by another waiter. */
	263	v = atomic_load_relaxed (&sem->value);
	264	do
	265	{
	266	do
	267	{
	268	/* We are about to block, so make sure that the nwaiters bit is
	269	set. We need release MO on the CAS to ensure that when another
	270	waiter unsets the nwaiters bit, it will also observe that we
	271	incremented nwaiters in the meantime (also see the unsetting of
	272	the bit below). Relaxed MO on CAS failure is sufficient (see
	273	above). */
	274	do
	275	{
	276	if ((v & SEM_NWAITERS_MASK) != 0)
	277	break;
	278	}
	279	while (!atomic_compare_exchange_weak_release (&sem->value,
	280	&v, v \| SEM_NWAITERS_MASK));
281	/* If there is no token, wait. */
282	if ((v >> SEM_VALUE_SHIFT) == 0)
283	{
284	/* See __HAVE_64B_ATOMICS variant. */
6615f779	285	err = do_futex_wait (sem, clockid, abstime);
1aa8d144	286	if (err == ETIMEDOUT \|\| err == EINTR)
042e1521 CD	287	{
	288	__set_errno (err);
	289	err = -1;
	290	goto error;
	291	}
	292	err = 0;
	293	/* We blocked, so there might be a token now. Relaxed MO is
	294	sufficient (see above). */
	295	v = atomic_load_relaxed (&sem->value);
	296	}
	297	}
	298	/* If there is no token, we must not try to grab one. */
	299	while ((v >> SEM_VALUE_SHIFT) == 0);
	300	}
	301	/* Try to grab a token. We need acquire MO so this synchronizes with
	302	all token providers (i.e., the RMW operation we read from or all those
	303	before it in modification order; also see sem_post). */
	304	while (!atomic_compare_exchange_weak_acquire (&sem->value,
	305	&v, v - (1 << SEM_VALUE_SHIFT)));
	306
	307	error:
	308	pthread_cleanup_pop (0);
	309
	310	__sem_wait_32_finish (sem);
	311	#endif
	312
	313	return err;
	314	}
	315
	316	/* Stop being a registered waiter (non-64b-atomics code only). */
	317	#if !__HAVE_64B_ATOMICS
	318	static void
	319	__sem_wait_32_finish (struct new_sem *sem)
	320	{
	321	/* The nwaiters bit is still set, try to unset it now if this seems
	322	necessary. We do this before decrementing nwaiters so that the unsetting
	323	is visible to other waiters entering after us. Relaxed MO is sufficient
	324	because we are just speculating here; a stronger MO would not prevent
	325	misspeculation. */
	326	unsigned int wguess = atomic_load_relaxed (&sem->nwaiters);
	327	if (wguess == 1)
	328	/* We might be the last waiter, so unset. This needs acquire MO so that
	329	it syncronizes with the release MO when setting the bit above; if we
	330	overwrite someone else that set the bit, we'll read in the following
	331	decrement of nwaiters at least from that release sequence, so we'll
	332	see if the other waiter is still active or if another writer entered
	333	in the meantime (i.e., using the check below). */
	334	atomic_fetch_and_acquire (&sem->value, ~SEM_NWAITERS_MASK);
	335
	336	/* Now stop being a waiter, and see whether our guess was correct.
	337	This needs release MO so that it synchronizes with the acquire MO when
	338	a waiter increments nwaiters; this makes sure that newer writers see that
	339	we reset the waiters_present bit. */
	340	unsigned int wfinal = atomic_fetch_add_release (&sem->nwaiters, -1);
	341	if (wfinal > 1 && wguess == 1)
	342	{
	343	/* We guessed wrong, and so need to clean up after the mistake and
	344	unblock any waiters that could have not been woken. There is no
	345	additional ordering that we need to set up, so relaxed MO is
	346	sufficient. */
	347	unsigned int v = atomic_fetch_or_relaxed (&sem->value,
	348	SEM_NWAITERS_MASK);
	349	/* If there are available tokens, then wake as many waiters. If there
	350	aren't any, then there is no need to wake anyone because there is
351	none to grab for another waiter. If tokens become available
352	subsequently, then the respective sem_post calls will do the wake-up
353	due to us having set the nwaiters bit again. */
354	v >>= SEM_VALUE_SHIFT;
355	if (v > 0)
356	futex_wake (&sem->value, v, sem->private);
357	}
358	}
359	#endif